<h1><b>Movie Recommendation System Using NLP and Hybrid Filtering</b></h1>


In [1]:
import pandas as pd

# **Load the dataset**

In [2]:
# Read the two input tables from CSV files in the working directory:
# `movies` carries movieId/title/genres, `ratings` carries userId/movieId/rating/timestamp.
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

# **Preview the datasets**

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


# **Combine the datasets**

In [5]:
# Attach each movie's title and genres to its ratings (one output row per rating).
# `validate='many_to_one'` makes pandas check that movieId is unique in `movies`, so a
# duplicated catalogue row raises a MergeError instead of silently duplicating ratings.
combined_data = pd.merge(ratings, movies, on='movieId', how='inner', validate='many_to_one')

In [6]:
# Persist the merged frame to CSV in the working directory; index=False keeps the
# DataFrame's row index out of the file.
combined_data.to_csv("movies-and-ratings.csv", index = False)

In [7]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000095 entries, 0 to 25000094
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
 4   title      object 
 5   genres     object 
dtypes: float64(1), int64(3), object(2)
memory usage: 1.3+ GB


In [8]:
combined_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,3,296,5.0,1439474476,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2,4,296,4.0,1573938898,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,5,296,4.0,830786155,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
4,7,296,4.0,835444730,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller


In [9]:
combined_data.tail()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
25000090,162358,200192,2.0,1553453039,Den frusna leoparden (1986),(no genres listed)
25000091,162358,200194,2.0,1553453843,Tough Luck (2004),Action|Adventure|Thriller
25000092,162386,139970,3.5,1549215965,I Don't Speak English (1995),Comedy
25000093,162386,200726,4.0,1554651417,The Graduates (1995),Children|Drama
25000094,162386,200728,4.0,1554651472,Il pesce innamorato (1999),(no genres listed)


# **Check for null values in the combined dataset**

In [10]:
combined_data.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

# **Check data types of each column in the combined dataset**

In [11]:
combined_data.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
title         object
genres        object
dtype: object

# **Perform statistical analysis on the combined dataset**

In [12]:
combined_data.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0
std,46791.72,39198.86,1.060744,226875800.0
min,1.0,1.0,0.5,789652000.0
25%,40510.0,1196.0,3.0,1011747000.0
50%,80914.0,2947.0,3.5,1198868000.0
75%,121557.0,8623.0,4.0,1447205000.0
max,162541.0,209171.0,5.0,1574328000.0


# **Data Visualisation**

In [13]:
import matplotlib.pyplot as plt
import seaborn as sns

**1. Question: What is the distribution of movie ratings?**

In [None]:
# plt.figure(figsize=(10, 6))
# sns.histplot(combined_data['rating'], bins=5, kde=True)
# plt.title('Distribution of Movie Ratings')
# plt.xlabel('Rating')
# plt.ylabel('Frequency')
# plt.show()

**2. Question: How many movies are there in each genre?**

In [None]:
# # Count the number of movies in each genre
# genre_data = combined_data['genres'].iloc[0:100000]
# genre_counts = genre_data.str.split('|', expand=True).stack().value_counts()

# # Visualize the number of movies in each genre
# plt.figure(figsize=(12, 8))
# genre_counts.plot(kind='bar', color='skyblue')
# plt.title('Number of Movies in Each Genre')
# plt.xlabel('Genre')
# plt.ylabel('Number of Movies')
# plt.show()

**3. Question: What are the top 10 most rated movies?**

In [None]:
# # Find the top 10 most rated movies
# top_rated_movies = combined_data.groupby('title')['rating'].count().sort_values(ascending=False).head(10)

# # Visualize the top 10 most rated movies
# plt.figure(figsize=(12, 8))
# top_rated_movies.plot(kind='bar', color='salmon')
# plt.title('Top 10 Most Rated Movies')
# plt.xlabel('Movie Title')
# plt.ylabel('Number of Ratings')
# plt.show()

**4. Question: How does the average rating vary with the number of ratings received by a movie?**

In [None]:
# ratings_count = combined_data.groupby('title')['rating'].count()

# avg_rating = combined_data.groupby('title')['rating'].mean()

# plt.figure(figsize=(12, 8))
# sns.scatterplot(x=ratings_count, y=avg_rating, color='purple')
# plt.title('Average Rating vs. Number of Ratings')
# plt.xlabel('Number of Ratings')
# plt.ylabel('Average Rating')
# plt.show()

In [None]:
#pip install spacy

# **Movie Recommendation Using NLP**

In [16]:
#bash command to download the English language model from Spacy
#python -m spacy download en_core_web_sm

In [14]:
import spacy

# Load the small English pipeline from spaCy (must be downloaded beforehand with
# `python -m spacy download en_core_web_sm`).
# NOTE(review): per the spaCy docs the `sm` pipelines ship without static word vectors,
# so `Doc.similarity` results computed with this model are likely to be weak — consider a
# vector-bearing model such as `en_core_web_md`; confirm before relying on the scores.
nlp = spacy.load('en_core_web_sm')

In [29]:
# # Shuffling the data before slicing the dataframe to nullify data biasness
# model_data = combined_data.sample(frac=1).reset_index(drop=True)

In [17]:
# Work on an independent copy of the movie catalogue: a plain `model_data = movies` only
# creates a second name for the same DataFrame, so columns added to `model_data` later
# would also appear on `movies` and make the earlier `movies` outputs stale.
model_data = movies.copy()

In [18]:
model_data.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [19]:
model_data.tail(5)

Unnamed: 0,movieId,title,genres
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama


In [20]:
combined_data.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,3,296,5.0,1439474476,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2,4,296,4.0,1573938898,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,5,296,4.0,830786155,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
4,7,296,4.0,835444730,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller


In [33]:
#model_data = combined_data.iloc[0:10]

In [21]:
model_data.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [36]:
# import spacy
# import pandas as pd
# from spacy import displacy

# # Loading the English language model from SpaCy
# nlp = spacy.load('en_core_web_sm')



In [22]:
# Parse every movie title with spaCy and keep the resulting Doc objects in a new column.
# `nlp.pipe` streams the titles through the pipeline in batches and yields the Docs in
# input order, which is considerably faster than calling `nlp` once per row via `.apply`.
model_data['title_nlp'] = list(nlp.pipe(model_data['title']))

In [24]:
model_data.head(10)

Unnamed: 0,movieId,title,genres,title_nlp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"(Toy, Story, (, 1995, ))"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"(Jumanji, (, 1995, ))"
2,3,Grumpier Old Men (1995),Comedy|Romance,"(Grumpier, Old, Men, (, 1995, ))"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"(Waiting, to, Exhale, (, 1995, ))"
4,5,Father of the Bride Part II (1995),Comedy,"(Father, of, the, Bride, Part, II, (, 1995, ))"
5,6,Heat (1995),Action|Crime|Thriller,"(Heat, (, 1995, ))"
6,7,Sabrina (1995),Comedy|Romance,"(Sabrina, (, 1995, ))"
7,8,Tom and Huck (1995),Adventure|Children,"(Tom, and, Huck, (, 1995, ))"
8,9,Sudden Death (1995),Action,"(Sudden, Death, (, 1995, ))"
9,10,GoldenEye (1995),Action|Adventure|Thriller,"(GoldenEye, (, 1995, ))"


In [25]:
from spacy import displacy

In [26]:
# Visualizing named entities in an example movie title (the first row of the catalogue)
example_title = model_data['title'].iloc[0]
example_title_doc = nlp(example_title)

# displacy writes its HTML markup directly into the cell output when jupyter=True.
# It does not create a matplotlib figure, so no `plt.show()` call is needed afterwards.
displacy.render(example_title_doc, style='ent', jupyter=True)

In [None]:
# !pip install scikit-surprise

# **Hybrid Filtering Using The "surprise" Package**

In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Data preparation: Surprise needs (user, item, rating) triples. Those live in the merged
# ratings frame; `model_data` is the movie catalogue and has no `userId` or `rating`
# column, so selecting them from it raises a KeyError.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(combined_data[['userId', 'movieId', 'rating']], reader)

# Train/test split: hold out 20% of the ratings for evaluation. The fixed seed makes the
# split identical on every run.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Training the model (seeded so the latent-factor initialisation is reproducible)
algo = SVD(random_state=42)
algo.fit(trainset)

# Testing the model on the held-out ratings
predictions = algo.test(testset)

# Model evaluation: root-mean-squared error of the predicted ratings
accuracy.rmse(predictions)

# Applying Hybrid Filtering
def hybrid_recommendation(user_id, title, n=10, cf_weight=0.5, rating_scale=(0.5, 5.0)):
    """Rank catalogue movies for a user by blending collaborative and content scores.

    Every movie in ``model_data`` (one row per movie, with its pre-parsed
    ``title_nlp`` Doc) is scored once:

    * collaborative score: the rating ``algo`` predicts for ``user_id``, rescaled
      from ``rating_scale`` to the 0-1 range so it is comparable with the
      similarity score;
    * content score: spaCy similarity between ``title`` and the movie's title.

    Args:
        user_id: Raw user id as used when training ``algo``.
        title: Title of a movie the user likes; it seeds the content score and is
            itself left out of the recommendations.
        n: Maximum number of movie ids to return.
        cf_weight: Weight of the collaborative score in the blend; the content
            score receives ``1 - cf_weight``.
        rating_scale: ``(min, max)`` of the rating scale the model was trained on.

    Returns:
        A list of up to ``n`` movieId values, best hybrid score first.

    Raises:
        ValueError: If ``rating_scale`` is not ``(min, max)`` with ``max > min``.
    """
    min_rating, max_rating = rating_scale
    rating_span = max_rating - min_rating
    if rating_span <= 0:
        raise ValueError("rating_scale must be (min, max) with max > min")

    title_doc = nlp(title)
    hybrid_scores = []

    # Iterate over the movie catalogue, not the ratings table: the catalogue has exactly
    # one row per movie and is the frame that carries the `title_nlp` column.
    candidates = zip(model_data['movieId'], model_data['title'], model_data['title_nlp'])
    for movie_id, movie_title, movie_doc in candidates:
        # Recommending the seed movie back to the user is useless; skip it.
        if movie_title == title:
            continue

        # Collaborative Filtering: predicted rating mapped onto 0-1
        cf_score = (algo.predict(user_id, movie_id).est - min_rating) / rating_span

        # Content-Based Filtering: similarity between the two titles
        cb_score = title_doc.similarity(movie_doc)

        # Combining the results
        blended = cf_weight * cf_score + (1 - cf_weight) * cb_score
        hybrid_scores.append((blended, movie_id))

    # Determining top recommendations (ties fall back to the larger movie id)
    hybrid_scores.sort(reverse=True)
    return [movie_id for _, movie_id in hybrid_scores[:max(n, 0)]]

# Example run: user 1 gave 'Pulp Fiction (1994)' a 5.0 rating in the data shown above,
# so it is used here as the seed title for the content-based part.
user_id = 1
user_favorite_movie = 'Pulp Fiction (1994)'
recommended_movies = hybrid_recommendation(user_id, user_favorite_movie)
# hybrid_recommendation returns movieId values (not titles), so this prints a list of ids.
print(recommended_movies)