# MOVIE RECOMMENDER SYSTEM

COLLABORATIVE FILTERING

In [4]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns


In [12]:
df1 = pd.read_csv('D:/Internship/ratings_small.csv', low_memory=False) # Read Rating_Small csv file
df2 = pd.read_csv('D:/Internship/credits.csv', low_memory=False)  # Read Credits csv file
df3 = pd.read_csv('D:/Internship/keywords.csv', low_memory=False) # Read Keywords csv file
df4 = pd.read_csv('D:/Internship/movies_metadata.csv', low_memory=False) # Read movie_metadata csv file

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [22]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [24]:
# To prevent value error due to "data type mismatch" during merge(), converting movieId and id as string
df1.movieId = df1.movieId.astype(str)
df4.id = df4.id.astype(str)

In [25]:
# merging df1 and df4 to get movie titles and drop rows for which title is not available
data = pd.merge(df1, df4[['id', 'original_title']], left_on='movieId', right_on='id')

In [26]:
# get total counts of no. of occurence of movie
data['count'] = data.groupby('movieId').transform('count')['userId']

In [27]:
# fetch top 100 movies based on count
movieId = data.drop_duplicates('movieId').sort_values(
    'count', ascending=False).iloc[:100]['movieId']

In [28]:
# filter out data as per the movieId
data = data[data['movieId'].isin(movieId)].reset_index(drop=True)

In [29]:
# get total counts of movies each user has seen
data['count'] = data.groupby('userId').transform('count')['movieId']

In [30]:
# fetch top 20000 users based on no. of movies watched
userId = data.drop_duplicates('userId').sort_values(
    'count', ascending=False).iloc[:20001]['userId']

In [31]:
# filter out data as per the userId
data = data[data['userId'].isin(userId)].reset_index(drop=True)

In [32]:
# create a user movie rating matrix
df = data.pivot(index='userId', columns='movieId', values='rating')
df.head()

movieId,104,1073,1089,110,111,1213,1246,1259,1265,1380,...,648,750,778,780,858,8961,912,919,924,95
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,,,,4.0,,,,,,,...,,,,,,,,,,
3,,,,4.0,,,,,,,...,,,4.0,,,,,,,
4,,5.0,5.0,,,5.0,,4.0,5.0,5.0,...,,,,,5.0,,,5.0,,
5,4.0,,,,,,,,,5.0,...,,,,,2.5,,,4.0,,
6,,,,,4.0,,,4.5,,,...,,,,,,,,,,


In [33]:
# replace NaN with user based average rating
df_imputed = df.fillna(df.mean(axis=0))

# get similarity between all users
similarity_matrix = cosine_similarity(df_imputed.values)

In [35]:
df_imputed.head()

movieId,104,1073,1089,110,111,1213,1246,1259,1265,1380,...,648,750,778,780,858,8961,912,919,924,95
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,3.292553,3.753378,4.162879,4.0,4.224576,4.20229,3.784211,4.09375,3.839394,3.477778,...,3.532738,4.209524,4.141129,3.483945,4.4875,3.861111,4.235043,3.957265,3.886179,3.177419
3,3.292553,3.753378,4.162879,4.0,4.224576,4.20229,3.784211,4.09375,3.839394,3.477778,...,3.532738,4.209524,4.0,3.483945,4.4875,3.861111,4.235043,3.957265,3.886179,3.177419
4,3.292553,5.0,5.0,3.945175,4.224576,5.0,3.784211,4.0,5.0,5.0,...,3.532738,4.209524,4.141129,3.483945,5.0,3.861111,4.235043,5.0,3.886179,3.177419
5,4.0,3.753378,4.162879,3.945175,4.224576,4.20229,3.784211,4.09375,3.839394,5.0,...,3.532738,4.209524,4.141129,3.483945,2.5,3.861111,4.235043,4.0,3.886179,3.177419
6,3.292553,3.753378,4.162879,3.945175,4.0,4.20229,3.784211,4.5,3.839394,3.477778,...,3.532738,4.209524,4.141129,3.483945,4.4875,3.861111,4.235043,3.957265,3.886179,3.177419


In [52]:
# Creating definition for recommendation based on user rating
def get_recommendation(user_index):
    ind = user_index
    sim_scores = list(enumerate(similarity_matrix[ind]))

    # getting movies that are unrated by the given user
    unrated_movies = df.iloc[ind][df.iloc[ind].isna()].index

    # getting weighted ratings of unrated movies by all other users
    movie_ratings = (df_imputed.iloc[similarity_matrix[ind]][unrated_movies].T * [
        x[1] for x in sim_scores]).T

    # getting top 100 similar users by skipping the current user
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:101]

    # getting mean of movie rating by top 100 most similar users for the unrated movies
    movie_ratings = movie_ratings.iloc[[x[0] for x in sim_scores]].mean()

    # getting recommended movie titles in sorted order
    recommended_movies = df4[df4['id'].isin(movie_ratings.reset_index().sort_values(
        0, ascending=False)['movieId'])][['original_title', 'id']]
    assumed_ratings = sorted(movie_ratings, reverse=True)

    return pd.DataFrame({'movieId':recommended_movies[:5]['id'], 
                         'Recommended Movie':recommended_movies[:5]['original_title'], 
                         'Assumed Rating':assumed_ratings[:5]})


In [54]:
user_id = 655
recommended_movies = get_recommendation(user_id).reset_index(drop=True)
# get top 5 high rated movies by user
temp = data[data['userId']==df.index[user_id]].sort_values(
    'rating', ascending=False)[['rating', 'original_title', 'userId']].iloc[:5].reset_index(drop=True)
recommended_movies['userId'] = temp['userId']
recommended_movies['Movie Watched'] = temp['original_title']
recommended_movies['Rated']= temp['rating']
recommended_movies

Unnamed: 0,movieId,Recommended Movie,Assumed Rating,userId,Movie Watched,Rated
0,527,Once Were Warriors,4.984316,671,The Poseidon Adventure,5.0
1,110,Trois couleurs : Rouge,4.984316,671,The Million Dollar Hotel,5.0
2,6,Judgment Night,4.473424,671,5 Card Stud,5.0
3,329,Jurassic Park,4.222251,671,Muxmäuschenstill,5.0
4,858,Sleepless in Seattle,4.221758,671,The 39 Steps,5.0
