In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Locations of the two input CSV files
movies_csv = 'E:/Projects/Content Based Movie Recommendation System/movies.csv'
ratings_csv = 'E:/Projects/Content Based Movie Recommendation System/ratings.csv'

#Read the movie catalogue and the user ratings into DataFrames
movies_df = pd.read_csv(movies_csv)
ratings_df = pd.read_csv(ratings_csv)

In [3]:
#Preview the first five rows of the movie catalogue
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
#Split the release year out of the title, e.g. "Toy Story (1995)" -> title "Toy Story", year "1995".
#Raw strings are used so that \( and \d reach the regex engine instead of being parsed as
#(invalid) Python string escapes. The capture group sits inside the literal parentheses, so the
#four digits are pulled out in a single pass; titles without a "(dddd)" part get NaN as year.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)
#regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
#which would silently leave the "(1995)" suffix in the title.
#.str.strip() trims the leftover whitespace and, unlike a lambda calling x.strip(), tolerates NaN.
movies_df['title'] = movies_df['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [5]:
#Dropping the genres column
#The column is named through the columns= keyword: passing the axis positionally (drop('genres', 1))
#was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
movies_df = movies_df.drop(columns='genres')
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [6]:
#Dropping the timestamp column, leaving userId, movieId and rating
#columns= is used because the positional axis argument (drop('timestamp', 1)) was removed in pandas 2.0
ratings_df = ratings_df.drop(columns='timestamp')
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [7]:
#Load the movies rated by the input user; the sheet's three columns are renamed to movieId, title, rating
inputMovies = pd.read_excel('C:/Users/swapn/OneDrive/Desktop/Collaborative.xlsx')
#Re-read the raw catalogue so its titles are in their original, unmodified form for the lookup below
df = pd.read_csv('E:/Projects/Content Based Movie Recommendation System/movies.csv')
inputMovies.columns = ['movieId', 'title', 'rating']
#The sheet's own movieId is discarded; the id is looked up from the catalogue by title instead
inputMovies = inputMovies.drop(columns='movieId')

#Catalogue rows whose title appears in the input sheet
#NOTE(review): the match is against raw catalogue titles, so the sheet's titles presumably carry
#the "(year)" suffix as well - confirm against the spreadsheet
inputId = df[df['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. The join key is stated explicitly rather than
#relying on pandas picking whatever columns the two frames happen to share.
inputMovies = pd.merge(inputId, inputMovies, on='title')
#Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop(columns='genres')

#Removing the "(year)" part from the 'title' column and trimming the whitespace it leaves behind.
#Raw string + regex=True: pandas >= 2.0 would otherwise treat the pattern as literal text.
#(No intermediate 'year' column is built here, since it was only ever created to be dropped again.)
inputMovies['title'] = inputMovies['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()

inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,4.5
1,2,Jumanji,3.2
2,44,Mortal Kombat,3.6


In [8]:
#Keep only the rating rows that refer to one of the movies the input user has rated
input_movie_ids = inputMovies['movieId'].tolist()
watched_input_movie = ratings_df['movieId'].isin(input_movie_ids)
userSubset = ratings_df[watched_input_movie]
userSubset.head()

Unnamed: 0,userId,movieId,rating
479,13,2,2.0
482,13,44,2.0
749,15,1,4.0
1247,17,1,5.0
1248,17,2,3.0


In [9]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
#The key is passed as a plain string, not a one-element list: with ['userId'], pandas >= 2.0 yields
#1-tuples such as (17,) as group names when the groupby is iterated, which would end up as the
#dictionary keys / userId values in the following cells instead of the scalar user ids.
userSubsetGroup = userSubset.groupby('userId')
#First rows of every user's group, printed as one frame
print(userSubsetGroup.head())
userSubsetGroup

          userId  movieId  rating
479           13        2     2.0
482           13       44     2.0
749           15        1     4.0
1247          17        1     5.0
1248          17        2     3.0
...          ...      ...     ...
10909838  198157       44     4.0
10910148  198159        1     4.0
10910569  198161        2     1.0
10910956  198165        1     5.0
10910983  198166        1     3.0

[46192 rows x 3 columns]


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001BA2CCA3F10>

In [10]:
#Materialise the (userId, ratings frame) pairs and order them so that users sharing
#the most movies with the input come first
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda pair: len(pair[1]), reverse=True)
userSubsetGroup[:3]

[(17,
        userId  movieId  rating
  1247      17        1     5.0
  1248      17        2     3.0
  1268      17       44     1.0),
 (94,
        userId  movieId  rating
  8461      94        1     5.0
  8462      94        2     4.0
  8465      94       44     3.0),
 (114,
         userId  movieId  rating
  10278     114        1     3.0
  10279     114        2     1.0
  10287     114       44     1.0)]

In [11]:
#Only the first 100 user groups are compared against the input user
userSubsetGroup = userSubsetGroup[:100]

In [12]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#Sort the input once, up front: it does not depend on the user being compared, so re-sorting it
#on every iteration was wasted work. Sorting both sides by movieId keeps the two rating lists aligned.
inputMovies = inputMovies.sort_values(by='movieId')

#For every user group in our subset
for name, group in userSubsetGroup:
    group = group.sort_values(by='movieId')
    #Get the N for the formula
    nRatings = len(group)
    #Get the input's review scores for the movies that both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #x = the input user's ratings, y = the current user's ratings
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    sumX = sum(tempRatingList)
    sumY = sum(tempGroupList)
    #Sums of squares / cross products about the mean
    Sxx = sum(i**2 for i in tempRatingList) - sumX**2/float(nRatings)
    Syy = sum(i**2 for i in tempGroupList) - sumY**2/float(nRatings)
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sumX*sumY/float(nRatings)

    #Sxx and Syy are non-negative in exact arithmetic, but floating point rounding can leave one
    #of them a hair below zero; testing the product for > 0 (rather than each term for != 0)
    #keeps sqrt() from raising a math domain error in that case. Otherwise, 0 correlation.
    denominator = Sxx*Syy
    if denominator > 0:
        #Clamp to the valid range in case rounding pushed the ratio slightly past +/-1
        pearsonCorrelationDict[name] = max(-1.0, min(1.0, Sxy/sqrt(denominator)))
    else:
        pearsonCorrelationDict[name] = 0

In [13]:
#Inspect the computed (user Id, correlation coefficient) pairs
pearsonCorrelationDict.items()

dict_items([(17, 0.6758453353343774), (94, 0.6758453353343641), (114, 0.9538209664765354), (178, -0.7370434740955071), (303, 0.7370434740954867), (341, 0.9538209664765276), (393, 0.9538209664765355), (407, 0.7216433578677489), (599, 0.6758453353343721), (604, 0.6758453353343641), (686, 0.6758453353343801), (703, 0.9762210399274243), (735, 0.44491211497559197), (784, 0.5636214801906787), (815, 0.7559289460184553), (927, 0.9762210399274296), (1130, -0.7370434740955025), (1204, 0.8029550685469649), (1255, 0.3003757045930602), (1414, 0.9057962601140015), (1502, 0.7370434740955037), (1598, 0.9538209664765303), (1599, 0.21677749238102773), (1615, -0.2167774923810366), (1625, 0.6758453353343694), (1700, 0.7370434740954867), (1770, -0.9538209664765355), (1824, 0.9995971261503097), (1839, -0.737043474095501), (1966, 0.8029550685469649), (2065, 0.21677749238102714), (2115, 0.6758453353343765), (2153, 0.5868395765143242), (2289, 0.9538209664765355), (2372, 0.6758453353343641), (2397, 0), (2404, 0

In [14]:
#Turn the {userId: coefficient} dictionary into a two-column frame
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
#Copy the user ids out of the index into their own column, then fall back to a 0..n-1 index
pearsonDF = pearsonDF.assign(userId=pearsonDF.index).reset_index(drop=True)
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.675845,17
1,0.675845,94
2,0.953821,114
3,-0.737043,178
4,0.737043,303


In [15]:
#Rank users from most to least similar and keep the first 50 rows
ranked_users = pearsonDF.sort_values(by='similarityIndex', ascending=False)
topUsers = ranked_users.iloc[:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
27,0.999597,1824
82,0.999597,5632
53,0.999597,3281
92,0.993399,6247
87,0.976221,5886


In [16]:
#Attach every rating made by the selected users (inner join on userId)
topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.999597,1824,1,5.0
1,0.999597,1824,2,2.0
2,0.999597,1824,4,2.0
3,0.999597,1824,6,3.0
4,0.999597,1824,10,1.0


In [17]:
#Weight each rating by the similarity of the user who gave it
similarity = topUsersRating['similarityIndex']
topUsersRating['weightedRating'] = similarity.mul(topUsersRating['rating'])
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.999597,1824,1,5.0,4.997986
1,0.999597,1824,2,2.0,1.999194
2,0.999597,1824,4,2.0,1.999194
3,0.999597,1824,6,3.0,2.998791
4,0.999597,1824,10,1.0,0.999597


In [18]:
#Per movie, total up the similarity indexes and the weighted ratings of the selected users
tempTopUsersRating = (
    topUsersRating
    .groupby('movieId')[['similarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={
        'similarityIndex': 'sum_similarityIndex',
        'weightedRating': 'sum_weightedRating',
    })
)
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,44.484121,192.667113
2,44.484121,120.299743
3,16.783203,46.196923
4,5.191281,11.818849
5,16.101356,43.833342


In [19]:
#Weighted average = summed weighted ratings divided by summed similarity, per movie
weighted_average = tempTopUsersRating['sum_weightedRating'].div(tempTopUsersRating['sum_similarityIndex'])
#Build the frame from that series (it stays indexed by movieId) ...
recommendation_df = weighted_average.to_frame('weighted average recommendation score')
#... and also expose the movieId as a regular column
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.331144,1
2,2.70433,2
3,2.752569,3
4,2.276673,4
5,2.722339,5


In [20]:
#Highest scores first, with a fresh 0..n-1 index
recommendation_df = (
    recommendation_df
    .sort_values(by='weighted average recommendation score', ascending=False)
    .reset_index(drop=True)
)
recommendation_df.head(20)

Unnamed: 0,weighted average recommendation score,movieId
0,5.0,4216
1,5.0,8754
2,5.0,2066
3,5.0,6630
4,5.0,2442
5,5.0,6533
6,5.0,706
7,5.0,2575
8,5.0,3787
9,5.0,685


In [21]:
#Catalogue rows for the movies that received a recommendation score
scored_movie_ids = recommendation_df['movieId'].tolist()
final_df = movies_df[movies_df['movieId'].isin(scored_movie_ids)]
#Join title/year with the score and order from best to worst
result = (
    final_df
    .merge(recommendation_df, on='movieId')
    .sort_values(by='weighted average recommendation score', ascending=False)
)
#Write the ranked list to a spreadsheet
result.to_excel("C:/Users/swapn/OneDrive/Desktop/Collaborative Results.xlsx")
result

Unnamed: 0,movieId,title,year,weighted average recommendation score
3072,4216,Longtime Companion,1990,5.0
5009,8754,"Prime of Miss Jean Brodie, The",1969,5.0
1503,2066,Out of the Past,1947,5.0
4332,6630,"Inn of the Sixth Happiness, The",1958,5.0
1799,2442,Hilary and Jackie,1998,5.0
...,...,...,...,...
4733,7446,Clifford's Really Big Movie,2004,0.5
8105,102880,After Earth,2013,0.5
8084,102113,One-Eyed Monster,2008,0.5
4766,7617,Rooster Cogburn,1975,0.5
