In [294]:
import pandas as pd
from math import sqrt
import numpy as np

In [295]:
mdf = pd.read_csv('movies.csv')
rdf = pd.read_csv('ratings.csv')
mdf.info()
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [296]:
userInput = [{'title':'Breakfast Club, The (1985)', 'rating':1},
             {'title':'Seven (a.k.a. Se7en) (1995)', 'rating':3},
             {'title':'Grumpier Old Men (1995)', 'rating':4},
             {'title':'Pulp Fiction (1994)', 'rating':0},
             {'title':'Dumb & Dumber (Dumb and Dumber) (1994)', 'rating':2}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)


                                    title  rating
0              Breakfast Club, The (1985)       1
1             Seven (a.k.a. Se7en) (1995)       3
2                 Grumpier Old Men (1995)       4
3                     Pulp Fiction (1994)       0
4  Dumb & Dumber (Dumb and Dumber) (1994)       2


In [297]:
inputId = mdf[mdf['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop('genres', 1) #we don't really need this at the moment
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                                   title  rating
0        3                 Grumpier Old Men (1995)       4
1       47             Seven (a.k.a. Se7en) (1995)       3
2      231  Dumb & Dumber (Dumb and Dumber) (1994)       2
3      296                     Pulp Fiction (1994)       0
4     1968              Breakfast Club, The (1985)       1


  inputMovies = inputMovies.drop('genres', 1) #we don't really need this at the moment


In [298]:
userSubset = rdf[rdf['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
3            52      52         52
47          203     203        203
231         133     133        133
296         307     307        307
1968        113     113        113


In [299]:

#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])


[(42,       userId  movieId  rating  timestamp
5875      42        3     4.0  996221045
5883      42       47     4.0  996218105
5907      42      231     4.0  996220872
5917      42      296     5.0  996214107
6087      42     1968     4.0  996258201), (64,       userId  movieId  rating   timestamp
9429      64        3     3.5  1161519668
9440      64       47     4.5  1161520185
9459      64      231     4.5  1161520600
9466      64      296     4.5  1161520143
9622      64     1968     3.5  1161520955), (68,        userId  movieId  rating   timestamp
10362      68        3     2.0  1158533415
10378      68       47     4.0  1158531489
10407      68      231     3.5  1158532011
10419      68      296     2.0  1180916156
10679      68     1968     3.0  1158531622), (91,        userId  movieId  rating   timestamp
14123      91        3     3.0  1112712323
14136      91       47     4.5  1112712832
14162      91      231     3.0  1112713266
14173      91      296     4.5  1112711264
14

In [300]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    


In [301]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())


   similarityIndex  userId
0        -0.707107      42
1        -0.288675      64
2         0.176777      68
3        -0.288675      91
4        -0.242536     226


In [302]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
15         0.982708       6
66         0.981981     176
51         0.944911      56
16         0.894427      19
61         0.891042     136


In [303]:
topUsersRating=topUsers.merge(rdf, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))


    similarityIndex  userId  movieId  rating  timestamp
0          0.982708       6        2     4.0  845553522
1          0.982708       6        3     5.0  845554296
2          0.982708       6        4     3.0  845554349
3          0.982708       6        5     5.0  845553938
4          0.982708       6        6     4.0  845553757
..              ...     ...      ...     ...        ...
95         0.982708       6      225     4.0  845553381
96         0.982708       6      230     5.0  845553966
97         0.982708       6      231     3.0  845553174
98         0.982708       6      234     2.0  845554176
99         0.982708       6      236     4.0  845553559

[100 rows x 5 columns]


In [304]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())


   similarityIndex  userId  movieId  rating  timestamp  weightedRating
0         0.982708       6        2     4.0  845553522        3.930831
1         0.982708       6        3     5.0  845554296        4.913538
2         0.982708       6        4     3.0  845554349        2.948123
3         0.982708       6        5     5.0  845553938        4.913538
4         0.982708       6        6     4.0  845553757        3.930831


In [305]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                   2.659447           10.673640
2                   2.154032            8.578056
3                   4.279714           18.839433
4                   1.738637            5.215910
5                   1.487588            8.456657


In [306]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']



recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))


         weighted average recommendation score  movieId
movieId                                                
1                                     4.013480        1
2                                     3.982325        2
3                                     4.402031        3
4                                     3.000000        4
5                                     5.684813        5
6                                     2.778376        6
7                                     3.496185        7
8                                     3.937101        8
9                                     3.000000        9
10                                    3.755094       10


In [308]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df.head(10))



         weighted average recommendation score  movieId
movieId                                                
4158                                       inf     4158
1488                                       inf     1488
1188                              1.838204e+15     1188
2360                              1.134800e+15     2360
2594                              1.080762e+14     2594
4995                              5.679664e+02     4995
41569                             1.385820e+02    41569
3717                              7.472112e+01     3717
3452                              4.894994e+01     3452
72167                             4.744994e+01    72167


In [317]:
recommended_movie=mdf.loc[mdf['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie.loc[1000:1006,:])

      movieId                                              title  \
1000     1302                             Field of Dreams (1989)   
1002     1304          Butch Cassidy and the Sundance Kid (1969)   
1004     1306  Until the End of the World (Bis ans Ende der W...   
1005     1307                     When Harry Met Sally... (1989)   

                      genres  
1000  Children|Drama|Fantasy  
1002          Action|Western  
1004  Adventure|Drama|Sci-Fi  
1005          Comedy|Romance  
