In [29]:
import pandas as pd
from math import sqrt
import numpy as np

In [None]:
movies_df = pd.read_csv('/content/movies.csv')
ratings_df = pd.read_csv('/content/ratings.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  86537 non-null  int64 
 1   title    86537 non-null  object
 2   genres   86537 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB
None


In [None]:
userInput = [{'title':'Star Wars: Episode IV - A New Hope (1977)', 'rating':3},
             {'title':'Forrest Gump (1994)', 'rating':4.5},
             {'title':'Beauty and the Beast (1991)', 'rating':2},
             {'title':'Insidious (2010)', 'rating':3.5},
             {'title':'Conjuring, The (2013)', 'rating':5}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                                       title  rating
0  Star Wars: Episode IV - A New Hope (1977)     3.0
1                        Forrest Gump (1994)     4.5
2                Beauty and the Beast (1991)     2.0
3                           Insidious (2010)     3.5
4                      Conjuring, The (2013)     5.0


In [None]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())] # get all movies data where the title is listed in the input list
inputMovies = pd.merge(inputId, inputMovies) # merge the ids received from movies dataframe to inputMovies dataframe
inputMovies = inputMovies.drop('genres', 1) # for now genre is not used for the analysis
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                                      title  rating
0      260  Star Wars: Episode IV - A New Hope (1977)     3.0
1      356                        Forrest Gump (1994)     4.5
2      595                Beauty and the Beast (1991)     2.0
3    85788                           Insidious (2010)     3.5
4   103688                      Conjuring, The (2013)     5.0


  inputMovies = inputMovies.drop('genres', 1) # for now genre is not used for the analysis


In [None]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())] # get all ratings data that has the same movie ids as the one in the inputMovies dataframe

#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
print(userSubset.groupby('movieId').count()) # count the amount of reviews for each movies

         userId  rating  timestamp
movieId                           
260        1508    1508       1508
356        1736    1736       1736
595         704     704        704
85788        55      55         55
103688       80      80         80


In [None]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId']) # now data is grouped by user id

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])


#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True) # get 5 sorted datas from each user

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(2176,         userId  movieId  rating     timestamp
210478    2176      260     5.0  1.459109e+09
210486    2176      356     5.0  1.459110e+09
210505    2176      595     4.0  1.459110e+09
210866    2176    85788     5.0  1.459111e+09
210887    2176   103688     5.0  1.459111e+09), (2402,         userId  movieId  rating     timestamp
237574    2402      260     4.0  1.429820e+09
237600    2402      356     3.5  1.429820e+09
237666    2402      595     3.5  1.444760e+09
239707    2402    85788     3.0  1.444418e+09
239930    2402   103688     2.5  1.444759e+09), (3653,         userId  movieId  rating     timestamp
368343    3653      260     4.0  1.453531e+09
368355    3653      356     4.5  1.455761e+09
368380    3653      595     3.0  1.453693e+09
369021    3653    85788     1.5  1.453581e+09
369069    3653   103688     4.0  1.455761e+09), (3786,         userId  movieId  rating     timestamp
384164    3786      260     4.0  1.558881e+09
384170    3786      356     4.0  1.558882e+09

  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True) # get 5 sorted datas from each user


In [None]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()

    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()

    # Notes: Pearson Correlation is used to find similarity values ​​between users and using the Firefly Algorithm to determine the film that best suits the user
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [None]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index') #create dataframe from the created dictionary
pearsonDF.columns = ['similarityIndex'] # name the column 'similarityIndex'
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         0.749269    2176
1        -0.606143    2402
2         0.403509    3653
3         0.888523    3786
4        -0.515122    4127


In [None]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50] # sort topUsers data by similarityIndex
print(topUsers.head())

    similarityIndex  userId
36         0.997176     165
35         0.995871     156
55         0.993399     415
39         0.953821     189
67         0.953821     562


In [None]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner') # merge ratings dataframe with the topUsers dataframe
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating     timestamp
0          0.997176     165        1     4.0  1.431083e+09
1          0.997176     165        6     4.0  1.431097e+09
2          0.997176     165       10     3.5  1.431082e+09
3          0.997176     165       32     4.5  1.429773e+09
4          0.997176     165       47     2.0  1.431097e+09
..              ...     ...      ...     ...           ...
95         0.997176     165     2116     4.0  1.431083e+09
96         0.997176     165     2167     3.0  1.431083e+09
97         0.997176     165     2194     4.0  1.431097e+09
98         0.997176     165     2268     4.0  1.431199e+09
99         0.997176     165     2278     3.0  1.431083e+09

[100 rows x 5 columns]


In [None]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating     timestamp  weightedRating
0         0.997176     165        1     4.0  1.431083e+09        3.988706
1         0.997176     165        6     4.0  1.431097e+09        3.988706
2         0.997176     165       10     3.5  1.431082e+09        3.490118
3         0.997176     165       32     4.5  1.429773e+09        4.487294
4         0.997176     165       47     2.0  1.431097e+09        1.994353


In [None]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  30.208967          113.539960
2                  17.177846           54.273822
3                   5.071901           13.969984
5                   6.304337           14.760115
6                  14.593011           52.502538


In [None]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.758485        1
2                                     3.159524        2
3                                     2.754388        3
5                                     2.341264        5
6                                     3.597786        6
7                                     3.452657        7
8                                     3.500000        8
9                                     3.264853        9
10                                    3.329587       10
11                                    2.911337       11


In [None]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False) # sort data by weighted average
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
101850                                     5.0   101850
26082                                      5.0    26082
7234                                       5.0     7234
116397                                     5.0   116397
26587                                      5.0    26587
...                                        ...      ...
5852                                       0.5     5852
193185                                     0.5   193185
4434                                       0.5     4434
7244                                       0.5     7244
110773                                     0.5   110773

[8754 rows x 2 columns]


In [None]:
recommended_movie=movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie (exclude same movie from recommendation list)
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)


       movieId                                          title  \
0            1                               Toy Story (1995)   
1            2                                 Jumanji (1995)   
2            3                        Grumpier Old Men (1995)   
4            5             Father of the Bride Part II (1995)   
5            6                                    Heat (1995)   
...        ...                                            ...   
86076   287397                                Kandahar (2023)   
86296   288135                                My Fault (2023)   
86303   288167                            Extraction 2 (2023)   
86310   288209  Black Clover: Sword of the Wizard King (2023)   
86400   288563                          Lust Stories 2 (2023)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
4      