# Collaborative Filtering

In [1]:
from math import sqrt

import numpy as np
import pandas as pd

In [2]:
# Load the movie table and the user-rating table from their CSV files
movies_data = pd.read_csv('movie_data.csv')
ratings_data = pd.read_csv('data/ratings.csv')

In [3]:
# Size of both tables as (rows, columns)
movies_data.shape, ratings_data.shape

((34208, 5), (22884377, 4))

#### data preprocessing

In [4]:
# Column names, non-null counts and dtypes of the movie table
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34208 entries, 0 to 34207
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  34208 non-null  int64  
 1   movieId     34208 non-null  int64  
 2   title       34208 non-null  object 
 3   genres      34208 non-null  object 
 4   year        34140 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 1.3+ MB


In [5]:
# Column names and dtypes of the rating table
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22884377 entries, 0 to 22884376
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 698.4 MB


In [6]:
# Count the distinct per-row null patterns; a single all-False pattern means
# that no value is missing anywhere in the rating table
ratings_data.isnull().value_counts()

userId  movieId  rating  timestamp
False   False    False   False        22884377
dtype: int64

In [7]:
# Preview the first rows of the movie table
movies_data.head()

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,year
0,0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,4,5,Father of the Bride Part II,Comedy,1995.0


In [8]:
# Count the distinct per-row null patterns in the movie table to see which
# columns contain missing values
movies_data.isnull().value_counts()


Unnamed: 0  movieId  title  genres  year 
False       False    False  False   False    34140
                                    True        68
dtype: int64

In [9]:
# Replace missing release years with the placeholder 0, then recount the
# null patterns to confirm that nothing is missing any more
movies_data = movies_data.fillna({'year': 0})
movies_data.isna().value_counts()

Unnamed: 0  movieId  title  genres  year 
False       False    False  False   False    34208
dtype: int64

In [10]:
# With the NaN values gone, the year column can be stored as integers
movies_data = movies_data.astype({'year': 'int'})
movies_data.head()

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,year
0,0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,1,2,Jumanji,Adventure|Children|Fantasy,1995
2,2,3,Grumpier Old Men,Comedy|Romance,1995
3,3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,4,5,Father of the Bride Part II,Comedy,1995


In [11]:
# Remove the columns that are not used by the recommender:
# 'Unnamed: 0' (presumably an index column saved with the CSV) and 'genres'
movies_data = movies_data.drop(columns=['Unnamed: 0', 'genres'])
movies_data.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [12]:
# Preview the first rows of the rating table
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [13]:
# The rating timestamp is not used by the recommender, so remove it
ratings_data = ratings_data.drop(columns='timestamp')
ratings_data.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [14]:
# Ratings given by the user we want to recommend movies to:
# one record per movie with its title and the user's rating
userInput = [
    dict(title='Batman: The Dark Knight Returns, Part 1', rating=5),
    dict(title='3 Idiots', rating=5),
    dict(title='Rockstar', rating=3.5),
    dict(title='Harry Potter and the Chamber of Secrets', rating=2),
    dict(title='Iron Man', rating=5),
    dict(title="Pirates of the Caribbean: At World's End", rating=4.5),
]
# Tabular view of the same records (columns: title, rating)
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Batman: The Dark Knight Returns, Part 1",5.0
1,3 Idiots,5.0
2,Rockstar,3.5
3,Harry Potter and the Chamber of Secrets,2.0
4,Iron Man,5.0
5,Pirates of the Caribbean: At World's End,4.5


Extract the input movies' IDs from `movies_data` and add them to `inputMovies`.


In [15]:
# Keep the catalogue rows whose title exactly matches one of the input titles
input_movieid = movies_data.loc[
    lambda catalogue: catalogue['title'].isin(inputMovies['title'])
]
input_movieid

Unnamed: 0,movieId,title,year
5718,5816,Harry Potter and the Chamber of Secrets,2002
11885,53125,Pirates of the Caribbean: At World's End,2007
12646,59315,Iron Man,2008
14801,73881,3 Idiots,2009
19930,98124,"Batman: The Dark Knight Returns, Part 1",2012
29705,135601,Rockstar,2011


In [16]:
# Join the matched catalogue rows with the user's ratings on the title,
# so every input movie carries both its movieId and the user's rating
inputMovies = input_movieid.merge(inputMovies, on='title')
inputMovies

Unnamed: 0,movieId,title,year,rating
0,5816,Harry Potter and the Chamber of Secrets,2002,2.0
1,53125,Pirates of the Caribbean: At World's End,2007,4.5
2,59315,Iron Man,2008,5.0
3,73881,3 Idiots,2009,5.0
4,98124,"Batman: The Dark Knight Returns, Part 1",2012,5.0
5,135601,Rockstar,2011,3.5


In [17]:
# The release year is not needed for the similarity computation
inputMovies = inputMovies.drop(columns='year')
inputMovies

Unnamed: 0,movieId,title,rating
0,5816,Harry Potter and the Chamber of Secrets,2.0
1,53125,Pirates of the Caribbean: At World's End,4.5
2,59315,Iron Man,5.0
3,73881,3 Idiots,5.0
4,98124,"Batman: The Dark Knight Returns, Part 1",5.0
5,135601,Rockstar,3.5


In [18]:
# Keep only the ratings of movies that the input user has also rated;
# the users appearing here are the candidates for the similarity search
same_user = ratings_data.loc[
    lambda ratings: ratings['movieId'].isin(inputMovies['movieId'])
]
same_user.head()

Unnamed: 0,userId,movieId,rating
1175,15,59315,2.5
2328,19,59315,4.5
2744,22,5816,4.0
2866,23,59315,5.0
3567,34,5816,4.5


In [19]:
# Group the overlapping ratings by user: each group holds one user's ratings
# of the input movies.
# The key is passed as a plain string rather than a one-element list. With
# groupby(['userId']), pandas >= 2.0 yields 1-tuples such as (38949,) as the
# group names when iterating, and those tuples would then become the userId
# values used by the later cells.
user_group = same_user.groupby('userId')

In [20]:
# Number of input movies rated by each candidate user, largest first
same_user['userId'].value_counts()

38949     6
2569      5
72754     5
208053    5
211163    5
         ..
97082     1
97076     1
97075     1
97064     1
247751    1
Name: userId, Length: 29786, dtype: int64

In [21]:
# Inspect a single user's group; 38949 is the userId at the top of the
# value_counts() output above (hard-coded, so it only applies to this data)
user_group.get_group(38949)

Unnamed: 0,userId,movieId,rating
3599361,38949,5816,4.5
3599623,38949,53125,4.0
3599677,38949,59315,4.5
3599785,38949,73881,1.5
3599971,38949,98124,3.5
3600205,38949,135601,4.5


Sort these groups so that the users who share the most movies with the input user come first. This ordering makes it possible to restrict the comparison to the most relevant users instead of going through every single user (note that the loop below still processes all of them).

In [22]:
# Turn the groups into a list of (userId, ratings) pairs and order it so that
# users sharing the most movies with the input user come first
user_group = list(user_group)
user_group.sort(key=lambda pair: pair[1].shape[0], reverse=True)
user_group[:3]

[(38949,
           userId  movieId  rating
  3599361   38949     5816     4.5
  3599623   38949    53125     4.0
  3599677   38949    59315     4.5
  3599785   38949    73881     1.5
  3599971   38949    98124     3.5
  3600205   38949   135601     4.5),
 (2569,
          userId  movieId  rating
  234230    2569     5816     5.0
  234417    2569    53125     4.5
  234455    2569    59315     5.0
  234551    2569    73881     5.0
  234781    2569    98124     4.0),
 (9431,
          userId  movieId  rating
  868586    9431     5816     4.0
  868666    9431    53125     4.0
  868685    9431    59315     5.0
  868727    9431    73881     5.0
  868800    9431    98124     3.5)]

#### Similarity of users to input user
Compare every user with the input user and find the ones that are most similar.
We measure how similar each user is to the input user with the Pearson correlation coefficient, which quantifies the strength of the linear association between two variables.

##### Why Pearson Correlation?
Pearson correlation is invariant to scaling and shifting, i.e. to multiplying all elements by a positive constant or adding any constant to all elements (multiplying by a negative constant only flips its sign). For example, if you have two vectors X and Y, then pearson(X, Y) == pearson(X, 2 * Y + 3). This is an important property in recommendation systems because two users might rate the same series of items very differently in absolute terms, yet still be similar users (i.e. have similar tastes) whose ratings simply live on different scales.

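The formula for this coefficient between two sets X and Y with N values each is

$$r = \frac{\sum_{i=1}^{N}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i=1}^{N}(x_i-\bar{x})^2}\;\sqrt{\sum_{i=1}^{N}(y_i-\bar{y})^2}}$$

The code below uses the algebraically equivalent form $r = S_{xy} / \sqrt{S_{xx} S_{yy}}$, where $S_{xy} = \sum x_i y_i - \frac{(\sum x_i)(\sum y_i)}{N}$, $S_{xx} = \sum x_i^2 - \frac{(\sum x_i)^2}{N}$ and $S_{yy} = \sum y_i^2 - \frac{(\sum y_i)^2}{N}$.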

In [23]:
# Pearson correlation between the input user and every candidate user,
# stored as {userId: coefficient}.
pearson_corr_dict = {}

# Sort the input user's ratings by movieId once, before the loop: the result
# does not depend on the candidate user, so re-sorting it for every user was
# wasted work.
inputMovies = inputMovies.sort_values(by='movieId')

for name, group in user_group:
    # Sort this user's ratings by movieId as well, so that both rating lists
    # below line up movie by movie.
    group = group.sort_values(by='movieId')
    # N in the formula: the number of movies both users have rated
    nRatings = len(group)
    # Input user's ratings restricted to the movies this user has also rated
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'])]
    tempRatingList = temp_df['rating'].tolist()   # x: input user's ratings
    tempGroupList = group['rating'].tolist()      # y: candidate user's ratings

    # Computational form of the Pearson coefficient: r = Sxy / sqrt(Sxx * Syy)
    sum_x = sum(tempRatingList)
    sum_y = sum(tempGroupList)
    Sxx = sum(i**2 for i in tempRatingList) - sum_x**2/nRatings
    Syy = sum(j**2 for j in tempGroupList) - sum_y**2/nRatings
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sum_x*sum_y/nRatings

    # The coefficient is undefined when either user's ratings have no spread
    # (this includes every user with a single movie in common); treat that as
    # 0 correlation. Using "> 0" instead of "!= 0" also keeps a variance that
    # rounding pushed slightly below zero away from sqrt().
    if Sxx > 0 and Syy > 0:
        pearson_corr_dict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearson_corr_dict[name] = 0

In [24]:
# One row per candidate user: the correlation with the input user and the
# user's id, in the insertion order of pearson_corr_dict, on a fresh 0..n-1 index
pearson_df = pd.DataFrame({
    'corr': list(pearson_corr_dict.values()),
    'userId': list(pearson_corr_dict.keys()),
})
pearson_df.head()

Unnamed: 0,corr,userId
0,-0.492925,38949
1,-0.342997,2569
2,0.300123,9431
3,-0.052005,14815
4,0.307268,19071


In [25]:
# (number of candidate users, number of columns)
pearson_df.shape

(29786, 2)

In [26]:
# Rank the candidate users from most to least similar.
# NOTE(review): many users share the same corr value, and sort_values uses
# its default sorting algorithm here, so the order among tied users should
# not be relied upon.
pearson_df = pearson_df.sort_values(by='corr', ascending=False)
pearson_df.head()

Unnamed: 0,corr,userId
1850,1.0,131882
386,1.0,6802
2685,1.0,200554
2312,1.0,169944
2103,1.0,153748


In [27]:
# Distribution of the correlation values across the candidate users
pearson_df['corr'].value_counts()

 0.000000    22848
 1.000000     2596
-1.000000     1380
 0.155543      175
 0.628619      124
             ...  
-0.168594        1
-0.194576        1
-0.197288        1
-0.200178        1
 0.033942        1
Name: corr, Length: 462, dtype: int64

In [28]:
# Keep the N users that rank highest in similarity to the input user
N = 10
top_users = pearson_df.iloc[:N]
top_users

Unnamed: 0,corr,userId
1850,1.0,131882
386,1.0,6802
2685,1.0,200554
2312,1.0,169944
2103,1.0,153748
2335,1.0,171828
3156,1.0,240605
1053,1.0,61285
2067,1.0,151289
823,1.0,42432


#### Recommending movies to the input user
##### Ratings of the selected users for all movies
Take the weighted average of the movie ratings, using the Pearson correlation as the weight. To do this, we first need every rating given by the selected most-similar users from `ratings_data`, with each user's correlation stored alongside it in the `corr` column. This can be done by merging the two tables.

In [29]:
# Attach every rating made by the selected users to their correlation value
topUsersRating = pd.merge(top_users, ratings_data, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,corr,userId,movieId,rating
0,1.0,131882,47,4.0
1,1.0,131882,110,4.0
2,1.0,131882,208,3.5
3,1.0,131882,260,4.0
4,1.0,131882,296,4.0


Now multiply each movie rating by its weight (`corr`), sum the weighted ratings per movie, and divide that sum by the sum of the weights.

We can do this by multiplying the two columns, grouping the dataframe by `movieId`, and then dividing the two summed columns.

The result is the similar users' collective opinion of each movie, which serves as the predicted score for the input user.

In [30]:
# Weight each rating by the similarity of the user who gave it
topUsersRating['weighted_rating'] = topUsersRating['corr'].mul(topUsersRating['rating'])
topUsersRating.head()

Unnamed: 0,corr,userId,movieId,rating,weighted_rating
0,1.0,131882,47,4.0,4.0
1,1.0,131882,110,4.0,4.0
2,1.0,131882,208,3.5,3.5
3,1.0,131882,260,4.0,4.0
4,1.0,131882,296,4.0,4.0


In [31]:
# Per movie (grouped by movieId), sum the similarity weights and the weighted
# ratings. The two needed columns are selected before aggregating, so the
# unused userId and rating columns are not summed and then thrown away.
tempTopUsersRating = topUsersRating.groupby('movieId')[['corr', 'weighted_rating']].sum()
tempTopUsersRating.columns = ['sum_corr', 'sum_weighted_rating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_corr,sum_weighted_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7.0,22.0
2,3.0,10.0
3,1.0,2.5
5,1.0,2.5
6,4.0,14.5


In [32]:
# Weighted average per movie: the sum of the weighted ratings divided by the
# sum of the weights. The movieId index is also copied into a regular column.
recommendation_df = (
    tempTopUsersRating['sum_weighted_rating']
    .div(tempTopUsersRating['sum_corr'])
    .to_frame('weighted_avg_recommendation_score')
    .assign(movieId=lambda scores: scores.index)
)
recommendation_df.head()

Unnamed: 0_level_0,weighted_avg_recommendation_score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.142857,1
2,3.333333,2
3,2.5,3
5,2.5,5
6,3.625,6


Now sort it and see the top 10 movies that the algorithm recommended!

In [33]:
# Rank the movies from highest to lowest predicted score and show the top 10.
# NOTE(review): several movies share the same score, so the order among tied
# movies should not be relied upon.
recommendation_df = recommendation_df.sort_values(by='weighted_avg_recommendation_score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted_avg_recommendation_score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
86504,5.0,86504
30810,5.0,30810
26701,5.0,26701
26934,5.0,26934
1172,5.0,1172
2414,5.0,2414
27523,5.0,27523
111921,5.0,111921
2420,5.0,2420
27660,5.0,27660


## Final Recommendation table

In [34]:
# Look up title and year for the ten best-scored movies.
# NOTE: the boolean filter returns the rows in movies_data order, not in
# ranking order.
movies_data[movies_data['movieId'].isin(recommendation_df['movieId'].head(10))]

Unnamed: 0,movieId,title,year
1149,1172,Cinema Paradiso (Nuovo cinema Paradiso),1989
2330,2414,Young Sherlock Holmes,1985
2336,2420,"Karate Kid, The",1984
9000,26701,Patlabor: The Movie (Kidô keisatsu patorebâ: T...,1989
9125,26934,God of Cookery (Sik san),1996
9373,27523,My Sassy Girl (Yeopgijeogin geunyeo),2001
9423,27660,"Animatrix, The",2003
9593,30810,"Life Aquatic with Steve Zissou, The",2004
17119,86504,Voices from the List,2004
23663,111921,The Fault in Our Stars,2014
