<a href="https://colab.research.google.com/github/tanyaryabov/ML/blob/master/RS_collaborative_filtering_user_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from google.colab import drive
drive.mount('/content/drive')
!ls /content/drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
MyDrive  Shareddrives


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#Load Dataset
movies = pd.read_csv('/content/drive/MyDrive/movies.csv')
print(movies.shape)
movies.head()

In [None]:
#Remove the year from title and add onto a separate column
movies['year'] = movies.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the parentheses
movies['year'] = movies.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movies['title']=movies.title.str.replace('(\(\d\d\d\d\))','')
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies['title']=movies.title.apply(lambda x:x.strip())
#Dropping the genres column
movies = movies.drop('genres', 1)
movies.head()

In [None]:
ratings= pd.read_csv('/content/drive/MyDrive/ratings.csv')
#Drop timestamp column from a dataframe
ratings = ratings.drop('timestamp', 1)
ratings.head()

In [54]:
#input user to recommend movies to
userInput = [
            {'title':'Balto', 'rating':5},
            {'title':'Toy Story', 'rating':5},
            {'title':'Pocahontas', 'rating':4.5},
            {'title':"When Night Is Falling", 'rating':4.5},
            {'title':'Swan Princess, The', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,Balto,5.0
1,Toy Story,5.0
2,Pocahontas,4.5
3,When Night Is Falling,4.5
4,"Swan Princess, The",4.5


In [55]:
#add movieId to user input:
#Filtering out the movies by title
inputId = movies[movies['title'].isin(inputMovies['title'].tolist())]

#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies = pd.merge(inputId, inputMovies)

#Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop('year', 1)

inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,5.0
1,13,Balto,5.0
2,48,Pocahontas,4.5
3,49,When Night Is Falling,4.5
4,313,"Swan Princess, The",4.5


In [None]:
#subset of users that have watched and reviewed the movies in our input:
#Filtering out users that have watched movies that the input has watched and storing it
userSubset = ratings[ratings['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

In [58]:
#group up the rows by user ID
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])


In [62]:
#sort these groups so the users that share the most movies in common with the input have higher priority. 
#This provides a richer recommendation since we won't go through every single user.
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)
#Top most user with id 91 having all 5 similar moves watched
#dataframe of top user group
userSubsetGroup[0][1]

Unnamed: 0,userId,movieId,rating
75355,476,1,4.0
75359,476,13,3.0
75363,476,48,4.0
75382,476,313,4.0


In [63]:
#Similarity of users to input user
#subset of users to iterate through. This limit is imposed because we don't want to waste too much time going through every single user
userSubsetGroup = userSubsetGroup[0:100]

In [64]:
#calculate the Pearson Correlation between input user and subset group, and store it in a dictionary, where the key is the user Id and the value is the coefficient
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient

pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    
    #Get the N (total similar movies watched) for the formula 
    nRatings = len(group)
    
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    
    ###For Debugging Purpose
    #if nRatings<5:
    #    print(inputMovies['movieId'].isin(group['movieId'].tolist()))
    #    break
    #else:
    #    continue
    
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    
    #Now let's calculate the pearson correlation between two users, so called, x and y

    #For package based
    #scipy.stats import pearsonr
    #pearsonr(tempRatingList,tempGroupList)[0]

    #For hard code based
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/np.sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [65]:
pearsonCorrelationDict.items()

dict_items([(476, -0.5773502691896258), (19, 0.9449111825230572), (288, 0.49999999999998995), (380, 0.9999999999999827), (509, 0.0), (6, 0), (20, -1.0), (21, 1.0), (27, -1.0), (40, 1.0), (43, 0), (64, 1.0), (68, -1.0), (76, 0), (103, 1.0), (132, -1.0), (134, 0), (144, -1.0), (161, 0), (169, 1.0), (177, 1.0), (202, 1.0), (216, 1.0), (226, 1.0), (232, 1.0), (234, 0), (240, 1.0), (249, 1.0), (264, 1.0), (274, 1.0), (304, 0), (323, 0), (328, 0), (372, 1.0), (381, 1.0), (401, 1.0), (414, 1.0), (436, 0), (483, 1.0), (484, 0), (525, 1.0), (534, 0), (559, 1.0), (560, 1.0), (579, 0), (584, 0), (608, 1.0), (1, 0), (5, 0), (7, 0), (15, 0), (17, 0), (18, 0), (31, 0), (32, 0), (33, 0), (38, 0), (44, 0), (45, 0), (46, 0), (48, 0), (50, 0), (54, 0), (57, 0), (58, 0), (60, 0), (63, 0), (66, 0), (71, 0), (73, 0), (78, 0), (82, 0), (86, 0), (89, 0), (90, 0), (91, 0), (93, 0), (96, 0), (98, 0), (107, 0), (111, 0), (112, 0), (117, 0), (119, 0), (121, 0), (124, 0), (130, 0), (135, 0), (137, 0), (140, 0), (

In [66]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.head()


Unnamed: 0,0
476,-0.57735
19,0.944911
288,0.5
380,1.0
509,0.0


In [67]:
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()


Unnamed: 0,similarityIndex,userId
0,-0.57735,476
1,0.944911,19
2,0.5,288
3,1.0,380
4,0.0,509


In [68]:
#top 1000 users that are most similar to the input.
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head(15)

Unnamed: 0,similarityIndex,userId
21,1.0,202
19,1.0,169
43,1.0,560
22,1.0,216
23,1.0,226
24,1.0,232
26,1.0,240
27,1.0,249
28,1.0,264
29,1.0,274


Rating of selected users to all movies
We're going to do this by taking the weighted average of the ratings of the movies using the Pearson Correlation as the weight. But to do this, we first need to get the movies watched by the users in our pearsonDF from the ratings dataframe and then store their correlation in a new column called _similarityIndex". This is achieved below by merging of these two tables.

In [69]:
topUsersRating = topUsers.merge(ratings, left_on='userId', right_on='userId', how='inner')
topUsersRating.tail()

Unnamed: 0,similarityIndex,userId,movieId,rating
20963,0.0,156,8228,5.0
20964,0.0,156,8571,3.5
20965,0.0,156,8614,3.5
20966,0.0,156,26313,4.0
20967,0.0,156,31038,4.0


Now all we need to do is simply multiply the movie rating by its weight (The similarity index), then sum up the new ratings and divide it by the sum of the weights.

We can easily do this by simply multiplying two columns, then grouping up the dataframe by movieId and then dividing two columns:

It shows the idea of all similar users to candidate movies for the input user:

In [70]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,202,1,4.0,4.0
1,1.0,202,2,4.0,4.0
2,1.0,202,6,5.0,5.0
3,1.0,202,10,4.0,4.0
4,1.0,202,11,4.0,4.0


In [71]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,25.444911,100.529645
2,18.444911,67.834734
3,7.444911,26.834734
5,4.5,14.5
6,7.0,31.5


In [72]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.950874,1
2,3.677694,2
3,3.60444,3
5,3.222222,5
6,4.5,6


Recommended movies
Now let's sort it and see the top 20 movies that the algorithm recommended!

In [73]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
62293,5.0,62293
82,5.0,82
685,5.0,685
5577,5.0,5577
65642,5.0,65642


In [75]:
movies.loc[movies['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
74,82,Antonia's Line (Antonia),1995
212,248,Houseguest,1994
564,685,It's My Party,1996
1086,1411,Hamlet,1996
3240,4380,"Princess and the Warrior, The (Krieger und die...",2000
3905,5485,Tadpole,2002
3958,5577,Igby Goes Down,2002
4716,7034,Show Me Love (Fucking Åmål),1998
6857,62293,"Duchess, The",2008
6954,65642,"Timecrimes (Cronocrímenes, Los)",2007
