The current example handle how to create a collborative filter in code based on math and kaggle 20 million ratings for movies
https://www.kaggle.com/grouplens/movielens-20m-dataset?select=rating.csv

In [16]:
#lets import some important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from datetime import datetime
from collections import Counter

In [3]:
#import rating file and take a view
df = pd.read_csv('./rating.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [12]:
#it is a very big file so we need to process this data in some way that will help to run the function faster with a properly
# structure, start userId index by 0 not 1
df.userId = df.userId - 1
# get movie count by index and set to a dictionary
movie_counts = df['movieId'].value_counts()
movie2idx = dict(movie_counts)
#print(movie2idx[2])
#this will generate a dictionary of index example: { 2:22243 } --> movie 2 appear 22243 times

In [13]:
#row by row in dataframe is added the count of each movie, this could be better solved in a faster way, but by now i will let like this.
df['movie_idx'] = df.apply(lambda row: movie2idx[row.movieId], axis=1)
df = df.drop(columns=['timestamp'])
df.head()

Unnamed: 0,userId,movieId,rating,movie_idx
0,0,2,3.5,22243
1,0,29,3.5,8520
2,0,32,3.5,44980
3,0,47,3.5,43249
4,0,50,3.5,47006


In [15]:
print("original dataframe size:", len(df))

original dataframe size: 20000263


In [20]:
#lets reduce the dataframe size
#first generate some count dictionaries for each key on users and movies
user_ids_count = Counter(df.userId)
movie_ids_count = Counter(df.movie_idx)
# treshold of n users and m moviews that we will keep
n = 10000
m = 2000
user_ids = [u for u in user_ids_count.most_common(n)]
movie_ids = [m for m in movie_ids_count.most_common(m)]
# print(len(user_ids)) --> this will result in 10000 based on most common
df_small = df[df.userId.isin(user_ids) & df.movie_idx.isin(movie_ids)].copy()
df_small.head()

Unnamed: 0,userId,movieId,rating,movie_idx
960,10,1,4.5,49695
961,10,10,2.5,29005
962,10,19,3.5,20938
963,10,32,5.0,44980
964,10,39,4.5,26254


In [24]:
print("reduced dataframe size:", len(df_small))
print("reduction over", 100 - (len(df_small) * 100 / len(df)), "%" )

reduced dataframe size: 6125993
reduction over 69.37043777874321 %


In [25]:
#some users ids were deleted so we need to rebuild the ids sequence 

new_user_id_map = {}
i = 0
#adding new ids from 0 to users
for old in user_ids:
  new_user_id_map[old] = i
  i += 1

new_movie_id_map = {}
j = 0
#adding new ids from 0 to movies
for old in movie_ids:
  new_movie_id_map[old] = j
  j += 1

df_small.loc[:, 'userId'] = df_small.apply(lambda row: new_user_id_map[row.userId], axis=1)
df_small.loc[:, 'movie_idx'] = df_small.apply(lambda row: new_movie_id_map[row.movie_idx], axis=1)

print("max user id:", df_small.userId.max())
print("max movie id:", df_small.movie_idx.max())

max user id: 9999
max movie id: 1999


In [26]:
#if desired we can send this reduced dataset to a csv file
df_small.to_csv('./small_rating.csv', index=False)

In [125]:
# we need to define some hyperparameters in order to build the prediction structure

K = 25 # number of neighbors we'd like to consider
limit = 1300 # number of common movies users must have in common in order to consider
N = df_small.userId.max() + 1

Here's the **collaborative filter** math formula <br/> <img src="./images/collFormula.png" width=300 height=200 />

To proceed with the prediction i'm going to use the defined formula of collaborative filtering, omega is the subset of data that will be finded on the neightbors , W is the pearson correlation that will set the weights between users, ri'j is the score of one movie compare to r_i' that is the average score for that user in this way will be summed all the deviations for each user.

 **pearsonCorrelation** math formula <br/> <img src="./images/pearson.png" width=300 height=200 />

In [91]:
#first we will get the neightbors for each user
"""for i in range(N):
    # for get the neightbors it is necessary to know which movies have an user
    IUser = df_small.loc[df_small['userId'] == i] 
    # here i get the unique movies for user i
    IUserMovies = IUser['movieId'].unique()
    neighbors[i] = []
    for j in range(N):
        # No one can be neighbor for itself
        if i != j:
            JUser = df_small.loc[df_small['userId'] == j] 
            JserMovies = JUser['movieId'].unique()
            common_movies = (set(IUserMovies) & set(JserMovies))
            # if users have more than limit (5) movies in common can be a neighbor
            if len(common_movies) > limit:
                if len(neighbors[i]) >= K:
                    break
                # here i will calculate the pearson correlation between both users and its deviations
                neighbors[i].append(j) 
                
"""
#tre previous code generate the neighbors for all dataset of 100000 users this requires lots of time so i had to rethink how to build the predictor
def find_neighbors(i):
    IUser = df_small.loc[df_small['userId'] == i] 
    # here i get the unique movies for user i
    IUserMovies = IUser['movieId'].unique()
    neighbors = []
    for j in range(N):
        # No one can be neighbor for itself
        if i != j:
            JUser = df_small.loc[df_small['userId'] == j] 
            JserMovies = JUser['movieId'].unique()
            common_movies = (set(IUserMovies) & set(JserMovies))
            #print(len(common_movies),j)
            # if users have more than limit defined movies in common can be a neighbor
            if len(common_movies) > limit:
                if len(neighbors) >= K:
                    break
                # here i will calculate the pearson correlation between both users and its deviations
                neighbors.append(j)
    return neighbors

print(find_neighbors(50))             
            

[0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 23, 24, 25, 26, 30, 31, 32]


In [119]:

def find_user_average(i):
    IUser = df_small.loc[df_small['userId'] == i] 
    # here i get the unique movies for user i
    return np.mean(IUser["rating"].values)

def find_users_correlation(i,j):
    IUser = df_small.loc[df_small['userId'] == i] 
    JUser = df_small.loc[df_small['userId'] == j] 
    IUserRatingsDev = IUser["rating"].values - np.mean(IUser["rating"].values)
    JUserRatingsDev = IUser["rating"].values - np.mean(JUser["rating"].values)
    numerator = IUserRatingsDev.dot(JUserRatingsDev)
    #dot product it is used because it requires the square
    i_relation = np.sqrt(IUserRatingsDev.dot(JUserRatingsDev))
    j_relation = np.sqrt(JUserRatingsDev.dot(JUserRatingsDev))
    return numerator/(i_relation*j_relation)
        
find_users_correlation(50,2)

0.6730643165633223

In [131]:
def predict(i,j):
    Iuser_average = find_user_average(i)
    neighbors = find_neighbors(i)
    neighbors_correlations = [(lambda x: find_users_correlation(i,x))(x)  for x in neighbors]
    neighbors_average = [(lambda x: find_user_average(x))(x)  for x in neighbors]
    neighbors_j_rating = [(lambda x:  df_small.loc[(df_small['userId'] == x) & (df_small['movieId'] == j)]['rating'].values[0] if len(df_small.loc[(df_small['userId'] == x) & (df_small['movieId'] == j)]['rating'].values) == 1 else 0 )(x)  for x in neighbors]
    numerator = np.array(neighbors_correlations).dot(np.array(neighbors_j_rating)-np.array(neighbors_average))
    prediction = Iuser_average + (numerator / np.sum(neighbors_correlations)) 
    return prediction

print(predict(50,32))
print(df_small.loc[(df_small['userId'] == 50) & (df_small['movieId'] == 32)]['rating'].values[0])

3.372650207341007
3.5
