In [2]:
from pandas.api.types import CategoricalDtype
from datetime import datetime
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize

from surprise.prediction_algorithms.matrix_factorization import SVD as FunkSVD
from funk_svd.dataset import fetch_ml_ratings
from funk_svd import SVD
from sklearn.metrics import mean_absolute_error
from surprise import Reader, Dataset, SVD,accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV


#### Loading the data

In [3]:
movies= pd.read_csv("Data/movies.csv")
movies.drop("Unnamed: 0",axis=1,inplace=True)

In [4]:
ratings_sample = pd.read_csv("Data/ratings_sample.csv")

In [5]:
ratings_sample

Unnamed: 0,userId,movieId,rating,liked
0,3,356,4.0,1
1,3,4167,3.5,0
2,3,4306,4.0,1
3,3,4979,4.0,1
4,3,5574,4.0,1
...,...,...,...,...
813465,162534,122892,2.5,0
813466,162534,136016,2.0,0
813467,162534,152081,2.5,0
813468,162534,174055,3.5,0


In [6]:
movies

Unnamed: 0,movieId,original_language,original_title,overview,popularity,release_date,runtime,spoken_languages,title,vote_average,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Toy Story,7.7,...,0,0,0,0,0,0,0,0,0,0
1,8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Jumanji,6.9,...,0,0,0,0,0,0,0,0,0,0
2,11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,1995-02-10,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Father of the Bride Part II,5.7,...,0,0,0,0,0,0,0,0,0,0
3,949,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",17.924927,1995-12-15,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Heat,7.7,...,0,0,0,0,0,0,0,1,0,0
4,11860,en,Sabrina,An ugly duckling having undergone a remarkable...,6.677277,1995-12-15,127.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Sabrina,6.2,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6015,265189,sv,Turist,"While holidaying in the French Alps, a Swedish...",12.165685,2014-08-15,118.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Force Majeure,6.8,...,0,0,0,0,0,0,0,0,0,0
6016,277839,fr,Pattaya,Franky and Krimo dream of leaving the grey gri...,5.613875,2016-02-24,100.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]","Good Guys Go to Heaven, Bad Guys Go to Pattaya",5.3,...,0,0,0,0,0,0,0,0,0,0
6017,248705,fr,Les Visiteurs: La Révolution,"Stuck in the corridors of time, Godefroy de Mo...",7.294920,2016-03-23,110.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",The Visitors: Bastille Day,4.0,...,0,0,0,0,0,0,0,0,0,0
6018,455661,en,In a Heartbeat,A closeted boy runs the risk of being outed by...,20.821780,2017-06-01,4.0,"[{'iso_639_1': 'en', 'name': 'English'}]",In a Heartbeat,8.3,...,0,0,0,0,1,0,0,0,0,0


## Collaborative filtering

In [7]:
ratings = ratings_sample.sort_values(by='userId')

In [8]:
ratings .drop("liked",axis=1,inplace=True)
#dropping the liked columns since we want to use the dataframe in FUNK SVD

In [9]:
ratings

Unnamed: 0,userId,movieId,rating
0,3,356,4.0
27,3,148855,4.0
26,3,134853,4.0
25,3,130634,3.0
24,3,117529,3.0
...,...,...,...
813446,162534,1198,4.0
813445,162534,745,4.0
813468,162534,174055,3.5
813456,162534,33004,3.0


Setting the `userId` to order from (0,len) in order to able to use it in Funk SVD.

In [10]:
# Create a new column 'newUserID' with incremental values starting from 0
ratings['UserID'] = range(len(ratings))



In [11]:
# Update the 'userID' column with the values from the 'newUserID' column
ratings['userId'] = ratings['UserID']

# Remove the 'newUserID' column
ratings = ratings.drop('UserID', axis=1)

In [1]:
#sanity check
ratings

NameError: name 'ratings' is not defined

### Funk SVD

We need to load our data into a dataset object using a package in `Surprise` library

In [22]:
#using Funk SVD
my_dataset = Dataset.load_from_df(ratings, Reader(rating_scale=(0.5, 5)))
my_train_dataset = my_dataset.build_full_trainset()

In [23]:
my_train_dataset

<surprise.trainset.Trainset at 0x7ff2900d0d30>

Now all we do is initialize the algorithm, specify the number of latent variables and iterations we'd like to use, and then let the algorithm run.

A huge downside here is that we cannot use 'Funk SVD' for the users that are new.
For this purpose i'm defining a user Id below and also a movie Id and puuting it in my funk SVD function to see the results.

In [24]:
#example
user_1 = 0
movie = 62

In [25]:

my_algorithm = FunkSVD(n_factors=15, 
                n_epochs=50, 
                lr_all=0.001,    # Learning rate for each epoch
                biased=False,  # This forces the algorithm to store all latent information in the matrices
                verbose=0,
                reg_all =  0.08)
my_algorithm.fit(my_train_dataset)




<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff2e1830e50>

In [26]:
#user matrix
U = my_algorithm.pu


In [27]:
#movie matrix
M = my_algorithm.qi.T


In [28]:
inner_user_id = my_train_dataset.to_inner_uid(user_1) # find the inner representation of user 1
user_profile = U[inner_user_id]
inner_movie_id = my_train_dataset.to_inner_iid(movie) # find the inner representation of item 
movie_profile = M[:, inner_movie_id]




`np.dot` will give us an expected rating.

I'm going to say if the expected rating is above 3.5 it means I'm predicting the user will like the movie. 

In [29]:
expected_rating = np.dot(user_profile, movie_profile)
print(f"------ Result for 10 factors:")
print(f"expected rating for {movies[movies['movieId']==movie]['title']} is {expected_rating}")

------ Result for 10 factors:
expected rating for 332    2001: A Space Odyssey
Name: title, dtype: object is -0.7451322818646982


In [30]:
# The surprise package doesn't allow you to test on the trainset we built
my_train_dataset, my_test_dataset = train_test_split(my_dataset, test_size=0.3)

predictions = my_algorithm.test(my_test_dataset)

In [31]:
if expected_rating > 3.5:
    print("We predict that the user will like this movie! ")

We know the rating user 1 gave movie 2 (it's 3.0), so let's use this to demonstrate how we calculate ratings using these latent factors.

First, we grab the user profile

Our expected rating of this movie by this user is the dot product of these two profiles.

Finding the best parameters:


In [32]:
#Run for fast pass
param_grid = {'n_factors': [30], 'n_epochs': [35], 'lr_all': [0.001],
              'reg_all': [0.08]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(my_dataset)
algo = gs.best_estimator['rmse']
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

#Assigning values
t = gs.best_params
factors = t['rmse']['n_factors']
epochs = t['rmse']['n_epochs']
lr_value = t['rmse']['lr_all']
reg_value = t['rmse']['reg_all']

0.961416302968528
{'n_factors': 30, 'n_epochs': 35, 'lr_all': 0.001, 'reg_all': 0.08}
