In [1]:
import pandas as pd
import numpy as np

from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

In [2]:
credits = pd.read_csv('../dataset/credits.csv')
keywords = pd.read_csv('../dataset/keywords.csv')
links = pd.read_csv('../dataset/links_small.csv')
md = pd.read_csv('../dataset/movies_metadata.csv')
ratings = pd.read_csv('../dataset/ratings_small.csv')

  md = pd.read_csv('../dataset/movies_metadata.csv')


In [3]:
md[['id', 'title']]

Unnamed: 0,id,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II
...,...,...
45461,439050,Subdue
45462,111109,Century of Birthing
45463,67758,Betrayal
45464,227506,Satan Triumphant


In [21]:
wrong_imdb_ids = md[md['imdb_id'] == '0'].index
md.drop(wrong_imdb_ids, axis=0, inplace=True)

In [31]:
md['id'] = md['id'].astype('int')

ratings = ratings.merge(md[['id', 'title']].rename({'id': 'movieId'}, axis=1))
ratings

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1371,2.5,1260759135,Rocky III
1,1,1405,1.0,1260759203,Greed
2,1,2105,4.0,1260759139,American Pie
3,1,2193,2.0,1260759198,My Tutor
4,1,2294,2.0,1260759108,Jay and Silent Bob Strike Back
...,...,...,...,...,...
44989,671,4995,4.0,1064891537,Boogie Nights
44990,671,5816,4.0,1065111963,Waiter
44991,671,5902,3.5,1064245507,A Bridge Too Far
44992,671,5991,4.5,1064245387,The Last Laugh


## Use sample of movies:

In [50]:
movies = [
    'Terminator 3: Rise of the Machines',
    'The Million Dollar Hotel',
    'Men in Black II',
    'Bad Boys II',
    'Back to the Future Part II',
]

movie_ids = [165, 318, 608, 296, 8961]

In [52]:
df = ratings

In [58]:
sample_df = df[df['movieId'].isin(movie_ids)]
sample_df

Unnamed: 0,userId,movieId,rating,timestamp,title
13,2,165,3.0,835355441,Back to the Future Part II
29,2,296,4.0,835355395,Terminator 3: Rise of the Machines
67,3,296,4.5,1298862418,Terminator 3: Rise of the Machines
68,3,318,5.0,1298862121,The Million Dollar Hotel
96,4,296,5.0,949895708,Terminator 3: Rise of the Machines
...,...,...,...,...,...
44933,670,318,5.0,938781934,The Million Dollar Hotel
44938,670,608,5.0,938782093,Men in Black II
44948,671,296,4.0,1064890424,Terminator 3: Rise of the Machines
44949,671,318,5.0,1064890397,The Million Dollar Hotel


In [66]:
user_movie_df = sample_df.pivot(index='userId', columns='title', values='rating')
user_movie_df

title,Back to the Future Part II,Bad Boys II,Men in Black II,Terminator 3: Rise of the Machines,The Million Dollar Hotel
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,3.0,,,4.0,
3,,,,4.5,5.0
4,,,,5.0,
7,,,,,5.0
8,,,,4.0,5.0
...,...,...,...,...,...
666,2.0,,,4.0,
667,3.0,,5.0,5.0,
668,,,5.0,5.0,4.0
670,,,5.0,,5.0


In [70]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(
    sample_df[['userId', 'movieId', 'rating']],
    reader
)

In [72]:
trainset, testset = train_test_split(data, test_size=0.3)

In [74]:
svd_model = SVD()

In [76]:
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x3117849e0>

In [78]:
predictions = svd_model.test(testset)

In [80]:
accuracy.rmse(predictions)

RMSE: 0.7889


0.7888883857387429

In [112]:
svd_model.predict(uid=602, iid=165)

Prediction(uid=602, iid=165, r_ui=None, est=3.5902793688383223, details={'was_impossible': False})

In [106]:
sample_df[sample_df['userId'] == 602]

Unnamed: 0,userId,movieId,rating,timestamp,title
40857,602,296,5.0,842355957,Terminator 3: Rise of the Machines
40860,602,318,5.0,842356712,The Million Dollar Hotel
40896,602,608,4.0,842356870,Men in Black II


In [122]:
param_grid = {'n_epochs': [5, 10, 20, 40, 50, 100],
              'lr_all': [0.03, 0.01, 0.007, 0.005, 0.003, 0.001, 0.0005, 0.0001]}

gs = GridSearchCV(SVD,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=3,
                  n_jobs=-1,
                  joblib_verbose=True)


In [124]:
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:    0.3s finished


In [126]:
gs.best_score['rmse']


0.818163874992312

In [128]:
gs.best_params['rmse']


{'n_epochs': 100, 'lr_all': 0.0005}

In [130]:
svd_model = SVD(**gs.best_params['rmse'])
data = data.build_full_trainset()
svd_model.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x3219541d0>

In [132]:
svd_model.predict(uid=602, iid=662, verbose=True)


user: 602        item: 662        r_ui = None   est = 4.22   {'was_impossible': False}


Prediction(uid=602, iid=662, r_ui=None, est=4.217433157817958, details={'was_impossible': False})