In [10]:
import pandas as pd

from surprise import NormalPredictor
from surprise import SVD
from surprise import SlopeOne
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, BaselineOnly, CoClustering, SVDpp
from surprise.accuracy import rmse
from surprise.model_selection import train_test_split as surprise_train_test_split

from sklearn import pipeline

# set random state for reproducibility
kwargs = dict(random_state=42)

In [3]:
ratings = pd.read_csv('../../data/preprocessed/ratings_clean_std_0.csv', sep=',')

In [4]:
mappings = pd.read_csv('../../data/preprocessed/movies_id_updated.csv', sep=',')[['id', 'imdbID']].rename(columns={'id': 'movieID'})

In [8]:
ratings

Unnamed: 0.1,Unnamed: 0,user_id,imdbID,rating
0,0,1264,tt0047034,3.5
1,1,213,tt0304141,2.5
2,2,593,tt0369436,3.0
3,3,609,tt1077258,4.0
4,4,1590,tt0052182,4.0
...,...,...,...,...
787536,812812,1032,tt0083530,3.0
787537,812813,99,tt0107798,3.0
787538,812814,333,tt0093857,3.0
787539,812815,49,tt0144168,3.0


In [9]:
# TODO write function to map from imdbID to movieID and back

## Finding optimal Parameters for Collaborative Filtering Approach

In [88]:
# Using GridSearch to find optimal Params
# I ran this cell multiple times trying different intervalls
# adjusting the intervalls if the optimal parameter was at the intervall border

param_grid = {'lr_all' : [.001, .02], 'reg_all' : [.05,.2], 'n_epochs' : [18,25]}
print("Starting GridSearch")
gs = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=5)
gs.fit(data)

#Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

Starting GridSearch
0.7613563832864235
{'lr_all': 0.02, 'reg_all': 0.05, 'n_epochs': 18}


In [89]:
svd = gs.best_estimator['rmse']

In [90]:
trainset, testset = surprise_train_test_split(data, test_size=0.2, shuffle=True, **kwargs)

In [91]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11021a520>

In [92]:
preds = svd.test(testset)

In [93]:
# Printing first 50 predicitons for overview
for i in range (0,50):
    print(preds[i])

user: 364        item: 2542       r_ui = 3.00   est = 3.75   {'was_impossible': False}
user: 204        item: 49822      r_ui = 3.50   est = 3.45   {'was_impossible': False}
user: 692        item: 1407       r_ui = 2.50   est = 2.06   {'was_impossible': False}
user: 387        item: 34         r_ui = 4.00   est = 3.32   {'was_impossible': False}
user: 1254       item: 550        r_ui = 3.50   est = 3.32   {'was_impossible': False}
user: 227        item: 7154       r_ui = 3.50   est = 2.85   {'was_impossible': False}
user: 515        item: 527        r_ui = 4.00   est = 4.06   {'was_impossible': False}
user: 433        item: 923        r_ui = 4.00   est = 3.82   {'was_impossible': False}
user: 656        item: 6264       r_ui = 2.00   est = 1.81   {'was_impossible': False}
user: 1465       item: 367        r_ui = 4.50   est = 3.36   {'was_impossible': False}
user: 975        item: 6662       r_ui = 3.50   est = 3.73   {'was_impossible': False}
user: 1057       item: 33493      r_ui = 4.

In [94]:
rmse(preds)

RMSE: 0.7613


0.7612923234718605

In [95]:
# Lets try again
# Using GridSearch to find optimal Params

param_grid = {'lr_all' : [.005, .04], 'reg_all' : [.01,.1], 'n_epochs' : [15,25]}
print("Starting GridSearch")
gs = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=5)
gs.fit(data)

# Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

Starting GridSearch
0.7847288683576945
{'lr_all': 0.005, 'reg_all': 0.01, 'n_epochs': 15}


In [96]:
# Lets try again
# Using GridSearch to find optimal Params

param_grid = {'lr_all' : [.001, .02], 'reg_all' : [.005,.1], 'n_epochs' : [10,20]}
print("Starting GridSearch")
gs = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=5)
gs.fit(data)

# Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

Starting GridSearch
0.7844650185645639
{'lr_all': 0.02, 'reg_all': 0.1, 'n_epochs': 20}


# Best Params currently for cv=5:
rmse = 0.7613563832864235

{'lr_all': 0.02, 'reg_all': 0.05, 'n_epochs': 18}
