In [1]:
import pandas as pd

from surprise import NormalPredictor
from surprise import SVD
from surprise import SlopeOne
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, BaselineOnly, CoClustering, SVDpp
from surprise.accuracy import rmse
from surprise.model_selection import train_test_split as surprise_train_test_split

from sklearn import pipeline

# set random state for reproducibility
kwargs = dict(random_state=42)

In [2]:
ratings = pd.read_csv('../../data/preprocessed/ratings_clean_std_0.csv', sep=',')

In [3]:
mappings = pd.read_csv('../../data/preprocessed/movies_id_updated.csv', sep=',')[['id', 'imdbID']].rename(columns={'id': 'movieID'})

In [4]:
ratings.drop(columns=['Unnamed: 0'])

Unnamed: 0,user_id,imdbID,rating
0,1264,tt0047034,3.5
1,213,tt0304141,2.5
2,593,tt0369436,3.0
3,609,tt1077258,4.0
4,1590,tt0052182,4.0
...,...,...,...
787536,1032,tt0083530,3.0
787537,99,tt0107798,3.0
787538,333,tt0093857,3.0
787539,49,tt0144168,3.0


In [5]:
reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(ratings[['user_id', 'imdbID', 'rating']], reader)

## Finding optimal Parameters for Collaborative Filtering Approach

In [None]:
# Using GridSearch to find optimal Params
# I ran this cell multiple times trying different intervalls
# adjusting the intervalls if the optimal parameter was at the intervall border

param_grid = {'lr_all' : [.001, .02], 'reg_all' : [.05,.2], 'n_epochs' : [18,25]}
print("Starting GridSearch")
gs = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=5)
gs.fit(data)

#Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

In [None]:
svd = gs.best_estimator['rmse']

In [None]:
trainset, testset = surprise_train_test_split(data, test_size=0.2, shuffle=True, **kwargs)

In [None]:
svd.fit(trainset)

In [None]:
preds = svd.test(testset)

In [None]:
# Printing first 50 predicitons for overview
for i in range (0,50):
    print(preds[i])

In [None]:
rmse(preds)

In [None]:
# Lets try again
# Using GridSearch to find optimal Params

param_grid = {'lr_all' : [.005, .04], 'reg_all' : [.01,.1], 'n_epochs' : [15,25]}
print("Starting GridSearch")
gs = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=5)
gs.fit(data)

# Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

In [None]:
# Lets try again
# Using GridSearch to find optimal Params

param_grid = {'lr_all' : [.001, .02], 'reg_all' : [.005,.1], 'n_epochs' : [10,20]}
print("Starting GridSearch")
gs = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=5)
gs.fit(data)

# Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

# Best Params currently for cv=5:
rmse = 0.7613563832864235

{'lr_all': 0.02, 'reg_all': 0.05, 'n_epochs': 18}


# Matrix Factorization.SVD

In [None]:
# SVD based GridSearch to find optimal Params
# adjusting the intervalls if the optimal parameter was at the intervall border

param_grid = {'lr_all' : [.001, 0.005, .02], 'reg_all' : [.02, .05,.2], 'n_epochs' : [10,20,25]}
print("Starting GridSearch")
gs = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=5)
gs.fit(data)

#Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

In [None]:
pd.DataFrame(gs.cv_results).to_csv('SVD_hyperparameters_1.csv')

In [6]:
# SVD based GridSearch to find optimal Params
# adjusting the intervalls if the optimal parameter was at the intervall border

param_grid = {'lr_all' : [.01, .025, .03], 'reg_all' : [.05, .07, .1], 'n_epochs' : [20, 21, 22]}
print("Starting GridSearch")
gs = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=5)
gs.fit(data)

#Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

pd.DataFrame(gs.cv_results).to_csv('SVD_hyperparameters_2.csv')

Starting GridSearch
0.7631045735270001
{'lr_all': 0.01, 'reg_all': 0.05, 'n_epochs': 22}


In [7]:
# SVDpp based GridSearch to find optimal Params
# adjusting the intervalls if the optimal parameter was at the intervall border

param_grid = {'lr_all' : [.07, .01,], 'reg_all' : [.02, .05], 'n_epochs' : [20, 22, 23]}
print("Starting GridSearch")
gs = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=5)
gs.fit(data)

#Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

pd.DataFrame(gs.cv_results).to_csv('SVDpp_hyperparameters_2.csv')

Starting GridSearch
0.7629197090959767
{'lr_all': 0.01, 'reg_all': 0.05, 'n_epochs': 23}


In [9]:
# SVDpp based GridSearch to find optimal Params
# adjusting the intervalls if the optimal parameter was at the intervall border

param_grid = {'lr_all' : [.02, .015,], 'reg_all' : [.06, .055], 'n_epochs' : [24, 23]}
print("Starting GridSearch")
gs = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=5)
gs.fit(data)

#Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

pd.DataFrame(gs.cv_results).to_csv('SVDpp_hyperparameters_3.csv')

Starting GridSearch
0.7626500003659359
{'lr_all': 0.015, 'reg_all': 0.055, 'n_epochs': 24}
