In [1]:
import os
import json
import pandas as pd

In [2]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import SVD, evaluate
from surprise.model_selection import cross_validate, train_test_split
from surprise.model_selection import GridSearchCV


# Testing on Final Data

In [8]:
file_path = os.path.expanduser('finaldata.csv')

In [9]:
reader = Reader(line_format='user item rating', sep=',')

In [10]:
data = Dataset.load_from_file(file_path, reader=reader)

In [11]:
data.split(n_folds=5)

## Chosen Model: SVD (results of other models at the bottom)

In [16]:
svd = SVD()

In [17]:
evaluate(svd, data, measures=['RMSE', 'MAE'])



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 1.0919
MAE:  0.8586
------------
Fold 2
RMSE: 1.0911
MAE:  0.8584
------------
Fold 3
RMSE: 1.0906
MAE:  0.8573
------------
Fold 4
RMSE: 1.0926
MAE:  0.8598
------------
Fold 5
RMSE: 1.0926
MAE:  0.8592
------------
------------
Mean RMSE: 1.0917
Mean MAE : 0.8587
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [1.0919205372150012,
                             1.0911238090890574,
                             1.0905641164574549,
                             1.0925744889239362,
                             1.0925621922072946],
                            'mae': [0.858605564047045,
                             0.8583596034838482,
                             0.8573090440636588,
                             0.8597598485326621,
                             0.8592491339068206]})

## Gridsearch 1 -- tuning bias and learning rate

In [23]:
param_grid = {'biased': [True, False], 'lr_all': [0.001, 0.005, 0.05]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
# 'n_epochs': [10, 20, 50],'reg_all': [0.02, 0.05, 0.08]

In [24]:
gs.fit(data)

In [25]:
results = gs.cv_results

In [26]:
results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df.to_csv('results.csv')

In [27]:
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_biased,param_lr_all
0,1.121815,1.123022,1.122295,1.122377,0.000496,2,0.894917,0.89533,0.895943,0.895397,0.000421,3,101.287115,12.756166,55.179468,56.157928,"{'biased': True, 'lr_all': 0.001}",True,0.001
1,1.095546,1.096588,1.09628,1.096138,0.000437,1,0.863294,0.863555,0.863951,0.8636,0.00027,1,122.237027,4.096766,125.242564,10.203921,"{'biased': True, 'lr_all': 0.005}",True,0.005
2,1.145895,1.146561,1.146847,1.146434,0.000399,3,0.887526,0.887307,0.888239,0.88769,0.000398,2,415.258017,388.697796,2648.197713,2215.837836,"{'biased': True, 'lr_all': 0.05}",True,0.05
3,2.665475,2.674566,2.661799,2.66728,0.005366,6,2.331765,2.341412,2.32667,2.333283,0.006113,6,129.907277,11.421524,178.133461,49.572544,"{'biased': False, 'lr_all': 0.001}",False,0.001
4,1.397292,1.40031,1.397937,1.398513,0.001297,5,1.110692,1.112792,1.11097,1.111485,0.000931,5,107.601211,8.414771,120.542719,10.293368,"{'biased': False, 'lr_all': 0.005}",False,0.005
5,1.246154,1.24663,1.246168,1.246317,0.000221,4,0.982923,0.982756,0.983399,0.983026,0.000272,4,105.472106,6.496184,81.059657,12.616613,"{'biased': False, 'lr_all': 0.05}",False,0.05


#### Best LR = 0.005, Bias = True

## Gridsearch 2 -- tuning n_factors

In [47]:
param_grid = {'n_factors': [1, 5, 10], 'biased': [True], 'lr_all': [0.005],
              'n_epochs': [20]}
             
gs2 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
# 'n_epochs': [10, 20, 50],'reg_all': [0.02, 0.05, 0.08]

In [48]:
gs2.fit(data)

In [49]:
results = gs2.cv_results

In [50]:
results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_factors,param_biased,param_lr_all,param_n_epochs
0,1.086359,1.085331,1.084574,1.085421,0.000731,1,0.855883,0.854242,0.853917,0.85468,...,1,30.64818,0.463524,11.101873,1.299885,"{'n_factors': 1, 'biased': True, 'lr_all': 0.0...",1,True,0.005,20
1,1.086826,1.085677,1.085088,1.085864,0.000722,2,0.856256,0.854531,0.85441,0.855066,...,2,32.52039,1.041553,10.938498,0.78254,"{'n_factors': 5, 'biased': True, 'lr_all': 0.0...",5,True,0.005,20
2,1.08733,1.086413,1.0856,1.086447,0.000707,3,0.856862,0.855194,0.854764,0.855607,...,3,35.638312,0.684767,9.675809,0.923847,"{'n_factors': 10, 'biased': True, 'lr_all': 0....",10,True,0.005,20


#### Best - 1 factor (also tested higher values)

# Gridsearch 3 -- tuning n_epochs

In [19]:
param_grid = {'n_epochs': [10, 20, 50], 'n_factors': [1], 'biased': [True], 'lr_all': [0.005]}
gs3 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

In [20]:
gs3.fit(data)

In [21]:
results = gs3.cv_results
results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_n_factors,param_biased,param_lr_all
0,1.096411,1.095981,1.096711,1.096368,0.000299,2,0.869464,0.869122,0.869546,0.869377,...,2,32.706436,1.962642,11.108491,0.908315,"{'n_epochs': 10, 'n_factors': 50, 'biased': Tr...",10,50,True,0.005
1,1.091622,1.091048,1.091759,1.091476,0.000308,1,0.85999,0.859374,0.860053,0.859806,...,1,70.06996,8.85121,25.61329,14.807396,"{'n_epochs': 20, 'n_factors': 50, 'biased': Tr...",20,50,True,0.005
2,1.140297,1.13845,1.139194,1.139314,0.000759,3,0.885813,0.884742,0.884791,0.885116,...,3,196.621014,28.375468,20.432284,8.81673,"{'n_epochs': 50, 'n_factors': 50, 'biased': Tr...",50,50,True,0.005


#### Best n_epochs = 20

# Gridsearch 4: Tuning reg_all

In [51]:
param_grid = {'n_epochs': [20], 'n_factors': [1], 'biased': [True], 
              'lr_all': [0.005], 'reg_all': [0.08, 0.09, 0.1]}
gs4 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)


In [52]:
gs4.fit(data)

In [53]:
results = gs4.cv_results
results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_n_factors,param_biased,param_lr_all,param_reg_all
0,1.086356,1.086841,1.08346,1.085552,0.001493,1,0.857579,0.857823,0.855411,0.856938,...,30.341158,0.694978,10.497517,1.248068,"{'n_epochs': 20, 'n_factors': 1, 'biased': Tru...",20,1,True,0.005,0.08
1,1.08641,1.086908,1.08349,1.085603,0.001508,2,0.857953,0.858199,0.85576,0.857304,...,28.688961,0.449683,10.339854,0.66311,"{'n_epochs': 20, 'n_factors': 1, 'biased': Tru...",20,1,True,0.005,0.09
2,1.086446,1.086953,1.083577,1.085659,0.001487,3,0.858297,0.858557,0.856139,0.857664,...,28.932282,0.946691,10.922811,0.134936,"{'n_epochs': 20, 'n_factors': 1, 'biased': Tru...",20,1,True,0.005,0.1


In [54]:
results_df[['rank_test_rmse', 'rank_test_mae']]

Unnamed: 0,rank_test_rmse,rank_test_mae
0,1,1
1,2,2
2,3,3


# Other Models (not used)

In [None]:
svdpp = SVDpp()

In [None]:
evaluate(svdpp, data, measures=['RMSE', 'MAE'])

In [18]:
nmf = NMF()

In [19]:
evaluate(nmf, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm NMF.

------------
Fold 1
RMSE: 1.3794
MAE:  1.0783
------------
Fold 2
RMSE: 1.3801
MAE:  1.0787
------------
Fold 3
RMSE: 1.3803
MAE:  1.0797
------------
Fold 4
RMSE: 1.3790
MAE:  1.0774
------------
Fold 5
RMSE: 1.3789
MAE:  1.0773
------------
Fold 6
RMSE: 1.3810
MAE:  1.0798
------------
Fold 7
RMSE: 1.3789
MAE:  1.0777
------------
Fold 8
RMSE: 1.3796
MAE:  1.0786
------------
Fold 9
RMSE: 1.3799
MAE:  1.0785
------------
Fold 10
RMSE: 1.3790
MAE:  1.0779
------------
------------
Mean RMSE: 1.3796
Mean MAE : 1.0784
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [1.3794350253411565,
                             1.3800631753244952,
                             1.3803470449633781,
                             1.3790474183963373,
                             1.3788653317907658,
                             1.3809921876160387,
                             1.3788922356505084,
                             1.3796261813374224,
                             1.3799325331811927,
                             1.3790269179343493],
                            'mae': [1.0782924691081697,
                             1.078739278586975,
                             1.07971219559644,
                             1.077435373740628,
                             1.0773078568785115,
                             1.0797835618332265,
                             1.0776545359866114,
                             1.0785939513347529,
                             1.0785215794765843,
                        