# Template 

- Author: Israel Oliveira [\[e-mail\]](mailto:prof.israel@gmail.com)

In [1]:
# Load the `watermark` IPython extension so the %watermark magic used further down is available.
%load_ext watermark

In [11]:
import random

import numpy as np
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import NormalPredictor
from surprise import Reader
from surprise.model_selection import cross_validate, GridSearchCV

In [7]:
# Run this cell before close.
%watermark
%watermark --iversion
%watermark -b -r -g -p surprise

2020-06-18T01:43:23+00:00

CPython 3.7.7
IPython 7.15.0

compiler   : GCC 8.3.0
system     : Linux
release    : 5.4.0-7626-generic
machine    : x86_64
processor  : 
CPU cores  : 8
interpreter: 64bit
pandas 1.0.4

surprise 0.1
Git hash: 3ef158de7aa83e52fac9bbb8c836721248339cea
Git repo: https://github.com/ysraell/aceleradev_private.git
Git branch: master


In [4]:
# Seed both RNGs so the random fold split and the SVD initialisation are repeatable
# across runs (the approach the Surprise FAQ recommends for reproducible experiments).
random.seed(0)
np.random.seed(0)

# Load the MovieLens-100k dataset (Surprise offers to download it if it is missing).
data = Dataset.load_builtin('ml-100k')

# SVD recommender with its default hyper-parameters.
algo = SVD()

# Run 5-fold cross-validation, print the per-fold table and display the results dict.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] 

 Y


Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9355  0.9347  0.9377  0.9316  0.9385  0.9356  0.0024  
MAE (testset)     0.7382  0.7352  0.7371  0.7348  0.7417  0.7374  0.0025  
Fit time          4.39    4.47    4.44    4.46    4.39    4.43    0.04    
Test time         0.21    0.21    0.16    0.16    0.20    0.19    0.03    


{'test_rmse': array([0.93552051, 0.93474206, 0.93768054, 0.93158504, 0.9384802 ]),
 'test_mae': array([0.73816693, 0.73522536, 0.73710471, 0.73475422, 0.74174181]),
 'fit_time': (4.386546611785889,
  4.470555543899536,
  4.444119691848755,
  4.462865114212036,
  4.385389804840088),
 'test_time': (0.2136545181274414,
  0.21139836311340332,
  0.15526127815246582,
  0.15820074081420898,
  0.19924402236938477)}

In [8]:
# Seed both RNGs: the 2-fold split is random and NormalPredictor draws its
# predictions from a random distribution, so unseeded runs give different scores.
random.seed(0)
np.random.seed(0)

# Build a toy ratings table. The column names themselves are irrelevant.
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, 'user_foo'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)

# A Reader is still needed, but only its rating_scale parameter is required.
reader = Reader(rating_scale=(1, 5))

# The columns must be passed in this order: user id, item id, rating.
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

# The dataset can now be used like any other, e.g. by calling cross_validate.
cross_validate(NormalPredictor(), data, cv=2)

{'test_rmse': array([2.26221334, 1.63918369]),
 'test_mae': array([2.17796432, 1.47941602]),
 'fit_time': (0.0001049041748046875, 4.3392181396484375e-05),
 'test_time': (6.651878356933594e-05, 3.123283386230469e-05)}

In [12]:
# Seed both RNGs so the 3-fold splits and every SVD fit in the search are repeatable;
# otherwise the best score/parameters can change from one run to the next.
random.seed(0)
np.random.seed(0)

# Use MovieLens-100k.
data = Dataset.load_builtin('ml-100k')

# Every combination of these values is evaluated (2 x 2 x 2 = 8 candidates).
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}

# Exhaustive search over param_grid, scoring each candidate with 3-fold cross-validation.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# Best (lowest) mean RMSE found by the search.
print(gs.best_score['rmse'])

# Parameter combination that achieved that RMSE.
print(gs.best_params['rmse'])

0.9643920685911608
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [13]:
# Take the estimator configured with the best-RMSE parameters found by the grid search...
algo = gs.best_estimator['rmse']

# ...and train it on every available rating (no hold-out split).
full_trainset = data.build_full_trainset()
algo.fit(full_trainset)

# Collect the per-candidate cross-validation results into a DataFrame for inspection.
results_df = pd.DataFrame.from_dict(gs.cv_results)

In [15]:
# Preview the first five rows of the grid-search results table.
results_df.head(n=5)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_lr_all,param_reg_all
0,0.994291,1.000124,0.997607,0.99734,0.002389,7,0.804335,0.808565,0.805497,0.806132,0.001784,7,0.922698,0.002489,0.318507,0.001291,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}",5,0.002,0.4
1,1.000346,1.006967,1.003122,1.003478,0.002715,8,0.812799,0.817786,0.813831,0.814806,0.002149,8,0.897257,0.001465,0.297776,0.026065,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}",5,0.002,0.6
2,0.971721,0.977091,0.974069,0.974294,0.002198,3,0.781422,0.784645,0.781249,0.782439,0.001562,2,0.899639,0.002972,0.296867,0.026729,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}",5,0.005,0.4
3,0.980061,0.986013,0.982534,0.982869,0.002441,5,0.791727,0.795796,0.791917,0.793147,0.001875,5,0.901261,0.002445,0.299379,0.029052,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}",5,0.005,0.6
4,0.975413,0.981611,0.978322,0.978449,0.002532,4,0.784665,0.78905,0.785044,0.786253,0.001984,4,1.793217,0.004894,0.292091,0.026019,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}",10,0.002,0.4
