In [9]:
import pandas as pd
import surprise
from surprise import SVD, Dataset, accuracy
from surprise.model_selection import (
    GridSearchCV,
    KFold,
    cross_validate,
    train_test_split,
)



# Load Data

In [14]:
# To build this csv, use the 2_create_features_datasets notebook
df = pd.read_csv("ratings.csv")
df

Unnamed: 0,user_id,content_id,count,engaged_pct
0,000a544834,5c7ee2dd80,1,0.500000
1,000a544834,f8e78525fc,1,0.333333
2,000d2a6006,03bc268791,1,0.333333
3,000d2a6006,0b3e27a1d1,1,0.333333
4,000d2a6006,1d39c1c449,3,0.750000
...,...,...,...,...
120237,fff07ebc97,5df80dbcc0,2,0.500000
120238,fff07ebc97,bed9cfbff5,2,0.500000
120239,fff5c815f7,1c64591529,1,1.000000
120240,fff5c815f7,7e36cdfc2c,2,0.666667


# Find best model

In [16]:
reader = surprise.Reader(rating_scale=(0, 1))
data = surprise.Dataset.load_from_df(
    df[["user_id", "content_id", "engaged_pct"]], reader
)


## Baselines

#%%timeit
benchmark = []
# Iterate over all algorithms

algorithms = [
    surprise.SVDpp(),
    surprise.SVD(),  
    surprise.SlopeOne(),  
    surprise.NormalPredictor(),  
    surprise.KNNBaseline(),  
    surprise.KNNBasic(),  
    surprise.KNNWithMeans(),
    surprise.KNNWithZScore(),
    surprise.BaselineOnly(),
    surprise.CoClustering(),
]

for algorithm in algorithms:
    # Perform cross validation
    results = surprise.model_selection.cross_validate(
        algorithm,
        data,
        measures=["RMSE", "MAE"],
        cv=3,
        verbose=False,  # root mean squared error (RMSE), mean absolute error (MAE)
    )

    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)

    tmp = tmp.append(
        pd.Series([str(algorithm).split(" ")[0].split(".")[-1]], index=["Algorithm"])
    )
    benchmark.append(tmp)

benchmark_df = pd.DataFrame(benchmark).set_index("Algorithm").sort_values("test_rmse")
benchmark_df

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNNBaseline,0.175178,0.135861,3.03777,9.65708
KNNBasic,0.183468,0.141026,2.591997,8.755881
SVDpp,0.186802,0.147781,32.252909,1.272077
KNNWithZScore,0.188756,0.143714,2.847728,9.037068
BaselineOnly,0.189615,0.151536,0.312232,0.262376
KNNWithMeans,0.190214,0.146833,2.824875,8.97223
SlopeOne,0.192055,0.146116,0.244545,0.961478
SVD,0.196516,0.155755,5.435284,0.263288
NormalPredictor,0.340322,0.275462,0.119387,0.410649
CoClustering,0.437356,0.381443,2.052386,0.321698


# Evaluate Best Model

In [17]:
trainset, testset = train_test_split(data, test_size=0.25)

print("Using ALS")
param_grid = {"n_epochs": [5, 10], "lr_all": [0.002, 0.005], "reg_all": [0.4, 0.6]}
gs = GridSearchCV(surprise.SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

# best RMSE score
best_score = gs.best_score["rmse"]
best_params = gs.best_params["rmse"]
best_estimator = gs.best_estimator["rmse"]

print("best score: ", best_score)
print("best params: ", best_params)

results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df

## Get Best Params
# Make PRedications

predictions = best_estimator.fit(trainset).test(testset)
accuracy.rmse(predictions)

final_set = best_estimator.fit(data.build_full_trainset()) # Final Model Object

Using ALS
best score:  0.19373313294389496
best params:  {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
RMSE: 0.1927


In [18]:
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_lr_all,param_reg_all
0,0.212381,0.212324,0.212035,0.212247,0.000152,8,0.170697,0.170746,0.170804,0.170749,4.4e-05,7,1.214761,0.023464,0.442323,0.152905,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}",5,0.002,0.4
1,0.212129,0.211898,0.211939,0.211989,0.0001,7,0.170939,0.170602,0.170995,0.170845,0.000173,8,1.185524,0.004451,0.442039,0.138834,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}",5,0.002,0.6
2,0.199562,0.199139,0.198884,0.199195,0.00028,3,0.160395,0.159826,0.160022,0.160081,0.000236,3,1.209756,0.023496,0.327322,0.141471,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}",5,0.005,0.4
3,0.201358,0.200891,0.200598,0.200949,0.000313,4,0.162245,0.161638,0.161876,0.16192,0.00025,4,1.170589,0.008122,0.433644,0.146224,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}",5,0.005,0.6
4,0.202572,0.202114,0.202136,0.202274,0.000211,5,0.162667,0.162174,0.162326,0.162389,0.000206,5,2.314574,0.008042,0.329967,0.146346,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}",10,0.002,0.4
5,0.203949,0.203501,0.203558,0.203669,0.000199,6,0.16415,0.163607,0.164071,0.163943,0.000239,6,2.324113,0.006758,0.427966,0.139644,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}",10,0.002,0.6
6,0.19422,0.193618,0.193362,0.193733,0.000359,1,0.156023,0.155264,0.155571,0.155619,0.000312,1,2.316798,0.010823,0.327753,0.139793,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}",10,0.005,0.4
7,0.197132,0.196536,0.196386,0.196685,0.000322,2,0.15873,0.158013,0.158401,0.158381,0.000293,2,2.345063,0.013219,0.427282,0.13331,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}",10,0.005,0.6
