In [9]:
import pandas as pd
import surprise
from surprise import SVD, Dataset, accuracy
from surprise.model_selection import (
    GridSearchCV,
    KFold,
    cross_validate,
    train_test_split,
)



# Load Data

In [7]:
df = pd.read_csv("ratings.csv")
df

Unnamed: 0,user_id,content_id,engaged_pct
0,000a544834,34caa29b86,0.000000
1,000a544834,3d0e786812,0.000000
2,000a544834,5c7ee2dd80,0.500000
3,000a544834,6f7ca40e85,0.000000
4,000a544834,745115bd62,0.000000
...,...,...,...
197557,fff5c815f7,b11fec4c92,0.000000
197558,fff5c815f7,b2f77c9143,0.666667
197559,fff5c815f7,bed9cfbff5,0.000000
197560,fff5c815f7,c29f53db23,0.000000


# Find best model

In [10]:
reader = surprise.Reader(rating_scale=(0, 1))
data = surprise.Dataset.load_from_df(
    df[["user_id", "content_id", "engaged_pct"]], reader
)


## Baselines

#%%timeit
benchmark = []
# Iterate over all algorithms

algorithms = [
    surprise.SVDpp(),  # works
    surprise.SVD(),  # wors
    surprise.SlopeOne(),  # works
    surprise.NormalPredictor(),  # works
    surprise.KNNBaseline(),  # works
    surprise.KNNBasic(),  # works
    surprise.KNNWithMeans(),
    surprise.KNNWithZScore(),
    surprise.BaselineOnly(),
    surprise.CoClustering(),
    # surprise.NMF(),  # division by 0 error
]

for algorithm in algorithms:
    # Perform cross validation
    results = surprise.model_selection.cross_validate(
        algorithm,
        data,
        measures=["RMSE", "MAE"],
        cv=3,
        verbose=False,  # root mean squared error (RMSE), mean absolute error (MAE)
    )

    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)

    tmp = tmp.append(
        pd.Series([str(algorithm).split(" ")[0].split(".")[-1]], index=["Algorithm"])
    )
    benchmark.append(tmp)

benchmark_df = pd.DataFrame(benchmark).set_index("Algorithm").sort_values("test_rmse")
benchmark_df

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


AttributeError: 'NoneType' object has no attribute 'best_score'

# Use Best Model

In [None]:
# how to interprest RMSE and MAE

# common sparsity measure

trainset, testset = train_test_split(data, test_size=0.25)
# algo = BaselineOnly(bsl_options=bsl_options)


print("Using ALS")
param_grid = {"n_epochs": [5, 10], "lr_all": [0.002, 0.005], "reg_all": [0.4, 0.6]}
gs = GridSearchCV(surprise.SVD, param_grid, measures=["rmse", "mae"], cv=3)

fit_gs = gs.fit(data)

# best RMSE score
best_score = fit_gs.best_score["rmse"]
best_params = fit_gs.best_params["rmse"]
best_estimator = fit_gs.best_estimator["rmse"]

print("best score: ", best_score)
print("best params: ", best_params)

results_df = pd.DataFrame.from_dict(fit_gs.cv_results)
results_df

## Get Best Params
# Make PRedications

predictions = best_estimator.fit(trainset).test(testset)
accuracy.rmse(predictions)

final_set = best_estimator.fit(data.build_full_trainset()) # Final Model Object

In [3]:
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_lr_all,param_reg_all
0,0.295322,0.293939,0.294456,0.294572,0.000571,7,0.251726,0.250547,0.251384,0.251219,0.000495,7,1.99104,0.02635,0.701607,0.282155,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}",5,0.002,0.4
1,0.296912,0.295976,0.29645,0.296446,0.000382,8,0.254474,0.25369,0.254371,0.254178,0.000348,8,1.93323,0.005187,0.738325,0.290543,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}",5,0.002,0.6
2,0.285355,0.283811,0.284382,0.284516,0.000637,3,0.242571,0.241179,0.241975,0.241909,0.00057,2,1.924087,0.01553,0.826607,0.308422,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}",5,0.005,0.4
3,0.288362,0.286809,0.287354,0.287508,0.000643,5,0.246697,0.24531,0.246085,0.246031,0.000568,5,1.936892,0.022295,0.732525,0.286255,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}",5,0.005,0.6
4,0.287816,0.286424,0.287096,0.287112,0.000568,4,0.245213,0.243915,0.244871,0.244666,0.000549,4,3.857526,0.051781,0.711694,0.269233,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}",10,0.002,0.4
5,0.290979,0.289564,0.290065,0.290203,0.000586,6,0.249297,0.248115,0.248773,0.248728,0.000483,6,3.82775,0.021648,0.611864,0.160378,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}",10,0.002,0.6
6,0.281846,0.280254,0.280657,0.280919,0.000676,1,0.238842,0.237234,0.237857,0.237978,0.000662,1,3.830917,0.027978,0.802404,0.271063,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}",10,0.005,0.4
7,0.285293,0.283789,0.284174,0.284419,0.000638,2,0.243472,0.242023,0.242619,0.242705,0.000595,3,3.859669,0.055616,0.743323,0.232005,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}",10,0.005,0.6
