In [1]:
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from scipy.sparse.linalg import svds
from surprise import Reader
from surprise import Dataset
import pandas as pd
import numpy as np

# One seed for the whole notebook: seeding NumPy's global generator keeps
# NumPy-driven randomness repeatable when the notebook is run top to bottom.
rs = 49
np.random.seed(seed=rs)

# Collaborative Filtering Recommender Systems
Load data

In [2]:
from surprise.model_selection.validation import cross_validate
from surprise import SVD
from surprise import KNNWithMeans
from IPython.display import clear_output
from surprise.model_selection import GridSearchCV


# Read the pickled training ratings and hand the user / item / rating columns
# to Surprise; the reader declares the rating scale as 1 to 5.
reader = Reader(rating_scale=(1, 5))
df = pd.read_pickle("files/train.pkl")
training = Dataset.load_from_df(df.loc[:, ['reviewerID', 'asin', 'overall']], reader)


Tune the kNN hyperparameters (number of neighbours `k` and similarity measure) using `GridSearchCV`.
*Note: the search can also be run separately with `'user_based': [True, False]` to compare user-based and item-based similarities.*

In [3]:
# Search space: neighbourhood size crossed with every similarity measure,
# restricted to user-based similarities.
param_grid_knn = dict(
    k=[2, 3, 4, 6, 8, 16, 32, 50],
    sim_options=dict(
        name=['cosine', 'msd', 'pearson', 'pearson_baseline'],
        user_based=[True],
    ),
)

# 5-fold cross-validated grid search scored on RMSE.
gs_knn = GridSearchCV(KNNWithMeans, param_grid_knn, measures=['rmse'], cv=5)
gs_knn.fit(training)
clear_output(wait=True)  # discard whatever the fits wrote to the cell output

# Keep the full per-combination table for the visualisations further down.
res_knn = pd.DataFrame(gs_knn.cv_results)

# Best RMSE first, then the parameter combination that achieved it.
print(gs_knn.best_score['rmse'], gs_knn.best_params['rmse'], sep="\n")

1.188650290490441
{'k': 4, 'sim_options': {'name': 'pearson', 'user_based': True}}


In [4]:
# res_knn.to_pickle("files/gs_results_knn.pkl")
# res_knn = pd.read_pickle("files/gs_results_knn.pkl")

## SVD 

In [5]:
# Candidate SVD settings: number of epochs, number of latent factors and the
# learning rate applied to all parameters.
param_grid_svd = {
    'n_epochs': [100, 250, 500, 750, 1000],
    'n_factors': [20, 50, 75, 100],
    'lr_all': [0.01, 0.05, 0.1],
}

# Off by default: cross-validating the whole parameter space takes a while
# (<30min on the original author's setup). Flip the flag to re-run the search.
RUN_SVD_GRID_SEARCH = False
if RUN_SVD_GRID_SEARCH:
    gs_svd = GridSearchCV(SVD, param_grid_svd, measures=['rmse'], cv=5)
    gs_svd.fit(training)
    res_svd = pd.DataFrame(gs_svd.cv_results)

    print(gs_svd.best_score['rmse'])   # best RMSE score
    print(gs_svd.best_params['rmse'])  # combination of parameters that gave the best RMSE score


1.1048500735336413
{'n_epochs': 100, 'n_factors': 75, 'lr_all': 0.1}


In [6]:
# Load the cached SVD grid-search table instead of re-running the search.
# After a fresh search the cache can be rewritten with:
#     res_svd.to_pickle("files/gs_results_svd.pkl")
res_svd = pd.read_pickle(filepath_or_buffer="files/gs_results_svd.pkl")
res_svd.columns

Index(['split0_test_rmse', 'split1_test_rmse', 'split2_test_rmse',
       'split3_test_rmse', 'split4_test_rmse', 'mean_test_rmse',
       'std_test_rmse', 'rank_test_rmse', 'mean_fit_time', 'std_fit_time',
       'mean_test_time', 'std_test_time', 'params', 'param_n_epochs',
       'param_n_factors', 'param_lr_all'],
      dtype='object')

## Fit on whole training set and predict all non-rated items

In [8]:
# Train on every known rating, then build the anti-testset: the (user, item)
# pairs that have no rating in the training data.
trainset = training.build_full_trainset()
test = trainset.build_anti_testset()

# The tuned parameter dict holds exactly the constructor arguments searched
# above ("k" and "sim_options"), so it is unpacked straight into the model.
knn = KNNWithMeans(**gs_knn.best_params["rmse"])
knn.fit(trainset)

# One row per predicted pair; show how many there are and the mean estimate.
preds_knn = pd.DataFrame(knn.test(test, verbose=False))
len(preds_knn), preds_knn["est"].mean()

Computing the pearson similarity matrix...
Done computing similarity matrix.


(1449029, 3.820406900664076)

In [57]:
# Tuned SVD hyper-parameters recorded from an earlier grid search; they are
# used whenever the search has not been re-run in this kernel session.
best_n_epochs, best_n_factors, best_lr_all = 100, 75, 0.1

# At notebook top level vars() and globals() are the same namespace, so one
# lookup is enough to detect a freshly fitted grid search.
if 'gs_svd' in globals():
    best_n_epochs = gs_svd.best_params["rmse"]["n_epochs"]
    best_n_factors = gs_svd.best_params["rmse"]["n_factors"]
    best_lr_all = gs_svd.best_params["rmse"]["lr_all"]

# Pass the notebook seed explicitly: without random_state the model draws from
# NumPy's global generator, so the fit would depend on which cells ran before
# this one and would change every time the cell is re-executed.
svd = SVD(n_factors=best_n_factors,
          n_epochs=best_n_epochs,
          lr_all=best_lr_all,
          random_state=rs).fit(trainset)

# Score the same anti-testset as the kNN model so the two are comparable.
preds_svd = pd.DataFrame(svd.test(test, verbose=False))
len(preds_svd), preds_svd["est"].mean()


(1449029, 3.7120347619773235)

In [70]:
# Held-out ratings, also run through Surprise to obtain a Surprise test set.
df_test = pd.read_pickle("files/test.pkl")
dft = (
    Dataset.load_from_df(df_test[['reviewerID', 'asin', 'overall']], reader)
    .build_full_trainset()
    .build_testset()
)

# NOTE(review): the frame is built from df_test rather than dft, so renaming
# the integer labels 0/1/2 presumably matches no column and dft goes unused —
# confirm whether pd.DataFrame(dft) was intended here.
testset = pd.DataFrame(df_test).rename(
    columns={0: "reviewerID", 1: "asin", 2: "overall"}
)

# Persist the test frame and both models' predictions as (uid, iid, score).
testset.to_pickle("files/testset.pkl")
preds_knn.loc[:, ["uid", "iid", "est"]].rename(columns={"est": "score"}).to_pickle("files/preds_knn.pkl")
preds_svd.loc[:, ["uid", "iid", "est"]].rename(columns={"est": "score"}).to_pickle("files/preds_svd.pkl")

## Grid search visualizations

In [None]:
## kNN
# Discrete heatmap of the cross-validated RMSE, one row per k and one column
# per similarity measure; discrete cells avoid the sense of interpolation
# between similarity measures.
import plotly.graph_objects as go

# Fold the flat result column into a (k, similarity) matrix once and reuse it
# for both the colours and the cell labels. The explicit shape makes the
# reshape raise if res_knn does not match param_grid_knn (for example after a
# run with 'user_based': [True, False]) instead of plotting mislabelled columns.
# NOTE(review): assumes rows are ordered with k outermost and the similarity
# name innermost — confirm against the GridSearchCV result ordering.
knn_rmse_grid = res_knn["mean_test_rmse"].to_numpy().reshape(
    (len(param_grid_knn["k"]), len(param_grid_knn["sim_options"]["name"]))
)

fig = go.Figure(data=
    go.Heatmap(
        z=knn_rmse_grid,
        # Rows are placed at evenly spaced positions; the real k values are
        # attached as tick labels below.
        y=np.arange(len(param_grid_knn["k"])),
        x=param_grid_knn["sim_options"]["name"],
        text=knn_rmse_grid,
        texttemplate="%{text:.4f}",
        textfont={"size": 20},
        # Nested title form; the flat titleside/titlefont attributes are
        # deprecated and rejected by newer Plotly releases.
        colorbar=dict(
            title=dict(
                text='mean_test_rmse',
                side='right',
                font=dict(size=18),
            )),
        ))
fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = np.arange(len(param_grid_knn["k"])),
        ticktext = param_grid_knn["k"]
    ))
fig.update_yaxes(title="k neighbours")


fig.show()
fig.write_image("images/gs_heatmap.pdf")


In [48]:
# 3-D view of the SVD grid: one point per row of res_svd, positioned by the
# three searched parameters and coloured by mean cross-validated RMSE.
import plotly.express as px

fig = px.scatter_3d(
    data_frame=res_svd,
    x='param_n_epochs',
    y='param_n_factors',
    z='param_lr_all',
    color='mean_test_rmse',
    width=800,
    height=600,
)
fig.show()
fig.write_image("images/gs_scatter.pdf")


In [66]:
# Imported here as well so this cell does not depend on the kNN heatmap cell
# having been executed first.
import plotly.graph_objects as go

# One row per (n_epochs, n_factors) pair: the last row of each group in
# res_svd order (a second groupby/head(1) on single-row groups is a no-op).
# NOTE(review): presumably this keeps the last lr_all value of the grid rather
# than the best-scoring one — confirm that is intended.
dff = res_svd.groupby(["param_n_epochs", "param_n_factors"]).tail(1)
# Points with few factors or many epochs get a trace of their own so their
# labels can be positioned individually.
outliers = pd.concat([dff[dff["param_n_factors"]<30], dff[dff["param_n_epochs"]>800]])

fig = go.Figure(data=
    go.Contour(
        # Rows of z follow n_epochs, columns follow n_factors.
        z=dff["mean_test_rmse"].to_numpy().reshape((len(param_grid_svd["n_epochs"]),-1)),
        y=dff["param_n_epochs"].drop_duplicates(keep="first"),
        x=dff["param_n_factors"].drop_duplicates(keep="first"), line_smoothing=1,
        # Nested title form; the flat titleside/titlefont attributes are
        # deprecated and rejected by newer Plotly releases.
        colorbar=dict(
            title=dict(
                text='mean_test_rmse',
                side='right',
                font=dict(size=16),
            )),
        contours=dict(
            coloring ='fill',
            labelfont = dict(
            size = 12,
            )
        ),
        colorscale='inferno',
        reversescale=True

    ))

# Mark every sampled grid point and label it with its RMSE.
fig.add_trace(
     go.Scatter(y=dff["param_n_epochs"], x=dff["param_n_factors"],
                text=dff['mean_test_rmse'],
                textposition='top left',
                texttemplate="%{text:.3f}",
                textfont=dict(color='black'),
                mode='markers+text',
                marker=dict(color='black', size=6),
                line=dict(color='#52BCA3', width=1, dash='dash'),
                showlegend=False
               ))
# Second pass over the outlier points with per-point label positions. The
# list has 4 + 1 + 4 = 9 entries, which assumes res_svd covers the grid in
# param_grid_svd (5 low-factor rows followed by 4 high-epoch rows).
fig.add_trace(
     go.Scatter(y=outliers["param_n_epochs"], x=outliers["param_n_factors"],
                text=outliers['mean_test_rmse'],
                textposition=["top right"] * 4 + ["bottom right"] + ["bottom left"] * 4,
                texttemplate="%{text:.3f}",
                textfont=dict(color='black'),
                mode='markers+text',
                marker=dict(color='black', size=6),
                line=dict(color='#52BCA3', width=1, dash='dash'),
                showlegend=False
               ))
def minmax(x):
    """Return the smallest and largest element of ``x`` as a ``(min, max)`` tuple."""
    lowest = min(x)
    highest = max(x)
    return lowest, highest

# Limit both axes to the smallest and largest sampled parameter values.
fig.update_xaxes(title="n_factors", range=minmax(dff["param_n_factors"]))
fig.update_yaxes(title="n_epochs", range=minmax(dff["param_n_epochs"]))

# Fixed canvas size for the exported figure.
fig.update_layout(dict(autosize=False, width=700, height=500))

# Optional: switch to logarithmic axes.
# fig.update_yaxes(type="log")
# fig.update_xaxes(type="log")

fig.show()
fig.write_image("images/gs_contour.pdf")
