In [245]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from surprise import Dataset
from surprise import Reader

# Fixed seed for NumPy's global random number generator.
rs = 0
np.random.seed(rs)

# Collaborative Filtering Recommender System
In this lab session, we will work with the training set created last week.

## Exercise 1
In this exercise, we are going to predict the rating of a single user-item pair using a neighborhood-based method.
### 1.1
- Represent the ratings from the training set in a user-item matrix where the rows represent users and the columns represent items.
- Fill unobserved ratings with $0$.

Compute the cosine similarities between the user with 'reviewerID'='A25C2M3QF9G7OQ' and all users that have rated the item with 'asin'='B00EYZY6LQ'.<br>
What are the similarities and what are the ratings given by these users on item 'B00EYZY6LQ'?

In [246]:
# part 1
# Load the training ratings, keeping only the three columns the pivot needs.
ratings = pd.read_pickle("train.pkl")[["overall", "reviewerID", "asin"]]

# User-item matrix: one row per reviewer, one column per item, NaN where unrated.
df_with_nans = ratings.pivot_table(values="overall", index="reviewerID", columns="asin")

# Same matrix with the unobserved ratings replaced by 0.
df = df_with_nans.fillna(0)


In [247]:
# part 2
# assumes no item has asin == "sim" ("sim" is used as a column label further down)
uid = "A25C2M3QF9G7OQ"
# NOTE(review): the exercise text asks about item 'B00EYZY6LQ' — confirm which item id is intended.
iid = "0321700945"

# Reviewers with an observed (non-NaN) rating for the item.
item_ratings = df_with_nans[iid]
users_rated_item = item_ratings.dropna().index
users_rated_item

Index(['A10Y058K7B96C6', 'A13700AF4X40YG', 'A1E50L7PCVXLN4', 'A1GQRGB8FGSLIZ',
       'A23E9QQHJLNGUI', 'A2PBMCBT1R8TTL', 'A2YKWYC3WQJX5J'],
      dtype='object', name='reviewerID')

In [248]:
def cosine_sim(X, Y):
    """Cosine similarity of two 1-D vectors: dot product over the product of their Euclidean norms."""
    numerator = X.T @ Y
    denominator = np.linalg.norm(X) * np.linalg.norm(Y)
    return numerator / denominator

# Rating vector of the target user, taken from the zero-filled matrix.
user_arr = df.loc[uid]

# All other users, with an extra "sim" column holding their cosine similarity to the target user.
df2 = df.drop(index=uid)
df2["sim"] = df2.apply(lambda row: cosine_sim(user_arr, row), axis=1)

# Keep only the users with an observed rating for the item, showing that rating next to the similarity.
raters = df_with_nans[iid].dropna().index
sims_on_item = df2.loc[raters, [iid, "sim"]]

sims_on_item

asin,0321700945,sim
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
A10Y058K7B96C6,5.0,0.0
A13700AF4X40YG,5.0,0.0
A1E50L7PCVXLN4,3.0,0.0
A1GQRGB8FGSLIZ,5.0,0.0
A23E9QQHJLNGUI,5.0,0.0
A2PBMCBT1R8TTL,5.0,0.0
A2YKWYC3WQJX5J,5.0,0.181183


### 1.2
Predict the rating for user 'A25C2M3QF9G7OQ' on item 'B00EYZY6LQ' based on the ratings from the $3$ most similar users, using a weighted (by similarity) average. What is the prediction?

In [249]:
# Show the three raters of the item with the largest similarity to the target user.
sims_on_item.nlargest(3, "sim")

asin,0321700945,sim
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
A2YKWYC3WQJX5J,5.0,0.181183
A10Y058K7B96C6,5.0,0.0
A13700AF4X40YG,5.0,0.0


In [250]:
def weighted_avg(data, weights):
    """Return the average of ``data`` weighted by ``weights``.

    Parameters
    ----------
    data : 1-D array-like (NumPy array or pandas Series)
        The values to average, e.g. the neighbours' ratings.
    weights : 1-D array-like of the same length (and, for Series, the same index)
        Non-normalised weights, e.g. the neighbours' similarities.

    Returns
    -------
    scalar
        ``sum(data * weights) / sum(weights)``. When the weights sum to zero
        (for instance every neighbour has similarity 0) the weighted average is
        undefined, so the plain mean of ``data`` is returned instead of NaN.
    """
    total = weights.sum()
    if total == 0:
        # No neighbour carries any weight: dividing would yield NaN, fall back to the unweighted mean.
        return data.mean()
    return data.T @ (weights / total)

# Number of nearest neighbours used for the prediction.
k = 3

# The k raters of the item that are most similar to the target user.
klargest = sims_on_item.nlargest(k, "sim")

# Similarity-weighted average of those neighbours' ratings for the item.
prediction = weighted_avg(klargest[iid], klargest["sim"])
print(prediction)
klargest


5.0


asin,0321700945,sim
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
A2YKWYC3WQJX5J,5.0,0.181183
A10Y058K7B96C6,5.0,0.0
A13700AF4X40YG,5.0,0.0


## Exercise 2
In this exercise, we are going to predict the rating of the same user-item pair as in exercise 1, now using a latent factor method.
### 2.1
- Represent the ratings from the training set in a user-item matrix where the rows represent users and the columns represent items.
- Subtract the row mean (i.e. mean rating per user) from each non-missing element in the matrix.
- Replace missing values with $0$.

Factorize the user-item matrix by performing Singular Value Decomposition (SVD) of rank $5$ using eigendecomposition. What are the user factors of user 'A25C2M3QF9G7OQ' and the item factors of item 'B00EYZY6LQ'?

In [251]:
# Mean observed rating of every user (NaNs, i.e. unrated items, are skipped).
means = df_with_nans.mean(axis=1, skipna=True)

# Centre each user's observed ratings on that user's mean, then fill the unobserved ones with 0.
centered = df_with_nans.sub(means, axis=0)
df2 = centered.fillna(0)

# Positional row / column of the target user and item in the centred matrix.
u, v = df2.index.get_loc(uid), df2.columns.get_loc(iid)
u, v

(557, 0)

In [252]:
# Rank-5 truncated SVD of the centred user-item matrix.
Q, sigma, P = svds(df2, k=5)

# Singular values as a diagonal matrix so they can be folded into the user factors.
S = np.diag(sigma)

# User factors are the target user's row of Q·S; item factors are the item's column of P.
user_factors = (Q @ S)[u]
item_factors = P[:, v]
user_factors, item_factors

(array([-0.15656554, -0.45847972, -0.09145076,  0.02374782,  0.02644449]),
 array([-0.00668559,  0.01128843, -0.03907957, -0.00025546,  0.00019045]))

### 2.2
Predict the rating for user 'A25C2M3QF9G7OQ' on item 'B00EYZY6LQ' by taking the dot product between the user factors and item factors and adding back the mean rating of this user. What is the prediction?

In [9]:
# Latent-factor prediction: dot product of user and item factors, plus the user's mean rating.
user_mean = means.loc[uid]
np.dot((Q @ S)[u], P[:, v]) + user_mean

4.249444040075458

## Exercise 3
### 3.1
Define a user-based neighborhood model that takes into account the mean rating of each user.<br>
Use cosine as similarity measure and try to vary the (maximum) number of neighbors to take into account when predicting ratings. Keep Scikit-Surprise's default setting for all other parameters. <br>
Is it better to use $1$ or $10$ neighbors? You should determine this based on the Root Mean Square Error (RMSE) over 3-fold cross-validation.

In [10]:
from surprise.model_selection.validation import cross_validate
from surprise import SVD
from surprise import KNNWithMeans

# Reload the raw training ratings (this rebinds `df`, which held the user-item matrix before).
df = pd.read_pickle("train.pkl")

# Surprise expects the columns in (user, item, rating) order plus the rating scale.
rating_columns = ['reviewerID', 'asin', 'overall']
reader = Reader(rating_scale=(1, 5))
training = Dataset.load_from_df(df[rating_columns], reader)

# User-based cosine similarity settings.
sim_options = dict(name='cosine', user_based=True)

In [11]:
from IPython.display import clear_output
from surprise.model_selection import GridSearchCV
from surprise import KNNWithZScore

# Search space for the user-based KNN model: neighbourhood size and similarity measure.
# (an earlier grid used k = [2, 4, 6, 8, 16, 32, 64])
param_grid_knn = {
    'k': [2, 4, 8, 16, 32, 50],
    'sim_options': {
        'name': ['cosine', 'pearson', 'pearson_baseline'],
        'user_based': [True],
    },
}

# 5-fold cross-validated grid search scored on RMSE.
gs_knn = GridSearchCV(KNNWithMeans, param_grid_knn, measures=['rmse'], cv=5)
gs_knn.fit(training)

# Discard the progress output produced while fitting before reporting the result.
clear_output(wait=True)
res_knn = pd.DataFrame.from_dict(gs_knn.cv_results)

# Best RMSE first, then the parameter combination that achieved it.
for summary in (gs_knn.best_score, gs_knn.best_params):
    print(summary['rmse'])

1.197125865497292
{'k': 4, 'sim_options': {'name': 'pearson', 'user_based': True}}


In [193]:
# Optional cache for the KNN grid-search table: uncomment the first line to save it,
# or the second line to restore it from disk.
# res_knn.to_pickle("res_knn.pkl")
# res_knn = pd.read_pickle("res_knn.pkl")

## SVD 

In [194]:
# The full SVD sweep is switched off: cross-validating the whole parameter
# space takes a while (<30min on my setup).
RUN_SVD_GRID_SEARCH = False

if RUN_SVD_GRID_SEARCH:
    param_grid_svd = {
        'n_epochs': [50, 100, 250, 500],
        'n_factors': [10, 20, 50, 75],
        'lr_all': [0.01, 0.1, 0.15, 0.20],
    }
    gs_svd = GridSearchCV(SVD, param_grid_svd, measures=['rmse'], cv=5)
    gs_svd.fit(training)
    res_svd = pd.DataFrame.from_dict(gs_svd.cv_results)

    # best RMSE score, then the combination of parameters that gave it
    for summary in (gs_svd.best_score, gs_svd.best_params):
        print(summary['rmse'])


1.105643406574665
{'n_epochs': 100, 'n_factors': 75, 'lr_all': 0.15}


In [30]:
# Load the SVD grid-search results table from res_svd.pkl and list its columns.
# (uncomment the next line to write the table to that file instead)
# res_svd.to_pickle("res_svd.pkl")
res_svd = pd.read_pickle("res_svd.pkl")
res_svd.columns

Index(['split0_test_rmse', 'split1_test_rmse', 'split2_test_rmse',
       'split3_test_rmse', 'split4_test_rmse', 'mean_test_rmse',
       'std_test_rmse', 'rank_test_rmse', 'mean_fit_time', 'std_fit_time',
       'mean_test_time', 'std_test_time', 'params', 'param_n_epochs',
       'param_n_factors', 'param_lr_all'],
      dtype='object')

## Fit on whole training set and predict all non-rated items

In [203]:
# Train on every known rating and build the list of all (user, item) pairs without a rating.
trainset = training.build_full_trainset()
test = trainset.build_anti_testset()

# Re-fit KNN with the best hyper-parameters found by the grid search.
best_knn = gs_knn.best_params["rmse"]
knn = KNNWithMeans(k=best_knn["k"], sim_options=best_knn["sim_options"])
knn = knn.fit(trainset)

# Predict every unrated pair; report how many predictions there are and their mean estimate.
preds_knn = pd.DataFrame(knn.test(test, verbose=False))
len(preds_knn), preds_knn["est"].mean()

Computing the pearson similarity matrix...
Done computing similarity matrix.


(1449029, 3.820406900664076)

In [204]:
# set hyper params
# Defaults match the best parameters printed by the earlier SVD grid-search run;
# they are used when the (slow) grid search was not executed in this session.
best_n_epochs, best_n_factors, best_lr_all = 100, 75, 0.15

# If the SVD grid search did run, take its best parameters instead.
# These must come from gs_svd: gs_knn's best_params only hold 'k' and
# 'sim_options', so reading the SVD keys from it raises a KeyError.
if 'gs_svd' in globals():
    best_svd = gs_svd.best_params["rmse"]
    best_n_epochs = best_svd["n_epochs"]
    best_n_factors = best_svd["n_factors"]
    best_lr_all = best_svd["lr_all"]

# Fit SVD on the full training set with the selected hyper-parameters.
svd = SVD(n_factors=best_n_factors,
          n_epochs=best_n_epochs,
          lr_all=best_lr_all).fit(trainset)

# Predict every unrated (user, item) pair; report the count and the mean estimate.
preds_svd = pd.DataFrame(svd.test(test, verbose=False))
len(preds_svd), preds_svd["est"].mean()


(1449029, 3.7218191497680855)

In [205]:
# Held-out ratings.
df_test = pd.read_pickle("test.pkl")

# Surprise-style test set built from the held-out ratings.
# NOTE(review): `dft` is not used again in this cell — confirm whether it is still needed.
dft = (
    Dataset.load_from_df(df_test[['reviewerID', 'asin', 'overall']], reader)
    .build_full_trainset()
    .build_testset()
)

# NOTE(review): the rename targets integer column labels 0/1/2; it looks like it was
# written for a frame built from tuples rather than from df_test — confirm intent.
testset = pd.DataFrame(df_test).rename(columns={0: "reviewerID", 1: "asin", 2: "overall"})

# Persist the test ratings and both sets of predictions.
for frame, path in (
    (testset, "testset.pkl"),
    (preds_knn, "preds_knn.pkl"),
    (preds_svd, "preds_svd.pkl"),
):
    frame.to_pickle(path)

## Gridsearch visualizations

In [211]:
# KNN
# discrete heatmap since we want to avoid the sense of interpolation on sim measures
k_values = param_grid_knn["k"]
sim_names = param_grid_knn["sim_options"]["name"]

# Mean CV RMSE arranged as one row per k and one column per similarity measure.
# NOTE(review): assumes the rows of res_knn are ordered with k varying slowest — confirm.
rmse_grid = res_knn["mean_test_rmse"].to_numpy().reshape((len(k_values), -1))

# k is plotted by position so the rows are evenly spaced; the tick labels show the actual k.
row_positions = np.arange(len(k_values))

fig = go.Figure(data=
    go.Heatmap(
        z=rmse_grid,
        y=row_positions,
        x=sim_names,
        text=rmse_grid,
        texttemplate="%{text:.4f}",
        textfont={"size": 20},
        colorbar=dict(
            # nested title dict replaces the deprecated titleside / titlefont attributes
            title=dict(
                text='mean_test_rmse',
                side='right',
                font=dict(size=18),
            ),
        ),
    ))
fig.update_layout(
    yaxis=dict(
        tickmode='array',
        tickvals=row_positions,
        ticktext=k_values,
    ))

fig.show()
fig.write_image("images/gs_heatmap.pdf")


In [210]:
import plotly.express as px
import plotly.graph_objects as go

# 3-D scatter of the SVD grid search: one point per (n_epochs, n_factors, lr_all)
# combination, coloured by its mean cross-validated RMSE.
# (The leftover `df = px.data.iris()` was removed: the iris data was never used
# and it overwrote the ratings frame `df`.)
fig = px.scatter_3d(res_svd, x='param_n_epochs', y='param_n_factors', z='param_lr_all',
              color='mean_test_rmse', width=800, height=600)
fig.show()
fig.write_image("images/gs_scatter.pdf")


In [212]:
param_grid_svd = {'n_epochs': [50, 100,250,500], 'n_factors': [10, 20, 50, 75], 'lr_all': [0.01, 0.1, 0.15, 0.20]}

# For every (n_epochs, n_factors) pair keep the second-to-last row of res_svd.
# NOTE(review): presumably this picks the lr_all=0.15 slice given the grid order — confirm.
group_keys = ["param_n_epochs", "param_n_factors"]
dff = res_svd.groupby(group_keys).tail(2).groupby(group_keys).head(1)

# Points with n_factors < 20 or n_epochs > 400; they get an additional label below the marker.
outliers = pd.concat([dff[dff["param_n_factors"]<20], dff[dff["param_n_epochs"]>400]])


def _rmse_label_trace(points, textposition):
    """Build a marker+text scatter trace that labels each grid point with its mean RMSE.

    points: frame with 'param_n_epochs', 'param_n_factors' and 'mean_test_rmse' columns.
    textposition: plotly text position of the label relative to the marker.
    """
    return go.Scatter(
        y=points["param_n_epochs"], x=points["param_n_factors"],
        text=points['mean_test_rmse'],
        textposition=textposition,
        texttemplate="%{text:.3f}",
        textfont=dict(color='black'),
        mode='markers+text',
        marker=dict(color='black', size=6),
        line=dict(color='#52BCA3', width=1, dash='dash'),
        showlegend=False,
    )


# Filled contour of mean RMSE over n_factors (x) and n_epochs (y).
fig = go.Figure(data=
    go.Contour(
        z=dff["mean_test_rmse"].to_numpy().reshape((len(param_grid_svd["n_epochs"]),-1)),
        y=dff["param_n_epochs"].drop_duplicates(keep="first"),
        x=dff["param_n_factors"].drop_duplicates(keep="first"), line_smoothing=1,
        colorbar=dict(
            # nested title dict replaces the deprecated titleside / titlefont attributes
            title=dict(
                text='mean_test_rmse',
                side='right',
                font=dict(size=16),
            ),
        ),
        contours=dict(
            coloring ='fill',
            labelfont = dict(
                size = 12,
            )
        ),
        colorscale='inferno',
        reversescale=True

    ))

# Label every grid point above-left of its marker, then the edge points below-left as well.
fig.add_trace(_rmse_label_trace(dff, 'top left'))
fig.add_trace(_rmse_label_trace(outliers, 'bottom left'))
def minmax(x):
    """Return the pair (smallest value, largest value) of ``x``."""
    lowest = min(x)
    highest = max(x)
    return lowest, highest

# Clip both axes to the range actually covered by the grid.
epoch_range = minmax(dff["param_n_epochs"])
factor_range = minmax(dff["param_n_factors"])
fig.update_yaxes(range=epoch_range, title="n_epochs")
fig.update_xaxes(range=factor_range, title="n_factors")

# Fixed figure size.
fig.update_layout(autosize=False, width=700, height=500)

# Log-scaled axes, left disabled:
# fig.update_yaxes(type="log")
# fig.update_xaxes(type="log")

fig.show()
fig.write_image("images/gs_contour.pdf")
