In [1]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from surprise import Dataset
from surprise import KNNWithMeans
from surprise import Reader
from surprise import SVD
from surprise.model_selection.validation import cross_validate

# Seed NumPy's global random number generator so reruns are repeatable.
rs = 0
np.random.seed(seed=rs)

# Collaborative Filtering Recommender System
In this lab session, we will work with the training set created last week.

## Exercise 1
In this exercise, we are going to predict the rating of a single user-item pair using a neighborhood-based method.
### 1.1
- Represent the ratings from the training set in a user-item matrix where the rows represent users and the columns represent items.
- Fill unobserved ratings with $0$.

Compute the cosine similarities between the user with 'reviewerID'='A25C2M3QF9G7OQ' and all users that have rated the item with 'asin'='B00EYZY6LQ'.<br>
What are the similarities and what are the ratings given by these users on item 'B00EYZY6LQ'?

In [2]:
# Part 1: build the user-item rating matrix.
# Only the rating, user id and item id columns are needed from the training data.
ratings_long = pd.read_pickle("train.pkl")[["overall", "reviewerID", "asin"]]

# Rows are users, columns are items; unobserved pairs are NaN in this version ...
df_with_nans = ratings_long.pivot_table(
    index="reviewerID", columns="asin", values="overall"
)
# ... and 0 in this one.
df = df_with_nans.fillna(value=0)


In [3]:
# Part 2: the user-item pair whose rating is to be predicted.
# Assumes no item has asin == "sim": that label is used for the similarity column.
uid = "A25C2M3QF9G7OQ"
iid = "B00EYZY6LQ"

# Take the item's column; dropping NaN leaves exactly the users who rated it.
item_ratings = df_with_nans[iid]
users_rated_item = item_ratings.dropna().index
users_rated_item

Index(['A1F7YU6O5RU432', 'A1R1BFJCMWX0Y3', 'A1UQBFCERIP7VJ', 'A22CW0ZHY3NJH8',
       'A2LW5AL0KQ9P1M', 'A2PD27UKAD3Q00', 'A2WW57XX2UVLM6', 'A2ZY49IDE6TY5I',
       'A39WWMBA0299ZF', 'A3M6TSEV71537G', 'A3S3R88HA0HZG3', 'A914TQVHI872U',
       'AOEUN9718KVRD'],
      dtype='object', name='reviewerID')

In [4]:
def cosine_sim(X, Y):
    """Cosine similarity between vector ``X`` and ``Y``.

    ``Y`` may be a single vector or a 2-D array whose rows are vectors; in the
    2-D case one similarity per row is returned.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    return (Y @ X) / (np.linalg.norm(Y, axis=-1) * np.linalg.norm(X))


# Zero-filled rating vector of the target user.
user_arr = df.loc[uid]

# Only users who rated the item can act as neighbours. The target user is
# dropped so the row lookup cannot fail should they appear among the raters.
neighbour_ids = users_rated_item.drop(uid, errors="ignore")
neighbour_rows = df.loc[neighbour_ids]

# One matrix-vector product instead of a Python-level call per user, and the
# "sim" column is added to the small result frame rather than to the full matrix.
sims_on_item = neighbour_rows[[iid]].assign(sim=cosine_sim(user_arr, neighbour_rows))

sims_on_item

asin,B00EYZY6LQ,sim
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
A1F7YU6O5RU432,5.0,0.079243
A1R1BFJCMWX0Y3,3.0,0.245145
A1UQBFCERIP7VJ,5.0,0.058634
A22CW0ZHY3NJH8,3.0,0.207883
A2LW5AL0KQ9P1M,4.0,0.27581
A2PD27UKAD3Q00,5.0,0.0
A2WW57XX2UVLM6,4.0,0.0
A2ZY49IDE6TY5I,4.0,0.682835
A39WWMBA0299ZF,5.0,0.0
A3M6TSEV71537G,5.0,0.0


### 1.2
Predict the rating for user 'A25C2M3QF9G7OQ' on item 'B00EYZY6LQ' based on the ratings from the $3$ most similar users, using a weighted (by similarity) average. What is the prediction?

In [5]:
# Preview the three raters of the item with the highest similarity to the target user.
sims_on_item.nlargest(n=3, columns="sim")

asin,B00EYZY6LQ,sim
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
A2ZY49IDE6TY5I,4.0,0.682835
A2LW5AL0KQ9P1M,4.0,0.27581
A1R1BFJCMWX0Y3,3.0,0.245145


In [6]:
def weighted_avg(data, weights):
    """Return the average of ``data`` weighted by ``weights``.

    The weights are normalised by their sum. If the weights of a non-empty
    input sum to zero (e.g. every neighbour has similarity 0), normalising
    would divide by zero, so a uniform average is returned instead.
    """
    total = weights.sum()
    if len(weights) and total == 0:
        # Replace the unusable weights with equal ones (same type/index).
        weights = weights * 0 + 1.0
        total = weights.sum()
    return data.T @ (weights / total)

# Predict from the k raters most similar to the target user.
k = 3
klargest = sims_on_item.nlargest(n=k, columns="sim")

# Similarity-weighted average of those neighbours' ratings on the item.
neighbour_ratings = klargest[iid]
neighbour_sims = klargest["sim"]
print(weighted_avg(neighbour_ratings, neighbour_sims))
klargest


3.796355495412109


asin,B00EYZY6LQ,sim
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
A2ZY49IDE6TY5I,4.0,0.682835
A2LW5AL0KQ9P1M,4.0,0.27581
A1R1BFJCMWX0Y3,3.0,0.245145


## Exercise 2
In this exercise, we are going to predict the rating of the same user-item pair as in exercise 1, now using a latent factor method.
### 2.1
- Represent the ratings from the training set in a user-item matrix where the rows represent users and the columns represent items.
- Subtract the row mean (i.e. mean rating per user) from each non-missing element in the matrix.
- Replace missing values with $0$.

Factorize the user-item matrix by performing Singular Value Decomposition (SVD) of rank $5$ using eigendecomposition. What are the user factors of user 'A25C2M3QF9G7OQ' and the item factors of item 'B00EYZY6LQ'?

In [7]:
# Per-user mean over the ratings that user actually gave (NaNs are skipped).
means = df_with_nans.mean(axis=1, skipna=True)

# Centre each user's observed ratings on their own mean, then zero-fill the gaps.
centred = df_with_nans.sub(means, axis="index")
df2 = centred.fillna(0)

# Integer positions of the target user (row) and target item (column).
u, v = df2.index.get_loc(uid), df2.columns.get_loc(iid)
u, v

(285, 55)

In [8]:
# Rank-5 truncated SVD of the mean-centred user-item matrix.
# svds is handed a plain float ndarray: SciPy documents ndarray, sparse matrix
# and LinearOperator inputs, not pandas objects.
n_factors = 5
Q, sigma, P = svds(df2.to_numpy(dtype=float), k=n_factors)

# Singular values as a diagonal matrix so they can be folded into the user factors.
S = np.diag(sigma)

# Q holds one row per user and P one column per item.
assert Q.shape == (df2.shape[0], n_factors)
assert P.shape == (n_factors, df2.shape[1])

# User factors (scaled by the singular values) and item factors of the target pair.
(Q@S)[u], P[:,v]

(array([ 0.56741882, -0.11720887,  0.41310006,  0.66515646, -0.2784688 ]),
 array([-0.05629871, -0.03083716, -0.02149093,  0.04057112, -0.15355926]))

### 2.2
Predict the rating for user 'A25C2M3QF9G7OQ' on item 'B00EYZY6LQ' by taking the dot product between the user factors and item factors and adding back the mean rating of this user. What is the prediction?

In [9]:
# Dot product of the user's scaled factors with the item's factors, shifted
# back by the user's mean rating that was subtracted before factorizing.
user_factors = (Q @ S)[u]
item_factors = P[:, v]
user_factors @ item_factors + means.loc[uid]

4.432539143272558

## Exercise 3
### 3.1
Define a user-based neighborhood model that takes into account the mean rating of each user.<br>
Use cosine as similarity measure and try to vary the (maximum) number of neighbors to take into account when predicting ratings. Keep Scikit-Surprise's default setting for all other parameters. <br>
Is it better to use $1$ or $10$ neighbors? You should determine this based on the Root Mean Square Error (RMSE) over 3-fold cross-validation.

In [10]:
# Surprise works on long-format (user, item, rating) rows, so the training
# ratings are re-read under their own name instead of overwriting `df`, which
# still holds the user-item matrix built for Exercise 1.
train_ratings = pd.read_pickle("train.pkl")
reader = Reader(rating_scale=(1, 5))
training = Dataset.load_from_df(train_ratings[['reviewerID', 'asin', 'overall']], reader)

# User-based neighbourhood model with cosine similarity.
sim_options = {'name': 'cosine',
               'user_based': True}

In [11]:
# Run 3-fold cross-validation with a single neighbour (k=1) and print the RMSE per fold.
knn_k1 = KNNWithMeans(k=1, sim_options=sim_options)
cross_validate(knn_k1, training, measures=['RMSE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.4791  0.4883  0.4143  0.4606  0.0329  
Fit time          0.18    0.18    0.19    0.18    0.00    
Test time         0.06    0.06    0.06    0.06    0.00    


{'test_rmse': array([0.47907267, 0.48833888, 0.41433844]),
 'fit_time': (0.18406248092651367, 0.18215107917785645, 0.18586325645446777),
 'test_time': (0.0573573112487793, 0.05948829650878906, 0.05942893028259277)}

In [12]:
# Same 3-fold cross-validation, now allowing up to 10 neighbours per prediction.
knn_k10 = KNNWithMeans(k=10, sim_options=sim_options)
cross_validate(knn_k10, training, measures=['RMSE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.4255  0.3983  0.4582  0.4273  0.0245  
Fit time          0.18    0.18    0.18    0.18    0.00    
Test time         0.07    0.07    0.07    0.07    0.00    


{'test_rmse': array([0.42550478, 0.39834346, 0.45819149]),
 'fit_time': (0.18379926681518555, 0.18081903457641602, 0.1838397979736328),
 'test_time': (0.0724344253540039, 0.06842947006225586, 0.06854462623596191)}

In [13]:
# NOTE(review): disabled grid-search experiment. As written it would not run:
# `GridSearchCV` is not imported and `data` is not defined in this notebook.
# Remove it or finish it before sharing.
# param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
#               'reg_all': [0.4, 0.6]}
# gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
# gs.fit(data)
# print(gs.best_score['rmse']) # best RMSE score
# print(gs.best_params['rmse'])# combination of parameters that gave the best RMSE score


### 3.2
Fit the neighborhood-based model defined in exercise 3.1 on the full training set with cosine as similarity measure and either $1$ or $10$ neighbors based on what you found to be better in exercise 3.1. Keep Scikit-Surprise's default settings for all other parameters, but set the random state to $0$ for comparable results. <br>
Use the model to predict the unobserved ratings for the users in the training set. How many predictions are there and what is the average of all the predictions?

In [14]:
# Train on every observed rating, then score every (user, item) pair that has
# no rating in the training set.
trainset = training.build_full_trainset()
test = trainset.build_anti_testset()

# NOTE(review): KNNWithMeans does not appear to document a `random_state`
# parameter, so this argument may have no effect — TODO confirm in the Surprise docs.
knn = KNNWithMeans(k=10, sim_options=sim_options, random_state=rs)
knn.fit(trainset)

preds = pd.DataFrame(knn.test(test, verbose=False))

# Number of predictions and their average estimated rating.
n_preds, mean_est = len(preds), preds["est"].mean()
n_preds, mean_est

Computing the cosine similarity matrix...
Done computing similarity matrix.


(72404, 4.673826534498951)

## Exercise 4
### 4.1
Define an SVD model with user and item biases that uses Stochastic Gradient Descent (SGD) to estimate the low-rank matrix based on only observed ratings. <br>
Set the number of latent factors to $30$ and try to iterate the SGD procedure for different number of epochs. Keep Scikit-Surprise's default setting for all other parameters. <br>
Is it better to run for $100$ or $500$ epochs? You should determine this based on the RMSE over 3-fold cross-validation.

In [15]:
# 3-fold cross-validated RMSE for an SVD model with 30 latent factors and 100 epochs.
svd_100 = SVD(n_factors=30, n_epochs=100)
cross_validate(svd_100, training, measures=['RMSE'], cv=3, verbose=True)


Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.4072  0.3750  0.3810  0.3877  0.0140  
Fit time          0.16    0.15    0.15    0.16    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.40717683, 0.37499726, 0.38104081]),
 'fit_time': (0.15732884407043457, 0.15442371368408203, 0.1543729305267334),
 'test_time': (0.0029897689819335938,
  0.00398707389831543,
  0.003987550735473633)}

In [16]:
# Same evaluation with the number of epochs raised to 500.
svd_500 = SVD(n_factors=30, n_epochs=500)
cross_validate(svd_500, training, measures=['RMSE'], cv=3, verbose=True)


Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.3696  0.3589  0.3454  0.3580  0.0099  
Fit time          0.77    0.77    0.78    0.77    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.36963083, 0.35892489, 0.34541165]),
 'fit_time': (0.771923303604126, 0.7666869163513184, 0.7769966125488281),
 'test_time': (0.0040912628173828125,
  0.00410008430480957,
  0.0029892921447753906)}

### 4.2
Fit the latent factor model defined in exercise 4.1 on the full training set with $30$ latent factors and run for either $100$ or $500$ epochs based on what you found to be better in exercise 4.1. Keep Scikit-Surprise's default setting for all other parameters, but set the random state to $0$ for comparable results.<br>
Use the model to predict the unobserved ratings for the users in the training set. How many predictions are there and what is the average of all the predictions?

In [17]:
# Final latent-factor model: 30 factors, 500 epochs (the lower cross-validated RMSE).
# Seeded with the notebook-level `rs` rather than a second hard-coded literal,
# so the seed is defined in one place only.
svd = SVD(n_factors=30, n_epochs=500, random_state=rs)
svd.fit(trainset)

# Predict every unobserved (user, item) pair and summarise the estimates.
preds2 = pd.DataFrame(svd.test(test, verbose=False))
len(preds2), preds2["est"].mean()

(72404, 4.512605750450481)

# Write data

In [18]:
# Load the held-out ratings and turn them into a Surprise testset by going
# through a full trainset built from the same rows.
df_test = pd.read_pickle("test.pkl")
test_data = Dataset.load_from_df(df_test[['reviewerID', 'asin', 'overall']], reader)
dft = test_data.build_full_trainset().build_testset()

In [19]:
# NOTE(review): disabled. If re-enabled, this would rebind `preds2` to
# predictions on `dft` before it is pickled below — confirm which predictions
# are meant to be saved.
# preds1 = pd.DataFrame(knn.test(dft))
# preds2 = pd.DataFrame(svd.test(dft))


In [20]:
# NOTE(review): df_test is indexed by the names 'reviewerID'/'asin'/'overall'
# when `dft` is built, so renaming columns 0/1/2 looks like a no-op left over
# from an earlier version — confirm the intended contents of testset.pkl.
column_names = {0: "reviewerID", 1: "asin", 2: "overall"}
testset = pd.DataFrame(df_test).rename(columns=column_names)

# Persist the test ratings and the predictions of both models.
testset.to_pickle("testset.pkl")
preds.to_pickle("preds1.pkl")
preds2.to_pickle("preds2.pkl")