In [15]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from surprise import Reader
from surprise import Dataset
from Session_1 import training_data
import pandas as pd
import numpy as np

# Collaborative Filtering Recommender System
In this lab session, we will work with the training set created last week.

## Exercise 1
In this exercise, we are going to predict the rating of a single user-item pair using a neighborhood-based method.
### 1.1
- Represent the ratings from the training set in a user-item matrix where the rows represent users and the columns represent items.
- Fill unobserved ratings with $0$.

Compute the cosine similarities between the user with 'reviewerID'='A25C2M3QF9G7OQ' and all users that have rated the item with 'asin'='B00EYZY6LQ'.<br>
What are the similarities and what are the ratings given by these users on item 'B00EYZY6LQ'?

In [16]:
# Dense user-item rating matrix: one row per reviewer, one column per item.
# pivot() arguments are passed by keyword because pandas 2.0 made them
# keyword-only (the positional form raises a TypeError there).
# Unobserved ratings become 0, as required by exercise 1.1.
user_item = training_data.pivot(
    index='reviewerID', columns='asin', values='overall'
).fillna(0)
user_item.head()

asin,B0000530HU,B00006L9LC,B00021DJ32,B0002JHI1I,B0006O10P4,B0009RF9DW,B000FI4S1E,B000FOI48G,B000FTYALG,B000GLRREU,...,B00N2WQ2IW,B00NT0AR7E,B00RZYW4RG,B00W259T7G,B016V8YWBC,B019809F9Y,B019FWRG3C,B019V2KYZS,B01BNEYGQU,B01E7UKR38
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A105A034ZG9EHO,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10JB7YPWZGRF4,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10M2MLE2R0L6K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
A10P0NAKKRYKTZ,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10ZJZNO4DAVB,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Rows of the user-item matrix for every user who rated item B00EYZY6LQ
# (a rating of 0 marks "unobserved" in this matrix).
item_raters = user_item[user_item['B00EYZY6LQ'] > 0.0]

# compute cosine_similarity between the target user and each of those raters;
# the result has a single row because only one query vector is passed.
cosine_similarities = cosine_similarity([user_item.loc['A25C2M3QF9G7OQ']], item_raters)

# One row per rater: similarity to the target user and the rating they gave.
similar_user = pd.DataFrame(
    {
        "cosine similarity": cosine_similarities[0],
        "overall": item_raters['B00EYZY6LQ'],
    },
    index=item_raters.index,
)
similar_user

Unnamed: 0_level_0,cosine similarity,overall
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
A1F7YU6O5RU432,0.079243,5.0
A1R1BFJCMWX0Y3,0.245145,3.0
A1UQBFCERIP7VJ,0.058634,5.0
A22CW0ZHY3NJH8,0.207883,3.0
A2LW5AL0KQ9P1M,0.27581,4.0
A2PD27UKAD3Q00,0.0,5.0
A2WW57XX2UVLM6,0.0,4.0
A2ZY49IDE6TY5I,0.682835,4.0
A39WWMBA0299ZF,0.0,5.0
A3M6TSEV71537G,0.0,5.0


### 1.2
Predict the rating for user 'A25C2M3QF9G7OQ' on item 'B00EYZY6LQ' based on the ratings from the $3$ most similar users, using a weighted (by similarity) average. What is the prediction?

In [18]:
# Keep the three raters most similar to the target user
# (column 0 = cosine similarity, column 1 = their rating of the item).
ranked_raters = similar_user.sort_values(by=['cosine similarity'], ascending=False)
top3_users = ranked_raters.values[:3]
neighbor_sims = top3_users[:, 0]
neighbor_ratings = top3_users[:, 1]

# Similarity-weighted average: each term is sim * rating / sum(sim).
points = neighbor_sims * neighbor_ratings / np.sum(neighbor_sims)
print("Predicted rating:", points.sum())

Predicted rating: 3.7963554954121093


## Exercise 2
In this exercise, we are going to predict the rating of the same user-item pair as in exercise 1, now using a latent factor method.
### 2.1
- Represent the ratings from the training set in a user-item matrix where the rows represent users and the columns represent items.
- Subtract the row mean (i.e. mean rating per user) from each non-missing element in the matrix.
- Replace missing values with $0$.

Factorize the user-item matrix by performing Singular Value Decomposition (SVD) of rank $5$ using eigendecomposition. What are the user factors of user 'A25C2M3QF9G7OQ' and the item factors of item 'B00EYZY6LQ'?

In [19]:
# User-item matrix with missing ratings kept as NaN so they are ignored by the
# per-user mean. pivot() arguments are passed by keyword because pandas 2.0
# made them keyword-only.
user_item_1 = training_data.pivot(index='reviewerID', columns='asin', values='overall')

# Mean observed rating of each user (NaN entries are skipped by default).
user_item_1_mean = user_item_1.mean(axis=1)

# Centre every observed rating on its user's mean, then fill the
# unobserved entries with 0.
user_item_sub = user_item_1.sub(user_item_1_mean, axis=0).fillna(0)

In [20]:
# Rank-5 truncated SVD of the mean-centred matrix. svds documents ndarray,
# sparse-matrix and LinearOperator inputs, so the DataFrame is converted to a
# plain ndarray explicitly instead of relying on implicit coercion.
Q, sigma, P = svds(user_item_sub.values, k=5)

# Scale each left singular vector by its singular value so that
# (user factors) . (item factors) reproduces the rank-5 approximation.
U = Q * sigma

# One row of latent factors per user.
user_factors = pd.DataFrame(data=U, index=user_item_sub.index)
print(user_factors.loc['A25C2M3QF9G7OQ'])

# One column of latent factors per item.
item_factors = pd.DataFrame(data=P, columns=user_item_sub.columns)
print(item_factors['B00EYZY6LQ'])

0   -0.553446
1    0.421214
2    0.063396
3    0.656496
4    0.251410
Name: A25C2M3QF9G7OQ, dtype: float64
0    0.054085
1    0.009215
2   -0.040723
3    0.042454
4    0.152673
Name: B00EYZY6LQ, dtype: float64


### 2.2
Predict the rating for user 'A25C2M3QF9G7OQ' on item 'B00EYZY6LQ' by taking the dot product between the user factors and item factors and adding back the mean rating of this user. What is the prediction?

In [21]:
target_user = 'A25C2M3QF9G7OQ'
target_item = 'B00EYZY6LQ'

# Mean of the target user's observed ratings (NaN entries are skipped).
mean_rating_ = user_item_1.loc[target_user].mean()

# Latent-factor prediction: dot product of the user and item factors,
# shifted back by the user's mean rating.
user_vector = user_factors.loc[target_user].values
item_vector = item_factors[target_item].values
np.dot(user_vector, item_vector) + mean_rating_

4.437621084849714

<br>
<br>
For the rest of the exercises, you can use the python library Scikit-Surprise. Please find the documentation here: https://surprise.readthedocs.io/en/stable/getting_started.html. <br>
You can convert the training set to the format required in Scikit-Surprise as follows:

In [22]:
# Ratings in the training set lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Columns handed to Surprise, in the order user id, item id, rating.
rating_columns = ['reviewerID', 'asin', 'overall']
training = Dataset.load_from_df(training_data[rating_columns], reader)

## Exercise 3
### 3.1
Define a user-based neighborhood model that takes into account the mean rating of each user.<br>
Use cosine as similarity measure and try to vary the (maximum) number of neighbors to take into account when predicting ratings. Keep Scikit-Surprise's default setting for all other parameters. <br>
Is it better to use $1$ or $10$ neighbors? You should determine this based on the Root Mean Square Error (RMSE) over 3-fold cross-validation.

In [28]:
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import KNNWithMeans
from surprise import accuracy

# User-based neighbourhood model with cosine similarity.
sim_options = {'name': 'cosine', 'user_based': True}

# Candidate values for the maximum number of neighbours.
k_set = [1, 10]

# A seeded splitter shared by every candidate: passing cv=3 would draw a new
# random split on each cross_validate call, so the candidates would be scored
# on different folds and the result would change from run to run.
folds = KFold(n_splits=3, random_state=0)

for k in k_set:
    algo = KNNWithMeans(k=k, sim_options=sim_options)
    print(k)
    cross_validate(algo, training, measures=['RMSE'], cv=folds, verbose=True)

# Pick the k with the lower mean RMSE in the tables above
# (k = 10 was better in the originally recorded run).

1
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.4086  0.4485  0.5168  0.4580  0.0447  
Fit time          0.17    0.16    0.17    0.17    0.00    
Test time         0.09    0.08    0.09    0.09    0.00    
10
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.4551  0.4056  0.3410  0.4006  0.0467  
Fit time          0.17    0.16    0.17    0.17    0.00    
Test time 

### 3.2
Fit the neighborhood-based model defined in exercise 3.1 on the full training set with cosine as similarity measure and either $1$ or $10$ neighbors based on what you found to be better in exercise 3.1. Keep Scikit-Surprise's default setting for all other parameters, but set the random state to $0$ for comparable results. <br>
Use the model to predict the unobserved ratings for the users in the training set. How many predictions are there and what is the average of all the predictions?

In [24]:
# Fit the user-based KNN model (k = 10) on every observed rating.
trainset = training.build_full_trainset()
model_algo = KNNWithMeans(k=10, sim_options=sim_options)
model_algo.fit(trainset)

# The anti-testset holds the user-item pairs that have no rating in the trainset.
testset = trainset.build_anti_testset()
predictions = model_algo.test(testset)

# Accumulate the estimated ratings in prediction order.
n_predictions = len(predictions)
count = 0
for prediction in predictions:
    count += prediction.est

print(n_predictions)
# the average of all the predictions
count / n_predictions

Computing the cosine similarity matrix...
Done computing similarity matrix.
54746


4.628144189949609

## Exercise 4
### 4.1
Define an SVD model with user and item biases that uses Stochastic Gradient Descent (SGD) to estimate the low-rank matrix based on only observed ratings. <br>
Set the number of latent factors to $30$ and try to iterate the SGD procedure for different numbers of epochs. Keep Scikit-Surprise's default setting for all other parameters. <br>
Is it better to run for $100$ or $500$ epochs? You should determine this based on the RMSE over 3-fold cross-validation.

In [25]:
from surprise import SVD
from surprise.model_selection import KFold

# Candidate numbers of SGD epochs.
epoch_set = [100, 500]

# A seeded splitter shared by both candidates: passing cv=3 would draw a new
# random split on each cross_validate call, so the two settings would be
# scored on different folds and the result would change from run to run.
folds = KFold(n_splits=3, random_state=0)

for epoch in epoch_set:
    print(epoch)
    svd_model = SVD(n_factors=30, n_epochs=epoch)
    cross_validate(svd_model, training, measures=['RMSE'], cv=folds, verbose=True)

# Pick the epoch count with the lower mean RMSE in the tables above
# (500 epochs was better in the originally recorded run).

100
Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.4121  0.3691  0.5213  0.4341  0.0640  
Fit time          0.22    0.21    0.22    0.21    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    
500
Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.3393  0.3769  0.3601  0.3588  0.0154  
Fit time          1.04    1.03    1.04    1.03    0.01    
Test time         0.00    0.00    0.00    0.00    0.00    


### 4.2
Fit the latent factor model defined in exercise 4.1 on the full training set with $30$ latent factors and run for either $100$ or $500$ epochs based on what you found to be better in exercise 4.1. Keep Scikit-Surprise's default setting for all other parameters, but set the random state to $0$ for comparable results.<br>
Use the model to predict the unobserved ratings for the users in the training set. How many predictions are there and what is the average of all the predictions?

In [26]:
# Fit the biased SVD model (30 factors, 500 epochs, fixed seed) on every
# observed rating.
trainset = training.build_full_trainset()
svd_model = SVD(n_factors=30, n_epochs=500, random_state=0)
svd_model.fit(trainset)

# The anti-testset holds the user-item pairs that have no rating in the trainset.
testset = trainset.build_anti_testset()
predictions = svd_model.test(testset)

# Accumulate the estimated ratings in prediction order.
n_predictions = len(predictions)
count = 0
for prediction in predictions:
    count += prediction.est

print(n_predictions)
# the average of all the predictions
count / n_predictions

54746


4.403720461682837