In [2]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

# Week 6

In [3]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Software_5.json.gz

--2022-03-06 17:01:58--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Software_5.json.gz
Resolving deepyeti.ucsd.edu... 169.228.63.50
Connecting to deepyeti.ucsd.edu|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5339013 (5.1M) [application/octet-stream]
Saving to: 'Software_5.json.gz'


2022-03-06 17:02:03 (1.29 MB/s) - 'Software_5.json.gz' saved [5339013/5339013]



In [4]:
### load the meta data

data = []
with gzip.open('Software_5.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))

# total length of list, this number equals total number of products
print(len(data))

12805


In [5]:
# convert list into pandas dataframe

df = pd.DataFrame.from_dict(data)

print(len(df))

12805


Clean the dataset from missing ratings and duplicates (cases where the same user has rated the same item multiple times) if any.

In [6]:
df = df.sort_values(by=['reviewerID', 'asin', 'unixReviewTime'])
cleaned_dataset = df.dropna(subset=['overall']).drop_duplicates(subset=['reviewerID', 'asin'], keep = 'last').reset_index(drop=True)
print(len(cleaned_dataset))

11884


create a test set by extracting the latest positively rated item (rating ≥ 4) by each user. Remove users that do not appear in the training set.


In [7]:
cleaned_dataset = cleaned_dataset.sort_values(by=['reviewerID', 'unixReviewTime']).reset_index(drop=True)
# extracting the latest (in time) positively rated item (rating  ≥4 ) by each user. 
test_data_pre = cleaned_dataset[cleaned_dataset.overall >= 4.0].drop_duplicates(subset=['reviewerID'], keep='last')
# generate training data
training_data = cleaned_dataset.drop(test_data_pre.index)
print(len(training_data))

# Remove users that do not appear in the training set.
user_in_training = test_data_pre['reviewerID'].isin(training_data['reviewerID'])
test_data = test_data_pre[user_in_training]
print(len(test_data))

10171
1711


distribution of ratings per user




In [8]:
number_of_rating_user = training_data.groupby(['reviewerID']).count()
print("summary statistics:")
number_of_rating_user.overall.describe()

summary statistics:


count    1824.000000
mean        5.576206
std         3.488828
min         1.000000
25%         4.000000
50%         5.000000
75%         6.000000
max        51.000000
Name: overall, dtype: float64

distribution of ratings per item

In [9]:
number_of_rating_item = training_data.groupby(['asin']).count()
print("summary statistics:")
number_of_rating_item.overall.describe()

summary statistics:


count    800.000000
mean      12.713750
std       16.234801
min        1.000000
25%        5.000000
50%        9.500000
75%       17.000000
max      226.000000
Name: overall, dtype: float64

 the top 5 most popular items

# Week 7

In [10]:
from surprise import Reader
from surprise import Dataset
import numpy as np

In [11]:
reader = Reader(rating_scale=(1, 5))
training = Dataset.load_from_df(training_data[['reviewerID', 'asin', 'overall']], reader)

#### Define a user-based neighborhood model that takes into account the mean rating of each user.

Use 3-fold cross-validation on the training set to tune the hyperparameters of the chosen model (similarity measure and number of neighbors for the neighborhood-based model)

In [14]:
from surprise.model_selection import cross_validate
from surprise import KNNWithMeans

similarities = ['cosine', 'msd', 'pearson', 'pearson_baseline']
k_set = np.arange(1, 11) # values of k

hyperparameters = dict()
for name in similarities:
    sim_options = {'name': name, 'user_based': True}
    for k in k_set:
        rmse_mean_list = []
        for i in range(10):        
            algo = KNNWithMeans(k = k, sim_options = sim_options)
            mean_rmse = cross_validate(algo, training, measures=['RMSE'], cv=3, verbose=False)['test_rmse']
            rmse_mean_list.append(mean_rmse.mean())
        hyperparameters[(name, k)] = np.mean(rmse_mean_list)
print(hyperparameters)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

Report the optimal hyperparameters together with the corresponding validation Root Mean Square Errors averaged over the 3 folds.

In [20]:
mini_key = ()
mini_value = 10
for key, value in hyperparameters.items():
    if value < mini_value:
        mini_value = value
        mini_key = key
print("Optimal hyperparameters: ",mini_key, "\nRoot Mean Square Errors averaged over the 3 folds: ", mini_value)

Optimal hyperparameters:  ('pearson', 6) 
Root Mean Square Errors averaged over the 3 folds:  1.210219074729067


Run the models with the optimal hyperparameters to the whole training set.

In [22]:
trainset_nb = training.build_full_trainset()
sim_options = {'name': 'pearson', 'user_based': True}
model_nb = KNNWithMeans(k=6, sim_options=sim_options)
model_nb.fit(trainset_nb)
testset_nb = trainset_nb.build_anti_testset()
predictions_nb = model_nb.test(testset_nb)

Computing the pearson similarity matrix...
Done computing similarity matrix.


Use the final models to rank the non-rated items for each user.

Prediction(uid='A100UD67AHFODS', iid='B000UJUJ7U', r_ui=3.7454527578409205, est=5, details={'actual_k': 0, 'was_impossible': False})

#### Define an SVD model with user and item biases that uses Stochastic Gradient Descend (SGD) to estimate the low-rank matrix based on only observed ratings.

Use 3-fold cross-validation on the training set to tune the hyperparameters of the chosen models (number of latent factors and number of epochs for the latent factor model)

In [43]:
from surprise import SVD

n_factors_set = np.arange(10, 51, 10) 
n_epochs_set = np.arange(100, 501, 100)

hyperparameters = dict()
for n_factors in n_factors_set:
    for n_epochs in n_epochs_set:
        rmse_mean_list = []
        for i in range(10):        
            svd_model = SVD(n_factors = n_factors, n_epochs = n_epochs, random_state = 0)
            mean_rmse = cross_validate(svd_model, training, measures=['RMSE'], cv=3, verbose=False)['test_rmse']
            rmse_mean_list.append(mean_rmse.mean()*3)
        hyperparameters[(n_factors, n_epochs)] = np.mean(rmse_mean_list) / 3
        print((n_factors, n_epochs))
print(hyperparameters)

(10, 100)
(10, 200)
(10, 300)
(10, 400)
(10, 500)
(20, 100)
(20, 200)
(20, 300)
(20, 400)
(20, 500)
(30, 100)
(30, 200)
(30, 300)
(30, 400)
(30, 500)
(40, 100)
(40, 200)
(40, 300)
(40, 400)
(40, 500)
(50, 100)
(50, 200)
(50, 300)
(50, 400)
(50, 500)
{(10, 100): 1.1356290527978936, (10, 200): 1.1662829298821833, (10, 300): 1.1771694878663868, (10, 400): 1.1748894856124679, (10, 500): 1.1800625614420757, (20, 100): 1.1414064205578776, (20, 200): 1.1545240898420002, (20, 300): 1.1526675968975948, (20, 400): 1.1566282821721114, (20, 500): 1.1554278298712102, (30, 100): 1.1427582225142912, (30, 200): 1.1458137623236344, (30, 300): 1.1453386981012603, (30, 400): 1.1465532661916242, (30, 500): 1.1441815930658115, (40, 100): 1.1411985365848045, (40, 200): 1.146582894937393, (40, 300): 1.145040583542997, (40, 400): 1.1442387455466012, (40, 500): 1.1428295662844237, (50, 100): 1.1442704363682985, (50, 200): 1.141316017041397, (50, 300): 1.14407330724676, (50, 400): 1.1430691019763397, (50, 500):

Report the optimal hyperparameters together with the corresponding validation Root Mean Square Errors averaged over the 3 folds.

In [44]:
mini_key = ()
mini_value = 10
for key, value in hyperparameters.items():
    if value < mini_value:
        mini_value = value
        mini_key = key
print("Optimal hyperparameters: ",mini_key, "\nRoot Mean Square Errors averaged over the 3 folds: ", mini_value)

Optimal hyperparameters:  (10, 100) 
Root Mean Square Errors averaged over the 3 folds:  1.1356290527978936


Run the models with the optimal hyperparameters to the whole training set.

In [45]:
trainset = training.build_full_trainset()
model_lf = SVD(n_factors = 30, n_epochs = 500, random_state = 0)
model_lf.fit(trainset)
testset_lf = trainset.build_anti_testset()
predictions_lf = model_lf.test(testset_lf)

# Week 8

Measure the error of the system’s predicted ratings for Software products

In [46]:
from surprise import accuracy

pred_nb_list = predictions_nb
pred_lf_list = predictions_lf
# Detect users from training set that are not in test
nb_users = set([pred.uid for pred in pred_nb_list])
lf_users = set([pred.uid for pred in pred_lf_list])
nb_users_in_pred_but_not_in_test = list(nb_users.difference(set(test_data['reviewerID'])))
lf_users_in_pred_but_not_in_test = list(lf_users.difference(set(test_data['reviewerID'])))
assert nb_users_in_pred_but_not_in_test == lf_users_in_pred_but_not_in_test
print(f"There are {len(lf_users_in_pred_but_not_in_test)} users in the training set that are not in the test set.")

# Remove these users' predictions for evaluation
pred_nb_list_removed = [pred for pred in pred_nb_list if pred.uid not in nb_users_in_pred_but_not_in_test]
pred_lf_list_removed = [pred for pred in pred_lf_list if pred.uid not in lf_users_in_pred_but_not_in_test]
assert len(pred_nb_list_removed) == len(pred_lf_list_removed)
print(f"Evaluating the systems with {len(pred_nb_list_removed)} predictions for users in the test split.")

print("RMSE for Neighborhood based Collaborative Filtering: {:.3f}".format(accuracy.rmse(pred_nb_list_removed)))
print("RMSE for Latent Factor based Collaborative Filtering: {:.3f}".format(accuracy.rmse(pred_lf_list_removed)))

There are 113 users in the training set that are not in the test set.
Evaluating the systems with 1359246 predictions for users in the test split.
RMSE: 0.9793
RMSE for Neighborhood based Collaborative Filtering: 0.979
RMSE: 0.7673
RMSE for Latent Factor based Collaborative Filtering: 0.767


Generate the top-k (with k = 5) recommendation for each test user

In [47]:
from collections import defaultdict

def transfer_to_user_item_rating(pred_list):
    # First map the predictions to each user.
    user_item_rating = defaultdict(list)
    for uid, iid, _, est, _ in pred_list:
        user_item_rating[uid].append((iid, est))
    # {uid: (iid, est)}
    return user_item_rating

def top_k_recommendations(n, user_item_rating):
    # Then sort the predictions for each user and retrieve the k highest ones.
    top_n = defaultdict(list)
    for uid, user_ratings in user_item_rating.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        # {uid: (iid, est)}
        top_n[uid] = user_ratings[:n]

    return top_n

In [48]:
user_item_test = test_data.pivot('reviewerID', 'asin', 'overall')
user_item_test = user_item_test.fillna(0)

# compute P@k for one user
def precision_at_k(k, user_ratings):
    n_rel = sum((relevant) for (_, _, relevant) in user_ratings[:k])
    return n_rel / k

# compute RR@k for one user
def RR_at_k(k, user_ratings):
    for i in range(k):
        _, _, relevant = user_ratings[i]
        if relevant == 1:
            return 1.0 / (i+1)
    return 0.0

def total_relevant(user_ratings):
    return sum((relevant) for (_, _, relevant) in user_ratings)

def transfer_to_user_item_rating_relevant(pred_list):
    # map the predictions to each user.
    user_item_rating = defaultdict(list)
    for pred in pred_list:
        true_rating = user_item_test.loc[pred.uid, pred.iid] if pred.iid in list(user_item_test.columns) else 0
        relevant = 1 if true_rating >= 4.0 else 0
        user_item_rating[pred.uid].append((pred.iid, pred.est, relevant))
    # {uid: [(iid, est, relevant)]}
    return user_item_rating

def compute_metrics(k, user_item_rating):
    precisions = dict() # precision
    ap = dict() # average precision
    rr = dict() # reciprocal rank
    for uid, user_ratings in user_item_rating.items():
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        precisions[uid] = precision_at_k(k, user_ratings)
        ap[uid] = (sum((precision_at_k(i, user_ratings)*user_ratings[i-1][2]) for i in range(1, k+1)) / total_relevant(user_ratings)) if total_relevant(user_ratings) != 0 else 0
        rr[uid] = RR_at_k(k, user_ratings)

    return sum(prec for prec in precisions.values()) / len(precisions), sum(prec for prec in ap.values()) / len(ap), sum(prec for prec in rr.values()) / len(rr)

In [49]:
user_item_rating_nb_relevant = transfer_to_user_item_rating_relevant(pred_nb_list_removed)
user_item_rating_lf_relevant = transfer_to_user_item_rating_relevant(pred_lf_list_removed)

k_set = [5]
print("Metrics for Neighborhood based CF:")
for k in k_set:
    p_at_k_nb, map_at_k_nb, mrr_at_k_nb = compute_metrics(k, user_item_rating_nb_relevant)
    print(f"Averaged P@{k}:", round(p_at_k_nb, 3))
    print(f"MAP@{k}:", round(map_at_k_nb, 3))
    print(f"MRR@{k}:", round(mrr_at_k_nb, 3))

print("")

print("Metrics for Latent Factor based CF:")
for k in k_set:
    p_at_k_lf, map_at_k_lf, mrr_at_k_lf = compute_metrics(k, user_item_rating_lf_relevant)
    print(f"Averaged P@{k}:", round(p_at_k_lf, 3))
    print(f"MAP@{k}:", round(map_at_k_lf, 3))
    print(f"MRR@{k}:", round(mrr_at_k_lf, 3))

Metrics for Neighborhood based CF:
Averaged P@5: 0.004
MAP@5: 0.009
MRR@5: 0.009

Metrics for Latent Factor based CF:
Averaged P@5: 0.019
MAP@5: 0.028
MRR@5: 0.028


Compute the system’s hit rate averaged over the total number of users in the test set.

In [51]:
# compute HR@k for one user
def HR_at_k(k, user_ratings):
    for i in range(k):
        _, _, relevant = user_ratings[i]
        if relevant == 1:
            return 1.0
    return 0.0

def compute_hit_rate(k, user_item_rating):
    hr = dict() # hit rate
    for uid, user_ratings in user_item_rating.items():
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        hr[uid] = HR_at_k(k, user_ratings)

    return sum(prec for prec in hr.values()) / len(hr) 

k_set = [5]
print("Hit Rate for Neighborhood based CF:")
for k in k_set:
    mhr_at_k_nb = compute_hit_rate(k, user_item_rating_nb_relevant)
    print(f"Hit Rate (top-{k}):", round(mhr_at_k_nb, 3))

print("")

print("Hit Rate for Latent Factor based CF:")
for k in k_set:
    mhr_at_k_lf = compute_hit_rate(k, user_item_rating_lf_relevant)
    print(f"Hit Rate (top-{k}):", round(mhr_at_k_lf, 3))

Hit Rate for Neighborhood based CF:
Hit Rate (top-5): 0.019

Hit Rate for Latent Factor based CF:
Hit Rate (top-5): 0.096


Ordered by the value of the column “unixReviewTime”, take the first and last users from the test set as reference and retrieve the 10 nearest neighbours of each reference user. Print their rate history and analyse their predictions.