In [2]:
import pandas as pd
import numpy as np
import surprise as sp

rec_path = r'/Users/erik/Downloads/archive(1)/recommendations.csv'
games_path = r'/Users/erik/Downloads/archive(1)/games.csv'
users_path = r'/Users/erik/Downloads/archive(1)/users.csv'
games_metadata_path = r'/Users/erik/Downloads/archive(1)/games_metadata.json'

recommendations = pd.read_csv(rec_path)
games = pd.read_csv(games_path)
users = pd.read_csv(users_path)
games_metadata = pd.read_json(games_metadata_path, lines=True)

In [3]:
import random
import numpy as np

my_seed = 1
random.seed(my_seed)
np.random.seed(my_seed)

### Define Helper functions

In [46]:
from collections import defaultdict

from surprise import Dataset, SVD
from surprise.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


def precision_recall_at_k(predictions, k=10, threshold=0.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
       user_est_true[uid].append((est, true_r))
   #     if uid == 922219:
   #         print((est, true_r))
    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

def get_recommendation_for_user(uid, top_n, testset):
    recommended = pd.Series([(x,y) for (x,y) in top_n[uid]])
    #print(recommended)

    if testset != None:
        in_testset = pd.Series([(i, r) for (u, i, r) in testset if u == uid])

    print ("\nUser: " + str(uid))
    print("Rated:")
    counter = 1
    if testset != None:
        for i in in_testset:
            print(str(counter) + "- "+ str(i) + '- ' + games[games['app_id'] == i[0]]['title'].iloc[0])
            counter+=1

    print("\n Recommended Items:")
    counter = 1
    for r in recommended:
        print(str(counter) + "- " + str(r) + '- ' +  games[games['app_id'] == r[0]]['title'].iloc[0])
        counter+=1

from collections import defaultdict

def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

def precision_recall_f1(predictions, threshold=3.5):

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    # collect metrics per user
    precisions = dict()
    recalls = dict()
    f1s = dict()
    accuracies = dict()
    
    for uid, user_ratings in user_est_true.items():
        
        # get relevance labels for average precision calculation
        y_true = [1 if (true_r >= threshold) else 0 for (_, true_r) in user_ratings]
        y_pred = [1 if (est_r >= threshold) else 0 for (est_r, _) in user_ratings]
        
        precisions[uid] = precision_score(y_true, y_pred, zero_division=0)
        recalls[uid] = recall_score(y_true, y_pred, zero_division=0)
        f1s[uid] = f1_score(y_true, y_pred, zero_division=0)

        accuracies[uid] = accuracy_score(y_true, y_pred)

    # average scores over all users 
    avg_precision = sum(list(precisions.values())) / len(list(precisions.values()))
    avg_recall = sum(list(recalls.values())) / len(list(recalls.values()))
    avg_f1 = sum(list(f1s.values())) / len(list(f1s.values()))
    
    avg_accuracy = sum(list(accuracies.values())) / len(list(accuracies.values()))
    
    return avg_precision, avg_recall, avg_f1, avg_accuracy

def get_ratings_from_uid(uid, data):
    rs = [(i, r) for (u,i,r,d) in data.raw_ratings if u == uid]
    for i in rs:
        print((games[games['app_id'] == i[0]]['title'].iloc[0]) + " rated: " + str(i[1]))

### Get User-App Ratings

Here we filter on users which rated atleast 20 games which should give us enough data to train a model properly.

In [5]:
user_app_ratings = pd.merge(users.loc[users['reviews'] >= 20], recommendations, how="inner", on=["user_id"])
user_app_ratings['is_recommended'] = user_app_ratings['is_recommended'].map({False: 0, True: 1})
user_app_ratings

Unnamed: 0,user_id,products,reviews,app_id,helpful,funny,date,is_recommended,hours,review_id
0,1965432,702,32,264710,0,2,2018-05-30,1,33.9,2063767
1,1965432,702,32,239030,0,0,2013-12-12,1,10.3,4517660
2,1965432,702,32,250900,0,0,2017-02-17,1,61.6,5762464
3,1965432,702,32,607080,2,0,2021-12-25,1,19.0,6152399
4,1965432,702,32,335300,2,0,2017-01-01,1,164.0,6410467
...,...,...,...,...,...,...,...,...,...,...
429428,922219,72,25,1361000,28,19,2022-06-26,1,1.1,9366376
429429,922219,72,25,289650,0,0,2022-06-24,1,16.1,9639562
429430,922219,72,25,424840,0,0,2021-12-28,0,2.7,9697813
429431,922219,72,25,633230,0,0,2021-12-28,1,10.0,11229597


### Create dataset
Create dataset object for surprise library and split data into 75/25 train/test split

In [6]:
from surprise.model_selection import train_test_split

reader = sp.Reader(rating_scale=(0, 1))
data = sp.Dataset.load_from_df(user_app_ratings[["user_id", "app_id", "is_recommended"]], reader)

trainset, testset = train_test_split(data, test_size=0.25, random_state=1)

In [7]:
[(u, i, r) for (u, i, r) in testset if u == 922219]

[(922219, 1449850, 0.0),
 (922219, 311210, 1.0),
 (922219, 383150, 1.0),
 (922219, 1361000, 1.0),
 (922219, 617830, 1.0),
 (922219, 1240440, 1.0),
 (922219, 1172470, 0.0),
 (922219, 349040, 1.0),
 (922219, 1211630, 1.0),
 (922219, 438100, 1.0)]

### Train First Model

Here we train a KNN Basic model with default settings to get an impression of the performance

In [8]:
from surprise import KNNWithMeans

# User-based with Pearson correlation similarity and simple prediction
sim_options = {'name': 'pearson', 'user_based': True, 'min_support': 1}

# Build algorithm
algo = KNNWithMeans(sim_options=sim_options)
algo.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x2a45aabb0>

Predict a rating for a specific user. As we can see the model predicts the rating as 0.56. Now depending on the threshhold we set we can interpret this as true or false.

In [9]:
uid = 922219
iid = 1449850

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=0, verbose=True)
pred = algo.predict(uid, 607080, r_ui=0, verbose=True)

user: 922219     item: 1449850    r_ui = 0.00   est = 0.67   {'actual_k': 10, 'was_impossible': False}
user: 922219     item: 607080     r_ui = 0.00   est = 1.00   {'actual_k': 4, 'was_impossible': False}


### Evaluating performance

Next we try to evaluate the performance of this model. First we use some standard measures to get a first impression of the overall performance.

In [10]:
from surprise.accuracy import mse, rmse, mae
# Compute MSE, RMSE and MAE on the test set predictions

predictions = algo.test(testset)

mse(predictions)
rmse(predictions)
mae(predictions)

MSE: 0.1380
RMSE: 0.3715
MAE:  0.2460


0.24600086226755605

In [24]:
avg_precision, avg_recall, avg_f1, avg_accuracy = precision_recall_f1(predictions, threshold=0.5)
print(f'Avg. Precision: {avg_precision}')
print(f'Avg. Recall: {avg_recall}')
print(f'Avg. F1: {avg_f1}')
print(f'Avg. Accuracy: {avg_accuracy}')

Avg. Precision: 0.8042665199786134
Avg. Recall: 0.8796142173655607
Avg. F1: 0.8255403585658555
Avg. Accuracy: 0.8094259608122694


### Precision and Recall

Since we are dealing with a recommender system purely evaluating on MSE/RMSE/MAE will not help for further comparison. Therefore we calculate the precision and recall for each user @k.

In [60]:
predictions = algo.test(testset)
precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=0.6)

# Precision and recall can then be averaged over all users
print('MAP@5: '+ str(sum(prec for prec in precisions.values()) / len(precisions)))
#print(sum(rec for rec in recalls.values()) / len(recalls))

MAP@5: 0.8103360988447172


# SVD & SVD++

Next we will use a more advanced algorithm namely SVD & SVD++. These rely on matrix factorization to predict ratings.

In [12]:
from surprise import SVD

algo_svd = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo_svd.fit(trainset)
predictions_svd = algo_svd.test(testset)

# Then compute MSE, RMSE and MAE
mse(predictions_svd)
rmse(predictions_svd)
mae(predictions_svd)


MSE: 0.1286
RMSE: 0.3586
MAE:  0.2615


0.26151725408216797

In [25]:
avg_precision, avg_recall, avg_f1, avg_accuracy = precision_recall_f1(predictions_svd, threshold=0.5)
print(f'Avg. Precision: {avg_precision}')
print(f'Avg. Recall: {avg_recall}')
print(f'Avg. F1: {avg_f1}')
print(f'Avg. Accuracy: {avg_accuracy}')

Avg. Precision: 0.8031573934617847
Avg. Recall: 0.9181383471017706
Avg. F1: 0.8427892641553852
Avg. Accuracy: 0.8210451596413622


In [13]:
precisions_svd, recalls_svd = precision_recall_at_k(predictions_svd, k=5, threshold=0.5)

# Precision and recall can then be averaged over all users
print('MAP@10: '+ str(sum(prec for prec in precisions_svd.values()) / len(precisions_svd)))
#print(sum(rec for rec in recalls_svd.values()) / len(recalls_svd))

MAP@10: 0.8271053570631903


In [14]:
uid = 922219
iid = 1449850

# get a prediction for specific users and items.
pred = algo_svd.predict(uid, iid, r_ui=0, verbose=True)
pred = algo_svd.predict(uid, 607080, r_ui=0, verbose=True)

user: 922219     item: 1449850    r_ui = 0.00   est = 0.64   {'was_impossible': False}
user: 922219     item: 607080     r_ui = 0.00   est = 0.99   {'was_impossible': False}


# SVD++

In [15]:
from surprise import SVDpp

algo_svd_pp = SVDpp()

algo_svd_pp.fit(trainset)
predictions_svdpp = algo_svd_pp.test(testset)

# Then compute MSE, RMSE and MAE
mse(predictions_svdpp)
rmse(predictions_svdpp)
mae(predictions_svdpp)

MSE: 0.1262
RMSE: 0.3553
MAE:  0.2564


0.2564247476445066

In [26]:
avg_precision, avg_recall, avg_f1, avg_accuracy = precision_recall_f1(predictions_svdpp, threshold=0.5)
print(f'Avg. Precision: {avg_precision}')
print(f'Avg. Recall: {avg_recall}')
print(f'Avg. F1: {avg_f1}')
print(f'Avg. Accuracy: {avg_accuracy}')

Avg. Precision: 0.7999136201390769
Avg. Recall: 0.9102390376002774
Avg. F1: 0.8375235276487459
Avg. Accuracy: 0.8234863163769046


In [16]:
pred = algo_svd_pp.predict(uid, iid, r_ui=0, verbose=True)

user: 922219     item: 1449850    r_ui = 0.00   est = 0.74   {'was_impossible': False}


In [17]:
precisions_svdpp, recalls_svdpp = precision_recall_at_k(predictions_svdpp, k=5, threshold=0.5)

# Precision and recall can then be averaged over all users
print('MAP@10: '+ str(sum(prec for prec in precisions_svdpp.values()) / len(precisions_svdpp)))
print(sum(rec for rec in recalls_svdpp.values()) / len(recalls_svdpp))

MAP@10: 0.8264608144876969
0.7286204712855491


### Get top n-recommendations

We can now retrieve the top-n games for this user. If we set a threshhold of 0.5 for the recommendation we can see that the last two games VRChat and Halo Infinite would fall out of the recommendation although they are rated positive. In the testset we have 8 relevant items, 8 recommended items (with threshhold 0.5) but we only have 6 items in the top 10 which are relevant and recommended. This yields a precision of 0.75. A perfect score would be a 1.0

In [18]:
get_recommendation_for_user(uid, get_top_n(predictions_svdpp, n=10), testset)


User: 922219
Rated:
1- (1449850, 0.0)- Yu-Gi-Oh! Master Duel
2- (311210, 1.0)- Call of Duty®: Black Ops III
3- (383150, 1.0)- Dead Island Definitive Edition
4- (1361000, 1.0)- In Silence
5- (617830, 1.0)- SUPERHOT VR
6- (1240440, 1.0)- Halo Infinite
7- (1172470, 0.0)- Apex Legends™
8- (349040, 1.0)- NARUTO SHIPPUDEN: Ultimate Ninja STORM 4
9- (1211630, 1.0)- The Jackbox Party Pack 7
10- (438100, 1.0)- VRChat

 Recommended Items:
1- (1211630, 0.9410860463403207)- The Jackbox Party Pack 7
2- (349040, 0.8203087719915527)- NARUTO SHIPPUDEN: Ultimate Ninja STORM 4
3- (1449850, 0.7369421507548444)- Yu-Gi-Oh! Master Duel
4- (383150, 0.7024076979261503)- Dead Island Definitive Edition
5- (1172470, 0.6706176848762508)- Apex Legends™
6- (617830, 0.6494469049482899)- SUPERHOT VR
7- (311210, 0.609374165844687)- Call of Duty®: Black Ops III
8- (1361000, 0.5965696204227892)- In Silence
9- (1240440, 0.4698828500087759)- Halo Infinite
10- (438100, 0.4639576629518599)- VRChat


## Hyperparameter tuning

In [19]:

data_h = data
raw_ratings = data_h.raw_ratings

random.shuffle(raw_ratings)

threshold = int(0.8 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

data_h.raw_ratings = A_raw_ratings  # data is now the set A
trainset_h, testset_h = train_test_split(data_h, test_size=0.3, random_state=1)


In [133]:
import optuna
from optuna.samplers import TPESampler

def objective(trial):
    # Define the search space
    lr_all = trial.suggest_float('learning_rate', 0.002, 0.01)
    no_epochs = trial.suggest_int('no_epochs', 5, 100)
    n_factors = trial.suggest_int('n_factors', 10, 100)
    reg_all = trial.suggest_float("reg_all", 0.02, 0.6)
    t = trial.suggest_float("threshold", 0.5, 0.9)

    algo_svd_pp = SVDpp(n_epochs = no_epochs, lr_all=lr_all, reg_all=reg_all, n_factors=n_factors)
    algo_svd_pp.fit(trainset_h)
    predictions = algo_svd_pp.test(testset_h)
    
    precisions, recalls= precision_recall_at_k(predictions, k=5, threshold=t)
    
    if len(precisions) > 0:
        score = (sum(prec for prec in precisions.values()) / len(precisions))
    else:
        score = 0
        
    #score = rmse(predictions)
    return score

study = optuna.create_study(study_name="svd_optimization",
                            direction="maximize",
                            sampler=TPESampler())

study.optimize(objective, n_trials=20)
print(study.best_params)
print(study.best_value)

[32m[I 2023-05-19 18:04:48,420][0m A new study created in memory with name: svd_optimization[0m
[32m[I 2023-05-19 18:04:48,699][0m Trial 0 finished with value: 0.6547897009435472 and parameters: {'learning_rate': 0.009107724261696223, 'no_epochs': 14, 'n_factors': 82, 'reg_all': 0.07876759709401744, 'threshold': 0.7281045655652435}. Best is trial 0 with value: 0.6547897009435472.[0m
[32m[I 2023-05-19 18:04:49,162][0m Trial 1 finished with value: 0.7686790340636493 and parameters: {'learning_rate': 0.006085965832053526, 'no_epochs': 40, 'n_factors': 56, 'reg_all': 0.43597241656445695, 'threshold': 0.5569587173709498}. Best is trial 1 with value: 0.7686790340636493.[0m
[32m[I 2023-05-19 18:04:49,455][0m Trial 2 finished with value: 0.7749746788208322 and parameters: {'learning_rate': 0.0024933966787298303, 'no_epochs': 14, 'n_factors': 91, 'reg_all': 0.17297749210379243, 'threshold': 0.5980770489576818}. Best is trial 2 with value: 0.7749746788208322.[0m
[32m[I 2023-05-19 18

{'learning_rate': 0.0020017081984642403, 'no_epochs': 7, 'n_factors': 10, 'reg_all': 0.26948125732400396, 'threshold': 0.5157022268893682}
0.7889706274321657


In [20]:
# retrain on the whole set A
trainset_h = data_h.build_full_trainset()

algo_svd_ppt = SVDpp(n_epochs = 7, lr_all=0.0020017081984642403, reg_all=0.26948125732400396, n_factors=10)
algo_svd_ppt.fit(trainset_h)

# Compute biased accuracy on A
predictions_h = algo_svd_ppt.test(trainset_h.build_testset())
print("Biased P@k on A,", end="   ")
#accuracy.rmse(predictions)
precisions, recalls= precision_recall_at_k(predictions_h, k=5, threshold=0.5)
print(str(sum(prec for prec in precisions.values()) / len(precisions)))

# Compute unbiased accuracy on B
testset_h = data_h.construct_testset(B_raw_ratings)  # testset is now the set B
predictions_h = algo_svd_ppt.test(testset_h)
print("Unbiased P@k on B,", end=" ")
precisions, recalls= precision_recall_at_k(predictions_h, k=5, threshold=0.5)
print(str(sum(prec for prec in precisions.values()) / len(precisions)))

Biased P@k on A,   0.9053490238031303
Unbiased P@k on B, 0.8145437028074586


In [64]:
algo_svd_ppt2 = SVDpp(n_epochs = 7, lr_all=0.0020017081984642403, reg_all=0.26948125732400396, n_factors=10)
algo_svd_ppt2.fit(trainset)
predictions_svdpp2 = algo_svd_ppt2.test(testset)

precisions_svdpp2, recalls_svdpp2 = precision_recall_at_k(predictions_svdpp, k=5, threshold=0.6)

# Precision and recall can then be averaged over all users
print('MAP@5: '+ str(sum(prec for prec in precisions_svdpp2.values()) / len(precisions_svdpp2)))

MAP@5: 0.8142446139435174


In [63]:
avg_precision, avg_recall, avg_f1, avg_accuracy = precision_recall_f1(predictions_svdpp2, threshold=0.5)
print(f'Avg. Precision: {avg_precision}')
print(f'Avg. Recall: {avg_recall}')
print(f'Avg. F1: {avg_f1}')
print(f'Avg. Accuracy: {avg_accuracy}')

Avg. Precision: 0.7939981175480904
Avg. Recall: 0.981415543387114
Avg. F1: 0.8592403240757416
Avg. Accuracy: 0.7967525062582209


## Recommending New Games

With the trained model we can also recommend new games for a user which the user has not played/rated yet. Now, since we don't have a rating for that item/user pair we can not evaluate statistically whether the recommendation was good or not.

In [28]:
testset_new = trainset.build_anti_testset()
predictions_new = algo_svd_ppt2.test(testset_new)

In [65]:
get_recommendation_for_user(uid, get_top_n(predictions_new, n=5), None)


User: 922219
Rated:

 Recommended Items:
1- (413150, 0.9418938476654373)- Stardew Valley
2- (548430, 0.9414624414550479)- Deep Rock Galactic
3- (400, 0.9410470460729086)- Portal
4- (105600, 0.9391318776211085)- Terraria
5- (1217060, 0.9378025479237493)- Gunfire Reborn
