In [None]:
!pip install recommenders surprise cornac==2.3.0 ranx

In [None]:
!add-apt-repository ppa:ubuntu-toolchain-r/test
!apt-get update
!apt-get install --only-upgrade libstdc++6

## Preparation

In [33]:
# Copy the project's helper module into the working directory so that
# `from utils import compute_ranking_predictions` (imports cell) can find it.
# NOTE(review): the source lives under /content/drive, but no Drive mount step
# is visible in this notebook — confirm Drive is mounted before running this.
!cp '/content/drive/MyDrive/Final Project/Codes/animelist-goodbooks-recommendation/utils.py' .

In [7]:
import pandas as pd
import sys
import surprise
import cornac
import recommenders

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (
    rmse,
    mae,
    rsquared,
    exp_var,
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    get_top_k_items,
)
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.models.surprise.surprise_utils import (
    predict
)
from recommenders.utils.notebook_utils import store_metadata
from recommenders.models.cornac.cornac_utils import predict_ranking

from ranx import Qrels, Run, evaluate

from surprise import Dataset, NormalPredictor, Reader
from utils import compute_ranking_predictions

In [8]:
# Directory containing the train/test TSV splits that are read in the next cell.
# The trailing '/' combines with the '/' in those f-strings into a harmless '//'.
DATA_PATH = '/content/drive/MyDrive/Final Project/Codes/animelist-goodbooks-recommendation/ml-1m/data_split/'

In [9]:
# Load the tab-separated train and test interaction tables from DATA_PATH.
train_data, test_data = (
    pd.read_csv(f'{DATA_PATH}/{split_file}', sep='\t')
    for split_file in ('data_train_full.tsv', 'data_test.tsv')
)

In [10]:
# Row counts of the two splits, shown as (train, test).
train_data.shape[0], test_data.shape[0]

(665395, 166351)

In [11]:
# Model parameters
NUM_EPOCHS = 20
SEED=100

## Check Library Version

In [39]:
# Print the version of every major library this notebook relies on, one
# "<module name>: <version>" line each, so results can be tied to an environment.
from importlib.metadata import version as package_version

import pandas as pd
import numpy as np
import surprise
import cornac
import recommenders
import ranx
import seaborn
import matplotlib
import torch

for module in (pd, np, surprise, cornac, recommenders, seaborn, matplotlib, torch):
    print(f"{module.__name__}: {module.__version__}")

# ranx is imported here as well, so report it too. Its version is read from the
# installed distribution metadata, which does not depend on the package
# exposing a `__version__` attribute.
print(f"{ranx.__name__}: {package_version('ranx')}")

pandas: 2.2.2
numpy: 1.26.4
surprise: 1.1.4
cornac: 2.3.0
recommenders: 1.2.1
seaborn: 0.13.2
matplotlib: 3.10.0
torch: 2.6.0+cu124


## SVD

In [None]:
# Surprise needs a Reader to describe the data; only its rating scale is set here.
reader = Reader(rating_scale=(1, 5))

# load_from_df expects the columns ordered as (user id, item id, rating).
train_set = (
    Dataset
    .load_from_df(train_data[["user_id", "movie_id", "rating"]], reader)
    .build_full_trainset()
)
train_set

<surprise.trainset.Trainset at 0x7dedbf4982d0>

In [None]:
# Fit Surprise's SVD, overriding only the epoch count, seed and verbosity.
svd = surprise.SVD(n_epochs=NUM_EPOCHS, random_state=SEED, verbose=True)

# Time the fit so the training cost is visible in the cell output.
with Timer() as train_time:
    svd.fit(train_set)

print("Took {} seconds for training.".format(train_time.interval))

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Took 4.225704836000034 seconds for training.


In [None]:
# Build the SVD ranking predictions with the helper from utils.py and time it.
# remove_seen=True is passed so that, presumably, user/item pairs already in
# train_data are left out (the helper's implementation is not shown here).
with Timer() as test_time:
    all_predictions_svd = compute_ranking_predictions(
        svd,
        train_data,
        usercol="user_id",
        itemcol="movie_id",
        predcol="prediction",
        remove_seen=True,
    )

print("Took {} seconds for prediction.".format(test_time.interval))

  0%|          | 0/5755 [00:00<?, ?it/s]

Took 141.70557476300002 seconds for prediction.


## BPR and Mpop

In [None]:
# Wrap the existing train/test split in a cornac evaluation method.
# Each split is passed as a list of (user_id, movie_id, rating) row tuples.
eval_method = cornac.eval_methods.BaseMethod.from_splits(
    train_data=[
        row for row in train_data[['user_id', 'movie_id', 'rating']].itertuples(index=False)
    ],
    test_data=[
        row for row in test_data[['user_id', 'movie_id', 'rating']].itertuples(index=False)
    ],
    seed=SEED,
    verbose=True,
    exclude_unknowns=True,
)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 5755
Number of items = 3571
Number of ratings = 665395
Max rating = 5.0
Min rating = 3.0
Global mean = 4.0
---
Test data:
Number of users = 5755
Number of items = 3571
Number of ratings = 166276
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 5755
Total items = 3571


In [None]:
# Instantiate the two cornac baselines: MostPop (no options) and BPR with the
# shared epoch count and seed.
most_pop = cornac.models.MostPop()
bpr = cornac.models.BPR(seed=SEED, max_iter=NUM_EPOCHS, verbose=True)

In [None]:
# Train BPR on the cornac training set and report the elapsed time.
with Timer() as bpr_fit_timer:
    bpr.fit(eval_method.train_set)
print(f"Took {bpr_fit_timer} seconds for training.")

  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!
Took 2.5233 seconds for training.


In [None]:
# Train MostPop on the cornac training set and report the elapsed time.
with Timer() as most_pop_fit_timer:
    most_pop.fit(eval_method.train_set)
print(f"Took {most_pop_fit_timer} seconds for training.")

Took 0.0077 seconds for training.


In [None]:
# Produce BPR ranking scores via recommenders' predict_ranking and time the call.
with Timer() as bpr_predict_timer:
    all_predictions_bpr = predict_ranking(
        bpr,
        train_data,
        usercol='user_id',
        itemcol='movie_id',
        remove_seen=True,
    )
print(f"Took {bpr_predict_timer} seconds for prediction.")

Took 17.2196 seconds for prediction.


In [None]:
# Produce MostPop ranking scores via recommenders' predict_ranking and time the call.
with Timer() as most_pop_predict_timer:
    all_predictions_mpop = predict_ranking(
        most_pop,
        train_data,
        usercol='user_id',
        itemcol='movie_id',
        remove_seen=True,
    )
print(f"Took {most_pop_predict_timer} seconds for prediction.")

Took 18.4257 seconds for prediction.


## Evaluate

In [None]:
# Precision, Recall and NDCG at cutoffs 1, 10 and 20 for the SVD predictions,
# keyed as "<Metric>@<k>" in the order Precision, Recall, NDCG.
ranking_metrics_svd = {
    f'{label}@{cutoff}': metric_fn(
        test_data,
        all_predictions_svd,
        col_user="user_id",
        col_item="movie_id",
        col_rating='rating',
        k=cutoff,
    )
    for label, metric_fn in (
        ('Precision', precision_at_k),
        ('Recall', recall_at_k),
        ('NDCG', ndcg_at_k),
    )
    for cutoff in (1, 10, 20)
}

In [None]:
# Precision, Recall and NDCG at cutoffs 1, 10 and 20 for the BPR predictions,
# keyed as "<Metric>@<k>" in the order Precision, Recall, NDCG.
ranking_metrics_bpr = {
    f'{label}@{cutoff}': metric_fn(
        test_data,
        all_predictions_bpr,
        col_user="user_id",
        col_item="movie_id",
        col_rating='rating',
        k=cutoff,
    )
    for label, metric_fn in (
        ('Precision', precision_at_k),
        ('Recall', recall_at_k),
        ('NDCG', ndcg_at_k),
    )
    for cutoff in (1, 10, 20)
}

In [None]:
# Precision, Recall and NDCG at cutoffs 1, 10 and 20 for the MostPop predictions,
# keyed as "<Metric>@<k>" in the order Precision, Recall, NDCG.
ranking_metrics_mpop = {
    f'{label}@{cutoff}': metric_fn(
        test_data,
        all_predictions_mpop,
        col_user="user_id",
        col_item="movie_id",
        col_rating='rating',
        k=cutoff,
    )
    for label, metric_fn in (
        ('Precision', precision_at_k),
        ('Recall', recall_at_k),
        ('NDCG', ndcg_at_k),
    )
    for cutoff in (1, 10, 20)
}

In [None]:
# Side-by-side comparison table: one column per model, one row per metric.
pd.DataFrame(
    dict(SVD=ranking_metrics_svd, BPR=ranking_metrics_bpr, Mpop=ranking_metrics_mpop)
)

Unnamed: 0,SVD,BPR,Mpop
Precision@1,0.038401,0.12007,0.118158
Precision@10,0.037272,0.096525,0.095621
Precision@20,0.033745,0.087507,0.087359
Recall@1,0.001366,0.005203,0.005139
Recall@10,0.014402,0.039981,0.03969
Recall@20,0.026677,0.074399,0.074696
NDCG@1,0.038401,0.12007,0.118158
NDCG@10,0.039282,0.10422,0.103495
NDCG@20,0.03976,0.106022,0.10592


## Evaluation using Ranx

In [None]:
# Largest MostPop prediction score; used as the divisor when rescaling the scores.
max_item = all_predictions_mpop.loc[:, 'prediction'].max()

In [None]:
# Rescale the MostPop scores by their maximum (max_item).
all_predictions_mpop['prediction'] = all_predictions_mpop['prediction'].div(max_item)

In [None]:
test_data['user_id'] = test_data['user_id'].astype(str)
test_data['movie_id'] = test_data['movie_id'].astype(str)

all_predictions_svd['user_id'] = all_predictions_svd['user_id'].astype(str)
all_predictions_svd['movie_id'] = all_predictions_svd['movie_id'].astype(str)

all_predictions_bpr['user_id'] = all_predictions_bpr['user_id'].astype(str)
all_predictions_bpr['movie_id'] = all_predictions_bpr['movie_id'].astype(str)

all_predictions_mpop['user_id'] = all_predictions_mpop['user_id'].astype(str)
all_predictions_mpop['movie_id'] = all_predictions_mpop['movie_id'].astype(str)

In [None]:
qrels = Qrels.from_df(
    df=test_data,
    q_id_col="user_id",
    doc_id_col="movie_id",
    score_col="rating",
)

run_svd = Run.from_df(
    df=all_predictions_svd,
    q_id_col="user_id",
    doc_id_col="movie_id",
    score_col="prediction",
)

run_bpr = Run.from_df(
    df=all_predictions_bpr,
    q_id_col="user_id",
    doc_id_col="movie_id",
    score_col="prediction",
)

run_mpop = Run.from_df(
    df=all_predictions_mpop,
    q_id_col="user_id",
    doc_id_col="movie_id",
    score_col="prediction",
)

In [None]:
# Score every run against the same qrels with one shared ranx metric list;
# each model becomes a column, each metric a row.
ranx_metrics = ("mrr", "ndcg@10", "recall@10", "precision@10", "hit_rate@10")
rank_all_df = pd.DataFrame({
    model_name: evaluate(qrels, model_run, list(ranx_metrics))
    for model_name, model_run in (('SVD', run_svd), ('BPR', run_bpr), ('Mpop', run_mpop))
})

  scores[i] = _reciprocal_rank(qrels[i], run[i], k, rel_lvl)


In [None]:
# Display the ranx metric table (rows: metrics, columns: SVD / BPR / Mpop).
rank_all_df

Unnamed: 0,SVD,BPR,Mpop
mrr,0.099873,0.224575,0.224046
ndcg@10,0.036265,0.094955,0.094328
recall@10,0.014163,0.039981,0.039711
precision@10,0.036768,0.096525,0.095656
hit_rate@10,0.229887,0.445178,0.442224


## Parameters from Hyperparameter Optimization

### Singular Value Decomposition

In [12]:
# A reader is still needed but only the rating_scale param is required.
# NOTE(review): the scale here is (1, 10), whereas the earlier SVD section of this
# notebook uses (1, 5) and the cornac summary reports "Max rating = 5.0" for the
# same training data — confirm whether the wider scale is intentional (e.g. the
# setting used during the hyperparameter search) or a leftover from another dataset.
reader = Reader(rating_scale=(1, 10))

# The columns must correspond to user id, item id and ratings (in that order).
train_set = Dataset.load_from_df(train_data[["user_id", "movie_id", "rating"]], reader).build_full_trainset()
# Bare expression: shows the Trainset repr as the cell output.
train_set

<surprise.trainset.Trainset at 0x7b848d41f490>

In [13]:
# Refit SVD with explicit hyperparameters (per the section heading, the values
# come from hyperparameter optimisation).
svd = surprise.SVD(
    n_factors=20,
    n_epochs=20,
    lr_all=0.0073,
    reg_all=0.0016,
    random_state=SEED,
    verbose=False,
)

# Time the fit so the training cost is visible in the cell output.
with Timer() as train_time:
    svd.fit(train_set)

print("Took {} seconds for training.".format(train_time.interval))

Took 1.7735709260000476 seconds for training.


In [14]:
# Build ranking predictions for the tuned SVD with the helper from utils.py and
# time it. remove_seen=True is passed so that, presumably, pairs already in
# train_data are left out (the helper's implementation is not shown here).
with Timer() as test_time:
    all_predictions_svd = compute_ranking_predictions(
        svd,
        train_data,
        usercol="user_id",
        itemcol="movie_id",
        predcol="prediction",
        remove_seen=True,
    )

print("Took {} seconds for prediction.".format(test_time.interval))

  0%|          | 0/5755 [00:00<?, ?it/s]

Took 147.997951166 seconds for prediction.


### BPR and Mpop

In [15]:
# Wrap the existing train/test split in a cornac evaluation method.
# Each split is passed as a list of (user_id, movie_id, rating) row tuples.
eval_method = cornac.eval_methods.BaseMethod.from_splits(
    train_data=[
        row for row in train_data[['user_id', 'movie_id', 'rating']].itertuples(index=False)
    ],
    test_data=[
        row for row in test_data[['user_id', 'movie_id', 'rating']].itertuples(index=False)
    ],
    seed=SEED,
    verbose=True,
    exclude_unknowns=True,
)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 5755
Number of items = 3571
Number of ratings = 665395
Max rating = 5.0
Min rating = 3.0
Global mean = 4.0
---
Test data:
Number of users = 5755
Number of items = 3571
Number of ratings = 166276
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 5755
Total items = 3571


In [16]:
# MostPop takes no options; BPR is configured with explicit hyperparameters
# (per the section heading, the values come from hyperparameter optimisation).
most_pop = cornac.models.MostPop()
bpr = cornac.models.BPR(
    k=200,
    max_iter=70,
    learning_rate=0.0869,
    lambda_reg=0.0208,
    seed=SEED,
    verbose=True,
)

In [17]:
# Train the tuned BPR model on the cornac training set and report the elapsed time.
with Timer() as bpr_fit_timer:
    bpr.fit(eval_method.train_set)
print(f"Took {bpr_fit_timer} seconds for training.")

  0%|          | 0/70 [00:00<?, ?it/s]

Optimization finished!
Took 23.4470 seconds for training.


In [18]:
# Train MostPop on the cornac training set and report the elapsed time.
with Timer() as most_pop_fit_timer:
    most_pop.fit(eval_method.train_set)
print(f"Took {most_pop_fit_timer} seconds for training.")

Took 0.0103 seconds for training.


In [19]:
# Produce ranking scores for the tuned BPR model and time the call.
with Timer() as bpr_predict_timer:
    all_predictions_bpr = predict_ranking(
        bpr,
        train_data,
        usercol='user_id',
        itemcol='movie_id',
        remove_seen=True,
    )
print(f"Took {bpr_predict_timer} seconds for prediction.")

Took 18.5722 seconds for prediction.


In [20]:
# Produce ranking scores for MostPop and time the call.
with Timer() as most_pop_predict_timer:
    all_predictions_mpop = predict_ranking(
        most_pop,
        train_data,
        usercol='user_id',
        itemcol='movie_id',
        remove_seen=True,
    )
print(f"Took {most_pop_predict_timer} seconds for prediction.")

Took 19.5534 seconds for prediction.


In [22]:
import pickle

In [26]:
# Persist the ranking predictions of the tuned models as pickle files under
# PREDICTION_PATH (the directory is expected to exist already).
PREDICTION_PATH = '/content/drive/MyDrive/Final Project/Codes/animelist-goodbooks-recommendation/ml-1m/predictions'
for file_name, predictions_df in (
    ('predictions_svd_opt.pkl', all_predictions_svd),
    ('predictions_pop_opt.pkl', all_predictions_mpop),
    ('predictions_bpr_opt.pkl', all_predictions_bpr),
):
    # The context manager closes the file as soon as the dump finishes (or
    # fails), so no handle is left open and buffered data reaches disk.
    with open(f'{PREDICTION_PATH}/{file_name}', 'wb') as pickle_file:
        pickle.dump(predictions_df, pickle_file)

### Evaluation

In [27]:
# Largest MostPop prediction score; used as the divisor when rescaling the scores.
max_item = all_predictions_mpop.loc[:, 'prediction'].max()

In [28]:
# Rescale the MostPop scores by their maximum (max_item).
all_predictions_mpop['prediction'] = all_predictions_mpop['prediction'].div(max_item)

In [29]:
test_data['user_id'] = test_data['user_id'].astype(str)
test_data['movie_id'] = test_data['movie_id'].astype(str)

all_predictions_svd['user_id'] = all_predictions_svd['user_id'].astype(str)
all_predictions_svd['movie_id'] = all_predictions_svd['movie_id'].astype(str)

all_predictions_bpr['user_id'] = all_predictions_bpr['user_id'].astype(str)
all_predictions_bpr['movie_id'] = all_predictions_bpr['movie_id'].astype(str)

all_predictions_mpop['user_id'] = all_predictions_mpop['user_id'].astype(str)
all_predictions_mpop['movie_id'] = all_predictions_mpop['movie_id'].astype(str)

In [30]:
qrels = Qrels.from_df(
    df=test_data,
    q_id_col="user_id",
    doc_id_col="movie_id",
    score_col="rating",
)

run_svd = Run.from_df(
    df=all_predictions_svd,
    q_id_col="user_id",
    doc_id_col="movie_id",
    score_col="prediction",
)

run_bpr = Run.from_df(
    df=all_predictions_bpr,
    q_id_col="user_id",
    doc_id_col="movie_id",
    score_col="prediction",
)

run_mpop = Run.from_df(
    df=all_predictions_mpop,
    q_id_col="user_id",
    doc_id_col="movie_id",
    score_col="prediction",
)

In [31]:
# Score every run against the same qrels with one shared ranx metric list;
# each model becomes a column, each metric a row.
ranx_metrics = ("mrr", "ndcg@10", "recall@10", "precision@10", "hit_rate@10")
rank_all_df = pd.DataFrame({
    model_name: evaluate(qrels, model_run, list(ranx_metrics))
    for model_name, model_run in (('SVD', run_svd), ('BPR', run_bpr), ('Mpop', run_mpop))
})

  scores[i] = _reciprocal_rank(qrels[i], run[i], k, rel_lvl)


In [32]:
# Display the ranx metric table for the tuned models (rows: metrics, columns: SVD / BPR / Mpop).
rank_all_df

Unnamed: 0,SVD,BPR,Mpop
mrr,0.15543,0.265174,0.224046
ndcg@10,0.057955,0.114512,0.094328
recall@10,0.021085,0.061949,0.039711
precision@10,0.055013,0.113275,0.095656
hit_rate@10,0.307385,0.560209,0.442224
