In [None]:
!pip install recommenders surprise cornac==2.3.0 ranx

In [None]:
!add-apt-repository ppa:ubuntu-toolchain-r/test
!apt-get update
!apt-get install --only-upgrade libstdc++6

In [None]:
!cp '/content/drive/MyDrive/Final Project/Codes/animelist-goodbooks-recommendation/utils.py' .

In [None]:
import pickle
import pandas as pd
import sys
import surprise
import cornac
import recommenders

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (
    rmse,
    mae,
    rsquared,
    exp_var,
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    get_top_k_items,
)
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.models.surprise.surprise_utils import (
    predict
)
from recommenders.utils.notebook_utils import store_metadata
from recommenders.models.cornac.cornac_utils import predict_ranking

from ranx import Qrels, Run, evaluate

from surprise import Dataset, NormalPredictor, Reader
from utils import compute_ranking_predictions

In [None]:
# Model parameters
NUM_EPOCHS = 20
SEED=100

In [None]:
DATA_PATH = '/content/drive/MyDrive/Final Project/Codes/animelist-goodbooks-recommendation/goodbooks-10k/data_sample_split/'

In [None]:
train_data = pd.read_csv(f'{DATA_PATH}/data_train_full.tsv', sep='\t')
test_data = pd.read_csv(f'{DATA_PATH}/data_test.tsv', sep='\t')

In [None]:
len(train_data), len(test_data)

(462630, 115607)

In [None]:
# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
train_set = Dataset.load_from_df(train_data[["user_id", "book_id", "rating"]], reader).build_full_trainset()
train_set

<surprise.trainset.Trainset at 0x7ed381f737d0>

In [None]:
svd = surprise.SVD(random_state=SEED, n_epochs=NUM_EPOCHS, verbose=True)

with Timer() as train_time:
    svd.fit(train_set)

print(f"Took {train_time.interval} seconds for training.")

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Took 7.287154802000032 seconds for training.


In [None]:
with Timer() as test_time:
    all_predictions_svd = compute_ranking_predictions(
        svd, train_data, usercol="user_id", itemcol="book_id", predcol="prediction", remove_seen=True
    )

print(f"Took {test_time.interval} seconds for prediction.")

  0%|          | 0/10000 [00:00<?, ?it/s]

Took 133.37008921300003 seconds for prediction.


## BPR and Mpop

In [None]:
eval_method = cornac.eval_methods.BaseMethod.from_splits(
  train_data=list(train_data[['user_id', 'book_id', 'rating']].itertuples(index=False)),
  test_data=list(test_data[['user_id', 'book_id', 'rating']].itertuples(index=False)),
  exclude_unknowns=False,
  verbose=True,
  seed=SEED,
)

rating_threshold = 1.0
exclude_unknowns = False
---
Training data:
Number of users = 10000
Number of items = 1000
Number of ratings = 462630
Max rating = 5.0
Min rating = 3.0
Global mean = 4.1
---
Test data:
Number of users = 10000
Number of items = 1000
Number of ratings = 115607
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 10000
Total items = 1000


In [None]:
bpr = cornac.models.BPR(
    max_iter=NUM_EPOCHS,
    verbose=True,
    seed=SEED
)
most_pop = cornac.models.MostPop()

In [None]:
with Timer() as t:
    bpr.fit(eval_method.train_set)
print("Took {} seconds for training.".format(t))

  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!
Took 3.1566 seconds for training.


In [None]:
with Timer() as t:
    most_pop.fit(eval_method.train_set)
print("Took {} seconds for training.".format(t))

Took 0.0111 seconds for training.


In [None]:
with Timer() as t:
    all_predictions_bpr = predict_ranking(bpr, train_data, usercol='user_id', itemcol='book_id', remove_seen=True)
print("Took {} seconds for prediction.".format(t))

Took 15.7647 seconds for prediction.


In [None]:
with Timer() as t:
  all_predictions_mpop = predict_ranking(most_pop, train_data, usercol='user_id', itemcol='book_id', remove_seen=True)
print("Took {} seconds for prediction.".format(t))

Took 16.9618 seconds for prediction.


## Evaluate

In [None]:
ranking_metrics_svd = {
    'Precision@1' : precision_at_k(test_data, all_predictions_svd, col_user="user_id", col_item="book_id", col_rating='rating', k=1),
    'Precision@10' : precision_at_k(test_data, all_predictions_svd, col_user="user_id", col_item="book_id", col_rating='rating', k=10),
    'Precision@20' : precision_at_k(test_data, all_predictions_svd, col_user="user_id", col_item="book_id", col_rating='rating', k=20),
    'Recall@1' : recall_at_k(test_data, all_predictions_svd, col_user="user_id", col_item="book_id", col_rating='rating', k=1),
    'Recall@10' : recall_at_k(test_data, all_predictions_svd, col_user="user_id", col_item="book_id", col_rating='rating', k=10),
    'Recall@20' : recall_at_k(test_data, all_predictions_svd, col_user="user_id", col_item="book_id", col_rating='rating', k=20),
    'NDCG@1' : ndcg_at_k(test_data, all_predictions_svd, col_user="user_id", col_item="book_id", col_rating='rating', k=1),
    'NDCG@10' : ndcg_at_k(test_data, all_predictions_svd, col_user="user_id", col_item="book_id", col_rating='rating', k=10),
    'NDCG@20' : ndcg_at_k(test_data, all_predictions_svd, col_user="user_id", col_item="book_id", col_rating='rating', k=20)
}

In [None]:
ranking_metrics_bpr = {
    'Precision@1' : precision_at_k(test_data, all_predictions_bpr, col_user="user_id", col_item="book_id", col_rating='rating', k=1),
    'Precision@10' : precision_at_k(test_data, all_predictions_bpr, col_user="user_id", col_item="book_id", col_rating='rating', k=10),
    'Precision@20' : precision_at_k(test_data, all_predictions_bpr, col_user="user_id", col_item="book_id", col_rating='rating', k=20),
    'Recall@1' : recall_at_k(test_data, all_predictions_bpr, col_user="user_id", col_item="book_id", col_rating='rating', k=1),
    'Recall@10' : recall_at_k(test_data, all_predictions_bpr, col_user="user_id", col_item="book_id", col_rating='rating', k=10),
    'Recall@20' : recall_at_k(test_data, all_predictions_bpr, col_user="user_id", col_item="book_id", col_rating='rating', k=20),
    'NDCG@1' : ndcg_at_k(test_data, all_predictions_bpr, col_user="user_id", col_item="book_id", col_rating='rating', k=1),
    'NDCG@10' : ndcg_at_k(test_data, all_predictions_bpr, col_user="user_id", col_item="book_id", col_rating='rating', k=10),
    'NDCG@20' : ndcg_at_k(test_data, all_predictions_bpr, col_user="user_id", col_item="book_id", col_rating='rating', k=20)
}

In [None]:
ranking_metrics_mpop = {
    'Precision@1' : precision_at_k(test_data, all_predictions_mpop, col_user="user_id", col_item="book_id", col_rating='rating', k=1),
    'Precision@10' : precision_at_k(test_data, all_predictions_mpop, col_user="user_id", col_item="book_id", col_rating='rating', k=10),
    'Precision@20' : precision_at_k(test_data, all_predictions_mpop, col_user="user_id", col_item="book_id", col_rating='rating', k=20),
    'Recall@1' : recall_at_k(test_data, all_predictions_mpop, col_user="user_id", col_item="book_id", col_rating='rating', k=1),
    'Recall@10' : recall_at_k(test_data, all_predictions_mpop, col_user="user_id", col_item="book_id", col_rating='rating', k=10),
    'Recall@20' : recall_at_k(test_data, all_predictions_mpop, col_user="user_id", col_item="book_id", col_rating='rating', k=20),
    'NDCG@1' : ndcg_at_k(test_data, all_predictions_mpop, col_user="user_id", col_item="book_id", col_rating='rating', k=1),
    'NDCG@10' : ndcg_at_k(test_data, all_predictions_mpop, col_user="user_id", col_item="book_id", col_rating='rating', k=10),
    'NDCG@20' : ndcg_at_k(test_data, all_predictions_mpop, col_user="user_id", col_item="book_id", col_rating='rating', k=20)
}

In [None]:
pd.DataFrame({"SVD":ranking_metrics_svd, "BPR":ranking_metrics_bpr, "Mpop": ranking_metrics_mpop})

Unnamed: 0,SVD,BPR,Mpop
Precision@1,0.0278,0.0708,0.0708
Precision@10,0.02307,0.05735,0.05766
Precision@20,0.021275,0.046305,0.04607
Recall@1,0.00251,0.006005,0.00601
Recall@10,0.020682,0.049613,0.049923
Recall@20,0.037998,0.079001,0.078684
NDCG@1,0.0278,0.0708,0.0708
NDCG@10,0.025133,0.063096,0.063561
NDCG@20,0.031819,0.071189,0.071213


## Evaluation using Ranx

In [None]:
max_item = all_predictions_mpop['prediction'].max()

In [None]:
all_predictions_mpop['prediction'] = all_predictions_mpop['prediction']/max_item

In [None]:
test_data['user_id'] = test_data['user_id'].astype(str)
test_data['book_id'] = test_data['book_id'].astype(str)

all_predictions_svd['user_id'] = all_predictions_svd['user_id'].astype(str)
all_predictions_svd['book_id'] = all_predictions_svd['book_id'].astype(str)

all_predictions_bpr['user_id'] = all_predictions_bpr['user_id'].astype(str)
all_predictions_bpr['book_id'] = all_predictions_bpr['book_id'].astype(str)

all_predictions_mpop['user_id'] = all_predictions_mpop['user_id'].astype(str)
all_predictions_mpop['book_id'] = all_predictions_mpop['book_id'].astype(str)

In [None]:
qrels = Qrels.from_df(
    df=test_data,
    q_id_col="user_id",
    doc_id_col="book_id",
    score_col="rating",
)

run_svd = Run.from_df(
    df=all_predictions_svd,
    q_id_col="user_id",
    doc_id_col="book_id",
    score_col="prediction",
)

run_bpr = Run.from_df(
    df=all_predictions_bpr,
    q_id_col="user_id",
    doc_id_col="book_id",
    score_col="prediction",
)

run_mpop = Run.from_df(
    df=all_predictions_mpop,
    q_id_col="user_id",
    doc_id_col="book_id",
    score_col="prediction",
)

In [None]:
rank_all_df = pd.DataFrame({'SVD': evaluate(qrels, run_svd, ["mrr", "ndcg@10", "recall@10", "precision@10", "hit_rate@10"]),
                            'BPR': evaluate(qrels, run_bpr, ["mrr", "ndcg@10", "recall@10", "precision@10", "hit_rate@10"]),
                            'Mpop': evaluate(qrels, run_mpop, ["mrr", "ndcg@10", "recall@10", "precision@10", "hit_rate@10"])})

  scores[i] = _reciprocal_rank(qrels[i], run[i], k, rel_lvl)


In [None]:
rank_all_df

Unnamed: 0,SVD,BPR,Mpop
mrr,0.067098,0.154034,0.153882
ndcg@10,0.021006,0.060375,0.060886
recall@10,0.017756,0.049613,0.049924
precision@10,0.01998,0.05735,0.05767
hit_rate@10,0.1481,0.2959,0.2922


## Parameters from Hyperparameter Opt

### Singular Value Decomposition

In [None]:
# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 10))

# The columns must correspond to user id, item id and ratings (in that order).
train_set = Dataset.load_from_df(train_data[["user_id", "book_id", "rating"]], reader).build_full_trainset()
train_set

<surprise.trainset.Trainset at 0x797c37051250>

In [None]:
svd = surprise.SVD(random_state=SEED,
                         n_epochs=20,
                         verbose=False,
                         n_factors=60,
                         lr_all = 0.0066,
                         reg_all = 0.0020
                         )

with Timer() as train_time:
    svd.fit(train_set)

print(f"Took {train_time.interval} seconds for training.")

Took 4.228312179 seconds for training.


In [None]:
with Timer() as test_time:
    all_predictions_svd = compute_ranking_predictions(
        svd, train_data, usercol="user_id", itemcol="book_id", predcol="prediction", remove_seen=True
    )

print(f"Took {test_time.interval} seconds for prediction.")

  0%|          | 0/10000 [00:00<?, ?it/s]

Took 142.72172613600003 seconds for prediction.


### BPR and Mpop

In [None]:
eval_method = cornac.eval_methods.BaseMethod.from_splits(
  train_data=list(train_data[['user_id', 'book_id', 'rating']].itertuples(index=False)),
  test_data=list(test_data[['user_id', 'book_id', 'rating']].itertuples(index=False)),
  exclude_unknowns=True,
  verbose=True,
  seed=SEED,
)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 10000
Number of items = 1000
Number of ratings = 462630
Max rating = 5.0
Min rating = 3.0
Global mean = 4.1
---
Test data:
Number of users = 10000
Number of items = 1000
Number of ratings = 115607
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 10000
Total items = 1000


In [None]:
bpr = cornac.models.BPR(
    k=180,
    max_iter=150,
    learning_rate=0.0672,
    lambda_reg=0.0013,
    verbose=True,
    seed=SEED
  )
most_pop = cornac.models.MostPop()

In [None]:
with Timer() as t:
    bpr.fit(eval_method.train_set)
print("Took {} seconds for training.".format(t))

  0%|          | 0/150 [00:00<?, ?it/s]

Optimization finished!
Took 53.4238 seconds for training.


In [None]:
with Timer() as t:
    most_pop.fit(eval_method.train_set)
print("Took {} seconds for training.".format(t))

Took 0.0106 seconds for training.


In [None]:
with Timer() as t:
    all_predictions_bpr = predict_ranking(bpr, train_data, usercol='user_id', itemcol='book_id', remove_seen=True)
print("Took {} seconds for prediction.".format(t))

Took 17.6850 seconds for prediction.


In [None]:
with Timer() as t:
    all_predictions_mpop = predict_ranking(most_pop, train_data, usercol='user_id', itemcol='book_id', remove_seen=True)
print("Took {} seconds for prediction.".format(t))

Took 16.1732 seconds for prediction.


In [None]:
PREDICTION_PATH = '/content/drive/MyDrive/Final Project/Codes/animelist-goodbooks-recommendation/goodbooks-10k/predictions'
pickle.dump(all_predictions_svd, open(f'{PREDICTION_PATH}/predictions_svd_opt.pkl', 'wb'))
pickle.dump(all_predictions_mpop, open(f'{PREDICTION_PATH}/predictions_pop_opt.pkl', 'wb'))
pickle.dump(all_predictions_bpr, open(f'{PREDICTION_PATH}/predictions_bpr_opt.pkl', 'wb'))

### Evaluation

In [None]:
max_item = all_predictions_mpop['prediction'].max()

In [None]:
all_predictions_mpop['prediction'] = all_predictions_mpop['prediction']/max_item

In [None]:
test_data['user_id'] = test_data['user_id'].astype(str)
test_data['book_id'] = test_data['book_id'].astype(str)

all_predictions_svd['user_id'] = all_predictions_svd['user_id'].astype(str)
all_predictions_svd['book_id'] = all_predictions_svd['book_id'].astype(str)

all_predictions_bpr['user_id'] = all_predictions_bpr['user_id'].astype(str)
all_predictions_bpr['book_id'] = all_predictions_bpr['book_id'].astype(str)

all_predictions_mpop['user_id'] = all_predictions_mpop['user_id'].astype(str)
all_predictions_mpop['book_id'] = all_predictions_mpop['book_id'].astype(str)

In [None]:
qrels = Qrels.from_df(
    df=test_data,
    q_id_col="user_id",
    doc_id_col="book_id",
    score_col="rating",
)

run_svd = Run.from_df(
    df=all_predictions_svd,
    q_id_col="user_id",
    doc_id_col="book_id",
    score_col="prediction",
)

run_bpr = Run.from_df(
    df=all_predictions_bpr,
    q_id_col="user_id",
    doc_id_col="book_id",
    score_col="prediction",
)

run_mpop = Run.from_df(
    df=all_predictions_mpop,
    q_id_col="user_id",
    doc_id_col="book_id",
    score_col="prediction",
)

In [None]:
rank_all_df = pd.DataFrame({'SVD': evaluate(qrels, run_svd, ["mrr", "ndcg@10", "recall@10", "precision@10", "hit_rate@10"]),
                            'BPR': evaluate(qrels, run_bpr, ["mrr", "ndcg@10", "recall@10", "precision@10", "hit_rate@10"]),
                            'Mpop': evaluate(qrels, run_mpop, ["mrr", "ndcg@10", "recall@10", "precision@10", "hit_rate@10"])})

In [26]:
rank_all_df

Unnamed: 0,SVD,BPR,Mpop
mrr,0.097372,0.255247,0.153882
ndcg@10,0.029146,0.104644,0.060886
recall@10,0.022674,0.094484,0.049924
precision@10,0.02635,0.09423,0.05767
hit_rate@10,0.2059,0.5653,0.2922
