In [1]:
!pip install recommenders cornac==2.3.0 ranx

Collecting recommenders
  Downloading recommenders-1.2.1-py3-none-any.whl.metadata (13 kB)
Collecting cornac==2.3.0
  Downloading cornac-2.3.0-cp311-cp311-manylinux1_x86_64.whl.metadata (37 kB)
Collecting ranx
  Downloading ranx-0.3.20-py3-none-any.whl.metadata (17 kB)
Collecting numpy<2.0.0 (from cornac==2.3.0)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<=1.13.1 (from cornac==2.3.0)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting powerlaw (from cornac==2.3.0)
  Downloading powerlaw-1.5-py3-none-any.whl.metadata (9.3 kB)
Collecting category-encoders<3,>=2.6.0 (from recommenders)
  Downloading category_enc

In [2]:
!add-apt-repository ppa:ubuntu-toolchain-r/test
!apt-get update
!apt-get install --only-upgrade libstdc++6

PPA publishes dbgsym, you may need to include 'main/debug' component
Repository: 'deb https://ppa.launchpadcontent.net/ubuntu-toolchain-r/test/ubuntu/ jammy main'
Description:
Toolchain test builds; see https://wiki.ubuntu.com/ToolChain

More info: https://launchpad.net/~ubuntu-toolchain-r/+archive/ubuntu/test
Adding repository.
Press [ENTER] to continue or Ctrl-c to cancel.
Adding deb entry to /etc/apt/sources.list.d/ubuntu-toolchain-r-ubuntu-test-jammy.list
Adding disabled deb-src entry to /etc/apt/sources.list.d/ubuntu-toolchain-r-ubuntu-test-jammy.list
Adding key to /etc/apt/trusted.gpg.d/ubuntu-toolchain-r-ubuntu-test.gpg with fingerprint C8EC952E2A0E1FBDC5090F6A2C277A0A352154E5
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,659 kB]
Hit:4 http://archi

In [1]:
import pandas as pd
import sys
import cornac
import tensorflow as tf
import recommenders
import pickle

from recommenders.utils.timer import Timer
from recommenders.evaluation.python_evaluation import (
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
)

from ranx import Qrels, Run, evaluate

# Record the runtime versions next to the results for reproducibility.
print(f"System version: {sys.version}")
# tf.version is a module (printing it shows the module repr);
# tf.__version__ is the actual version string.
print(f"Tensorflow version: {tf.__version__}")
print(f"Cornac version: {cornac.__version__}")
print(f"Recommenders version: {recommenders.__version__}")

System version: 3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0]
Tensorflow version: <module 'tensorflow._api.v2.version' from '/usr/local/lib/python3.11/dist-packages/tensorflow/_api/v2/version/__init__.py'>
Cornac version: 2.3.0
Recommenders version: 1.2.1


In [2]:
# List the physical devices visible to TensorFlow (the saved output shows one CPU and one GPU).
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## GRU

In [3]:
# Directory containing the train/test TSV splits.
# NOTE(review): looks like a mounted Google Drive path, but no mount step is visible — confirm.
# NOTE(review): the value ends with '/' and the read_csv calls join with another '/',
# which yields '//' in the final paths.
DATA_PATH = '/content/drive/MyDrive/Final Project/Codes/animelist-goodbooks-recommendation/goodbooks-10k/data_sample_split/'

In [4]:
# Read the tab-separated train and test splits from DATA_PATH.
train_path = f'{DATA_PATH}/data_train_full.tsv'
test_path = f'{DATA_PATH}/data_test.tsv'

train_data = pd.read_csv(train_path, sep='\t')
test_data = pd.read_csv(test_path, sep='\t')

In [5]:
# Shared experiment constants.
# top k items to recommend
# NOTE(review): TOP_K is not referenced by the cells visible here; the metric
# cut-offs are written out literally where the metrics are computed.
TOP_K = 20
# Number of training epochs passed to the default-parameter GRU4Rec model.
NUM_EPOCHS = 20
# Random seed passed to both GRU4Rec models.
SEED=100

In [6]:
# Replace the 'time' column of both splits with whole seconds derived from the
# UTC-parsed timestamp (integer value floor-divided by 10**9).
# NOTE(review): this overwrites the column in place, so re-running the cell
# would re-parse the already converted integers — confirm before re-running.
for split_df in (train_data, test_data):
    parsed_time = pd.to_datetime(split_df["time"], utc=True)
    split_df['time'] = parsed_time.astype(int) // 10**9

In [7]:
from cornac.eval_methods import NextItemEvaluation

# Build the next-item evaluation from the pre-made splits.
# With fmt="SIT" the first tuple field fills the session slot, so each user_id
# is treated as one session (the printed summary reports 1 user and 10000
# sessions per split).
sit_columns = ['user_id', 'book_id', 'time']
train_tuples = list(train_data[sit_columns].itertuples(index=False))
test_tuples = list(test_data[sit_columns].itertuples(index=False))

next_item_eval = NextItemEvaluation.from_splits(
    train_data=train_tuples,
    test_data=test_tuples,
    exclude_unknowns=True,
    verbose=True,
    fmt="SIT",
)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 1
Number of items = 1000
Number of sessions = 10000
---
Test data:
Number of users = 1
Number of items = 1000
Number of sessions = 10000
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 1
Total items = 1000
Total sessions = 20000


## Default Parameters

In [None]:
 # GRU4Rec baseline: only epochs, device, verbosity, batch size and seed are
 # set explicitly; every other hyperparameter is left at the library default.
 gru = cornac.models.GRU4Rec(
        n_epochs=NUM_EPOCHS,
        device="cuda",
        verbose=True,
        batch_size=512,
        seed=SEED,
    )

In [None]:
# Train on the training split and report the wall-clock time measured by Timer.
with Timer() as t:
    gru.fit(next_item_eval.train_set)
print("Took {} seconds for training.".format(t))

  0%|          | 0/20 [00:00<?, ?it/s]

Took 940.8154 seconds for training.


In [None]:
# NOTE(review): presumably lets the model prepare/cache test-set state before
# ranking — confirm against cornac's Recommender.transform documentation.
gru.transform(next_item_eval.test_set)

In [12]:
from collections import OrderedDict, defaultdict
import numpy as np
from tqdm.notebook import tqdm
import itertools

def ranking_eval(
    model,
    train_set,
    test_set,
    exclude_unknowns=True,
    mode="last",
    verbose=False,
):
    """Rank all candidate items for the evaluated positions of every test session.

    Sessions with fewer than two items are skipped. For each remaining session
    the model is asked to rank every candidate item given the items that
    precede the evaluated position.

    Parameters
    ----------
    model
        Object exposing ``rank(user_idx, item_indices, history_items=...,
        history_mapped_ids=..., sessions=..., session_indices=...,
        extra_data=...)`` and returning ``(item_rank, item_scores)``.
    train_set
        Training dataset; ``num_items`` and ``iid_map`` (raw id -> index) are read.
    test_set
        Test dataset; ``si_iter``, ``sessions``, ``session_indices``,
        ``extra_data``, ``uir_tuple`` and ``num_items`` are read.
    exclude_unknowns : bool
        When True, candidates are limited to the first ``train_set.num_items``
        item indices; otherwise all ``test_set.num_items`` indices are used.
    mode : str
        ``"next"`` evaluates every position from the second item onwards; any
        other value (default ``"last"``) evaluates only the final position.
    verbose : bool
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    rankings : list of list
        One list per evaluated position holding raw item ids (keys of
        ``train_set.iid_map``) in the order returned by ``model.rank``.
        Indices that have no entry in ``train_set.iid_map`` are omitted.
    scores : list
        One entry per evaluated position: ``item_scores[item_rank]``.
    """
    # Invert raw-id -> index once. The previous per-item scan of iid_map was
    # quadratic in the number of items for every evaluated position.
    # Assumes iid_map is one-to-one — TODO confirm.
    idx_to_raw = {idx: raw_id for raw_id, idx in train_set.iid_map.items()}

    # The positive and negative ground-truth masks always summed to all ones,
    # so the candidate set is simply every index in range.
    if exclude_unknowns:
        num_candidates = min(test_set.num_items, train_set.num_items)
    else:
        num_candidates = test_set.num_items
    item_indices = np.arange(num_candidates)

    rankings = []
    scores = []
    for [sid], [mapped_ids], [session_items] in tqdm(
        test_set.si_iter(batch_size=1, shuffle=False),
        total=len(test_set.sessions),
    ):
        if len(session_items) < 2:  # exclude all session with size smaller than 2
            continue
        user_idx = test_set.uir_tuple[0][mapped_ids[0]]

        start_pos = 1 if mode == "next" else len(session_items) - 1
        for test_pos in range(start_pos, len(session_items)):
            item_rank, item_scores = model.rank(
                user_idx,
                item_indices,
                history_items=session_items[:test_pos],
                history_mapped_ids=mapped_ids[:test_pos],
                sessions=test_set.sessions,
                session_indices=test_set.session_indices,
                extra_data=test_set.extra_data,
            )
            # Reorder the scores to follow the ranking, then translate the
            # ranked indices back to raw item ids.
            scores.append(item_scores[item_rank])
            rankings.append(
                [idx_to_raw[idx] for idx in item_rank if idx in idx_to_raw]
            )

    return rankings, scores

In [None]:
# Rank every candidate item for the last position of each test session.
gru_ranking, gru_scores = ranking_eval(
    model=gru,
    train_set=next_item_eval.train_set,
    test_set=next_item_eval.test_set,
    exclude_unknowns=True,
    mode="last",
    verbose=False,
)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
# Sanity check: one ranking list and one score array per evaluated session.
tuple(len(per_session) for per_session in (gru_ranking, gru_scores))

(10000, 10000)

In [None]:
# Repeat every session id once per item so the ids line up with the flattened rankings.
# NOTE(review): this assumes the rankings follow the iteration order of
# gru.train_set.sid_map and that no session was skipped — verify.
item = list(gru.train_set.iid_map.keys())
users = []
for uid in gru.train_set.sid_map:
    users += [uid] * len(item)

In [None]:
# Concatenate the per-session rankings and scores into two flat lists.
flattened_ranking = list(itertools.chain.from_iterable(gru_ranking))
flattened_score = list(itertools.chain.from_iterable(gru_scores))

In [None]:
# The three flat columns must have the same length before building the frame.
tuple(len(column) for column in (users, flattened_ranking, flattened_score))

(10000000, 10000000, 10000000)

In [None]:
# Long-format prediction table: one row per (session id, item, score).
prediction_columns = {
    'user_id': users,
    'book_id': flattened_ranking,
    'prediction': flattened_score,
}
df_gru_predictions = pd.DataFrame(prediction_columns)

In [None]:
# Score the GRU predictions with the recommenders metrics at k = 1, 10, 20.
# A single comprehension replaces nine copy-pasted calls; the resulting keys
# and their order are unchanged (Precision@1 ... NDCG@20).
metric_functions = {
    'Precision': precision_at_k,
    'Recall': recall_at_k,
    'NDCG': ndcg_at_k,
}
ranking_metrics_gru = {
    f'{metric_name}@{k}': metric_fn(
        test_data,
        df_gru_predictions,
        col_user="user_id",
        col_item="book_id",
        col_rating='rating',
        k=k,
    )
    for metric_name, metric_fn in metric_functions.items()
    for k in (1, 10, 20)
}

In [None]:
# Display the metrics as a one-column table indexed by metric name.
pd.DataFrame.from_dict(ranking_metrics_gru, orient='index', columns=['metrics'])

Unnamed: 0,metrics
Precision@1,0.2068
Precision@10,0.1325
Precision@20,0.107835
Recall@1,0.020919
Recall@10,0.12769
Recall@20,0.204923
NDCG@1,0.2068
NDCG@10,0.157806
NDCG@20,0.184158


In [None]:
# Cast the id columns of both frames to str and the prediction column to float
# before handing the frames to ranx.
for frame in (test_data, df_gru_predictions):
    for id_column in ('user_id', 'book_id'):
        frame[id_column] = frame[id_column].astype(str)

df_gru_predictions['prediction'] = df_gru_predictions['prediction'].astype(float)

In [None]:
# Ground truth (qrels) comes from the test ratings; the run comes from the
# model predictions. Both use the same query/document id columns.
id_columns = dict(q_id_col="user_id", doc_id_col="book_id")

qrels = Qrels.from_df(df=test_data, score_col="rating", **id_columns)
run = Run.from_df(df=df_gru_predictions, score_col="prediction", **id_columns)

In [None]:
# Compute the ranx metrics and store them as a single 'GRU' column.
ranx_metric_names = ["mrr", "ndcg@10", "recall@10", "precision@10", "hit_rate@10"]
ranx_scores = evaluate(qrels, run, ranx_metric_names)
ranking_metrics_gru = pd.DataFrame({'GRU': ranx_scores})

  scores[i] = _reciprocal_rank(qrels[i], run[i], k, rel_lvl)


In [None]:
# Display the ranx metric table for the default-parameter GRU run.
ranking_metrics_gru

Unnamed: 0,GRU
mrr,0.348124
ndcg@10,0.149074
recall@10,0.12769
precision@10,0.1325
hit_rate@10,0.6692


## Tuned Hyperparameters Model

In [8]:
# GRU4Rec with explicitly chosen hyperparameters. n_epochs is hard-coded to 30
# here rather than taken from NUM_EPOCHS.
# NOTE(review): the values presumably come from a hyperparameter search that
# is not shown in this notebook — confirm their origin.
# This rebinds `gru`, replacing the default-parameter model defined earlier.
gru = cornac.models.GRU4Rec(
      layers=[150],
      loss='cross-entropy',
      learning_rate=0.0092,
      dropout_p_embed=0.0257,
      dropout_p_hidden=0.4707,
      batch_size=128,
      n_epochs=30,
      device="cuda",
      verbose=True,
      seed=SEED,
  )

In [9]:
# Train the tuned model on the training split and report the time measured by Timer.
with Timer() as t:
    gru.fit(next_item_eval.train_set)
print("Took {} seconds for training.".format(t))

  0%|          | 0/30 [00:00<?, ?it/s]

Took 399.6861 seconds for training.


In [10]:
# NOTE(review): presumably lets the model prepare/cache test-set state before
# ranking — confirm against cornac's Recommender.transform documentation.
gru.transform(next_item_eval.test_set)

In [13]:
# Rank every candidate item for the last position of each test session (tuned model).
gru_ranking, gru_scores = ranking_eval(
    model=gru,
    train_set=next_item_eval.train_set,
    test_set=next_item_eval.test_set,
    exclude_unknowns=True,
    mode="last",
    verbose=False,
)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [14]:
# Repeat every session id once per item so the ids line up with the flattened rankings.
# NOTE(review): this assumes the rankings follow the iteration order of
# gru.train_set.sid_map and that no session was skipped — verify.
item = list(gru.train_set.iid_map.keys())
users = []
for uid in gru.train_set.sid_map:
    users += [uid] * len(item)

In [15]:
import itertools

# Concatenate the per-session rankings and scores into two flat lists.
flattened_ranking = list(itertools.chain.from_iterable(gru_ranking))
flattened_score = list(itertools.chain.from_iterable(gru_scores))

In [16]:
# The three flat columns must have the same length before building the frame.
tuple(len(column) for column in (users, flattened_ranking, flattened_score))

(10000000, 10000000, 10000000)

In [17]:
# Long-format prediction table: one row per (session id, item, score).
prediction_columns = {
    'user_id': users,
    'book_id': flattened_ranking,
    'prediction': flattened_score,
}
df_gru_predictions = pd.DataFrame(prediction_columns)

In [18]:
# Persist the tuned-model predictions as a pickle under PREDICTION_PATH.
PREDICTION_PATH = '/content/drive/MyDrive/Final Project/Codes/animelist-goodbooks-recommendation/goodbooks-10k/predictions'
# The context manager closes (and flushes) the file handle, which the bare
# open(...) passed straight to pickle.dump never did.
with open(f'{PREDICTION_PATH}/predictions_gru_opt.pkl', 'wb') as prediction_file:
    pickle.dump(df_gru_predictions, prediction_file)

In [21]:
# Anti-join: keep only predicted (user_id, book_id) pairs that do not occur in
# the training data. The left merge also carries over train_data's other
# columns for the surviving rows.
filtered_df_gru = (
    df_gru_predictions
    .merge(train_data, on=['user_id', 'book_id'], how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns=['_merge'])
)

In [22]:
# Cast the id columns of both frames to str and the prediction column to float
# before handing the frames to ranx.
for frame in (test_data, filtered_df_gru):
    for id_column in ('user_id', 'book_id'):
        frame[id_column] = frame[id_column].astype(str)

filtered_df_gru['prediction'] = filtered_df_gru['prediction'].astype(float)

In [23]:
# Ground truth (qrels) comes from the test ratings; the run comes from the
# filtered predictions. Both use the same query/document id columns.
id_columns = dict(q_id_col="user_id", doc_id_col="book_id")

qrels = Qrels.from_df(df=test_data, score_col="rating", **id_columns)
run = Run.from_df(df=filtered_df_gru, score_col="prediction", **id_columns)

In [24]:
# Compute the ranx metrics and store them as a single 'GRU' column.
ranx_metric_names = ["mrr", "ndcg@10", "recall@10", "precision@10", "hit_rate@10"]
ranx_scores = evaluate(qrels, run, ranx_metric_names)
ranking_metrics_gru = pd.DataFrame({'GRU': ranx_scores})

  scores[i] = _reciprocal_rank(qrels[i], run[i], k, rel_lvl)


In [25]:
# Display the ranx metric table for the tuned GRU run.
ranking_metrics_gru

Unnamed: 0,GRU
mrr,0.549224
ndcg@10,0.253776
recall@10,0.208981
precision@10,0.21226
hit_rate@10,0.9218


In [None]:
# Save the trained model; the recorded output shows a timestamped .pkl written
# under a GRU4Rec/ subdirectory of this path.
# NOTE(review): the data and predictions live under goodbooks-10k/, but this
# path points at animelist/model — confirm the target directory is intended.
gru.save("/content/drive/MyDrive/Final Project/Codes/animelist-goodbooks-recommendation/animelist/model")

GRU4Rec model is saved to /content/drive/MyDrive/Final Project/Codes/animelist-goodbooks-recommendation/animelist/model/GRU4Rec/2025-04-18_15-28-53-352427.pkl


'/content/drive/MyDrive/Final Project/Codes/animelist-goodbooks-recommendation/animelist/model/GRU4Rec/2025-04-18_15-28-53-352427.pkl'