# Global Settings and Imports

In [None]:
# Colab setup: mount Google Drive and switch into the project folder stored
# there, so the relative paths used later in the notebook resolve.
import os
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
PROJECT_DIR = '/content/gdrive/MyDrive/Colab Notebooks/LightGCN'

drive.mount(DRIVE_MOUNT_POINT, force_remount=True)
os.chdir(PROJECT_DIR)

Mounted at /content/gdrive


In [None]:
!pip install scrapbook
!pip install retrying
!pip install pandera



In [None]:
import sys
import scrapbook as sb
import pandas as pd
import numpy as np
import tensorflow as tf
import random
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

# Record the interpreter and key library versions alongside the results.
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"Tensorflow version: {tf.__version__}")

System version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
Pandas version: 1.5.3
Tensorflow version: 2.14.0


In [None]:
# --- Evaluation ---
TOP_K = 10  # number of items to recommend per user

# --- Training ---
EPOCHS, BATCH_SIZE = 30, 512

# Seed passed to the data model and to LightGCN.
# Set None for non-deterministic results.
SEED = DEFAULT_SEED

# LightGCN configuration file (relative to the working directory)
yaml_file = "recommenders/models/deeprec/config/lightgcn.yaml"

# Currently unused settings, kept for reference:
# # Select MovieLens data size: 100k, 1m, 10m, or 20m
# MOVIELENS_DATA_SIZE = '100k'
# DATA_SIZE = 1584082
# user_file = "tests/resources/deeprec/lightgcn/user_embeddings.csv"
# item_file = "tests/resources/deeprec/lightgcn/item_embeddings.csv"

# Load and Split Data

In [None]:
# Build an implicit-feedback interaction table from the amazon-book training file.
# Each line is parsed as "<user id> <item id> <item id> ..." (whitespace separated).
path = "recommenders/datasets/amazon-book/"
train_file = path + 'train.txt'  # `path` already ends with a slash

MAX_USERS = 2000  # only the first 2000 lines (one user per line) are used

# Dedicated generator so the synthetic timestamps follow SEED instead of the
# unseeded global `random` state (SEED=None still gives non-deterministic values).
timestamp_rng = random.Random(SEED)

interactions = []
with open(train_file) as f:
    # Iterate lazily rather than loading the whole file just to keep a prefix.
    for line_no, line in enumerate(f):
        if line_no >= MAX_USERS:
            break
        # split() without a separator tolerates blank lines and trailing
        # whitespace, which would otherwise reach int('') and raise ValueError.
        fields = line.split()
        if not fields:
            continue
        uid = int(fields[0])
        for item_id in map(int, fields[1:]):
            # The file has no ratings or timestamps: every interaction gets a
            # constant rating of 5.0 and a random placeholder timestamp.
            timestamp = timestamp_rng.randint(100000000, 999999999)
            interactions.append([uid, item_id, 5.0, timestamp])

df = pd.DataFrame(interactions, columns=['userID', 'itemID', 'rating', 'timestamp'])
train, test = python_stratified_split(df, ratio=0.75)
data = ImplicitCF(train=train, test=test, seed=SEED)

# Prepare Hyper-parameters

In [None]:
# Keyword settings handed to prepare_hparams together with the yaml file.
hparam_overrides = dict(
    n_layers=3,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=0.005,
    eval_epoch=5,
    top_k=TOP_K,
)
hparams = prepare_hparams(yaml_file, **hparam_overrides)

# Baseline Model

In [None]:
# Baseline: LightGCN trained with fit()'s default arguments; wall-clock time is reported.
model_amazon_base = LightGCN(hparams, data, seed=SEED)

with Timer() as train_time:
    model_amazon_base.fit()

print(f"Took {train_time.interval} seconds for training.")

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)3.5s: train loss = 0.33156 = (mf)0.33109 + (embed)0.00047
Epoch 2 (train)2.7s: train loss = 0.12545 = (mf)0.12435 + (embed)0.00110
Epoch 3 (train)2.7s: train loss = 0.08429 = (mf)0.08281 + (embed)0.00148
Epoch 4 (train)2.9s: train loss = 0.06548 = (mf)0.06371 + (embed)0.00177
Epoch 5 (train)3.4s + (eval)1.7s: train loss = 0.05274 = (mf)0.05071 + (embed)0.00202, recall = 0.02336, ndcg = 0.04029, precision = 0.03510, map = 0.00854
Epoch 6 (train)3.1s: train loss = 0.04535 = (mf)0.04310 + (embed)0.00225
Epoch 7 (train)2.8s: train loss = 0.03842 = (mf)0.03597 + (embed)0.00245
Epoch 8 (train)2.8s: train loss = 0.03291 = (mf)0.03027 + (embed)0.00264
Epoch 9 (train)3.2s: train loss = 0.02973 = (mf)0.02692 + (embed)0.00281
Epoch 10 (train)3.5s + (eval)1.4s: train loss = 0.02652 = (mf)0.02356 + (embed)0.00296, recall = 0.02638, ndcg = 0.04851, precision = 0.04095, map = 0.01087
Epoch

In [None]:
# Top-K recommendations for the test users (remove_seen=True), scored against the held-out split.
topk_scores = model_amazon_base.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

eval_map, eval_ndcg, eval_precision, eval_recall = (
    metric(test, topk_scores, k=TOP_K)
    for metric in (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
)

# One metric per line.
print("\n".join([
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]))

MAP:	0.012472
NDCG:	0.051557
Precision@K:	0.043350
Recall@K:	0.029194


# Hard Negative Sampling (Batch)

## Round 1

In [None]:
# Fresh LightGCN instance for the hard-negative-sampling run; built from the same
# hyper-parameters, data model and seed as the baseline model.
model_amazon_hard = LightGCN(hparams, data, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [None]:
# Round 1: train with neg_mode="hard".
# neg_size=10 is presumably the number of candidate negatives per sample — confirm in LightGCN.fit.
with Timer() as train_time:
    model_amazon_hard.fit(neg_mode="hard", neg_size=10)

print(f"Took {train_time.interval} seconds for training.")

Epoch 1 (train)26.9s: train loss = 0.34689 = (mf)0.34643 + (embed)0.00046
Epoch 2 (train)25.7s: train loss = 0.13928 = (mf)0.13817 + (embed)0.00111
Epoch 3 (train)26.1s: train loss = 0.09501 = (mf)0.09350 + (embed)0.00151
Epoch 4 (train)26.2s: train loss = 0.07566 = (mf)0.07383 + (embed)0.00183
Epoch 5 (train)26.0s + (eval)1.8s: train loss = 0.06240 = (mf)0.06030 + (embed)0.00210, recall = 0.02529, ndcg = 0.04254, precision = 0.03645, map = 0.00958
Epoch 6 (train)25.6s: train loss = 0.05241 = (mf)0.05006 + (embed)0.00235
Epoch 7 (train)26.2s: train loss = 0.04657 = (mf)0.04399 + (embed)0.00258
Epoch 8 (train)26.2s: train loss = 0.04000 = (mf)0.03721 + (embed)0.00279
Epoch 9 (train)26.0s: train loss = 0.03447 = (mf)0.03149 + (embed)0.00299
Epoch 10 (train)25.8s + (eval)1.7s: train loss = 0.03302 = (mf)0.02983 + (embed)0.00318, recall = 0.02719, ndcg = 0.04880, precision = 0.04120, map = 0.01137
Epoch 11 (train)26.2s: train loss = 0.02857 = (mf)0.02521 + (embed)0.00336
Epoch 12 (train)26

In [None]:
# Top-K recommendations for the test users from the hard-negative model,
# followed by a preview of the first rows.
topk_scores = model_amazon_hard.recommend_k_items(
    test,
    top_k=TOP_K,
    remove_seen=True,
)

topk_scores.head()

Unnamed: 0,userID,itemID,prediction
0,0,215,11.506405
1,0,537,11.020104
2,0,670,10.804235
3,0,436,10.581541
4,0,90,9.831392


In [None]:
# Score the current `topk_scores` against the held-out test split.
eval_map, eval_ndcg, eval_precision, eval_recall = (
    metric(test, topk_scores, k=TOP_K)
    for metric in (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
)

metric_lines = (
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
)
print(*metric_lines, sep='\n')

MAP:	0.012742
NDCG:	0.052831
Precision@K:	0.044050
Recall@K:	0.029993


## Round 2

In [None]:
# Round 2: rebuild the model from scratch (same hparams, data and seed) and
# repeat the hard-negative training run.
model_amazon_hard = LightGCN(hparams, data, seed=SEED)

with Timer() as train_time:
    model_amazon_hard.fit(neg_mode="hard", neg_size=10)

print(f"Took {train_time.interval} seconds for training.")

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)26.8s: train loss = 0.34205 = (mf)0.34157 + (embed)0.00048
Epoch 2 (train)25.9s: train loss = 0.13996 = (mf)0.13884 + (embed)0.00111
Epoch 3 (train)26.3s: train loss = 0.09873 = (mf)0.09723 + (embed)0.00150
Epoch 4 (train)26.3s: train loss = 0.07610 = (mf)0.07428 + (embed)0.00182
Epoch 5 (train)26.3s + (eval)1.5s: train loss = 0.06353 = (mf)0.06144 + (embed)0.00209, recall = 0.02344, ndcg = 0.04146, precision = 0.03560, map = 0.00906
Epoch 6 (train)25.9s: train loss = 0.05528 = (mf)0.05295 + (embed)0.00233
Epoch 7 (train)26.1s: train loss = 0.04518 = (mf)0.04262 + (embed)0.00256
Epoch 8 (train)26.4s: train loss = 0.04015 = (mf)0.03738 + (embed)0.00278
Epoch 9 (train)26.4s: train loss = 0.03504 = (mf)0.03207 + (embed)0.00297
Epoch 10 (train)25.8s + (eval)1.9s: train loss = 0.03186 = (mf)0.02871 + (embed)0.00315, recall = 0.02663, ndcg = 0.04674, precision = 0.03940, map = 0.0

In [None]:
# Round 2 evaluation: top-K recommendations for the test users, then ranking metrics.
topk_scores = model_amazon_hard.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

eval_map, eval_ndcg, eval_precision, eval_recall = (
    metric(test, topk_scores, k=TOP_K)
    for metric in (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
)

print("\n".join([
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]))

MAP:	0.012041
NDCG:	0.050422
Precision@K:	0.042250
Recall@K:	0.029542


## Round 3

In [None]:
# Round 3: another from-scratch repetition of the hard-negative training run
# with the same hparams, data and seed.
model_amazon_hard = LightGCN(hparams, data, seed=SEED)

with Timer() as train_time:
    model_amazon_hard.fit(neg_mode="hard", neg_size=10)

print(f"Took {train_time.interval} seconds for training.")

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)27.0s: train loss = 0.34690 = (mf)0.34643 + (embed)0.00047
Epoch 2 (train)25.8s: train loss = 0.14027 = (mf)0.13916 + (embed)0.00111
Epoch 3 (train)26.3s: train loss = 0.09668 = (mf)0.09517 + (embed)0.00151
Epoch 4 (train)26.3s: train loss = 0.07654 = (mf)0.07471 + (embed)0.00184
Epoch 5 (train)26.3s + (eval)1.5s: train loss = 0.06355 = (mf)0.06144 + (embed)0.00211, recall = 0.02582, ndcg = 0.04392, precision = 0.03780, map = 0.00989
Epoch 6 (train)26.0s: train loss = 0.05320 = (mf)0.05084 + (embed)0.00236
Epoch 7 (train)26.2s: train loss = 0.04653 = (mf)0.04394 + (embed)0.00259
Epoch 8 (train)26.3s: train loss = 0.04018 = (mf)0.03737 + (embed)0.00281
Epoch 9 (train)26.5s: train loss = 0.03434 = (mf)0.03134 + (embed)0.00301
Epoch 10 (train)25.9s + (eval)1.9s: train loss = 0.03201 = (mf)0.02880 + (embed)0.00320, recall = 0.02787, ndcg = 0.05015, precision = 0.04220, map = 0.0

In [None]:
# Round 3 evaluation: top-K recommendations for the test users, then ranking metrics.
topk_scores = model_amazon_hard.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

eval_map, eval_ndcg, eval_precision, eval_recall = (
    metric(test, topk_scores, k=TOP_K)
    for metric in (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
)

metric_lines = (
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
)
print(*metric_lines, sep='\n')

MAP:	0.012743
NDCG:	0.052844
Precision@K:	0.044100
Recall@K:	0.029352


# Backup

In [None]:
# Backup run: hard-negative training on whatever `data` currently holds.
# NOTE(review): `data` in this notebook is built from the amazon-book file, yet
# the variable name and the recorded output suggest a MovieLens run — confirm
# which dataset this cell is meant to use.
model_movie_hard = LightGCN(hparams, data, seed=SEED)

with Timer() as train_time:
    model_movie_hard.fit(neg_mode="hard", neg_size=10)

print(f"Took {train_time.interval} seconds for training.")

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)3.5s: train loss = 0.41073 = (mf)0.41033 + (embed)0.00040
Epoch 2 (train)1.9s: train loss = 0.27035 = (mf)0.26952 + (embed)0.00083
Epoch 3 (train)1.8s: train loss = 0.25703 = (mf)0.25596 + (embed)0.00107
Epoch 4 (train)1.8s: train loss = 0.23813 = (mf)0.23687 + (embed)0.00126
Epoch 5 (train)1.8s + (eval)0.2s: train loss = 0.22023 = (mf)0.21875 + (embed)0.00148, recall = 0.16635, ndcg = 0.36824, precision = 0.32428, map = 0.09793
Epoch 6 (train)1.8s: train loss = 0.20807 = (mf)0.20636 + (embed)0.00171
Epoch 7 (train)2.3s: train loss = 0.19484 = (mf)0.19289 + (embed)0.00195
Epoch 8 (train)2.6s: train loss = 0.19078 = (mf)0.18862 + (embed)0.00216
Epoch 9 (train)2.3s: train loss = 0.18683 = (mf)0.18451 + (embed)0.00232
Epoch 10 (train)1.9s + (eval)0.2s: train loss = 0.18281 = (mf)0.18032 + (embed)0.00249, recall = 0.18394, ndcg = 0.39113, precision = 0.34008, map = 0.11155
Epoch

In [None]:
# Top-K recommendations from the backup model, scored against `test`.
topk_scores = model_movie_hard.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

eval_map, eval_ndcg, eval_precision, eval_recall = (
    metric(test, topk_scores, k=TOP_K)
    for metric in (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
)

print("\n".join([
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]))

MAP:	0.135777
NDCG:	0.447739
Precision@K:	0.392895
Recall@K:	0.214650


In [None]:
# Backup run (repeat): rebuild the model and train again with hard negatives.
# NOTE(review): `data` here comes from the amazon-book file although the model
# is named `model_movie_hard` — confirm the intended dataset.
model_movie_hard = LightGCN(hparams, data, seed=SEED)

with Timer() as train_time:
    model_movie_hard.fit(neg_mode="hard", neg_size=10)

print(f"Took {train_time.interval} seconds for training.")

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)2.6s: train loss = 0.40600 = (mf)0.40559 + (embed)0.00041
Epoch 2 (train)1.8s: train loss = 0.27268 = (mf)0.27183 + (embed)0.00085
Epoch 3 (train)1.8s: train loss = 0.25265 = (mf)0.25156 + (embed)0.00109
Epoch 4 (train)1.8s: train loss = 0.23157 = (mf)0.23028 + (embed)0.00129
Epoch 5 (train)1.8s + (eval)0.2s: train loss = 0.21923 = (mf)0.21772 + (embed)0.00151, recall = 0.17226, ndcg = 0.38236, precision = 0.33415, map = 0.10513
Epoch 6 (train)2.3s: train loss = 0.20382 = (mf)0.20208 + (embed)0.00174
Epoch 7 (train)2.6s: train loss = 0.19614 = (mf)0.19419 + (embed)0.00195
Epoch 8 (train)2.2s: train loss = 0.19269 = (mf)0.19056 + (embed)0.00212
Epoch 9 (train)1.8s: train loss = 0.18461 = (mf)0.18232 + (embed)0.00228
Epoch 10 (train)1.8s + (eval)0.2s: train loss = 0.18133 = (mf)0.17889 + (embed)0.00243, recall = 0.18188, ndcg = 0.39019, precision = 0.34115, map = 0.11074
Epoch

In [None]:
# Top-K recommendations from the backup model, scored against `test`.
topk_scores = model_movie_hard.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

eval_map, eval_ndcg, eval_precision, eval_recall = (
    metric(test, topk_scores, k=TOP_K)
    for metric in (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
)

metric_lines = (
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
)
print(*metric_lines, sep='\n')

MAP:	0.138731
NDCG:	0.452987
Precision@K:	0.394910
Recall@K:	0.216197


In [None]:
# Backup run (repeat): rebuild the model and train again with hard negatives.
# NOTE(review): `data` here comes from the amazon-book file although the model
# is named `model_movie_hard` — confirm the intended dataset.
model_movie_hard = LightGCN(hparams, data, seed=SEED)

with Timer() as train_time:
    model_movie_hard.fit(neg_mode="hard", neg_size=10)

print(f"Took {train_time.interval} seconds for training.")

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)3.2s: train loss = 0.41439 = (mf)0.41400 + (embed)0.00039
Epoch 2 (train)1.7s: train loss = 0.27780 = (mf)0.27701 + (embed)0.00079
Epoch 3 (train)1.7s: train loss = 0.25488 = (mf)0.25383 + (embed)0.00104
Epoch 4 (train)1.7s: train loss = 0.24570 = (mf)0.24447 + (embed)0.00123
Epoch 5 (train)1.7s + (eval)0.2s: train loss = 0.22777 = (mf)0.22634 + (embed)0.00143, recall = 0.16571, ndcg = 0.36184, precision = 0.31930, map = 0.09713
Epoch 6 (train)1.7s: train loss = 0.20892 = (mf)0.20723 + (embed)0.00169
Epoch 7 (train)2.3s: train loss = 0.19873 = (mf)0.19681 + (embed)0.00193
Epoch 8 (train)2.7s: train loss = 0.19270 = (mf)0.19055 + (embed)0.00215
Epoch 9 (train)2.3s: train loss = 0.18996 = (mf)0.18763 + (embed)0.00233
Epoch 10 (train)1.8s + (eval)0.2s: train loss = 0.18035 = (mf)0.17786 + (embed)0.00249, recall = 0.18722, ndcg = 0.39592, precision = 0.34889, map = 0.11192
Epoch

In [None]:
# Top-K recommendations from the backup model, scored against `test`.
topk_scores = model_movie_hard.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

eval_map, eval_ndcg, eval_precision, eval_recall = (
    metric(test, topk_scores, k=TOP_K)
    for metric in (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
)

print("\n".join([
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]))

MAP:	0.137631
NDCG:	0.449586
Precision@K:	0.393213
Recall@K:	0.213085


In [None]:
# Backup run (repeat): rebuild the model and train again with hard negatives.
# NOTE(review): `data` here comes from the amazon-book file although the model
# is named `model_movie_hard` — confirm the intended dataset.
model_movie_hard = LightGCN(hparams, data, seed=SEED)

with Timer() as train_time:
    model_movie_hard.fit(neg_mode="hard", neg_size=10)

print(f"Took {train_time.interval} seconds for training.")

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)2.3s: train loss = 0.42283 = (mf)0.42246 + (embed)0.00037
Epoch 2 (train)1.7s: train loss = 0.28597 = (mf)0.28522 + (embed)0.00075
Epoch 3 (train)1.7s: train loss = 0.25731 = (mf)0.25628 + (embed)0.00103
Epoch 4 (train)1.7s: train loss = 0.24234 = (mf)0.24111 + (embed)0.00123
Epoch 5 (train)1.7s + (eval)0.3s: train loss = 0.22740 = (mf)0.22597 + (embed)0.00143, recall = 0.16689, ndcg = 0.36625, precision = 0.32131, map = 0.09719
Epoch 6 (train)2.3s: train loss = 0.21128 = (mf)0.20960 + (embed)0.00168
Epoch 7 (train)2.4s: train loss = 0.19689 = (mf)0.19495 + (embed)0.00194
Epoch 8 (train)2.0s: train loss = 0.18827 = (mf)0.18611 + (embed)0.00216
Epoch 9 (train)1.7s: train loss = 0.19020 = (mf)0.18787 + (embed)0.00233
Epoch 10 (train)1.7s + (eval)0.2s: train loss = 0.18092 = (mf)0.17842 + (embed)0.00250, recall = 0.18183, ndcg = 0.39208, precision = 0.34390, map = 0.11046
Epoch

In [None]:
# Top-K recommendations from the backup model, scored against `test`.
topk_scores = model_movie_hard.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

eval_map, eval_ndcg, eval_precision, eval_recall = (
    metric(test, topk_scores, k=TOP_K)
    for metric in (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
)

metric_lines = (
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
)
print(*metric_lines, sep='\n')

MAP:	0.137427
NDCG:	0.452683
Precision@K:	0.392577
Recall@K:	0.212811
