In [None]:
!pip install pytorch-lightning
!pip install torch-geometric


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, LGConv
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import CSVLogger
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, Dataset
import random, math
from torch.utils.data import random_split
from pytorch_lightning import Trainer

from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
# Pin every random number generator this notebook touches to one seed so that
# repeated runs draw the same numbers.
seed = 42

for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed)
del _seed_rng

In [None]:
class RecommenderDataModule(pl.LightningDataModule):
    """Load a tab-separated interaction file and serve train/val (user, item) pairs.

    Users are indexed ``0 .. num_users - 1``; item indices are shifted by
    ``num_users`` so users and items share one node-id space. ``edge_index`` is
    built over that space from the training interactions only, in both
    directions.

    Args:
        interaction_file: Path to a TSV file containing the columns
            ``user_id:token`` and ``item_id:token``.
        batch_size: Number of (user, item) pairs per batch.
            NOTE(review): 1204 looks like a transposition of 1024 - confirm.
        val_size: Fraction of interactions held out for validation.
        seed: Seed of the private generator used for the train/val split, so the
            split does not depend on how much of the global RNG was consumed
            before ``prepare_data`` ran.

    Attributes set by ``prepare_data``:
        num_users, num_items, user_to_idx, item_to_idx, train_dataset,
        val_dataset, edge_index, train_user_pos_items, val_user_pos_items.
    """

    def __init__(self, interaction_file, batch_size=1204, val_size=0.3, seed=42):
        super().__init__()
        self.interaction_file = interaction_file
        self.batch_size = batch_size
        self.val_size = val_size
        self.seed = seed
        self._prepared = False

    def prepare_data(self):
        """Read the interaction file, index users/items and build the split.

        The method is idempotent: a second call (for example one made by the
        trainer after the notebook already called it by hand) returns
        immediately instead of re-reading the file and re-splitting.
        """
        if self._prepared:
            return

        # Load interactions
        df = pd.read_csv(self.interaction_file, sep="\t")
        user_col = df['user_id:token']
        item_col = df['item_id:token']

        # Unique users and items, in order of first appearance
        unique_users = user_col.unique()
        unique_items = item_col.unique()

        self.num_users = len(unique_users)
        self.num_items = len(unique_items)

        # Raw id -> contiguous index
        self.user_to_idx = {u: idx for idx, u in enumerate(unique_users)}
        self.item_to_idx = {i: idx for idx, i in enumerate(unique_items)}

        # Positive interactions as (user node id, item node id); the mapping is
        # applied column-wise instead of row by row with iterrows().
        user_nodes = user_col.map(self.user_to_idx).tolist()
        item_nodes = (item_col.map(self.item_to_idx) + self.num_users).tolist()
        interactions = list(zip(user_nodes, item_nodes))

        # Split with a dedicated, seeded generator so the same file always
        # yields the same train/val partition.
        train_size = int(len(interactions) * (1 - self.val_size))
        val_size = len(interactions) - train_size
        split_generator = torch.Generator().manual_seed(self.seed)
        self.train_dataset, self.val_dataset = random_split(
            interactions, [train_size, val_size], generator=split_generator
        )

        # One pass over the training pairs: edge endpoints + per-user positives
        train_user_ids, train_item_ids = [], []
        train_user_pos_items = defaultdict(set)
        for u, i in self.train_dataset:
            train_user_ids.append(u)
            train_item_ids.append(i)
            train_user_pos_items[u].add(i)

        # Each interaction contributes a user->item and an item->user edge
        self.edge_index = torch.tensor(
            [train_user_ids + train_item_ids, train_item_ids + train_user_ids],
            dtype=torch.long,
        )

        val_user_pos_items = defaultdict(set)
        for u, i in self.val_dataset:
            val_user_pos_items[u].add(i)

        self.train_user_pos_items = train_user_pos_items
        self.val_user_pos_items = val_user_pos_items
        self._prepared = True

    def train_dataloader(self):
        """Return a shuffled loader over the training interactions."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return an unshuffled loader over the held-out interactions."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False)

In [None]:
class BaseRecommender(pl.LightningModule):
    """Shared training and validation logic for the recommenders in this notebook.

    Subclasses implement ``forward()`` and return ``(user_embeddings,
    item_embeddings)`` covering every user and every item. The affinity of a
    user/item pair is ``exp(-L1 distance)`` between their embeddings.

    Args:
        num_users: Number of users; item node ids are offset by this value.
        num_items: Number of items.
        embedding_dim: Embedding width (stored; used by subclasses).
        learning_rate: Adam learning rate.
        dropout: Dropout probability (stored; used by subclasses).
    """

    # Number of hardest negatives mined per training pair in ``compute_loss``.
    num_hard_negatives = 10
    # Cut-offs K at which validation metrics are logged as ``val_<metric>@KK``.
    k_values = (5, 10, 15, 20)

    def __init__(self, num_users, num_items, embedding_dim=64, learning_rate=0.005, dropout=0.2):
        super().__init__()

        # Chosen once from CUDA availability; ``setup`` and subclasses read it.
        self.model_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.learning_rate = learning_rate
        self.dropout = dropout

        # Filled in by ``setup`` from the trainer's datamodule.
        self.edge_index = None
        self.save_hyperparameters()

    def setup(self, stage=None):
        """Copy the graph and the per-user positive-item maps from the datamodule."""
        datamodule = self.trainer.datamodule
        self.edge_index = datamodule.edge_index.to(self.model_device)
        self.train_user_pos_items = datamodule.train_user_pos_items
        self.val_user_pos_items = datamodule.val_user_pos_items

    @staticmethod
    def hit_at_k(pred_items, true_items, k):
        """Fraction of rows whose top-``k`` predictions contain a true item.

        Returns 0.0 when ``true_items`` is empty.
        """
        if not true_items:
            return 0.0
        hits = sum(
            1 for pred, true in zip(pred_items, true_items) if set(pred[:k]) & set(true)
        )
        return hits / len(true_items)

    @staticmethod
    def ndcg_at_k(pred_items, true_items, k):
        """Mean binary-relevance NDCG@``k`` over the rows.

        A row without true items contributes 0; an empty ``true_items`` gives 0.0.
        """
        if not true_items:
            return 0.0
        total = 0.0
        for pred, true in zip(pred_items, true_items):
            relevant = set(true)  # set membership instead of scanning a list
            dcg = sum(
                1.0 / math.log2(rank + 2)
                for rank, item in enumerate(pred[:k])
                if item in relevant
            )
            idcg = sum(1.0 / math.log2(rank + 2) for rank in range(min(len(true), k)))
            if idcg > 0:
                total += dcg / idcg
        return total / len(true_items)

    @staticmethod
    def recall_at_k(pred_items, true_items, k):
        """Mean share of each row's true items found in its top-``k`` predictions.

        A row without true items contributes 0; an empty ``true_items`` gives 0.0.
        """
        if not true_items:
            return 0.0
        total = 0.0
        for pred, true in zip(pred_items, true_items):
            if len(true) > 0:
                total += len(set(pred[:k]) & set(true)) / len(true)
        return total / len(true_items)

    @staticmethod
    def precision_at_k(pred_items, true_items, k):
        """Mean share of each row's top-``k`` predictions that are true items.

        Returns 0.0 when ``true_items`` is empty.
        """
        if not true_items:
            return 0.0
        total = 0.0
        for pred, true in zip(pred_items, true_items):
            total += len(set(pred[:k]) & set(true)) / k
        return total / len(true_items)

    def _mask_train_items(self, scores, user_ids):
        """Set ``scores[row, item]`` to -inf for every training positive of each row's user.

        ``scores`` is modified in place and returned. ``.get`` is used for the
        lookup so that users without training positives do not add empty
        entries to the (defaultdict) map.
        """
        rows, cols = [], []
        for row, user in enumerate(user_ids.tolist()):
            for item in self.train_user_pos_items.get(user, ()):
                rows.append(row)
                cols.append(item - self.num_users)  # node id -> item index
        if rows:
            row_idx = torch.tensor(rows, dtype=torch.long, device=scores.device)
            col_idx = torch.tensor(cols, dtype=torch.long, device=scores.device)
            scores[row_idx, col_idx] = float('-inf')
        return scores

    def compute_loss(self, batch, full_user_embs, full_item_embs):
        """Binary cross-entropy over positive pairs and mined hard negatives.

        Args:
            batch: ``(user_ids, item_ids)`` where item ids are node ids
                (item index + ``num_users``).
            full_user_embs: Embeddings of all users.
            full_item_embs: Embeddings of all items.

        Returns:
            Scalar loss tensor.
        """
        user_ids, item_ids = batch
        pos_item_ids = item_ids - self.hparams.num_users

        # Get embeddings
        user_emb = full_user_embs[user_ids]
        pos_emb = full_item_embs[pos_item_ids]

        # Positive scores: exp(-L1 distance)
        pos_scores = torch.exp(-torch.abs(user_emb - pos_emb).sum(dim=1))

        ####################### Hard negative sampling #######################
        # Only the *indices* of the hardest negatives are needed, so the full
        # user x item distance matrix is built without autograd bookkeeping.
        # Ranking uses -distance directly: it orders items exactly like
        # exp(-distance) but cannot underflow to a wall of tied zeros.
        with torch.no_grad():
            ranking = -torch.cdist(user_emb, full_item_embs, p=1)
            # The model may only use training information: hide each user's
            # training positives; held-out pairs stay eligible as negatives.
            ranking = self._mask_train_items(ranking, user_ids)
            k = min(self.num_hard_negatives, ranking.size(1))
            neg_item_ids = torch.topk(ranking, k=k, dim=1).indices

        # Re-score the selected negatives with gradients enabled
        neg_emb = full_item_embs[neg_item_ids]
        neg_scores = torch.exp(-torch.abs(user_emb.unsqueeze(1) - neg_emb).sum(dim=2))
        neg_scores = neg_scores.mean(dim=1)

        ####################### Compute loss #######################
        scores = torch.cat([pos_scores, neg_scores], dim=0)
        labels = torch.cat([torch.ones_like(pos_scores), torch.zeros_like(neg_scores)], dim=0)
        return F.binary_cross_entropy(scores, labels)

    def training_step(self, batch, batch_idx):
        """Run a forward pass over the graph and log ``train_loss``."""
        full_user_embs, full_item_embs = self()
        loss = self.compute_loss(batch, full_user_embs, full_item_embs)

        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log hit/recall/precision/NDCG at every cut-off in ``k_values``.

        Metrics are computed per batch row; a user that occurs in several rows
        of the batch is therefore counted once per row.
        """
        user_ids, _ = batch

        full_user_embs, full_item_embs = self()
        user_emb = full_user_embs[user_ids]

        # Score of each batch user against ALL items. -distance ranks exactly
        # like exp(-distance) but does not underflow for large distances.
        scores = -torch.cdist(user_emb, full_item_embs, p=1)

        # Items already seen in training must not be recommended again.
        scores = self._mask_train_items(scores, user_ids)

        # Ground truth is independent of K: build it once. Each user may have
        # several held-out items.
        true_items = [
            [item - self.num_users for item in self.val_user_pos_items.get(user, ())]
            for user in user_ids.tolist()
        ]

        # topk returns results sorted by score, so one call at the largest K
        # serves every smaller K through the metrics' own ``[:k]`` slicing.
        max_k = min(max(self.k_values), scores.size(1))
        ranked_items = torch.topk(scores, k=max_k, dim=1).indices.tolist()

        for k in self.k_values:
            hit = self.hit_at_k(ranked_items, true_items, k)
            ndcg = self.ndcg_at_k(ranked_items, true_items, k)
            recall = self.recall_at_k(ranked_items, true_items, k)
            precision = self.precision_at_k(ranked_items, true_items, k)

            # Log metrics dynamically
            self.log(f"val_hit@{k:02d}", hit, prog_bar=True)
            self.log(f"val_recall@{k:02d}", recall, prog_bar=True)
            self.log(f"val_precision@{k:02d}", precision, prog_bar=True)
            self.log(f"val_ndcg@{k:02d}", ndcg, prog_bar=True)

    def configure_optimizers(self):
        """Adam over all parameters with a fixed weight decay of 1e-5."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=1e-5)
        return optimizer

In [None]:
class GCNRecommender(BaseRecommender):
    """Recommender that refines user/item embeddings with two GCNConv layers.

    Args:
        num_users: Number of users.
        num_items: Number of items.
        embedding_dim: Width of the learned input embeddings.
        hidden_dim: Output width of both graph convolutions.
        learning_rate: Passed to the base class.
        dropout: Dropout probability applied between the two convolutions.
    """

    def __init__(self, num_users, num_items, embedding_dim=64, hidden_dim=64, learning_rate=0.005, dropout=0.2):

        super().__init__(num_users, num_items, embedding_dim, learning_rate, dropout)

        # No explicit .to(device) here: the embeddings are registered submodules
        # and follow the module wherever it is moved, together with the convs.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        self.conv1 = GCNConv(embedding_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)

    def forward(self):
        """Return ``(user_embeddings, item_embeddings)`` after graph propagation."""
        # Initial embeddings for all nodes: users first, then items
        x = torch.cat([self.user_embedding.weight, self.item_embedding.weight], dim=0)

        # Keep the graph on the same device as the parameters; this is a no-op
        # once they agree.
        if self.edge_index.device != x.device:
            self.edge_index = self.edge_index.to(x.device)

        # GCN propagation over full graph
        x = F.relu(self.conv1(x, self.edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, self.edge_index)

        # Split back into user/item embeddings
        user_final = x[:self.num_users]
        item_final = x[self.num_users:]
        return user_final, item_final


In [None]:
class GATRecommender(BaseRecommender):
    """Recommender that refines user/item embeddings with two GATConv layers.

    The first layer uses ``heads`` attention heads (its output width is
    ``hidden_dim * heads``); the second uses a single head of width
    ``hidden_dim``.

    Args:
        num_users: Number of users.
        num_items: Number of items.
        embedding_dim: Width of the learned input embeddings.
        hidden_dim: Per-head output width of the attention layers.
        heads: Number of attention heads in the first layer.
        learning_rate: Passed to the base class.
        dropout: Dropout probability, given to both GATConv layers and applied
            between them.
    """

    def __init__(self, num_users, num_items, embedding_dim=64, hidden_dim=64,
                 heads=8, learning_rate=0.005, dropout=0.2):
        super().__init__(num_users, num_items, embedding_dim, learning_rate, dropout)

        # No explicit .to(device) here: the embeddings are registered submodules
        # and follow the module wherever it is moved, together with the convs.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.heads = heads

        self.conv1 = GATConv(embedding_dim, hidden_dim, heads=heads, dropout=dropout)
        self.conv2 = GATConv(hidden_dim * heads, hidden_dim, heads=1, dropout=dropout)

    def forward(self):
        """Return ``(user_embeddings, item_embeddings)`` after graph propagation."""
        # Initial embeddings for all nodes: users first, then items
        x = torch.cat([self.user_embedding.weight, self.item_embedding.weight], dim=0)

        # Keep the graph on the same device as the parameters; this is a no-op
        # once they agree.
        if self.edge_index.device != x.device:
            self.edge_index = self.edge_index.to(x.device)

        # GAT propagation over full graph
        x = F.relu(self.conv1(x, self.edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, self.edge_index)

        # Split back into user/item embeddings
        user_final = x[:self.num_users]
        item_final = x[self.num_users:]
        return user_final, item_final

In [None]:
class LightGCNRecommender(BaseRecommender):
    """Recommender that propagates embeddings through a stack of LGConv layers.

    The final embedding of a node is the unweighted mean of its input
    embedding and the output of every layer.

    Args:
        num_users: Number of users.
        num_items: Number of items.
        embedding_dim: Width of the learned embeddings.
        num_layers: Number of LGConv layers.
        learning_rate: Passed to the base class.
        dropout: Passed to the base class; not applied in this ``forward``.
    """

    def __init__(self, num_users, num_items, embedding_dim=64, num_layers=8,
                 learning_rate=0.005, dropout=0.2):
        super().__init__(num_users, num_items, embedding_dim, learning_rate, dropout)
        self.num_layers = num_layers

        # No explicit .to(device) here: the embeddings are registered submodules
        # and follow the module wherever it is moved.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        self.convs = nn.ModuleList([LGConv() for _ in range(num_layers)])

    def forward(self):
        """Return ``(user_embeddings, item_embeddings)`` after graph propagation."""
        # Initial embeddings for all nodes: users first, then items
        x = torch.cat([self.user_embedding.weight, self.item_embedding.weight], dim=0)

        # Keep the graph on the same device as the parameters; this is a no-op
        # once they agree.
        if self.edge_index.device != x.device:
            self.edge_index = self.edge_index.to(x.device)

        # Layer 0 is the raw embedding; each conv adds one more hop
        embeddings = [x]
        for conv in self.convs:
            x = conv(x, self.edge_index)
            embeddings.append(x)

        # Combine all layers with equal weight
        final_embeddings = torch.stack(embeddings, dim=1).mean(dim=1)

        user_final = final_embeddings[:self.num_users]
        item_final = final_embeddings[self.num_users:]

        return user_final, item_final

In [None]:
# if __name__ == "__main__":
#     data_module = RecommenderDataModule('Amazon-KG-5core-Books.inter')
#     data_module.prepare_data()

#     models = {
#         # 'GAT': GATRecommender(
#         #     num_users=data_module.num_users,
#         #     num_items=data_module.num_items,
#         #     embedding_dim=128,
#         #     hidden_dim=128,
#         #     heads=8,
#         #     learning_rate=0.005
#         # ),
#         'GCN': GCNRecommender(
#             num_users=data_module.num_users,
#             num_items=data_module.num_items,
#             embedding_dim=128,
#             hidden_dim=128,
#             learning_rate=0.005
#         ),
#         'LightGCN': LightGCNRecommender(
#             num_users=data_module.num_users,
#             num_items=data_module.num_items,
#             embedding_dim=128,
#             num_layers=8,
#             learning_rate=0.005
#         )
#     }

#     for model_name, model in models.items():
#         print(f"\n{'='*60}")
#         print(f"Training {model_name}")
#         print(f"{'='*60}")

#         # Early stopping callback
#         early_stop_callback = EarlyStopping(
#             monitor="val_hit@10",     # metric to monitor
#             patience=20,            # number of epochs with no improvement after which training will be stopped
#             mode="max",             # 'max' because a higher val_hit@10 is better
#             verbose=True
#         )

#         checkpoint_hit = ModelCheckpoint(
#                 monitor="val_hit@10",
#                 dirpath="./checkpoints",
#                 filename=f"{model_name}-'book'-{{epoch:02d}}-{{val_hit@10:.3f}}",
#                 save_top_k=1,
#                 mode="max",
#         )

#         checkpoint_recall = ModelCheckpoint(
#                 monitor="val_recall@10",
#                 dirpath="./checkpoints",
#                 filename=f"{model_name}-'book'-{{epoch:02d}}-{{val_recall@10:.3f}}",
#                 save_top_k=1,
#                 mode="max",
#         )

#         checkpoint_ndcg = ModelCheckpoint(
#                 monitor="val_ndcg@10",
#                 dirpath="./checkpoints",
#                 filename=f"{model_name}-'book'-{{epoch:02d}}-{{val_ndcg@10:.3f}}",
#                 save_top_k=1,
#                 mode="max",
#         )

#         checkpoint_precision = ModelCheckpoint(
#                 monitor="val_precision@10",
#                 dirpath="./checkpoints",
#                 filename=f"{model_name}-'book'-{{epoch:02d}}-{{val_precision@10:.3f}}",
#                 save_top_k=1,
#                 mode="max",
#         )

#         trainer = Trainer(
#                 num_sanity_val_steps=0,
#                 max_epochs=500,
#                 accelerator="auto",
#                 callbacks=[checkpoint_hit, checkpoint_recall, checkpoint_ndcg, checkpoint_precision, early_stop_callback],
#             )

#         trainer.fit(model, data_module)

In [None]:
# ########### Validation: re-score a saved checkpoint on the held-out interactions
data_module = RecommenderDataModule('Amazon-KG-5core-Books.inter')
data_module.prepare_data()

# NOTE(review): absolute path from the original runtime; adjust when the
# checkpoints live elsewhere.
checkpoint_path = "/content/checkpoints/GAT-'book'-epoch=37-val_ndcg@10=0.066.ckpt"
model = GATRecommender.load_from_checkpoint(checkpoint_path)

# "auto" uses a GPU when one is present and otherwise falls back to CPU, so the
# cell no longer fails on a CPU-only runtime.
trainer = Trainer(accelerator="auto", devices=1)
trainer.validate(model, datamodule=data_module)