In [None]:
# # 1. Uninstall the currently installed (too new) versions
# !pip uninstall torch torchvision torchaudio torch-scatter torch-sparse torch-geometric torch-geometric-temporal -y

# # 2. Install PyTorch 2.5.1 (stable release) + CUDA 12.4
# !pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124

# # 3. Install the companion libraries (Scatter/Sparse) built SPECIFICALLY for version 2.5.1
# !pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.5.1+cu124.html

# # 4. Install the main libraries
# !pip install pytorch_lightning torch-geometric torch-geometric-temporal

# # # 5. Runtime > Restart session
# # # 6. Ignore this !pip section

In [None]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch_geometric_temporal.nn.recurrent import TGCN
from torch.utils.data import DataLoader, TensorDataset
from collections import defaultdict

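# Force synchronous CUDA kernel launches so that device-side errors are reported
# at the call that caused them. This is a debugging aid (it slows GPU execution),
# not a reproducibility setting.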
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, accuracy_score

# 1. Configuration & Seeding
SEED = 42

# Seed every random number generator used in this notebook (Python, NumPy and
# PyTorch on CPU) with the same value, then all CUDA devices when a GPU exists.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [None]:
class DataModule(pl.LightningDataModule):
    """Interaction data with a month-based temporal train/val/test split.

    Reads a CSV containing ``user_id``, ``item_id`` and ``timestamp`` columns,
    maps the raw IDs to contiguous indices, splits the rows chronologically by
    calendar month and builds an undirected user-item graph from the training
    rows only.

    Index conventions:
        * Batches and the per-user history sets use *local* indices:
          ``user_idx`` in ``[0, num_users)`` and ``item_idx`` in
          ``[0, num_items)``.
        * In ``edge_index`` item nodes are shifted by ``num_users`` so that
          users and items share one node ID space.

    Args:
        interaction_file: Path of the interactions CSV.
        batch_size: Number of (user, item) pairs per batch.
        train_size: Fraction of the distinct months assigned to training.
        val_size: Fraction of the distinct months assigned to validation.
        test_size: Stored for reference only; the test split is made of the
            months left over after the train and validation months.
        num_workers: Number of worker processes used by every DataLoader.
    """

    def __init__(self, interaction_file, batch_size=1024, train_size=0.7, val_size=0.15, test_size=0.15,
                 num_workers=2):
        super().__init__()
        self.interaction_file = interaction_file
        self.batch_size = batch_size
        self.train_size = train_size
        self.val_size = val_size
        self.test_size = test_size
        self.num_workers = num_workers
        # Set once prepare_data() has completed, so repeated calls are no-ops.
        self._is_prepared = False

    def prepare_data(self):
        """Load the CSV, build the index mappings, the splits and the train graph.

        Populates ``num_users``, ``num_items``, ``user_to_idx``,
        ``item_to_idx``, ``train_df``/``val_df``/``test_df``, ``edge_index``
        and the three ``*_user_pos_items`` dictionaries.

        The method is idempotent: it is typically called by hand (to read
        ``num_users``/``num_items`` before the model is created) and then once
        more by the Trainer, and the second call must not re-read the file.
        Create a new ``DataModule`` to load different data.

        Raises:
            ValueError: If the temporal split leaves the training set empty.
        """
        if self._is_prepared:
            return

        # --- 1. Load & preprocess ---
        df = pd.read_csv(self.interaction_file)

        # Parse the timestamps and bucket every interaction by calendar month.
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df['month'] = df['timestamp'].dt.to_period('M')

        # Map raw IDs to contiguous indices (0 .. N-1), in order of first appearance.
        unique_users = df['user_id'].unique()
        unique_items = df['item_id'].unique()

        self.num_users = len(unique_users)
        self.num_items = len(unique_items)

        self.user_to_idx = {u: idx for idx, u in enumerate(unique_users)}
        self.item_to_idx = {i: idx for idx, i in enumerate(unique_items)}

        # Vectorised mapping (much faster than iterating over the rows).
        df['user_idx'] = df['user_id'].map(self.user_to_idx)
        df['item_idx'] = df['item_id'].map(self.item_to_idx)

        # --- 2. Temporal split ---
        months = sorted(df['month'].unique())
        n_months = len(months)

        # Both boundaries are truncated towards zero; the remaining months
        # (including any rounding remainder) end up in the test split.
        train_end = int(n_months * self.train_size)
        val_end = train_end + int(n_months * self.val_size)

        train_months = months[:train_end]
        val_months = months[train_end:val_end]
        test_months = months[val_end:]

        self.train_df = df[df['month'].isin(train_months)]
        self.val_df = df[df['month'].isin(val_months)]
        self.test_df = df[df['month'].isin(test_months)]

        print(f"Split sizes -> Train: {len(self.train_df)}, Val: {len(self.val_df)}, Test: {len(self.test_df)}")

        if self.train_df.empty:
            # Fail here with an explicit message instead of later, when the
            # shuffling sampler rejects an empty dataset.
            raise ValueError(
                f"Temporal split produced an empty training set: {n_months} distinct month(s) "
                f"with train_size={self.train_size} gives {train_end} training month(s)."
            )

        # --- 3. Build the graph (edge_index) from the training set only ---
        # Item nodes get IDs from num_users to num_users + num_items - 1.
        src = torch.tensor(self.train_df['user_idx'].to_numpy(), dtype=torch.long)
        dst = torch.tensor(self.train_df['item_idx'].to_numpy(), dtype=torch.long) + self.num_users

        # Undirected graph: store both user->item and item->user edges.
        self.edge_index = torch.stack([torch.cat([src, dst]), torch.cat([dst, src])], dim=0)

        # --- 4. Per-user interaction history (for sampling / evaluation) ---
        # Sets give O(1) membership tests.
        self.train_user_pos_items = self._build_user_history(self.train_df)
        self.val_user_pos_items = self._build_user_history(self.val_df)
        self.test_user_pos_items = self._build_user_history(self.test_df)

        self._is_prepared = True

    def _build_user_history(self, df_subset):
        """Group the local item indices of ``df_subset`` by local user index.

        Returns:
            A ``defaultdict(set)`` mapping ``user_idx`` to the set of
            ``item_idx`` values that user interacted with in ``df_subset``.
        """
        user_pos_items = defaultdict(set)
        # zip over the two columns is much faster than DataFrame.iterrows().
        for u, i in zip(df_subset['user_idx'], df_subset['item_idx']):
            user_pos_items[u].add(i)
        return user_pos_items

    def _create_dataloader(self, df_subset, shuffle):
        """Wrap the (user_idx, item_idx) pairs of ``df_subset`` in a DataLoader."""
        users = torch.tensor(df_subset['user_idx'].to_numpy(), dtype=torch.long)
        items = torch.tensor(df_subset['item_idx'].to_numpy(), dtype=torch.long)
        dataset = TensorDataset(users, items)
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle, num_workers=self.num_workers)

    def train_dataloader(self):
        """Return the shuffled DataLoader over the training interactions."""
        return self._create_dataloader(self.train_df, shuffle=True)

    def val_dataloader(self):
        """Return the DataLoader over the validation interactions (not shuffled)."""
        return self._create_dataloader(self.val_df, shuffle=False)

    def test_dataloader(self):
        """Return the DataLoader over the test interactions (not shuffled)."""
        return self._create_dataloader(self.test_df, shuffle=False)

In [None]:
class TGCNRecommender(pl.LightningModule):
    """Recommender that refines learnable node embeddings with stacked T-GCN cells.

    Users and items share one embedding table (users first, then items). The
    score of a (user, item) pair is ``exp(-L1 distance)`` between their
    refined embeddings, and training minimises a binary cross-entropy over
    observed pairs (label 1) and mined hard negatives (label 0).

    Args:
        num_cells: Number of T-GCN cells applied one after another.
        num_users: Number of user nodes.
        num_items: Number of item nodes.
        batch_size: Stored as a hyperparameter only; not used by the model.
        embedding_dim: Size of the node embeddings and of the T-GCN output.
        lr: Learning rate of the Adam optimizer.
        num_negatives: Number of hard negatives mined per positive pair.

    Raises:
        ValueError: If ``num_negatives`` is smaller than 1.
    """

    def __init__(self, num_cells, num_users, num_items, batch_size, embedding_dim, lr, num_negatives=10):
        super().__init__()
        self.save_hyperparameters()

        if num_negatives < 1:
            raise ValueError(f"num_negatives must be >= 1, got {num_negatives}")

        # Kept for backward compatibility; tensors now follow the module's
        # actual device instead of this guess (see setup()).
        model_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_device = model_device

        self.num_cells = num_cells

        self.num_users = num_users
        self.num_items = num_items
        self.num_nodes = self.num_users + self.num_items

        self.embedding_dim = embedding_dim
        self.num_negatives = num_negatives

        # Learnable node embeddings: rows [0, num_users) are users, the rest items.
        self.node_emb = nn.Embedding(self.num_nodes, embedding_dim)
        nn.init.xavier_uniform_(self.node_emb.weight)

        # Stack of T-GCN cells, each mapping embedding_dim -> embedding_dim.
        self.tgcns = nn.ModuleList([TGCN(in_channels=embedding_dim,
                                        out_channels=embedding_dim) for _ in range(num_cells)])

        self.lr = lr
        self.h0 = None

    def setup(self, stage=None):
        """Fetch the training graph and the per-user histories from the datamodule."""
        datamodule = self.trainer.datamodule

        # A non-persistent buffer is moved together with the module to whatever
        # device the Trainer selects, and is left out of checkpoints.
        self.register_buffer('edge_index', datamodule.edge_index, persistent=False)

        self.train_user_pos_items = datamodule.train_user_pos_items
        self.val_user_pos_items = datamodule.val_user_pos_items
        self.test_user_pos_items = datamodule.test_user_pos_items

    def on_train_epoch_start(self):
        """Reset the initial hidden state at the start of every training epoch."""
        self.h0 = None

    def on_test_epoch_start(self):
        """Reset the initial hidden state at the start of every test epoch."""
        self.h0 = None

    def forward(self):
        """Run all T-GCN cells over the full graph.

        Returns:
            ``(user_embs, item_embs)``: the first ``num_users`` rows and the
            remaining rows of the final hidden state, whose shape is
            ``[num_nodes, embedding_dim]``.
        """
        # 1. Current node embeddings, used as the input features of every cell.
        x = self.node_emb.weight

        # 2. Chain the cells: each one receives the previous hidden state.
        #    H must be passed by keyword because the third positional
        #    parameter of TGCN.forward is edge_weight, not the hidden state.
        hidden = self.h0
        for cell in self.tgcns:
            hidden = cell(x, self.edge_index, H=hidden)  # [num_nodes, embedding_dim]

        user_embs = hidden[:self.num_users]
        item_embs = hidden[self.num_users:]

        return user_embs, item_embs

    def compute_loss(self, batch, user_embs, item_embs):
        """Binary cross-entropy over observed pairs and mined hard negatives.

        Args:
            batch: ``(user_ids, item_ids)`` long tensors of equal length
                holding *local* indices, i.e. rows of ``user_embs`` and
                ``item_embs`` respectively (no ``num_users`` offset).
            user_embs: User embeddings returned by ``forward``.
            item_embs: Item embeddings returned by ``forward``.

        Returns:
            Scalar loss tensor.
        """
        user_ids, item_ids = batch

        user_emb = user_embs[user_ids]   # [B, D]
        pos_emb = item_embs[item_ids]    # [B, D]

        # Positive scores: exp(-L1 distance), always in (0, 1].
        pos_scores = torch.exp(-torch.abs(user_emb - pos_emb).sum(dim=1))

        ####################### Hard negative sampling #######################
        # Only the selected indices are needed, so mining runs without autograd.
        # Ranking by smallest distance equals ranking by largest exp(-distance)
        # but cannot collapse into ties when the exponential underflows.
        with torch.no_grad():
            distances = torch.cdist(user_emb, item_embs, p=1)  # [B, num_items]

            # Exclude every item the user interacted with in the TRAINING set.
            # Validation/test interactions are deliberately not masked: the
            # model must only see information available in the training data.
            rows, cols = [], []
            for row, user in enumerate(user_ids.tolist()):
                seen_items = self.train_user_pos_items.get(user, ())
                rows.extend([row] * len(seen_items))
                cols.extend(seen_items)
            if rows:
                row_idx = torch.tensor(rows, dtype=torch.long, device=distances.device)
                col_idx = torch.tensor(cols, dtype=torch.long, device=distances.device)
                distances[row_idx, col_idx] = float('inf')

            # topk requires k <= number of candidates.
            k = min(self.num_negatives, distances.size(1))
            neg_item_ids = torch.topk(distances, k=k, dim=1, largest=False).indices  # [B, k]

        # Re-score the mined negatives with gradients enabled.
        neg_emb = item_embs[neg_item_ids]  # [B, k, D]
        neg_scores = torch.exp(-torch.abs(user_emb.unsqueeze(1) - neg_emb).sum(dim=2))
        neg_scores = neg_scores.mean(dim=1)
        ####################### Hard negative sampling #######################

        ####################### Compute loss #######################
        scores = torch.cat([pos_scores, neg_scores], dim=0)
        labels = torch.cat([torch.ones_like(pos_scores), torch.zeros_like(neg_scores)], dim=0)

        loss = F.binary_cross_entropy(scores, labels)
        ####################### Compute loss #######################
        return loss

    def training_step(self, batch, batch_idx):
        """Recompute the embeddings on the full graph, then score the batch."""
        user_embs, item_embs = self()
        loss = self.compute_loss(batch, user_embs, item_embs)

        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        """Optimise all parameters with Adam at the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
if __name__ == "__main__":
    # The data has to be prepared before the model is built, because the model
    # needs the number of users and items found in the interaction file.
    data_module = DataModule('data/book_interaction.csv')
    data_module.prepare_data()

    # Model hyperparameters.
    model_kwargs = dict(
        num_cells=3,
        num_users=data_module.num_users,
        num_items=data_module.num_items,
        batch_size=1024,
        embedding_dim=64,
        lr=0.001,
    )
    model = TGCNRecommender(**model_kwargs)

    # Trainer settings: a single device, picked automatically, logging every step.
    trainer_kwargs = dict(
        max_epochs=10,
        accelerator="auto",
        devices=1,
        enable_progress_bar=True,
        log_every_n_steps=1,
    )
    trainer = pl.Trainer(**trainer_kwargs)

    trainer.fit(model, datamodule=data_module)
