In [4]:
import os
import random, math
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

# CUDA_LAUNCH_BLOCKING=1 is documented by CUDA/PyTorch as a debugging switch
# that makes kernel launches synchronous; it is not a reproducibility setting.
# NOTE(review): confirm it is still wanted for full training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import warnings
# Install a blanket 'ignore' filter, so Python warnings are hidden from here on.
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, accuracy_score

# 1. Configuration & Seeding
# A single seed is shared by every random number generator used in the notebook.
SEED = 42

# Seed Python's `random`, NumPy's global generator and PyTorch's default generator.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
del _seed_fn  # keep the notebook namespace free of the loop variable

# Seed the CUDA generators too when a GPU is visible.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [5]:
# Number of clusters; not used in the cells visible here.
# NOTE(review): presumably consumed by a later clustering step - confirm.
n_clusters = 4
# Dataset identifier; the loading cell reads ./data/{name}/{name}_TCKG.csv.
name = 'book'

## 1. Learn Embedding

### 1.1 Dataset

In [6]:
class TCKGDataset(Dataset):
    """Map-style dataset that serves knowledge-graph triplets by position.

    Args:
        triplets: Indexable collection of (head, relation, tail) triplets;
            it must support ``len()`` and integer indexing.
        num_entities: Number of distinct entities. Stored on the instance
            only; the dataset itself never reads it.
    """

    def __init__(self, triplets, num_entities):
        self.num_entities = num_entities
        self.triplets = triplets

    def __len__(self) -> int:
        """Return the number of stored triplets."""
        return len(self.triplets)

    def __getitem__(self, idx):
        """Return the (head, relation, tail) triplet stored at position ``idx``."""
        triplet = self.triplets[idx]
        return triplet

### 1.2 TransE Model

In [None]:
class TransE(pl.LightningModule):
    """TransE knowledge-graph embedding model trained with sampled negatives.

    Every entity and every relation gets a learned vector of size
    ``embedding_dim``. A triplet ``(h, r, t)`` is scored with the squared L2
    distance ``||h + r - t||^2``, so a lower score means a better fit.

    Args:
        num_entities: Number of rows in the entity embedding table; also the
            exclusive upper bound used when sampling negative tails.
        num_relations: Number of rows in the relation embedding table.
        embedding_dim: Size of each entity / relation vector.
        lr: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, num_entities, num_relations, embedding_dim=64, lr=1e-3):
        super().__init__()
        # Expose the constructor arguments through self.hparams; they are read
        # back in training_step (num_entities) and configure_optimizers (lr).
        self.save_hyperparameters()

        # Embedding tables
        self.entity_emb = nn.Embedding(num_entities, embedding_dim)
        self.relation_emb = nn.Embedding(num_relations, embedding_dim)

        # Xavier initialization helps convergence
        nn.init.xavier_uniform_(self.entity_emb.weight)
        nn.init.xavier_uniform_(self.relation_emb.weight)

    def forward(self, h, r, t):
        """Score triplets; lower means more plausible.

        Args:
            h: Integer tensor of head-entity indices.
            r: Integer tensor of relation indices, same shape as ``h``.
            t: Integer tensor of tail-entity indices, same shape as ``h``.

        Returns:
            Tensor shaped like ``h`` holding ``||h + r - t||^2`` per triplet.
        """
        h_e = self.entity_emb(h)
        r_e = self.relation_emb(r)
        t_e = self.entity_emb(t)

        # Formula (6): squared L2 distance
        # g_r(h, t) = ||h + r - t||^2
        # Reduce over the embedding axis (the last one); for the 1-D index
        # batches built in training_step this is the same as dim=1.
        score = torch.sum((h_e + r_e - t_e) ** 2, dim=-1)
        return score

    def training_step(self, batch, batch_idx):
        """Compute the pairwise ranking loss for one batch of triplets.

        Args:
            batch: Integer tensor indexed as ``batch[:, 0]`` (head),
                ``batch[:, 1]`` (relation) and ``batch[:, 2]`` (tail).
            batch_idx: Batch position supplied by Lightning (unused).

        Returns:
            Scalar loss tensor, also logged as ``train_loss``.
        """
        h, r, t = batch[:, 0], batch[:, 1], batch[:, 2]

        # Score of the observed (positive) triplets -> this distance should shrink
        pos_scores = self(h, r, t)

        # Negative sampling: replace the tail t with a uniformly random t'.
        # Simplified scheme: t' is not guaranteed to form a false triplet, but
        # it is very likely to.
        rand_t = torch.randint(0, self.hparams.num_entities, t.shape, device=self.device)

        # Score of the corrupted (negative) triplets -> this distance should grow
        neg_scores = self(h, r, rand_t)

        # Formula (7) loss: -ln(sigmoid(g_neg - g_pos))
        # We want g_neg > g_pos (the wrong triplet farther away than the right
        # one), i.e. (g_neg - g_pos) as large as possible.
        loss = -F.logsigmoid(neg_scores - pos_scores).mean()

        # Log loss
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``hparams.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

### 1.3 Load dataset

In [9]:
# Read the TCKG triplet table of the dataset selected by `name`.
file_path = f'./data/{name}/{name}_TCKG.csv'
print(f"Loading data from {file_path}...")

TCKG_df = pd.read_csv(file_path)

# Build safe ID mappings from the raw tokens. Head and tail tokens are pooled
# into one entity space so that user IDs and entity IDs cannot collide.
all_entities = pd.concat([TCKG_df['head_id:token'], TCKG_df['tail_id:token']]).unique()
entity_to_idx = dict(zip(all_entities, range(len(all_entities))))

all_relations = TCKG_df['relation_id:token'].unique()
relation_to_idx = dict(zip(all_relations, range(len(all_relations))))

print(f"Total Entities: {len(all_entities)}")
print(f"Total Relations: {len(all_relations)}")

# Encode each column with its vocabulary and stack them as (head, relation, tail).
triplets_np = np.stack(
    [
        TCKG_df[column].map(vocab).values
        for column, vocab in (
            ('head_id:token', entity_to_idx),
            ('relation_id:token', relation_to_idx),
            ('tail_id:token', entity_to_idx),
        )
    ],
    axis=1,
)

# Convert to a tensor of integer indices
triplets_tensor = torch.tensor(triplets_np, dtype=torch.long)
print(f'triplets_tensor.shape: {triplets_tensor.shape}')

# Wrap the triplets in a dataset and a shuffling DataLoader
dataset = TCKGDataset(triplets_tensor, num_entities=len(all_entities))
train_loader = DataLoader(dataset, shuffle=True, batch_size=1024, num_workers=0)


Loading data from ./data/book/book_TCKG.csv...
Total Entities: 46186
Total Relations: 24
triplets_tensor.shape: torch.Size([187274, 3])


### 1.4 Init and train model

In [None]:
# Build a TransE model sized to the entity and relation vocabularies.
model = TransE(
    num_entities=len(all_entities),
    num_relations=len(all_relations),
    embedding_dim=64,  # the embedding dimension d can be tuned here
    lr=1e-3,
)

# Trainer; accelerator="auto" picks a GPU automatically when one is available.
trainer = pl.Trainer(max_epochs=20, accelerator="auto", enable_progress_bar=True)

# Start training
trainer.fit(model, train_loader)

# After training, the entity embeddings can be extracted with:
# entity_embeddings = model.entity_emb.weight.detach().cpu().numpy()