In [16]:
import os, random, math, pickle
import pandas as pd
import numpy as np
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from torch.utils.data import random_split
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

# Set environment variables for reproducibility and safety
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, accuracy_score

# 1. Configuration & Seeding
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [17]:
name = 'book'
n_clusters = 4

## 1. Learn Embedding

### 1.1 Dataset

In [18]:
class TCKGDataset(Dataset):
    def __init__(self, triplets, num_entities):
        self.triplets = triplets
        self.num_entities = num_entities
    def __len__(self):
        return len(self.triplets)
    def __getitem__(self, idx):
        # Trảmovie về bộ ba (head, relation, tail)
        return self.triplets[idx]

### 1.2 TransE Model

In [19]:
class TransE(pl.LightningModule):
    def __init__(self, num_entities, num_relations, embedding_dim=64, lr=1e-3, weight_decay=1e-4, dropout_rate=0.2):
        super().__init__()
        self.save_hyperparameters()
        
        # Khởi tạo Embeddings
        self.entity_emb = nn.Embedding(num_entities, embedding_dim)
        self.relation_emb = nn.Embedding(num_relations, embedding_dim)
        
        # Xavier initialization giúp hội tụ tốt hơn
        nn.init.xavier_uniform_(self.entity_emb.weight)
        nn.init.xavier_uniform_(self.relation_emb.weight)

        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, h, r, t):
        h_e = self.entity_emb(h)
        r_e = self.relation_emb(r)
        t_e = self.entity_emb(t)

        # 2. Embedding Normalization (Rất quan trọng cho TransE)
        # Ép độ dài các vector về 1 (Unit Norm constraint)
        h_e = F.normalize(h_e, p=2, dim=1)
        r_e = F.normalize(r_e, p=2, dim=1)
        t_e = F.normalize(t_e, p=2, dim=1)
        
        # 3. Áp dụng Dropout
        h_e = self.dropout(h_e)
        r_e = self.dropout(r_e)
        t_e = self.dropout(t_e)
        
        # Công thức (6): Khoảng cách bình phương L2
        # g_r(h, t) = ||h + r - t||^2
        score = torch.sum((h_e + r_e - t_e)**2, dim=1)
        return score

    def training_step(self, batch, batch_idx):
        h, r, t = batch[:, 0], batch[:, 1], batch[:, 2]
        
        # Tính score cho bộ ba đúng (Positive) -> Cần giảm thiểu khoảng cách này
        pos_scores = self(h, r, t)
        
        # Negative Sampling: Thay thế tail t bằng t' ngẫu nhiên
        # t' không nhất thiết phải là không đúng thực tế (simplified), nhưng xác suất cao là không đúng.
        rand_t = torch.randint(0, self.hparams.num_entities, t.shape, device=self.device)
        
        # Tính score cho bộ ba sai (Negative) -> Cần tối đa hóa khoảng cách này
        neg_scores = self(h, r, rand_t)
        
        # Công thức (7) Loss: -ln(sigmoid(g_neg - g_pos))
        # Chúng ta muốn g_neg > g_pos (khoảng cách sai lớn hơn đúng)
        # => (g_neg - g_pos) càng lớn càng tốt
        loss = -F.logsigmoid(neg_scores - pos_scores).mean()
        
        # Log loss
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        h, r, t = batch[:, 0], batch[:, 1], batch[:, 2]
        
        # 1. Tính loss trên valid set
        pos_scores = self(h, r, t)
        
        # Negative sampling (đơn giản hoá để tính loss theo dõi)
        rand_t = torch.randint(0, self.hparams.num_entities, t.shape, device=self.device)
        neg_scores = self(h, r, rand_t)
        
        val_loss = -F.logsigmoid(neg_scores - pos_scores).mean()
        self.log('val_loss', val_loss, prog_bar=True)
        return val_loss

    def configure_optimizers(self):
        # 4. Thêm weight_decay (L2 regularization) vào Adam
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr,
                                weight_decay=self.hparams.weight_decay)

### 1.3 Load dataset

In [20]:
file_path = f'./data/{name}_TCKG.csv' 
print(f"Loading data from {file_path}...")

TCKG_df = pd.read_csv(file_path)

# Tạo Mapping ID an toàn (dựa trên token để tránh trùng lặp giữa UserID và EntityID)
# Gom tất cả head và tail tokens để tạo không gian Entity
all_entities = pd.concat([TCKG_df['head_id:token'], TCKG_df['tail_id:token']]).unique()
entity_to_idx = {token: i for i, token in enumerate(all_entities)}

all_relations = TCKG_df['relation_id:token'].unique()
relation_to_idx = {token: i for i, token in enumerate(all_relations)}

print(f"Total Entities: {len(all_entities)}")
print(f"Total Relations: {len(all_relations)}")

# Chuyển đổi dữ liệu sang index
triplets_np = np.stack([
    TCKG_df['head_id:token'].map(entity_to_idx).values,
    TCKG_df['relation_id:token'].map(relation_to_idx).values,
    TCKG_df['tail_id:token'].map(entity_to_idx).values
], axis=1)

# Chuyển sang Tensor
triplets_tensor = torch.tensor(triplets_np, dtype=torch.long)
print(f'triplets_tensor.shape: {triplets_tensor.shape}')

# Tạo DataLoader
full_dataset = TCKGDataset(triplets_tensor, num_entities=len(all_entities))

# Chia 90% Train - 10% Val
train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_set, val_set = random_split(full_dataset, [train_size, val_size])

# Tạo 2 Loaders
train_loader = DataLoader(train_set, batch_size=1024, shuffle=True, num_workers=4)
val_loader = DataLoader(val_set, batch_size=1024, shuffle=False, num_workers=4)


Loading data from ./data/book_TCKG.csv...
Total Entities: 36884
Total Relations: 24
triplets_tensor.shape: torch.Size([97181, 3])


### 1.3 Init and train model

In [21]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',       # Theo dõi val_loss
    dirpath=f'./checkpoints/', # Thư mục lưu
    filename=f'{name}-transE-{timestamp}-{{epoch:02d}}-{{val_loss:.4f}}', 
    save_top_k=1,             # Chỉ giữ lại 1 model tốt nhất
    mode='min',               # Lưu khi val_loss nhỏ nhất
)
import pickle
import numpy as np
# 5. Early Stopping Callback
early_stop_callback = EarlyStopping(
    monitor='val_loss', # Theo dõi val_loss
    min_delta=0.001,    # Cải thiện tối thiểu cần thiết
    patience=10,         # Chờ 5 epochs nếu không cải thiện thì dừng
    verbose=True,
    mode='min'
)

model = TransE(
    num_entities=len(all_entities), 
    num_relations=len(all_relations), 
    embedding_dim=128, # Có thể chỉnh d-dimension tại đây
    lr=0.001,
    weight_decay=1e-3,  # Tăng lên nếu vẫn overfit (ví dụ: 1e-3)
    dropout_rate=0.3    # Tăng lên nếu vẫn overfit (tối đa 0.5)
)

# Trainer
trainer = pl.Trainer(
    max_epochs=50, 
    accelerator="auto", # Tự động dùng GPU nếu có
    callbacks=[checkpoint_callback, early_stop_callback],
    enable_progress_bar=True
)
# Bắt đầu huấn luyện
trainer.fit(model, train_loader, val_loader)
# Sau khi train, bạn có thể lấy embedding bằng:
# entity_embeddings = model.entity_emb.weight.detach().cpu().numpy()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name         | Type      | Params | Mode  | FLOPs
-----------------------------------------------------------
0 | entity_emb   | Embedding | 4.7 M  | train | 0    
1 | relation_emb | Embedding | 3.1 K  | train | 0    
2 | dropout      | Dropout   | 0      | train | 0    
-----------------------------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.897    Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode
0         Total Flops


Epoch 0: 100%|██████████| 86/86 [00:07<00:00, 11.76it/s, v_num=7, train_loss=0.504, val_loss=0.472]

Metric val_loss improved. New best score: 0.472


Epoch 1: 100%|██████████| 86/86 [00:07<00:00, 11.84it/s, v_num=7, train_loss=0.347, val_loss=0.433]

Metric val_loss improved by 0.039 >= min_delta = 0.001. New best score: 0.433


Epoch 2: 100%|██████████| 86/86 [00:07<00:00, 11.67it/s, v_num=7, train_loss=0.303, val_loss=0.411]

Metric val_loss improved by 0.022 >= min_delta = 0.001. New best score: 0.411


Epoch 3: 100%|██████████| 86/86 [00:07<00:00, 11.25it/s, v_num=7, train_loss=0.268, val_loss=0.404]

Metric val_loss improved by 0.007 >= min_delta = 0.001. New best score: 0.404


Epoch 4: 100%|██████████| 86/86 [00:07<00:00, 11.66it/s, v_num=7, train_loss=0.253, val_loss=0.397]

Metric val_loss improved by 0.007 >= min_delta = 0.001. New best score: 0.397


Epoch 5: 100%|██████████| 86/86 [00:07<00:00, 11.40it/s, v_num=7, train_loss=0.251, val_loss=0.393]

Metric val_loss improved by 0.004 >= min_delta = 0.001. New best score: 0.393


Epoch 7: 100%|██████████| 86/86 [00:07<00:00, 11.40it/s, v_num=7, train_loss=0.248, val_loss=0.390]

Metric val_loss improved by 0.002 >= min_delta = 0.001. New best score: 0.390


Epoch 8: 100%|██████████| 86/86 [00:07<00:00, 11.55it/s, v_num=7, train_loss=0.244, val_loss=0.386]

Metric val_loss improved by 0.005 >= min_delta = 0.001. New best score: 0.386


Epoch 10: 100%|██████████| 86/86 [00:07<00:00, 11.55it/s, v_num=7, train_loss=0.237, val_loss=0.384]

Metric val_loss improved by 0.001 >= min_delta = 0.001. New best score: 0.384


Epoch 16: 100%|██████████| 86/86 [00:07<00:00, 11.41it/s, v_num=7, train_loss=0.253, val_loss=0.380]

Metric val_loss improved by 0.004 >= min_delta = 0.001. New best score: 0.380


Epoch 26: 100%|██████████| 86/86 [00:07<00:00, 11.75it/s, v_num=7, train_loss=0.256, val_loss=0.384]

Monitored metric val_loss did not improve in the last 10 records. Best score: 0.380. Signaling Trainer to stop.


Epoch 26: 100%|██████████| 86/86 [00:07<00:00, 11.74it/s, v_num=7, train_loss=0.256, val_loss=0.384]


### 1.5 Save trained 

In [22]:
# 1. Extract Embeddings from Model (move to CPU and convert to numpy)
entity_embeddings = model.entity_emb.weight.detach().cpu().numpy()
relation_embeddings = model.relation_emb.weight.detach().cpu().numpy()

# 2. Package everything into a dictionary
saved_data = {
    'entity_embeddings': entity_embeddings,      # (Num_Entities, dim)
    'relation_embeddings': relation_embeddings,  # (Num_Relations, dim)
    'entity_to_idx': entity_to_idx,              # Dict: logs"item_123" -> 0
    'relation_to_idx': relation_to_idx           # Dict: "interacted_0" -> 0
}
# 3. Save to a single file
with open(f'./pickle/{name}_transE_embeddings_{timestamp}.pkl', 'wb') as f:
    pickle.dump(saved_data, f)
print("Embeddings and mappings saved successfully!")

Embeddings and mappings saved successfully!
