In [102]:
import os, random, math, pickle
import pandas as pd
import numpy as np
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from torch.utils.data import random_split
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

# Set environment variables for reproducibility and safety
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, accuracy_score

# 1. Configuration & Seeding
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [103]:
name = 'movie'
n_clusters = 4

## 1. Learn Embedding

### 1.1 Dataset

In [104]:
class TCKGDataset(Dataset):
    def __init__(self, triplets):
        self.triplets = triplets
    def __len__(self):
        return len(self.triplets)
    def __getitem__(self, idx):
        # Trảmovie về bộ ba (head, relation, tail)
        return self.triplets[idx]

### 1.2 TransE Model

In [105]:
class TransE(pl.LightningModule):
    def __init__(self, num_entities, num_relations, embedding_dim=64, lr=1e-3, weight_decay=1e-4, dropout_rate=0.2):
        super().__init__()
        self.save_hyperparameters()
        
        # Khởi tạo Embeddings
        self.entity_emb = nn.Embedding(num_entities + 1, embedding_dim, padding_idx=0)     # +1 because starting at 1 instead of 0
        self.relation_emb = nn.Embedding(num_relations + 1, embedding_dim, padding_idx=0)
        
        # # Xavier initialization giúp hội tụ tốt hơn
        # nn.init.xavier_uniform_(self.entity_emb.weight)
        # nn.init.xavier_uniform_(self.relation_emb.weight)

        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, h, r, t):
        h_e = self.entity_emb(h)
        r_e = self.relation_emb(r)
        t_e = self.entity_emb(t)

        # 2. Embedding Normalization (Rất quan trọng cho TransE)
        # Ép độ dài các vector về 1 (Unit Norm constraint)
        h_e = F.normalize(h_e, p=2, dim=1)
        r_e = F.normalize(r_e, p=2, dim=1)
        t_e = F.normalize(t_e, p=2, dim=1)
        
        # 3. Áp dụng Dropout
        h_e = self.dropout(h_e)
        r_e = self.dropout(r_e)
        t_e = self.dropout(t_e)
        
        # Công thức (6): Khoảng cách bình phương L2
        # g_r(h, t) = ||h + r - t||^2
        score = torch.sum((h_e + r_e - t_e)**2, dim=1)
        return score

    def training_step(self, batch, batch_idx):
        h, r, t = batch[:, 0], batch[:, 1], batch[:, 2]
        
        # Tính score cho bộ ba đúng (Positive) -> Cần giảm thiểu khoảng cách này
        pos_scores = self(h, r, t)
        
        # Negative Sampling: Thay thế tail t bằng t' ngẫu nhiên
        # t' không nhất thiết phải là không đúng thực tế (simplified), nhưng xác suất cao là không đúng.
        rand_t = torch.randint(1, self.hparams.num_entities + 1, t.shape, device=self.device)
        
        # Tính score cho bộ ba sai (Negative) -> Cần tối đa hóa khoảng cách này
        neg_scores = self(h, r, rand_t)
        
        # Công thức (7) Loss: -ln(sigmoid(g_neg - g_pos))
        # Chúng ta muốn g_neg > g_pos (khoảng cách sai lớn hơn đúng)
        # => (g_neg - g_pos) càng lớn càng tốt
        loss = -F.logsigmoid(neg_scores - pos_scores).mean()
        
        # Log loss
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        h, r, t = batch[:, 0], batch[:, 1], batch[:, 2]
        
        # 1. Tính loss trên valid set
        pos_scores = self(h, r, t)
        
        # Negative sampling (đơn giản hoá để tính loss theo dõi)
        rand_t = torch.randint(1, self.hparams.num_entities + 1, t.shape, device=self.device)

        neg_scores = self(h, r, rand_t)
        
        val_loss = -F.logsigmoid(neg_scores - pos_scores).mean()
        self.log('val_loss', val_loss, prog_bar=True)
        return val_loss

    def configure_optimizers(self):
        # 4. Thêm weight_decay (L2 regularization) vào Adam
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr,
                                weight_decay=self.hparams.weight_decay)

### 1.3 Load dataset

In [106]:
file_path = f'./data/{name}_TCKG.csv' 
print(f"Loading data from {file_path}...")

TCKG_df = pd.read_csv(file_path)

# Chuyển đổi dữ liệu sang index
triplets_np = np.stack([
    TCKG_df['head_id'],
    TCKG_df['relation_id'],
    TCKG_df['tail_id']
], axis=1)


# Chuyển sang Tensor
triplets_tensor = torch.tensor(triplets_np, dtype=torch.long)
print(f'triplets_tensor.shape: {triplets_tensor.shape}')

# Tạo DataLoader
full_dataset = TCKGDataset(triplets_tensor)

# Chia 90% Train - 10% Val
train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_set, val_set = random_split(full_dataset, [train_size, val_size])

# Tạo 2 Loaders
train_loader = DataLoader(train_set, batch_size=1024, shuffle=True, num_workers=4)
val_loader = DataLoader(val_set, batch_size=1024, shuffle=False, num_workers=4)


Loading data from ./data/movie_TCKG.csv...
triplets_tensor.shape: torch.Size([164383, 3])


### 1.3 Init and train model

In [107]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

num_entites = pd.concat([TCKG_df['head_id'], TCKG_df['tail_id']]).max()
num_relations = TCKG_df['relation_id'].max()

print(f"Total Entities: {num_entites}")
print(f"Total Relations: {num_relations}")

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',       # Theo dõi val_loss
    dirpath=f'./checkpoints/', # Thư mục lưu
    filename=f'{name}-transE-{timestamp}-{{epoch:02d}}-{{val_loss:.4f}}', 
    save_top_k=1,             # Chỉ giữ lại 1 model tốt nhất
    mode='min',               # Lưu khi val_loss nhỏ nhất
)

# 5. Early Stopping Callback
early_stop_callback = EarlyStopping(
    monitor='val_loss', # Theo dõi val_loss
    min_delta=0.001,    # Cải thiện tối thiểu cần thiết
    patience=10,         # Chờ 5 epochs nếu không cải thiện thì dừng
    verbose=True,
    mode='min'
)

model = TransE(
    num_entities=num_entites, 
    num_relations=num_relations, 
    embedding_dim=64, # Có thể chỉnh d-dimension tại đây
    lr=0.001,
    weight_decay=1e-3,  # Tăng lên nếu vẫn overfit (ví dụ: 1e-3)
    dropout_rate=0.3    # Tăng lên nếu vẫn overfit (tối đa 0.5)
)

# Trainer
trainer = pl.Trainer(
    max_epochs=50, 
    accelerator="auto", # Tự động dùng GPU nếu có
    callbacks=[checkpoint_callback, early_stop_callback],
    enable_progress_bar=True
)
# Bắt đầu huấn luyện
trainer.fit(model, train_loader, val_loader)
# Sau khi train, bạn có thể lấy embedding bằng:
# entity_embeddings = model.entity_emb.weight.detach().cpu().numpy()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name         | Type      | Params | Mode  | FLOPs
-----------------------------------------------------------
0 | entity_emb   | Embedding | 3.7 M  | train | 0    
1 | relation_emb | Embedding | 1.6 K  | train | 0    
2 | dropout      | Dropout   | 0      | train | 0    
-----------------------------------------------------------
3.7 M     Trainable params
0         Non-trainable params
3.7 M     Total params
14.990    Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode
0         Total Flops


Total Entities: 58529
Total Relations: 24
Epoch 0:   0%|          | 0/145 [00:00<?, ?it/s]                           

1982.33s - thread._ident is None in _get_related_thread!


Epoch 0: 100%|██████████| 145/145 [00:15<00:00,  9.33it/s, v_num=0, train_loss=0.809, val_loss=0.726]

Metric val_loss improved. New best score: 0.726


Epoch 1: 100%|██████████| 145/145 [00:13<00:00, 10.55it/s, v_num=0, train_loss=0.840, val_loss=0.724]

Metric val_loss improved by 0.003 >= min_delta = 0.001. New best score: 0.724


Epoch 2:   0%|          | 0/145 [00:00<?, ?it/s, v_num=0, train_loss=0.840, val_loss=0.724]          

2012.78s - thread._ident is None in _get_related_thread!


Epoch 2: 100%|██████████| 145/145 [00:14<00:00, 10.05it/s, v_num=0, train_loss=0.779, val_loss=0.719]

Metric val_loss improved by 0.005 >= min_delta = 0.001. New best score: 0.719


Epoch 3: 100%|██████████| 145/145 [00:13<00:00, 10.87it/s, v_num=0, train_loss=0.776, val_loss=0.718]

Metric val_loss improved by 0.001 >= min_delta = 0.001. New best score: 0.718


Epoch 4: 100%|██████████| 145/145 [00:11<00:00, 12.12it/s, v_num=0, train_loss=0.780, val_loss=0.715]

Metric val_loss improved by 0.003 >= min_delta = 0.001. New best score: 0.715


Epoch 5: 100%|██████████| 145/145 [00:12<00:00, 11.88it/s, v_num=0, train_loss=0.812, val_loss=0.715]

2064.56s - thread._ident is None in _get_related_thread!


Epoch 5: 100%|██████████| 145/145 [00:13<00:00, 10.95it/s, v_num=0, train_loss=0.812, val_loss=0.709]

Metric val_loss improved by 0.006 >= min_delta = 0.001. New best score: 0.709


Epoch 6:  90%|████████▉ | 130/145 [00:09<00:01, 13.03it/s, v_num=0, train_loss=0.790, val_loss=0.709]

FileNotFoundError: [Errno 2] No such file or directory: '/home/hp/Study/07. Luan Van/03. TPRec/03. Learn Embedding/lightning_logs/version_0/metrics.csv'

### 1.5 Save trained 

In [None]:
# 1. Extract Embeddings from Model (move to CPU and convert to numpy)
entity_embeddings = model.entity_emb.weight.detach().cpu().numpy()
relation_embeddings = model.relation_emb.weight.detach().cpu().numpy()

# 2. Package everything into a dictionary
saved_data = {
    'entity_embeddings': entity_embeddings,      # (Num_Entities, dim)
    'relation_embeddings': relation_embeddings,  # (Num_Relations, dim)
}
# 3. Save to a single file
with open(f'./pickle/{name}_transE_embeddings_{timestamp}.pkl', 'wb') as f:
    pickle.dump(saved_data, f)
print("Embeddings and mappings saved successfully!")