# Neural Machine Translation (English → Vietnamese) with Transformer (from scratch)

Notebook cho bài tập lớn môn NLP 2025: xây dựng mô hình Seq2Seq dùng kiến trúc Transformer từ các thành phần cơ bản, huấn luyện trên tập **IWSLT2015 En–Vi**, đánh giá BLEU và hỗ trợ dịch câu.

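Cấu trúc notebook:
0. Cài đặt & Thiết lập
1. Dữ liệu (IWSLT2015 En–Vi)
2. Tokenizer (BPE)
3. Hàm encode, padding và `DataLoader` (Bucket Sampler)
4. Tạo mask (padding & causal)
5. Mô hình Transformer (Encoder, Decoder, Multi-Head Attention)
6. Loss với Label Smoothing & Noam Scheduler
7. Huấn luyện & Đánh giá (Loss, Perplexity)
8. Suy diễn (Beam Search)
9. Đánh giá BLEU trên tập Test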

In [None]:
!pip install -q datasets tokenizers sacrebleu torchmetrics

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import math
import random
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Sampler

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

import sacrebleu

# Report the installed PyTorch build, then pick the GPU when CUDA is
# available and fall back to the CPU otherwise. `device` is read as a
# module-level global by the training and inference helpers further down.
print("PyTorch:", torch.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Reproducibility
def set_seed(seed: int = 42):
    """Seed every RNG this notebook relies on and pin cuDNN to deterministic mode.

    Args:
        seed: Value handed to Python's ``random`` module and to PyTorch's
            CPU and CUDA generators.
    """
    for seeder in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed all RNGs before anything random happens.
set_seed(42)

# Hyperparameters
# -- Model size (passed to TransformerNMT below).
d_model = 512            # embedding / hidden width; also the Noam scheduler's model size
n_heads = 8              # attention heads; d_model must be divisible by this
num_encoder_layers = 4   # can be raised to 6 if the GPU is powerful enough
num_decoder_layers = 4
d_ff = 2048              # inner width of the position-wise feed-forward block
dropout = 0.1

# -- Requested BPE vocabulary sizes. Both names are overwritten later with the
#    sizes reported by the trained tokenizers.
src_vocab_size = 20000
tgt_vocab_size = 20000

# -- Maximum encoded lengths, counted *including* the BOS/EOS tokens that
#    encode_text adds.
max_src_len = 80
max_tgt_len = 80

# -- Optimisation settings.
batch_size = 64
num_epochs = 10
label_smoothing = 0.1
warmup_steps = 4000          # Noam warm-up length, in optimizer steps
learning_rate_factor = 1.0   # multiplier applied to the Noam schedule
max_grad_norm = 1.0          # gradient-norm clipping threshold used in train_epoch

PyTorch: 2.9.0+cu126
Device: cuda


## 1. Dữ liệu: IWSLT2015 En–Vi với `datasets`

In [None]:
# Download IWSLT2015 English-Vietnamese (a copy already converted to Parquet).
# NOTE: `load_dataset` is already imported in the setup cell; this re-import
# is redundant but harmless.
from datasets import load_dataset

raw_datasets = load_dataset("thainq107/iwslt2015-en-vi")
# Bare expression so the notebook displays the DatasetDict summary.
raw_datasets



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/522 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133317 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1268 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [None]:
# Chuẩn hoá thành cặp {src: en, tgt: vi}
def extract_en_vi(example):
    """Rename one dataset row from ``en``/``vi`` columns to ``src``/``tgt``.

    Args:
        example: Mapping with an ``"en"`` (English) and a ``"vi"``
            (Vietnamese) entry.

    Returns:
        A new dict with the English text under ``"src"`` and the Vietnamese
        text under ``"tgt"``.
    """
    english, vietnamese = example["en"], example["vi"]
    return dict(src=english, tgt=vietnamese)

# Convert every split to {"src": <English>, "tgt": <Vietnamese>} and drop the
# original columns. The column list is taken from the train split.
# (The cell previously contained this map + print sequence twice; the second
# pass recomputed the same result, so it is run once here.)
datasets_envi = raw_datasets.map(
    extract_en_vi,
    remove_columns=raw_datasets["train"].column_names,
)

print(datasets_envi)
print("Ví dụ train[0]:", datasets_envi["train"][0])

Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['src', 'tgt'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['src', 'tgt'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['src', 'tgt'],
        num_rows: 1268
    })
})
Ví dụ train[0]: {'src': 'Rachel Pike : The science behind a climate headline', 'tgt': 'Khoa học đằng sau một tiêu đề về khí hậu'}
DatasetDict({
    train: Dataset({
        features: ['src', 'tgt'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['src', 'tgt'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['src', 'tgt'],
        num_rows: 1268
    })
})
Ví dụ train[0]: {'src': 'Rachel Pike : The science behind a climate headline', 'tgt': 'Khoa học đằng sau một tiêu đề về khí hậu'}


## 2. Tokenizer BPE (English & Vietnamese) với `tokenizers`

In [None]:
# Special tokens handed to both BPE trainers. The printed output of the
# reload cell shows [PAD] ends up with id 0.
special_tokens = ["[PAD]", "[UNK]", "[BOS]", "[EOS]"]

# English (source) tokenizer: BPE model, whitespace pre-tokenisation,
# trained up to `src_vocab_size` entries.
tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer_en.pre_tokenizer = Whitespace()
trainer_en = BpeTrainer(vocab_size=src_vocab_size, special_tokens=special_tokens)

# Vietnamese (target) tokenizer: same recipe with its own vocabulary.
tokenizer_vi = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer_vi.pre_tokenizer = Whitespace()
trainer_vi = BpeTrainer(vocab_size=tgt_vocab_size, special_tokens=special_tokens)

def en_corpus():
    """Yield the English sentences used to fit the source BPE tokenizer.

    Only the training split is iterated. Fitting the vocabulary on the
    validation/test text as well would leak evaluation data into the
    tokenizer; unseen symbols at evaluation time fall back to ``[UNK]``.

    Yields:
        The ``"src"`` string of every example in ``datasets_envi["train"]``.
    """
    for ex in datasets_envi["train"]:
        yield ex["src"]

def vi_corpus():
    """Yield the Vietnamese sentences used to fit the target BPE tokenizer.

    Only the training split is iterated. Fitting the vocabulary on the
    validation/test text as well would leak evaluation data into the
    tokenizer; unseen symbols at evaluation time fall back to ``[UNK]``.

    Yields:
        The ``"tgt"`` string of every example in ``datasets_envi["train"]``.
    """
    for ex in datasets_envi["train"]:
        yield ex["tgt"]

# Fit each BPE tokenizer on its own language stream.
print("Training English tokenizer...")
tokenizer_en.train_from_iterator(en_corpus(), trainer_en)

print("Training Vietnamese tokenizer...")
tokenizer_vi.train_from_iterator(vi_corpus(), trainer_vi)

# Save to disk so the tokenizers can be reused without retraining.
tokenizer_en.save("tokenizer_en.json")
tokenizer_vi.save("tokenizer_vi.json")
print("Đã lưu tokenizer.")

Training English tokenizer...
Training Vietnamese tokenizer...
Đã lưu tokenizer.


In [None]:
# Reload from file (a good habit on Colab, where the runtime can be reset).
tokenizer_en = Tokenizer.from_file("tokenizer_en.json")
tokenizer_vi = Tokenizer.from_file("tokenizer_vi.json")

# Literal spellings of the special tokens; they must match `special_tokens`
# used when the tokenizers were trained.
PAD_TOKEN = "[PAD]"
UNK_TOKEN = "[UNK]"
BOS_TOKEN = "[BOS]"
EOS_TOKEN = "[EOS]"

# Integer ids of the special tokens, looked up per language. These are the
# globals read by collate_fn and translate_sentence.
src_pad_idx = tokenizer_en.token_to_id(PAD_TOKEN)
tgt_pad_idx = tokenizer_vi.token_to_id(PAD_TOKEN)
src_bos_idx = tokenizer_en.token_to_id(BOS_TOKEN)
src_eos_idx = tokenizer_en.token_to_id(EOS_TOKEN)
tgt_bos_idx = tokenizer_vi.token_to_id(BOS_TOKEN)
tgt_eos_idx = tokenizer_vi.token_to_id(EOS_TOKEN)

# Replace the requested vocabulary sizes with the sizes the trained
# tokenizers actually report; the model is built from these values.
src_vocab_size = tokenizer_en.get_vocab_size()
tgt_vocab_size = tokenizer_vi.get_vocab_size()

print("Src vocab:", src_vocab_size, "Tgt vocab:", tgt_vocab_size)
print("src_pad_idx =", src_pad_idx, "tgt_pad_idx =", tgt_pad_idx)

Src vocab: 20000 Tgt vocab: 20000
src_pad_idx = 0 tgt_pad_idx = 0


## 3. Hàm encode, padding và `DataLoader` (kèm Bucket Sampler)

In [None]:
def encode_text(tokenizer, text, bos_idx, eos_idx, max_len):
    """Tokenise ``text`` and wrap it in BOS/EOS, truncating to ``max_len`` ids.

    Args:
        tokenizer: Object whose ``encode(text)`` result exposes an ``ids`` list.
        text: Raw sentence to encode.
        bos_idx: Id placed at the front of the sequence.
        eos_idx: Id placed at the end of the sequence.
        max_len: Length budget for the result, BOS and EOS included.

    Returns:
        A list of token ids starting with ``bos_idx`` and ending with ``eos_idx``.
    """
    content_budget = max_len - 2  # two slots are reserved for BOS and EOS
    content = tokenizer.encode(text).ids[:content_budget]
    return [bos_idx, *content, eos_idx]

def pad_sequences(sequences, pad_idx):
    """Stack variable-length id lists into one right-padded ``long`` tensor.

    Args:
        sequences: Non-empty list of token-id lists.
        pad_idx: Id used to fill positions past the end of each sequence.

    Returns:
        Tensor of shape ``[len(sequences), longest_sequence]``.
    """
    longest = max(map(len, sequences))
    padded = torch.full((len(sequences), longest), pad_idx, dtype=torch.long)
    # Each `row` is a view into `padded`, so the slice assignment writes in place.
    for row, seq in zip(padded, sequences):
        row[: len(seq)] = torch.tensor(seq, dtype=torch.long)
    return padded

In [None]:
def collate_fn(batch):
    """Turn a list of ``{"src", "tgt"}`` text rows into padded training tensors.

    Source text is encoded with the English tokenizer and target text with
    the Vietnamese one, using the module-level special-token ids and length
    limits. The padded target is then split into the decoder input (last
    column dropped) and the prediction target (first column dropped).

    Args:
        batch: List of examples, each with ``"src"`` and ``"tgt"`` strings.

    Returns:
        Dict with ``"src"``, ``"tgt_input"`` and ``"tgt_output"`` tensors.
    """
    src_seqs = [
        encode_text(tokenizer_en, item["src"], src_bos_idx, src_eos_idx, max_src_len)
        for item in batch
    ]
    tgt_seqs = [
        encode_text(tokenizer_vi, item["tgt"], tgt_bos_idx, tgt_eos_idx, max_tgt_len)
        for item in batch
    ]

    src_batch = pad_sequences(src_seqs, src_pad_idx)
    tgt_batch = pad_sequences(tgt_seqs, tgt_pad_idx)

    # Teacher forcing: the decoder sees tokens [0, T-1) and predicts [1, T).
    return {
        "src": src_batch,
        "tgt_input": tgt_batch[:, :-1].contiguous(),
        "tgt_output": tgt_batch[:, 1:].contiguous(),
    }

In [None]:
# Bucket-style batch sampler để gom các câu có độ dài gần nhau
class BucketBatchSampler(Sampler):
    """Batch sampler that groups examples of similar length together.

    Indices are ordered by their entry in ``lengths`` and cut into consecutive
    chunks of ``batch_size``; with ``shuffle=True`` the chunk order (not the
    chunk contents) is shuffled on every iteration using the ``random`` module.

    Args:
        lengths: Per-example length, indexable by dataset index.
        batch_size: Maximum number of indices per batch.
        shuffle: Whether to shuffle the order of the batches.
    """

    def __init__(self, lengths, batch_size, shuffle=True):
        self.lengths = lengths
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self):
        # Stable sort: examples with equal length keep their dataset order.
        ordered = sorted(range(len(self.lengths)), key=self.lengths.__getitem__)
        size = self.batch_size
        batches = [ordered[start:start + size] for start in range(0, len(ordered), size)]

        if self.shuffle:
            random.shuffle(batches)

        yield from batches

    def __len__(self):
        # Ceiling division: a trailing partial batch still counts.
        whole, leftover = divmod(len(self.lengths), self.batch_size)
        return whole + (1 if leftover else 0)

In [None]:
def compute_lengths(dataset_split):
    """Return a rough length for every example, for bucketing.

    The length is the larger of the whitespace-separated word counts of the
    ``"src"`` and ``"tgt"`` strings (not the BPE token count).

    Args:
        dataset_split: Iterable of examples with ``"src"`` and ``"tgt"`` strings.

    Returns:
        List of ints, one per example, in iteration order.
    """
    return [
        max(len(ex["src"].split()), len(ex["tgt"].split()))
        for ex in dataset_split
    ]

# Word-count based lengths of the training examples, used only for bucketing.
train_lengths = compute_lengths(datasets_envi["train"])

# Training batches group sentences of similar length; batch order is shuffled.
train_batch_sampler = BucketBatchSampler(train_lengths, batch_size=batch_size, shuffle=True)

# `batch_sampler` yields whole index lists, so batch_size/shuffle are not
# passed to this DataLoader.
train_loader = DataLoader(
    datasets_envi["train"],
    batch_sampler=train_batch_sampler,
    collate_fn=collate_fn,
)

# Validation keeps dataset order and uses plain fixed-size batches.
valid_loader = DataLoader(
    datasets_envi["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

# One sentence per batch for the test split.
test_loader = DataLoader(
    datasets_envi["test"],
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn,
)

# Bare expression so the notebook displays the number of batches per loader.
len(train_loader), len(valid_loader), len(test_loader)

(2084, 20, 1268)

## 4. Tạo Mask (padding & causal)

In [None]:
def create_src_mask(src, pad_idx):
    """Build the encoder padding mask.

    Args:
        src: Source token ids, shape ``[B, S]``.
        pad_idx: Id of the padding token.

    Returns:
        Bool tensor of shape ``[B, 1, 1, S]``; ``True`` marks real (non-PAD)
        positions.
    """
    is_token = src.ne(pad_idx)
    # Insert the head axis and the query axis right after the batch axis.
    return is_token[:, None, None]

def create_tgt_mask(tgt, pad_idx):
    """Build the decoder self-attention mask (padding AND causal).

    Args:
        tgt: Target token ids, shape ``[B, T]``.
        pad_idx: Id of the padding token.

    Returns:
        Bool tensor of shape ``[B, 1, T, T]``. Entry ``[b, 0, i, j]`` is
        ``True`` when key ``j`` is not PAD and ``j <= i``.
    """
    _, tgt_len = tgt.shape

    # [B,1,1,T]: which key positions hold real tokens.
    key_is_token = tgt.ne(pad_idx)[:, None, None]

    # [1,1,T,T]: lower-triangular "no peeking ahead" pattern.
    causal = torch.ones((tgt_len, tgt_len), device=tgt.device).tril().bool()
    causal = causal[None, None]

    return key_is_token & causal

## 5. Kiến trúc Transformer From Scratch

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Width of the query/key/value inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        self.out_proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions over ``key``/``value`` positions.

        Args:
            query: Tensor of shape ``[B, L_q, D]``.
            key: Tensor of shape ``[B, L_k, D]``.
            value: Tensor of shape ``[B, L_k, D]``.
            mask: Optional tensor broadcastable to ``[B, H, L_q, L_k]``;
                truthy entries are positions that may be attended to.

        Returns:
            Tensor of shape ``[B, L_q, D]``.
        """
        batch_size = query.size(0)

        def transform(x, linear):
            x = linear(x)  # [B, L, D]
            x = x.view(batch_size, -1, self.num_heads, self.d_k)
            return x.transpose(1, 2)  # [B, H, L, d_k]

        Q = transform(query, self.q_linear)
        K = transform(key, self.k_linear)
        V = transform(value, self.v_linear)

        # Scaled dot-product attention.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)  # [B,H,L_q,L_k]

        if mask is not None:
            # Fill blocked positions with the most negative *finite* value.
            # Their softmax weight is still exactly 0 whenever a row has at
            # least one visible key, but a fully masked row now yields uniform
            # weights instead of the NaNs that -inf would produce.
            blocked = ~mask.bool()
            scores = scores.masked_fill(blocked, torch.finfo(scores.dtype).min)

        attn = F.softmax(scores, dim=-1)
        attn = self.dropout(attn)

        context = torch.matmul(attn, V)  # [B,H,L_q,d_k]
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)  # [B,L_q,D]
        return self.out_proj(context)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    Even feature columns hold sines and odd columns hold cosines of
    ``position * 10000^(-2i / d_model)``. The table is stored as a
    non-trainable buffer named ``pe`` with shape ``[1, max_len, d_model]``.

    Args:
        d_model: Embedding width (odd values are supported).
        max_len: Largest sequence length the table covers.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so trim div_term to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, d_model]
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``[B, L, D]`` with ``L <= max_len``.
        """
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len]

In [None]:
class EncoderLayer(nn.Module):
    """One post-norm Transformer encoder layer.

    Self-attention and a two-layer ReLU feed-forward network, each wrapped in
    dropout, a residual connection and a LayerNorm applied after the addition.

    Args:
        d_model: Hidden width.
        num_heads: Number of attention heads.
        d_ff: Inner width of the feed-forward network.
        dropout: Dropout probability used throughout the layer.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout_ff = nn.Dropout(dropout)

    def _feed_forward(self, x):
        """Position-wise feed-forward: Linear -> ReLU -> dropout -> Linear."""
        hidden = F.relu(self.linear1(x))
        return self.linear2(self.dropout_ff(hidden))

    def forward(self, src, src_mask):
        """Encode ``src`` (``[B, S, D]``) under the padding mask ``src_mask``."""
        attended = self.self_attn(src, src, src, src_mask)
        x = self.norm1(src + self.dropout1(attended))

        projected = self._feed_forward(x)
        return self.norm2(x + self.dropout2(projected))

In [None]:
class DecoderLayer(nn.Module):
    """One post-norm Transformer decoder layer.

    Three sub-layers in order: masked self-attention, cross-attention over the
    encoder output, and a two-layer ReLU feed-forward network. Each is followed
    by dropout, a residual addition and a LayerNorm.

    Args:
        d_model: Hidden width.
        num_heads: Number of attention heads.
        d_ff: Inner width of the feed-forward network.
        dropout: Dropout probability used throughout the layer.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)

        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.dropout_ff = nn.Dropout(dropout)

    def _feed_forward(self, x):
        """Position-wise feed-forward: Linear -> ReLU -> dropout -> Linear."""
        hidden = F.relu(self.linear1(x))
        return self.linear2(self.dropout_ff(hidden))

    def forward(self, tgt, memory, tgt_mask, src_mask):
        """Decode ``tgt`` (``[B, T, D]``) against encoder output ``memory``.

        ``tgt_mask`` restricts self-attention; ``src_mask`` restricts which
        encoder positions the cross-attention may read.
        """
        # 1) Masked self-attention over the target prefix.
        self_ctx = self.self_attn(tgt, tgt, tgt, tgt_mask)
        x = self.norm1(tgt + self.dropout1(self_ctx))

        # 2) Cross-attention: queries from the decoder, keys/values from the encoder.
        cross_ctx = self.cross_attn(x, memory, memory, src_mask)
        x = self.norm2(x + self.dropout2(cross_ctx))

        # 3) Feed-forward.
        projected = self._feed_forward(x)
        return self.norm3(x + self.dropout3(projected))

In [None]:
class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of EncoderLayers.

    Args:
        vocab_size: Number of source token ids.
        d_model: Embedding / hidden width.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per layer.
        d_ff: Inner width of each layer's feed-forward network.
        dropout: Dropout probability.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        stack = []
        for _ in range(num_layers):
            stack.append(EncoderLayer(d_model, num_heads, d_ff, dropout))
        self.layers = nn.ModuleList(stack)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Encode token ids ``src`` (``[B, S]``) into hidden states ``[B, S, D]``."""
        # Embeddings are scaled by sqrt(d_model) before positions are added.
        scale = math.sqrt(self.embedding.embedding_dim)
        hidden = self.dropout(self.pos_encoder(self.embedding(src) * scale))

        for block in self.layers:
            hidden = block(hidden, src_mask)

        return hidden  # [B,S,D]

In [None]:
class TransformerDecoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of DecoderLayers.

    Args:
        vocab_size: Number of target token ids.
        d_model: Embedding / hidden width.
        num_layers: Number of stacked decoder layers.
        num_heads: Attention heads per layer.
        d_ff: Inner width of each layer's feed-forward network.
        dropout: Dropout probability.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        stack = []
        for _ in range(num_layers):
            stack.append(DecoderLayer(d_model, num_heads, d_ff, dropout))
        self.layers = nn.ModuleList(stack)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask, src_mask):
        """Decode token ids ``tgt`` (``[B, T]``) against ``memory`` into ``[B, T, D]``."""
        # Embeddings are scaled by sqrt(d_model) before positions are added.
        scale = math.sqrt(self.embedding.embedding_dim)
        hidden = self.dropout(self.pos_encoder(self.embedding(tgt) * scale))

        for block in self.layers:
            hidden = block(hidden, memory, tgt_mask, src_mask)

        return hidden  # [B,T,D]

In [None]:
class TransformerNMT(nn.Module):
    """Encoder-decoder Transformer with a linear projection onto the target vocabulary.

    Args:
        src_vocab_size: Number of source token ids.
        tgt_vocab_size: Number of target token ids (size of the output logits).
        d_model: Embedding / hidden width shared by encoder and decoder.
        num_encoder_layers: Depth of the encoder stack.
        num_decoder_layers: Depth of the decoder stack.
        num_heads: Attention heads per layer.
        d_ff: Inner width of the feed-forward networks.
        dropout: Dropout probability.
    """

    def __init__(
        self,
        src_vocab_size,
        tgt_vocab_size,
        d_model,
        num_encoder_layers,
        num_decoder_layers,
        num_heads,
        d_ff,
        dropout,
    ):
        super().__init__()
        shared = (num_heads, d_ff, dropout)
        self.encoder = TransformerEncoder(src_vocab_size, d_model, num_encoder_layers, *shared)
        self.decoder = TransformerDecoder(tgt_vocab_size, d_model, num_decoder_layers, *shared)
        self.output_layer = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt_input, src_mask, tgt_mask):
        """Return next-token logits of shape ``[B, T, V]`` for a teacher-forced batch."""
        encoded = self.encoder(src, src_mask)  # [B,S,D]
        decoded = self.decoder(tgt_input, encoded, tgt_mask, src_mask)  # [B,T,D]
        return self.output_layer(decoded)  # [B,T,V]

# Build the model from the hyperparameters defined at the top of the notebook
# (vocabulary sizes are the ones reported by the trained tokenizers) and move
# it to the selected device.
model = TransformerNMT(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    d_model=d_model,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    num_heads=n_heads,
    d_ff=d_ff,
    dropout=dropout,
).to(device)

# Total parameter count, reported in millions.
print("Số tham số (triệu):", sum(p.numel() for p in model.parameters()) / 1e6)

Số tham số (triệu): 60.165664


## 6. Loss với Label Smoothing & Noam Scheduler

In [None]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    For every non-ignored token the target distribution puts
    ``1 - label_smoothing`` on the gold class and spreads ``label_smoothing``
    uniformly over the remaining classes, excluding the padding class, so the
    distribution sums to 1 and never rewards predicting PAD.

    Args:
        label_smoothing: Probability mass moved away from the gold class,
            in ``[0, 1]``.
        vocab_size: Number of classes ``V``.
        ignore_index: Target value marking padding; such positions contribute
            nothing to the loss.
    """

    def __init__(self, label_smoothing, vocab_size, ignore_index=0):
        super().__init__()
        assert 0.0 <= label_smoothing <= 1.0
        self.smoothing = label_smoothing
        self.confidence = 1.0 - label_smoothing
        self.vocab_size = vocab_size
        self.ignore_index = ignore_index

    def forward(self, pred, target):
        """Return the mean smoothed loss over non-ignored targets.

        Args:
            pred: Logits of shape ``[N, V]``.
            target: Gold class ids of shape ``[N]``.

        Returns:
            Scalar tensor. When every target is ignored the result is 0
            (rather than NaN from averaging an empty selection).
        """
        log_probs = pred.log_softmax(dim=-1)

        # The padding id is only a real output column when it lies inside the
        # vocabulary range (e.g. not for a sentinel such as -100).
        pad_is_class = 0 <= self.ignore_index < self.vocab_size
        # Classes sharing the smoothing mass: all but the gold class and,
        # when it is a real class, the padding class.
        n_smoothed = self.vocab_size - (2 if pad_is_class else 1)

        with torch.no_grad():
            ignore = target == self.ignore_index
            # Ignored rows need *some* valid column index for scatter_; the
            # whole row is zeroed right after, so the choice is irrelevant.
            safe_target = target.masked_fill(ignore, 0)

            true_dist = torch.full_like(log_probs, self.smoothing / max(n_smoothed, 1))
            if pad_is_class:
                true_dist[:, self.ignore_index] = 0.0
            true_dist.scatter_(1, safe_target.unsqueeze(1), self.confidence)
            true_dist[ignore] = 0.0

        per_token = torch.sum(-true_dist * log_probs, dim=-1)
        keep = ~ignore
        # clamp(min=1) keeps an all-padding batch from dividing by zero.
        return per_token[keep].sum() / keep.sum().clamp(min=1)

In [None]:
class NoamOpt:
    """Wrapper around an optimizer that applies the Noam learning-rate schedule.

    The rate at step ``s`` is
    ``factor * model_size**-0.5 * min(s**-0.5, s * warmup**-1.5)``:
    linear warm-up for ``warmup`` steps, then inverse-square-root decay.

    Args:
        model_size: Model width used to scale the rate (``d_model``).
        factor: Constant multiplier applied to the schedule.
        warmup: Number of warm-up steps; ``0`` disables warm-up.
        optimizer: Wrapped optimizer whose ``param_groups`` are updated.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        """Advance the step counter, set the new rate on every group, then step."""
        self._step += 1
        lr = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        self._rate = lr
        self.optimizer.step()

    def zero_grad(self):
        """Clear the gradients held by the wrapped optimizer."""
        self.optimizer.zero_grad()

    def rate(self, step=None):
        """Return the learning rate for ``step`` (default: the current step).

        Step 0 (i.e. before the first ``step()`` call) returns ``0.0`` — the
        value the formula approaches — instead of raising
        ``ZeroDivisionError`` from ``0 ** -0.5``.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0

        decay = step ** (-0.5)
        if self.warmup > 0:
            # Warm-up branch is only defined for a positive warm-up length.
            decay = min(decay, step * (self.warmup ** (-1.5)))
        return self.factor * (self.model_size ** (-0.5) * decay)

In [None]:
# Adam with the given betas/eps. No learning rate is passed here because
# NoamOpt overwrites the `lr` of every param group on each step.
base_optimizer = torch.optim.Adam(
    model.parameters(),
    betas=(0.9, 0.98),
    eps=1e-9,
)

# Noam schedule driven by the model width and the warm-up length.
optimizer = NoamOpt(d_model, learning_rate_factor, warmup_steps, base_optimizer)

# Label-smoothed loss over the target vocabulary; PAD targets are ignored.
criterion = LabelSmoothingLoss(
    label_smoothing=label_smoothing,
    vocab_size=tgt_vocab_size,
    ignore_index=tgt_pad_idx,
)

print("Khởi tạo optimizer & loss xong.")

Khởi tạo optimizer & loss xong.


## 7. Huấn luyện & Đánh giá (Loss, Perplexity)

In [None]:
def train_epoch(model, data_loader, optimizer, criterion, src_pad_idx, tgt_pad_idx):
    """Run one optimisation pass over ``data_loader``.

    Uses the module-level ``device`` for tensor placement and
    ``max_grad_norm`` for gradient clipping.

    Args:
        model: Network called as ``model(src, tgt_input, src_mask, tgt_mask)``.
        data_loader: Iterable of batches with ``"src"``, ``"tgt_input"`` and
            ``"tgt_output"`` tensors.
        optimizer: Object exposing ``zero_grad()`` and ``step()``.
        criterion: Loss taking flattened logits ``[N, V]`` and targets ``[N]``.
        src_pad_idx: Padding id of the source vocabulary.
        tgt_pad_idx: Padding id of the target vocabulary.

    Returns:
        Average loss per non-padding target token over the epoch.
    """
    model.train()
    loss_sum, token_count = 0.0, 0

    for batch in data_loader:
        src, tgt_input, tgt_output = (
            batch[name].to(device) for name in ("src", "tgt_input", "tgt_output")
        )
        src_mask = create_src_mask(src, src_pad_idx)
        tgt_mask = create_tgt_mask(tgt_input, tgt_pad_idx)

        optimizer.zero_grad()
        logits = model(src, tgt_input, src_mask, tgt_mask)  # [B,T,V]

        vocab = logits.size(-1)
        loss = criterion(logits.view(-1, vocab), tgt_output.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()

        # The criterion returns a per-token mean, so weight it by the number
        # of real tokens to get a token-level average across batches.
        real_tokens = (tgt_output != tgt_pad_idx).sum().item()
        loss_sum += loss.item() * real_tokens
        token_count += real_tokens

    return loss_sum / token_count

In [None]:
@torch.no_grad()
def evaluate_epoch(model, data_loader, criterion, src_pad_idx, tgt_pad_idx):
    """Compute the average per-token loss over ``data_loader`` without training.

    Gradients are disabled and the model is put in eval mode. Tensors are moved
    to the module-level ``device``.

    Args:
        model: Network called as ``model(src, tgt_input, src_mask, tgt_mask)``.
        data_loader: Iterable of batches with ``"src"``, ``"tgt_input"`` and
            ``"tgt_output"`` tensors.
        criterion: Loss taking flattened logits ``[N, V]`` and targets ``[N]``.
        src_pad_idx: Padding id of the source vocabulary.
        tgt_pad_idx: Padding id of the target vocabulary.

    Returns:
        Average loss per non-padding target token.
    """
    model.eval()
    loss_sum, token_count = 0.0, 0

    for batch in data_loader:
        src, tgt_input, tgt_output = (
            batch[name].to(device) for name in ("src", "tgt_input", "tgt_output")
        )
        src_mask = create_src_mask(src, src_pad_idx)
        tgt_mask = create_tgt_mask(tgt_input, tgt_pad_idx)

        logits = model(src, tgt_input, src_mask, tgt_mask)
        vocab = logits.size(-1)
        loss = criterion(logits.view(-1, vocab), tgt_output.view(-1))

        # Weight the batch mean by its number of real tokens.
        real_tokens = (tgt_output != tgt_pad_idx).sum().item()
        loss_sum += loss.item() * real_tokens
        token_count += real_tokens

    return loss_sum / token_count

In [None]:
# Lowest validation loss seen so far; used to decide when to checkpoint.
best_valid_loss = float("inf")

for epoch in range(1, num_epochs + 1):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, src_pad_idx, tgt_pad_idx)
    valid_loss = evaluate_epoch(model, valid_loader, criterion, src_pad_idx, tgt_pad_idx)

    # Keep the weights of the best epoch (by validation loss) in model_best.pth.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "model_best.pth")

    # Perplexity is exp(loss); note the loss is the label-smoothed one.
    # `optimizer._rate` is the learning rate NoamOpt applied on its last step.
    print(
        f"Epoch {epoch:02d} | "
        f"Train Loss: {train_loss:.4f} | Train PPL: {math.exp(train_loss):.2f} | "
        f"Valid Loss: {valid_loss:.4f} | Valid PPL: {math.exp(valid_loss):.2f} | "
        f"LR: {optimizer._rate:.6f}"
    )

# Also save the weights from the final epoch.
torch.save(model.state_dict(), "model.pth")
print("Đã lưu model vào model.pth")

Epoch 01 | Train Loss: 5.7875 | Train PPL: 326.21 | Valid Loss: 4.8770 | Valid PPL: 131.24 | LR: 0.000364
Epoch 02 | Train Loss: 4.6274 | Train PPL: 102.25 | Valid Loss: 4.4452 | Valid PPL: 85.21 | LR: 0.000685
Epoch 03 | Train Loss: 4.3155 | Train PPL: 74.85 | Valid Loss: 4.2270 | Valid PPL: 68.51 | LR: 0.000559
Epoch 04 | Train Loss: 4.0818 | Train PPL: 59.25 | Valid Loss: 4.1038 | Valid PPL: 60.57 | LR: 0.000484
Epoch 05 | Train Loss: 3.9289 | Train PPL: 50.85 | Valid Loss: 3.9967 | Valid PPL: 54.42 | LR: 0.000433
Epoch 06 | Train Loss: 3.8152 | Train PPL: 45.39 | Valid Loss: 3.9426 | Valid PPL: 51.55 | LR: 0.000395
Epoch 07 | Train Loss: 3.7236 | Train PPL: 41.41 | Valid Loss: 3.9031 | Valid PPL: 49.56 | LR: 0.000366
Epoch 08 | Train Loss: 3.6478 | Train PPL: 38.39 | Valid Loss: 3.8832 | Valid PPL: 48.58 | LR: 0.000342
Epoch 09 | Train Loss: 3.5811 | Train PPL: 35.91 | Valid Loss: 3.8774 | Valid PPL: 48.30 | LR: 0.000323
Epoch 10 | Train Loss: 3.5243 | Train PPL: 33.93 | Valid Loss

## 8. Suy diễn: Hàm dịch câu `translate_sentence` (Beam Search)

In [None]:
@torch.no_grad()
def translate_sentence(
    model,
    tokenizer_src,
    tokenizer_tgt,
    src_sentence,
    max_len=60,
    beam_size=5,
):
    """Translate one sentence with beam search.

    Relies on module-level globals: ``device``, ``max_src_len`` and the
    special-token ids ``src_bos_idx``, ``src_eos_idx``, ``src_pad_idx``,
    ``tgt_bos_idx``, ``tgt_eos_idx``, ``tgt_pad_idx``.

    Args:
        model: Network exposing ``encoder``, ``decoder`` and ``output_layer``.
        tokenizer_src: Tokenizer used to encode ``src_sentence``.
        tokenizer_tgt: Tokenizer used to decode the chosen hypothesis.
        src_sentence: Raw source text.
        max_len: Maximum number of decoding steps.
        beam_size: Number of hypotheses kept after every step.

    Returns:
        The decoded text of the highest-scoring hypothesis (summed token
        log-probability, no length normalisation), with BOS/EOS/PAD removed.
    """
    model.eval()

    # Encode the source once; the encoder output is reused at every step.
    src_ids = encode_text(tokenizer_src, src_sentence, src_bos_idx, src_eos_idx, max_src_len)
    src_tensor = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)  # [1,S]
    src_mask = create_src_mask(src_tensor, src_pad_idx)
    memory = model.encoder(src_tensor, src_mask)  # [1,S,D]

    # Each beam is (token id list, cumulative log-probability).
    beams = [([tgt_bos_idx], 0.0)]
    completed = []

    for _ in range(max_len):
        new_beams = []
        for tokens, log_prob in beams:
            # A hypothesis that ended with EOS on the previous step is final.
            if tokens[-1] == tgt_eos_idx:
                completed.append((tokens, log_prob))
                continue

            tgt_tensor = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)
            tgt_mask = create_tgt_mask(tgt_tensor, tgt_pad_idx)

            dec_out = model.decoder(tgt_tensor, memory, tgt_mask, src_mask)
            logits = model.output_layer(dec_out)  # [1,T,V]
            log_probs = F.log_softmax(logits[0, -1], dim=-1)

            # topk cannot return more entries than the vocabulary holds.
            k = min(beam_size, log_probs.size(-1))
            topk_log_probs, topk_indices = log_probs.topk(k)

            for step_log_prob, next_token in zip(topk_log_probs.tolist(), topk_indices.tolist()):
                new_beams.append((tokens + [next_token], log_prob + step_log_prob))

        if not new_beams:
            # Every beam had already finished and was moved to `completed`.
            break

        new_beams.sort(key=lambda x: x[1], reverse=True)
        beams = new_beams[:beam_size]

        if all(tokens[-1] == tgt_eos_idx for tokens, _ in beams):
            completed.extend(beams)
            break
    else:
        # Step budget exhausted: hypotheses that emitted EOS on the very last
        # step have not been collected yet. Without this they would be lost
        # whenever an earlier (possibly worse) hypothesis had finished.
        completed.extend(beam for beam in beams if beam[0][-1] == tgt_eos_idx)

    if not completed:
        # Nothing reached EOS: fall back to the unfinished beams.
        completed = beams

    best_tokens, _ = max(completed, key=lambda x: x[1])

    # Strip BOS/EOS/PAD before detokenising.
    special_ids = {tgt_bos_idx, tgt_eos_idx, tgt_pad_idx}
    best_tokens = [t for t in best_tokens if t not in special_ids]

    return tokenizer_tgt.decode(best_tokens)

In [None]:
# Quick smoke test: translate one hand-written English sentence.
src_example = "How are you?"
# tgt_example = example["tgt"]

print("SRC:", src_example)
# print("REF:", tgt_example)

print("\nPRED:")
# English tokenizer encodes the source, Vietnamese tokenizer decodes the output.
print(translate_sentence(model, tokenizer_en, tokenizer_vi, src_example))
#print(translate_sentence(model, tokenizer_vi, tokenizer_en, src_example))

SRC: How are you?

PRED:
Bạn như thế nào ?


## 9. Đánh giá BLEU trên tập Test

In [None]:
@torch.no_grad()
def compute_bleu(model, dataset_split, tokenizer_src, tokenizer_tgt, max_sentences=None):
    """Translate a dataset split sentence by sentence and score it with sacreBLEU.

    Each example is translated with ``translate_sentence`` (beam size 5, at
    most ``max_tgt_len`` decoding steps); progress is printed every 50
    sentences.

    Args:
        model: Translation model.
        dataset_split: Indexable collection of ``{"src", "tgt"}`` examples.
        tokenizer_src: Tokenizer for the source language.
        tokenizer_tgt: Tokenizer for the target language.
        max_sentences: If given, only the first ``max_sentences`` examples
            are evaluated.

    Returns:
        The object returned by ``sacrebleu.corpus_bleu`` for the hypotheses
        against a single reference stream.
    """
    model.eval()

    available = len(dataset_split)
    n = available if max_sentences is None else min(max_sentences, available)

    hyps, refs = [], []
    for i in range(n):
        ex = dataset_split[i]
        hyps.append(
            translate_sentence(
                model,
                tokenizer_src,
                tokenizer_tgt,
                src_sentence=ex["src"],
                max_len=max_tgt_len,
                beam_size=5,
            )
        )
        refs.append(ex["tgt"])

        if (i + 1) % 50 == 0:
            print(f"Đã dịch {i+1}/{n} câu")

    # corpus_bleu takes a list of reference streams; there is exactly one here.
    return sacrebleu.corpus_bleu(hyps, [refs])

In [None]:
# Load the best checkpoint if one exists; otherwise keep the in-memory weights.
try:
    model.load_state_dict(torch.load("model_best.pth", map_location=device))
    print("Đã load model_best.pth")
except FileNotFoundError:
    print("Không tìm thấy model_best.pth, dùng model hiện tại.")

model.to(device)

# BLEU on the first 500 test sentences only (beam search is run one
# sentence at a time).
bleu_result = compute_bleu(
    model,
    datasets_envi["test"],
    tokenizer_en,
    tokenizer_vi,
    max_sentences=500,
)
print("BLEU:", bleu_result.score)

Đã load model_best.pth
Đã dịch 50/500 câu
Đã dịch 100/500 câu
Đã dịch 150/500 câu
Đã dịch 200/500 câu
Đã dịch 250/500 câu
Đã dịch 300/500 câu
Đã dịch 350/500 câu
Đã dịch 400/500 câu
Đã dịch 450/500 câu




Đã dịch 500/500 câu
BLEU: 13.784108626674229


In [None]:
# Colab only: mount Google Drive at /content/drive so artifacts can be copied there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil
from google.colab import drive

# Mount Drive only if the mount point is not there yet.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Destination folder on Drive for this model's artifacts.
dest_dir = "/content/drive/MyDrive/BTL_NLP_MT/transformer"

if not os.path.exists(dest_dir):
    os.makedirs(dest_dir)
    print(f"Đã tạo thư mục đích: {dest_dir}")

# Local file name (under /content) -> file name to use on Drive.
files_to_save = {
    "tokenizer_en.json": "tokenizer_en.json",
    "tokenizer_vi.json": "tokenizer_vi.json",
    "model_best.pth": "transformer_best.pth",
    "model.pth": "transformer_last.pth"
}

print("Đang sao chép file sang Drive...")

for src_name, dest_name in files_to_save.items():
    src_path = f"/content/{src_name}"
    dest_path = os.path.join(dest_dir, dest_name)

    # Best effort: a missing artifact is reported and skipped, not fatal.
    if os.path.exists(src_path):
        shutil.copy(src_path, dest_path)
        print(f"Đã lưu: {src_name} -> {dest_path}")
    else:
        print(f"Không tìm thấy file gốc: {src_path} (Bỏ qua)")

print("\nHoàn tất! Kiểm tra lại Drive của bạn.")

Đã tạo thư mục đích: /content/drive/MyDrive/BTL_NLP_MT/transformer
Đang sao chép file sang Drive...
Đã lưu: tokenizer_en.json -> /content/drive/MyDrive/BTL_NLP_MT/transformer/tokenizer_en.json
Đã lưu: tokenizer_vi.json -> /content/drive/MyDrive/BTL_NLP_MT/transformer/tokenizer_vi.json
Đã lưu: model_best.pth -> /content/drive/MyDrive/BTL_NLP_MT/transformer/transformer_best.pth
Đã lưu: model.pth -> /content/drive/MyDrive/BTL_NLP_MT/transformer/transformer_last.pth

Hoàn tất! Kiểm tra lại Drive của bạn.


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

def create_src_mask(src, pad_idx):
    """Return a bool padding mask for ``src``; ``True`` marks non-PAD tokens.

    Two singleton axes are inserted after the batch axis, so a ``[B, S]``
    input yields a ``[B, 1, 1, S]`` mask.
    """
    not_pad = torch.ne(src, pad_idx)
    return not_pad.unsqueeze(1).unsqueeze(1)

def create_tgt_mask(tgt, pad_idx):
    """Return the decoder self-attention mask for ``tgt`` of shape ``[B, T]``.

    The result has shape ``[B, 1, T, T]`` and is ``True`` where the key
    position is not PAD and does not lie after the query position.
    """
    _, tgt_len = tgt.shape
    # Lower-triangular causal pattern, built on the same device as `tgt`.
    causal = torch.ones((tgt_len, tgt_len), device=tgt.device).tril().bool()
    key_is_token = tgt.ne(pad_idx)[:, None, None]
    return key_is_token & causal[None, None]

def encode_text(tokenizer, text, bos_idx, eos_idx, max_len=80):
    """Encode ``text`` to ids framed by BOS/EOS, at most ``max_len`` ids long.

    Args:
        tokenizer: Object whose ``encode(text)`` result exposes an ``ids`` list.
        text: Raw sentence to encode.
        bos_idx: Id placed first.
        eos_idx: Id placed last.
        max_len: Length budget including BOS and EOS.
    """
    room_for_content = max_len - 2  # BOS and EOS take two slots
    encoded = tokenizer.encode(text)
    return [bos_idx, *encoded.ids[:room_for_content], eos_idx]

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Width of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query: Tensor of shape ``[B, L_q, D]``.
            key: Tensor of shape ``[B, L_k, D]``.
            value: Tensor of shape ``[B, L_k, D]``.
            mask: Optional tensor broadcastable to ``[B, H, L_q, L_k]``;
                entries equal to 0/False are blocked.

        Returns:
            Tensor of shape ``[B, L_q, D]``.
        """
        batch_size = query.size(0)
        # Project, then split the feature axis into heads: [B, H, L, d_k].
        query = self.q_linear(query).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        key = self.k_linear(key).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        value = self.v_linear(value).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # Use the most negative *finite* value instead of -inf: blocked
            # positions still get weight 0 whenever a row has a visible key,
            # but a fully masked row gives uniform weights rather than NaN.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = self.dropout(F.softmax(scores, dim=-1))
        context = torch.matmul(attn, value)
        # Merge the heads back into the feature axis: [B, L_q, D].
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.out_proj(context)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold sines and odd columns hold cosines. The table
    is stored as a non-trainable buffer ``pe`` of shape
    ``[1, max_len, d_model]``.

    Args:
        d_model: Embedding width (odd values are supported).
        max_len: Largest sequence length the table covers.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # An odd d_model has one cosine column fewer than sine columns, so
        # trim div_term to fit; for even d_model the slice changes nothing.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` (``[B, L, D]``) plus the encodings of its first ``L`` positions."""
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then a two-layer feed-forward net.

    Each sub-layer output goes through dropout, is added back to its input
    and the sum is layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        # Sub-module names and creation order are kept so that saved
        # checkpoints and seeded initialisation keep matching.
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src, src_mask):
        """Run the block on ``src``; ``src_mask`` is forwarded to attention."""
        # Sub-layer 1: self-attention with residual connection.
        attended = self.self_attn(src, src, src, src_mask)
        src = self.norm1(src + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward with residual connection.
        hidden = self.dropout(F.relu(self.linear1(src)))
        projected = self.linear2(hidden)
        return self.norm2(src + self.dropout(projected))

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Each sub-layer output goes through dropout, is added back to its input
    and the sum is layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        # Sub-module names and creation order are kept so that saved
        # checkpoints and seeded initialisation keep matching.
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, tgt, memory, tgt_mask, src_mask):
        """Run the block on ``tgt`` while attending over ``memory``.

        ``tgt_mask`` is passed to the self-attention and ``src_mask`` to the
        attention over ``memory``.
        """
        # Sub-layer 1: attention of the target sequence over itself.
        self_ctx = self.self_attn(tgt, tgt, tgt, tgt_mask)
        tgt = self.norm1(tgt + self.dropout(self_ctx))

        # Sub-layer 2: target queries attend over the encoder memory.
        cross_ctx = self.cross_attn(tgt, memory, memory, src_mask)
        tgt = self.norm2(tgt + self.dropout(cross_ctx))

        # Sub-layer 3: position-wise feed-forward.
        hidden = self.dropout(F.relu(self.linear1(tgt)))
        projected = self.linear2(hidden)
        return self.norm3(tgt + self.dropout(projected))
class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of EncoderLayer."""

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout):
        super().__init__()
        # Stored on the instance because forward() scales embeddings by sqrt(d_model).
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(EncoderLayer(d_model, num_heads, d_ff, dropout))
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Encode the token ids ``src``; ``src_mask`` is handed to every layer."""
        scale = math.sqrt(self.d_model)
        hidden = self.embedding(src) * scale
        hidden = self.dropout(self.pos_encoder(hidden))
        for block in self.layers:
            hidden = block(hidden, src_mask)
        return hidden

class TransformerDecoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of DecoderLayer."""

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout):
        super().__init__()
        # Stored on the instance because forward() scales embeddings by sqrt(d_model).
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(DecoderLayer(d_model, num_heads, d_ff, dropout))
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask, src_mask):
        """Decode the token ids ``tgt`` against the encoder output ``memory``.

        ``tgt_mask`` and ``src_mask`` are handed unchanged to every layer.
        """
        scale = math.sqrt(self.d_model)
        hidden = self.embedding(tgt) * scale
        hidden = self.dropout(self.pos_encoder(hidden))
        for block in self.layers:
            hidden = block(hidden, memory, tgt_mask, src_mask)
        return hidden

class TransformerNMT(nn.Module):
    """Encoder-decoder translation model with a linear projection onto the target vocabulary."""

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_encoder_layers, num_decoder_layers, num_heads, d_ff, dropout):
        super().__init__()
        self.encoder = TransformerEncoder(
            src_vocab_size, d_model, num_encoder_layers, num_heads, d_ff, dropout
        )
        self.decoder = TransformerDecoder(
            tgt_vocab_size, d_model, num_decoder_layers, num_heads, d_ff, dropout
        )
        # Maps each decoder state to one score per target-vocabulary entry.
        self.output_layer = nn.Linear(in_features=d_model, out_features=tgt_vocab_size)

    def forward(self, src, tgt_input, src_mask, tgt_mask):
        """Return vocabulary scores for every position of ``tgt_input``.

        ``src`` is encoded once; the decoder then consumes ``tgt_input``
        together with that encoding. The output is not normalised.
        """
        encoded_src = self.encoder(src, src_mask)
        decoded = self.decoder(tgt_input, encoded_src, tgt_mask, src_mask)
        logits = self.output_layer(decoded)
        return logits

@torch.no_grad()
def translate_sentence(model, tokenizer_src, tokenizer_tgt, src_sentence, max_len=80, device=None):
    """Translate one sentence with greedy decoding.

    The source sentence is encoded once; target tokens are then produced one
    at a time by re-running the decoder on the growing prefix and taking the
    highest-scoring next token, until ``[EOS]`` is produced or ``max_len``
    tokens have been generated.

    Args:
        model: Model exposing ``encoder``, ``decoder`` and ``output_layer``.
            It is switched to eval mode and left in that mode.
        tokenizer_src: Source tokenizer providing ``token_to_id`` and
            ``encode``; must define ``[BOS]``, ``[EOS]`` and ``[PAD]``.
        tokenizer_tgt: Target tokenizer providing ``token_to_id`` and
            ``decode``; must define ``[BOS]``, ``[EOS]`` and ``[PAD]``.
        src_sentence: Raw source text.
        max_len: Cap on the encoded source length (BOS/EOS included) and on
            the number of generated target tokens.
        device: Device for the input tensors. ``None`` (the default) uses the
            device the model's parameters live on, so the call works on CPU
            and GPU alike.

    Returns:
        The decoded target string with BOS/EOS/PAD tokens removed.
    """
    model.eval()

    if device is None:
        # Inputs must sit on the same device as the weights; asking the model
        # avoids a hard-coded 'cuda' that fails on CPU-only machines.
        device = next(model.parameters()).device

    src_bos = tokenizer_src.token_to_id("[BOS]")
    src_eos = tokenizer_src.token_to_id("[EOS]")
    src_pad = tokenizer_src.token_to_id("[PAD]")

    tgt_bos = tokenizer_tgt.token_to_id("[BOS]")
    tgt_eos = tokenizer_tgt.token_to_id("[EOS]")
    tgt_pad = tokenizer_tgt.token_to_id("[PAD]")

    src_ids = encode_text(tokenizer_src, src_sentence, src_bos, src_eos, max_len)
    src_tensor = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)
    # create_src_mask / create_tgt_mask are helpers defined elsewhere in this file.
    src_mask = create_src_mask(src_tensor, src_pad)

    # The source is encoded once and reused at every decoding step.
    memory = model.encoder(src_tensor, src_mask)

    tgt_indices = [tgt_bos]
    for _ in range(max_len):
        tgt_tensor = torch.tensor(tgt_indices, dtype=torch.long, device=device).unsqueeze(0)
        tgt_mask = create_tgt_mask(tgt_tensor, tgt_pad)

        output = model.decoder(tgt_tensor, memory, tgt_mask, src_mask)
        output = model.output_layer(output)

        # Greedy choice: best-scoring token at the last position.
        next_token = output[0, -1].argmax().item()
        tgt_indices.append(next_token)

        if next_token == tgt_eos:
            break

    # Built once so the filter below does not rebuild it per token.
    special_ids = {tgt_bos, tgt_eos, tgt_pad}
    return tokenizer_tgt.decode([t for t in tgt_indices if t not in special_ids])

# Notify the notebook user that the 'd_model' fix is in place and that this
# cell has to be run before the model is loaded.
print("Đã sửa xong lỗi 'd_model'. Hãy chạy cell này trước khi load model!")

Đã sửa xong lỗi 'd_model'. Hãy chạy cell này trước khi load model!


In [None]:
import os
import torch
from tokenizers import Tokenizer

# Load the saved tokenizers and best checkpoint from Google Drive, then
# translate two sample sentences as a smoke test.
save_dir = "/content/drive/MyDrive/BTL_NLP_MT/transformer"
model_path = os.path.join(save_dir, "transformer_best.pth")

tokenizer_en = Tokenizer.from_file(os.path.join(save_dir, "tokenizer_en.json"))
tokenizer_vi = Tokenizer.from_file(os.path.join(save_dir, "tokenizer_vi.json"))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The architecture arguments must match the ones the checkpoint was trained
# with, otherwise load_state_dict cannot match the saved tensors.
model = TransformerNMT(
    src_vocab_size=tokenizer_en.get_vocab_size(),
    tgt_vocab_size=tokenizer_vi.get_vocab_size(),
    d_model=512,
    num_encoder_layers=4,
    num_decoder_layers=4,
    num_heads=8,
    d_ff=2048,
    dropout=0.1,
).to(device)

if os.path.exists(model_path):
    model.load_state_dict(torch.load(model_path, map_location=device))
    print("Đã load model thành công!")
else:
    # NOTE: execution continues with randomly initialised weights, so the
    # translations printed below are not meaningful in this case.
    print("Không tìm thấy file model trên Drive.")

model.eval()
cau_can_dich = "The science behind a climate headline"
ket_qua = translate_sentence(
    model,
    tokenizer_en,  # source tokenizer (English)
    tokenizer_vi,  # target tokenizer (Vietnamese)
    cau_can_dich,
    # Pass the resolved device explicitly: translate_sentence otherwise falls
    # back to its own default, which fails when the model is on CPU.
    device=device,
)

print(f"Câu gốc: {cau_can_dich}")
print(f"Dịch máy: {ket_qua}")

cau_can_dich2 = "I hate this subject"
ket_qua2 = translate_sentence(
    model,
    tokenizer_en,  # source tokenizer (English)
    tokenizer_vi,  # target tokenizer (Vietnamese)
    cau_can_dich2,
    device=device,
)

print(f"Câu gốc: {cau_can_dich2}")
print(f"Dịch máy: {ket_qua2}")

Đã load model thành công!
Câu gốc: The science behind a climate headline
Dịch máy: Khoa học đằng sau một tiêu đề về khí hậu
Câu gốc: I hate this subject
Dịch máy: Tôi ghét chủ đề này
