In [1]:
# Cell 0: Install Dependencies
!pip install rouge evaluate transformers sacrebleu

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:

In [2]:
import os
import torch
from transformers import (
    PreTrainedTokenizerFast, GPT2Config, GPT2LMHeadModel,
    get_linear_schedule_with_warmup
)
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils import clip_grad_norm_
from tqdm.auto import tqdm
import evaluate
import pandas as pd

# Thiết bị
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper‐params
INPUT_DIR      = '/kaggle/input/final-nlp/NLP_final/data/raw'
TOKENIZER_PATH = '/kaggle/input/final-nlp/NLP_final/tokenizer_bpe/model/tokenizer.json'
CHECKPOINT_DIR = '/kaggle/working/checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

BATCH_SIZE           = 4
ACCUMULATION_STEPS   = 2      # để effective batch size = 8
MAX_LENGTH           = 128
NUM_EPOCHS           = 5
LEARNING_RATE        = 5e-5
WEIGHT_DECAY         = 0.01
WARMUP_STEPS         = 500
GRAD_CLIP_NORM       = 1.0

# Load tokenizer
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=TOKENIZER_PATH,
    bos_token='<s>', eos_token='</s>',
    unk_token='<unk>', pad_token='<pad>'
)

# Sample sentences để inference nhanh
sample_en = [
    "I want to go to sleep",
    "Good morning! Did you sleep well?",
    "Yesterday the stock market plunged almost five percent.",
    "She wonders whether artificial intelligence will ever surpass human creativity.",
    "Please, turn off the lights before you leave the laboratory.",
    "Although it rained heavily, the concert continued until midnight.",
    "The report, which was published in 2023, estimates that global CO₂ emissions hit 37.4 gigatons.",
    "Have you ever tried Vietnamese egg coffee?",
    "If we fail to act now, future generations will pay the price.",
    "NASA's James Webb telescope recently captured breathtaking images of distant galaxies.",
    "In my opinion, learning a new language is like opening an extra window on the world."
]

2025-05-26 23:51:58.780032: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748303519.241030      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748303519.363324      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
class TranslationDataset(Dataset):
    def __init__(self, path_csv, tokenizer, max_length=MAX_LENGTH):
        self.df = pd.read_csv(path_csv)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src = self.df.iloc[idx]['en']
        tgt = self.df.iloc[idx]['vi']
        prompt = f"Translate English to Vietnamese: {src} {self.tokenizer.eos_token}"

        enc = self.tokenizer(
            prompt, return_tensors='pt',
            truncation=True, max_length=self.max_length//2
        )
        dec = self.tokenizer(
            tgt + self.tokenizer.eos_token, return_tensors='pt',
            truncation=True, max_length=self.max_length//2
        )

        input_ids      = torch.cat([enc.input_ids[0],      dec.input_ids[0]], dim=0)
        attention_mask = torch.cat([enc.attention_mask[0], dec.attention_mask[0]], dim=0)
        labels         = input_ids.clone()
        labels[:enc.input_ids.size(1)] = -100

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

def collate_fn(batch):
    input_ids      = pad_sequence([b['input_ids']      for b in batch], batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence([b['attention_mask'] for b in batch], batch_first=True, padding_value=0)
    labels         = pad_sequence([b['labels']         for b in batch], batch_first=True, padding_value=-100)
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

train_ds = TranslationDataset(os.path.join(INPUT_DIR,'train.csv'), tokenizer)
val_ds   = TranslationDataset(os.path.join(INPUT_DIR,'val.csv'),   tokenizer)
test_ds  = TranslationDataset(os.path.join(INPUT_DIR,'test.csv'),  tokenizer)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  collate_fn=collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [4]:
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=MAX_LENGTH,
    n_ctx=MAX_LENGTH,
    n_embd=768,
    n_layer=8,
    n_head=12,
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1
)
model = GPT2LMHeadModel(config).to(device)

In [5]:
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY
)
total_steps = len(train_loader) // ACCUMULATION_STEPS * NUM_EPOCHS
scheduler   = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps
)
best_val_loss = float('inf')
stale = 0

for epoch in range(1, NUM_EPOCHS+1):
    model.train()
    train_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch}")):
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device)
        )
        loss = outputs.loss / ACCUMULATION_STEPS
        loss.backward()
        train_loss += loss.item()

        if (step+1) % ACCUMULATION_STEPS == 0:
            clip_grad_norm_(model.parameters(), GRAD_CLIP_NORM)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    avg_train = train_loss * ACCUMULATION_STEPS / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            out = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device)
            )
            val_loss += out.loss.item()
    avg_val = val_loss / len(val_loader)

    print(f"[Epoch {epoch}] Train Loss: {avg_train:.4f} — Val Loss: {avg_val:.4f}")

    # Checkpoint & Early Stop
    if avg_val < best_val_loss:
        best_val_loss = avg_val
        torch.save(model.state_dict(), os.path.join(CHECKPOINT_DIR,'best_model.pt'))
        stale = 0
    else:
        stale += 1
        if stale >= 2:
            print("Early stopping triggered.")
            break

Epoch 1:   0%|          | 0/26422 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


[Epoch 1] Train Loss: 4.2738 — Val Loss: 3.4259


Epoch 2:   0%|          | 0/26422 [00:00<?, ?it/s]

[Epoch 2] Train Loss: 3.0764 — Val Loss: 2.7779


Epoch 3:   0%|          | 0/26422 [00:00<?, ?it/s]

[Epoch 3] Train Loss: 2.5261 — Val Loss: 2.5182


Epoch 4:   0%|          | 0/26422 [00:00<?, ?it/s]

[Epoch 4] Train Loss: 2.1919 — Val Loss: 2.3372


Epoch 5:   0%|          | 0/26422 [00:00<?, ?it/s]

[Epoch 5] Train Loss: 1.9656 — Val Loss: 2.2850


In [19]:
# Cell 7: Zip Model & Checkpoints
!zip -r /kaggle/working/model_checkpoint.zip /kaggle/working/checkpoints

  adding: kaggle/working/checkpoints/ (stored 0%)
  adding: kaggle/working/checkpoints/best_model.pt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 7%)


In [20]:
# Cell 7: Zip Model & Checkpoints (Python)
import shutil

shutil.make_archive('/kaggle/working/model_checkpoint', 'zip', '/kaggle/working/checkpoints')
print("Đã tạo /kaggle/working/model_checkpoint.zip")

Đã tạo /kaggle/working/model_checkpoint.zip


In [5]:
# Load best
model.load_state_dict(torch.load(os.path.join("/kaggle/input/model-cpkt",'best_model.pt')))
model.eval()
bleu = evaluate.load('sacrebleu')

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [6]:
def translate(text, num_beams=1, max_new_tokens=64):
    # 1) Tạo prompt
    prompt = f"Translate English to Vietnamese: {text} {tokenizer.eos_token}"
    # 2) Tokenize & truncate
    encoding = tokenizer(
        prompt,
        return_tensors='pt',
        truncation=True,
        max_length=MAX_LENGTH//2
    )
    input_ids = encoding.input_ids.to(device)

    # 3) Generate without attention_mask
    out = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        early_stopping=(num_beams>1),
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        length_penalty=1.0,
        use_cache=True
    )
    # 4) Lấy phần mới sinh
    return tokenizer.decode(out[0, input_ids.size(-1):], skip_special_tokens=True)


In [8]:
from tqdm.auto import tqdm

# Giả sử val_ds.df là DataFrame validation với cột 'en' và 'vi'
refs = [[v] for v in val_ds.df['vi']]

# Greedy translation với thanh tiến độ
preds_greedy = []
for src in tqdm(val_ds.df['en'], desc="Greedy Translation"):
    preds_greedy.append(translate(src, num_beams=1))

# Beam-5 translation với thanh tiến độ
# preds_beam5 = []
# for src in tqdm(val_ds.df['en'], desc="Beam-5 Translation"):
#     preds_beam5.append(translate(src, num_beams=5))

# Tính BLEU
print("Greedy BLEU:", bleu.compute(predictions=preds_greedy, references=refs)["score"])
# print("Beam-5 BLEU:", bleu.compute(predictions=preds_beam5,  references=refs)["score"])

Greedy Translation:   0%|          | 0/13210 [00:00<?, ?it/s]

Greedy BLEU: 23.212151785809066


In [None]:
refs = [[v] for v in val_ds.df['vi']]
preds_greedy = [translate(s, num_beams=1) for s in val_ds.df['en']]
#preds_beam5  = [translate(s, num_beams=5) for s in val_ds.df['en']]

print("Greedy BLEU:", bleu.compute(predictions=preds_greedy, references=refs)['score'])
# print("Beam-5 BLEU:", bleu.compute(predictions=preds_beam5,  references=refs)['score'])

In [None]:
preds_beam5 = []
for src in tqdm(val_ds.df['en'], desc="Beam-5 Translation"):
    preds_beam5.append(translate(src, num_beams=5))
print("Beam-5 BLEU:", bleu.compute(predictions=preds_beam5,  references=refs)["score"])

Beam-5 Translation:   0%|          | 0/13210 [00:00<?, ?it/s]

Beam-5 BLEU: 24.621177714770397


In [None]:
rouge_metric = evaluate.load('rouge')
flat_refs = [r[0] for r in refs]

rouge_scores = rouge_metric.compute(
    predictions=preds_greedy,
    references=flat_refs,
    use_stemmer=True         
)

print("Greedy ROUGE-1   :", round(rouge_scores['rouge1'] * 100, 2))
print("Greedy ROUGE-2   :", round(rouge_scores['rouge2'] * 100, 2))
print("Greedy ROUGE-L   :", round(rouge_scores['rougeL'] * 100, 2))

Greedy ROUGE-1   : 67.72
Greedy ROUGE-2   : 43.2
Greedy ROUGE-L   : 57.99


In [None]:
rouge_beam5 = rouge_metric.compute(
    predictions=preds_beam5,
    references=flat_refs,
    use_stemmer=True
)
print("Beam-5 ROUGE-1  :", round(rouge_beam5['rouge1'] * 100, 2))
print("Beam-5 ROUGE-2  :", round(rouge_beam5['rouge2'] * 100, 2))
print("Beam-5 ROUGE-L  :", round(rouge_beam5['rougeL'] * 100, 2))

Beam-5 ROUGE-1  : 68.46
Beam-5 ROUGE-2  : 44.43
Beam-5 ROUGE-L  : 59.03


In [11]:
print("\n--- Sample Translations ---\n")
for sent in sample_en:
    out_g = translate(sent, num_beams=1)
    out_b = translate(sent, num_beams=5)
    print(f">> {sent}")
    print(f"  - Greedy: {out_g}")
    print(f"  - Beam-5: {out_b}\n")



--- Sample Translations ---

>> I want to go to sleep
  - Greedy: Tôi muốn ngủ ngủ .
  - Beam-5: Tôi muốn đi ngủ .

>> Good morning! Did you sleep well?
  - Greedy: Xin chào buổi sáng bạn ngủ tốt bạn ngủ .
  - Beam-5: Xin chào các bạn cũng có thể ngủ ngon .

>> Yesterday the stock market plunged almost five percent.
  - Greedy: Những thị trường chứng khoán của thị trường tiểu bang gần 5 %
  - Beam-5: Ngày hôm qua thị trường chứng khoán có 5 %

>> She wonders whether artificial intelligence will ever surpass human creativity.
  - Greedy: Cô ấy sẽ cho rằng trí thông minh nhân tạo nhân tạo sẽ vượt qua sự sáng tạo con người .
  - Beam-5: Cô ấy cho rằng liệu trí thông minh nhân tạo nhân tạo có thể vượt qua sự sáng tạo con người .

>> Please, turn off the lights before you leave the laboratory.
  - Greedy: Xin hãy tắt đèn trước khi bạn rời phòng thí nghiệm
  - Beam-5: Hãy tắt đèn trước khi bạn rời phòng thí nghiệm

>> Although it rained heavily, the concert continued until midnight.
  - Gre

In [12]:
# Cell X: Save model & tokenizer for demo/report
SAVE_DIR = '/kaggle/working/gpt_fs_saved_model'
os.makedirs(SAVE_DIR, exist_ok=True)

# 1. Lưu weights & config
model.save_pretrained(SAVE_DIR)

# 2. Lưu tokenizer (vocab & merges)
tokenizer.save_pretrained(SAVE_DIR)

print(f"✅ Model và tokenizer đã được lưu tại: {SAVE_DIR}")

✅ Model và tokenizer đã được lưu tại: /kaggle/working/gpt_fs_saved_model


In [13]:
# Cell X+1: Zip the saved model
!zip -r /kaggle/working/gpt_fs_saved_model.zip /kaggle/working/gpt_fs_saved_model
print("✅ Đã tạo file: /kaggle/working/gpt_fs_saved_model.zip")

  adding: kaggle/working/gpt_fs_saved_model/ (stored 0%)
  adding: kaggle/working/gpt_fs_saved_model/model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 7%)
  adding: kaggle/working/gpt_fs_saved_model/special_tokens_map.json (deflated 45%)
  adding: kaggle/working/gpt_fs_saved_model/generation_config.json (deflated 24%)
  adding: kaggle/working/gpt_fs_saved_model/tokenizer_config.json (deflated 76%)
  adding: kaggle/working/gpt_fs_saved_model/config.json (deflated 51%)
  adding: kaggle/working/gpt_fs_saved_model/tokenizer.json (deflated 82%)
✅ Đã tạo file: /kaggle/working/gpt_fs_saved_model.zip


# Train tiếp từ 6 - 10 epoch

In [23]:
START_EPOCH = 6
END_EPOCH   = 10
PATIENCE    = 3
CHECKPOINT_DIR = '/kaggle/working/checkpoints'

In [20]:
model.load_state_dict(torch.load(os.path.join("/kaggle/input/model-cpkt",'best_model.pt')))

<All keys matched successfully>

In [21]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps = (len(train_loader)//ACCUMULATION_STEPS) * (END_EPOCH - START_EPOCH + 1)
scheduler   = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)

In [22]:
bleu_metric  = evaluate.load('sacrebleu')
rouge_metric = evaluate.load('rouge')

best_val_bleu = 0.0
stale = 0

In [26]:
import os, math, torch
from tqdm.auto import tqdm
import evaluate

In [30]:
best_ckpt = os.path.join(CHECKPOINT_DIR, 'best_model.pt')
torch.save(model.state_dict(), best_ckpt)

In [27]:
best_ckpt = os.path.join(CHECKPOINT_DIR, 'best_model.pt')
torch.save(model.state_dict(), best_ckpt)

In [None]:
START_EPOCH = 7
END_EPOCH   = 10

for epoch in range(START_EPOCH, END_EPOCH+1):
    model.train(); total_loss = 0.; optimizer.zero_grad()
    for step, batch in enumerate(tqdm(train_loader, desc=f"Train E{epoch}")):
        out = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device)
        )
        loss = out.loss / ACCUMULATION_STEPS
        loss.backward()
        total_loss += loss.item()
        if (step+1) % ACCUMULATION_STEPS == 0:
            clip_grad_norm_(model.parameters(), GRAD_CLIP_NORM)
            optimizer.step(); scheduler.step(); optimizer.zero_grad()
    avg_train = total_loss * ACCUMULATION_STEPS / len(train_loader)

    # (bạn có thể thêm validation/metrics ở đây nếu muốn)
    print(f"[Epoch {epoch}] Train Loss = {avg_train:.4f}")

    # Lưu checkpoint mỗi epoch
    ckpt_path = os.path.join(CHECKPOINT_DIR, f'epoch{epoch:02d}.pt')
    torch.save(model.state_dict(), ckpt_path)
    print(f"  → Saved checkpoint {ckpt_path}")

Train E7:   0%|          | 0/26422 [00:00<?, ?it/s]

[Epoch 7] Train Loss = 1.7582
  → Saved checkpoint /kaggle/working/checkpoints/epoch07.pt


Train E8:   0%|          | 0/26422 [00:00<?, ?it/s]

[Epoch 8] Train Loss = 1.5061
  → Saved checkpoint /kaggle/working/checkpoints/epoch08.pt


Train E9:   0%|          | 0/26422 [00:00<?, ?it/s]

In [34]:
# Chỉ cần model đang còn trong RAM:
last_ckpt_path = os.path.join(CHECKPOINT_DIR, "epoch09_draft.pt")
torch.save(model.state_dict(), last_ckpt_path)
print("✅ Saved model at:", last_ckpt_path)


✅ Saved model at: /kaggle/working/checkpoints/epoch09_draft.pt


In [39]:
def translate_cpu(text, num_beams=5, max_new_tokens=64):
    prompt = f"Translate English to Vietnamese: {text} {tokenizer.eos_token}"
    enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=MAX_LENGTH//2)
    input_ids = enc.input_ids
    out = model_cpu.generate(
        input_ids,
        num_beams=num_beams,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(out[0, input_ids.size(-1):], skip_special_tokens=True)

# Dịch thử
for s in sample_en:
    print(f">> {s}\n-> {translate_cpu(s)}\n")


>> I want to go to sleep
-> Tôi muốn đi ngủ .

>> Good morning! Did you sleep well?
-> Chào buổi sáng bạn ngủ ngon .

>> Yesterday the stock market plunged almost five percent.
-> Ngày hôm qua thị trường chứng khoán đã giảm gần 5 %

>> She wonders whether artificial intelligence will ever surpass human creativity.
-> Bà cho rằng trí thông minh nhân tạo sẽ vượt qua sự sáng tạo của con người .

>> Please, turn off the lights before you leave the laboratory.
-> Hãy tắt đèn trước khi bạn rời phòng thí nghiệm

>> Although it rained heavily, the concert continued until midnight.
-> Mặc dù trong suốt buổi hoà nhạc vẫn còn nhiều hơn nữa cho đến nửa đêm .

>> The report, which was published in 2023, estimates that global CO₂ emissions hit 37.4 gigatons.
-> Báo cáo này đã được xuất bản năm 2023 ước tính rằng khí thải khí thải từ khí thải toàn cầu đã đạt tới 37.4,4,000 tấn .

>> Have you ever tried Vietnamese egg coffee?
-> Bạn đã bao giờ thử lấy cà phê trứng

>> If we fail to act now, future gen