In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers datasets sacrebleu rouge-score evaluate sentencepiece

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━

In [None]:
# Giải nén trực tiếp vào thư mục BASE_DIR trên Drive
!unzip -q /content/drive/MyDrive/NLP_final.zip -d /content/drive/MyDrive/


In [None]:
import os, json, random, numpy as np, torch
from pathlib import Path
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    DataCollatorForLanguageModeling, TrainingArguments, Trainer, TrainerCallback,get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from torch import nn
from torch.utils.data import Dataset, DataLoader
from peft import LoraConfig, get_peft_model, TaskType
import pandas as pd
from torch.utils.data import DataLoader
import evaluate
from tqdm.auto import tqdm

BASE_PATH   = Path('/content/drive/MyDrive/NLP_final')
RAW_DIR   = BASE_PATH / 'data' / 'raw'
TOKENIZER_DIR = BASE_PATH / 'tokenizer_bpe' / 'model'
CHECKPOINT_DIR = os.path.join(BASE_PATH, 'ckpt_gpt2_pretrained')
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on", DEVICE)

Running on cuda


In [None]:
train_df = pd.read_csv(os.path.join(RAW_DIR, "train.csv"))
val_df = pd.read_csv(os.path.join(RAW_DIR, "val.csv"))
test_df = pd.read_csv(os.path.join(RAW_DIR, "test.csv"))
print(len(train_df), len(val_df), len(test_df))

105685 13210 13211


In [None]:
model_name = "NlpHUST/gpt2-vietnamese"
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", truncation_side="left")

special_tokens = {
    'bos_token': '<s>',
    'eos_token': '</s>',
    'pad_token': '<pad>',
    'unk_token': '<unk>',
    'sep_token': '<sep>',
}
added = tokenizer.add_special_tokens(special_tokens)
model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)
if added > 0:
    model.resize_token_embeddings(len(tokenizer))
print("Vocab size:", len(tokenizer))

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [None]:
class TranslationDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=96):
        self.src = df['en'].tolist()
        self.tgt = df['vi'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        if isinstance(idx, (list, np.ndarray)):
          idx = idx[0]
        en, vi = self.src[idx], self.tgt[idx]
        # prompt: "EN_SENTENCE <sep_token> VI_SENTENCE"
        text = f"{en}{self.tokenizer.sep_token}{vi}"
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        input_ids     = enc.input_ids[0]
        attention_mask= enc.attention_mask[0]

        # labels: mask phần EN+<sep> bằng -100
        labels = input_ids.clone()
        sep_id = self.tokenizer.sep_token_id
        sep_pos = (input_ids == sep_id).nonzero(as_tuple=True)
        if len(sep_pos[0])>0:
            cut = sep_pos[0][0].item() + 1
            labels[:cut] = -100
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

In [None]:
BATCH_SIZE = 8

train_ds = TranslationDataset(train_df, tokenizer)
val_ds   = TranslationDataset(val_df,   tokenizer)
test_ds  = TranslationDataset(test_df,  tokenizer)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False)

In [None]:
def train_model(model, train_loader, val_loader,
                epochs=5, lr=5e-5, eps_stop=1, accum_steps=2):
    optimizer = AdamW(model.parameters(), lr=lr)
    total_steps = len(train_loader)//accum_steps * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                    num_warmup_steps=int(0.1*total_steps),
                    num_training_steps=total_steps)
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    best_val = float('inf')
    no_imp   = 0

    for ep in range(1, epochs+1):
        # train
        model.train()
        train_loss = 0
        optimizer.zero_grad()
        for i, batch in enumerate(tqdm(train_loader, desc=f"Train {ep}/{epochs}")):
            inputs = batch["input_ids"].to(DEVICE)
            masks  = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss / accum_steps
            loss.backward()
            train_loss += loss.item()*accum_steps

            if (i+1)%accum_steps==0:
                optimizer.step(); scheduler.step(); optimizer.zero_grad()

        avg_train = train_loss/len(train_loader)
        # eval
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                inputs = batch["input_ids"].to(DEVICE)
                masks  = batch["attention_mask"].to(DEVICE)
                labels = batch["labels"].to(DEVICE)
                out = model(input_ids=inputs, attention_mask=masks, labels=labels)
                val_loss += out.loss.item()
        avg_val = val_loss/len(val_loader)

        print(f"[Epoch {ep}] train_loss={avg_train:.4f}  val_loss={avg_val:.4f}")

        # early-stop & save best
        ckpt = os.path.join(CHECKPOINT_DIR, f"best_model.pt")
        if avg_val < best_val:
            best_val = avg_val; no_imp=0
            torch.save(model.state_dict(), ckpt)
        else:
            no_imp+=1
            if no_imp>eps_stop:
                print("Early stopping."); break

    # load best
    model.load_state_dict(torch.load(ckpt))
    return model

In [None]:
bleu  = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

def greedy_eval(loader, model, tokenizer, max_new_tokens=96):
    model.eval()
    preds, refs = [], []

    for batch in tqdm(loader, desc="Eval"):
        input_ids     = batch["input_ids"].to(DEVICE)
        attention_mask= batch["attention_mask"].to(DEVICE)

        # sinh từng câu trong batch
        outputs = model.generate(
            input_ids          = input_ids,
            attention_mask     = attention_mask,
            max_new_tokens     = max_new_tokens,
            num_beams          = 1,                # greedy = beam size 1
            no_repeat_ngram_size = 2,              # không lặp 2-gram
            repetition_penalty   = 1.2,            # phạt lặp token
            length_penalty       = 1.0,            # độ ưu tiên độ dài
            pad_token_id         = tokenizer.pad_token_id,
            eos_token_id         = tokenizer.eos_token_id,
        )

        # decode preds
        for seq in outputs:
            preds.append(tokenizer.decode(seq, skip_special_tokens=True).strip())

        # chuẩn bị refs
        labels = batch["labels"].tolist()
        for lab in labels:
            lab = [tok if tok != -100 else tokenizer.pad_token_id for tok in lab]
            refs.append(tokenizer.decode(lab, skip_special_tokens=True).strip())

    print("BLEU   :", round(bleu.compute(predictions=preds, references=[[r] for r in refs])['score'],2))
    scores = rouge.compute(predictions=preds, references=refs)
    print("ROUGE-1:", round(scores['rouge1']*100,2))
    print("ROUGE-2:", round(scores['rouge2']*100,2))
    print("ROUGE-L:", round(scores['rougeL']*100,2))
    return preds, refs

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
def translate_batch(texts, model, tokenizer,
                    device=DEVICE,
                    num_beams=5,
                    max_new_tokens=40):
    model.eval()
    prompts = [t + tokenizer.sep_token for t in texts]
    enc = tokenizer(prompts,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=max_new_tokens//2
                   ).to(device)

    out = model.generate(
        input_ids          = enc.input_ids,
        attention_mask     = enc.attention_mask,
        max_new_tokens     = max_new_tokens,
        num_beams          = num_beams,
        no_repeat_ngram_size = 2,
        repetition_penalty   = 1.2,
        pad_token_id         = tokenizer.pad_token_id,
        eos_token_id         = tokenizer.eos_token_id,
    )

    results = []
    for inp, gen_seq in zip(enc.input_ids, out):
        L_in = inp.size(0)
        gen_ids = gen_seq[L_in:]
        results.append(
            tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
        )
    return results


In [None]:
model = train_model(model, train_loader, val_loader, epochs=3)

Train 1/3:   0%|          | 0/13211 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


[Epoch 1] train_loss=2.3541  val_loss=2.0822


Train 2/3:   0%|          | 0/13211 [00:00<?, ?it/s]

[Epoch 2] train_loss=1.9379  val_loss=1.9760


Train 3/3:   0%|          | 0/13211 [00:00<?, ?it/s]

[Epoch 3] train_loss=1.8101  val_loss=1.9553


In [None]:
print("\n== Eval on VAL ==")
val_preds, val_refs = greedy_eval(val_loader, model, tokenizer)


== Eval on VAL ==


Eval:   0%|          | 0/1652 [00:00<?, ?it/s]



BLEU   : 14.72
ROUGE-1: 35.92
ROUGE-2: 35.03
ROUGE-L: 35.92


In [None]:
print("\n== Eval on TEST ==")
test_preds, test_refs = greedy_eval(test_loader, model, tokenizer)

In [None]:
# Load mô hình đã fine-tune (checkpoint từ train_model)
checkpoint = os.path.join(CHECKPOINT_DIR, "best_model.pt")
model.load_state_dict(torch.load(checkpoint, map_location=DEVICE))
model.to(DEVICE)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50259, bias=False)
)

In [None]:
# 3) Inference mẫu
samples = [
    "I want to go to sleep",
    "Good morning! Did you sleep well?",
    "Have you tried Vietnamese egg coffee?",
]
outs_greedy = translate_batch(samples, model, tokenizer, num_beams=1)
outs_beam5  = translate_batch(samples, model, tokenizer, num_beams=5)
for en, g, b in zip(samples, outs_greedy, outs_beam5):
    print("EN:", en)
    print("Greedy:", g)
    print("Beam5 :", b)
    print("---")

EN: I want to go to sleep
Greedy: Tôi muốn đi ngủ . Tôi muốn được thức dậy . Và tôi muốn có một giấc ngủ ngon . " . Đó là điều mà tôi đã làm . . và đó cũng chính là mục đích của tôi
Beam5 : Tôi muốn đi ngủ . Tôi muốn thức dậy vào buổi sáng . Và tôi muốn ngủ trưa . Vậy nên , tôi sẽ thức giấc vào lúc sáng sớm . Đó là lúc mà cơ thể tôi khoẻ mạnh
---
EN: Good morning! Did you sleep well?
Greedy: Xin chào buổi sáng ! Bạn có ngủ ngon không ? . . giấc ngủ tốt không ạ ? ? Không ạ . ? Vâng , thưa quý vị . Vâng ạ , vâng ạ ! Vâng, thưa quí
Beam5 : Xin chào buổi sáng ! Bạn có ngủ ngon không ? Hãy dậy sớm nào ! Hãy thức dậy nào . Hãy tỉnh táo nào , hãy tỉnh dậy nhé . Tỉnh táo nhé ! Tỉnh đi ! tỉnh lại
---
EN: Have you tried Vietnamese egg coffee?
Greedy: Bạn thử tưởng tượng rằng bạn đã thử uống cà phê Việt Nam không ? Bạn có thể thử không ạ ? . . ? ? ! ? Không ạ , ? ... ? Vậy thì thử xem nào :
Beam5 : Các bạn thử uống cà phê Việt Nam đi nào ? Các bạn đã thử chưa ? Hãy thử xem nhé ! . " Cà phê " là m

In [None]:
model.save_pretrained(CHECKPOINT_DIR)
tokenizer.save_pretrained(CHECKPOINT_DIR)
print("Saved to", CHECKPOINT_DIR)

In [None]:
def generate_response(model, tokenizer, prompts, max_new_tokens=40):
    model.eval()
    enc = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
    outputs = model.generate(
        input_ids=enc.input_ids,
        attention_mask=enc.attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    responses = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return responses

def compute_reward(responses, references):
    scores = []
    for r, ref in zip(responses, references):
        bleu_score = bleu.compute(predictions=[r], references=[[ref]])["score"]
        scores.append(bleu_score / 100.0)  # scale to 0–1
    return torch.tensor(scores, dtype=torch.float32).to(DEVICE)

In [None]:
def ppo_update(model, old_logprobs, rewards, values, entropies, eps_clip=0.2, vf_coef=0.5, ent_coef=0.01):
    advantages = rewards - values.detach()
    returns = rewards

    ratio = torch.exp(old_logprobs - old_logprobs.detach())
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()

    value_loss = nn.MSELoss()(values, returns)
    entropy_loss = -entropies.mean()

    total_loss = policy_loss + vf_coef * value_loss + ent_coef * entropy_loss
    return total_loss

In [None]:
from tqdm.auto import tqdm

def ppo_finetune(model, tokenizer, train_df, epochs=2, batch_size=4, max_len=96):
    optimizer = AdamW(model.parameters(), lr=2e-5)
    ppo_data = list(zip(train_df["en"], train_df["vi"]))

    for epoch in range(epochs):
        random.shuffle(ppo_data)
        print(f"\n--- PPO Epoch {epoch+1}/{epochs} ---")

        progress_bar = tqdm(range(0, len(ppo_data), batch_size), desc=f"Training PPO", leave=False)

        for i in progress_bar:
            batch = ppo_data[i:i+batch_size]
            if len(batch) < batch_size:
                continue

            prompts, refs = zip(*batch)
            full_prompts = [p + tokenizer.sep_token for p in prompts]

            # Encode
            enc = tokenizer(full_prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_len).to(DEVICE)
            input_ids = enc.input_ids
            attn_mask = enc.attention_mask

            # Generate & logprob
            model.eval()
            outputs = model(input_ids=input_ids, attention_mask=attn_mask)
            logits = outputs.logits
            next_token_logits = logits[:, -1, :]
            probs = torch.softmax(next_token_logits, dim=-1)
            dist = torch.distributions.Categorical(probs)
            actions = dist.sample()
            log_probs = dist.log_prob(actions)
            values = torch.zeros_like(log_probs)  # assume no value head

            # Decode output
            gen_ids = torch.cat([input_ids, actions.unsqueeze(1)], dim=-1)
            generated = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
            rewards = compute_reward(generated, refs)

            model.train()
            loss = ppo_update(model, log_probs, rewards, values, dist.entropy())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        print(f"Epoch {epoch+1}/{epochs} finished.")

In [None]:
# # Load lại mô hình từ checkpoint
# model_path = os.path.join(CHECKPOINT_DIR, "best_model.pt")
# model.load_state_dict(torch.load(model_path))
# model.to(DEVICE)
# print("Loaded pretrained model from checkpoint.")

In [None]:
model_path = os.path.join(CHECKPOINT_DIR, "best_model.pt")
state_dict = torch.load(model_path, map_location="cpu")

# Load vào base GPT2 của bạn
model.base.load_state_dict(state_dict)
model.to(DEVICE)

print("Loaded pretrained GPT2 weights into GPTWithValueHead.")

Loaded pretrained GPT2 weights into GPTWithValueHead.


In [None]:
ppo_subset = train_df.sample(n=10000, random_state=42).reset_index(drop=True)
ppo_finetune(model, tokenizer, ppo_subset, epochs=1)


--- PPO Epoch 1/1 ---


Training PPO:   0%|          | 0/2500 [00:00<?, ?it/s]

Epoch 1/1 finished.


In [None]:
print("\n== Đánh giá lại trên tập VALIDATION sau PPO ==")
val_preds_after, val_refs_after = greedy_eval(val_loader, model, tokenizer)


== Đánh giá lại trên tập VALIDATION sau PPO ==


Eval:   0%|          | 0/1652 [00:00<?, ?it/s]



BLEU   : 16.41
ROUGE-1: 30.59
ROUGE-2: 29.8
ROUGE-L: 30.6


In [None]:
samples = [
    "I want to go to sleep",
    "Good morning! Did you sleep well?",
    "Have you tried Vietnamese egg coffee?"
]

outs_greedy = translate_batch(samples, model, tokenizer, num_beams=1)
outs_beam5  = translate_batch(samples, model, tokenizer, num_beams=5)

for en, g, b in zip(samples, outs_greedy, outs_beam5):
    print("EN     :", en)
    print("Greedy :", g)
    print("Beam-5 :", b)
    print("---")

EN     : I want to go to sleep
Greedy : I would like to take a little talk with the people who are going out of this world . I think that , when you were involved on our own personality , we couldn 't
Beam-5 : Rằng tôi muốn đi ngủ sớm hơn một chút để có thể ngủ ngon hơn . Tôi sẽ thức dậy vào lúc 5 giờ sáng để ngủ tiếp . Và tôi sẽ dậy lúc 7 giờ tối để đi
---
EN     : Good morning! Did you sleep well?
Greedy : Goodnight ! Hãy ngủ ngon nào ! Đừng thức nữa nhé ! Ngủ ngon lắm ! . Tốt nhất là đừng thức quá khuya nhé . . Đừng ngủ quá nhiều nhé , . Sẽ rất nguy hiểm đấy
Beam-5 : good morning ! Hãy ngủ ngon nhé ! Đừng thức quá khuya . Đừng ngủ quá nhiều . Hãy nghỉ ngơi đầy đủ . Ngủ đủ giấc . Ăn đủ chất dinh dưỡng . Uống đủ nước . Nghỉ ngơi
---
EN     : Have you tried Vietnamese egg coffee?
Greedy : H.T.S. : " You 're not yet to know , but that is a good for the people who are looking at this . " . The story of the day and night ,
Beam-5 : HỒ hởi với một ly cà phê Việt Nam ? Hãy thử tưởng tượng b