In [1]:
# Mount Google Drive inside the Colab VM; every path configured later in this
# notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import zipfile
# zip_path = '/content/drive/MyDrive/NLP_final/NLP_final.zip'
# data_dir = '/content/drive/MyDrive/NLP_final/'
# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(data_dir)
# print("Đã giải nén:", data_dir)

Đã giải nén: /content/drive/MyDrive/NLP_final/


In [2]:
!pip install transformers datasets sacrebleu  rouge-score evaluate --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [3]:
import os
import math, time, glob, shutil, datetime
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm.auto import tqdm
from torch.optim import Adam
from transformers import PreTrainedTokenizerFast, get_linear_schedule_with_warmup
import evaluate

# ----- PATH CONFIG -----
# Everything (raw CSVs, trained BPE tokenizer, checkpoints) is resolved
# relative to BASE_PATH on the mounted Drive.
BASE_PATH = '/content/drive/MyDrive/NLP_final/NLP_final'
RAW_DIR   = os.path.join(BASE_PATH, 'data', 'raw')
BPE_MODEL = os.path.join(BASE_PATH, 'tokenizer_bpe', 'model', 'tokenizer.json')
CHECKPOINT_DIR = os.path.join(BASE_PATH, 'ckpt_fs')
# Create the checkpoint directory up front so torch.save never hits a missing path.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Use the GPU when one is visible, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

Device: cuda


In [4]:
# Read the three CSV splits (train / val / test) from RAW_DIR in one pass.
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(RAW_DIR, f"{split}.csv"))
    for split in ("train", "val", "test")
)
print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))
display(train_df.head())

Train: 105685 Val: 13210 Test: 13211


Unnamed: 0,en,vi,len_en_words,len_vi_words
0,Cyrus sets up a model of how you run a great m...,Cyrus xây dựng một mô hình chỉ cho ta cách điề...,18,27
1,Solar 's wonderful on rooftops .,Thái dương năng đầy uy lực nếu đặt trên mái cao,6,11
2,"If we worry too much about some things , we en...","Nếu chúng ta lo lắng quá nhiều về một số thứ ,...",18,24
3,And they all kind of laughed and they patted h...,Và tất cả họ đều cười và vỗ nhẹ lên lưng anh ta .,14,14
4,It was above the GDP for three years in a row ...,Nó hơn cả GDP 3 năm liên tiếp ngay trước sự sụ...,16,14


In [5]:
# Load the BPE tokenizer from its serialized tokenizer.json and declare which
# strings act as the BOS / EOS / PAD / UNK special tokens.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=BPE_MODEL,
                                    bos_token='<s>', eos_token='</s>',
                                    pad_token='<pad>', unk_token='<unk>')
# Fixed sequence length: sources and targets are truncated/padded to this size.
MAX_LEN = 128

In [6]:
def encode_pair(src, tgt):
    """Tokenise one (source, target) sentence pair for teacher-forced training.

    Args:
        src: Source sentence; encoded by the module-level ``tokenizer`` and
            truncated/padded to ``MAX_LEN`` tokens.
        tgt: Target sentence; encoded without special tokens, then wrapped as
            ``<s> ... </s>`` and padded to ``MAX_LEN`` tokens.

    Returns:
        dict with four lists:
            ``input_ids`` / ``attention_mask`` - straight from the tokenizer,
            ``tgt_in`` - decoder input, the target without its last position
            (``MAX_LEN - 1`` ids),
            ``labels`` - the target shifted left by one (``MAX_LEN - 1`` ids)
            with every pad id replaced by ``-100`` so the loss ignores it.
    """
    src_enc = tokenizer(src, max_length=MAX_LEN, truncation=True,
                        padding='max_length')

    # Truncate the sentence body *before* adding <s> and </s>. Cutting after
    # the wrap (as the previous version did) silently dropped </s> from long
    # targets, so the model never saw an end-of-sequence label for them.
    body_budget = max(MAX_LEN - 2, 0)
    body = tokenizer.encode(tgt, add_special_tokens=False)[:body_budget]
    tgt_ids = [tokenizer.bos_token_id] + body + [tokenizer.eos_token_id]
    tgt_ids = tgt_ids + [tokenizer.pad_token_id] * (MAX_LEN - len(tgt_ids))

    # shift-right: the decoder reads positions [0, n-1) and predicts [1, n)
    tgt_in = tgt_ids[:-1]
    labels = [tok if tok != tokenizer.pad_token_id else -100
              for tok in tgt_ids[1:]]

    return {"input_ids": src_enc["input_ids"],
            "attention_mask": src_enc["attention_mask"],
            "tgt_in": tgt_in,
            "labels": labels}

class TranslationDataset(Dataset):
    """In-memory dataset of pre-tokenised sentence pairs.

    Every row of ``df`` (columns ``en`` and ``vi``) is run through
    ``encode_pair`` once at construction time; items are returned as dicts of
    tensors built from the stored id lists.
    """

    def __init__(self, df):
        self.samples = [encode_pair(en, vi) for en, vi in zip(df["en"], df["vi"])]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        return {key: torch.tensor(values) for key, values in sample.items()}

# Tokenise each split once up front; the datasets keep the encoded pairs in memory.
train_set, val_set, test_set = (
    TranslationDataset(frame) for frame in (train_df, val_df, test_df)
)

# Only the training split is shuffled; val/test keep the CSV row order.
train_loader = DataLoader(train_set, shuffle=True, batch_size=8)
val_loader = DataLoader(val_set, batch_size=8)
test_loader = DataLoader(test_set, batch_size=8)

In [7]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    Args:
        d_model: Embedding width. Both even and odd widths are supported.
        max_len: Number of positions precomputed; inputs longer than this
            cannot be encoded.

    The table is stored as the buffer ``pe`` with shape ``(1, max_len, d_model)``.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        # For odd d_model there is one fewer odd column than even column, so
        # trim the frequency vector; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(pos * div[: d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))          # (1,L,d)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed over ``h`` parallel heads.

    ``d_model`` must be divisible by ``h``; each head works on
    ``d_model // h`` features. Inputs are projected by the ``q``/``k``/``v``
    linear layers and the concatenated heads by ``o``.
    """

    def __init__(self,d_model,h):
        super().__init__()
        assert d_model%h==0
        self.h = h
        self.d_k = d_model // h
        self.q = nn.Linear(d_model, d_model)
        self.k = nn.Linear(d_model, d_model)
        self.v = nn.Linear(d_model, d_model)
        self.o = nn.Linear(d_model, d_model)
        self.scale = math.sqrt(self.d_k)

    def _split_heads(self, x, batch):
        """Reshape (B, T, d) into (B, h, T, d_k)."""
        return x.view(batch, -1, self.h, self.d_k).transpose(1, 2)

    def forward(self,q,k,v,mask=None):
        """Attend from ``q`` over ``k``/``v``.

        ``mask`` may be 2-D (batch, key_len) or 3-D (batch, query_len or 1,
        key_len); positions where it equals 0 are excluded from the softmax.
        Returns a tensor of shape (batch, query_len, d_model).
        """
        batch, q_len, _ = q.size()
        _, _k_len, _ = k.size()          # keys must be rank-3, like the queries

        q_h = self._split_heads(self.q(q), batch)
        k_h = self._split_heads(self.k(k), batch)
        v_h = self._split_heads(self.v(v), batch)

        scores = torch.matmul(q_h, k_h.transpose(-2, -1)) / self.scale   # (B,h,Tq,Tk)
        if mask is not None:
            # Insert the head axis (and the query axis for 2-D masks) so the
            # mask broadcasts against the score tensor.
            mask = mask[:, None, None, :] if mask.dim() == 2 else mask[:, None, :, :]
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = scores.softmax(dim=-1)
        merged = torch.matmul(weights, v_h).transpose(1, 2).contiguous().view(batch, q_len, -1)
        return self.o(merged)

class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> ReLU -> Linear -> Dropout.

    Args:
        d_model: Input and output width.
        d_ff: Hidden width.
        drop: Dropout probability applied to the output.
    """

    def __init__(self,d_model,d_ff,drop=0.1):
        super().__init__()
        expand = nn.Linear(d_model, d_ff)
        project = nn.Linear(d_ff, d_model)
        # Keep the four modules at Sequential indices 0..3 so parameter names
        # (net.0.*, net.2.*) stay compatible with saved checkpoints.
        self.net = nn.Sequential(expand, nn.ReLU(), project, nn.Dropout(drop))

    def forward(self,x):
        return self.net(x)

class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    a residual connection and LayerNorm (post-norm).

    Args:
        d: Model width.
        n_h: Number of attention heads.
        f: Feed-forward hidden width.
        drop: Dropout probability.
    """

    def __init__(self,d,n_h,f,drop):
        super().__init__()
        self.sa = MultiHeadAttention(d, n_h)
        self.ff = FeedForward(d, f, drop)
        self.n1 = nn.LayerNorm(d)
        self.n2 = nn.LayerNorm(d)
        self.drop = nn.Dropout(drop)

    def forward(self,x,m):
        """Run the block on ``x`` using ``m`` as the self-attention mask."""
        attended = self.sa(x, x, x, m)
        x = self.n1(x + self.drop(attended))
        transformed = self.ff(x)
        return self.n2(x + transformed)

class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, cross-attention over the
    encoder output, then feed-forward; each sub-layer is followed by a
    residual connection and LayerNorm (post-norm).

    Args:
        d: Model width.
        n_h: Number of attention heads.
        f: Feed-forward hidden width.
        drop: Dropout probability.
    """

    def __init__(self,d,n_h,f,drop):
        super().__init__()
        self.sa = MultiHeadAttention(d, n_h)
        self.ca = MultiHeadAttention(d, n_h)
        self.ff = FeedForward(d, f, drop)
        self.n1 = nn.LayerNorm(d)
        self.n2 = nn.LayerNorm(d)
        self.n3 = nn.LayerNorm(d)
        self.drop = nn.Dropout(drop)

    def forward(self,x,enc,smask,tmask):
        """``tmask`` masks the self-attention, ``smask`` the attention over ``enc``."""
        self_attended = self.sa(x, x, x, tmask)
        x = self.n1(x + self.drop(self_attended))
        cross_attended = self.ca(x, enc, enc, smask)
        x = self.n2(x + self.drop(cross_attended))
        transformed = self.ff(x)
        return self.n3(x + transformed)

class Encoder(nn.Module):
    """Token embedding + positional encoding followed by ``N`` encoder layers
    and a final LayerNorm.

    Args:
        vocab: Vocabulary size of the embedding table.
        d: Model width.
        N: Number of stacked ``EncoderLayer`` blocks.
        h: Attention heads per layer.
        f: Feed-forward hidden width.
        drop: Dropout probability.
    """

    def __init__(self,vocab,d=512,N=4,h=8,f=2048,drop=0.1):
        super().__init__()
        self.emb = nn.Embedding(vocab, d)
        self.pos = PositionalEncoding(d)
        stack = []
        for _ in range(N):
            stack.append(EncoderLayer(d, h, f, drop))
        self.layers = nn.ModuleList(stack)
        self.norm = nn.LayerNorm(d)

    def forward(self,src,mask):
        """Encode token ids ``src``; ``mask`` is handed to every layer."""
        hidden = self.pos(self.emb(src))
        for layer in self.layers:
            hidden = layer(hidden, mask)
        return self.norm(hidden)

class Decoder(nn.Module):
    """Token embedding + positional encoding followed by ``N`` decoder layers
    and a final LayerNorm.

    Args:
        vocab: Vocabulary size of the embedding table.
        d: Model width.
        N: Number of stacked ``DecoderLayer`` blocks.
        h: Attention heads per layer.
        f: Feed-forward hidden width.
        drop: Dropout probability.
    """

    def __init__(self,vocab,d=512,N=4,h=8,f=2048,drop=0.1):
        super().__init__()
        self.emb = nn.Embedding(vocab, d)
        self.pos = PositionalEncoding(d)
        stack = []
        for _ in range(N):
            stack.append(DecoderLayer(d, h, f, drop))
        self.layers = nn.ModuleList(stack)
        self.norm = nn.LayerNorm(d)

    def forward(self,tgt,enc,smask,tmask):
        """Decode token ids ``tgt`` against encoder output ``enc``; both masks
        are handed unchanged to every layer."""
        hidden = self.pos(self.emb(tgt))
        for layer in self.layers:
            hidden = layer(hidden, enc, smask, tmask)
        return self.norm(hidden)

class TransformerScratch(nn.Module):
    """Encoder-decoder Transformer with a linear projection to vocabulary logits.

    The encoder and decoder are built with the same hyper-parameters and the
    same vocabulary size; ``out`` maps decoder states of width ``d`` to
    ``vocab`` logits.
    """

    def __init__(self,vocab,d=512,N=4,h=8,f=2048,drop=0.1):
        super().__init__()
        self.enc = Encoder(vocab, d, N, h, f, drop)
        self.dec = Decoder(vocab, d, N, h, f, drop)
        self.out = nn.Linear(d, vocab)

    def forward(self,src,tgt,smask,tmask):
        """Return logits for every target position.

        ``smask`` is used by the encoder and by the decoder's attention over
        the encoder output; ``tmask`` is used by the decoder self-attention.
        """
        memory = self.enc(src, smask)
        hidden = self.dec(tgt, memory, smask, tmask)
        return self.out(hidden)

# Size both embedding tables and the output projection from the tokenizer,
# then move the freshly initialised model to the selected device.
# NOTE(review): `vocab_size` may not count tokens added on top of the base
# vocabulary; if any special-token id is >= VOCAB the embedding lookup would
# fail - confirm, or switch to len(tokenizer).
VOCAB = tokenizer.vocab_size
model  = TransformerScratch(VOCAB).to(DEVICE)
print("Model built")

Model built


In [None]:
# Main training cell: teacher-forced training with gradient accumulation,
# per-epoch validation, checkpointing and early stopping.
num_epochs, accum, lr = 5, 4, 5e-4
VOCAB = tokenizer.vocab_size
# Keep the optimizer under its own name: rebinding `optim` would shadow the
# `torch.optim` module imported at the top of the notebook, so running this
# cell a second time would fail on `optim.Adam`.
optimizer = Adam(model.parameters(), lr=lr)
# One optimizer step per `accum` micro-batches, plus one for the trailing
# partial group when the number of batches is not a multiple of `accum`.
steps_per_epoch = math.ceil(len(train_loader) / accum)
total  = steps_per_epoch * num_epochs
sched  = get_linear_schedule_with_warmup(optimizer, int(.1*total), total)
criterion = nn.CrossEntropyLoss(ignore_index=-100, label_smoothing=0.1)

# Early stopping fires once `no_imp` exceeds `patience`, i.e. after
# patience + 1 consecutive epochs without a new best validation loss.
best_val, patience, no_imp = 1e9, 1, 0


def _target_mask(tgt_in):
    """Combine key-padding and causal masks into a (B, S, S) boolean mask."""
    S = tgt_in.size(1)
    pad_t = (tgt_in != tokenizer.pad_token_id).unsqueeze(1)          # (B,1,S)
    causal = torch.tril(torch.ones(S, S, device=tgt_in.device)).bool()  # (S,S)
    return pad_t & causal


for ep in range(1, num_epochs+1):
    # -------- TRAIN LOOP --------
    model.train(); tr_loss = 0
    optimizer.zero_grad()
    n_batches = len(train_loader)
    tail = n_batches % accum          # size of the final, partial accumulation group
    pbar = tqdm(train_loader, desc=f"Train {ep}/{num_epochs}")
    for i,b in enumerate(pbar,1):
        src, tgt_in, lab, sm = [b[k].to(DEVICE) for k in
            ("input_ids","tgt_in","labels","attention_mask")]
        tgt_m = _target_mask(tgt_in)

        # Average over the number of micro-batches that actually contribute to
        # this optimizer step (smaller for the trailing partial group).
        group = tail if (tail and i > n_batches - tail) else accum

        logits = model(src, tgt_in, sm, tgt_m)
        loss  = criterion(logits.view(-1, VOCAB), lab.view(-1))/group
        loss.backward(); tr_loss += loss.item()*group
        # Step on every full group and also on the last batch, so leftover
        # gradients are applied instead of leaking into the next epoch.
        if i%accum==0 or i==n_batches:
            optimizer.step(); sched.step(); optimizer.zero_grad()
        pbar.set_postfix(train_loss=f"{tr_loss/i:.4f}")

    # -------- VALIDATION LOOP --------
    model.eval(); val_loss = 0
    with torch.no_grad():
        for b in val_loader:
            src, tgt_in, lab, sm = [b[k].to(DEVICE) for k in
                ("input_ids","tgt_in","labels","attention_mask")]
            tgt_m = _target_mask(tgt_in)
            logits = model(src, tgt_in, sm, tgt_m)
            loss  = criterion(logits.view(-1, VOCAB), lab.view(-1))
            val_loss += loss.item()
    val_loss /= len(val_loader)
    print(f"Epoch {ep} | train_loss={tr_loss/len(train_loader):.4f} | val_loss={val_loss:.4f}")

    # -------- Save checkpoints: one per epoch plus the best so far --------
    torch.save(model.state_dict(), os.path.join(CHECKPOINT_DIR, f"epoch{ep:02d}.pt"))
    if val_loss < best_val:
        best_val = val_loss
        torch.save(model.state_dict(), os.path.join(CHECKPOINT_DIR, "best_model.pt"))
        no_imp = 0
    else:
        no_imp += 1
        if no_imp > patience:
            print("Early-stop"); break

Train 1/5:   0%|          | 0/13211 [00:00<?, ?it/s]

Epoch 1 | train_loss=5.1728 | val_loss=4.2207


Train 2/5:   0%|          | 0/13211 [00:00<?, ?it/s]

Epoch 2 | train_loss=3.8561 | val_loss=3.7121


Train 3/5:   0%|          | 0/13211 [00:00<?, ?it/s]

Epoch 3 | train_loss=3.3707 | val_loss=3.4894


Train 4/5:   0%|          | 0/13211 [00:00<?, ?it/s]

Epoch 4 | train_loss=3.0051 | val_loss=3.3957


Train 5/5:   0%|          | 0/13211 [00:00<?, ?it/s]

Epoch 5 | train_loss=2.6783 | val_loss=3.3837


In [None]:
# Metric objects shared by the evaluation helpers defined further down.
bleu  = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

def greedy_decode(model,src,sm,max_len=128):
    """Greedy autoregressive decoding of a single source sentence.

    Args:
        model: Object exposing ``enc``, ``dec`` and ``out`` callables.
        src: Source token ids for one sentence, shape (1, T).
        sm: Source mask passed to the encoder and to the decoder's attention
            over the encoder output.
        max_len: Maximum number of tokens to generate after ``<s>``.

    Returns:
        The decoded string with special tokens removed and whitespace stripped.
    """
    model.eval()
    bos_id = tokenizer.bos_token_id
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    # Inference only: without no_grad every step would record an autograd
    # graph, wasting memory and time during evaluation.
    with torch.no_grad():
        enc = model.enc(src,sm)
        ys  = torch.tensor([[bos_id]],device=src.device)
        for _ in range(max_len):
            S = ys.size(1)
            pad = (ys!=pad_id).unsqueeze(1)
            causal = torch.tril(torch.ones(S,S,device=src.device)).bool()
            tm = pad & causal
            hidden = model.dec(ys,enc,sm,tm)
            # Only the logits at the last position decide the next token.
            nxt = model.out(hidden)[:,-1,:].argmax(-1).item()
            ys  = torch.cat([ys, torch.tensor([[nxt]],device=src.device)],1)
            if nxt==eos_id: break
    return tokenizer.decode(ys[0,1:],skip_special_tokens=True).strip()

def eval_autoreg(loader, max_len=128):
    """Greedy-decode every sentence in ``loader`` and report BLEU / ROUGE.

    Uses the module-level ``model``, ``tokenizer``, ``bleu`` and ``rouge``.
    References are rebuilt from the ``labels`` tensors by mapping the ``-100``
    ignore markers back to the pad id before decoding.

    Returns:
        ``(preds, refs)`` - the hypothesis and reference strings.
    """
    model.eval()
    preds = []
    refs = []
    for batch in tqdm(loader, desc="Eval"):
        src = batch["input_ids"].to(DEVICE)
        src_mask = batch["attention_mask"].to(DEVICE).bool()
        labels = batch["labels"]
        for row in range(src.size(0)):
            one = slice(row, row + 1)      # keep the batch axis for the model
            preds.append(greedy_decode(model, src[one], src_mask[one], max_len))
            ref_ids = [tokenizer.pad_token_id if tok == -100 else tok
                       for tok in labels[row].tolist()]
            refs.append(tokenizer.decode(ref_ids, skip_special_tokens=True))

    wrapped_refs = [[ref] for ref in refs]   # sacrebleu expects a list of references per prediction
    bleu_score = bleu.compute(predictions=preds, references=wrapped_refs)['score']
    rouge_scores = rouge.compute(predictions=preds, references=refs)
    print("BLEU:", round(bleu_score, 2))
    for label, key in (("ROUGE-1", "rouge1"), ("ROUGE-2", "rouge2"), ("ROUGE-L", "rougeL")):
        print(f"{label}:", round(rouge_scores[key] * 100, 2))
    return preds, refs

# Evaluate on the validation split (swap in test_loader for the test split).
# Load the best checkpoint first; map_location lets a checkpoint saved from a
# GPU session load on whatever device this runtime actually has.
best_ckpt = os.path.join(CHECKPOINT_DIR, "best_model.pt")
model.load_state_dict(torch.load(best_ckpt, map_location=DEVICE))
preds, refs = eval_autoreg(val_loader,128)    # or test_loader,128

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Eval:   0%|          | 0/1652 [00:00<?, ?it/s]



BLEU: 24.96
ROUGE-1: 68.41
ROUGE-2: 44.14
ROUGE-L: 59.0


In [11]:
def translate_batch(texts, model, tokenizer, device=DEVICE, max_len=128):
    """Greedy-translate a list of sentences, one sentence at a time.

    Args:
        texts: Sentences to translate.
        model: Object exposing ``enc``, ``dec`` and ``out`` callables.
        tokenizer: Tokenizer used both to encode ``texts`` and to decode output ids.
        device: Device the tensors are created on.
        max_len: Maximum number of tokens generated per sentence after ``<s>``.

    Returns:
        List of decoded strings (special tokens removed, stripped), one per input.
    """
    model.eval()
    encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    src_ids = encoded['input_ids']
    src_mask = encoded['attention_mask'].bool()
    bos_id = tokenizer.bos_token_id
    eos_id = tokenizer.eos_token_id

    translations = []
    with torch.no_grad():
        for row in range(src_ids.size(0)):
            one_src = src_ids[row:row + 1]
            one_mask = src_mask[row:row + 1]
            memory = model.enc(one_src, one_mask)
            ys = torch.tensor([[bos_id]], device=device)
            for _ in range(max_len):
                steps = ys.size(1)
                # Lower-triangular mask: each position sees itself and earlier ones.
                tmask = torch.tril(torch.ones((steps, steps), dtype=torch.bool, device=device)).unsqueeze(0)
                logits = model.out(model.dec(ys, memory, one_mask, tmask))
                next_token = logits[:, -1, :].argmax(-1).item()
                ys = torch.cat([ys, torch.tensor([[next_token]], device=device)], dim=1)
                if next_token == eos_id:
                    break
            generated = ys[0, 1:].tolist()          # drop the leading <s>
            translations.append(tokenizer.decode(generated, skip_special_tokens=True).strip())
    return translations

# Hand-written English sentences used as a quick qualitative check of the
# model's greedy translations.
sample_en = [
    "I want to go to sleep",
    "I ate apple yesterday",
    "Good morning! Did you sleep well?",
    "Yesterday the stock market plunged almost five percent.",
    "She wonders whether artificial intelligence will ever surpass human creativity.",
    "Please, turn off the lights before you leave the laboratory.",
    "Although it rained heavily, the concert continued until midnight.",
    "The report, which was published in 2023, estimates that global CO₂ emissions hit 37.4 gigatons.",
    "Have you ever tried Vietnamese egg coffee?",
    "If we fail to act now, future generations will pay the price.",
    "NASA's James Webb telescope recently captured breathtaking images of distant galaxies.",
    "In my opinion, learning a new language is like opening an extra window on the world."
]

# Translate all samples and print each source sentence next to its output.
vi_translations = translate_batch(sample_en, model, tokenizer, device=DEVICE, max_len=128)
print("\n--- Sample Translations ---")
for en, vi in zip(sample_en, vi_translations):
    print(f"EN: {en}\nVI: {vi}\n")


--- Sample Translations ---
EN: I want to go to sleep
VI: Tôi muốn đi ngủ để ngủ để ngủ .

EN: I ate apple yesterday
VI: Tôi ăn táo hôm qua ngày hôm qua ăn táo vào ngày hôm qua tôi ăn táo .

EN: Good morning! Did you sleep well?
VI: Chào buổi sáng , bạn ngủ ngon lành bạn nghỉ hưu tốt cho bạn nghỉ hưu giấc ngủ ngon lành bạn có thể làm bạn vui vẻ

EN: Yesterday the stock market plunged almost five percent.
VI: Có thị trường chứng khoán giảm gần 5 %

EN: She wonders whether artificial intelligence will ever surpass human creativity.
VI: Cô ấy tự hào cho dù trí tuệ nhân tạo có thể vượt qua sự sáng tạo của con người .

EN: Please, turn off the lights before you leave the laboratory.
VI: Xin hãy tắt đèn chiếu sáng trước khi bạn rời phòng thí nghiệm

EN: Although it rained heavily, the concert continued until midnight.
VI: Mặc dù nó đã làm hoà nhạc cho đến tận nửa đêm .

EN: The report, which was published in 2023, estimates that global CO₂ emissions hit 37.4 gigatons.
VI: Báo cáo của báo cá

In [14]:
from torch.utils.data import DataLoader, Subset
import random

# Beam search is evaluated on a random subset of the validation set rather
# than on all of it.
n_samples = 100
# Draw from a privately seeded RNG so the same sentences are selected on every
# run; otherwise the reported beam-search metrics change from session to session.
SAMPLE_SEED = 42
# Clamp the sample size so a validation set smaller than n_samples does not raise.
indices = random.Random(SAMPLE_SEED).sample(range(len(val_set)), min(n_samples, len(val_set)))
sample_set = Subset(val_set, indices)
# batch_size=1: beam_decode handles one source sentence at a time.
sample_loader = DataLoader(sample_set, batch_size=1, shuffle=False)

In [16]:
def beam_decode(model, src, sm, tokenizer, beam_width=5, max_len=128, device=DEVICE):
    """Beam-search decoding of a single source sentence.

    Hypotheses are ranked by the plain sum of token log-probabilities (no
    length normalisation). A hypothesis that emits ``</s>`` is moved to the
    finished pool and no longer expanded.

    Args:
        model: Object exposing ``enc``, ``dec`` and ``out`` callables.
        src: Source token ids, shape (1, T) - exactly one sentence.
        sm: Source mask passed to the encoder and to the decoder's attention
            over the encoder output.
        tokenizer: Provides the BOS / EOS / PAD ids and ``decode``.
        beam_width: Number of live hypotheses kept per step (>= 1).
        max_len: Maximum number of expansion steps.
        device: Device on which hypothesis tensors are created.

    Returns:
        The best hypothesis decoded to a string (special tokens removed, stripped).

    Raises:
        ValueError: if ``src`` holds more than one sentence or ``beam_width < 1``.
    """
    # Every hypothesis is a batch of one; a larger source batch would be
    # reshaped incorrectly inside cross-attention and give wrong output
    # without any error, so reject it explicitly.
    if src.size(0) != 1:
        raise ValueError(f"beam_decode expects a single source sentence, got batch size {src.size(0)}")
    if beam_width < 1:
        raise ValueError("beam_width must be >= 1")

    model.eval()
    with torch.no_grad():
        enc = model.enc(src, sm)
        # Each beam entry: (sequence tensor of shape (1, S), cumulative log-prob)
        beams = [(torch.tensor([[tokenizer.bos_token_id]], device=device), 0.0)]
        completed = []

        for _ in range(max_len):
            new_beams = []
            for seq, score in beams:
                S = seq.size(1)
                pad = (seq != tokenizer.pad_token_id).unsqueeze(1)
                causal = torch.tril(torch.ones(S, S, device=device)).bool()
                tm = pad & causal
                logits = model.out(model.dec(seq, enc, sm, tm))
                log_probs = torch.log_softmax(logits[:, -1, :], dim=-1)
                # topk cannot return more entries than the vocabulary holds.
                k = min(beam_width, log_probs.size(-1))
                topk_log_probs, topk_ids = log_probs.topk(k)

                for next_token, next_logp in zip(topk_ids[0].tolist(), topk_log_probs[0].tolist()):
                    next_score = score + next_logp
                    new_seq = torch.cat([seq, torch.tensor([[next_token]], device=device)], dim=1)
                    if next_token == tokenizer.eos_token_id:
                        completed.append((new_seq, next_score))
                    else:
                        new_beams.append((new_seq, next_score))

            # Keep the beam_width highest-scoring live hypotheses
            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
            if len(beams) == 0:
                break
            # Log-probabilities are <= 0, so a live hypothesis can only lose
            # score as it grows. Once a finished hypothesis scores at least as
            # high as the best live one, nothing can overtake it: stopping here
            # returns the same result as running all max_len steps.
            if completed and max(s for _, s in completed) >= beams[0][1]:
                break

        # Prefer the best finished hypothesis; otherwise fall back to the best live one
        if completed:
            best_seq = sorted(completed, key=lambda x: x[1], reverse=True)[0][0]
        else:
            best_seq = beams[0][0]

        return tokenizer.decode(best_seq[0, 1:], skip_special_tokens=True).strip()

In [17]:
def eval_beam_autoreg(loader, beam_width=5, max_len=128):
    """Beam-search decode every sentence in ``loader`` and report BLEU / ROUGE.

    Uses the module-level ``model``, ``tokenizer``, ``bleu`` and ``rouge``.
    Works for any loader batch size: each sentence in a batch is decoded on
    its own and paired with its own reference.

    Args:
        loader: Iterable of batches with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        beam_width: Beam size forwarded to ``beam_decode``.
        max_len: Maximum decoding steps forwarded to ``beam_decode``.

    Returns:
        ``(preds, refs)`` - the hypothesis and reference strings.
    """
    model.eval(); preds, refs = [], []
    for b in tqdm(loader, desc=f"Eval Beam{beam_width}"):
        src, sm, lab = b["input_ids"].to(DEVICE), b["attention_mask"].to(DEVICE).bool(), b["labels"]
        # beam_decode handles one sentence at a time, so slice the batch row by
        # row. The previous version passed the whole batch and scored only
        # lab[0], which is wrong for any batch size above 1.
        for i in range(src.size(0)):
            pred = beam_decode(model, src[i:i+1], sm[i:i+1], tokenizer,
                               beam_width=beam_width, max_len=max_len, device=DEVICE)
            preds.append(pred)
            # -100 marks ignored label positions; map them back to pad so decode can skip them.
            ref_ids = [tok if tok != -100 else tokenizer.pad_token_id for tok in lab[i].tolist()]
            refs.append(tokenizer.decode(ref_ids, skip_special_tokens=True))
    bleu_score = bleu.compute(predictions=preds, references=[[r] for r in refs])['score']
    rouge_scores = rouge.compute(predictions=preds, references=refs)
    print(f"BLEU (beam{beam_width}):", round(bleu_score, 2))
    print("ROUGE-1:", round(rouge_scores['rouge1'] * 100, 2))
    print("ROUGE-2:", round(rouge_scores['rouge2'] * 100, 2))
    print("ROUGE-L:", round(rouge_scores['rougeL'] * 100, 2))
    return preds, refs

# Run the beam-search evaluation on the sampled validation subset
preds_beam, refs_beam = eval_beam_autoreg(sample_loader, beam_width=5, max_len=128)

Eval Beam5:   0%|          | 0/100 [00:00<?, ?it/s]

BLEU (beam5): 24.21
ROUGE-1: 67.66
ROUGE-2: 43.63
ROUGE-L: 58.37


In [None]:
# Look up the English source of every sampled row, in sampling order, so it
# lines up with the beam-search predictions and references.
en_samples = val_df['en'].iloc[indices].tolist()

for i, (en, hyp, ref) in enumerate(zip(en_samples, preds_beam, refs_beam)):
    print(
        f"[{i+1}]",
        f"EN : {en}",
        f"Pred: {hyp}",
        f"Ref : {ref}",
        "-" * 40,
        sep="\n",
    )

[1]
EN : We are still asking the question .
Pred: Chúng ta vẫn đang hỏi câu hỏi .
Ref : Còn chúng ta vẫn đang đặt ra những câu hỏi .
----------------------------------------
[2]
EN : In fact , one of the most important was a brain region that becomes active when you feel the rush of cocaine .
Pred: Trên thực tế , một trong những phần quan trọng nhất là một khu vực não hoạt động khi bạn cảm thấy chất lỏng đi .
Ref : Thực ra , một trong những điều quan trọng nhất là một khu vực não bộ khu vực ấy trở nên hoạt động tích cực mà bạn cảm thấy có sự xuất hiện đột ngột của cocaine
----------------------------------------
[3]
EN : He was a brilliant scientist at the frontiers of mathematics , even as a teenager .
Pred: Ông ấy là một nhà khoa học thông minh về toán học , thậm chí là một thiếu niên .
Ref : Ông là một nhà khoa học vĩ đại đạt tới đỉnh cao của toàn học ngay từ thời niên thiếu
----------------------------------------
[4]
EN : Stress hormones , glucocorticoids , released by the brain ,

In [None]:
def translate_batch(texts, model, tokenizer, device=DEVICE, max_len=128):
    """
    Translate a list of English sentences into Vietnamese with greedy decoding.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name and the same logic; this later definition is the one in effect
    once the cell has run.

    Args:
        texts: Sentences to translate.
        model: Object exposing ``enc``, ``dec`` and ``out`` callables.
        tokenizer: Tokenizer used to encode ``texts`` and decode output ids.
        device: Device the tensors are created on.
        max_len: Maximum number of tokens generated per sentence after ``<s>``.

    Returns:
        List of decoded strings (special tokens removed, stripped), one per input.
    """
    model.eval()
    # Tokenize the whole list at once
    encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    src_ids = encoded['input_ids']
    src_mask = encoded['attention_mask'].bool()

    def _decode_one(one_src, one_mask):
        """Greedy-decode a single (1, T) source and return the output string."""
        memory = model.enc(one_src, one_mask)
        ys = torch.tensor([[tokenizer.bos_token_id]], device=device)
        for _ in range(max_len):
            steps = ys.size(1)
            # Lower-triangular mask: each position sees itself and earlier ones.
            tmask = torch.tril(torch.ones((steps, steps), dtype=torch.bool, device=device)).unsqueeze(0)
            logits = model.out(model.dec(ys, memory, one_mask, tmask))
            next_token = logits[:, -1, :].argmax(-1).item()
            ys = torch.cat([ys, torch.tensor([[next_token]], device=device)], dim=1)
            if next_token == tokenizer.eos_token_id:
                break
        generated = ys[0, 1:].tolist()          # drop the leading <s>
        return tokenizer.decode(generated, skip_special_tokens=True).strip()

    translations = []
    with torch.no_grad():
        for row in range(src_ids.size(0)):
            translations.append(_decode_one(src_ids[row:row + 1], src_mask[row:row + 1]))
    return translations

# ----------- TEST MULTI-SENTENCE -----------
# Hand-written English sentences used as a quick qualitative check.
sample_en = [
    "I want to go to sleep",
    "Good morning! Did you sleep well?",
    "Yesterday the stock market plunged almost five percent.",
    "She wonders whether artificial intelligence will ever surpass human creativity.",
    "Please, turn off the lights before you leave the laboratory.",
    "Although it rained heavily, the concert continued until midnight.",
    "The report, which was published in 2023, estimates that global CO₂ emissions hit 37.4 gigatons.",
    "Have you ever tried Vietnamese egg coffee?",
    "If we fail to act now, future generations will pay the price.",
    "NASA's James Webb telescope recently captured breathtaking images of distant galaxies.",
    "In my opinion, learning a new language is like opening an extra window on the world."
]

# Use the notebook-wide DEVICE instead of a hard-coded "cuda" string, so the
# cell also runs on a CPU-only runtime and always matches where `model` lives.
vi_translations = translate_batch(sample_en, model, tokenizer,
                                  device=DEVICE, max_len=128)

print("\n--- Sample Translations ---")
for en, vi in zip(sample_en, vi_translations):
    print(f"EN: {en}")
    print(f"VI: {vi}\n")



--- Sample Translations ---
EN: I want to go to sleep
VI: Tôi muốn đi ngủ để ngủ .

EN: Good morning! Did you sleep well?
VI: Sáng sáng sớm , bạn có thể ngủ ngon lành mạnh .

EN: Yesterday the stock market plunged almost five percent.
VI: Có khoảng cách về thị trường chứng khoán hầu như hầu hết 5 %

EN: She wonders whether artificial intelligence will ever surpass human creativity.
VI: Cô ấy đã làm thay đổi trí thông minh nhân tạo có thể bao giờ sáng tạo của con người bằng sáng tạo

EN: Please, turn off the lights before you leave the laboratory.
VI: Hãy tắt những ánh sáng trước khi bạn rời phòng thí nghiệm .

EN: Although it rained heavily, the concert continued until midnight.
VI: Mặc dù nó đã được đánh giá cao hơn cả ngày hôm sau , cho đến khi người ta tiếp tục đến ngày hôm qua .

EN: The report, which was published in 2023, estimates that global CO₂ emissions hit 37.4 gigatons.
VI: Báo cáo của báo cáo đã được xuất bản trong 20.000 ước lượng khí thải toàn cầu mà các nhà cung cấp dị

In [None]:
# # --------- 1) Mount Google Drive --------- # sẽ yêu cầu bạn cho phép

# # --------- 2) Khai báo đường dẫn ----------
# SRC_CKPT      = '/content/ckpt'
# SRC_BACKUP    = '/content/backup_ckpt'

# DST_CKPT      = '/content/drive/MyDrive/NLP_final/NLP_final/ckpt'
# DST_BACKUP    = '/content/drive/MyDrive/NLP_final/NLP_final/backup_ckpt'

# import os, shutil

# # --------- 3) Tạo thư mục đích (nếu chưa có) ----------
# os.makedirs(DST_CKPT,   exist_ok=True)
# os.makedirs(DST_BACKUP, exist_ok=True)

# # --------- 4) Copy toàn bộ nội dung ----------
# def copy_tree(src, dst):
#     for root, dirs, files in os.walk(src):
#         rel   = os.path.relpath(root, src)
#         tgt_r = os.path.join(dst, rel) if rel != '.' else dst
#         os.makedirs(tgt_r, exist_ok=True)
#         for f in files:
#             shutil.copy2(os.path.join(root, f), os.path.join(tgt_r, f))

# copy_tree(SRC_CKPT,   DST_CKPT)
# copy_tree(SRC_BACKUP, DST_BACKUP)

# print("✅  Đã sao chép xong!  Check Drive của bạn nhé.")


✅  Đã sao chép xong!  Check Drive của bạn nhé.
