In [None]:
pip install -U datasets transformers sacrebleu sentencepiece evaluate

In [None]:
import os
from tqdm import tqdm
import datasets
from datasets import load_dataset
import evaluate
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

In [None]:
import torch

# Report the available accelerator and pick the device.
# `torch.cuda.get_device_name()` raises when no CUDA device is usable, which used to abort
# this cell before the CPU fallback below could run, so it is only queried when CUDA exists.
cuda_available = torch.cuda.is_available()

print("Number of GPU: ", torch.cuda.device_count())
if cuda_available:
    print("GPU Name: ", torch.cuda.get_device_name())
else:
    print("GPU Name: ", "N/A (CUDA not available)")


device = torch.device('cuda' if cuda_available else 'cpu')
print('Using device:', device)

In [None]:
# Colab-specific: mount the user's Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd /content/drive/MyDrive/IT4772E - NLP/marianmt/

In [None]:
# Hub id of the pretrained MarianMT checkpoint that is fine-tuned below
# (English source / Vietnamese target, matching the "en"/"vi" fields used in preprocessing).
model_name = "Helsinki-NLP/opus-mt-en-vi"
# Tokenizer and model are loaded from the same checkpoint id so their vocabularies match.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

In [None]:
import shutil

# Source files on the mounted Drive and their local copies on the Colab VM disk.
drive_train = "/content/drive/MyDrive/IT4772E - NLP/marianmt/train.json"
drive_dev = "/content/drive/MyDrive/IT4772E - NLP/marianmt/dev.json"
drive_test = "/content/drive/MyDrive/IT4772E - NLP/marianmt/test.json"
local_train = "/content/train.json"
local_dev = "/content/dev.json"
local_test = "/content/test.json"


def copy_if_present(src, dst):
    """Copy the file ``src`` to ``dst`` if ``src`` exists.

    Uses ``shutil.copy`` instead of shelling out to ``cp``: no shell quoting issues with
    unusual path characters, and a failed copy raises instead of being silently ignored.

    Args:
        src: Path of the file to copy.
        dst: Destination file path.

    Returns:
        True if the file was copied, False if ``src`` does not exist (a notice is printed;
        the missing source is not treated as an error, matching the previous best-effort
        behaviour).
    """
    if not os.path.isfile(src):
        print(f"Skipping copy, source file not found: {src}")
        return False
    shutil.copy(src, dst)
    return True


for _src, _dst in (
    (drive_train, local_train),
    (drive_dev, local_dev),
    (drive_test, local_test),
):
    copy_if_present(_src, _dst)

In [None]:
# Read the three local JSON files. Each file is loaded on its own, so the "json" builder
# exposes its content under the default "train" split regardless of which subset it is.
train_dataset, dev_dataset, test_dataset = (
    load_dataset("json", data_files=json_path, split="train")
    for json_path in (local_train, local_dev, local_test)
)

In [None]:
# Token-length check used to drop over-long sentence pairs.
def filter_by_length(example, max_length=64):
    """Return True when both sides of a translation pair fit within ``max_length`` tokens.

    The English side is measured with the source-side tokenization and the Vietnamese side
    with the target-side tokenization (``text_target``), i.e. the same way the pair is
    tokenized for training. Previously both sides were measured with the source side.

    Args:
        example: A record with ``example["translation"]["en"]`` and
            ``example["translation"]["vi"]`` strings.
        max_length: Maximum allowed token count per side, special tokens included.

    Returns:
        bool: True if neither side exceeds ``max_length`` tokens.
    """
    pair = example["translation"]
    en_tokens = len(tokenizer.encode(pair["en"], add_special_tokens=True))
    vi_tokens = len(tokenizer(text_target=pair["vi"], add_special_tokens=True)["input_ids"])
    return en_tokens <= max_length and vi_tokens <= max_length

# Filter every split with the same length rule, using 4 worker processes per split.
filtered_train, filtered_dev, filtered_test = (
    split.filter(filter_by_length, num_proc=4)
    for split in (train_dataset, dev_dataset, test_dataset)
)

In [None]:
import random

# Fixed seed so the sampled subsets, and therefore the reported metrics, are reproducible
# across kernel restarts. A private Random instance leaves the global random state untouched.
SAMPLE_SEED = 42
_sampler = random.Random(SAMPLE_SEED)


def _sample_subset(dataset, max_samples):
    """Return a random subset of ``dataset`` with at most ``max_samples`` rows.

    Rows are drawn without replacement; when the dataset has fewer than ``max_samples``
    rows, all of them are selected (in shuffled order).
    """
    total = len(dataset)
    chosen = _sampler.sample(range(total), min(max_samples, total))
    return dataset.select(chosen)


# Cap the size of each split: train from the filtered train file, valid from the filtered
# dev file, test from the filtered test file.
# Note: `train_dataset` and `test_dataset` are rebound here to the sampled subsets.
train_dataset = _sample_subset(filtered_train, 200000)
valid_dataset = _sample_subset(filtered_dev, 10000)
test_dataset = _sample_subset(filtered_test, 5000)

print(f"Số mẫu train: {len(train_dataset)}")
print(f"Số mẫu valid: {len(valid_dataset)}")
print(f"Số mẫu test: {len(test_dataset)}")

In [None]:
def preprocess(examples, max_length=64):
    """Tokenize a batch of translation pairs for seq2seq training.

    English sentences become the encoder inputs and Vietnamese sentences become ``labels``
    (tokenized through ``text_target``, which replaces the deprecated
    ``as_target_tokenizer`` context manager).

    No padding is applied here. Previously the labels were padded with the pad token id at
    this stage, so that padding was never turned into -100 and took part in the loss.
    Leaving sequences unpadded lets the data collator pad each batch dynamically and fill
    label padding with its ``label_pad_token_id``.

    Args:
        examples: Batched records; ``examples["translation"]`` is a list of dicts with
            ``"en"`` and ``"vi"`` strings.
        max_length: Truncation length (in tokens) for both source and target.

    Returns:
        The tokenizer output with ``input_ids``, ``attention_mask`` and ``labels`` as
        variable-length Python lists (no tensors, no padding).
    """
    pairs = examples["translation"]
    sources = [pair["en"] for pair in pairs]
    targets = [pair["vi"] for pair in pairs]
    return tokenizer(
        sources,
        text_target=targets,
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Tokenize each split with identical settings: batched mapping, 1000 examples per call.
tokenized_train, tokenized_valid, tokenized_test = (
    split.map(preprocess, batched=True, batch_size=1000)
    for split in (train_dataset, valid_dataset, test_dataset)
)

In [None]:
# Batch builder for the trainer: pads inputs and labels per batch.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
    # Label padding uses -100, the same sentinel compute_metrics swaps back to the pad id.
    label_pad_token_id=-100,
    # Pad to the longest sequence of the batch, rounded up to a multiple of 8.
    padding="longest",
    pad_to_multiple_of=8,
    max_length=64,
)

In [None]:
# sacreBLEU metric object; used by compute_metrics during training and for the final test score.
bleu = evaluate.load("sacrebleu")

In [None]:
def compute_metrics(eval_preds):
    """Compute corpus BLEU (sacreBLEU) for generated predictions.

    Both predictions and labels may contain -100 (the label padding value, also used to
    pad arrays of different widths when they are gathered); -100 is not a valid token id,
    so it is replaced with the tokenizer's pad id on BOTH arrays before decoding.
    Previously only the labels were cleaned.

    Args:
        eval_preds: An unpackable ``(predictions, label_ids)`` pair. ``predictions`` may
            be a tuple whose first element holds the generated token ids.

    Returns:
        dict: ``{"bleu": <sacreBLEU score>}``.
    """
    import numpy as np  # local import keeps this cell self-contained

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # sacreBLEU expects a list of references per prediction.
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [None]:
import os
# Create the TensorBoard log directory up front (same path as `logging_dir` in the
# training arguments); exist_ok=True makes this cell safe to re-run.
os.makedirs("./logs/tensorboard", exist_ok=True)

In [None]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./marian_finetuned_vi",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # 8 * 4 = 32 sequences per device per optimizer step
    learning_rate=5e-5,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    predict_with_generate=True,  # evaluate on generated sequences, as compute_metrics expects
    logging_dir="./logs/tensorboard",  # TensorBoard log directory
    logging_steps=50,
    # Mixed precision needs a CUDA device. Enabling it only when one is available keeps the
    # notebook runnable on the CPU fallback chosen earlier; on GPU this is still fp16=True.
    fp16=torch.cuda.is_available(),
    optim="adamw_torch",
    warmup_steps=500,
    max_grad_norm=1.0,
    dataloader_num_workers=4,
    report_to="tensorboard",  # enable TensorBoard reporting
    disable_tqdm=False,  # keep the progress bar
)

In [None]:
# Trainer
# `processing_class` is the current name of the argument formerly called `tokenizer`
# (the old keyword is deprecated in recent transformers releases, and this notebook
# installs the latest release in its first cell).
trainer = Seq2SeqTrainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # BLEU on the validation split at each evaluation
)

In [None]:
# Train: runs fine-tuning with the arguments above (evaluation and checkpointing are
# configured per epoch via eval_strategy / save_strategy).
trainer.train()

In [None]:
# Save the fine-tuned model to a separate directory (relative to the current working
# directory), distinct from the checkpoint directory given as output_dir.
trainer.save_model("./marian_finetuned_vi_final")

In [None]:
def postprocess(preds, labels):
    """Decode predicted and reference token ids into stripped text.

    Label ids are padded with -100 by the data collator (and predictions may be padded
    with -100 as well); -100 is not a valid token id, so it is replaced with the
    tokenizer's pad id before decoding, consistent with ``compute_metrics``.

    Args:
        preds: 2-D array-like of predicted token ids, or a tuple whose first element is
            that array.
        labels: 2-D array-like of reference token ids.

    Returns:
        tuple[list[str], list[str]]: decoded predictions and decoded references, with
        special tokens removed and surrounding whitespace stripped.
    """
    import numpy as np  # local import keeps this cell self-contained

    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    preds = [text.strip() for text in tokenizer.batch_decode(preds, skip_special_tokens=True)]
    labels = [text.strip() for text in tokenizer.batch_decode(labels, skip_special_tokens=True)]
    return preds, labels

# Run the trainer on the tokenized test split and decode its outputs to text.
results = trainer.predict(tokenized_test)
preds, labels = postprocess(results.predictions, results.label_ids)

# sacreBLEU takes a list of references per prediction; there is one reference here.
references = [[reference] for reference in labels]
bleu_score = bleu.compute(predictions=preds, references=references)
print(f"BLEU: {bleu_score['score']:.2f}")