In [None]:
pip install datasets transformers sacrebleu sentencepiece evaluate

In [None]:
pip install -U datasets

In [None]:
import json
import os
import re
import shutil
import unicodedata

import datasets
import evaluate
import numpy as np
from tqdm import tqdm
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

In [None]:
from google.colab import drive
drive.mount('/content/drive')
cd /content/drive/MyDrive/IT4772E - NLP/marianmt/

In [None]:
import torch

# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Number of GPU: ", torch.cuda.device_count())
if device.type == 'cuda':
    # get_device_name() raises when no CUDA device is usable, so only query it on a GPU runtime;
    # otherwise this cell would crash before the CPU fallback is ever reported.
    print("GPU Name: ", torch.cuda.get_device_name())

print('Using device:', device)

In [None]:
# Checkpoint identifier of the pretrained English -> Vietnamese MarianMT model to fine-tune.
model_name = "Helsinki-NLP/opus-mt-en-vi"

# Both objects are loaded from the same checkpoint so the vocabulary matches the weights.
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

In [None]:
drive_file_path = "/content/drive/MyDrive/IT4772E - NLP/marianmt/phomt_cleaned.json"
local_file_path = "/content/phomt_cleaned.json"

# Copy the corpus from the mounted Drive to local disk before loading it.
# shutil.copy raises immediately if the source file is missing; a failed `!cp` does not
# raise, which would let the notebook continue and fail later in load_dataset with a
# less helpful error.
shutil.copy(drive_file_path, local_file_path)

In [None]:
from datasets import load_dataset

# Load the local JSON copy of the corpus as a single split.
raw_dataset = load_dataset("json", data_files=local_file_path, split="train")

# Hold out 2% of the pairs for evaluation. The seed is fixed so that the same examples
# end up in the held-out split after every kernel restart; without it, a re-run could
# evaluate a saved checkpoint on sentences it was trained on.
dataset = raw_dataset.train_test_split(test_size=0.02, seed=42)  # 98% train, 2% eval

print(dataset["train"][0])
# Expected record layout:
# {'translation': {'en': 'hello world', 'vi': 'xin chào thế giới'}}

In [None]:
def preprocess(examples, max_length=64):
    """Tokenize a batch of English -> Vietnamese sentence pairs for seq2seq training.

    Args:
        examples: A batch whose ``"translation"`` entry is a list of dicts, each holding
            the source sentence under ``"en"`` and the target sentence under ``"vi"``.
        max_length: Maximum number of tokens kept per sentence; longer sentences on
            either side are truncated. Defaults to 64.

    Returns:
        The tokenizer output for the source sentences, with the tokenized target
        sentences stored under ``"labels"``.

    Notes:
        Sequences are deliberately left unpadded and are not converted to tensors.
        Padding here would write the pad token id into ``labels``; those positions
        would not be replaced by -100 afterwards and would therefore be counted in
        the training loss. Padding is left to the data collator, which pads labels
        with its ``label_pad_token_id``.
    """
    sources = [pair["en"] for pair in examples["translation"]]
    targets = [pair["vi"] for pair in examples["translation"]]
    # `text_target` tokenizes the targets as labels in the same call; it replaces the
    # deprecated `with tokenizer.as_target_tokenizer():` context manager.
    return tokenizer(
        sources,
        text_target=targets,
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Apply `preprocess` to every split, handing it 1000 examples per call.
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    batch_size=1000,
)

In [None]:
# Take up to 500,000 samples from the training split and up to 1,000 from the held-out split.
# The caps are clamped to the actual split sizes so that a smaller corpus does not make
# `select` fail with an out-of-range index.
MAX_TRAIN_SAMPLES = 500000
MAX_EVAL_SAMPLES = 1000

train_dataset = tokenized_dataset["train"].select(
    range(min(MAX_TRAIN_SAMPLES, len(tokenized_dataset["train"])))
)
test_dataset = tokenized_dataset["test"].select(
    range(min(MAX_EVAL_SAMPLES, len(tokenized_dataset["test"])))
)

In [None]:
# Collator that assembles training/evaluation batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
    # Label padding uses -100, the same sentinel compute_metrics maps back to the
    # pad token before decoding.
    label_pad_token_id=-100,
    # Pad per batch, rounding the padded length up to a multiple of 8.
    padding="longest",
    pad_to_multiple_of=8,
    # NOTE(review): presumably unused while padding="longest" — confirm against the collator docs.
    max_length=128,
)

In [None]:
# Load the "sacrebleu" metric via the evaluate library; it is used by compute_metrics
# during training and again for the final BLEU score at the end of the notebook.
bleu = evaluate.load("sacrebleu")

In [None]:
def compute_metrics(eval_preds):
    """Compute corpus BLEU for one evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. ``predictions``
            may also be a tuple, in which case its first element is used as the
            generated token ids.

    Returns:
        ``{"bleu": score}`` where ``score`` is the ``"score"`` field reported by the
        sacrebleu metric.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        # Some models return extra outputs next to the generated ids; keep only the ids.
        preds = preds[0]

    # -100 is a padding sentinel, not a vocabulary id, so it cannot be decoded. Map it
    # back to the pad token in BOTH arrays (the original only did this for labels).
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    predictions = [text.strip() for text in decoded_preds]
    # sacrebleu expects a list of references per prediction; there is exactly one here.
    references = [[text.strip()] for text in decoded_labels]

    result = bleu.compute(predictions=predictions, references=references)
    return {"bleu": result["score"]}

In [None]:
# Fine-tuning configuration. Checkpoints are written under ./marian_finetuned_vi.
training_args = Seq2SeqTrainingArguments(
    output_dir="./marian_finetuned_vi",
    # 8 examples per device, accumulated over 4 steps before each optimizer update.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    num_train_epochs=3,
    # Evaluate and checkpoint once per epoch; at most two checkpoints are kept on disk.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Evaluate on generated sequences so compute_metrics receives token ids to decode.
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=50,
    # Mixed precision needs a CUDA device. Enabling it unconditionally makes this cell
    # fail on the CPU fallback that the device-selection cell explicitly allows.
    fp16=torch.cuda.is_available(),
    optim="adamw_torch",
    warmup_steps=500,
    max_grad_norm=1.0,
    dataloader_num_workers=4,
    # Disable external experiment trackers.
    report_to="none",
)

In [None]:
# Wire the model, data, batching and metric together.
# Training uses the 500,000-pair subset; per-epoch evaluation uses the 1,000-pair subset.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    # NOTE(review): newer transformers releases appear to prefer `processing_class`
    # over `tokenizer` here — confirm against the installed version.
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the trainer configured above (3 epochs, evaluated and
# checkpointed once per epoch as set in training_args).
trainer.train()

In [None]:
# Save the fine-tuned model to ./marian_finetuned_vi_final (relative to the current
# working directory).
trainer.save_model("./marian_finetuned_vi_final")

In [None]:
def postprocess(preds, labels):
    """Decode predicted and reference token ids into plain text.

    Args:
        preds: Token ids produced by the model (array-like of rows). A tuple is also
            accepted, in which case its first element is used.
        labels: Reference token ids (array-like of rows); may contain the -100
            padding sentinel.

    Returns:
        A ``(decoded_predictions, decoded_labels)`` pair of lists of strings with
        special tokens removed and surrounding whitespace stripped, matching what
        compute_metrics feeds to the BLEU metric.
    """
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks padded positions and is not a vocabulary id; decoding it directly
    # corrupts the text (or fails), so map it back to the pad token first. This mirrors
    # the label handling in compute_metrics.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return (
        [text.strip() for text in decoded_preds],
        [text.strip() for text in decoded_labels],
    )

# Final evaluation: generate translations for the whole held-out split (not just the
# 1,000-example subset used during training) and decode them next to the references.
results = trainer.predict(tokenized_dataset["test"])
preds, labels = postprocess(results.predictions, results.label_ids)

# The metric takes a list of references per prediction; each sentence has exactly one.
references = [[reference] for reference in labels]
bleu_score = bleu.compute(predictions=preds, references=references)
print(f"BLEU: {bleu_score['score']:.2f}")