# **Fine-tuning mBART50 for En-Vi Machine Translation**

In [1]:
!pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu

## **Dataset**

In [2]:
from datasets import load_dataset

ds = load_dataset("thainq107/iwslt2015-en-vi")

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [4]:
ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa h·ªçc ƒë·∫±ng sau m·ªôt ti√™u ƒë·ªÅ v·ªÅ kh√≠ h·∫≠u'}

## **Tokenizer**

In [5]:
from transformers import AutoTokenizer

model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
len(tokenizer)

250054

## **Encoding**

In [7]:
import torch

MAX_LEN = 75

def preprocess_function(examples):
    ### Your code here
    input_ids = tokenizer(
        examples["en"], padding="max_length", truncation=True, max_length=MAX_LEN
    )['input_ids']

    labels = tokenizer(
        examples["vi"], padding="max_length", truncation=True, max_length=MAX_LEN
    )['input_ids']

    labels = [
        [-100 if item == tokenizer.pad_token_id else item for item in label]
        for label in labels
    ]

    return {
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels)
    }

preprocessed_ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [8]:
tokenizer.decode([250054])

''

In [9]:
preprocessed_ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa h·ªçc ƒë·∫±ng sau m·ªôt ti√™u ƒë·ªÅ v·ªÅ kh√≠ h·∫≠u',
 'input_ids': [250004,
  127055,
  66937,
  13,
  152,
  581,
  41664,
  50155,
  10,
  153552,
  10336,
  2256,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [250004,
  67766,
  2546,
  218877,
  858,
  889,
  10037,
  6248,
  1893,
  17964,
  42254,
  2,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -10

## **Model**

In [10]:
from transformers import AutoModelForSeq2SeqLM

model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [11]:
model

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

## **Evaluate**

In [12]:
import numpy as np
import evaluate
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    ### Your code here
    preds, label = eval_preds # N_samples x Sequence Length
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(
        preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    labels = np.where(label != -100, label, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(
        labels, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    preds, labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    return result

## **Trainer**

In [13]:
# Disable wandb
import os
os.environ['WANDB_DISABLED'] = 'true'

# # Use wandb
# import wandb
# wandb.init(
#     project="en-vi-machine-translation",
#     name="mbart50" #
# )

In [14]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    output_dir="./en-vi-mbart50",
    logging_dir="logs",
    logging_steps=1000,
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,
    save_total_limit=1,
    num_train_epochs=3,
    load_best_model_at_end=True,
    fp16=True,  # Enable mixed precision
    gradient_accumulation_steps=2,  # Simulate larger batch size
    # report_to="wandb"
)

# training_args = Seq2SeqTrainingArguments(
#     ### Your code here
#     output_dir="./en-vi-mbart50",
#     logging_dir="logs",
#     logging_steps=1000,
#     predict_with_generate=True,
#     eval_strategy="steps",
#     eval_steps=1000,
#     save_strategy="steps",
#     save_steps=1000,
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=32,
#     save_total_limit=1,
#     num_train_epochs=3,
#     load_best_model_at_end=True,
#     # report_to="wandb"
# )


data_collator = DataCollatorForSeq2Seq(
    ### Your code here
    tokenizer,
    model=model
)

trainer = Seq2SeqTrainer(
    ### Your code here
    model,
    training_args,
    train_dataset=preprocessed_ds['train'],
    eval_dataset=preprocessed_ds['validation'],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)




Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [15]:
trainer.train()

Step,Training Loss,Validation Loss,Bleu
1000,1.2838,1.400238,32.341632
2000,1.2578,1.37722,32.765471
3000,1.2533,1.351768,32.582521
4000,1.2416,1.327914,33.299364
5000,1.2214,1.322077,33.566309
6000,1.2198,1.30523,33.096443
7000,1.2131,1.289076,34.087475
8000,1.1963,1.281414,33.997656
9000,1.0251,1.309712,33.571413
10000,0.9495,1.305548,33.625714


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=24996, training_loss=0.9669089241778684, metrics={'train_runtime': 15687.5692, 'train_samples_per_second': 25.495, 'train_steps_per_second': 1.593, 'total_flos': 6.34764795420672e+16, 'train_loss': 0.9669089241778684, 'epoch': 2.9996999699969997})

In [None]:
# trainer.push_to_hub(token="...")

## **Inference**

In [17]:
model_name = "thainq107/en-vi-mbart50"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

### **Greedy Search**

In [18]:
src_text = "I go to school"
encoded_text = tokenizer(src_text, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_text
)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

['T√¥i ƒëi h·ªçc']

### **Beam search**

In [19]:
src_text = "In the next step, we consider the next possible tokens for each of the three branches we created in the previous step."
encoded_text = tokenizer(src_text, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_text,
    num_beams=5,
)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

['B∆∞·ªõc ti·∫øp theo , ch√∫ng t√¥i xem x√©t c√°c token ti·ªÅm nƒÉng ti·∫øp theo cho m·ªói trong ba nh√°nh m√† ch√∫ng t√¥i t·∫°o ra ·ªü b∆∞·ªõc tr∆∞·ªõc .']

### **Pipeline**

In [20]:
from transformers import pipeline

translator = pipeline(model="thainq107/en-vi-mbart50")

Device set to use cuda:0


In [21]:
translated_text = translator("I go to school", num_beams=1, do_sample=False)
translated_text



[{'generated_text': 'T√¥i ƒëi h·ªçc'}]

In [22]:
translated_text = translator("I go to school", num_beams=2)
translated_text

[{'generated_text': 'T√¥i ƒëi h·ªçc'}]

In [26]:
# greedy search
pred_sentences = translator(ds['test']['en'], batch_size=32, num_beams=1, do_sample=False)



In [27]:
# beam search
pred_sentences = translator(ds['test']['en'], batch_size=32, num_beams=5, early_stopping=True)

In [28]:
pred_sentences = [pred_sentence['generated_text'] for pred_sentence in pred_sentences]

In [29]:
pred_sentences[0]

'Khi t√¥i c√≤n nh·ªè , t√¥i nghƒ© ƒë·∫•t n∆∞·ªõc m√¨nh t·ªët nh·∫•t tr√™n th·∫ø gi·ªõi , v√† t√¥i l·ªõn l√™n v√† h√°t m·ªôt b√†i h√°t t√™n l√† &quot; Kh√¥ng c√≥ g√¨ ƒë√°ng ghen t·ªã . &quot;'

In [30]:
ds['test']['vi'][0]

'Khi t√¥i c√≤n nh·ªè , T√¥i nghƒ© r·∫±ng B·∫ØcTri·ªÅu Ti√™n l√† ƒë·∫•t n∆∞·ªõc t·ªët nh·∫•t tr√™n th·∫ø gi·ªõi v√† t√¥i th∆∞·ªùng h√°t b√†i &quot; Ch√∫ng ta ch·∫≥ng c√≥ g√¨ ph·∫£i ghen t·ªã . &quot;'

In [31]:
import sacrebleu

# greedy search
bleu_score = sacrebleu.corpus_bleu(pred_sentences, [ds['test']['vi']], force=True)
bleu_score

BLEU = 34.17 66.5/42.2/28.0/18.9 (BP = 0.980 ratio = 0.980 hyp_len = 33060 ref_len = 33738)

In [32]:
import sacrebleu

# beam search
bleu_score = sacrebleu.corpus_bleu(pred_sentences, [ds['test']['vi']], force=True)
bleu_score

BLEU = 34.17 66.5/42.2/28.0/18.9 (BP = 0.980 ratio = 0.980 hyp_len = 33060 ref_len = 33738)