<a href="https://colab.research.google.com/github/tienhuynh96/Low_Resource_NMT/blob/main/%5BDemo%5D_NMT_mBART50.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Low-resource Machine Translation using mBART50**

##**1. Import libraries**

In [None]:
!pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu

In [None]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
import evaluate
from transformers import (
    MBart50TokenizerFast,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

##**2. Dataset**

##**Incase can not load dataset from dataset**

In [None]:
# Incase can not load dataset from dataset
# Try
# Wget data from github: https://github.com/stefan-it/nmt-en-vi
# Train file
!wget "https://github.com/stefan-it/nmt-en-vi/raw/master/data/train-en-vi.tgz"
# Dev file
!wget "https://github.com/stefan-it/nmt-en-vi/raw/master/data/dev-2012-en-vi.tgz"
# Test file
!wget "https://github.com/stefan-it/nmt-en-vi/raw/master/data/test-2013-en-vi.tgz"

--2024-06-08 14:01:06--  https://github.com/stefan-it/nmt-en-vi/raw/master/data/train-en-vi.tgz
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/stefan-it/nmt-en-vi/master/data/train-en-vi.tgz [following]
--2024-06-08 14:01:06--  https://raw.githubusercontent.com/stefan-it/nmt-en-vi/master/data/train-en-vi.tgz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9903559 (9.4M) [application/octet-stream]
Saving to: ‘train-en-vi.tgz’


2024-06-08 14:01:07 (162 MB/s) - ‘train-en-vi.tgz’ saved [9903559/9903559]

--2024-06-08 14:01:07--  https://github.com/stefan-it/nmt-en-vi/raw/master/data/dev-2012-e

In [None]:
# extract the tgz file
!tar -xzf train-en-vi.tgz
!tar -xzf dev-2012-en-vi.tgz
!tar -xzf test-2013-en-vi.tgz

In [None]:
# Add path for train, test, validation file
# Train file
train_source_file = "/content/train.en"
train_target_file = "/content/train.vi"
# Validation file
val_source_file = "/content/tst2012.en"
val_target_file = "/content/tst2012.vi"
# Test file
test_source_file = "/content/tst2013.en"
test_target_file = "/content/tst2013.vi"

In [None]:
# Build function: Get examples from sourse, target file and append into data
def generate_examples(source_file, target_file):
    # Open and read source file and target file of "train, test, validation" data
    with open(source_file, encoding="utf-8") as f:
        source_sentences = f.read().split("\n")
    with open(target_file, encoding="utf-8") as f:
        target_sentences = f.read().split("\n")

    # Add examples of "train, test, validation" data
    data = []
    source, target = "en", "vi"
    for idx, (l1, l2) in enumerate(zip(source_sentences, target_sentences)):
        result = {source: l1, target: l2}
        data.append(result)
    return data

In [None]:
# Build data
train_data = generate_examples(source_file=train_source_file, target_file=train_target_file)
test_data = generate_examples(test_source_file, test_target_file)
val_data = generate_examples(val_source_file, val_target_file)

In [None]:
import pandas as pd
# from datasets import DatasetDict to create data dict
from datasets import DatasetDict
# Import dataset to use code:datasets.Dataset
# If from datasets import Dataset: error wil be raised, because it's dublicated with Dataset above
import datasets

# train data
tem_dic = {}
tem_dic['translation'] = train_data
train_data = pd.DataFrame(tem_dic)
train_data = datasets.Dataset.from_pandas(train_data)

# # test data
tem_dic = {}
tem_dic['translation'] = test_data
test_data = pd.DataFrame(tem_dic)
test_data = datasets.Dataset.from_pandas(test_data)

# # validation
tem_dic = {}
tem_dic['translation'] = val_data
val_data = pd.DataFrame(tem_dic)
val_data = datasets.Dataset.from_pandas(val_data)



In [None]:
train_data


Dataset({
    features: ['translation'],
    num_rows: 133318
})

In [None]:
# Creaet dataset_dict: have shape like load from dataset library
dataset_dict = DatasetDict({
    'train': train_data,
    'validation': val_data,
    'test': test_data
})

In [None]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 133318
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1554
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1269
    })
})

In [None]:
# dataset_dict['train']['translation'][0]
dataset_dict['train']['translation'][0]


{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## **2.2. Create Poem Dataset**

In [None]:
# Create NMT dataset
class NMTDataset(Dataset):
    def __init__(self, cfg, data, data_type="train"):
        super().__init__()
        # Config
        self.cfg = cfg

        # Separate to source text and target texts
        self.src_texts, self.tgt_texts = self.read_data(data, data_type)

        # Convert source texts to ids
        self.src_input_ids = self.texts_to_sequences(self.src_texts)
        # Convert target texts to ids
        self.labels = self.texts_to_sequences(self.tgt_texts)

    # read data function to separate train data to source texts and target texts
    def read_data(self, data, data_type):
        # Get data "train"
        data = data[data_type]
        # Get list source texts from data['train']
        src_texts = [sample["translation"][self.cfg.src_lang] for sample in data]
        # Get list target texts from data['train']
        tgt_texts = [sample["translation"][self.cfg.tgt_lang] for sample in data]
        # Return list of src_texts, tgt_texts
        return src_texts, tgt_texts

    # texts_to_sequences to convert tokens to ids
    def texts_to_sequences(self, texts):
        # Output of tokenizer is input_ids and attention_mask
        data_inputs = self.cfg.tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=self.cfg.max_len,
            return_tensors='pt'
        )
        # Return input_ids
        return data_inputs.input_ids

    def __getitem__(self, idx):
        return {
            "input_ids": self.src_input_ids[idx],
            "labels": self.labels[idx]
        }

    def __len__(self):
        return np.shape(self.src_input_ids)[0]

##**Config**

In [None]:
# Base configuration class meant to be inherited by other configuration classes
class BaseConfig:
    """ base Encoder Decoder config """

    def __init__(self, **kwargs):     # Takes any number of keyword arguments and sets them as attributes of the instance using a loop
        for k, v in kwargs.items():   # Iterates over the key-value pairs in the provided arguments.
            setattr(self, k, v)       # Sets an attribute k with value v on the instance

# NMTConfig class: defines a specific configuration for a Neural Machine Translation (NMT) task, inheriting from BaseConfig.
class NMTConfig(BaseConfig):
    # Data
    src_lang = 'en'
    tgt_lang = 'vi'
    max_len = 75
    add_special_tokens = True

    # Model
    model_name = "facebook/mbart-large-50-many-to-many-mmt"

    # Training
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    learning_rate = 5e-5
    train_batch_size = 16
    eval_batch_size = 16
    num_train_epochs = 2
    save_total_limit = 1    # Limit on the total number of checkpoints to save
    ckpt_dir = f'./mbart50-{src_lang}-{tgt_lang}' # Directory path for saving model checkpoints
    eval_steps = 1000       # Number of steps between evaluations

    # Inference
    beam_size = 5           # Beam size for beam search during inference

# initializing cfg with the default values specified in the class definition.
cfg = NMTConfig()

##**Tokenizer, Model, Metric**

In [None]:
# tokenizer = MBart50TokenizerFast.from_pretrained(cfg.model_name, src_lang="en_XX",tgt_lang = "vi_VN")
# Creating a new attribute tokenizer in the cfg object
cfg.tokenizer = MBart50TokenizerFast.from_pretrained(cfg.model_name)
# Create model
model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_name)

In [None]:
# Use sacrebleu for metric
metric = evaluate.load("sacrebleu")

# Create function to process labels and prediction (use strip)
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

# Create compute metric function
def compute_metrics(eval_preds):
    # Get prediction and labels from evaluation prediciton results (a tuple containing predictions and labels)
    preds, labels = eval_preds    # preds: N x S, labels: N x S
    # If preds is a tuple, it takes the first element. This is common in some models that return additional outputs such as attention scores.
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace the token -100 in preds with the tokenizer's pad token ID (the -100 token is used to mark ignored positions)
    preds= np.where(preds != -100, preds, cfg.tokenizer.pad_token_id)
    # Decode the token ids to text and skips special tokens and cleans up tokenization spaces.
    decoded_preds = cfg.tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    # Replace the token -100 in preds with the tokenizer's pad token ID (the -100 token is used to mark ignored positions)
    labels= np.where(labels != -100, labels, cfg.tokenizer.pad_token_id)
    # Decode labels
    decoded_labels = cfg.tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    # Calls the postprocess_text function to strip whitespace and format the labels appropriately.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute BLEU Score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Calculate Average Prediction Length (ignore padding)
    prediction_lens = [np.count_nonzero(pred != cfg.tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    # Round Metric Values
    result = {k: round(v, 4) for k, v in result.items()}

    return result

##**Training**

In [None]:

# Create the datasets
train_dataset = NMTDataset(cfg, dataset_dict, data_type="train")
valid_dataset = NMTDataset(cfg, dataset_dict, data_type="validation")
test_dataset = NMTDataset(cfg, dataset_dict, data_type="test")


In [None]:
next(iter(train_dataset))

{'input_ids': tensor([250004, 127055,  66937,     13,    152,    581,  41664,  50155,     10,
         153552,  10336,   2256,      2,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1]),
 'labels': tensor([250004,  67766,   2546, 218877,    858,    889,  10037,   6248,   1893,
          17964,  42254,      2,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,   

In [None]:
# Set training argument
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    save_strategy='steps',
    save_steps=cfg.eval_steps,
    eval_steps=cfg.eval_steps,
    output_dir=cfg.ckpt_dir,
    per_device_train_batch_size=cfg.train_batch_size,
    per_device_eval_batch_size=cfg.eval_batch_size,
    learning_rate=cfg.learning_rate,
    save_total_limit=cfg.save_total_limit,
    num_train_epochs=cfg.num_train_epochs,
    load_best_model_at_end=True,
)

# initialize a data collator to handle the preparation of batches of data for sequence-to-sequence models
data_collator = DataCollatorForSeq2Seq(
    cfg.tokenizer,
    model=model
)

# Set trainer
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=cfg.tokenizer,
    compute_metrics=compute_metrics
)



In [None]:
trainer.train()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss,Validation Loss,Bleu,Gen Len
1000,0.5091,0.548572,28.4876,28.0541
2000,0.5011,0.538401,28.5603,28.0129
3000,0.4922,0.528043,29.1597,27.5965


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


In [None]:
# Check prediction
prediction = trainer.predict(test_dataset)

In [None]:
# Check prediction ans score
prediction

PredictionOutput(predictions=array([[     2, 250004,  16584, ...,      1,      1,      1],
       [     2, 250004,  23598, ...,      1,      1,      1],
       [     2, 250004,  71717, ...,      1,      1,      1],
       ...,
       [     2, 250004,  14343, ...,      1,      1,      1],
       [     2, 250004, 131785, ...,      1,      1,      1],
       [     2, 250004,      2, ...,      1,      1,      1]]), label_ids=array([[250004,  16584,   2259, ...,      1,      1,      1],
       [250004,  14343,   1408, ...,      1,      1,      1],
       [250004,  71717,   4373, ...,      1,      1,      1],
       ...,
       [250004,  14343,   1274, ...,      1,      1,      1],
       [250004, 131785,  43209, ...,      1,      1,      1],
       [250004,      2,      1, ...,      1,      1,      1]]), metrics={'test_loss': 0.5306801795959473, 'test_bleu': 34.8714, 'test_gen_len': 32.792, 'test_runtime': 134.5483, 'test_samples_per_second': 9.432, 'test_steps_per_second': 0.595})

##**Inference**

In [None]:
# Inference
def inference(
    text,
    tokenizer,
    model,
    device="cpu",
    max_length=75,
    beam_size=5
    ):
    # Tokenize input => input_ids and attention_mask
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
        )
    # Move input_ids to device
    input_ids = inputs.input_ids.to(device)
    # Move attention_mask to device
    attention_mask = inputs.attention_mask.to(device)
    # Move model to device
    model.to(device)

    # Generate outputs
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        early_stopping=True,
        num_beams=beam_size,
        length_penalty=2.0
    )

    # Decode outputs
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return output_str

In [None]:
sentence = 'i go to school'
inference(sentence, cfg.tokenizer, model)

['tôi đi học.']

##**Checkpoint**
https://drive.google.com/drive/folders/1ii_lPm2-1CfIhQM8RVzLgTHMxXDKgnk4?usp=sharing