<a href="https://colab.research.google.com/github/xandreiAThome/machine-translation-nlp1k/blob/main/nmt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Machine Translation

## Preprocess

Load the aligned verses from the TSV file and strip every non-alphabetic character from each string. Remove any pair in which either of the two languages has no verse, then use the `Dataset` class from the `datasets` library to structure the data so it is ready for training.

In [None]:
import regex as re

def clean_string(input_string):
    """Normalise a verse by keeping only letters and whitespace.

    Args:
        input_string: Raw text to clean.

    Returns:
        The text with surrounding whitespace trimmed, converted to lowercase,
        and with every character that is neither a Unicode letter nor
        whitespace removed.
    """
    normalised = input_string.strip().lower()
    # The Unicode property class \p{L} needs the `regex` package, which this
    # notebook imports under the name `re`; the stdlib `re` rejects it.
    letters_and_spaces_only = re.sub(r"[^\p{L}\s]", "", normalised)
    return letters_and_spaces_only

def process(example):
    """Clean one aligned verse pair, or mark it as invalid.

    Args:
        example: Mapping with "src" and "tgt" entries holding the verse text
            of the two languages.

    Returns:
        A dict with the cleaned "src" and "tgt" strings, or
        ``{"src": None, "tgt": None}`` when the pair is unusable because a
        side is missing (not a string), is the "<no verse>" placeholder, or
        has no letters left after cleaning.
    """
    # Built fresh on every call so callers never share one mutable dict.
    invalid = {"src": None, "tgt": None}

    src = example["src"]
    tgt = example["tgt"]

    # A missing value has no .strip(); treat it like any other invalid pair
    # instead of raising AttributeError in the middle of the map.
    if not isinstance(src, str) or not isinstance(tgt, str):
        return invalid

    src = src.strip()
    tgt = tgt.strip()

    # skip invalid pairs
    if src.lower() == "<no verse>" or tgt.lower() == "<no verse>":
        return invalid

    cleaned_src = clean_string(src)
    cleaned_tgt = clean_string(tgt)

    # Text made only of digits/punctuation cleans down to nothing; an empty
    # side gives the model nothing to learn from, so drop the pair.
    if not cleaned_src.strip() or not cleaned_tgt.strip():
        return invalid

    return {"src": cleaned_src, "tgt": cleaned_tgt}

In [None]:
# Language pair for this run (source, target). The names double as the column
# headers that are selected from the parallel TSV when it is loaded.
src_lang, target_lang = "Bikolano", "Tagalog"

In [None]:
from datasets import load_dataset

# Read the tab-separated parallel corpus with the generic "csv" loader.
raw_splits = load_dataset(
    "csv",
    data_files="data/dataset/Bikolano_Tagalog_Parallel.tsv",
    delimiter="\t",
)

# Keep only the two language columns and rename them to the generic
# "src"/"tgt" keys that the rest of the notebook works with.
verse_pairs = (
    raw_splits["train"]
    .select_columns([src_lang, target_lang])
    .rename_columns({src_lang: "src", target_lang: "tgt"})
)

# Row count before cleaning, used below to report how many pairs were dropped.
initial_dataset_length = len(verse_pairs)

# `process` marks unusable pairs by setting both fields to None; the filter
# then removes every row where either side is None.
dataset = verse_pairs.map(process).filter(
    lambda row: not (row["src"] is None or row["tgt"] is None)
)

# Report how many verses were skipped.
skipped = initial_dataset_length - len(dataset)
print(f"skipped verses: {skipped}")

Let's look at the first 5 aligned verses.

In [None]:
# Peek at the first five cleaned verse pairs.
first_rows = dataset[:5]
display(first_rows)

## Setting up Trainer
We will use Facebook's No Language Left Behind (NLLB) model as the base model and fine-tune it on our dataset. It performs well even on low-resource languages, which is why our group decided to use it.

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub identifier of the pretrained checkpoint that is fine-tuned below.
model_name = "facebook/nllb-200-distilled-600M"

# Model and tokenizer are loaded from the same checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# NLLB tags every sequence with a language-code token. The labels must carry
# the Tagalog code ("tgl_Latn"); otherwise they are encoded with the source
# language tag and the model is trained to emit the wrong language token.
tokenizer.tgt_lang = "tgl_Latn"
# NOTE(review): Bikolano does not appear to have its own NLLB-200 language
# code, so the source side keeps the tokenizer's default `src_lang`. Confirm,
# or pick a proxy code deliberately.


def tokenize(batch):
    """Tokenize a batch of verse pairs into model inputs and labels.

    Args:
        batch: Mapping with "src" and "tgt" lists of cleaned strings.

    Returns:
        The tokenizer output for the source texts, with the target texts
        tokenized in target mode and stored under "labels". Both sides are
        truncated to at most 128 tokens.
    """
    # `text_target` makes the tokenizer switch to target-language mode for
    # the labels instead of encoding them like source text.
    return tokenizer(
        batch["src"],
        text_target=batch["tgt"],
        truncation=True,
        max_length=128,
    )


tokenized_dataset = dataset.map(tokenize, batched=True)

Let us split the data into a training set and a held-out set, so that the model can be evaluated on verses it was not trained on.

In [None]:
# Hold out 10% of the verse pairs for evaluation. The fixed seed keeps the
# split identical across kernel restarts, so evaluation numbers from different
# runs are measured on the same held-out verses.
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_data = split["train"]
eval_data = split["test"]

In [None]:
# Import every class this cell uses. The previous import line ended in a
# dangling comma (a SyntaxError) and did not bring in the collator or trainer.
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Hyperparameters for fine-tuning; checkpoints are written to `output_dir`.
training_args = Seq2SeqTrainingArguments(
    output_dir="./nllb-bcl-tgl",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=8,
    eval_strategy="epoch",  # evaluate on eval_data at the end of every epoch
    save_strategy="epoch",  # checkpoint on the same schedule as evaluation
    logging_steps=50,
    fp16=True,  # NOTE(review): mixed precision presumably requires a GPU runtime - confirm
    gradient_accumulation_steps=2,  # effective batch size = 8
    weight_decay=0.01,
    predict_with_generate=True,
)

# Collator that assembles the tokenized examples into batches for the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# NOTE(review): newer transformers releases appear to rename the `tokenizer`
# argument to `processing_class`; confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop configured in the trainer.
trainer.train()

# Save the fine-tuned model and its tokenizer side by side in one directory.
final_model_dir = "./nllb-bcl-tgl"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
