<a href="https://colab.research.google.com/github/xandreiAThome/machine-translation-nlp1k/blob/main/nmt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Machine Translation

## Preprocess

Load the aligned verses from the TSV file and strip every non-alphabetic character from each string. Remove any pair in which either of the two languages has no verse, then use the `Dataset` class from the `datasets` library to structure the data so it is ready for training.

In [None]:
import regex as re

def clean_string(input_string):
    """Return the text lowercased with everything but letters and whitespace removed.

    The input is stripped of surrounding whitespace and lowercased first; every
    character that is neither a Unicode letter (``\\p{L}``) nor whitespace is
    then deleted.
    """
    normalized = input_string.strip().lower()
    # \p{L} needs the third-party `regex` module, which this notebook imports as `re`.
    letters_and_spaces = re.sub(r"[^\p{L}\s]", "", normalized)
    return letters_and_spaces

def process(example):
    """Normalise one aligned verse pair, or mark it for removal.

    Args:
        example: Mapping with "src" and "tgt" entries holding the raw verse text.

    Returns:
        A dict with the cleaned "src" and "tgt" strings. When the pair is
        unusable, both values are None so that a later filter can drop the row.
        A pair is unusable when either side is not a string, is the
        "<no verse>" placeholder (case-insensitive), or has no content left
        after cleaning.
    """
    skipped = {"src": None, "tgt": None}

    src = example["src"]
    tgt = example["tgt"]

    # A missing cell may be loaded as a non-string value such as None; calling
    # .strip() on it would raise, so treat the pair as having no verse.
    if not isinstance(src, str) or not isinstance(tgt, str):
        return skipped

    src = src.strip()
    tgt = tgt.strip()

    # skip invalid pairs
    if src.lower() == "<no verse>" or tgt.lower() == "<no verse>":
        return skipped

    cleaned_src = clean_string(src)
    cleaned_tgt = clean_string(tgt)

    # A verse made only of digits or punctuation cleans down to nothing and
    # carries no translation signal.
    if not cleaned_src.strip() or not cleaned_tgt.strip():
        return skipped

    return {
        "src": cleaned_src,
        "tgt": cleaned_tgt,
    }

In [None]:
# LANGUAGE CONFIGURATION
# These strings are also used as the column names selected from the dataset,
# so they must match the TSV header.
SRC_LANG, TGT_LANG = "Bikolano", "Tagalog"

In [None]:
# List the contents of the Kaggle input directory.
!ls /kaggle/input

In [None]:
from datasets import load_dataset

# DATASET CONFIGURATION
DATASET_PATH = "/kaggle/input/bikolano-tagalog-parallel/Bikolano_Tagalog_Parallel.tsv"
DATASET_DELIMITER = "\t"
DATASET_SPLIT = "train"

# Read the TSV through the "csv" loader, keep only the two language columns,
# and rename them to the neutral "src"/"tgt" names used from here on.
raw_splits = load_dataset("csv", data_files=DATASET_PATH, delimiter=DATASET_DELIMITER)
dataset = (
    raw_splits[DATASET_SPLIT]
    .select_columns([SRC_LANG, TGT_LANG])
    .rename_columns({SRC_LANG: "src", TGT_LANG: "tgt"})
)
initial_dataset_length = len(dataset)

# process() sets both sides to None for unusable pairs; the filter drops those rows.
dataset = dataset.map(process).filter(
    lambda row: row["src"] is not None and row["tgt"] is not None
)

skipped = initial_dataset_length - len(dataset)
print(f"skipped verses: {skipped}")

Let's look at the first 5 aligned verses.

In [None]:
# Preview the first five cleaned verse pairs.
display(dataset[:5])

## Setting up Trainer
We will use Facebook's No Language Left Behind (NLLB) model as the base model and fine-tune it on our dataset. Our group chose it because it performs well even on low-resource languages.

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# MODEL CONFIGURATION
BASE_MODEL_NAME = "facebook/nllb-200-distilled-600M"

# Weights and tokenizer are both loaded from the same checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

In [None]:
# TOKENIZATION CONFIGURATION
MAX_LENGTH = 128  # token cap; longer source or target sequences are truncated


def tokenize(batch):
    """Encode a batch of verse pairs into model inputs with labels.

    Args:
        batch: Mapping with "src" and "tgt" entries, as passed by a batched map.

    Returns:
        The tokenizer output for the source texts, with the target token ids
        stored under the "labels" key.
    """
    # NOTE(review): source and target are encoded through the same tokenizer
    # call. If this tokenizer distinguishes source/target language modes,
    # confirm that encoding the targets this way is intended.
    encoded_sources = tokenizer(batch["src"], truncation=True, max_length=MAX_LENGTH)
    encoded_targets = tokenizer(batch["tgt"], truncation=True, max_length=MAX_LENGTH)
    encoded_sources["labels"] = encoded_targets.input_ids
    return encoded_sources


tokenized_dataset = dataset.map(tokenize, batched=True)

Let us split the training data to also have a dataset for evaluation after training.

In [None]:
# Hold out 10% of the pairs for evaluation. The fixed seed pins the shuffle so
# the same verses land in each split on every run; without it the split (and
# every score computed from it) changes from run to run.
SPLIT_SEED = 42
split = tokenized_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_data = split["train"]
eval_data = split["test"]

# TRAINING CONFIGURATION
RUN_NAME = "nllb-bcl-tgl"
OUTPUT_PATH = f"/kaggle/tmp/{RUN_NAME}"

In [None]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer

# TRAINING HYPERPARAMETERS
# Optimisation
NUM_EPOCHS = 6
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01
# Batching
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 2  # effective batch size = 8
# Precision, logging and checkpoint retention
USE_FP16 = True
LOGGING_STEPS = 50
SAVE_TOTAL_LIMIT = 2

training_args = Seq2SeqTrainingArguments(
    # Run identity and output location
    output_dir=OUTPUT_PATH,
    run_name=RUN_NAME,
    report_to=[],
    # Optimisation schedule
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    # Batching and precision
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    fp16=USE_FP16,
    # Evaluate and checkpoint once per epoch, keeping only the newest checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=SAVE_TOTAL_LIMIT,
    logging_steps=LOGGING_STEPS,
    predict_with_generate=True,
)

# The collator is built from the tokenizer and the model being trained.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=eval_data,
)

In [None]:
import torch

# Report the accelerator PyTorch will use. The device queries raise on a
# machine without CUDA, so they are only issued once availability is confirmed.
cuda_available = torch.cuda.is_available()
print("CUDA available?", cuda_available)
if cuda_available:
    device_index = torch.cuda.current_device()
    print("Device:", device_index)
    print("Device name:", torch.cuda.get_device_name(device_index))
else:
    print("Device: cpu (no CUDA device detected)")

In [None]:
print("starting training")
trainer.train()

# Save the final weights and the tokenizer side by side in OUTPUT_PATH, the
# same directory the trainer was given as output_dir. (The lowercase name
# `output_path` is never defined, so using it raised NameError after training.)
trainer.save_model(OUTPUT_PATH)
tokenizer.save_pretrained(OUTPUT_PATH)


In [None]:
# List the contents of /kaggle/tmp, the parent of the training output directory.
!ls /kaggle/tmp

In [None]:
# Archive the run directory into /kaggle/working.
!zip -r /kaggle/working/nllb-bcl-tgl.zip /kaggle/tmp/nllb-bcl-tgl

In [None]:
 # Change into the directory that holds the archive.
 %cd /kaggle/working

In [None]:
from IPython.display import FileLink
# Render a download link for the archive. The path is relative, so it depends
# on the working directory set by the preceding %cd cell.
FileLink(r'nllb-bcl-tgl.zip')

## Evaluate Model on Bikolano to Tagalog Translation

Load the trained checkpoint and evaluate its translation quality on the dataset.

In [None]:
# Load the trained model and tokenizer from checkpoint
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# CHECKPOINT CONFIGURATION
# NOTE(review): this relative path is not the directory the training cells
# save to (OUTPUT_PATH) - confirm the checkpoint has been copied here.
CHECKPOINT_PATH = "data/models/nllb-bcl-tgl"

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# These assignments rebind `tokenizer` and `model` to the fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT_PATH).to(device)
model.eval()

print(f"Using device: {device}")

In [None]:
from datasets import load_dataset

# ===== TRANSLATION FUNCTION =====
def translate(text, model_tokenizer, translation_model, src_lang=SRC_LANG, tgt_lang=TGT_LANG):
    """Translate one string with beam search and return the decoded text.

    Args:
        text: Source text to translate.
        model_tokenizer: Tokenizer used to encode the input and decode the output.
        translation_model: Model whose ``generate`` method produces the output ids.
        src_lang: Currently unused by the body.
        tgt_lang: Currently unused by the body.

    Returns:
        The first generated sequence, decoded with special tokens removed.

    Note:
        MAX_LENGTH, GENERATION_MAX_LENGTH, NUM_BEAMS and ``device`` are
        module-level names looked up when the function is called, so they must
        be defined before the first call.
    """
    encoded = model_tokenizer(
        text,
        return_tensors="pt",
        max_length=MAX_LENGTH,
        truncation=True,
    ).to(device)

    # Generation runs with gradient tracking disabled.
    with torch.no_grad():
        generated = translation_model.generate(
            **encoded,
            max_length=GENERATION_MAX_LENGTH,
            num_beams=NUM_BEAMS,
            early_stopping=True
        )

    return model_tokenizer.decode(generated[0], skip_special_tokens=True)

# ===== EVALUATION DATASET CONFIGURATION =====
EVAL_DATASET_PATH = "data/dataset/Bikolano_Tagalog_Parallel.tsv"

# Load the original Bikolano-Tagalog dataset
dataset = load_dataset(
    "csv",
    data_files=EVAL_DATASET_PATH,
    delimiter="\t",
)

# Select the columns through SRC_LANG / TGT_LANG, as the training loader does,
# so changing the language configuration changes both loaders together.
dataset = dataset["train"].select_columns([SRC_LANG, TGT_LANG])
dataset = dataset.rename_columns({SRC_LANG: "src", TGT_LANG: "tgt"})

# Apply the same cleaning function as before, then drop the pairs it nulled out.
dataset = dataset.map(process)
dataset = dataset.filter(lambda x: x["src"] is not None and x["tgt"] is not None)

print(f"Total dataset size: {len(dataset)}")

In [None]:
# Evaluate on test set
import sacrebleu
from tqdm import tqdm
import numpy as np

# ===== EVALUATION CONFIGURATION =====
EVAL_SIZE = 100              # upper bound on the number of evaluated pairs
GENERATION_MAX_LENGTH = 128  # read by the translation helpers at call time
NUM_BEAMS = 5                # beam width used by the translation helpers

# Take the leading rows of the dataset as the evaluation sample.
# NOTE(review): these rows come from the start of the full corpus, not from a
# held-out split. If the checkpoint was fine-tuned on this same file, the
# scores may be optimistic - confirm.
eval_size = min(EVAL_SIZE, len(dataset))
eval_dataset = dataset.select(range(eval_size))

predictions, references = [], []

print("Generating translations for evaluation...")
for example in tqdm(eval_dataset, total=eval_size):
    predictions.append(translate(example["src"], tokenizer, model, SRC_LANG, TGT_LANG))
    references.append(example["tgt"])

def calculate_bleu(predictions, references):
    """Calculate the corpus BLEU score for single-reference translations.

    Args:
        predictions: List of hypothesis strings, one per segment.
        references: List of reference strings aligned with ``predictions``.

    Returns:
        The score object returned by ``sacrebleu.corpus_bleu``.

    Raises:
        ValueError: If the two lists differ in length.
    """
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references ({len(references)}) "
            "must have the same length"
        )

    # sacrebleu takes a list of reference *streams*, each aligned with the
    # hypotheses. With one reference per segment that is a single stream:
    # [references]. Wrapping every reference in its own list would instead
    # declare one stream per segment.
    reference_streams = [list(references)]
    return sacrebleu.corpus_bleu(list(predictions), reference_streams)

# Corpus-level BLEU of the fine-tuned model on the evaluation sample.
bleu_score = calculate_bleu(predictions, references)
print(f"\nBLEU Score: {bleu_score.score:.4f}")

## Compare with Base NLLB Model

Let us evaluate the base NLLB model on the same test set to compare performance.

In [None]:
# Load base NLLB model for comparison
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

print("Loading base NLLB model...")
# Load the base checkpoint (BASE_MODEL_NAME) under separate names so it can be
# compared with the fine-tuned model.
base_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_NAME).to(device)
base_model.eval()

print(f"Base model loaded: {BASE_MODEL_NAME}")

def translate_base_model(text, target_lang_code="tgl_Latn", model_tokenizer=None, multilingual_model=None):
    """Translate text with a multilingual model, forcing the output language.

    Args:
        text: Source text to translate.
        target_lang_code: Language-code token forced as the first generated token.
            Common language codes:
                - tgl_Latn: Tagalog
                - eng_Latn: English
                - spa_Latn: Spanish
                - fra_Latn: French
                - deu_Latn: German
                - cmn_Hans: Mandarin Chinese
                - jpn_Jpan: Japanese
        model_tokenizer: Tokenizer to use; defaults to ``base_tokenizer``.
        multilingual_model: Model to use; defaults to ``base_model``.

    Returns:
        The first generated sequence, decoded with special tokens removed.

    Raises:
        ValueError: If ``target_lang_code`` is not a token the tokenizer knows.
    """
    if model_tokenizer is None:
        model_tokenizer = base_tokenizer
    if multilingual_model is None:
        multilingual_model = base_model

    # Resolve the target language before doing any work. An unknown code maps
    # to the unknown-token id, and forcing that as the first token would give
    # meaningless output without any error.
    forced_bos_token_id = model_tokenizer.convert_tokens_to_ids(target_lang_code)
    unk_token_id = getattr(model_tokenizer, "unk_token_id", None)
    if forced_bos_token_id is None or forced_bos_token_id == unk_token_id:
        raise ValueError(f"Unknown target language code: {target_lang_code!r}")

    inputs = model_tokenizer(text, return_tensors="pt", max_length=MAX_LENGTH, truncation=True).to(device)

    # Generation runs with gradient tracking disabled.
    with torch.no_grad():
        outputs = multilingual_model.generate(
            **inputs,
            max_length=GENERATION_MAX_LENGTH,
            num_beams=NUM_BEAMS,
            early_stopping=True,
            forced_bos_token_id=forced_bos_token_id
        )

    translation = model_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation

# Generate predictions with base model
print("\nGenerating translations with base model...")
base_predictions = [
    translate_base_model(example["src"])
    for example in tqdm(eval_dataset, total=eval_size, desc="Base model")
]

# Calculate BLEU score for base model
base_bleu_score = calculate_bleu(base_predictions, references)

# Side-by-side report; "Improvement" is the fine-tuned score minus the base score.
rule = "=" * 80
print("\n" + rule)
print("BLEU SCORE COMPARISON")
print(rule)
print(f"Base NLLB Model:        {base_bleu_score.score:.4f}")
print(f"Fine-tuned Model:       {bleu_score.score:.4f}")
print(f"Improvement:            {bleu_score.score - base_bleu_score.score:+.4f}")
print(rule)

In [None]:

import pandas as pd

# One row per evaluated verse: source, reference, and both model outputs.
comparison_df = pd.DataFrame(
    {
        "Source (Bikolano)": [eval_dataset[row]["src"] for row in range(len(eval_dataset))],
        "Reference (Tagalog)": references,
        "Base Model Output": base_predictions,
        "Fine-tuned Model Output": predictions,
    }
)

# Set these pandas display options to None; the change is global for the session.
for option in (
    "display.max_columns",
    "display.max_rows",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(option, None)

print("FULL TRANSLATION COMPARISON")
# Only the first ten rows are displayed; the CSV below holds every row.
display(comparison_df.head(10))

comparison_df.to_csv("translation_comparison.csv", index=False)
print("\nComparison saved to: translation_comparison.csv")