# Neural Machine Translation Evaluation

Evaluate the fine-tuned NLLB model and compare it with the base NLLB model.

## Load Trained Model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import regex as re

def clean_string(input_string):
    """Normalize a sentence for evaluation.

    The text is trimmed and lowercased, then every character that is neither
    a Unicode letter (``\\p{L}``) nor whitespace is deleted. ``re`` in this
    notebook is the third-party ``regex`` package, which is what makes the
    ``\\p{L}`` class available.
    """
    normalized = input_string.strip().lower()
    return re.sub(r"[^\p{L}\s]", "", normalized)

def process(example):
    """Clean one parallel sentence pair, or mark it as invalid.

    Args:
        example: Mapping with a ``"src"`` and a ``"tgt"`` entry.

    Returns:
        ``{"src": ..., "tgt": ...}`` holding the cleaned strings, or
        ``{"src": None, "tgt": None}`` when the pair should be dropped.
        A pair is dropped when either side is not a string or is the
        ``<no verse>`` placeholder (compared case-insensitively after
        trimming).
    """
    invalid_pair = {"src": None, "tgt": None}

    src = example["src"]
    tgt = example["tgt"]

    # Guard before calling str methods: a missing cell would otherwise raise
    # AttributeError on .strip() instead of being filtered out downstream.
    # NOTE(review): presumably empty TSV cells are loaded as None - confirm.
    if not isinstance(src, str) or not isinstance(tgt, str):
        return invalid_pair

    src = src.strip()
    tgt = tgt.strip()

    # skip invalid pairs
    if src.lower() == "<no verse>" or tgt.lower() == "<no verse>":
        return invalid_pair

    return {
        "src": clean_string(src),
        "tgt": clean_string(tgt),
    }

# LANGUAGE CONFIGURATION
SRC_LANG = "Pangasinan"
TGT_LANG = "Bikolano"

# CHECKPOINT CONFIGURATION
CHECKPOINT_PATH = "/kaggle/tmp/nllb-pag-bcl"
BASE_MODEL_NAME = "facebook/nllb-200-distilled-600M"

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore the tokenizer and the seq2seq weights from the checkpoint directory,
# move the model to the selected device and switch it to eval mode.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT_PATH).to(device)
model.eval()

print(f"Using device: {device}")
print(f"Model loaded from: {CHECKPOINT_PATH}")

## Prepare Evaluation Dataset

In [None]:
from datasets import load_dataset

# ===== EVALUATION DATASET CONFIGURATION =====
EVAL_DATASET_PATH = "/kaggle/input/bikolano-pangasinan-parallel/Bikolano_Pangasinan_Parallel.tsv"

# Load the Bikolano-Pangasinan parallel TSV; rows are read from the "train"
# split of the object returned by the csv loader.
raw_splits = load_dataset("csv", data_files=EVAL_DATASET_PATH, delimiter="\t")

# Keep only the two language columns and expose them as src (Pangasinan)
# and tgt (Bikolano).
dataset = (
    raw_splits["train"]
    .select_columns(["Pangasinan", "Bikolano"])
    .rename_columns({"Pangasinan": "src", "Bikolano": "tgt"})
)

# Apply the same cleaning function as before, then drop the pairs that
# process() marked as invalid by setting both sides to None.
dataset = dataset.map(process).filter(
    lambda pair: pair["src"] is not None and pair["tgt"] is not None
)

print(f"Total dataset size: {len(dataset)}")

## Translation Function

In [None]:
# ===== TRANSLATION FUNCTION =====
MAX_LENGTH = 128
GENERATION_MAX_LENGTH = 128
NUM_BEAMS = 5

def translate(text, model_tokenizer, translation_model, src_lang=SRC_LANG, tgt_lang=TGT_LANG):
    """Translate ``text`` with beam search and return the decoded string.

    Args:
        text: Source sentence to translate.
        model_tokenizer: Tokenizer used both to encode ``text`` and to decode
            the generated ids.
        translation_model: Model whose ``generate`` method produces the
            output ids.
        src_lang: Source language label.
        tgt_lang: Target language label.

    Returns:
        The first generated sequence, decoded with special tokens removed.

    NOTE(review): ``src_lang`` and ``tgt_lang`` are accepted but never read
    below, so they currently have no effect on the translation - confirm
    whether the checkpoint expects language tags.
    """
    # Encode the input (truncated to MAX_LENGTH tokens) and move the tensors
    # to the notebook-level device.
    encoded = model_tokenizer(
        text,
        return_tensors="pt",
        max_length=MAX_LENGTH,
        truncation=True,
    ).to(device)

    generation_kwargs = {
        "max_length": GENERATION_MAX_LENGTH,
        "num_beams": NUM_BEAMS,
        "early_stopping": True,
    }

    # Gradients are not needed for inference.
    with torch.no_grad():
        generated = translation_model.generate(**encoded, **generation_kwargs)

    return model_tokenizer.decode(generated[0], skip_special_tokens=True)

print("Translation function ready")

## Evaluate Fine-tuned Model

In [None]:
%pip install sacrebleu

In [None]:
import sacrebleu
from tqdm import tqdm
import numpy as np

# ===== EVALUATION CONFIGURATION =====
EVAL_SIZE = 100

# Evaluate on at most EVAL_SIZE rows, taken from the start of the dataset.
eval_size = min(EVAL_SIZE, len(dataset))
eval_dataset = dataset.select(range(eval_size))

predictions = []
references = []

print(f"Generating translations for {eval_size} samples...")
for row in tqdm(eval_dataset, total=eval_size):
    # Keep predictions and references index-aligned: one entry each per row.
    predictions.append(translate(row["src"], tokenizer, model, SRC_LANG, TGT_LANG))
    references.append(row["tgt"])

def calculate_bleu(predictions, references):
    """Calculate the corpus-level BLEU score with sacrebleu.

    Args:
        predictions: Hypothesis strings, one per evaluated segment.
        references: Reference strings, index-aligned with ``predictions``
            (exactly one reference per hypothesis).

    Returns:
        The result object of ``sacrebleu.corpus_bleu``; callers read its
        ``.score`` attribute.

    Raises:
        ValueError: If ``predictions`` and ``references`` differ in length.
    """
    hypotheses = list(predictions)
    reference_stream = list(references)

    if len(hypotheses) != len(reference_stream):
        raise ValueError(
            f"predictions ({len(hypotheses)}) and references "
            f"({len(reference_stream)}) must have the same length"
        )

    # sacrebleu.corpus_bleu takes a list of reference *streams*, where each
    # stream holds one reference per hypothesis. With a single reference per
    # segment that is one stream: [references]. Wrapping every reference in
    # its own list would instead declare N streams of length 1.
    return sacrebleu.corpus_bleu(hypotheses, [reference_stream])

# Score the fine-tuned model's outputs against the references collected in
# the loop above and report the result with four decimal places.
bleu_score = calculate_bleu(predictions, references)
print(f"\nFine-tuned Model BLEU Score: {bleu_score.score:.4f}")

## Compare with Base NLLB Model

In [None]:
# Load base NLLB model for comparison
print("Loading base NLLB model...")
base_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# Place the base model on the same device as the fine-tuned one and put it
# in eval mode before generating.
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_NAME).to(device)
base_model.eval()

print(f"Base model loaded: {BASE_MODEL_NAME}")

def translate_base_model(text, target_lang_code="tgl_Latn", model_tokenizer=None, multilingual_model=None, source_lang_code=None):
    """Translate ``text`` with a multilingual model, forcing the output language.

    Common language codes:
        - tgl_Latn: Tagalog
        - eng_Latn: English
        - spa_Latn: Spanish
        - fra_Latn: French
        - deu_Latn: German
        - cmn_Hans: Mandarin Chinese
        - jpn_Jpan: Japanese

    Args:
        text: Source sentence to translate.
        target_lang_code: Language-code token forced as the first generated
            token.
        model_tokenizer: Tokenizer to use; defaults to ``base_tokenizer``.
        multilingual_model: Model to use; defaults to ``base_model``.
        source_lang_code: Optional language code for the source text. When
            given, it is set as the tokenizer's ``src_lang`` for this call
            only and the previous value is restored afterwards. When ``None``
            the tokenizer's current ``src_lang`` is left untouched.
            NOTE(review): for the stock NLLB tokenizer that default is
            believed to be ``eng_Latn`` - confirm.

    Returns:
        The first generated sequence, decoded with special tokens removed.

    Raises:
        ValueError: If a language code is not a known token of the tokenizer.
    """
    if model_tokenizer is None:
        model_tokenizer = base_tokenizer
    if multilingual_model is None:
        multilingual_model = base_model

    unk_id = getattr(model_tokenizer, "unk_token_id", None)

    def _lang_token_id(lang_code):
        # An unknown code maps to the unk id; forcing that as the first
        # token would silently produce meaningless output, so fail loudly.
        token_id = model_tokenizer.convert_tokens_to_ids(lang_code)
        if token_id is None or (unk_id is not None and token_id == unk_id):
            raise ValueError(f"Unknown language code for this tokenizer: {lang_code!r}")
        return token_id

    # Force the target language
    forced_bos_token_id = _lang_token_id(target_lang_code)

    previous_src_lang = None
    if source_lang_code is not None:
        _lang_token_id(source_lang_code)
        previous_src_lang = model_tokenizer.src_lang
        model_tokenizer.src_lang = source_lang_code
    try:
        inputs = model_tokenizer(text, return_tensors="pt", max_length=MAX_LENGTH, truncation=True).to(device)
    finally:
        # Do not leak the per-call source language into later tokenizer use.
        if source_lang_code is not None:
            model_tokenizer.src_lang = previous_src_lang

    with torch.no_grad():
        outputs = multilingual_model.generate(
            **inputs,
            max_length=GENERATION_MAX_LENGTH,
            num_beams=NUM_BEAMS,
            early_stopping=True,
            forced_bos_token_id=forced_bos_token_id
        )

    translation = model_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation

In [None]:
# Generate predictions with base model (translate_base_model is called with
# its default arguments for every source sentence of the evaluation sample)
print("\nGenerating translations with base model...")
base_predictions = [
    translate_base_model(row["src"])
    for row in tqdm(eval_dataset, total=eval_size, desc="Base model")
]

# Calculate BLEU score for base model against the same references
base_bleu_score = calculate_bleu(base_predictions, references)

# Side-by-side report; the improvement is fine-tuned minus base.
banner = "=" * 80
baseline_bleu = base_bleu_score.score
fine_tuned_bleu = bleu_score.score

print("\n" + banner)
print("BLEU SCORE COMPARISON")
print(banner)
print(f"Base NLLB Model:        {baseline_bleu:.4f}")
print(f"Fine-tuned Model:       {fine_tuned_bleu:.4f}")
print(f"Improvement:            {fine_tuned_bleu - baseline_bleu:+.4f}")
print(banner)

## Translation Comparison Table

In [None]:
import pandas as pd

# One row per evaluated sentence: source, reference and both model outputs.
comparison_columns = {
    "Source (Pangasinan)": [row["src"] for row in eval_dataset],
    "Reference (Bikolano)": references,
    "Base Model Output": base_predictions,
    "Fine-tuned Model Output": predictions,
}
comparison_df = pd.DataFrame(comparison_columns)

# Lift pandas' display limits so long sentences are not truncated.
for display_option in (
    "display.max_columns",
    "display.max_rows",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(display_option, None)

print("FULL TRANSLATION COMPARISON")
# Only the first ten rows are rendered inline; the CSV holds every row.
display(comparison_df.head(10))

comparison_df.to_csv("translation_comparison.csv", index=False)
print("\nComparison saved to: translation_comparison.csv")