# Base NLLB Model Comparison

Evaluate the base NLLB-200-distilled-600M model (without fine-tuning) on Yami → Tagalog and Pangasinan → Tagalog translation.
Results are saved to CSV for comparison with the fine-tuned model.

## Setup and Load Test Data

In [None]:
from pathlib import Path
import pandas as pd
import nltk
import sacrebleu
import regex as re
import torch
import gc
import statistics
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Look for the NLTK "punkt_tab" tokenizer data and download it only when the
# lookup fails.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
def clean_string(input_string):
    """Return a normalised copy of ``input_string`` for translation and scoring.

    The text is trimmed, lowercased, and every character that is neither a
    Unicode letter nor whitespace is deleted. Deleted characters (digits,
    punctuation, symbols) are removed rather than replaced by a space, so
    "a,b" becomes "ab".

    Args:
        input_string: The text to normalise.

    Returns:
        The normalised string.
    """
    lowered = input_string.strip().lower()
    # `re` is the third-party `regex` module in this file, which is what
    # makes the \p{L} (any Unicode letter) class available.
    return re.sub(r"[^\p{L}\s]", "", lowered)

def translate_nllb(text, model, tokenizer, device, max_length=128, tgt_lang="tgl_Latn", src_lang=None):
    """Translate ``text`` into ``tgt_lang`` with a seq2seq model and its tokenizer.

    Generation uses 4-beam search with early stopping, and the first generated
    token is forced to the id of the ``tgt_lang`` token.

    Args:
        text: Source text to translate.
        model: Model exposing ``generate`` (already moved to ``device``).
        tokenizer: Tokenizer matching ``model``.
        device: Device the tokenised inputs are moved to, e.g. "cuda" or "cpu".
        max_length: Limit applied both to the tokenised input (longer inputs
            are truncated) and to the generated output.
        tgt_lang: Language-code token that selects the output language.
        src_lang: Optional language code to set on the tokenizer for this call
            only. When ``None`` (the default) the tokenizer is used exactly as
            configured by the caller.

    Returns:
        The decoded translation with special tokens removed.

    Raises:
        ValueError: If ``tgt_lang`` resolves to the tokenizer's unknown-token
            id, i.e. it is not a language code the tokenizer knows.
    """
    target_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    unknown_token_id = getattr(tokenizer, "unk_token_id", None)
    # An unrecognised code maps to the unknown-token id; forcing that as the
    # first output token would produce meaningless text without any error.
    if unknown_token_id is not None and target_token_id == unknown_token_id:
        raise ValueError(f"Unknown target language code: {tgt_lang!r}")

    # Apply the source-language override only around tokenisation and restore
    # the previous setting afterwards so the shared tokenizer is not left
    # reconfigured for later callers.
    previous_src_lang = getattr(tokenizer, "src_lang", None)
    if src_lang is not None:
        tokenizer.src_lang = src_lang
    try:
        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True).to(device)
    finally:
        if src_lang is not None and previous_src_lang is not None:
            tokenizer.src_lang = previous_src_lang

    # Inference only: no gradients are needed.
    with torch.no_grad():
        translated_tokens = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
            forced_bos_token_id=target_token_id
        )

    # A single text was passed in, so only the first generated sequence is decoded.
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

# Cell-level confirmation that the helper definitions above have been executed.
print("Translation functions ready")

## Load Yami→Tagalog Validation Test Data

In [None]:
# Tab-separated file with 'Yami' and 'Tagalog' columns.
TEST_DATA_PATH = Path("data/validation/yami-tgl.tsv")
print(f"Loading test data from: {TEST_DATA_PATH}")
test_df = pd.read_csv(TEST_DATA_PATH, sep='\t')

# Build one record per usable row, keeping both the original text and the
# clean_string-normalised text for source and reference.
test_data = []
for _, row in test_df.iterrows():
    yami_raw, tagalog_raw = row['Yami'], row['Tagalog']

    # Skip rows where either side is missing.
    if pd.isna(yami_raw) or pd.isna(tagalog_raw):
        continue
    # Skip placeholder rows; the comparison is case-insensitive and is made
    # on the unstripped cell value.
    if '<no verse>' in (str(yami_raw).lower(), str(tagalog_raw).lower()):
        continue

    src_text, ref_text = str(yami_raw).strip(), str(tagalog_raw).strip()
    # Skip rows where either side is empty once surrounding whitespace is removed.
    if not (src_text and ref_text):
        continue

    test_data.append({
        'src': src_text,
        'src_cleaned': clean_string(src_text),
        'ref': ref_text,
        'ref_cleaned': clean_string(ref_text),
    })

print(f"Prepared {len(test_data)} valid test pairs")

## Evaluate Base NLLB on Yami→Tagalog Validation Set (Direct)

In [None]:
# Load the published checkpoint unchanged (no fine-tuning) and put it in
# inference mode. base_model_name and base_tokenizer are reused by later cells.
print("Loading base NLLB model (no fine-tuning)...")
base_model_name = "facebook/nllb-200-distilled-600M"
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name).to(device)
base_model.eval()

# Translate the normalised source side of every pair, printing progress about
# every tenth of the set (every item when there are fewer than ten).
base_translations = []
print(f"Translating {len(test_data)} test pairs with base NLLB model...")
progress_step = max(1, len(test_data) // 10)
for i, item in enumerate(test_data):
    if (i + 1) % progress_step == 0:
        print(f"  Progress: {i+1}/{len(test_data)}")

    translation = translate_nllb(
        item['src_cleaned'],
        base_model,
        base_tokenizer,
        device,
        tgt_lang="tgl_Latn"
    )
    base_translations.append(translation)

print("Base model translation complete")

# Drop the model and release cached GPU memory before scoring.
del base_model
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()

# The references were lowercased and stripped of non-letters by clean_string,
# so the hypotheses must go through the same normalisation; otherwise casing
# and punctuation in the model output count as translation errors.
# base_translations itself keeps the raw model output.
base_hypotheses = [clean_string(hyp) for hyp in base_translations]
references = [item['ref_cleaned'] for item in test_data]

# One scorer serves both corpus- and sentence-level scoring; it is created
# without cached references, so it can be reused across sentences.
bleu_base = sacrebleu.BLEU(smooth_method='exp')
corpus_bleu_base = bleu_base.corpus_score(base_hypotheses, [references])
base_sentence_bleu_scores = [
    bleu_base.sentence_score(hyp, [ref]).score
    for hyp, ref in zip(base_hypotheses, references)
]

print("\n" + "="*70)
print("BASE NLLB MODEL RESULTS (No Fine-tuning) - VALIDATION SET")
print("="*70)
print(f"\nModel: facebook/nllb-200-distilled-600M")
print(f"Target Language: Tagalog (tgl_Latn)")
print(f"Test Pairs: {len(test_data)}")
print(f"\nCorpus BLEU Score: {corpus_bleu_base.score:.4f}")
print(f"Mean Sentence BLEU: {statistics.mean(base_sentence_bleu_scores):.4f}")
print(f"Median Sentence BLEU: {statistics.median(base_sentence_bleu_scores):.4f}")
print(f"\nBLEU Breakdown:")
# The "BLEU-n" lines show the corpus-level n-gram precisions for n = 1..4.
print(f"  BLEU-1: {corpus_bleu_base.precisions[0]:.4f}")
print(f"  BLEU-2: {corpus_bleu_base.precisions[1]:.4f}")
print(f"  BLEU-3: {corpus_bleu_base.precisions[2]:.4f}")
print(f"  BLEU-4: {corpus_bleu_base.precisions[3]:.4f}")
print(f"  Brevity Penalty: {corpus_bleu_base.bp:.4f}")
print("="*70)

## Load Yami→Tagalog Bible Dataset

In [None]:
# Tab-separated parallel corpus with 'Yami' and 'Tagalog' columns.
bible_dataset_path = Path("data/dataset/Tagalog_Yami_Parallel.tsv")
print(f"Loading bible dataset from: {bible_dataset_path}")
bible_df = pd.read_csv(bible_dataset_path, sep='\t')

# Keep rows where neither side is exactly the '<no verse>' placeholder, then
# draw a reproducible sample of at most 200 rows. Missing values and other
# spellings of the placeholder are only filtered in the loop below, after
# sampling, so fewer than sample_size pairs may remain.
yami_present = bible_df['Yami'] != '<no verse>'
tagalog_present = bible_df['Tagalog'] != '<no verse>'
bible_with_yami = bible_df[yami_present & tagalog_present].copy()
sample_size = min(200, len(bible_with_yami))
bible_sample = bible_with_yami.sample(n=sample_size, random_state=42)

# Build one record per usable sampled row, keeping both the original text and
# the clean_string-normalised text for source and reference.
bible_test_data = []
for _, row in bible_sample.iterrows():
    yami_raw, tagalog_raw = row['Yami'], row['Tagalog']

    # Skip rows where either side is missing.
    if pd.isna(yami_raw) or pd.isna(tagalog_raw):
        continue
    # Skip placeholder rows (case-insensitive, on the unstripped cell value).
    if '<no verse>' in (str(yami_raw).lower(), str(tagalog_raw).lower()):
        continue

    src_text, ref_text = str(yami_raw).strip(), str(tagalog_raw).strip()
    # Skip rows where either side is empty once surrounding whitespace is removed.
    if not (src_text and ref_text):
        continue

    bible_test_data.append({
        'src': src_text,
        'src_cleaned': clean_string(src_text),
        'ref': ref_text,
        'ref_cleaned': clean_string(ref_text),
    })

print(f"Prepared {len(bible_test_data)} valid bible verse pairs")

## Evaluate Base NLLB on Yami→Tagalog Bible Dataset (Direct)

In [None]:
# Reload the base checkpoint (the previous evaluation cell deleted it) and put
# it in inference mode. base_model_name and base_tokenizer come from that cell.
print("Loading base NLLB model for Bible dataset evaluation...")
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name).to(device)
base_model.eval()

# Translate the normalised source side of every sampled verse, printing
# progress about every tenth of the set.
base_bible_translations = []
print(f"Translating {len(bible_test_data)} Bible verses with base NLLB model...")
progress_step = max(1, len(bible_test_data) // 10)
for i, item in enumerate(bible_test_data):
    if (i + 1) % progress_step == 0:
        print(f"  Progress: {i+1}/{len(bible_test_data)}")

    translation = translate_nllb(
        item['src_cleaned'],
        base_model,
        base_tokenizer,
        device,
        tgt_lang="tgl_Latn"
    )
    base_bible_translations.append(translation)

print("Base model Bible translation complete")

# Drop the model and release cached GPU memory before scoring.
del base_model
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()

# The references were lowercased and stripped of non-letters by clean_string,
# so the hypotheses must go through the same normalisation; otherwise casing
# and punctuation in the model output count as translation errors.
# base_bible_translations itself keeps the raw model output.
base_bible_hypotheses = [clean_string(hyp) for hyp in base_bible_translations]
bible_references = [item['ref_cleaned'] for item in bible_test_data]

# One scorer serves both corpus- and sentence-level scoring; it is created
# without cached references, so it can be reused across sentences.
bleu_base_bible = sacrebleu.BLEU(smooth_method='exp')
corpus_bleu_base_bible = bleu_base_bible.corpus_score(base_bible_hypotheses, [bible_references])
base_bible_sentence_bleu_scores = [
    bleu_base_bible.sentence_score(hyp, [ref]).score
    for hyp, ref in zip(base_bible_hypotheses, bible_references)
]

print("\n" + "="*70)
print("BASE NLLB MODEL RESULTS (No Fine-tuning) - BIBLE DATASET")
print("="*70)
print(f"\nModel: facebook/nllb-200-distilled-600M")
print(f"Target Language: Tagalog (tgl_Latn)")
print(f"Test Pairs: {len(bible_test_data)}")
print(f"\nCorpus BLEU Score: {corpus_bleu_base_bible.score:.4f}")
print(f"Mean Sentence BLEU: {statistics.mean(base_bible_sentence_bleu_scores):.4f}")
print(f"Median Sentence BLEU: {statistics.median(base_bible_sentence_bleu_scores):.4f}")
print(f"\nBLEU Breakdown:")
# The "BLEU-n" lines show the corpus-level n-gram precisions for n = 1..4.
print(f"  BLEU-1: {corpus_bleu_base_bible.precisions[0]:.4f}")
print(f"  BLEU-2: {corpus_bleu_base_bible.precisions[1]:.4f}")
print(f"  BLEU-3: {corpus_bleu_base_bible.precisions[2]:.4f}")
print(f"  BLEU-4: {corpus_bleu_base_bible.precisions[3]:.4f}")
print(f"  Brevity Penalty: {corpus_bleu_base_bible.bp:.4f}")
print("="*70)

## Load Pangasinan→Tagalog Bible Dataset

In [None]:
# Tab-separated parallel corpus with 'Pangasinan' and 'Tagalog' columns.
pangasinan_dataset_path = Path("data/dataset/Tagalog_Pangasinan_Parallel.tsv")
print(f"Loading Pangasinan-Tagalog dataset from: {pangasinan_dataset_path}")
pangasinan_df = pd.read_csv(pangasinan_dataset_path, sep='\t')

# Keep rows where neither side is exactly the '<no verse>' placeholder, then
# draw a reproducible sample of at most 200 rows. Missing values and other
# spellings of the placeholder are only filtered in the loop below, after
# sampling, so fewer than sample_size_pang pairs may remain.
pangasinan_present = pangasinan_df['Pangasinan'] != '<no verse>'
tagalog_present = pangasinan_df['Tagalog'] != '<no verse>'
pangasinan_with_data = pangasinan_df[pangasinan_present & tagalog_present].copy()
sample_size_pang = min(200, len(pangasinan_with_data))
pangasinan_sample = pangasinan_with_data.sample(n=sample_size_pang, random_state=42)

# Build one record per usable sampled row, keeping both the original text and
# the clean_string-normalised text for source and reference.
pangasinan_test_data = []
for _, row in pangasinan_sample.iterrows():
    pangasinan_raw, tagalog_raw = row['Pangasinan'], row['Tagalog']

    # Skip rows where either side is missing.
    if pd.isna(pangasinan_raw) or pd.isna(tagalog_raw):
        continue
    # Skip placeholder rows (case-insensitive, on the unstripped cell value).
    if '<no verse>' in (str(pangasinan_raw).lower(), str(tagalog_raw).lower()):
        continue

    src_text, ref_text = str(pangasinan_raw).strip(), str(tagalog_raw).strip()
    # Skip rows where either side is empty once surrounding whitespace is removed.
    if not (src_text and ref_text):
        continue

    pangasinan_test_data.append({
        'src': src_text,
        'src_cleaned': clean_string(src_text),
        'ref': ref_text,
        'ref_cleaned': clean_string(ref_text),
    })

print(f"Prepared {len(pangasinan_test_data)} valid Pangasinan-Tagalog pairs")


## Evaluate Base NLLB on Pangasinan→Tagalog Bible Dataset (Direct Translation)

In [None]:
# Reload the base checkpoint (the previous evaluation cell deleted it) and put
# it in inference mode. base_model_name and base_tokenizer come from the first
# evaluation cell.
print("Loading base NLLB model for Pangasinan-Tagalog evaluation...")
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name).to(device)
base_model.eval()

# NOTE(review): nothing in this file sets a source-language code on
# base_tokenizer, so the input is tagged with the tokenizer's default.
# NLLB-200 appears to list Pangasinan as "pag_Latn" - confirm whether the
# baseline should set base_tokenizer.src_lang before translating.

# Translate the normalised source side of every sampled pair, printing
# progress about every tenth of the set.
pangasinan_translations = []
print(f"Translating {len(pangasinan_test_data)} Pangasinan sentences with base NLLB model...")
progress_step = max(1, len(pangasinan_test_data) // 10)
for i, item in enumerate(pangasinan_test_data):
    if (i + 1) % progress_step == 0:
        print(f"  Progress: {i+1}/{len(pangasinan_test_data)}")

    translation = translate_nllb(
        item['src_cleaned'],
        base_model,
        base_tokenizer,
        device,
        tgt_lang="tgl_Latn"
    )
    pangasinan_translations.append(translation)

print("Base model Pangasinan-Tagalog translation complete")

# Drop the model and release cached GPU memory before scoring.
del base_model
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()

# The references were lowercased and stripped of non-letters by clean_string,
# so the hypotheses must go through the same normalisation; otherwise casing
# and punctuation in the model output count as translation errors.
# pangasinan_translations itself keeps the raw model output.
pangasinan_hypotheses = [clean_string(hyp) for hyp in pangasinan_translations]
pangasinan_references = [item['ref_cleaned'] for item in pangasinan_test_data]

# One scorer serves both corpus- and sentence-level scoring; it is created
# without cached references, so it can be reused across sentences.
bleu_pangasinan = sacrebleu.BLEU(smooth_method='exp')
corpus_bleu_pangasinan = bleu_pangasinan.corpus_score(pangasinan_hypotheses, [pangasinan_references])
pangasinan_sentence_bleu_scores = [
    bleu_pangasinan.sentence_score(hyp, [ref]).score
    for hyp, ref in zip(pangasinan_hypotheses, pangasinan_references)
]

print("\n" + "="*70)
print("BASE NLLB MODEL RESULTS - PANGASINAN→TAGALOG BIBLE (Direct Translation)")
print("="*70)
print(f"\nModel: facebook/nllb-200-distilled-600M")
print(f"Source Language: Pangasinan")
print(f"Target Language: Tagalog (tgl_Latn)")
print(f"Test Pairs: {len(pangasinan_test_data)}")
print(f"\nCorpus BLEU Score: {corpus_bleu_pangasinan.score:.4f}")
print(f"Mean Sentence BLEU: {statistics.mean(pangasinan_sentence_bleu_scores):.4f}")
print(f"Median Sentence BLEU: {statistics.median(pangasinan_sentence_bleu_scores):.4f}")
print(f"\nBLEU Breakdown:")
# The "BLEU-n" lines show the corpus-level n-gram precisions for n = 1..4.
print(f"  BLEU-1: {corpus_bleu_pangasinan.precisions[0]:.4f}")
print(f"  BLEU-2: {corpus_bleu_pangasinan.precisions[1]:.4f}")
print(f"  BLEU-3: {corpus_bleu_pangasinan.precisions[2]:.4f}")
print(f"  BLEU-4: {corpus_bleu_pangasinan.precisions[3]:.4f}")
print(f"  Brevity Penalty: {corpus_bleu_pangasinan.bp:.4f}")
print("="*70)


## Load Pangasinan Validation Set

In [None]:
# Tab-separated file with 'Pangasinan' and 'Tagalog' columns.
PNG_VALIDATION_PATH = Path("data/validation/png-tgl.tsv")
print(f"Loading Pangasinan validation data from: {PNG_VALIDATION_PATH}")
png_val_df = pd.read_csv(PNG_VALIDATION_PATH, sep='\t')

# Build one record per usable row, keeping both the original text and the
# clean_string-normalised text for source and reference. Unlike the other
# loading cells, no '<no verse>' placeholder check is applied here.
png_val_data = []
for _, row in png_val_df.iterrows():
    pangasinan_raw, tagalog_raw = row['Pangasinan'], row['Tagalog']

    # Skip rows where either side is missing.
    if pd.isna(pangasinan_raw) or pd.isna(tagalog_raw):
        continue

    src_text, ref_text = str(pangasinan_raw).strip(), str(tagalog_raw).strip()
    # Skip rows where either side is empty once surrounding whitespace is removed.
    if not (src_text and ref_text):
        continue

    png_val_data.append({
        'src': src_text,
        'src_cleaned': clean_string(src_text),
        'ref': ref_text,
        'ref_cleaned': clean_string(ref_text),
    })

print(f"Prepared {len(png_val_data)} valid Pangasinan validation pairs")


## Evaluate Base NLLB on Pangasinan→Tagalog Validation Set (Direct)

In [None]:
# Reload the base checkpoint (the previous evaluation cell deleted it) and put
# it in inference mode. base_model_name and base_tokenizer come from the first
# evaluation cell.
print("Loading base NLLB model for Pangasinan validation evaluation...")
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name).to(device)
base_model.eval()

# NOTE(review): nothing in this file sets a source-language code on
# base_tokenizer, so the input is tagged with the tokenizer's default.
# NLLB-200 appears to list Pangasinan as "pag_Latn" - confirm whether the
# baseline should set base_tokenizer.src_lang before translating.

# Translate the normalised source side of every validation pair, printing
# progress about every fifth of the set.
png_val_translations = []
print(f"Translating {len(png_val_data)} Pangasinan validation sentences with base NLLB model...")
progress_step = max(1, len(png_val_data) // 5)
for i, item in enumerate(png_val_data):
    if (i + 1) % progress_step == 0:
        print(f"  Progress: {i+1}/{len(png_val_data)}")

    translation = translate_nllb(
        item['src_cleaned'],
        base_model,
        base_tokenizer,
        device,
        tgt_lang="tgl_Latn"
    )
    png_val_translations.append(translation)

print("Base model Pangasinan validation translation complete")

# Drop the model and release cached GPU memory before scoring.
del base_model
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()

# The references were lowercased and stripped of non-letters by clean_string,
# so the hypotheses must go through the same normalisation; otherwise casing
# and punctuation in the model output count as translation errors.
# png_val_translations itself keeps the raw model output.
png_val_hypotheses = [clean_string(hyp) for hyp in png_val_translations]
png_val_references = [item['ref_cleaned'] for item in png_val_data]

# One scorer serves both corpus- and sentence-level scoring; it is created
# without cached references, so it can be reused across sentences.
bleu_png_val = sacrebleu.BLEU(smooth_method='exp')
corpus_bleu_png_val = bleu_png_val.corpus_score(png_val_hypotheses, [png_val_references])
png_val_sentence_bleu_scores = [
    bleu_png_val.sentence_score(hyp, [ref]).score
    for hyp, ref in zip(png_val_hypotheses, png_val_references)
]

print("\n" + "="*70)
print("BASE NLLB MODEL RESULTS - PANGASINAN→TAGALOG VALIDATION (Direct)")
print("="*70)
print(f"\nModel: facebook/nllb-200-distilled-600M")
print(f"Source Language: Pangasinan")
print(f"Target Language: Tagalog (tgl_Latn)")
print(f"Test Pairs: {len(png_val_data)}")
print(f"\nCorpus BLEU Score: {corpus_bleu_png_val.score:.4f}")
print(f"Mean Sentence BLEU: {statistics.mean(png_val_sentence_bleu_scores):.4f}")
print(f"Median Sentence BLEU: {statistics.median(png_val_sentence_bleu_scores):.4f}")
print(f"\nBLEU Breakdown:")
# The "BLEU-n" lines show the corpus-level n-gram precisions for n = 1..4.
print(f"  BLEU-1: {corpus_bleu_png_val.precisions[0]:.4f}")
print(f"  BLEU-2: {corpus_bleu_png_val.precisions[1]:.4f}")
print(f"  BLEU-3: {corpus_bleu_png_val.precisions[2]:.4f}")
print(f"  BLEU-4: {corpus_bleu_png_val.precisions[3]:.4f}")
print(f"  Brevity Penalty: {corpus_bleu_png_val.bp:.4f}")
print("="*70)


## Save Comparison Results

In [None]:
# One entry per evaluation run, in the row order of the output table:
# (source language, dataset label, corpus-level BLEU result, sentence-level scores).
evaluation_runs = [
    ('Yami', 'Yami Validation Set', corpus_bleu_base, base_sentence_bleu_scores),
    ('Yami', 'Yami Bible Dataset', corpus_bleu_base_bible, base_bible_sentence_bleu_scores),
    ('Pangasinan', 'Pangasinan Bible Dataset', corpus_bleu_pangasinan, pangasinan_sentence_bleu_scores),
    ('Pangasinan', 'Pangasinan Validation Set', corpus_bleu_png_val, png_val_sentence_bleu_scores),
]

# Column-oriented table; the key order below is the column order of the CSV.
comparison_data = {
    'Model': ['Base NLLB (No Fine-tuning)'] * len(evaluation_runs),
    'Source Language': [source for source, _, _, _ in evaluation_runs],
    'Target Language': ['Tagalog (tgl_Latn)'] * len(evaluation_runs),
    'Dataset': [dataset for _, dataset, _, _ in evaluation_runs],
    'Corpus BLEU': [corpus.score for _, _, corpus, _ in evaluation_runs],
    'Mean BLEU': [statistics.mean(scores) for _, _, _, scores in evaluation_runs],
    # BLEU-1 .. BLEU-4 hold the n-gram precisions of each corpus-level result.
    **{
        f'BLEU-{order}': [corpus.precisions[order - 1] for _, _, corpus, _ in evaluation_runs]
        for order in range(1, 5)
    },
    'Brevity Penalty': [corpus.bp for _, _, corpus, _ in evaluation_runs],
}

# Write the table to disk, creating the results directory if it does not exist.
comparison_df = pd.DataFrame(comparison_data)
comparison_path = Path('results/base_nllb_evaluation.csv')
comparison_path.parent.mkdir(parents=True, exist_ok=True)
comparison_df.to_csv(comparison_path, index=False)

banner = "="*70
print("\n" + banner)
print("BASE NLLB RESULTS SUMMARY")
print(banner)
print("\n" + comparison_df.to_string(index=False))
print(f"\nResults saved to: {comparison_path}")
print(banner)
