# Direct Yami to Tagalog Translation Evaluation

Evaluate the fine-tuned NLLB model for direct Yami → Tagalog translation using BLEU scores and analysis.

In [4]:
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Setup and Load Test Data

In [5]:
from pathlib import Path
import pandas as pd
import nltk
import sacrebleu

try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

TEST_DATA_PATH = Path("data/validation/yami-tgl.tsv")

print(f"Loading test data from: {TEST_DATA_PATH}")

test_df = pd.read_csv(TEST_DATA_PATH, sep='\t')

print(f"Loaded {len(test_df)} test pairs")
print(f"\nColumns: {test_df.columns.tolist()}")
print(f"\nFirst 3 rows:")
print(test_df.head(3))

Loading test data from: data/validation/yami-tgl.tsv
Loaded 20 test pairs

Columns: ['Yami', 'Tagalog']

First 3 rows:
                                                Yami  \
0  Ori ya kanakan ira do apheshepen, ya mamizing ...   
1  Siciaraw ya mipangay sira do vahay, ta ya masá...   
2  Sira kaka ira ya mivanoa do kahasan, ta somivo...   

                                             Tagalog  
0  Naroon ang mga bata sa tabing-dagat, at nakiki...  
1  Ngayong araw ay pupunta sila sa bahay, dahil m...  
2  Ang magkapatid ay pumunta sa kagubatan, dahil ...  


## Load Trained Model

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import gc

model_yami_tgl_path = Path("data/models/nllb-yami-tgl")

try:
    tokenizer = AutoTokenizer.from_pretrained(model_yami_tgl_path)
    print(f"Loaded Yami → Tagalog tokenizer from {model_yami_tgl_path}")
except Exception as e:
    print(f" Error loading tokenizer: {e}")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nUsing device: {device}")
print("Note: Model will be loaded when needed and unloaded after translation")

Loaded Yami → Tagalog tokenizer from data/models/nllb-yami-tgl

Using device: cuda
Note: Model will be loaded when needed and unloaded after translation


## Translation Functions

In [None]:
import regex as re

def clean_string(input_string):
    cleaned = re.sub(r"[^\p{L}\s]", "", input_string.strip().lower())
    return cleaned

def translate_nllb(text, model, tokenizer, device, max_length=128, tgt_lang="tgl_Latn"):
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True).to(device)
    
    with torch.no_grad():
        translated_tokens = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang)
        )
    
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

print("Translation functions ready (NLLB-based with explicit target language)")

Translation functions ready (NLLB-based with explicit target language)


## Perform Translation

In [13]:
test_data = []
for idx, row in test_df.iterrows():
    src = row['Yami']
    ref = row['Tagalog']
    
    if pd.isna(src) or pd.isna(ref):
        continue
    if str(src).lower() == '<no verse>' or str(ref).lower() == '<no verse>':
        continue
    
    src_text = str(src).strip()
    ref_text = str(ref).strip()
    
    if src_text and ref_text:
        src_cleaned = clean_string(src_text)
        ref_cleaned = clean_string(ref_text)
        
        test_data.append({
            'src': src_text,
            'src_cleaned': src_cleaned,
            'ref': ref_text,
            'ref_cleaned': ref_cleaned
        })

print(f"Prepared {len(test_data)} valid test pairs\n")

if len(test_data) > 0:
    sample = test_data[0]
    print("Sample data (before and after cleaning):")
    print(f"  Source (original):  {sample['src']}")
    print(f"  Source (cleaned):   {sample['src_cleaned']}")
    print(f"  Reference (orig):   {sample['ref']}")
    print(f"  Reference (clean):  {sample['ref_cleaned']}")
    print()

print("Loading Yami → Tagalog model...")
model = AutoModelForSeq2SeqLM.from_pretrained(model_yami_tgl_path).to(device)
model.eval()

translations = []
print("Translating Yami text to Tagalog...")
for i, item in enumerate(test_data):
    if (i + 1) % max(1, len(test_data) // 10) == 0:
        print(f"  Progress: {i+1}/{len(test_data)}")
    
    translation = translate_nllb(
        item['src_cleaned'],
        model,
        tokenizer,
        device
    )
    translations.append(translation)

print("Translation complete")

del model
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()

references = [item['ref_cleaned'] for item in test_data]

print(f"Translated all {len(test_data)} test pairs")

Prepared 20 valid test pairs

Sample data (before and after cleaning):
  Source (original):  Ori ya kanakan ira do apheshepen, ya mamizing sira do ciriciring ni ama ira.
  Source (cleaned):   ori ya kanakan ira do apheshepen ya mamizing sira do ciriciring ni ama ira
  Reference (orig):   Naroon ang mga bata sa tabing-dagat, at nakikinig sila sa mga kuwento ng kanilang ama.
  Reference (clean):  naroon ang mga bata sa tabingdagat at nakikinig sila sa mga kuwento ng kanilang ama

Loading Yami → Tagalog model...


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 3.68 GiB of which 17.88 MiB is free. Process 2116 has 46.04 MiB memory in use. Process 122722 has 2.38 GiB memory in use. Including non-PyTorch memory, this process has 1.19 GiB memory in use. Of the allocated memory 1.11 GiB is allocated by PyTorch, and 15.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Calculate BLEU Scores with SacreBLEU

In [None]:
bleu = sacrebleu.BLEU(smooth_method='exp')
corpus_bleu_result = bleu.corpus_score(translations, [references])

print("="*70)
print("BLEU SCORE EVALUATION (using SacreBLEU with smooth method)")
print("="*70)
print(f"\nDirect Translation: Yami → Tagalog")
print(f"Test Pairs: {len(test_data)}")
print(f"\nCorpus BLEU Score: {corpus_bleu_result.score:.4f}")
print(f"\nBLEU Breakdown:")
print(f"  BLEU-1: {corpus_bleu_result.precisions[0]:.4f}")
print(f"  BLEU-2: {corpus_bleu_result.precisions[1]:.4f}")
print(f"  BLEU-3: {corpus_bleu_result.precisions[2]:.4f}")
print(f"  BLEU-4: {corpus_bleu_result.precisions[3]:.4f}")
print(f"  Brevity Penalty: {corpus_bleu_result.bp:.4f}")
print(f"  Ratio: {corpus_bleu_result.ratio:.4f}")
print("="*70)

BLEU SCORE EVALUATION (using SacreBLEU with smooth method)

Direct Translation: Yami → Tagalog
Test Pairs: 20

Corpus BLEU Score: 2.3511

BLEU Breakdown:
  BLEU-1: 25.8555
  BLEU-2: 5.3498
  BLEU-3: 0.8969
  BLEU-4: 0.2463
  Brevity Penalty: 1.0000
  Ratio: 1.0958


## Sentence-level BLEU Scores

In [None]:
sentence_bleu_scores = []
bleu_sent = sacrebleu.BLEU(smooth_method='exp')

for hyp, ref in zip(translations, references):
    score = bleu_sent.sentence_score(hyp, [ref])
    sentence_bleu_scores.append(score.score)

import statistics

print(f"\nSentence-level BLEU Statistics:")
print(f"  Mean: {statistics.mean(sentence_bleu_scores):.4f}")
print(f"  Median: {statistics.median(sentence_bleu_scores):.4f}")
print(f"  Min: {min(sentence_bleu_scores):.4f}")
print(f"  Max: {max(sentence_bleu_scores):.4f}")
print(f"  Stdev: {statistics.stdev(sentence_bleu_scores) if len(sentence_bleu_scores) > 1 else 0:.4f}")

It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is 


Sentence-level BLEU Statistics:
  Mean: 5.8927
  Median: 4.7134
  Min: 2.4427
  Max: 14.4737
  Stdev: 3.1114


## Translation Examples

In [None]:
comparison_df = pd.DataFrame({
    'Source (Yami)': [item['src'] for item in test_data],
    'Source (Cleaned)': [item['src_cleaned'] for item in test_data],
    'Translation (Tagalog)': translations,
    'Reference (Tagalog)': [item['ref'] for item in test_data],
    'Reference (Cleaned)': [item['ref_cleaned'] for item in test_data],
    'BLEU': sentence_bleu_scores
})

comparison_df_sorted = comparison_df.sort_values('BLEU')

print("\nWORST PERFORMING TRANSLATIONS (lowest BLEU):\n")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 80)
print(comparison_df_sorted.head(5).to_string())

print("\n" + "="*70)
print("\nBEST PERFORMING TRANSLATIONS (highest BLEU):\n")
print(comparison_df_sorted.tail(5).to_string())


WORST PERFORMING TRANSLATIONS (lowest BLEU):

                                                                Source (Yami)                                                         Source (Cleaned)                                                                                                  Translation (Tagalog)                                                                            Reference (Tagalog)                                                                          Reference (Cleaned)      BLEU
6             Miangay sira do aHarang, ta rareng a mialalam ya sira do vahay.            miangay sira do aharang ta rareng a mialalam ya sira do vahay  at ang bawat isa sa kanila ay tumakas sa kanyang sariling bahay at ang bawat isa ay tumakas sa kanyang sariling bahay  Pumunta sila sa pampang, at naglaro sila malapit sa lugar kung saan nakadaong ang mga bangka.  pumunta sila sa pampang at naglaro sila malapit sa lugar kung saan nakadaong ang mga bangka  2.442663
2      Sira kaka 

## Save Results

In [None]:
output_path = Path("results/direct_yami_tgl_evaluation.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

comparison_df.to_csv(output_path, index=False)
print(f"Full results saved to: {output_path}")

summary = {
    'Pipeline': 'Direct: Yami → Tagalog',
    'Test Pairs': len(test_data),
    'Corpus BLEU': f"{corpus_bleu_result.score:.4f}",
    'Mean Sentence BLEU': f"{statistics.mean(sentence_bleu_scores):.4f}",
    'Median Sentence BLEU': f"{statistics.median(sentence_bleu_scores):.4f}",
    'Min BLEU': f"{min(sentence_bleu_scores):.4f}",
    'Max BLEU': f"{max(sentence_bleu_scores):.4f}",
    'Stdev': f"{statistics.stdev(sentence_bleu_scores) if len(sentence_bleu_scores) > 1 else 0:.4f}",
    'BLEU-1': f"{corpus_bleu_result.precisions[0]:.4f}",
    'BLEU-2': f"{corpus_bleu_result.precisions[1]:.4f}",
    'BLEU-3': f"{corpus_bleu_result.precisions[2]:.4f}",
    'BLEU-4': f"{corpus_bleu_result.precisions[3]:.4f}",
    'Brevity Penalty': f"{corpus_bleu_result.bp:.4f}"
}

summary_df = pd.DataFrame([summary])
summary_path = Path("results/direct_yami_tgl_summary.csv")
summary_df.to_csv(summary_path, index=False)
print(f"Summary saved to: {summary_path}")

print(f"\nEvaluation complete!")

Full results saved to: results/direct_yami_tgl_evaluation.csv
Summary saved to: results/direct_yami_tgl_summary.csv

Evaluation complete!


## Evaluation on Original Bible Dataset (Tagalog_Yami_Parallel.tsv)

In [None]:
bible_dataset_path = Path("data/dataset/Tagalog_Yami_Parallel.tsv")

print(f"Loading original bible dataset from: {bible_dataset_path}")

bible_df = pd.read_csv(bible_dataset_path, sep='\t')

print(f"Loaded {len(bible_df)} total verse pairs from bible dataset")
print(f"\nColumns: {bible_df.columns.tolist()}")
print(f"Unique books: {bible_df['Book'].nunique()}")
print(f"\nDataset info:")
print(f"  - Total verses: {len(bible_df)}")
print(f"  - Verses with Yami translation: {(bible_df['Yami'] != '<no verse>').sum()}")
print(f"  - Verses without Yami translation: {(bible_df['Yami'] == '<no verse>').sum()}")

Loading original bible dataset from: data/dataset/Tagalog_Yami_Parallel.tsv
Loaded 34618 total verse pairs from bible dataset

Columns: ['Book', 'Chapter', 'Verse', 'Tagalog', 'Yami']
Unique books: 64

Dataset info:
  - Total verses: 34618
Loaded 34618 total verse pairs from bible dataset

Columns: ['Book', 'Chapter', 'Verse', 'Tagalog', 'Yami']
Unique books: 64

Dataset info:
  - Total verses: 34618
  - Verses with Yami translation: 11666
  - Verses without Yami translation: 22952
  - Verses with Yami translation: 11666
  - Verses without Yami translation: 22952


### Prepare Bible Dataset Sample

In [None]:
bible_with_yami = bible_df[(bible_df['Yami'] != '<no verse>') & (bible_df['Tagalog'] != '<no verse>')].copy()

print(f"Verses with both Tagalog and Yami: {len(bible_with_yami)}")

sample_size = min(200, len(bible_with_yami))
bible_sample = bible_with_yami.sample(n=sample_size, random_state=42)

print(f"Using sample of {len(bible_sample)} verses for evaluation")
print(f"Sample includes verses from books: {bible_sample['Book'].unique()[:5].tolist()}...")

bible_test_data = []
for idx, row in bible_sample.iterrows():
    src = row['Yami']
    ref = row['Tagalog']
    
    if pd.isna(src) or pd.isna(ref):
        continue
    if str(src).lower() == '<no verse>' or str(ref).lower() == '<no verse>':
        continue
    
    src_text = str(src).strip()
    ref_text = str(ref).strip()
    
    if src_text and ref_text:
        src_cleaned = clean_string(src_text)
        ref_cleaned = clean_string(ref_text)
        
        bible_test_data.append({
            'book': row['Book'],
            'chapter': row['Chapter'],
            'verse': row['Verse'],
            'src': src_text,
            'src_cleaned': src_cleaned,
            'ref': ref_text,
            'ref_cleaned': ref_cleaned
        })

print(f"\nPrepared {len(bible_test_data)} valid bible verse pairs\n")

Verses with both Tagalog and Yami: 11665
Using sample of 200 verses for evaluation
Sample includes verses from books: ['John', 'Acts', 'Romans', 'Luke', 'Mark']...

Prepared 200 valid bible verse pairs

Using sample of 200 verses for evaluation
Sample includes verses from books: ['John', 'Acts', 'Romans', 'Luke', 'Mark']...

Prepared 200 valid bible verse pairs



### Translate Bible Sample

In [None]:
print("Loading Yami → Tagalog model for bible dataset evaluation...")
model = AutoModelForSeq2SeqLM.from_pretrained(model_yami_tgl_path).to(device)
model.eval()

bible_translations = []
print(f"Translating {len(bible_test_data)} bible verses...")
for i, item in enumerate(bible_test_data):
    if (i + 1) % max(1, len(bible_test_data) // 10) == 0:
        print(f"  Progress: {i+1}/{len(bible_test_data)}")
    
    translation = translate_nllb(
        item['src_cleaned'],
        model,
        tokenizer,
        device
    )
    bible_translations.append(translation)

print("Bible dataset translation complete")

del model
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()

bible_references = [item['ref_cleaned'] for item in bible_test_data]

print(f"Translated all {len(bible_test_data)} bible verses")

Loading Yami → Tagalog model for bible dataset evaluation...
Translating 200 bible verses...
Translating 200 bible verses...
  Progress: 20/200
  Progress: 20/200
  Progress: 40/200
  Progress: 40/200
  Progress: 60/200
  Progress: 60/200
  Progress: 80/200
  Progress: 80/200
  Progress: 100/200
  Progress: 100/200
  Progress: 120/200
  Progress: 120/200
  Progress: 140/200
  Progress: 140/200
  Progress: 160/200
  Progress: 160/200
  Progress: 180/200
  Progress: 180/200
  Progress: 200/200
  Progress: 200/200
Bible dataset translation complete
Translated all 200 bible verses
Bible dataset translation complete
Translated all 200 bible verses


### Evaluate Bible Sample with BLEU Scores

In [None]:
bleu_bible = sacrebleu.BLEU(smooth_method='exp')
corpus_bleu_bible = bleu_bible.corpus_score(bible_translations, [bible_references])

print("="*70)
print("BLEU SCORE EVALUATION - BIBLE DATASET (using SacreBLEU with smooth method)")
print("="*70)
print(f"\nDirect Translation: Yami → Tagalog (from Bible Dataset)")
print(f"Test Pairs: {len(bible_test_data)}")
print(f"\nCorpus BLEU Score: {corpus_bleu_bible.score:.4f}")
print(f"\nBLEU Breakdown:")
print(f"  BLEU-1: {corpus_bleu_bible.precisions[0]:.4f}")
print(f"  BLEU-2: {corpus_bleu_bible.precisions[1]:.4f}")
print(f"  BLEU-3: {corpus_bleu_bible.precisions[2]:.4f}")
print(f"  BLEU-4: {corpus_bleu_bible.precisions[3]:.4f}")
print(f"  Brevity Penalty: {corpus_bleu_bible.bp:.4f}")
print(f"  Ratio: {corpus_bleu_bible.ratio:.4f}")
print("="*70)

BLEU SCORE EVALUATION - BIBLE DATASET (using SacreBLEU with smooth method)

Direct Translation: Yami → Tagalog (from Bible Dataset)
Test Pairs: 200

Corpus BLEU Score: 32.1611

BLEU Breakdown:
  BLEU-1: 62.0690
  BLEU-2: 38.7779
  BLEU-3: 27.4260
  BLEU-4: 19.7318
  Brevity Penalty: 0.9520
  Ratio: 0.9531


### Bible Dataset Examples

In [None]:
bible_comparison_df = pd.DataFrame({
    'Book': [item['book'] for item in bible_test_data],
    'Chapter': [item['chapter'] for item in bible_test_data],
    'Verse': [item['verse'] for item in bible_test_data],
    'Source (Yami)': [item['src'] for item in bible_test_data],
    'Source (Cleaned)': [item['src_cleaned'] for item in bible_test_data],
    'Translation (Tagalog)': bible_translations,
    'Reference (Tagalog)': [item['ref'] for item in bible_test_data],
    'Reference (Cleaned)': [item['ref_cleaned'] for item in bible_test_data],
    'BLEU': bible_sentence_bleu_scores
})

bible_comparison_df_sorted = bible_comparison_df.sort_values('BLEU')

print("\nWORST PERFORMING BIBLE VERSE TRANSLATIONS (lowest BLEU):\n")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 80)
print(bible_comparison_df_sorted.head(5).to_string())

print("\n" + "="*70)
print("\nBEST PERFORMING BIBLE VERSE TRANSLATIONS (highest BLEU):\n")
print(bible_comparison_df_sorted.tail(5).to_string())


WORST PERFORMING BIBLE VERSE TRANSLATIONS (lowest BLEU):

           Book  Chapter Verse                                                                                                                                                                                                                                        Source (Yami)                                                                                                                                                                                                                                Source (Cleaned)                                                                                                                                                                                                                                           Translation (Tagalog)                                                                                                                                                                          

### Save Bible Dataset Results

In [None]:
bible_output_path = Path("results/bible_yami_tgl_evaluation.csv")
bible_output_path.parent.mkdir(parents=True, exist_ok=True)

bible_comparison_df.to_csv(bible_output_path, index=False)
print(f"Full bible results saved to: {bible_output_path}")

bible_summary = {
    'Pipeline': 'Direct: Yami → Tagalog (Bible Dataset)',
    'Test Pairs': len(bible_test_data),
    'Corpus BLEU': f"{corpus_bleu_bible.score:.4f}",
    'Mean Sentence BLEU': f"{statistics.mean(bible_sentence_bleu_scores):.4f}",
    'Median Sentence BLEU': f"{statistics.median(bible_sentence_bleu_scores):.4f}",
    'Min BLEU': f"{min(bible_sentence_bleu_scores):.4f}",
    'Max BLEU': f"{max(bible_sentence_bleu_scores):.4f}",
    'Stdev': f"{statistics.stdev(bible_sentence_bleu_scores) if len(bible_sentence_bleu_scores) > 1 else 0:.4f}",
    'BLEU-1': f"{corpus_bleu_bible.precisions[0]:.4f}",
    'BLEU-2': f"{corpus_bleu_bible.precisions[1]:.4f}",
    'BLEU-3': f"{corpus_bleu_bible.precisions[2]:.4f}",
    'BLEU-4': f"{corpus_bleu_bible.precisions[3]:.4f}",
    'Brevity Penalty': f"{corpus_bleu_bible.bp:.4f}"
}

bible_summary_df = pd.DataFrame([bible_summary])
bible_summary_path = Path("results/bible_yami_tgl_summary.csv")
bible_summary_df.to_csv(bible_summary_path, index=False)
print(f"Bible summary saved to: {bible_summary_path}")

print("\n" + "="*70)
print("COMPARISON: Validation Set vs Bible Dataset")
print("="*70)
comparison_results = pd.DataFrame([
    {
        'Dataset': 'Validation Set (yami-tgl.tsv)',
        'Test Pairs': len(test_data),
        'Corpus BLEU': f"{corpus_bleu_result.score:.4f}",
        'Mean BLEU': f"{statistics.mean(sentence_bleu_scores):.4f}",
        'BLEU-1': f"{corpus_bleu_result.precisions[0]:.4f}",
        'BLEU-4': f"{corpus_bleu_result.precisions[3]:.4f}"
    },
    {
        'Dataset': 'Bible Dataset Sample',
        'Test Pairs': len(bible_test_data),
        'Corpus BLEU': f"{corpus_bleu_bible.score:.4f}",
        'Mean BLEU': f"{statistics.mean(bible_sentence_bleu_scores):.4f}",
        'BLEU-1': f"{corpus_bleu_bible.precisions[0]:.4f}",
        'BLEU-4': f"{corpus_bleu_bible.precisions[3]:.4f}"
    }
])

print(comparison_results.to_string(index=False))

comparison_path = Path("results/yami_tgl_evaluation_comparison.csv")
comparison_results.to_csv(comparison_path, index=False)
print(f"\nComparison saved to: {comparison_path}")
print(f"\nAll evaluations complete!")

Full bible results saved to: results/bible_yami_tgl_evaluation.csv
Bible summary saved to: results/bible_yami_tgl_summary.csv

COMPARISON: Validation Set vs Bible Dataset
                      Dataset  Test Pairs Corpus BLEU Mean BLEU  BLEU-1  BLEU-4
Validation Set (yami-tgl.tsv)          20      2.3511    5.8927 25.8555  0.2463
         Bible Dataset Sample         200     32.1611   29.3976 62.0690 19.7318

Comparison saved to: results/yami_tgl_evaluation_comparison.csv

All evaluations complete!


## Comparison with Base NLLB Model (No Fine-tuning)

In [None]:
base_results_path = Path("results/model_comparison_finetuned_vs_base.csv")

if base_results_path.exists():
    base_comparison_df = pd.read_csv(base_results_path)
    
    print("\n" + "="*100)
    print("MODEL COMPARISON: FINE-TUNED NLLB vs BASE NLLB (No Fine-tuning)")
    print("="*100)
    print("\n" + base_comparison_df.to_string(index=False))
    
    validation_finetuned = base_comparison_df[base_comparison_df['Scenario'].str.contains('Fine-tuned.*Validation')]['Corpus BLEU'].values[0]
    validation_base = base_comparison_df[base_comparison_df['Scenario'].str.contains('Base NLLB.*Validation')]['Corpus BLEU'].values[0]
    validation_improvement = ((validation_finetuned - validation_base) / validation_base) * 100
    
    bible_finetuned = base_comparison_df[base_comparison_df['Scenario'].str.contains('Fine-tuned.*Bible')]['Corpus BLEU'].values[0]
    bible_base = base_comparison_df[base_comparison_df['Scenario'].str.contains('Base NLLB.*Bible')]['Corpus BLEU'].values[0]
    bible_improvement = ((bible_finetuned - bible_base) / bible_base) * 100
    
    print(f"\n{'IMPROVEMENT ANALYSIS':^100}")
    print("-" * 100)
    print(f"Validation Set Improvement: {validation_improvement:+.2f}% (Fine-tuned: {validation_finetuned:.4f} vs Base: {validation_base:.4f})")
    print(f"Bible Dataset Improvement:  {bible_improvement:+.2f}% (Fine-tuned: {bible_finetuned:.4f} vs Base: {bible_base:.4f})")
    print(f"Average Improvement:        {(validation_improvement + bible_improvement) / 2:+.2f}%")
    print("="*100)
    
    print(f"\nComparison data loaded from: {base_results_path}")
else:
    print(f"Base model comparison results not found at: {base_results_path}")
    print("Please run base_model_comparison.ipynb first to generate these results.")