# Pivot Language Evaluation

Evaluate translation quality using Bikolano as a pivot language:
- Pangasinan → Bikolano (Model 1)
- Bikolano → Tagalog (Model 2)
- Pangasinan → Tagalog (via Bikolano pivot)

## Setup and Load Test Data

In [1]:
from pathlib import Path
import pickle
import pandas as pd
import nltk
import sacrebleu

try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

TEST_DATA_PATH = Path("data/validation/png-tgl.tsv")

print(f"Loading test data from: {TEST_DATA_PATH}")

test_df = pd.read_csv(TEST_DATA_PATH, sep='\t')

print(f"Loaded {len(test_df)} test pairs")
print(f"\nColumns: {test_df.columns.tolist()}")
print(f"\nFirst row:")
print(test_df.iloc[0])

Loading test data from: data/validation/png-tgl.tsv
Loaded 39 test pairs

Columns: ['Pangasinan', 'Tagalog']

First row:
Pangasinan       Ama mi a wala kad tawen.
Tagalog       Ama Namin, sumasalangit ka.
Name: 0, dtype: object


## Load Trained Models

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import gc

model_png_bcl_path = Path("data/models/nllb-pag-bcl")
model_bcl_tgl_path = Path("data/models/nllb-bcl-tgl")

tokenizers = {}
try:
    tokenizers['png_bcl'] = AutoTokenizer.from_pretrained(model_png_bcl_path)
    print(f"Loaded Pangasinan → Bikolano tokenizer")
except Exception as e:
    print(f" Error loading Pangasinan → Bikolano tokenizer: {e}")

try:
    tokenizers['bcl_tgl'] = AutoTokenizer.from_pretrained(model_bcl_tgl_path)
    print(f"Loaded Bikolano → Tagalog tokenizer")
except Exception as e:
    print(f" Error loading Bikolano → Tagalog tokenizer: {e}")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nUsing device: {device}")
print("Note: Models will be loaded sequentially to manage memory")

  from .autonotebook import tqdm as notebook_tqdm


Loaded Pangasinan → Bikolano tokenizer
Loaded Bikolano → Tagalog tokenizer

Using device: cuda
Note: Models will be loaded sequentially to manage memory
Loaded Bikolano → Tagalog tokenizer

Using device: cuda
Note: Models will be loaded sequentially to manage memory


## Translation Functions

In [None]:
import regex as re

def clean_string(input_string):
    cleaned = re.sub(r"[^\p{L}\s]", "", input_string.strip().lower())
    return cleaned

def translate_nllb(text, model, tokenizer, device, max_length=128):
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True).to(device)
    
    with torch.no_grad():
        translated_tokens = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )
    
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

print("Translation functions ready (NLLB-based, sequential loading)")

Translation functions ready (NLLB-based, sequential loading)


## Evaluate Pivot Translation

In [None]:
test_data = []
for idx, row in test_df.iterrows():
    src = row['Pangasinan']
    ref = row['Tagalog']
    
    if pd.isna(src) or pd.isna(ref):
        continue
    if str(src).lower() == '<no verse>' or str(ref).lower() == '<no verse>':
        continue
    
    src_text = str(src).strip()
    ref_text = str(ref).strip()
    
    if src_text and ref_text:
        src_cleaned = clean_string(src_text)
        ref_cleaned = clean_string(ref_text)
        
        test_data.append({
            'src': src_text,
            'src_cleaned': src_cleaned,
            'ref': ref_text,
            'ref_cleaned': ref_cleaned
        })

print(f"Prepared {len(test_data)} valid test pairs\n")

if len(test_data) > 0:
    sample = test_data[0]
    print("Sample data (before and after cleaning):")
    print(f"  Source (original):  {sample['src']}")
    print(f"  Source (cleaned):   {sample['src_cleaned']}")
    print(f"  Reference (orig):   {sample['ref']}")
    print(f"  Reference (clean):  {sample['ref_cleaned']}")
    print()

print("STEP 1: Translating Pangasinan → Bikolano...")
print(f"Loading Pangasinan → Bikolano model...")
model_png_bcl = AutoModelForSeq2SeqLM.from_pretrained(model_png_bcl_path).to(device)
model_png_bcl.eval()

pivot_outputs = []
for i, item in enumerate(test_data):
    if (i + 1) % max(1, len(test_data) // 10) == 0:
        print(f"  Progress: {i+1}/{len(test_data)}")
    
    pivot_text = translate_nllb(
        item['src_cleaned'],
        model_png_bcl,
        tokenizers['png_bcl'],
        device
    )
    pivot_outputs.append(pivot_text)

print("Step 1 complete: Pangasinan → Bikolano translation done")

del model_png_bcl
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()
print("Memory cleared\n")

print("STEP 2: Translating Bikolano → Tagalog...")
print(f"Loading Bikolano → Tagalog model...")
model_bcl_tgl = AutoModelForSeq2SeqLM.from_pretrained(model_bcl_tgl_path).to(device)
model_bcl_tgl.eval()

translations = []
for i, pivot_text in enumerate(pivot_outputs):
    if (i + 1) % max(1, len(pivot_outputs) // 10) == 0:
        print(f"  Progress: {i+1}/{len(pivot_outputs)}")
    
    target_text = translate_nllb(
        pivot_text,
        model_bcl_tgl,
        tokenizers['bcl_tgl'],
        device
    )
    translations.append(target_text)

print("Step 2 complete: Bikolano → Tagalog translation done")

del model_bcl_tgl
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()

references = [item['ref_cleaned'] for item in test_data]

print(f"\nTranslated all {len(test_data)} test pairs via pivot language")

test_data_with_translations = test_data.copy()
for i in range(len(test_data)):
    test_data_with_translations[i]['translation'] = translations[i]
    test_data_with_translations[i]['pivot'] = pivot_outputs[i]

Prepared 39 valid test pairs

Sample data (before and after cleaning):
  Source (original):  Ama mi a wala kad tawen.
  Source (cleaned):   ama mi a wala kad tawen
  Reference (orig):   Ama Namin, sumasalangit ka.
  Reference (clean):  ama namin sumasalangit ka

STEP 1: Translating Pangasinan → Bikolano...
Loading Pangasinan → Bikolano model...
  Progress: 3/39
  Progress: 3/39
  Progress: 6/39
  Progress: 6/39
  Progress: 9/39
  Progress: 9/39
  Progress: 12/39
  Progress: 12/39
  Progress: 15/39
  Progress: 15/39
  Progress: 18/39
  Progress: 18/39
  Progress: 21/39
  Progress: 21/39
  Progress: 24/39
  Progress: 24/39
  Progress: 27/39
  Progress: 27/39
  Progress: 30/39
  Progress: 30/39
  Progress: 33/39
  Progress: 33/39
  Progress: 36/39
  Progress: 36/39
  Progress: 39/39
Step 1 complete: Pangasinan → Bikolano translation done
  Progress: 39/39
Step 1 complete: Pangasinan → Bikolano translation done
Memory cleared

STEP 2: Translating Bikolano → Tagalog...
Loading Bikolano → Ta

## Calculate BLEU Scores with SacreBLEU

In [None]:
bleu = sacrebleu.BLEU(smooth_method='exp', smooth_value=0.0)
corpus_bleu_result = bleu.corpus_score(translations, [references])

print("="*70)
print("BLEU SCORE EVALUATION (using SacreBLEU with smooth method)")
print("="*70)
print(f"\nPipeline: Pangasinan → Bikolano → Tagalog")
print(f"Test Pairs: {len(test_data)}")
print(f"\nCorpus BLEU Score: {corpus_bleu_result.score:.4f}")
print(f"\nBLEU Breakdown:")
print(f"  BLEU-1: {corpus_bleu_result.precisions[0]:.4f}")
print(f"  BLEU-2: {corpus_bleu_result.precisions[1]:.4f}")
print(f"  BLEU-3: {corpus_bleu_result.precisions[2]:.4f}")
print(f"  BLEU-4: {corpus_bleu_result.precisions[3]:.4f}")
print(f"  Brevity Penalty: {corpus_bleu_result.bp:.4f}")
print(f"  Ratio: {corpus_bleu_result.ratio:.4f}")
print("="*70)

BLEU SCORE EVALUATION (using SacreBLEU with smooth method)

Pipeline: Pangasinan → Bikolano → Tagalog
Test Pairs: 39

Corpus BLEU Score: 9.4350

BLEU Breakdown:
  BLEU-1: 39.4118
  BLEU-2: 17.2757
  BLEU-3: 6.4885
  BLEU-4: 1.7937
  Brevity Penalty: 1.0000
  Ratio: 1.1111


## Sentence-level BLEU Scores

In [None]:
sentence_bleu_scores = []
bleu_sent = sacrebleu.BLEU(smooth_method='exp', smooth_value=0.0)

for hyp, ref in zip(translations, references):
    score = bleu_sent.sentence_score(hyp, [ref])
    sentence_bleu_scores.append(score.score)

import statistics

print(f"\nSentence-level BLEU Statistics:")
print(f"  Mean: {statistics.mean(sentence_bleu_scores):.4f}")
print(f"  Median: {statistics.median(sentence_bleu_scores):.4f}")
print(f"  Min: {min(sentence_bleu_scores):.4f}")
print(f"  Max: {max(sentence_bleu_scores):.4f}")
print(f"  Stdev: {statistics.stdev(sentence_bleu_scores) if len(sentence_bleu_scores) > 1 else 0:.4f}")

It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is 


Sentence-level BLEU Statistics:
  Mean: 9.6214
  Median: 8.1309
  Min: 0.0000
  Max: 40.7322
  Stdev: 9.9215


## Translation Examples

In [None]:
comparison_df = pd.DataFrame({
    'Source (Pangasinan)': [item['src'] for item in test_data],
    'Source (Cleaned)': [item['src_cleaned'] for item in test_data],
    'Pivot (Bikolano)': pivot_outputs,
    'Translation (Tagalog)': translations,
    'Reference (Tagalog)': [item['ref'] for item in test_data],
    'Reference (Cleaned)': [item['ref_cleaned'] for item in test_data],
    'BLEU': sentence_bleu_scores
})

comparison_df_sorted = comparison_df.sort_values('BLEU')

print("\nWORST PERFORMING TRANSLATIONS (lowest BLEU):\n")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 80)
print(comparison_df_sorted.head(5).to_string())

print("\n" + "="*70)
print("\nBEST PERFORMING TRANSLATIONS (highest BLEU):\n")
print(comparison_df_sorted.tail(5).to_string())


WORST PERFORMING TRANSLATIONS (lowest BLEU):

     Source (Pangasinan)      Source (Cleaned) Pivot (Bikolano) Translation (Tagalog) Reference (Tagalog) Reference (Cleaned)  BLEU
25            Unla ak la            unla ak la   maduman na ako   narito akoy pupunta        Aalis na ako        aalis na ako   0.0
30      Masantos ya labi      masantos ya labi  banal na banggi         banal na gabi      Magandang gabi      magandang gabi   0.0
29    Masantos ya ngarem    masantos ya ngarem  banal na banggi         banal na gabi     Magandang hapon     magandang hapon   0.0
28              Labay ko              labay ko     boot ko iyan    ito ang aking nais            Gusto ko            gusto ko   0.0
31  Masantos ya kabwasan  masantos ya kabwasan     banal na aga        banal na umaga     Magandang umaga     magandang umaga   0.0


BEST PERFORMING TRANSLATIONS (highest BLEU):

                                                          Source (Pangasinan)                                    

## Save Results

In [None]:
output_path = Path("results/pivot_evaluation.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

comparison_df.to_csv(output_path, index=False)
print(f"Full results saved to: {output_path}")

summary = {
    'Pipeline': 'Pangasinan → Bikolano → Tagalog',
    'Test Pairs': len(test_data),
    'Corpus BLEU': f"{corpus_bleu_result.score:.4f}",
    'Mean Sentence BLEU': f"{statistics.mean(sentence_bleu_scores):.4f}",
    'Median Sentence BLEU': f"{statistics.median(sentence_bleu_scores):.4f}",
    'Min BLEU': f"{min(sentence_bleu_scores):.4f}",
    'Max BLEU': f"{max(sentence_bleu_scores):.4f}",
    'Stdev': f"{statistics.stdev(sentence_bleu_scores) if len(sentence_bleu_scores) > 1 else 0:.4f}",
    'BLEU-1': f"{corpus_bleu_result.precisions[0]:.4f}",
    'BLEU-2': f"{corpus_bleu_result.precisions[1]:.4f}",
    'BLEU-3': f"{corpus_bleu_result.precisions[2]:.4f}",
    'BLEU-4': f"{corpus_bleu_result.precisions[3]:.4f}",
    'Brevity Penalty': f"{corpus_bleu_result.bp:.4f}"
}

summary_df = pd.DataFrame([summary])
summary_path = Path("results/evaluation_summary.csv")
summary_df.to_csv(summary_path, index=False)
print(f"Summary saved to: {summary_path}")

print(f"\nEvaluation complete!")

Full results saved to: results/pivot_evaluation.csv
Summary saved to: results/evaluation_summary.csv

Evaluation complete!


## Bible Dataset Evaluation (Original Data)

In [None]:
BIBLE_DATA_PATH = Path("data/dataset/Tagalog_Pangasinan_Parallel.tsv")

print(f"Loading Bible dataset from: {BIBLE_DATA_PATH}")

bible_df = pd.read_csv(BIBLE_DATA_PATH, sep='\t')
print(f"Loaded {len(bible_df)} Bible verse pairs")

bible_with_pag = bible_df[bible_df['Pangasinan'] != '<no verse>'].copy()
print(f"Verses with Pangasinan: {len(bible_with_pag)}")

sample_size = 200
bible_sample = bible_with_pag.sample(n=min(sample_size, len(bible_with_pag)), random_state=42)
print(f"Sampled {len(bible_sample)} verses for evaluation")

Loading Bible dataset from: data/dataset/Tagalog_Pangasinan_Parallel.tsv
Loaded 33833 Bible verse pairs
Verses with Pangasinan: 33811
Sampled 200 verses for evaluation
Loaded 33833 Bible verse pairs
Verses with Pangasinan: 33811
Sampled 200 verses for evaluation


In [None]:
bible_test_data = []
for idx, row in bible_sample.iterrows():
    src = row['Pangasinan']
    ref = row['Tagalog']
    
    if pd.isna(src) or pd.isna(ref):
        continue
    if str(src).lower() == '<no verse>' or str(ref).lower() == '<no verse>':
        continue
    
    src_text = str(src).strip()
    ref_text = str(ref).strip()
    
    if src_text and ref_text:
        src_cleaned = clean_string(src_text)
        ref_cleaned = clean_string(ref_text)
        
        bible_test_data.append({
            'src': src_text,
            'src_cleaned': src_cleaned,
            'ref': ref_text,
            'ref_cleaned': ref_cleaned,
            'book': row.get('Book', 'Unknown'),
            'chapter': row.get('Chapter', 0),
            'verse': row.get('Verse', 0)
        })

print(f"Prepared {len(bible_test_data)} valid Bible verse pairs for translation")

Prepared 200 valid Bible verse pairs for translation


In [None]:
print("BIBLE EVALUATION: Step 1 - Translating Pangasinan → Bikolano...")
model_png_bcl = AutoModelForSeq2SeqLM.from_pretrained(model_png_bcl_path).to(device)
model_png_bcl.eval()

bible_pivot_outputs = []
for i, item in enumerate(bible_test_data):
    if (i + 1) % max(1, len(bible_test_data) // 10) == 0:
        print(f"  Progress: {i+1}/{len(bible_test_data)}")
    
    pivot_text = translate_nllb(
        item['src_cleaned'],
        model_png_bcl,
        tokenizers['png_bcl'],
        device
    )
    bible_pivot_outputs.append(pivot_text)

print("Step 1 complete")

del model_png_bcl
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()

print("BIBLE EVALUATION: Step 2 - Translating Bikolano → Tagalog...")
model_bcl_tgl = AutoModelForSeq2SeqLM.from_pretrained(model_bcl_tgl_path).to(device)
model_bcl_tgl.eval()

bible_translations = []
for i, pivot_text in enumerate(bible_pivot_outputs):
    if (i + 1) % max(1, len(bible_pivot_outputs) // 10) == 0:
        print(f"  Progress: {i+1}/{len(bible_pivot_outputs)}")
    
    target_text = translate_nllb(
        pivot_text,
        model_bcl_tgl,
        tokenizers['bcl_tgl'],
        device
    )
    bible_translations.append(target_text)

print("Step 2 complete")

del model_bcl_tgl
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()

bible_references = [item['ref_cleaned'] for item in bible_test_data]
print(f"Bible translations complete: {len(bible_translations)} verses")

BIBLE EVALUATION: Step 1 - Translating Pangasinan → Bikolano...
  Progress: 20/200
  Progress: 20/200
  Progress: 40/200
  Progress: 40/200
  Progress: 60/200
  Progress: 60/200
  Progress: 80/200
  Progress: 80/200
  Progress: 100/200
  Progress: 100/200
  Progress: 120/200
  Progress: 120/200
  Progress: 140/200
  Progress: 140/200
  Progress: 160/200
  Progress: 160/200
  Progress: 180/200
  Progress: 180/200
  Progress: 200/200
  Progress: 200/200
Step 1 complete
BIBLE EVALUATION: Step 2 - Translating Bikolano → Tagalog...
Step 1 complete
BIBLE EVALUATION: Step 2 - Translating Bikolano → Tagalog...
  Progress: 20/200
  Progress: 20/200
  Progress: 40/200
  Progress: 40/200
  Progress: 60/200
  Progress: 60/200
  Progress: 80/200
  Progress: 80/200
  Progress: 100/200
  Progress: 100/200
  Progress: 120/200
  Progress: 120/200
  Progress: 140/200
  Progress: 140/200
  Progress: 160/200
  Progress: 160/200
  Progress: 180/200
  Progress: 180/200
  Progress: 200/200
  Progress: 200/20

In [None]:
bleu_bible = sacrebleu.BLEU(smooth_method='exp', smooth_value=0.0)
bleu_sent_bible = sacrebleu.BLEU(smooth_method='exp', smooth_value=0.0)

corpus_bleu_bible = bleu_bible.corpus_score(bible_translations, [bible_references])
bible_sentence_bleu_scores = [bleu_sent_bible.sentence_score(hyp, [ref]).score for hyp, ref in zip(bible_translations, bible_references)]

print(f"\nBible Dataset - Corpus BLEU: {corpus_bleu_bible.score:.4f}")
print(f"Bible Dataset - Mean Sentence BLEU: {statistics.mean(bible_sentence_bleu_scores):.4f}")
print(f"Bible Dataset - Median Sentence BLEU: {statistics.median(bible_sentence_bleu_scores):.4f}")

print(f"\nBible Dataset - BLEU Breakdown:")
print(f"  BLEU-1: {corpus_bleu_bible.precisions[0]:.4f}")
print(f"  BLEU-2: {corpus_bleu_bible.precisions[1]:.4f}")
print(f"  BLEU-3: {corpus_bleu_bible.precisions[2]:.4f}")
print(f"  BLEU-4: {corpus_bleu_bible.precisions[3]:.4f}")
print(f"  Brevity Penalty: {corpus_bleu_bible.bp:.4f}")

It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is 


Bible Dataset - Corpus BLEU: 33.1618
Bible Dataset - Mean Sentence BLEU: 31.8642
Bible Dataset - Median Sentence BLEU: 25.9945

Bible Dataset - BLEU Breakdown:
  BLEU-1: 66.2321
  BLEU-2: 42.3979
  BLEU-3: 29.4461
  BLEU-4: 21.6720
  Brevity Penalty: 0.9064


In [None]:
bible_output_path = Path("results/pivot_bible_png_tgl_evaluation.csv")
bible_output_path.parent.mkdir(parents=True, exist_ok=True)

bible_comparison_df = pd.DataFrame({
    'Book': [item.get('book', 'Unknown') for item in bible_test_data],
    'Chapter': [item.get('chapter', 0) for item in bible_test_data],
    'Verse': [item.get('verse', 0) for item in bible_test_data],
    'Source (Pangasinan)': [item['src'] for item in bible_test_data],
    'Source (Cleaned)': [item['src_cleaned'] for item in bible_test_data],
    'Pivot (Bikolano)': bible_pivot_outputs,
    'Translation (Tagalog)': bible_translations,
    'Reference (Tagalog)': [item['ref'] for item in bible_test_data],
    'Reference (Cleaned)': [item['ref_cleaned'] for item in bible_test_data],
    'BLEU': bible_sentence_bleu_scores
})

bible_comparison_df.to_csv(bible_output_path, index=False)
print(f"Bible evaluation results saved to: {bible_output_path}")

bible_summary = {
    'Dataset': 'Bible (Pangasinan-Tagalog)',
    'Pipeline': 'Pangasinan → Bikolano → Tagalog',
    'Test Verses': len(bible_test_data),
    'Corpus BLEU': f"{corpus_bleu_bible.score:.4f}",
    'Mean Sentence BLEU': f"{statistics.mean(bible_sentence_bleu_scores):.4f}",
    'Median Sentence BLEU': f"{statistics.median(bible_sentence_bleu_scores):.4f}",
    'Min BLEU': f"{min(bible_sentence_bleu_scores):.4f}",
    'Max BLEU': f"{max(bible_sentence_bleu_scores):.4f}",
    'Stdev': f"{statistics.stdev(bible_sentence_bleu_scores) if len(bible_sentence_bleu_scores) > 1 else 0:.4f}",
    'BLEU-1': f"{corpus_bleu_bible.precisions[0]:.4f}",
    'BLEU-2': f"{corpus_bleu_bible.precisions[1]:.4f}",
    'BLEU-3': f"{corpus_bleu_bible.precisions[2]:.4f}",
    'BLEU-4': f"{corpus_bleu_bible.precisions[3]:.4f}",
    'Brevity Penalty': f"{corpus_bleu_bible.bp:.4f}"
}

bible_summary_df = pd.DataFrame([bible_summary])
bible_summary_path = Path("results/pivot_bible_png_tgl_summary.csv")
bible_summary_df.to_csv(bible_summary_path, index=False)
print(f"Bible summary saved to: {bible_summary_path}")

Bible evaluation results saved to: results/pivot_bible_png_tgl_evaluation.csv
Bible summary saved to: results/pivot_bible_png_tgl_summary.csv


## Compare Validation vs Bible Evaluation

In [None]:
validation_summary = {
    'Dataset': 'Validation Set (png-tgl.tsv)',
    'Pipeline': 'Pangasinan → Bikolano → Tagalog',
    'Corpus BLEU': corpus_bleu_result.score,
    'Mean Sentence BLEU': statistics.mean(sentence_bleu_scores)
}

bible_summary_stats = {
    'Dataset': 'Bible (Pangasinan-Tagalog)',
    'Pipeline': 'Pangasinan → Bikolano → Tagalog',
    'Corpus BLEU': corpus_bleu_bible.score,
    'Mean Sentence BLEU': statistics.mean(bible_sentence_bleu_scores)
}

comparison_results = pd.DataFrame([validation_summary, bible_summary_stats])

print("\nVALIDATION vs BIBLE DATASET COMPARISON:")
print(comparison_results.to_string(index=False))

comparison_path = Path("results/pivot_png_tgl_evaluation_comparison.csv")
comparison_results.to_csv(comparison_path, index=False)
print(f"\nComparison saved to: {comparison_path}")


VALIDATION vs BIBLE DATASET COMPARISON:
                     Dataset                        Pipeline  Corpus BLEU  Mean Sentence BLEU
Validation Set (png-tgl.tsv) Pangasinan → Bikolano → Tagalog     9.434988            9.621433
  Bible (Pangasinan-Tagalog) Pangasinan → Bikolano → Tagalog    33.161840           31.864214

Comparison saved to: results/pivot_png_tgl_evaluation_comparison.csv
