<a href="https://colab.research.google.com/github/u17095736-coder/Part--2-MIT-805-Assignment/blob/main/Machine_Translation_model_for_African_language.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%writefile requirements.txt

torch
pandas
numpy
datasets
evaluate
transformers
comet-ml
unbabel-comet
tqdm
requests
sacrebleu
bert-score
protobuf

nltk
accelerate
comet-ml
evaluate


Overwriting requirements.txt


In [None]:
!pip install -qqq -r requirements.txt

In [None]:
import torch
import pandas as pd
import numpy as np
import random
import requests
import nltk
import os
import gc
from nltk.translate.bleu_score import corpus_bleu
from nltk.tokenize import word_tokenize
from evaluate import load as load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from accelerate import Accelerator
from comet import download_model, load_from_checkpoint
from tqdm.notebook import tqdm


# Fetch the tokenizer data that word_tokenize() needs.
# nltk.download() signals most failures by returning False instead of raising,
# so the return value is checked as well as guarding against exceptions.
# Each resource is attempted independently so one failure does not skip the other.
_failed_nltk_resources = []
for _nltk_resource in ('punkt', 'punkt_tab'):
    try:
        if not nltk.download(_nltk_resource, quiet=True):
            _failed_nltk_resources.append(_nltk_resource)
    except Exception:
        # Best-effort: a missing resource is reported, not fatal.
        _failed_nltk_resources.append(_nltk_resource)

if _failed_nltk_resources:
    print(f"NLTK punkt not available. Failed resources: {', '.join(_failed_nltk_resources)}")

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


**1. Define the models to evaluate**

In [None]:
# Display name -> Hugging Face checkpoint of each translation model under test.
MODEL_CONFIGS = {
    "NLLB-200": "facebook/nllb-200-distilled-600M",
    "M2M-100": "facebook/m2m100_418M",
}

# Target languages keyed by display name. Values are FLORES-200 / NLLB style
# codes (three-letter language code + script), not BCP-47 tags.
TARGET_LANGS = {
    "Hausa": "hau_Latn",
    "Sepedi": "nso_Latn",
    "Xitsonga": "tso_Latn",
    "isiZulu": "zul_Latn"
}

SOURCE_LANG = "eng_Latn"
SPLIT = "devtest"

# GitHub Configuration for Corrected Data (Directly accessing the source files)
GITHUB_BASE_URL = "https://raw.githubusercontent.com/dsfsi/flores-fix-4-africa/main/data/"

# Relative file paths, keyed by three-letter language code. Both the directory
# and the extension are derived from SPLIT so they can never disagree.
# NOTE(review): assumes the repository uses the same layout for other splits —
# confirm before changing SPLIT.
CORRECTED_FILES = {
    lang_code.split("_")[0]: f"corrected/{SPLIT}/{lang_code}.{SPLIT}"
    for lang_code in TARGET_LANGS.values()
}
# The English source only exists in the original (uncorrected) data set.
CORRECTED_FILES["eng"] = f"original/{SPLIT}/{SOURCE_LANG}.{SPLIT}"

UNCORRECTED_FILES = {
    lang_code.split("_")[0]: f"original/{SPLIT}/{lang_code}.{SPLIT}"
    for lang_code in TARGET_LANGS.values()
}


# NLLB-style code -> M2M-100 tokenizer language code.
# Xitsonga ("tso_Latn") is intentionally absent: the M2M-100 tokenizer rejects
# "ts" as an unsupported language, so the language is skipped up front instead
# of failing inside inference.
M2M_LANG_MAP = {
    "hau_Latn": "ha",
    "nso_Latn": "ns",
    "zul_Latn": "zu",
}

# Shared Hugging Face Accelerate handle, created once with default settings.
# The evaluation cell passes the COMET model through accelerator.prepare().
accelerator = Accelerator()

**2. Data Acquisition and Preprocessing**

In [None]:
def load_data_from_github(file_path, timeout=30):
    """Download a raw text file from the configured GitHub repository.

    Args:
        file_path: Path of the file relative to ``GITHUB_BASE_URL``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The file's non-blank lines with surrounding whitespace stripped, or an
        empty list if the download failed (the error is printed).
    """
    full_url = GITHUB_BASE_URL + file_path
    try:
        response = requests.get(full_url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file from {full_url}: {e}")
        return []

    # Decode as UTF-8 explicitly rather than relying on the response header.
    # NOTE(review): assumes the data files are UTF-8 encoded — confirm.
    response.encoding = "utf-8"
    return [line.strip() for line in response.text.splitlines() if line.strip()]


def load_all_references(file_map):
    """Download every non-English reference file listed in ``file_map``.

    ``file_map`` maps a three-letter language code to a repository-relative
    path. The result maps each code whose download produced at least one line
    to that list of lines; failed languages are reported and left out.
    """
    ref_storage = {}
    for code, path in file_map.items():
        # The English entry is the translation source, not a reference.
        if code == 'eng':
            continue

        lines = load_data_from_github(path)
        if not lines:
            print(f"   Failed to download {code} from {path}.")
            continue
        ref_storage[code] = lines
    return ref_storage



**3. Load Dataset Execution**

In [None]:

# --- Step 1: English source sentences (the model input) ---
print("1. Loading English Source Text (Input)...")
source_texts = load_data_from_github(CORRECTED_FILES['eng'])
if not source_texts:
    raise RuntimeError("[CRITICAL ERROR] Failed to load English source text. Aborting execution.")
print(f"   Successfully loaded {len(source_texts)} English source sentences.")

# --- Step 2: corrected references; every target language must be present ---
print("\n2. Downloading Corrected References (NEW Baseline)...")
corrected_references_storage = load_all_references(CORRECTED_FILES)
if len(corrected_references_storage) != len(TARGET_LANGS):
    raise RuntimeError(f"[ERROR] Only {len(corrected_references_storage)} of {len(TARGET_LANGS)} corrected languages loaded.")
print(f"   [SUCCESS] Corrected data loaded for: {', '.join(corrected_references_storage.keys()).upper()}")

# --- Step 3: original (uncorrected) references, same completeness rule ---
print("\n3. Downloading Uncorrected References (OLD Baseline)...")
uncorrected_references_storage = load_all_references(UNCORRECTED_FILES)
if len(uncorrected_references_storage) != len(TARGET_LANGS):
    raise RuntimeError(f"[ERROR] Only {len(uncorrected_references_storage)} of {len(TARGET_LANGS)} uncorrected languages loaded.")
print(f"   [SUCCESS] Uncorrected data loaded for: {', '.join(uncorrected_references_storage.keys()).upper()}")


# Source and every reference list must have the same number of sentences,
# otherwise line i of one file would not correspond to line i of another.
all_lengths = [
    len(source_texts),
    *map(len, corrected_references_storage.values()),
    *map(len, uncorrected_references_storage.values()),
]
if len(set(all_lengths)) != 1:
    raise RuntimeError(f"\n[CRITICAL ERROR] Data lengths mismatch: {all_lengths}. Aborting.")
print("\n--- Data Loading Complete and Validated ---")

1. Loading English Source Text (Input)...
   Successfully loaded 1012 English source sentences.

2. Downloading Corrected References (NEW Baseline)...
   [SUCCESS] Corrected data loaded for: HAU, NSO, TSO, ZUL

3. Downloading Uncorrected References (OLD Baseline)...
   [SUCCESS] Uncorrected data loaded for: HAU, NSO, TSO, ZUL

--- Data Loading Complete and Validated ---


**4. Model Inference (Translation)**

In [None]:
def _release_inference_memory():
    """Collect unreachable Python objects, then release cached CUDA memory."""
    gc.collect()
    torch.cuda.empty_cache()


def _resolve_forced_bos_token_id(tokenizer, tgt_code_tokenizer, is_m2m):
    """Return the token id that forces decoding into the target language.

    Returns ``None`` (after printing the reason) when the tokenizer does not
    know the language code.
    """
    if is_m2m:
        try:
            return tokenizer.get_lang_id(tgt_code_tokenizer)
        except KeyError:
            print(f"[M2M-100] Unsupported language code for tokenizer: {tgt_code_tokenizer}. Skipping inference.")
            return None

    if hasattr(tokenizer, "lang_code_to_id"):
        token_id = tokenizer.lang_code_to_id.get(tgt_code_tokenizer)
    else:
        try:
            # NOTE(review): tgt_lang is already passed at construction; this
            # attribute is kept from the original code — confirm it is needed.
            tokenizer.target_lang = tgt_code_tokenizer
            token_id = tokenizer.convert_tokens_to_ids(tgt_code_tokenizer)
        except Exception:
            # Best-effort fallback to a plain vocabulary lookup.
            token_id = tokenizer.get_vocab().get(tgt_code_tokenizer)
        # convert_tokens_to_ids maps unknown tokens to the unk id instead of
        # failing, which would silently force decoding to start with <unk>.
        if token_id is not None and token_id == tokenizer.unk_token_id:
            token_id = None

    if token_id is None:
        print(f"[NLLB] Could not find BOS token ID for target: {tgt_code_tokenizer}. Skipping inference.")
    return token_id


def run_inference(model_checkpoint, target_lang_code, source_texts, batch_size=16, max_length=256):
    """Load a model and translate the English source text in batches.

    Args:
        model_checkpoint: Hugging Face checkpoint name. A name containing
            "m2m100" selects M2M-100 language codes (via ``M2M_LANG_MAP``);
            anything else is driven with NLLB-style codes.
        target_lang_code: NLLB-style target code, e.g. "hau_Latn".
        source_texts: List of English sentences to translate.
        batch_size: Number of sentences per generation call.
        max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        One translation per source sentence, in order, or an empty list when
        the target language cannot be resolved for the chosen model.
    """
    print(f"  -> Initializing Model: {model_checkpoint} for {target_lang_code}")

    is_m2m = "m2m100" in model_checkpoint.lower()

    if is_m2m:
        src_code_tokenizer = "en"
        tgt_code_tokenizer = M2M_LANG_MAP.get(target_lang_code)
        if not tgt_code_tokenizer:
            print(f"[M2M-100] Language mapping not found for target: {target_lang_code}. Skipping inference.")
            return []
    else:
        src_code_tokenizer = SOURCE_LANG
        tgt_code_tokenizer = target_lang_code

    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        src_lang=src_code_tokenizer,
        tgt_lang=tgt_code_tokenizer
    )

    model = None
    translated_texts = []
    try:
        forced_bos_token_id = _resolve_forced_bos_token_id(tokenizer, tgt_code_tokenizer, is_m2m)
        if forced_bos_token_id is None:
            return []

        model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

        for start in tqdm(range(0, len(source_texts), batch_size), desc=f"Translating {target_lang_code}"):
            batch = source_texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)

            with torch.no_grad():
                translated_tokens = model.generate(
                    **inputs,
                    forced_bos_token_id=forced_bos_token_id,
                    max_length=max_length
                )

            translated_texts.extend(
                tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
            )
    finally:
        # Always drop the model/tokenizer, including when generation raises,
        # so a failed run does not keep GPU memory pinned for the next one.
        del model, tokenizer
        _release_inference_memory()

    return translated_texts

**5. Execute Inference and Save Hypotheses**

In [None]:
import os


# Generic hypothesis store; not referenced by the other cells visible here.
all_hypotheses = dict()

# Respect a value set by an earlier cell; default to saving translations.
SAVE_HYPOTHESES_TO_FILE = globals().get('SAVE_HYPOTHESES_TO_FILE', True)

if SAVE_HYPOTHESES_TO_FILE:
    HYPOTHESIS_DIR = "generated_translations"
    os.makedirs(HYPOTHESIS_DIR, exist_ok=True)
    print(f"Saving generated translations to directory: {HYPOTHESIS_DIR}")


# Stop early with a clear message if the setup cells have not been run.
required_vars = [
    'MODEL_CONFIGS',
    'TARGET_LANGS',
    'source_texts',
    'M2M_LANG_MAP',
]
missing = [name for name in required_vars if name not in globals()]
if missing:
    raise NameError(f"Missing required variables: {missing}. Please run earlier setup cells first.")

print("\n--- STARTING INFERENCE PHASE ---")

Saving generated translations to directory: generated_translations

--- STARTING INFERENCE PHASE ---


**NLLB-200 Inference**

In [None]:
# (display name, three-letter code) -> list of NLLB-200 translations.
all_hypotheses_nllb = {}

for lang_name, lang_code in TARGET_LANGS.items():
    lang_3_code = lang_code.split('_')[0]
    print(f"\nGenerating NLLB-200 translations for {lang_name} ({lang_code})...")

    hypothesis = run_inference(MODEL_CONFIGS["NLLB-200"], lang_code, source_texts)
    # Stored even when empty so the evaluation cell can report the skip.
    all_hypotheses_nllb[(lang_name, lang_3_code)] = hypothesis

    if not hypothesis:
        # Inference was skipped or failed: do not write an empty file and
        # claim it was saved.
        print(f"   [WARN] No NLLB-200 translations produced for {lang_name} ({lang_code}); nothing saved.")
        continue

    if SAVE_HYPOTHESES_TO_FILE:
        file_path = os.path.join(HYPOTHESIS_DIR, f"NLLB-200_{lang_3_code}_hypotheses.txt")
        with open(file_path, 'w', encoding='utf-8') as f:
            # One translation per line, in source order.
            f.writelines(f"{line}\n" for line in hypothesis)
        print(f"   -> Saved hypotheses to {file_path}")

print("\n--- NLLB-200 Inference Complete ---")



Generating NLLB-200 translations for Hausa (hau_Latn)...
  -> Initializing Model: facebook/nllb-200-distilled-600M for hau_Latn


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Translating hau_Latn:   0%|          | 0/64 [00:00<?, ?it/s]

   -> Saved hypotheses to generated_translations/NLLB-200_hau_hypotheses.txt

Generating NLLB-200 translations for Sepedi (nso_Latn)...
  -> Initializing Model: facebook/nllb-200-distilled-600M for nso_Latn


Translating nso_Latn:   0%|          | 0/64 [00:00<?, ?it/s]

   -> Saved hypotheses to generated_translations/NLLB-200_nso_hypotheses.txt

Generating NLLB-200 translations for Xitsonga (tso_Latn)...
  -> Initializing Model: facebook/nllb-200-distilled-600M for tso_Latn


Translating tso_Latn:   0%|          | 0/64 [00:00<?, ?it/s]

   -> Saved hypotheses to generated_translations/NLLB-200_tso_hypotheses.txt

Generating NLLB-200 translations for isiZulu (zul_Latn)...
  -> Initializing Model: facebook/nllb-200-distilled-600M for zul_Latn


Translating zul_Latn:   0%|          | 0/64 [00:00<?, ?it/s]

   -> Saved hypotheses to generated_translations/NLLB-200_zul_hypotheses.txt

--- NLLB-200 Inference Complete ---


**M2M-100 inference**

In [None]:
# (display name, three-letter code) -> list of M2M-100 translations.
all_hypotheses_m2m = {}

for lang_name, lang_code in TARGET_LANGS.items():
    lang_3_code = lang_code.split('_')[0]

    if lang_code not in M2M_LANG_MAP:
        print(f"   [SKIP] {lang_name} ({lang_code}) not supported by M2M-100.")
        continue

    print(f"\nGenerating M2M-100 translations for {lang_name} ({lang_code})...")

    hypothesis = run_inference(MODEL_CONFIGS["M2M-100"], lang_code, source_texts)
    # Stored even when empty so the evaluation cell can report the skip.
    all_hypotheses_m2m[(lang_name, lang_3_code)] = hypothesis

    if not hypothesis:
        # run_inference returns [] when the tokenizer rejects the language
        # code; writing an empty file and printing "Saved" would be misleading.
        print(f"   [WARN] No M2M-100 translations produced for {lang_name} ({lang_code}); nothing saved.")
        continue

    if SAVE_HYPOTHESES_TO_FILE:
        file_path = os.path.join(HYPOTHESIS_DIR, f"M2M-100_{lang_3_code}_hypotheses.txt")
        with open(file_path, 'w', encoding='utf-8') as f:
            # One translation per line, in source order.
            f.writelines(f"{line}\n" for line in hypothesis)
        print(f"   -> Saved hypotheses to {file_path}")

print("\n--- M2M-100 Inference Complete ---")



Generating M2M-100 translations for Hausa (hau_Latn)...
  -> Initializing Model: facebook/m2m100_418M for hau_Latn


tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

Translating hau_Latn:   0%|          | 0/64 [00:00<?, ?it/s]

   -> Saved hypotheses to generated_translations/M2M-100_hau_hypotheses.txt

Generating M2M-100 translations for Sepedi (nso_Latn)...
  -> Initializing Model: facebook/m2m100_418M for nso_Latn


Translating nso_Latn:   0%|          | 0/64 [00:00<?, ?it/s]

   -> Saved hypotheses to generated_translations/M2M-100_nso_hypotheses.txt

Generating M2M-100 translations for Xitsonga (tso_Latn)...
  -> Initializing Model: facebook/m2m100_418M for tso_Latn
[M2M-100] Unsupported language code for tokenizer: ts. Skipping inference.
   -> Saved hypotheses to generated_translations/M2M-100_tso_hypotheses.txt

Generating M2M-100 translations for isiZulu (zul_Latn)...
  -> Initializing Model: facebook/m2m100_418M for zul_Latn


Translating zul_Latn:   0%|          | 0/64 [00:00<?, ?it/s]

   -> Saved hypotheses to generated_translations/M2M-100_zul_hypotheses.txt

--- M2M-100 Inference Complete ---


**Comparative Performance Metrics**

In [None]:
# (model name, language name, reference type) -> metric dict, filled by the
# evaluation loop.
evaluation_results = {}


# Everything the evaluation needs must already exist from earlier cells.
required_vars = [
    'all_hypotheses_nllb',
    'all_hypotheses_m2m',
    'corrected_references_storage',
    'uncorrected_references_storage',
    'TARGET_LANGS',
    'MODEL_CONFIGS',
    'accelerator',
    'source_texts',
]
missing = [name for name in required_vars if name not in globals()]
if missing:
    raise NameError(f"Missing required variables: {missing}. Please run earlier setup and inference cells first.")


print("\n--- Initializing Evaluation Metrics ---")
# Hugging Face `evaluate` metrics.
bleu_metric = load_metric("bleu")
chrf_metric = load_metric("chrf")
bertscore_metric = load_metric("bertscore")

# COMET: fetch the checkpoint, load it, then hand it to Accelerate.
comet_model_path = download_model("Unbabel/wmt22-comet-da")
comet_metric = load_from_checkpoint(comet_model_path)
comet_metric = accelerator.prepare(comet_metric)

print("--- Metric Initialization Complete ---")



def evaluate_translations(hypotheses, references, lang_name, lang_3_code, reference_type):
    """Score one set of translations against one set of references.

    Args:
        hypotheses: Model translations, one string per source sentence.
        references: Reference translations aligned with ``hypotheses``.
        lang_name: Display name of the target language (used in log output).
        lang_3_code: Three-letter language code; also passed to BERTScore as
            its ``lang`` argument.
        reference_type: Label for the reference set (used in log output).

    Returns:
        Dict with keys "BLEU" (NLTK corpus BLEU), "ChrF++", "BERTScore"
        (mean F1) and "COMET". BERTScore and COMET are ``None`` when their
        computation fails.

    Raises:
        ValueError: If hypotheses, references and the global ``source_texts``
            cannot be aligned sentence by sentence.
    """
    print(f"\n  -> Evaluating {lang_name} ({lang_3_code}) against {reference_type} references...")

    # Every metric below pairs item i of each list, so check alignment first
    # instead of failing later inside a metric or silently truncating.
    if len(hypotheses) != len(references):
        raise ValueError(
            f"Cannot evaluate {lang_name}: {len(hypotheses)} hypotheses vs {len(references)} references."
        )
    if len(source_texts) < len(hypotheses):
        raise ValueError(
            f"Cannot evaluate {lang_name}: {len(hypotheses)} hypotheses but only {len(source_texts)} source sentences."
        )

    # NLTK corpus_bleu expects, per sentence, a list of tokenized references.
    tokenized_references = [[word_tokenize(sent)] for sent in references]
    tokenized_hypotheses = [word_tokenize(sent) for sent in hypotheses]
    bleu_score = corpus_bleu(tokenized_references, tokenized_hypotheses)

    # The Hugging Face metrics take a list of references per prediction.
    hf_references = [[ref] for ref in references]

    # word_order=2 selects chrF++ (adds word n-grams); the default of 0 is
    # plain chrF, which would not match the "ChrF++" label reported below.
    chrf_score = chrf_metric.compute(
        predictions=hypotheses, references=hf_references, word_order=2
    )['score']

    # BERTScore is best-effort: a failure is reported and recorded as None.
    try:
        bertscore_results = bertscore_metric.compute(predictions=hypotheses, references=hf_references, lang=lang_3_code)
        bertscore_f1 = np.mean(bertscore_results['f1'])  # Average F1 score
    except Exception as e:
        print(f"     [BERTScore ERROR] Could not compute for {lang_3_code}: {e}")
        bertscore_f1 = None

    # COMET needs source, hypothesis and reference for every segment.
    comet_data = [
        {"src": src, "mt": mt, "ref": ref}
        for src, mt, ref in zip(source_texts, hypotheses, references)
    ]

    # Score everything in one predict() call and let COMET batch internally.
    # Scoring slice by slice and skipping failed slices would average over an
    # arbitrary subset and present it as the corpus score.
    COMET_BATCH_SIZE = 16
    print(f"     Calculating COMET scores in batches (size {COMET_BATCH_SIZE})...")
    try:
        comet_output = comet_metric.predict(comet_data, batch_size=COMET_BATCH_SIZE)
        comet_scores = list(comet_output.scores)
        comet_score = np.mean(comet_scores) if comet_scores else None
    except Exception as e:
        print(f"     [COMET ERROR] Scoring failed: {e}")
        comet_score = None

    return {
        "BLEU": bleu_score,
        "ChrF++": chrf_score,
        "BERTScore": bertscore_f1,
        "COMET": comet_score
    }



--- Initializing Evaluation Metrics ---


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

checkpoints/model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

LICENSE: 0.00B [00:00, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.6. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.12/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


--- Metric Initialization Complete ---


**Evaluation Loop**

In [None]:
# Evaluate NLLB-200
for (lang_name, lang_3_code), hypotheses in all_hypotheses_nllb.items():
    if not hypotheses:
        print(f"\nSkipping evaluation for NLLB-200 {lang_name} ({lang_3_code}) due to no hypotheses.")
        continue

    corrected_refs = corrected_references_storage.get(lang_3_code)
    if corrected_refs:
        nllb_corrected_results = evaluate_translations(hypotheses, corrected_refs, lang_name, lang_3_code, "Corrected")
        evaluation_results[("NLLB-200", lang_name, "Corrected")] = nllb_corrected_results
    else:
        print(f"     [SKIP] No corrected references found for {lang_name} ({lang_3_code}).")

    uncorrected_refs = uncorrected_references_storage.get(lang_3_code)
    if uncorrected_refs:
        nllb_uncorrected_results = evaluate_translations(hypotheses, uncorrected_refs, lang_name, lang_3_code, "Uncorrected")
        evaluation_results[("NLLB-200", lang_name, "Uncorrected")] = nllb_uncorrected_results
    else:
         print(f"     [SKIP] No uncorrected references found for {lang_name} ({lang_3_code}).")


# Evaluate M2M-100
for (lang_name, lang_3_code), hypotheses in all_hypotheses_m2m.items():
    if not hypotheses:
        print(f"\nSkipping evaluation for M2M-100 {lang_name} ({lang_3_code}) due to no hypotheses.")
        continue

    corrected_refs = corrected_references_storage.get(lang_3_code)
    if corrected_refs:
        m2m_corrected_results = evaluate_translations(hypotheses, corrected_refs, lang_name, lang_3_code, "Corrected")
        evaluation_results[("M2M-100", lang_name, "Corrected")] = m2m_corrected_results
    else:
        print(f"     [SKIP] No corrected references found for {lang_name} ({lang_3_code}).")


    uncorrected_refs = uncorrected_references_storage.get(lang_3_code)
    if uncorrected_refs:
        m2m_uncorrected_results = evaluate_translations(hypotheses, uncorrected_refs, lang_name, lang_3_code, "Uncorrected")
        evaluation_results[("M2M-100", lang_name, "Uncorrected")] = m2m_uncorrected_results
    else:
        print(f"     [SKIP] No uncorrected references found for {lang_name} ({lang_3_code}).")


print("\n--- Evaluation Complete ---")


  -> Evaluating Hausa (hau) against Corrected references...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:03<00:00,  1.20it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


  -> Evaluating Hausa (hau) against Uncorrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:03<00:00,  1.16it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


  -> Evaluating Sepedi (nso) against Corrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:03<00:00,  1.25it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


  -> Evaluating Sepedi (nso) against Uncorrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:03<00:00,  1.26it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


  -> Evaluating Xitsonga (tso) against Corrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:02<00:00,  1.41it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


  -> Evaluating Xitsonga (tso) against Uncorrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:02<00:00,  1.37it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


  -> Evaluating isiZulu (zul) against Corrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:02<00:00,  1.69it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


  -> Evaluating isiZulu (zul) against Uncorrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:02<00:00,  1.71it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


  -> Evaluating Hausa (hau) against Corrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:03<00:00,  1.29it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


  -> Evaluating Hausa (hau) against Uncorrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:03<00:00,  1.30it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


  -> Evaluating Sepedi (nso) against Corrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:03<00:00,  1.24it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


  -> Evaluating Sepedi (nso) against Uncorrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:03<00:00,  1.23it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


Skipping evaluation for M2M-100 Xitsonga (tso) due to no hypotheses.

  -> Evaluating isiZulu (zul) against Corrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:02<00:00,  1.40it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


  -> Evaluating isiZulu (zul) against Uncorrected references...
     Calculating COMET scores in batches (size 64)...


COMET batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:02<00:00,  1.41it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


--- Evaluation Complete ---


In [None]:

# Flatten the {(model, language, reference type): metrics} mapping produced by
# the evaluation cell into one row per combination.
all_results = []
for (model, lang, ref_type), results in evaluation_results.items():
    row = {
        'Model': model,
        'Language': lang,
        'Reference_Type': ref_type + '_Ref',  # Standardize names
    }
    # Metrics that were never computed for a combination show up as None.
    for metric_name in ('BLEU', 'ChrF++', 'BERTScore', 'COMET'):
        row[metric_name] = results.get(metric_name)
    all_results.append(row)

final_df = pd.DataFrame(all_results)

print("## QUANTITATIVE PERFORMANCE EVALUATION RESULTS")
print(final_df.to_markdown(index=False))

print("\n--- Ranking Comparison (Based on COMET Score) ---")

ranking_columns = ['Language', 'Model', 'COMET']

# Ranking against the corrected references (highest COMET first).
df_corrected = final_df.loc[final_df['Reference_Type'] == 'Corrected_Ref']
df_corrected = df_corrected.sort_values(by='COMET', ascending=False)
print("\nModel Ranking using CORRECTED References:")
corrected_ranking = df_corrected[ranking_columns].reset_index(drop=True)
print(corrected_ranking.to_markdown(index=False))

# Ranking against the uncorrected references, which serve as the baseline.
df_uncorrected = final_df.loc[final_df['Reference_Type'] == 'Uncorrected_Ref']
df_uncorrected = df_uncorrected.sort_values(by='COMET', ascending=False)
print("\nModel Ranking using UNCORRECTED References (Baseline):")
uncorrected_ranking = df_uncorrected[ranking_columns].reset_index(drop=True)
print(uncorrected_ranking.to_markdown(index=False))

## QUANTITATIVE PERFORMANCE EVALUATION RESULTS
| Model    | Language   | Reference_Type   |       BLEU |   ChrF++ |   BERTScore |    COMET |
|:---------|:-----------|:-----------------|-----------:|---------:|------------:|---------:|
| NLLB-200 | Hausa      | Corrected_Ref    | 0.214355   | 49.497   |    0.815747 | 0.790905 |
| NLLB-200 | Hausa      | Uncorrected_Ref  | 0.217065   | 49.7409  |    0.817011 | 0.791232 |
| NLLB-200 | Sepedi     | Corrected_Ref    | 0.213224   | 50.2895  |    0.811823 | 0.660231 |
| NLLB-200 | Sepedi     | Uncorrected_Ref  | 0.210668   | 50.0866  |    0.81085  | 0.65918  |
| NLLB-200 | Xitsonga   | Corrected_Ref    | 0.205422   | 50.844   |    0.805497 | 0.652514 |
| NLLB-200 | Xitsonga   | Uncorrected_Ref  | 0.20325    | 50.7276  |    0.805262 | 0.652121 |
| NLLB-200 | isiZulu    | Corrected_Ref    | 0.160361   | 55.3164  |    0.831104 | 0.766966 |
| NLLB-200 | isiZulu    | Uncorrected_Ref  | 0.152397   | 54.5693  |    0.828272 | 0.764054 |
| M2M-100  | 

**Correction Magnitude Results**

In [None]:
print("\n--- Correction Magnitude Analysis (Difference between Corrected and Uncorrected References) ---")

# Per-language summary of how much the corrected references differ from the
# uncorrected ones, keyed by language name.
correction_magnitude_results = {}

# Check for required globals
required_vars = ['corrected_references_storage', 'uncorrected_references_storage', 'TARGET_LANGS', 'bleu_metric', 'chrf_metric']
missing = [v for v in required_vars if v not in globals()]
if missing:
    raise NameError(f"Missing required variables for Correction Magnitude Analysis: {missing}. Please run earlier setup and evaluation cells first.")


for lang_name, lang_code in TARGET_LANGS.items():
    # The reference stores are keyed by the part of the code before the
    # underscore (e.g. "hau" for "hau_Latn").
    lang_3_code = lang_code.split('_')[0]
    print(f"\nAnalyzing correction magnitude for {lang_name} ({lang_code})...")

    corrected_refs = corrected_references_storage.get(lang_3_code)
    uncorrected_refs = uncorrected_references_storage.get(lang_3_code)

    if not corrected_refs or not uncorrected_refs:
        print(f"   [SKIP] Missing corrected or uncorrected references for {lang_name} ({lang_3_code}).")
        continue

    # Every statistic below pairs sentence i of one list with sentence i of the
    # other; with unequal lengths corpus_bleu and the sentence comparison would
    # fail, so skip the language with a clear message instead.
    if len(corrected_refs) != len(uncorrected_refs):
        print(f"   [SKIP] Reference length mismatch for {lang_name} ({lang_3_code}): {len(corrected_refs)} corrected vs. {len(uncorrected_refs)} uncorrected sentences.")
        continue

    # Tokenize each sentence exactly once; the token lists feed both the BLEU
    # computation and the token counts.
    uncorrected_token_lists = [word_tokenize(sent) for sent in uncorrected_refs]
    tokenized_corrected_refs = [word_tokenize(sent) for sent in corrected_refs]
    # corpus_bleu expects a list of reference lists per hypothesis.
    tokenized_uncorrected_refs = [[tokens] for tokens in uncorrected_token_lists]

    # NOTE: despite the "Difference" label, this is the BLEU score of the
    # corrected text scored against the uncorrected text, so identical
    # references give the maximum score and heavier edits give lower values.
    bleu_diff = corpus_bleu(tokenized_uncorrected_refs, tokenized_corrected_refs)

    # Same idea with the chrF metric: corrected text is scored against the
    # uncorrected text as its single reference.
    hf_uncorrected_refs = [[ref] for ref in uncorrected_refs]
    chrf_diff_result = chrf_metric.compute(predictions=corrected_refs, references=hf_uncorrected_refs)
    chrf_diff = chrf_diff_result['score'] if chrf_diff_result and 'score' in chrf_diff_result else None

    # Sentence-level change statistics. total_sentences is non-zero here
    # because empty reference lists were skipped above.
    total_sentences = len(uncorrected_refs)
    sentences_corrected = sum(old != new for old, new in zip(uncorrected_refs, corrected_refs))
    percentage_changed = (sentences_corrected / total_sentences) * 100

    # Corpus-level token counts; the divergence is the absolute difference of
    # the two totals, not a per-sentence edit distance.
    original_tokens = sum(len(tokens) for tokens in uncorrected_token_lists)
    corrected_tokens = sum(len(tokens) for tokens in tokenized_corrected_refs)
    token_divergence = abs(original_tokens - corrected_tokens)

    correction_magnitude_results[lang_name] = {
        "Total Sentences": total_sentences,
        "Sents Corrected": sentences_corrected,
        "% Changed": percentage_changed,
        "Original Tokens": original_tokens,
        "Corrected Tokens": corrected_tokens,
        "Token Divergence": token_divergence,
        "BLEU_Difference": bleu_diff,
        "ChrF++_Difference": chrf_diff,
    }

# Display correction magnitude results
print("\n--- Correction Magnitude Results ---")
print("| Language | Total Sentences | Sents Corrected | % Changed | Original Tokens | Corrected Tokens | Token Divergence | BLEU Difference (Corrected vs. Uncorrected) | ChrF++ Difference (Corrected vs. Uncorrected) |")
print("|----------|-----------------|-----------------|-----------|-----------------|------------------|------------------|---------------------------------------------|-----------------------------------------------|")

for lang, results in correction_magnitude_results.items():
    # Scores may be None (e.g. the chrF result lacked a 'score' key); render those as "N/A".
    bleu_diff = f"{results['BLEU_Difference']:.4f}" if results.get('BLEU_Difference') is not None else "N/A"
    chrf_diff = f"{results['ChrF++_Difference']:.4f}" if results.get('ChrF++_Difference') is not None else "N/A"
    total_sentences = results.get("Total Sentences", "N/A")
    sentences_corrected = results.get("Sents Corrected", "N/A")
    percentage_changed = f"{results.get('% Changed', 0):.2f}%" if results.get('% Changed') is not None else "N/A"
    original_tokens = results.get("Original Tokens", "N/A")
    corrected_tokens = results.get("Corrected Tokens", "N/A")
    token_divergence = results.get("Token Divergence", "N/A")

    print(f"| {lang} | {total_sentences} | {sentences_corrected} | {percentage_changed} | {original_tokens} | {corrected_tokens} | {token_divergence} | {bleu_diff} | {chrf_diff} |")

print("\n--- Correction Magnitude Analysis Complete ---")


--- Correction Magnitude Analysis (Difference between Corrected and Uncorrected References) ---

Analyzing correction magnitude for Hausa (hau_Latn)...

Analyzing correction magnitude for Sepedi (nso_Latn)...

Analyzing correction magnitude for Xitsonga (tso_Latn)...

Analyzing correction magnitude for isiZulu (zul_Latn)...

--- Correction Magnitude Results ---
| Language | Total Sentences | Sents Corrected | % Changed | Original Tokens | Corrected Tokens | Token Divergence | BLEU Difference (Corrected vs. Uncorrected) | ChrF++ Difference (Corrected vs. Uncorrected) |
|----------|-----------------|-----------------|-----------|-----------------|------------------|------------------|---------------------------------------------|-----------------------------------------------|
| Hausa | 1012 | 68 | 6.72% | 28217 | 28189 | 28 | 0.9696 | 97.9065 |
| Sepedi | 1012 | 61 | 6.03% | 31177 | 31200 | 23 | 0.9811 | 98.9459 |
| Xitsonga | 1012 | 64 | 6.32% | 30228 | 30256 | 28 | 0.9753 | 98.5539 |

## Correction Magnitude Analysis Results

| Language   |   Total Sentences |   Sents Corrected | % Changed   |   Original Tokens |   Corrected Tokens |   Token Divergence | BLEU Difference (Corrected vs. Uncorrected)   | ChrF++ Difference (Corrected vs. Uncorrected)   |
|:-----------|------------------:|------------------:|:------------|------------------:|-------------------:|-------------------:|:----------------------------------------------|:------------------------------------------------|
| Hausa      |              1012 |                68 | 6.72%       |             28217 |              28189 |                 28 | 0.9696                                        | 97.9065                                         |
| Sepedi     |              1012 |                61 | 6.03%       |             31177 |              31200 |                 23 | 0.9811                                        | 98.9459                                         |
| Xitsonga   |              1012 |                64 | 6.32%       |             30228 |              30256 |                 28 | 0.9753                                        | 98.5539                                         |
| isiZulu    |              1012 |               225 | 22.23%      |             18534 |              18516 |                 18 | 0.9300                                        | 96.3023                                         |

**Diagnostic Error Sampling and Qualitative Analysis**

In [None]:
print("\n--- Identifying Samples for Diagnostic Error Sampling ---")

# Fail fast when an upstream cell (setup, inference, evaluation) has not been
# run yet: everything listed here is read from the notebook's global namespace
# by the diagnostic code that follows.
required_vars = [
    'all_hypotheses_nllb',
    'source_texts',
    'corrected_references_storage',
    'uncorrected_references_storage',
    'TARGET_LANGS',
    'comet_metric',
    'accelerator',
]
missing = [v for v in required_vars if v not in globals()]
if len(missing) > 0:
    raise NameError(
        f"Missing required variables for Diagnostic Error Sampling: {missing}. "
        "Please run earlier setup, inference, and evaluation cells first."
    )

# Helper: score one list of COMET inputs batch by batch
def _comet_batch_scores(comet_data, comet_metric, accelerator, batch_size, label):
    """Return one COMET score per item of ``comet_data``, scored in batches.

    A batch whose scoring raises is reported on stdout and contributes ``None``
    for each of its items, so the result stays index-aligned with the input.
    ``label`` only appears in the progress-bar description and error message.
    """
    scores = []
    for i in tqdm(range(0, len(comet_data), batch_size), desc=f"COMET ({label}) batches"):
        batch_data = comet_data[i:i + batch_size]
        try:
            # NOTE(review): batch_data is a plain list of dicts; presumably
            # accelerator.prepare hands it back unchanged — confirm.
            batch_data = accelerator.prepare(batch_data)
            batch_results = comet_metric.predict(batch_data)
            scores.extend(batch_results.scores)
        except Exception as e:
            # Best effort: keep going with the remaining batches.
            print(f"     [COMET ERROR - {label}] Batch {i//batch_size} failed: {e}")
            scores.extend([None] * len(batch_data))  # Add None for failed batches
    return scores


# Function to calculate sentence-level COMET scores and their difference
def calculate_comet_diff(model_hypotheses, source_texts, corrected_refs, uncorrected_refs, comet_metric, accelerator):
    """Calculates sentence-level COMET scores for corrected and uncorrected references and their difference.

    Args:
        model_hypotheses: Model translations, one per sentence.
        source_texts: Source sentences, index-aligned with the hypotheses.
        corrected_refs: Corrected reference translations, same length as the hypotheses.
        uncorrected_refs: Uncorrected reference translations, same length as the hypotheses.
        comet_metric: Object whose ``predict(batch)`` result exposes ``.scores``.
        accelerator: Object whose ``prepare(batch)`` is applied to each batch before scoring.

    Returns:
        A 3-tuple ``(comet_diffs, corrected_scores, uncorrected_scores)`` of
        index-aligned lists. ``comet_diffs[i]`` is corrected minus uncorrected,
        or ``None`` when either score is missing because its batch failed.
        When inputs are missing or their lengths do not line up, a skip message
        is printed and three empty lists are returned, so callers can always
        unpack three values.
    """
    sentence_count = len(model_hypotheses) if model_hypotheses else 0
    inputs_unusable = (
        not model_hypotheses
        or not corrected_refs
        or not uncorrected_refs
        or len(corrected_refs) != sentence_count
        or len(uncorrected_refs) != sentence_count
        # Too few sources would otherwise raise IndexError while pairing sentences.
        or not source_texts
        or len(source_texts) < sentence_count
    )
    if inputs_unusable:
        print("   [SKIP] Data length mismatch or missing data for COMET difference calculation.")
        return [], [], []

    # zip stops at the hypotheses' length; any surplus source sentences are ignored.
    comet_data_corrected = [
        {"src": src, "mt": mt, "ref": ref}
        for src, mt, ref in zip(source_texts, model_hypotheses, corrected_refs)
    ]
    comet_data_uncorrected = [
        {"src": src, "mt": mt, "ref": ref}
        for src, mt, ref in zip(source_texts, model_hypotheses, uncorrected_refs)
    ]

    COMET_BATCH_SIZE = 64  # Adjust based on memory

    print(f"     Calculating sentence-level COMET scores (size {COMET_BATCH_SIZE})...")

    corrected_scores = _comet_batch_scores(
        comet_data_corrected, comet_metric, accelerator, COMET_BATCH_SIZE, "Corrected"
    )
    uncorrected_scores = _comet_batch_scores(
        comet_data_uncorrected, comet_metric, accelerator, COMET_BATCH_SIZE, "Uncorrected"
    )

    # Calculate the difference (Corrected - Uncorrected)
    comet_diffs = [
        (new_score - old_score) if new_score is not None and old_score is not None else None
        for new_score, old_score in zip(corrected_scores, uncorrected_scores)
    ]

    return comet_diffs, corrected_scores, uncorrected_scores

# Run the sentence-level COMET comparison for every NLLB-200 target language.
# Both result dicts are keyed by language name; skipped languages get empty
# placeholders.
nllb_comet_diffs = {}
nllb_sentence_scores = {}  # language -> (corrected scores, uncorrected scores)

for lang_name, lang_code in TARGET_LANGS.items():
    lang_3_code = lang_code.split('_')[0]
    print(f"\nCalculating sentence-level COMET difference for NLLB-200 {lang_name} ({lang_3_code})...")

    corrected_refs = corrected_references_storage.get(lang_3_code)
    uncorrected_refs = uncorrected_references_storage.get(lang_3_code)

    # Hypotheses are stored under a (language name, 3-letter code) key.
    hypotheses_key = (lang_name, lang_3_code)

    # Work out whether this language has to be skipped; missing hypotheses are
    # reported in preference to missing references.
    skip_message = None
    if not all_hypotheses_nllb.get(hypotheses_key):
        skip_message = f"   [SKIP] No hypotheses found for NLLB-200 {lang_name}. Cannot calculate COMET differences."
    elif not (corrected_refs and uncorrected_refs):
        skip_message = f"   [SKIP] Missing reference data for {lang_name}. Cannot calculate COMET differences."

    if skip_message is not None:
        print(skip_message)
        nllb_comet_diffs[lang_name] = []
        nllb_sentence_scores[lang_name] = ([], [])  # Store empty lists
        continue

    comet_diffs, corrected_scores, uncorrected_scores = calculate_comet_diff(
        model_hypotheses=all_hypotheses_nllb[hypotheses_key],
        source_texts=source_texts,
        corrected_refs=corrected_refs,
        uncorrected_refs=uncorrected_refs,
        comet_metric=comet_metric,
        accelerator=accelerator,
    )

    nllb_sentence_scores[lang_name] = (corrected_scores, uncorrected_scores)
    nllb_comet_diffs[lang_name] = comet_diffs




--- Identifying Samples for Diagnostic Error Sampling ---

Calculating sentence-level COMET difference for NLLB-200 Hausa (hau)...
     Calculating sentence-level COMET scores (size 64)...


COMET (Corrected) batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:03<00:00,  1.22it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light

COMET (Uncorrected) batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:03<00:00,  1.17it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


Calculating sentence-level COMET difference for NLLB-200 Sepedi (nso)...
     Calculating sentence-level COMET scores (size 64)...


COMET (Corrected) batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:03<00:00,  1.27it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light

COMET (Uncorrected) batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:03<00:00,  1.24it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


Calculating sentence-level COMET difference for NLLB-200 Xitsonga (tso)...
     Calculating sentence-level COMET scores (size 64)...


COMET (Corrected) batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:02<00:00,  1.43it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light

COMET (Uncorrected) batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:02<00:00,  1.40it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light


Calculating sentence-level COMET difference for NLLB-200 isiZulu (zul)...
     Calculating sentence-level COMET scores (size 64)...


COMET (Corrected) batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:02<00:00,  1.72it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light

COMET (Uncorrected) batches:   0%|          | 0/16 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 4/4 [00:02<00:00,  1.67it/s]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_light

**Diagnostic Error Sampling Complete**

In [None]:
# Identify and display samples with largest positive COMET_DIFF
print("\n--- Samples with Largest Positive COMET_DIFF (NLLB-200) ---")

NUM_SAMPLES_TO_SHOW = 5

for lang_name, diffs in nllb_comet_diffs.items():
    if not diffs:
        print(f"\nNo COMET differences calculated for {lang_name}. Skipping sample analysis.")
        continue

    print(f"\nLanguage: {lang_name}")

    # Pair each usable difference with its sentence index; None marks a
    # sentence without a usable difference.
    valid_diffs = [(diff, i) for i, diff in enumerate(diffs) if diff is not None]

    if not valid_diffs:
        print(f"   No valid COMET differences to analyze for {lang_name}.")
        continue

    # Only strictly positive differences (corrected reference scored higher
    # than the uncorrected one) belong in this listing; without this filter,
    # zero or negative differences could be shown under the "Largest Positive"
    # heading and the message below could never be reached.
    positive_diffs = [(diff, i) for diff, i in valid_diffs if diff > 0]
    top_diffs = sorted(positive_diffs, key=lambda x: x[0], reverse=True)[:NUM_SAMPLES_TO_SHOW]

    if not top_diffs:
        print(f"   No positive COMET differences found for {lang_name}.")
        continue

    lang_3_code = TARGET_LANGS[lang_name].split('_')[0]

    # Look up the per-language data once instead of once per printed sample.
    hypotheses = all_hypotheses_nllb[(lang_name, lang_3_code)]
    corrected_scores, uncorrected_scores = nllb_sentence_scores[lang_name]
    old_refs = uncorrected_references_storage[lang_3_code]
    new_refs = corrected_references_storage[lang_3_code]

    for diff, index in top_diffs:
        # Sample numbers are 1-based for readability.
        print(f"\nSample {index + 1} (COMET_DIFF: {diff:.4f}):")
        print(f"  Source: {source_texts[index]}")
        print(f"  MT Output (NLLB-200): {hypotheses[index]}")
        print(f"  OLD Ref (Score: {uncorrected_scores[index]:.4f}): {old_refs[index]}")
        print(f"  NEW Ref (Score: {corrected_scores[index]:.4f}): {new_refs[index]}")
        print("  Analysis: (Add your analysis here - e.g., grammatical error, morphological issue in old ref)")


print("\n--- Diagnostic Error Sampling Complete ---")


--- Samples with Largest Positive COMET_DIFF (NLLB-200) ---

Language: Hausa

Sample 474 (COMET_DIFF: 0.1191):
  Source: The mob of people forced the King And Queen to have their carriage windows wide open.
  MT Output (NLLB-200): Jama'a sun tilasta wa Sarki da Sarauniya su buɗe windows na karusa.
  OLD Ref (Score: 0.5936): Abokai masu nisa, ba tare da wayar tarho ba, wayar tauraron dan adam zata iya zama mai zaɓar.
  NEW Ref (Score: 0.7127): Tarin mutanen ya tilastawa sarkin da sarauniyar bude tagogin abun hawan nasu.
  Analysis: (Add your analysis here - e.g., grammatical error, morphological issue in old ref)

Sample 1010 (COMET_DIFF: 0.0955):
  Source: Suits are standard business attire, and coworkers call each other by their family names or by job titles.
  MT Output (NLLB-200): Suits ne na kasuwanci, kuma abokan aiki suna kiran juna da sunayensu ko kuma sunayen aiki.
  OLD Ref (Score: 0.6557): Kaya masu kala ɗaya su ne cikakkun tufafin mu’amala, kuma abokan aiki kan kira junansu