# IN22-Gen: Evaluation Notebook (eng → 16 Indic languages)

This notebook loads two JSONL files:
1. `predictions.jsonl` — each line: `{"id": ..., "english_og": ..., "translations": {"hi": "...", "bn": "...", ...}}`
2. `ground_truths.jsonl` — ai4bharat/IN22-Gen style rows (one JSON object per line) with columns such as `hin_Deva`, `ben_Beng`, etc. and an `id` field.

It will merge both files on `id` and compute the following metrics per language:
- chrF (character n-gram F-score)
- chrF++ (character+word n-gram variant)
- sacreBLEU (corpus BLEU via sacrebleu)
- COMET (optional; requires the `unbabel-comet` package, imported as `comet`, and a model checkpoint)

**Output:** a CSV and a printed DataFrame with per-language metrics and counts.

----

Notes:
- The notebook tries to compute COMET if the `comet` package is available and a model checkpoint is loadable. If not, COMET is skipped and other metrics are computed.
- You can change input filenames in the next cell.


In [None]:

!pip install sacrebleu pandas tqdm numpy
!pip install git+https://github.com/Unbabel/comet.git

print('If you need to install dependencies, uncomment the pip install lines in this cell and run it.')

In [None]:
import json
from pathlib import Path
from collections import defaultdict
import pandas as pd
import numpy as np
from tqdm import tqdm
import sacrebleu
from sacrebleu.metrics import CHRF
import warnings
warnings.filterwarnings('ignore')


# --- Input / output locations ------------------------------------------------
# Reference translations: one JSON object per line, matched to predictions by `id`.
GROUND_TRUTHS_PATH = '/nfs/bds/translation_benchmark/datasets/IN22-Gen/data/train-00000-of-00001_with_ids.jsonl'
# System translations to be scored.
PREDICTIONS_PATH = '/nfs/home/bhargav.patel/bhargav/MT/ds_IN22_enhanced_trans.jsonl'
# Where the per-language metrics table is written.
OUTPUT_CSV = '/nfs/bds/translation_benchmark/INT2/in22_eval_results.csv'

# Key used under "translations" in the predictions file -> reference column
# name in the ground-truth file. Insertion order is the evaluation order.
LANG_CODE_MAP = {
    'as': 'asm_Beng',
    'bn': 'ben_Beng',
    'gu': 'guj_Gujr',
    'hi': 'hin_Deva',
    'kn': 'kan_Knda',
    'mai': 'mai_Deva',
    'ml': 'mal_Mlym',
    'mr': 'mar_Deva',
    'ne': 'npi_Deva',
    'or': 'ory_Orya',
    'pa': 'pan_Guru',
    'sa': 'san_Deva',
    # NOTE(review): 'sdd' is an unusual key for Sindhi -- confirm it matches the
    # key actually present in the predictions file.
    'sdd': 'snd_Deva',
    'ta': 'tam_Taml',
    'te': 'tel_Telu',
    'ur': 'urd_Arab',
}

# Flat views of the mapping, in the same order as the mapping itself.
PRED_CODES = [*LANG_CODE_MAP]
GT_COLS = [*LANG_CODE_MAP.values()]

print('Prediction codes:', PRED_CODES)
print('Ground-truth columns:', GT_COLS)


def read_jsonl(path):
    """Load a JSON Lines file into a list of parsed objects.

    Blank lines are ignored. A line that is not valid JSON is reported with a
    warning on stdout and skipped, so a single corrupt record does not abort
    the whole load.

    Args:
        path: Location of the JSONL file (anything ``pathlib.Path`` accepts).

    Returns:
        list: One parsed JSON value per valid line, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"File not found: {path}")

    data = []
    # 'utf-8-sig' drops a leading byte-order mark if one is present. With plain
    # 'utf-8' the BOM stays attached to the first line, json.loads() rejects
    # it, and the first record is lost with nothing but a printed warning.
    with p.open('r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except ValueError as e:  # json.JSONDecodeError is a ValueError
                print(f'Warning: skipping invalid JSON line: {e} -- line starts with: {line[:80]}')
    return data

# Read both inputs; each becomes a list with one parsed record per JSONL line.
print('Loading prediction file...')
pred_list = read_jsonl(PREDICTIONS_PATH)
print(f'Loaded {len(pred_list)} prediction rows')

print('Loading ground truth file...')
gt_list = read_jsonl(GROUND_TRUTHS_PATH)
print(f'Loaded {len(gt_list)} ground truth rows')

# Index every record by its `id`. If an id repeats, the last record wins.
pred_by_id = {record['id']: record for record in pred_list}
gt_by_id = {record['id']: record for record in gt_list}

# Only ids present on both sides can be scored; sorting fixes the order.
common_ids = sorted(pred_by_id.keys() & gt_by_id.keys())
print(f'Found {len(common_ids)} common IDs between predictions and ground-truths')

# Nothing to evaluate without at least one shared id.
if not common_ids:
    raise RuntimeError('No overlapping ids found. Check your input files.')

# Per-language metric rows; filled in by the evaluation loop.
results = []

# COMET is optional: load it when possible, otherwise continue with the
# string-based metrics only.
comet_available = False
comet_model = None
try:
    from comet import download_model, load_from_checkpoint
except Exception as e:
    # Deliberately broad: a broken installation can fail with more than
    # ImportError, and COMET must never stop the rest of the evaluation.
    print('COMET package not available. To enable COMET, install Unbabel comET: pip install git+https://github.com/Unbabel/comet.git')
    print(f'Reason: {e}')
else:
    try:
        # load_from_checkpoint() expects the path of a checkpoint file, not a
        # model name. download_model() resolves the model name to that local
        # path (fetching the checkpoint on first use).
        comet_checkpoint_path = download_model("Unbabel/wmt22-comet-da")
        comet_model = load_from_checkpoint(comet_checkpoint_path)
        comet_available = True
        print('Loaded COMET model wmt22-comet-da')
    except Exception as e:
        print('COMET package found but could not load default checkpoint. COMET will be skipped unless you load a model manually.')
        # Report the cause so a missing network or cache is distinguishable
        # from a real incompatibility.
        print(f'Reason: {e}')
        comet_model = None
        comet_available = False


def _try_metric(label, compute):
    """Run one metric; on failure print the error and return None.

    Keeps a failure in one metric from discarding the others for a language.
    """
    try:
        return compute()
    except Exception as e:
        print(f'Error computing {label}:', e)
        return None


def _comet_corpus_score(output):
    """Return the mean segment-level COMET score from a ``predict()`` result.

    Recent COMET releases return an object exposing the segment scores as
    ``scores``; older releases returned a ``(segment_scores, system_score)``
    tuple. Averaging the returned object itself is not meaningful, so the
    segment scores are extracted first.
    """
    segment_scores = getattr(output, 'scores', None)
    if segment_scores is None and isinstance(output, dict):
        segment_scores = output.get('scores')
    if segment_scores is None and isinstance(output, tuple):
        segment_scores = output[0]
    if segment_scores is None:
        # Unknown shape: assume the object is already a sequence of scores.
        segment_scores = output
    return float(np.mean(segment_scores))


# Evaluate each language independently and append one row per language to `results`.
for pred_code, gt_col in LANG_CODE_MAP.items():
    system_outputs = []
    references = []
    srcs = []

    for _id in common_ids:
        p = pred_by_id[_id]
        g = gt_by_id[_id]

        # `translations` may be missing or null in a prediction record.
        translations = p.get('translations', {}) or {}
        pred_text = translations.get(pred_code)
        ref_text = g.get(gt_col)

        # Score only ids that have both a hypothesis and a reference.
        if pred_text is None or pred_text == '':
            continue
        if ref_text is None or ref_text == '':
            continue

        system_outputs.append(pred_text)
        references.append(ref_text)
        # The English source is only needed by COMET; try the known key names.
        srcs.append(p.get('english_og') or p.get('eng') or p.get('source'))

    n = len(system_outputs)
    if n == 0:
        print(f'No valid pairs for {pred_code} -> {gt_col}. Skipping.')
        results.append({
            'pred_code': pred_code,
            'gt_col': gt_col,
            'n': 0,
            'chrF': None,
            'chrF++': None,
            'BLEU': None,
            'COMET': None
        })
        continue

    # chrF scores character n-grams only (word_order=0); chrF++ additionally
    # scores word n-grams up to order 2.
    chrf_score = _try_metric(
        'chrF',
        lambda: CHRF(word_order=0, char_order=6).corpus_score(system_outputs, [references]).score,
    )
    chrfpp_score = _try_metric(
        'chrF++',
        lambda: CHRF(word_order=2, char_order=6).corpus_score(system_outputs, [references]).score,
    )
    bleu = _try_metric(
        'BLEU',
        lambda: sacrebleu.corpus_bleu(system_outputs, [references]).score,
    )

    comet_score_avg = None
    if comet_available and comet_model is not None:
        try:
            # COMET needs source, hypothesis and reference for every segment;
            # a missing source is passed as an empty string.
            data_for_comet = [
                {'src': s if s is not None else '', 'mt': mt, 'ref': ref}
                for s, mt, ref in zip(srcs, system_outputs, references)
            ]
            comet_output = comet_model.predict(data_for_comet, batch_size=16)
            comet_score_avg = _comet_corpus_score(comet_output)
        except Exception as e:
            print('Error running COMET:', e)
            comet_score_avg = None

    results.append({
        'pred_code': pred_code,
        'gt_col': gt_col,
        'n': n,
        'chrF': chrf_score,
        'chrF++': chrfpp_score,
        'BLEU': bleu,
        'COMET': comet_score_avg
    })


# Collect the per-language rows into a table ordered by prediction code.
df_res = pd.DataFrame(results)
df_res = df_res.sort_values('pred_code').reset_index(drop=True)

# Create the destination directory first, so a missing folder cannot discard
# the results of a long run at the very last step.
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
df_res.to_csv(OUTPUT_CSV, index=False)
print('Saved results to', OUTPUT_CSV)

# Last expression of the cell: renders the table in the notebook.
df_res


## How to use
1. Upload both JSONL files to the notebook environment and set the filenames in the `PREDICTIONS_PATH` and `GROUND_TRUTHS_PATH` variables.
2. Install missing dependencies, if any, by running the `pip install` cell at the top of the notebook.
3. Run the evaluation cell. Results will be saved to the path set in `OUTPUT_CSV` (a file named `in22_eval_results.csv` by default) and displayed in the notebook.

### Notes on COMET
- If you want COMET scores, install the Unbabel COMET package and download/load an appropriate checkpoint. The cell attempts to load `wmt22-comet-da` by default; change to a different checkpoint name if you have a different model.
- Loading COMET and running predictions requires a GPU for reasonable speed and enough RAM to hold the model.
