# Backtranslation Analysis

Analysis notebook for the results of the backtranslation and LLM-as-a-judge experiments across different models.

In [None]:
import json
import pandas as pd
from pathlib import Path
import glob
import re

import sys
sys.path.append('..')

from IR_model import TikzIR

In [None]:
# Configuration: models under analysis and where their result files live.
MODELS = ["gpt-5", "gpt-5-mini", "gpt-4.1", "gpt-4.1-mini"]
CACHE_DIR = Path("../results/backtranslation/cache")

# Maps each model name to the evaluation CSV used for it; models without a
# results file are simply absent from this dict.
EVALUATION_PATHS = {}

for MODEL in MODELS:
    # Path.glob yields matches in no guaranteed order, so sort them to make the
    # selected file reproducible across runs and machines.
    EVALUATION_FILES = sorted(
        Path("../results/backtranslation/").glob(f"evaluation_results_{MODEL}_*.csv")
    )
    if EVALUATION_FILES:
        EVALUATION_PATHS[MODEL] = EVALUATION_FILES[0]
        print(f"Found evaluation file for {MODEL}: {EVALUATION_PATHS[MODEL]}")
        if len(EVALUATION_FILES) > 1:
            # Surface the ambiguity instead of silently ignoring the other files.
            print(f"  Note: {len(EVALUATION_FILES)} files matched for {MODEL}; using the first in sorted order")
    else:
        print(f"No evaluation file found for {MODEL}")

DATASET_PATH = Path("../data/geometric_shapes_test_set.csv")

In [None]:
# Read the source dataset (prompts plus metadata) and show a quick overview
dataset = pd.read_csv(DATASET_PATH)
print(
    f"Dataset shape: {dataset.shape}",
    f"Dataset columns: {dataset.columns.tolist()}",
    sep="\n",
)
dataset.head()

In [None]:
# Evaluation results per model, keyed by model name
dfs = {}

for MODEL in MODELS:
    if MODEL not in EVALUATION_PATHS:
        # No results file was found for this model
        continue
    dfs[MODEL] = pd.read_csv(EVALUATION_PATHS[MODEL])
    print(
        f"{MODEL} strictly enforced IR count: "
        f"{dfs[MODEL]['extraction_success'].sum()} / {len(dfs[MODEL])}"
    )

In [None]:
# Markdown table: rows with a successful IR extraction out of all rows, per model
for header_line in ("| Model | Correct IR Count |", "|-------|------------------|"):
    print(header_line)

for MODEL in MODELS:
    if MODEL not in dfs:
        continue
    success_count = dfs[MODEL]['extraction_success'].sum()
    total_count = len(dfs[MODEL])
    print(f"| {MODEL} | {success_count} / {total_count} |")

In [None]:
for MODEL in MODELS:
    if MODEL not in dfs:
        continue
    df = dfs[MODEL]
    if len(df) == 0:
        continue

    # Per-row time is extraction plus evaluation; both columns are in ms
    total_time_ms = df['extraction_time_ms'] + df['evaluation_time_ms']
    avg_total_time_s = total_time_ms.mean() / 1000
    avg_cost = df['extraction_cost'].mean()
    total_cost = df['extraction_cost'].sum()

    print(f"{MODEL}:")
    print(f"  Average time: {avg_total_time_s:.2f} s")

    # Same average, broken down by main category (first-appearance order)
    for category in df['main_category'].unique():
        in_category = df['main_category'] == category
        category_avg_time_s = total_time_ms[in_category].mean() / 1000
        print(f"    {category}: {category_avg_time_s:.2f} s (n={int(in_category.sum())})")

    print(f"  Average cost: ${avg_cost:.4f}")
    print(f"  Total cost: ${total_cost:.2f}")

    print()

Human Eval Processing

In [None]:
# Consensus annotations for the IRR subset plus each annotator's individual set.
irr_path = Path("../data/geometric_shapes_IRR_consensus.csv")
# calib_path = Path("../data/calib_set_consensus.csv")
vishal_set = Path("../data/vishal_geometric_shapes_test_set.csv")
rebecca_set = Path("../data/rebecca_geometric_shapes_test_set.csv")
shubhra_set = Path("../data/shubhra_geometric_shapes_test_set.csv")


# keep_default_na=False together with na_values=[''] treats only empty cells as
# missing, so the literal answer 'N/A' is kept as a string instead of becoming NaN.
irr = pd.read_csv(irr_path, keep_default_na=False, na_values=[''])
# calib = pd.read_csv(calib_path, keep_default_na=False, na_values=['']) # used as our set for developing the checks and our calibration set for IRR
vishal_df = pd.read_csv(vishal_set, keep_default_na=False, na_values=[''])
rebecca_df = pd.read_csv(rebecca_set, keep_default_na=False, na_values=[''])
shubhra_df = pd.read_csv(shubhra_set, keep_default_na=False, na_values=[''])

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each source file contributes its own 0-based labels, so labels repeat and any
# later label-based alignment against this frame becomes ambiguous.
human_eval_df = pd.concat([irr, vishal_df, rebecca_df, shubhra_df], ignore_index=True)


print(f"Human evaluation data shape: {human_eval_df.shape}")
print("Columns:", human_eval_df.columns.tolist())

# Sanity check: the 'Applicable ...' columns should still contain 'N/A' strings.
print("\nChecking applicable columns after fixed CSV reading:")
for col in human_eval_df.columns:
    if col.startswith('Applicable'):
        print(f"\n{col}:")
        unique_vals = human_eval_df[col].unique()
        print(f"  Unique values: {unique_vals}")
        value_counts = human_eval_df[col].value_counts()
        for val, count in value_counts.items():
            print(f"    '{val}': {count}")

In [None]:
# Split the rating columns into those that come with an 'Applicable - <name>'
# companion column and those that stand alone.
metadata_cols = ['assigned_to', 'prompt', 'subcategory', 'image', 'tikz', 'Mathematical', 'Spatial']

base_rating_cols = [
    col for col in human_eval_df.columns
    if not (col in metadata_cols or col.startswith('Applicable'))
]

cols_with_applicable = [
    col for col in base_rating_cols
    if f"Applicable - {col}" in human_eval_df.columns
]
standalone_cols = [
    col for col in base_rating_cols
    if f"Applicable - {col}" not in human_eval_df.columns
]

for heading, names in (
    ("Columns with 'Applicable' option:", cols_with_applicable),
    ("\nStandalone columns:", standalone_cols),
):
    print(heading)
    for col in names:
        print(f"  - {col}")

In [None]:
# Merge a rating column with its companion 'Applicable' column
def collapse_applicable_column(df, main_col, applicable_col):
    """
    Return a copy of df[main_col] in which every row whose applicable_col
    equals 'N/A' is overwritten with 'N/A'.

    All other rows keep their main_col value. When applicable_col is not a
    column of df, the copy is returned unchanged. df itself is not modified.
    """
    collapsed = df[main_col].copy()

    if applicable_col not in df.columns:
        return collapsed

    not_applicable = df[applicable_col].eq('N/A')
    collapsed.loc[not_applicable] = 'N/A'
    return collapsed

# Rating columns without an 'Applicable' companion are carried over unchanged.
collapsed_human_data = {}

for col in standalone_cols:
    collapsed_human_data[col] = human_eval_df[col]

# Paired columns are folded together with their 'Applicable - <name>' companion.
for col in cols_with_applicable:
    applicable_col = f"Applicable - {col}"
    collapsed_human_data[col] = collapse_applicable_column(human_eval_df, col, applicable_col)

if 'tikz' in human_eval_df.columns:
    tikz_series = human_eval_df['tikz'].astype(str).str.strip()
else:
    # Build the placeholder on the frame's own index. A default RangeIndex would
    # not line up with the other columns when the DataFrame below is assembled,
    # because the constructor aligns Series by index label.
    tikz_series = pd.Series('', index=human_eval_df.index)

# Replace the raw frame with a slim one: merge keys plus the collapsed ratings.
human_eval_df = pd.DataFrame({
    'prompt': human_eval_df['prompt'].astype(str).str.strip(),
    'subcategory': human_eval_df['subcategory'],
    'tikz': tikz_series,
    **collapsed_human_data
})

print(f"Processed human evaluation data shape: {human_eval_df.shape}")
print("Available rating columns:", list(collapsed_human_data.keys()))

print("\nUnique values after collapse:")
for col in ['Labeled angles (if any) match drawn angle', 'Labeled lengths and areas (if any) match visual proportions', 'Labels (if any) are associated with correct elements']:
    if col in human_eval_df.columns:
        print(f"\n{col}:")
        value_counts = human_eval_df[col].value_counts()
        for val, count in value_counts.items():
            print(f"  {val}: {count}")

# Drop duplicate diagram_id rows
# human_eval_with_ids is only assigned further down the notebook, so this
# branch runs only when the cell is re-executed in a session where it exists.
if 'human_eval_with_ids' in globals():
    human_eval_with_ids = human_eval_with_ids.drop_duplicates('diagram_id', keep='first')


In [None]:
# Attach diagram_id to the human annotations by matching on (prompt, tikz)
DATASET_PATH = Path("../data/geometric_shapes_test_set.csv")
original_dataset = pd.read_csv(DATASET_PATH, keep_default_na=False, na_values=[''])

# Normalise the merge keys the same way the human evaluation frame was normalised
original_dataset['prompt'] = original_dataset['prompt'].astype(str).str.strip()
original_dataset['tikz'] = (
    original_dataset['tikz'].astype(str).str.strip()
    if 'tikz' in original_dataset.columns
    else ''
)

# Inner merge keeps only annotations that match a dataset row; afterwards keep
# a single row per diagram_id (the first one encountered).
human_eval_with_ids = (
    human_eval_df
    .merge(
        original_dataset[['diagram_id', 'prompt', 'tikz']],
        on=['prompt', 'tikz'],
        how='inner',
    )
    .drop_duplicates('diagram_id', keep='first')
)

# Human rubric column -> automated check column.
# due to heavy class imbalance, omitted shape_outlines_are_closed_passed and core_mathematical_properties_of_shapes_correct_passed
column_mapping = {
    # "Shape(s) is closed": "shape_outlines_are_closed_passed",
    # "Core math properties of the shape(s) are correct": "core_mathematical_properties_of_shapes_correct_passed",
    "Diagram is fully in frame": "diagram_fully_in_canvas_passed",
    "Diagram elements are scaled to be readable": "diagram_elements_are_readable_size_passed",
    "Diagram elements don't problematically overlap": "diagram_elements_dont_problematically_overlap_passed",
    "Labeled angles (if any) match drawn angle": "angle_labels_matches_arcs_passed",
    "Labeled lengths and areas (if any) match visual proportions": "labeled_lengths_areas_match_proportions_passed",
    "Labels (if any) are associated with correct elements": "labels_associated_with_elements_passed",
}


In [None]:
from sklearn.metrics import cohen_kappa_score
def _auto_value_to_label(val):
    """Map one automated-check value onto the 'Yes' / 'No' / 'N/A' vocabulary."""
    if pd.isna(val):
        return 'N/A'
    if isinstance(val, str):
        return val  # strings are kept as they are
    if val is True:
        return 'Yes'
    if val is False:
        return 'No'
    return str(val)


def calculate_cohen_kappa_agreement(human_eval_with_ids, model_df, column_mapping):
    """
    Compute Cohen's kappa between human ratings and automated check results.

    The two frames are inner-joined on 'diagram_id'. For every
    (human column, automated column) pair in column_mapping the automated
    values are converted to 'Yes'/'No'/'N/A' and kappa is computed over those
    three labels.

    Returns a dict keyed by human column name. Each entry holds 'kappa' and
    'n_examples' plus either the two label distributions or, when the
    computation raised, an 'error' message with 'kappa' set to None. An empty
    dict is returned when fewer than two rows survive the join.
    """
    merged = human_eval_with_ids.merge(model_df, on='diagram_id', how='inner')
    print(f"Merged {len(merged)} examples for comparison")

    if len(merged) < 2:
        print("Too few examples for meaningful comparison")
        return {}

    results = {}
    for human_col, auto_col in column_mapping.items():
        human_vals = merged[human_col].tolist()
        auto_standardized = [_auto_value_to_label(val) for val in merged[auto_col].tolist()]

        try:
            labels = ['Yes', 'No', 'N/A']
            kappa = cohen_kappa_score(human_vals, auto_standardized, labels=labels)

            # Label counts on both sides, kept for context next to the score
            entry = {
                'kappa': kappa,
                'n_examples': len(human_vals),
                'human_distribution': pd.Series(human_vals).value_counts().to_dict(),
                'auto_distribution': pd.Series(auto_standardized).value_counts().to_dict(),
            }
        except Exception as e:
            entry = {
                'kappa': None,
                'n_examples': len(human_vals),
                'error': str(e),
            }
        results[human_col] = entry

    return results
# Calculate Cohen's Kappa for each model
print("COHEN'S KAPPA ANALYSIS:")
print("="*60)
all_kappa_results = {}
for MODEL in MODELS:
    if MODEL not in dfs:
        continue

    print(f"\n{MODEL}:")
    print("-" * 40)

    kappa_results = calculate_cohen_kappa_agreement(
        human_eval_with_ids,
        dfs[MODEL],
        column_mapping
    )

    all_kappa_results[MODEL] = kappa_results

    # Print results for this model
    for check, result in kappa_results.items():
        print(f"\nCHECK: {check}")
        if result['kappa'] is not None:
            print(f"  Cohen's Kappa: {result['kappa']:.3f}")
        else:
            # A failed computation stores its message under 'error'; report it
            # so the check does not silently show up with no score at all.
            print(f"  Cohen's Kappa could not be computed: {result.get('error', 'unknown error')}")
        print(f"  Examples: {result['n_examples']}")

In [None]:
# (raw column name in the human evaluation frame, display name for the table)
ordered_rubrics = [
    ("Shape(s) is closed", "Shapes closed"),
    ("Core math properties of the shape(s) are correct", "Core math correct"),
    ("Diagram is fully in frame", "Diagram fully in canvas"),
    ("Diagram elements are scaled to be readable", "Elements are readable size"),
    ("Diagram elements don't problematically overlap", "Elements do not problematically overlap"),
    ("Labeled angles (if any) match drawn angle", "Angle labels match arcs"),
    ("Labeled lengths and areas (if any) match visual proportions", "Labeled lengths/areas match proportions"),
    ("Labels (if any) are associated with correct elements", "Labels associated with elements"),
]

rows = []
for raw_name, pretty_name in ordered_rubrics:
    answers = human_eval_df[raw_name].astype(str).str.strip()
    total = len(answers)

    # One count and one share (of all rows) per answer label
    record = {"Rubric": pretty_name}
    for answer in ("Yes", "No", "N/A"):
        n_answer = (answers == answer).sum()
        record[f"{answer} (#)"] = n_answer
        record[f"{answer} (%)"] = n_answer / total
    record["Total"] = total
    rows.append(record)

human_error_stats = pd.DataFrame(rows).set_index("Rubric")
human_error_stats = human_error_stats.round({"Yes (%)": 3, "No (%)": 3, "N/A (%)": 3})

print("Human annotation distribution with counts and percentages:")
display(human_error_stats)

LLM-as-a-Judge

In [None]:
# Helper functions for LLM Judge analysis
import json
from typing import Dict, List

# model name -> (prompt price, completion price), each per million tokens
MODEL_COSTS = {
    'gpt-5':        (1.25, 10.00),
    'gpt-5-mini':   (0.25, 2.00),
    'gpt-4.1':      (2.00, 8.00),
    'gpt-4.1-mini': (0.40, 1.60),
}
CACHE_DISCOUNT = 0.25  # cached prompt tokens are billed at 25% of the prompt rate

def calculate_judge_cost(tokens: Dict[str, int], model: str, *, include_cache_discount: bool = False) -> float:
    """Estimate the API cost of one call from its token usage.

    Reads 'input_tokens', 'cached_tokens' and 'output_tokens' from ``tokens``;
    missing or None entries count as 0, and the cached count is capped at the
    input count. Cached tokens are billed at the discounted rate only when
    ``include_cache_discount`` is True; by default they cost the full prompt
    rate. Raises ValueError for a model that is not in MODEL_COSTS.
    """
    if model not in MODEL_COSTS:
        raise ValueError(f'Unknown model: {model}')

    input_price, output_price = MODEL_COSTS[model]
    per_prompt_token = input_price / 1_000_000
    if include_cache_discount:
        per_cached_token = per_prompt_token * CACHE_DISCOUNT
    else:
        per_cached_token = per_prompt_token
    per_completion_token = output_price / 1_000_000

    n_input = tokens.get('input_tokens') or 0
    n_cached = min(tokens.get('cached_tokens') or 0, n_input)
    n_fresh = n_input - n_cached
    n_output = tokens.get('output_tokens') or 0

    fresh_cost = n_fresh * per_prompt_token
    cached_cost = n_cached * per_cached_token
    completion_cost = n_output * per_completion_token
    return fresh_cost + cached_cost + completion_cost

def calculate_judge_cost_no_cache(tokens: Dict[str, int], model: str) -> float:
    """Return the cost of ``tokens`` with cached tokens billed at the full prompt rate.

    Thin wrapper that forwards to ``calculate_judge_cost`` with
    ``include_cache_discount=False``.
    """
    return calculate_judge_cost(tokens, model, include_cache_discount=False)

def load_judge_records(results_root: Path) -> pd.DataFrame:
    """Collect every judge result under ``results_root`` into one DataFrame.

    Walks ``<results_root>/<mode>/<model>/diagram_*.json``. Each file becomes
    one row holding the run metadata, token counts, two cost estimates and one
    ``rubric::<key>`` column per rubric entry. 'mode' and 'model' come from
    the JSON when present and fall back to the directory names.

    'cost' applies the cached-token discount, while 'cost_no_cache' bills
    cached tokens at the full prompt rate.

    Returns the rows sorted by mode, model and diagram_id.
    Raises RuntimeError when no record is found; a model unknown to the cost
    table raises ValueError from the cost helper.
    """
    records: List[Dict[str, object]] = []
    for mode_dir in results_root.iterdir():
        if not mode_dir.is_dir():
            continue
        mode = mode_dir.name
        for model_dir in mode_dir.iterdir():
            if not model_dir.is_dir():
                continue
            model = model_dir.name
            for path in model_dir.glob('diagram_*.json'):
                # Decode explicitly as UTF-8 rather than the platform default
                data = json.loads(path.read_text(encoding='utf-8'))
                # `or {}` also covers an explicit "tokens": null in the file
                tokens = data.get('tokens') or {}
                record_model = data.get('model', model)
                record: Dict[str, object] = {
                    'diagram_id': data.get('diagram_id'),
                    'mode': data.get('mode', mode),
                    'model': record_model,
                    'temperature': data.get('temperature'),
                    'reasoning_effort': data.get('reasoning_effort'),
                    'elapsed_ms': data.get('elapsed_ms'),
                    # The discount has to be requested explicitly; without it
                    # this column is identical to 'cost_no_cache'.
                    'cost': calculate_judge_cost(tokens, record_model, include_cache_discount=True),
                    'cost_no_cache': calculate_judge_cost_no_cache(tokens, record_model),
                    'input_tokens': tokens.get('input_tokens'),
                    'cached_tokens': tokens.get('cached_tokens'),
                    'output_tokens': tokens.get('output_tokens'),
                    'total_tokens': tokens.get('total_tokens'),
                }
                rubric = data.get('rubric', {}) or {}
                for rubric_key, rubric_payload in rubric.items():
                    # A rubric entry is either {'value': ...} or a bare string;
                    # any other payload is recorded as None.
                    value = None
                    if isinstance(rubric_payload, dict):
                        value = rubric_payload.get('value')
                    elif isinstance(rubric_payload, str):
                        value = rubric_payload
                    record[f'rubric::{rubric_key}'] = value
                records.append(record)
    if not records:
        raise RuntimeError(f'No judge records found under {results_root}')
    df = pd.DataFrame(records)
    df.sort_values(['mode', 'model', 'diagram_id'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

print("Helper functions for LLM Judge analysis loaded")


In [None]:
# Judge-specific agreement helpers
from sklearn.metrics import cohen_kappa_score, accuracy_score, f1_score, precision_recall_fscore_support

def calculate_judge_cohen_kappa_agreement(human_eval_with_ids, judge_df, column_mapping):
    """
    Compute Cohen's kappa between human ratings and LLM-judge ratings.

    Both frames are copied, their 'diagram_id' is cast to str and reduced to
    one row per id (first occurrence), then inner-joined on 'diagram_id'. For
    each (human column, judge column) pair in column_mapping the judge values
    are converted to 'Yes'/'No'/'N/A' and kappa is computed over those labels.

    Returns a dict keyed by human column name with 'kappa', 'n_examples' and
    both label distributions, or an empty dict when fewer than two rows
    survive the join. The input frames are left untouched.
    """
    def _one_row_per_diagram(frame):
        # assign() returns a new frame, so the caller's data is not modified
        with_str_ids = frame.assign(diagram_id=frame['diagram_id'].astype(str))
        return with_str_ids.drop_duplicates('diagram_id', keep='first')

    def _to_label(val):
        # Missing -> 'N/A', strings pass through, booleans -> 'Yes' / 'No'
        if pd.isna(val):
            return 'N/A'
        if isinstance(val, str):
            return val
        if val is True:
            return 'Yes'
        if val is False:
            return 'No'
        return str(val)

    judge_unique = _one_row_per_diagram(judge_df)
    human_unique = _one_row_per_diagram(human_eval_with_ids)

    merged = (
        human_unique
        .merge(judge_unique, on='diagram_id', how='inner')
        .drop_duplicates('diagram_id', keep='first')
    )

    print(f"Merged {len(merged)} examples for comparison")
    if len(merged) < 2:
        print("Too few examples for meaningful comparison")
        return {}

    labels = ['Yes', 'No', 'N/A']
    results = {}
    for human_col, judge_col in column_mapping.items():
        human_vals = merged[human_col].tolist()
        auto_standardized = [_to_label(val) for val in merged[judge_col].tolist()]

        kappa = cohen_kappa_score(human_vals, auto_standardized, labels=labels)
        results[human_col] = {
            'kappa': kappa,
            'n_examples': len(human_vals),
            'human_distribution': pd.Series(human_vals).value_counts().to_dict(),
            'judge_distribution': pd.Series(auto_standardized).value_counts().to_dict(),
        }

    return results

In [None]:
from pathlib import Path
# Load every judge result found under the results root into a single DataFrame
judge_results_root = Path("../results/llm_judge")
judge_df = load_judge_records(judge_results_root)

In [None]:
# Human rubric column -> judge rubric column (as named by load_judge_records)
judge_column_mapping = {
    "Diagram is fully in frame": "rubric::diagram_fully_in_canvas",
    "Diagram elements are scaled to be readable": "rubric::diagram_elements_are_readable_size",
    "Diagram elements don't problematically overlap": "rubric::diagram_elements_dont_problematically_overlap",
    "Labeled angles (if any) match drawn angle": "rubric::angle_labels_matches_arcs",
    "Labeled lengths and areas (if any) match visual proportions": "rubric::labeled_lengths_areas_match_proportions",
    "Labels (if any) are associated with correct elements": "rubric::labels_associated_with_elements",
}

judge_modes = sorted(judge_df["mode"].unique())
all_judge_kappa_results = {mode: {} for mode in judge_modes}
all_judge_pr_results = {mode: {} for mode in judge_modes}

# calculate_judge_precision_recall_metrics is not defined in the visible part
# of this notebook, so calling it unconditionally raises NameError on a clean
# run. Look it up defensively and skip the precision/recall pass when absent.
_judge_pr_metrics_fn = globals().get("calculate_judge_precision_recall_metrics")
if _judge_pr_metrics_fn is None:
    print("calculate_judge_precision_recall_metrics is not defined; skipping precision/recall metrics")

for mode in judge_modes:
    mode_df = judge_df[judge_df["mode"] == mode]
    for MODEL in ["gpt-4.1", "gpt-5", "gpt-4.1-mini", "gpt-5-mini"]:
        model_df = mode_df[mode_df["model"] == MODEL].copy()
        if model_df.empty:
            continue

        kappa_results = calculate_judge_cohen_kappa_agreement(
            human_eval_with_ids, model_df, judge_column_mapping
        )
        all_judge_kappa_results[mode][MODEL] = kappa_results

        if _judge_pr_metrics_fn is not None:
            pr_results = _judge_pr_metrics_fn(
                human_eval_with_ids, model_df, judge_column_mapping
            )
            all_judge_pr_results[mode][MODEL] = pr_results

In [None]:
# Row order of the summary table
rubrics = [
    "Labeled angles (if any) match drawn angle",
    "Labeled lengths and areas (if any) match visual proportions",
    "Diagram is fully in frame",
    "Diagram elements are scaled to be readable",
    "Labels (if any) are associated with correct elements",
    "Diagram elements don't problematically overlap",
]

# Model id -> display name used in table headers
friendly = {
    "gpt-4.1": "gpt 4.1",
    "gpt-4.1-mini": "gpt 4.1 mini",
    "gpt-5": "gpt 5",
    "gpt-5-mini": "gpt 5 mini",
}

model_order = ["gpt-4.1", "gpt-4.1-mini", "gpt-5", "gpt-5-mini"]
modes_in_order = sorted(judge_df["mode"].unique())

summary_rows = []
for rubric in rubrics:
    row = {"rubric": rubric}

    # Backtranslation κ, one column per model (None when unavailable)
    row.update({
        ("Backtranslation", friendly[model]):
            all_kappa_results.get(model, {}).get(rubric, {}).get("kappa")
        for model in model_order
    })

    # Judge κ, one column group per mode
    for mode in modes_in_order:
        mode_results = all_judge_kappa_results.get(mode, {})
        row.update({
            (f"Judge – {mode}", friendly[model]):
                mode_results.get(model, {}).get(rubric, {}).get("kappa")
            for model in model_order
        })

    summary_rows.append(row)

kappa_table_only = pd.DataFrame(summary_rows).set_index("rubric")
kappa_table_only.columns = pd.MultiIndex.from_tuples(kappa_table_only.columns)
print("\nκ comparisons (Backtranslation vs LLM-as-a-Judge modes):")
display(kappa_table_only.round(3))

Tables

In [None]:
cols_back = [("Backtranslation", name) for name in friendly.values()]


def _show_backtranslation_vs_judge(mode_name):
    """Slice, announce and display the κ table for one judge mode.

    Returns the sliced table and its styled version (row maximum in bold).
    """
    judge_cols = [(f"Judge – {mode_name}", name) for name in friendly.values()]
    table = kappa_table_only.loc[:, cols_back + judge_cols]
    print(f"\nκ comparison: Backtranslation vs LLM-as-a-Judge ({mode_name} mode)")
    styled = table.style.highlight_max(axis=1, props="font-weight: bold;")
    display(styled)
    return table, styled


# Backtranslation vs Judge, one table per mode
back_vs_both, back_vs_both_styled = _show_backtranslation_vs_judge("both")
back_vs_code, back_vs_code_styled = _show_backtranslation_vs_judge("code")
back_vs_image, back_vs_image_styled = _show_backtranslation_vs_judge("image")

In [None]:
cols_back = [("Backtranslation", name) for name in friendly.values()]
cols_both = [("Judge – both", name) for name in friendly.values()]
cols_code = [("Judge – code", name) for name in friendly.values()]
cols_image = [("Judge – image", name) for name in friendly.values()]

# Output column title -> (table to read from, columns to average over rubrics)
mean_kappa_sources = {
    "Backtranslation – κ": (back_vs_both, cols_back),
    "Judge – both – κ": (back_vs_both, cols_both),
    "Judge – code – κ": (back_vs_code, cols_code),
    "Judge – image – κ": (back_vs_image, cols_image),
}

single_row = pd.DataFrame(
    {
        title: table[cols].mean(skipna=True).values
        for title, (table, cols) in mean_kappa_sources.items()
    },
    index=friendly.values(),
)

print("Average κ across all rubric checks (columns = baseline vs each judge mode)")
display(single_row.round(3).style.highlight_max(axis=1, props="font-weight: bold;"))

In [None]:
def _mean_kappa(check_results):
    """Average the non-None 'kappa' values of a per-check result dict (NaN if none)."""
    kappas = [res['kappa'] for res in check_results.values() if res.get('kappa') is not None]
    if not kappas:
        return float('nan')
    return sum(kappas) / len(kappas)


# Backtranslation summary: one row per model that has kappa results
bt_summary_rows = []
for MODEL in MODELS:
    if MODEL in all_kappa_results:
        bt_summary_rows.append({
            'model': friendly[MODEL],
            'cohen_k': _mean_kappa(all_kappa_results[MODEL]),
        })
bt_summary_df = pd.DataFrame(bt_summary_rows)

# Judge summary (all modes): one row per (mode, model) with non-empty results
judge_summary_rows = []
for mode in judge_modes:
    results_for_mode = all_judge_kappa_results.get(mode, {})
    for MODEL in MODELS:
        kappa_results = results_for_mode.get(MODEL)
        if not kappa_results:
            continue
        judge_summary_rows.append({
            "mode": mode,
            "model": friendly[MODEL],
            "cohen_k": _mean_kappa(kappa_results),
        })
judge_summary_df = pd.DataFrame(judge_summary_rows)

In [None]:
def compute_backtranslation_stats(model_id):
    """Return (average κ, average time per row in seconds, total extraction cost)
    for one backtranslation model, read from dfs and bt_summary_df."""
    results = dfs[model_id]
    elapsed_ms = results["extraction_time_ms"] + results["evaluation_time_ms"]
    avg_time_s = elapsed_ms.mean() / 1000
    total_cost = results["extraction_cost"].sum()
    kappa_by_model = bt_summary_df.set_index("model")["cohen_k"]
    avg_kappa = kappa_by_model.loc[friendly[model_id]]
    return avg_kappa, avg_time_s, total_cost

def compute_judge_stats(mode, model_id):
    """Return (average κ, average elapsed time in seconds, total cost) for one
    judge mode/model pair, read from judge_df and judge_summary_df."""
    is_selected = (judge_df["mode"] == mode) & (judge_df["model"] == model_id)
    selected = judge_df[is_selected]
    avg_time_s = selected["elapsed_ms"].mean() / 1000
    total_cost = selected["cost"].sum()
    kappa_by_mode_model = judge_summary_df.set_index(["mode", "model"])["cohen_k"]
    avg_kappa = kappa_by_mode_model.loc[(mode, friendly[model_id])]
    return avg_kappa, avg_time_s, total_cost

# One comparison table per judge mode: backtranslation vs judge, per model.
comparison_tables = {}

for mode in sorted(judge_df["mode"].unique()):
    rows = []
    for model_id in ["gpt-4.1", "gpt-5", "gpt-4.1-mini", "gpt-5-mini"]:
        # The stats helpers look the model up in dfs and in the two summary
        # frames and raise KeyError when it is missing. Skip incomplete
        # combinations, as the other cells do with `if MODEL in dfs`.
        has_backtranslation = model_id in dfs and model_id in all_kappa_results
        has_judge = bool(all_judge_kappa_results.get(mode, {}).get(model_id))
        if not (has_backtranslation and has_judge):
            print(f"Skipping {friendly[model_id]} for mode '{mode}': missing backtranslation or judge results")
            continue

        bt_k, bt_time, bt_cost = compute_backtranslation_stats(model_id)
        judge_k, judge_time, judge_cost = compute_judge_stats(mode, model_id)
        rows.append({
            "Model": friendly[model_id],
            "BT κ": bt_k,
            "BT time (s)": bt_time,
            "BT cost ($)": bt_cost,
            f"Judge ({mode}) κ": judge_k,
            f"Judge ({mode}) time (s)": judge_time,
            f"Judge ({mode}) cost ($)": judge_cost,
        })

    if not rows:
        # An empty frame has no "Model" column to index on
        print(f"No complete results for mode '{mode}'; no comparison table built")
        continue

    comparison_tables[mode] = (
        pd.DataFrame(rows)
        .set_index("Model")
        .round({"BT κ": 3, "BT time (s)": 2, "BT cost ($)": 2,
                f"Judge ({mode}) κ": 3, f"Judge ({mode}) time (s)": 2, f"Judge ({mode}) cost ($)": 2})
    )

for mode, table in comparison_tables.items():
    print(f"\nBacktranslation vs LLM-as-a-Judge ({mode})")
    display(table)