## Human Agreement Analysis: Backtranslation vs LLM-as-Judge

This notebook compares results from both approaches across models: backtranslation and LLM-as-Judge.


In [None]:
import sys
from pathlib import Path
import pandas as pd
import re
import json
from sklearn.metrics import cohen_kappa_score

REPO_ROOT = Path.cwd().resolve()

if not (REPO_ROOT / "data").exists():
    REPO_ROOT = REPO_ROOT.parent

if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))


# for normalizing the labels across different datasets
label_map = {
    "yes": "Yes", "y": "Yes", "true": "Yes", "1": "Yes",
    "no": "No", "n": "No", "false": "No", "0": "No",
    "n/a": "N/A", "na": "N/A", "not applicable": "N/A", "": "N/A",
}

In [24]:
# Configuration
MODELS = ["gpt-5", "gpt-5-mini", "gpt-4.1", "gpt-4.1-mini"]
BACKTRANSLATION_RESULTS_ROOT = REPO_ROOT / "results" / "backtranslation"

EVALUATION_PATHS = {}

for MODEL in MODELS:
    EVALUATION_FILES = list(BACKTRANSLATION_RESULTS_ROOT.glob(f"evaluation_results_{MODEL}_*.csv"))
    if EVALUATION_FILES:
        EVALUATION_PATHS[MODEL] = EVALUATION_FILES[0]
        print(f"Found evaluation file for {MODEL}: {EVALUATION_PATHS[MODEL]}")
    else:
        print(f"No evaluation file found for {MODEL}")

DATASET_PATH = REPO_ROOT / "data" / "geometric_shapes_test_set.csv"


Found evaluation file for gpt-5: /Users/vishalkumar/Desktop/Research/code/DiagramIR/results/backtranslation/evaluation_results_gpt-5_20250924.csv
Found evaluation file for gpt-5-mini: /Users/vishalkumar/Desktop/Research/code/DiagramIR/results/backtranslation/evaluation_results_gpt-5-mini_20250924.csv
Found evaluation file for gpt-4.1: /Users/vishalkumar/Desktop/Research/code/DiagramIR/results/backtranslation/evaluation_results_gpt-4.1_20250924.csv
Found evaluation file for gpt-4.1-mini: /Users/vishalkumar/Desktop/Research/code/DiagramIR/results/backtranslation/evaluation_results_gpt-4.1-mini_20260219.csv


First, we'll load the original dataset

In [19]:
dataset = pd.read_csv(DATASET_PATH)
print(f"Dataset shape: {dataset.shape}")
print(f"Dataset columns: {list(dataset.columns)}")
dataset.head(1)

Dataset shape: (398, 9)
Dataset columns: ['prompt', 'tikz', 'image', 'main_category', 'subcategory', 'diagram_id', 'assignment_type', 'assigned_to', 'image_png_path']


Unnamed: 0,prompt,tikz,image,main_category,subcategory,diagram_id,assignment_type,assigned_to,image_png_path
0,triangle with side length 8 horizontal at bott...,\documentclass{IM}\n\usepackage{tikz}\n\begin{...,https://2xavun1dsa0sayar.public.blob.vercel-st...,2d shapes,triangle,1,individual,Shubhra,data/judge_pngs/diagram_1.png


Now, we'll load and shape the human annotations, necessary for calculating human agreement across both auto-eval methods

In [None]:
human_eval_raw = pd.read_csv(REPO_ROOT / "data" / "human_ratings.csv", keep_default_na=False, na_values=[""])
original_dataset = pd.read_csv(DATASET_PATH, keep_default_na=False, na_values=[""])

# Normalize join keys
for df in (human_eval_raw, original_dataset):
    df["prompt"] = df["prompt"].astype(str).str.strip()
    df["tikz"] = df.get("tikz", "").astype(str).str.strip() if "tikz" in df.columns else ""

# Collapse "Applicable - X": if not applicable, force X -> "N/A"
metadata_cols = {"assigned_to", "prompt", "subcategory", "image", "tikz", "Mathematical", "Spatial", "diagram_id", "rating_source"}
rating_cols = [c for c in human_eval_raw.columns if c not in metadata_cols and not c.startswith("Applicable - ")]

human_eval_df = human_eval_raw[["prompt", "subcategory", "tikz"] + rating_cols].copy()
for col in rating_cols:
    app_col = f"Applicable - {col}"
    if app_col in human_eval_raw.columns:
        human_eval_df.loc[human_eval_raw[app_col].eq("N/A"), col] = "N/A"

# Attach diagram_id
human_eval_with_ids = (
    human_eval_df
    .merge(original_dataset[["diagram_id", "prompt", "tikz"]], on=["prompt", "tikz"], how="inner")
    .drop_duplicates("diagram_id", keep="first")
)

# Human column -> backtranslation output column
column_mapping = {
    "Diagram is fully in frame": "diagram_fully_in_canvas_passed",
    "Diagram elements are scaled to be readable": "diagram_elements_are_readable_size_passed",
    "Diagram elements don't problematically overlap": "diagram_elements_dont_problematically_overlap_passed",
    "Labeled angles (if any) match drawn angle": "angle_labels_matches_arcs_passed",
    "Labeled lengths and areas (if any) match visual proportions": "labeled_lengths_areas_match_proportions_passed",
    "Labels (if any) are associated with correct elements": "labels_associated_with_elements_passed",
}

print(f"Human eval rows (raw): {len(human_eval_raw)}")
print(f"Human eval rows matched to dataset: {len(human_eval_with_ids)}")

Human eval rows (raw): 386
Human eval rows matched to dataset: 386


See the distrubition of human ratings

In [26]:
ordered_rubrics = [
    ("Shape(s) is closed", "Shapes closed"),
    ("Core math properties of the shape(s) are correct", "Core math correct"),
    ("Diagram is fully in frame", "Diagram fully in canvas"),
    ("Diagram elements are scaled to be readable", "Elements are readable size"),
    ("Diagram elements don't problematically overlap", "Elements do not problematically overlap"),
    ("Labeled angles (if any) match drawn angle", "Angle labels match arcs"),
    ("Labeled lengths and areas (if any) match visual proportions", "Labeled lengths/areas match proportions"),
    ("Labels (if any) are associated with correct elements", "Labels associated with elements"),
]

summary = []
for raw_name, pretty_name in ordered_rubrics:
    counts = human_eval_df[raw_name].astype(str).str.strip().value_counts()
    total = int(counts.sum())
    yes = int(counts.get("Yes", 0))
    no = int(counts.get("No", 0))
    na = int(counts.get("N/A", 0))

    summary.append({
        "Rubric": pretty_name,
        "Yes (#)": yes,
        "Yes (%)": yes / total if total else 0,
        "No (#)": no,
        "No (%)": no / total if total else 0,
        "N/A (#)": na,
        "N/A (%)": na / total if total else 0,
        "Total": total,
    })

human_error_stats = pd.DataFrame(summary).set_index("Rubric").round({
    "Yes (%)": 3, "No (%)": 3, "N/A (%)": 3
})

display(human_error_stats)


Unnamed: 0_level_0,Yes (#),Yes (%),No (#),No (%),N/A (#),N/A (%),Total
Rubric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Shapes closed,378,0.979,8,0.021,0,0.0,386
Core math correct,366,0.948,20,0.052,0,0.0,386
Diagram fully in canvas,345,0.894,41,0.106,0,0.0,386
Elements are readable size,369,0.956,17,0.044,0,0.0,386
Elements do not problematically overlap,297,0.769,89,0.231,0,0.0,386
Angle labels match arcs,36,0.093,10,0.026,340,0.881,386
Labeled lengths/areas match proportions,91,0.236,50,0.13,245,0.635,386
Labels associated with elements,191,0.495,40,0.104,155,0.402,386


Now, we calculate human agreement (Cohen's Kappa) for Backtranslation

In [56]:
all_kappa_results = {}

for model in MODELS:
    if model not in dfs:
        continue
    merged = human_eval_with_ids.merge(dfs[model], on="diagram_id", how="inner")
    model_results = {}

    for human_col, auto_col in column_mapping.items():
        human_vals = merged[human_col].astype(str).str.strip().str.lower().map(label_map).fillna("N/A")
        auto_vals = merged[auto_col].astype(str).str.strip().str.lower().map(label_map).fillna("N/A")

        model_results[human_col] = {
            "kappa": cohen_kappa_score(human_vals, auto_vals),
            "n_examples": len(merged),
            "human_distribution": human_vals.value_counts().to_dict(),
            "auto_distribution": auto_vals.value_counts().to_dict(),
        }

    all_kappa_results[model] = model_results
    print(f"{model}: avg_kappa={sum(r['kappa'] for r in model_results.values()) / len(model_results):.3f}")

gpt-5: avg_kappa=0.556
gpt-5-mini: avg_kappa=0.527
gpt-4.1: avg_kappa=0.563
gpt-4.1-mini: avg_kappa=0.499


Now, we can move onto calculating human agreement for LLM-as-a-judge 

### Load latest LLM-as-Judge CSVs (one per mode/model)

In [61]:
# Load latest LLM-as-Judge CSVs (inline, no helpers)
judge_results_root = REPO_ROOT / "results" / "llm_judge"
pattern = re.compile(r"evaluation_results_(?P<mode>[^_]+)_(?P<model>.+)_(?P<date>\d{8})\.csv$")
latest = {}

for path in sorted(judge_results_root.glob("evaluation_results_*.csv")):
    match = pattern.match(path.name)
    if not match:
        continue
    mode = match.group("mode")
    model = match.group("model")
    date = match.group("date")
    key = (mode, model)
    if key not in latest or date > latest[key][0]:
        latest[key] = (date, path)

if not latest:
    raise RuntimeError(f"No judge CSV summaries found under {judge_results_root}")

judge_frames = []
for (mode, model), (_, path) in sorted(latest.items()):
    frame = pd.read_csv(path)
    if frame.empty:
        continue
    if "mode" not in frame.columns:
        frame["mode"] = mode
    if "model" not in frame.columns:
        frame["model"] = model
    if "cost" not in frame.columns:
        frame["cost"] = 0.0
    judge_frames.append(frame)

if not judge_frames:
    raise RuntimeError(f"Judge CSV summaries were found but empty under {judge_results_root}")

judge_df = pd.concat(judge_frames, ignore_index=True)
judge_df["diagram_id"] = judge_df["diagram_id"].astype(str)
judge_df = judge_df.drop_duplicates(["mode", "model", "diagram_id"], keep="first")
judge_modes = sorted(judge_df["mode"].unique())
print(f"Loaded judge data: modes={judge_modes}, models={sorted(judge_df['model'].unique())}")



Loaded judge data: modes=['both'], models=['gpt-4.1', 'gpt-4.1-mini', 'gpt-5', 'gpt-5-mini']


In [62]:
judge_column_mapping = {
    "Diagram is fully in frame": "diagram_fully_in_canvas_value",
    "Diagram elements are scaled to be readable": "diagram_elements_are_readable_size_value",
    "Diagram elements don't problematically overlap": "diagram_elements_dont_problematically_overlap_value",
    "Labeled angles (if any) match drawn angle": "angle_labels_matches_arcs_value",
    "Labeled lengths and areas (if any) match visual proportions": "labeled_lengths_areas_match_proportions_value",
    "Labels (if any) are associated with correct elements": "labels_associated_with_elements_value",
}

judge_modes = sorted(judge_df["mode"].unique())

human_for_judge = human_eval_with_ids.copy()
human_for_judge["diagram_id"] = human_for_judge["diagram_id"].astype(str)
human_for_judge = human_for_judge.drop_duplicates("diagram_id", keep="first")

all_judge_kappa_results = {}

for mode in judge_modes:
    mode_results = {}
    for model in MODELS:
        model_df = judge_df[(judge_df["mode"] == mode) & (judge_df["model"] == model)].copy()
        if model_df.empty:
            continue

        model_df["diagram_id"] = model_df["diagram_id"].astype(str)
        model_df = model_df.drop_duplicates("diagram_id", keep="first")
        merged = human_for_judge.merge(model_df, on="diagram_id", how="inner")

        rubric_results = {}
        for human_col, judge_col in judge_column_mapping.items():
            human_vals = merged[human_col].astype(str).str.strip().str.lower().map(label_map).fillna("N/A")
            judge_vals = merged[judge_col].astype(str).str.strip().str.lower().map(label_map).fillna("N/A")
            rubric_results[human_col] = {
                "kappa": cohen_kappa_score(human_vals, judge_vals),
                "n_examples": len(merged),
                "human_distribution": human_vals.value_counts().to_dict(),
                "judge_distribution": judge_vals.value_counts().to_dict(),
            }

        mode_results[model] = rubric_results
        print(f"{mode}/{model}: avg_kappa={sum(r['kappa'] for r in rubric_results.values()) / len(rubric_results):.3f}")

    all_judge_kappa_results[mode] = mode_results

both/gpt-5: avg_kappa=0.498
both/gpt-5-mini: avg_kappa=0.465
both/gpt-4.1: avg_kappa=0.399
both/gpt-4.1-mini: avg_kappa=0.388


In [71]:
# Define rubrics to compare and model name mappings
rubrics = [
    "Labeled angles (if any) match drawn angle",
    "Labeled lengths and areas (if any) match visual proportions",
    "Diagram is fully in frame",
    "Diagram elements are scaled to be readable",
    "Labels (if any) are associated with correct elements",
    "Diagram elements don't problematically overlap",
]

friendly = {
    "gpt-4.1": "gpt 4.1",
    "gpt-4.1-mini": "gpt 4.1 mini",
    "gpt-5": "gpt 5",
    "gpt-5-mini": "gpt 5 mini",
}

models = list(friendly.keys())  # Avoid repeating model list

# Build comparison table: each row is a rubric, columns are (method, model) pairs
summary_rows = []
for rubric in rubrics:
    row = {"rubric": rubric}
    
    # Add Backtranslation kappa values for each model
    for model in models:
        kappa_val = all_kappa_results.get(model, {}).get(rubric, {}).get("kappa")
        row[("Backtranslation", friendly[model])] = kappa_val
    
    # Add Judge kappa values for each mode and model
    for mode in sorted(judge_df["mode"].unique()):
        mode_results = all_judge_kappa_results.get(mode, {})
        for model in models:
            kappa_val = mode_results.get(model, {}).get(rubric, {}).get("kappa")
            row[(f"Judge – {mode}", friendly[model])] = kappa_val
    
    summary_rows.append(row)

# Create DataFrame with multi-level columns: (method, model)
kappa_table_only = pd.DataFrame(summary_rows).set_index("rubric")
kappa_table_only.columns = pd.MultiIndex.from_tuples(kappa_table_only.columns)

# Display table with max values per row highlighted in bold
print("\nκ comparisons (Backtranslation vs LLM-as-a-Judge modes):")
styled_table = kappa_table_only.round(3).style.highlight_max(axis=1, props="font-weight: bold;")
display(styled_table)

# Store column references for later use in average kappa calculations
cols_back = [("Backtranslation", name) for name in friendly.values()]


κ comparisons (Backtranslation vs LLM-as-a-Judge modes):


Unnamed: 0_level_0,Backtranslation,Backtranslation,Backtranslation,Backtranslation,Judge – both,Judge – both,Judge – both,Judge – both
Unnamed: 0_level_1,gpt 4.1,gpt 4.1 mini,gpt 5,gpt 5 mini,gpt 4.1,gpt 4.1 mini,gpt 5,gpt 5 mini
rubric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Labeled angles (if any) match drawn angle,0.691,0.593,0.644,0.652,0.793,0.791,0.795,0.829
Labeled lengths and areas (if any) match visual proportions,0.449,0.429,0.429,0.422,0.581,0.596,0.673,0.616
Diagram is fully in frame,0.573,0.541,0.604,0.581,0.184,0.162,0.39,0.398
Diagram elements are scaled to be readable,0.362,0.308,0.334,0.272,-0.017,0.0,0.043,0.097
Labels (if any) are associated with correct elements,0.812,0.688,0.715,0.687,0.789,0.78,0.768,0.757
Diagram elements don't problematically overlap,0.489,0.436,0.608,0.549,0.062,0.0,0.315,0.094


Tables

In [81]:
# Backtranslation summary
bt_summary_rows = []
for MODEL in MODELS:
    if MODEL not in all_kappa_results:
        continue
    kappa_results = all_kappa_results[MODEL]
    kappas = [res['kappa'] for res in kappa_results.values() if res.get('kappa') is not None]
    avg_kappa = sum(kappas) / len(kappas) if kappas else float('nan')
    bt_summary_rows.append({
        'model': friendly[MODEL], 
        'cohen_k': avg_kappa,
    })
bt_summary_df = pd.DataFrame(bt_summary_rows)

# Judge summary (all modes)
judge_summary_rows = []
for mode in judge_modes:
    for MODEL in MODELS:
        kappa_results = all_judge_kappa_results.get(mode, {}).get(MODEL)
        if not kappa_results:
            continue
        kappas = [res["kappa"] for res in kappa_results.values() if res.get("kappa") is not None]
        avg_kappa = sum(kappas) / len(kappas) if kappas else float("nan")

        judge_summary_rows.append({
            "mode": mode,
            "model": friendly[MODEL], 
            "cohen_k": avg_kappa,
        })
judge_summary_df = pd.DataFrame(judge_summary_rows)

In [82]:
def compute_pipeline_stats(*, df, avg_kappa, time_col_ms, cost_col):
    avg_time_s = pd.to_numeric(df[time_col_ms], errors='coerce').fillna(0).mean() / 1000
    # Handle missing cost column (backtranslation CSVs don't have cost column)
    if cost_col in df.columns:
        total_cost = pd.to_numeric(df[cost_col], errors='coerce').fillna(0).sum()
    else:
        total_cost = 0.0
    return avg_kappa, avg_time_s, total_cost

comparison_tables = {}

for mode in sorted(judge_df["mode"].unique()):
    rows = []
    for model_id in ["gpt-4.1", "gpt-5", "gpt-4.1-mini", "gpt-5-mini"]:
        bt_df = dfs[model_id].copy()
        bt_df["total_time_ms"] = (
            pd.to_numeric(bt_df["extraction_time_ms"], errors='coerce').fillna(0)
            + pd.to_numeric(bt_df["evaluation_time_ms"], errors='coerce').fillna(0)
        )
        bt_avg_kappa = bt_summary_df.set_index("model").loc[friendly[model_id], "cohen_k"]
        bt_k, bt_time, bt_cost = compute_pipeline_stats(
            df=bt_df,
            avg_kappa=bt_avg_kappa,
            time_col_ms="total_time_ms",
            cost_col="extraction_cost",
        )

        judge_model_df = judge_df[(judge_df["mode"] == mode) & (judge_df["model"] == model_id)]
        judge_avg_kappa = judge_summary_df.set_index(["mode", "model"]).loc[(mode, friendly[model_id]), "cohen_k"]
        judge_k, judge_time, judge_cost = compute_pipeline_stats(
            df=judge_model_df,
            avg_kappa=judge_avg_kappa,
            time_col_ms="elapsed_ms",
            cost_col="cost",
        )

        rows.append({
            "Model": friendly[model_id],
            "BT κ": bt_k,
            "BT time (s)": bt_time,
            "BT cost ($)": bt_cost,
            f"Judge ({mode}) κ": judge_k,
            f"Judge ({mode}) time (s)": judge_time,
            f"Judge ({mode}) cost ($)": judge_cost,
        })

    comparison_tables[mode] = (
        pd.DataFrame(rows)
        .set_index("Model")
        .round({"BT κ": 3, "BT time (s)": 2, "BT cost ($)": 2,
                f"Judge ({mode}) κ": 3, f"Judge ({mode}) time (s)": 2, f"Judge ({mode}) cost ($)": 2})
    )

for mode, table in comparison_tables.items():
    print(f"\nBacktranslation vs LLM-as-a-Judge ({mode})")
    display(table)




Backtranslation vs LLM-as-a-Judge (both)


Unnamed: 0_level_0,BT κ,BT time (s),BT cost ($),Judge (both) κ,Judge (both) time (s),Judge (both) cost ($)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gpt 4.1,0.563,25.74,6.76,0.399,14.5,0.0
gpt 5,0.556,36.93,10.3,0.498,26.2,0.0
gpt 4.1 mini,0.499,24.69,0.0,0.388,8.55,0.0
gpt 5 mini,0.527,42.26,2.25,0.465,12.66,0.0


In [80]:
# Cost summary (simple)
MODEL_COSTS = {
    "gpt-5": (1.25, 10.00),
    "gpt-5-mini": (0.25, 2.00),
    "gpt-4.1": (2.00, 8.00),
    "gpt-4.1-mini": (0.40, 1.60),
}
CACHE_DISCOUNT = 0.25

# Backtranslation: use recorded extraction_cost from CSV
bt_cost = (
    pd.concat([
        pd.DataFrame({"model": [model] * len(dfs[model]), "cost": pd.to_numeric(dfs[model]["extraction_cost"], errors="coerce").fillna(0)})
        for model in MODELS if model in dfs
    ], ignore_index=True)
    .groupby("model", as_index=False)
    .agg(n_diagrams=("cost", "size"), avg_cost_per_diagram_usd=("cost", "mean"), total_cost_usd=("cost", "sum"))
)
bt_cost.insert(0, "mode", "-")
bt_cost.insert(0, "pipeline", "backtranslation")

# Judge: recompute cost from token counts in CSV
judge_tmp = judge_df.copy()
judge_tmp["input_tokens"] = pd.to_numeric(judge_tmp.get("input_tokens", 0), errors="coerce").fillna(0)
judge_tmp["cached_tokens"] = pd.to_numeric(judge_tmp.get("cached_tokens", 0), errors="coerce").fillna(0)
judge_tmp["output_tokens"] = pd.to_numeric(judge_tmp.get("output_tokens", 0), errors="coerce").fillna(0)

def row_cost(row):
    prompt_per_m, completion_per_m = MODEL_COSTS[row["model"]]
    prompt_rate = prompt_per_m / 1_000_000
    completion_rate = completion_per_m / 1_000_000
    cached_rate = prompt_rate * CACHE_DISCOUNT

    input_tokens = row["input_tokens"]
    cached_tokens = min(row["cached_tokens"], input_tokens)
    fresh_tokens = input_tokens - cached_tokens
    output_tokens = row["output_tokens"]

    return fresh_tokens * prompt_rate + cached_tokens * cached_rate + output_tokens * completion_rate

judge_tmp["cost"] = judge_tmp.apply(row_cost, axis=1)
judge_cost = (
    judge_tmp.groupby(["mode", "model"], as_index=False)
    .agg(n_diagrams=("cost", "size"), avg_cost_per_diagram_usd=("cost", "mean"), total_cost_usd=("cost", "sum"))
)
judge_cost.insert(0, "pipeline", "llm_judge")

cost_summary = pd.concat([bt_cost, judge_cost], ignore_index=True)
cost_summary = cost_summary.sort_values(["pipeline", "mode", "model"]).reset_index(drop=True)
display(cost_summary.round({"avg_cost_per_diagram_usd": 6, "total_cost_usd": 4}))



KeyError: 'extraction_cost'