### LLM-as-Judge Evaluation

This notebook runs the LLM-as-Judge pipeline end-to-end. Outputs are cached under `results/llm_judge/cache/`.


In [22]:
# Setup
%load_ext autoreload
%autoreload 2

import pandas as pd
import json
import os
import subprocess
import sys
from pathlib import Path

# Later cells use repo-relative paths (data/, scripts/, results/), so make the
# repository root the working directory whether the kernel was started there
# or inside a `notebooks` folder.
notebook_dir = Path.cwd()
if notebook_dir.name != 'notebooks':
    # Not inside `notebooks`: treat the current directory as the repo root.
    repo_root = notebook_dir
else:
    repo_root = (notebook_dir / '..').resolve()
os.chdir(repo_root)
print(f'Repo root: {repo_root}')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Repo root: /Users/vishalkumar/Desktop/Research/code/DiagramIR


### Runtime config
You can run the LLM-as-Judge pipeline in one of three input modes: `image`, `code`, or `both` (image + code).
Set `RUN_PRECOMPUTE_PNGS = True` to generate PNGs for the test set before running in `image` or `both` mode.

In [23]:

# --- Tunable run settings ---
DATASET = Path('data/geometric_shapes_test_set.csv')  # resolved against the current working directory
MODE = 'both'                # must be one of VALID_MODES below
MODEL = 'gpt-4.1-mini'
LIMIT = None                 # positive int -> evaluate only the first LIMIT samples; None -> all
CONCURRENCY = 6              # passed to run_judge_evaluations(concurrency=...)
RUN_PRECOMPUTE_PNGS = False  # True (with MODE 'image' or 'both') -> run scripts/precompute_judge_pngs.py

# --- Fail fast on bad configuration ---
# A mistyped MODE would otherwise silently skip the PNG precompute gate and
# only surface (if at all) deep inside the judge run.
VALID_MODES = ('image', 'code', 'both')
if MODE not in VALID_MODES:
    raise ValueError(f'Unknown MODE {MODE!r}; expected one of {VALID_MODES}')

# Explicit raise instead of `assert`: assertions are removed when Python runs
# with -O, and this matches the ValueError style used for the column check.
if not DATASET.exists():
    raise FileNotFoundError(f'Missing dataset: {DATASET}')

# --- Load the dataset and verify the columns this notebook requires ---
df = pd.read_csv(DATASET)
required_cols = {'diagram_id', 'tikz'}
missing_cols = sorted(required_cols - set(df.columns))
if missing_cols:
    raise ValueError(f'Missing required columns: {missing_cols}')

print('Dataset:', DATASET)
print('Rows:', len(df))
print('Columns:', list(df.columns))
print(f'MODE={MODE}, RUN_PRECOMPUTE_PNGS={RUN_PRECOMPUTE_PNGS}')
# Keep as the last expression so the notebook renders the preview as a table.
df.head(2)


Dataset: data/geometric_shapes_test_set.csv
Rows: 398
Columns: ['prompt', 'tikz', 'image', 'main_category', 'subcategory', 'diagram_id', 'assignment_type', 'assigned_to', 'image_png_path']
MODE=both, RUN_PRECOMPUTE_PNGS=False


Unnamed: 0,prompt,tikz,image,main_category,subcategory,diagram_id,assignment_type,assigned_to,image_png_path
0,triangle with side length 8 horizontal at bott...,\documentclass{IM}\n\usepackage{tikz}\n\begin{...,https://2xavun1dsa0sayar.public.blob.vercel-st...,2d shapes,triangle,1,individual,Shubhra,data/judge_pngs/diagram_1.png
1,Two triangles showing scaled copy relationship...,\documentclass{IM}\n\usepackage{tikz}\n\begin{...,https://2xavun1dsa0sayar.public.blob.vercel-st...,2d shapes,triangle,2,individual,Rebecca,data/judge_pngs/diagram_2.png


In [24]:
# PNG generation is opt-in and only applies to the modes that include an image.
wants_png_precompute = RUN_PRECOMPUTE_PNGS and MODE in {'image', 'both'}
if wants_png_precompute:
    # Use the kernel's own interpreter to launch the script.
    cmd = [sys.executable, 'scripts/precompute_judge_pngs.py']
    print('Running:', ' '.join(cmd))
    # check=True raises CalledProcessError if the script exits non-zero.
    subprocess.run(cmd, check=True)


In [25]:
# Load samples and run judge evaluations
from llm_judge import load_samples, run_judge_evaluations, export_mode_model_csvs

samples = load_samples(DATASET, MODE)
if LIMIT is not None and LIMIT > 0:
    samples = samples[:LIMIT]
    print(f"Evaluating first {len(samples)} samples due to LIMIT={LIMIT}.")

await run_judge_evaluations(
    samples,
    mode=MODE,
    model=MODEL,
    concurrency=CONCURRENCY,
)


Loaded 396/398 rows (skipped missing tikz=0, missing image=2).


Collecting judgements: 100%|██████████| 396/396 [00:00<00:00, 64789.53it/s]


In [26]:
# Sanity-check the cache: count the per-diagram JSON records for this
# mode/model and peek at the first one.
model_dirname = MODEL.replace('/', '_')  # the directory name uses '_' in place of '/'
result_dir = Path('results/llm_judge/cache', MODE, model_dirname)
files = sorted(result_dir.glob('diagram_*.json'))

print(f'Result directory: {result_dir}')
print(f'Cached records: {len(files)}')

if len(files) > 0:
    first_record_text = files[0].read_text()
    sample = json.loads(first_record_text)
    print('Sample keys:', sorted(sample.keys()))
    print('Sample diagram_id:', sample.get('diagram_id'))
    print('Sample mode/model:', sample.get('mode'), sample.get('model'))


Result directory: results/llm_judge/cache/both/gpt-4.1-mini
Cached records: 396
Sample keys: ['diagram_id', 'elapsed_ms', 'image_png_path', 'mode', 'model', 'reasoning_effort', 'rubric', 'temperature', 'tikz_code', 'tokens']
Sample diagram_id: 1
Sample mode/model: both gpt-4.1-mini


In [27]:
# Export CSVs for the configured mode/model, then preview the last path returned.
llm_judge_root = Path("results/llm_judge")
exported_csv_paths = export_mode_model_csvs(
    mode=MODE,
    model=MODEL,
    cache_root=llm_judge_root / "cache",
    output_root=llm_judge_root,
)

last_csv_path = exported_csv_paths[-1]
judge_results_df = pd.read_csv(last_csv_path)
# Bare last expression so the notebook renders the preview as a table.
judge_results_df.head()

Unnamed: 0,diagram_id,model,mode,temperature,reasoning_effort,elapsed_ms,input_tokens,cached_tokens,output_tokens,total_tokens,...,core_mathematical_properties_of_shapes_correct_value,core_mathematical_properties_of_shapes_correct_rationale,diagram_fully_in_canvas_value,diagram_fully_in_canvas_rationale,diagram_elements_are_readable_size_value,diagram_elements_are_readable_size_rationale,labels_associated_with_elements_value,labels_associated_with_elements_rationale,diagram_elements_dont_problematically_overlap_value,diagram_elements_dont_problematically_overlap_rationale
0,1,gpt-4.1-mini,both,0.0,,9569.401291,4182,0,525,4707,...,Yes,The triangle is correctly drawn with a base an...,Yes,"All elements of the triangle, including the ba...",Yes,The triangle and its elements are large enough...,Yes,The label 'base' is placed below the base line...,Yes,No labels or diagram elements overlap in a way...
1,10,gpt-4.1-mini,both,0.0,,7960.060959,2188,1152,495,2683,...,Yes,"The triangle is drawn with vertices at (0,0), ...",Yes,The entire triangle and its vertices are fully...,Yes,The triangle and its vertices are drawn with t...,,There are no textual labels or annotations ass...,Yes,"The diagram elements, including the triangle e..."
2,100,gpt-4.1-mini,both,0.0,,9003.130958,5064,1152,446,5510,...,Yes,The triangles are correctly positioned as rota...,Yes,All four triangles and their labels are fully ...,Yes,The triangles and labels are clearly visible a...,Yes,Each vertex label (A through L) is placed near...,Yes,No elements or labels overlap in a way that ob...
3,101,gpt-4.1-mini,both,0.0,,8276.729875,2904,1152,492,3396,...,Yes,The triangle has a base of length 5 and a heig...,Yes,"All elements of the diagram, including the tri...",Yes,"The triangle and all labels, including measure...",Yes,The labels for base and height are placed near...,Yes,No labels or diagram elements overlap in a way...
4,102,gpt-4.1-mini,both,0.0,,9070.639167,3540,1152,461,4001,...,Yes,The shapes are stylized bird figures rather th...,Yes,"All elements, including the card background an...",Yes,The shapes and labels (bird eyes) are clearly ...,,There are no textual labels or length/angle la...,Yes,The three bird shapes are positioned in a tria...
