# LLM-as-Judge Evaluation

This notebook runs the LLM-as-Judge pipeline end-to-end and inspects cached outputs under `results/llm_judge/`.


In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import json
import os
import subprocess
import sys
from pathlib import Path

# Work out the repository root from the kernel's starting directory: when the
# notebook is launched from a folder named ``notebooks`` the root is its
# parent, otherwise the current directory is used unchanged.
notebook_dir = Path.cwd()
if notebook_dir.name == 'notebooks':
    repo_root = notebook_dir.joinpath('..').resolve()
else:
    repo_root = notebook_dir

# Switch the working directory so the relative paths used in this notebook
# (dataset, scripts, results) are resolved against the repo root.
os.chdir(repo_root)
print('Repo root:', repo_root)


In [None]:
# --- Run configuration -------------------------------------------------------
# These values are forwarded to ``llm_judge.py`` as command-line arguments by
# the evaluation cell (``--csv``, ``--mode``, ``--models``, ``--limit``,
# ``--concurrency``).
DATASET = Path('data/geometric_shapes_test_set.csv')
MODE = 'both'
MODELS = 'gpt-4.1-mini'
LIMIT = 10
CONCURRENCY = 4
# When True, the precompute cell runs ``scripts/precompute_judge_pngs.py``.
RUN_PRECOMPUTE_PNGS = False

# --- Dataset sanity checks ---------------------------------------------------
# Raise explicitly instead of using ``assert``: assertions are removed when
# Python runs with ``-O``, and an explicit raise matches the column check below.
if not DATASET.exists():
    raise FileNotFoundError(f'Missing dataset: {DATASET}')

df = pd.read_csv(DATASET)

# The CSV must provide at least these columns; report all missing ones at once.
required_cols = {'diagram_id', 'tikz'}
missing_cols = sorted(required_cols - set(df.columns))
if missing_cols:
    raise ValueError(f'Missing required columns: {missing_cols}')

print('Dataset:', DATASET)
print('Rows:', len(df))
print('Columns:', list(df.columns))
# Last expression of the cell: the notebook displays the first two rows.
df.head(2)


In [None]:
# Optional step, gated by RUN_PRECOMPUTE_PNGS: run
# scripts/precompute_judge_pngs.py with the current interpreter.
if not RUN_PRECOMPUTE_PNGS:
    print('Skipping PNG precompute (set RUN_PRECOMPUTE_PNGS=True to enable).')
else:
    cmd = [sys.executable, 'scripts/precompute_judge_pngs.py']
    print(f"Running: {' '.join(cmd)}")
    # check=True raises CalledProcessError if the script exits non-zero.
    subprocess.run(cmd, check=True)


In [None]:
# Command-line options for llm_judge.py, in the order they are passed.
judge_options = {
    '--csv': str(DATASET),
    '--mode': MODE,
    '--models': MODELS,
    '--limit': str(LIMIT),
    '--concurrency': str(CONCURRENCY),
}

# Build the argv list: interpreter, script, then each flag followed by its value.
cmd = [sys.executable, 'llm_judge.py']
for flag, value in judge_options.items():
    cmd.extend((flag, value))

print(f"Running: {' '.join(cmd)}")
# check=True raises CalledProcessError if the judge script exits non-zero.
subprocess.run(cmd, check=True)


In [None]:
# Results are looked up under results/llm_judge/<MODE>/<MODELS>, with any '/'
# in MODELS replaced by '_' to form a single directory name.
model_dirname = MODELS.replace('/', '_')
result_dir = Path('results/llm_judge', MODE, model_dirname)

# Collect the per-diagram JSON files in sorted (name) order.
files = sorted(result_dir.glob('diagram_*.json'))
print('Result directory:', result_dir)
print('Cached records:', len(files))

# Peek at the first record, if any, to show which fields it carries.
if files:
    first_file = files[0]
    sample = json.loads(first_file.read_text())
    print(f'Sample keys: {sorted(sample.keys())}')
    print(f"Sample diagram_id: {sample.get('diagram_id')}")
    print(f"Sample mode/model: {sample.get('mode')} {sample.get('model')}")
