### Backtranslation Evaluation

This notebook runs the Backtranslation pipeline end-to-end; outputs are cached under `results/backtranslation/cache/`.

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code (e.g. the
# `backtranslation` module imported below) take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

import os, pathlib
from pathlib import Path
from datetime import datetime

# Switch the working directory to the parent of the directory the kernel
# started in, so the relative paths used by later cells ("data/...",
# "results/...") resolve from there.
#
# The starting directory is captured only once per kernel session. Re-reading
# Path.cwd() on every run would make this cell non-idempotent: each
# re-execution would climb one directory higher and break those paths.
notebook_dir = globals().get("notebook_dir") or pathlib.Path.cwd()
target = (notebook_dir / "..").resolve()
os.chdir(target)

import pandas as pd

from backtranslation import run_backtranslation_pipeline, export_results_csv

# The model identifier is read from the OPENAI_MODEL environment variable.
MODEL = os.environ.get("OPENAI_MODEL")
if not MODEL:
    # Fail here with an actionable message. Otherwise MODEL would be None,
    # the pipeline cell would be called with model=None, and the export cell
    # would crash on MODEL.replace(...) only after the whole run.
    raise RuntimeError(
        "Environment variable OPENAI_MODEL is not set; "
        "set it to the model name before running this notebook."
    )
print(f"Using model: {MODEL}")

Using model: gpt-4.1-mini


In [3]:
# --- Run configuration ------------------------------------------------------
# Input: CSV file that the dataset-loading cell reads.
TEST_SET_PATH = "data/geometric_shapes_test_set.csv"
# Output: directory passed to the pipeline and used for the exported CSV.
RESULTS_DIR = Path("results") / "backtranslation"
# Forwarded to the pipeline as its `concurrency` argument.
MAX_CONCURRENT = 6
# Number of leading rows to evaluate; None runs the whole test set.
LIMIT = None

print(f"Results will be saved to: {RESULTS_DIR}")

Results will be saved to: results/backtranslation


In [4]:
# Read the test set from TEST_SET_PATH and report its size, column names and
# the number of rows per `main_category` value.
print("Loading test dataset...")
df = pd.read_csv(TEST_SET_PATH)
print(f"Loaded {df.shape[0]} examples")
print(f"Columns: {df.columns.tolist()}")
print(f"Categories: {df.loc[:, 'main_category'].value_counts()}")
# Last expression of the cell: rendered as a preview of the first two rows.
df.head(n=2)

Loading test dataset...
Loaded 398 examples
Columns: ['prompt', 'tikz', 'image', 'main_category', 'subcategory', 'diagram_id', 'assignment_type', 'assigned_to', 'image_png_path']
Categories: main_category
2d shapes    208
3d shapes    190
Name: count, dtype: int64


Unnamed: 0,prompt,tikz,image,main_category,subcategory,diagram_id,assignment_type,assigned_to,image_png_path
0,triangle with side length 8 horizontal at bott...,\documentclass{IM}\n\usepackage{tikz}\n\begin{...,https://2xavun1dsa0sayar.public.blob.vercel-st...,2d shapes,triangle,1,individual,Shubhra,data/judge_pngs/diagram_1.png
1,Two triangles showing scaled copy relationship...,\documentclass{IM}\n\usepackage{tikz}\n\begin{...,https://2xavun1dsa0sayar.public.blob.vercel-st...,2d shapes,triangle,2,individual,Rebecca,data/judge_pngs/diagram_2.png


In [5]:
subset = df.head(LIMIT) if LIMIT else df

all_results = await run_backtranslation_pipeline(
    subset,
    model=MODEL,
    concurrency=MAX_CONCURRENT,
    results_dir=RESULTS_DIR,
)

success_count = sum(1 for r in all_results if r.get("ir", {}).get("success", False))
print(f"\nCompleted {len(all_results)} examples")
print(f"Success rate: {success_count / len(all_results):.2%}")

Processing 398 diagrams with model=gpt-4.1-mini, concurrency=6


Processing diagrams: 100%|██████████| 398/398 [00:00<00:00, 5034.53it/s]


Completed 398 examples
Success rate: 92.46%





In [6]:
# Build a per-model, per-day file name under RESULTS_DIR. ":" and "/" in the
# model identifier are replaced with "_" so it can be used as a file name.
model_str = MODEL.translate(str.maketrans(":/", "__"))
date_str = f"{datetime.now():%Y%m%d}"
csv_name = f"evaluation_results_{model_str}_{date_str}.csv"
results_csv_path = RESULTS_DIR / csv_name

# Write the results through the project helper and keep the DataFrame it
# returns for inspection.
results_df = export_results_csv(all_results, results_csv_path)
print(f"Results CSV shape: {results_df.shape}")
# Last expression of the cell: rendered as a preview of the first rows.
results_df.head()

Results exported to results/backtranslation/evaluation_results_gpt-4.1-mini_20260219.csv
Results CSV shape: (398, 29)


Unnamed: 0,diagram_id,model,main_category,subcategory,extraction_success,extraction_time_ms,prompt_tokens,completion_tokens,evaluation_time_ms,overall_score,...,diagram_elements_are_readable_size_passed,diagram_elements_are_readable_size_message,shape_outlines_are_closed_passed,shape_outlines_are_closed_message,core_mathematical_properties_of_shapes_correct_passed,core_mathematical_properties_of_shapes_correct_message,labeled_lengths_areas_match_proportions_passed,labeled_lengths_areas_match_proportions_message,schema_validation_passed,schema_validation_message
0,1,gpt-4.1-mini,2d shapes,triangle,True,2973.48175,3100,188,4.916708,0.857143,...,True,All elements >= 2.6pt,True,All shapes marked closed,True,Shape parameters are non-degenerate,,Not enough labeled pairs to compare proportions,,
1,2,gpt-4.1-mini,2d shapes,triangle,True,5235.296583,3225,316,15.882208,0.714286,...,False,Elements below readability threshold 6.3pt: no...,True,All shapes marked closed,True,Shape parameters are non-degenerate,False,"Length proportion mismatch: ((no id), label=3)...",,
2,3,gpt-4.1-mini,2d shapes,triangle,True,4983.09575,3276,280,11.426833,0.714286,...,True,All elements >= 2.0pt,True,All shapes marked closed,True,Shape parameters are non-degenerate,False,"Length proportion mismatch: ((no id), label=6)...",,
3,4,gpt-4.1-mini,2d shapes,triangle,True,4883.188,3069,167,5.543083,0.714286,...,False,Elements below readability threshold 2.5pt: re...,True,All shapes marked closed,False,Rectangle has zero area,,Not enough labeled pairs to compare proportions,,
4,5,gpt-4.1-mini,2d shapes,triangle,True,3825.771417,3102,198,10.283084,0.571429,...,False,Elements below readability threshold 1.3pt: re...,True,All shapes marked closed,False,Rectangle has zero area,,No numeric labels to check,,
