# Document-understanding benchmark sweeps
This notebook compares synthetic invoice benchmarks against the bundled FUNSD sample.
For each token budget it tracks accuracy, the average number of tokens kept, and the information-bottleneck proxies (`information_bound` and `rate_distortion`) emitted for that budget.


In [None]:
from pprint import pprint

from benchmarks.doc_understanding import run_benchmark, run_funsd_benchmark


In [None]:
# Run both sweeps with small dataset sizes; seed=1 pins the seed passed to
# the synthetic benchmark, and use_sample=True is passed to the FUNSD run.
synthetic_report = run_benchmark(budget_values=(2, 4, 6), dataset_size=8, threshold=400.0, seed=1)
funsd_report = run_funsd_benchmark(budget_values=(6, 8, 10), dataset_size=2, use_sample=True)

print('Synthetic budgets:')
pprint(synthetic_report['budgets'])
# The leading '\n' escape puts a blank line between the two listings. The
# literal previously contained a raw line break, which is a SyntaxError in a
# single-quoted string.
print('\nFUNSD budgets:')
pprint(funsd_report['budgets'])


In [None]:
def summarise(report, label):
    """Flatten a benchmark report into one row dict per budget entry.

    Args:
        report: Mapping with a ``'budgets'`` iterable. Each entry must
            provide ``'budget'``, ``'accuracy'`` and
            ``'average_kept_tokens'``; an optional ``'metrics'`` mapping may
            carry ``'information_bound'`` and ``'rate_distortion'``.
        label: Value stored under ``'benchmark'`` in every row.

    Returns:
        A list of dicts, in the same order as ``report['budgets']``. The two
        metric fields are ``None`` when the entry has no ``'metrics'``
        mapping or the mapping lacks that key.

    Raises:
        KeyError: If ``'budgets'`` or one of the required entry keys is
            missing.
    """

    def to_row(budget_entry):
        # Entries without a 'metrics' mapping fall back to an empty dict so
        # the proxy lookups below resolve to None instead of raising.
        proxies = budget_entry.get('metrics', {})
        return {
            'benchmark': label,
            'budget': budget_entry['budget'],
            'accuracy': budget_entry['accuracy'],
            'average_kept_tokens': budget_entry['average_kept_tokens'],
            'information_bound': proxies.get('information_bound'),
            'rate_distortion': proxies.get('rate_distortion'),
        }

    return [to_row(budget_entry) for budget_entry in report['budgets']]

# Combine the per-budget rows of both reports, synthetic first, then print
# each row on its own.
comparison = [
    *summarise(synthetic_report, 'synthetic'),
    *summarise(funsd_report, 'funsd-sample'),
]
for summary_row in comparison:
    pprint(summary_row)
