# Harvest Model Accuracy from Evaluation Results

This notebook scans `output/csv/` for evaluation results and generates the `current_model_accuracy.csv` file for use in model comparison.

## Data Sources
1. **per_field_metrics.csv** - Pre-computed field-level metrics by model
2. **Model batch results** - Per-image results that can be aggregated

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import re

# Paths
CSV_DIR = Path("../output/csv")
OUTPUT_DIR = Path("../output")

print(f"Scanning: {CSV_DIR.resolve()}")

Scanning: /Users/tod/Desktop/LMM_POC/output/csv


---
## 1. Discover Available Result Files

In [2]:
def discover_result_files(csv_dir: Path) -> dict:
    """Discover all available result files in the CSV directory."""
    files = {
        'per_field_metrics': None,
        'llama_batch': [],
        'internvl3_batch': [],  # Generic internvl3 (treated as 8B)
        'internvl3_2b_batch': [],
        'internvl3_8b_batch': [],
        'batch_summary': [],
    }
    
    for f in csv_dir.glob("*.csv"):
        name = f.name
        
        if name == 'per_field_metrics.csv':
            files['per_field_metrics'] = f
        elif name.startswith('llama_batch_results_'):
            files['llama_batch'].append(f)
        elif name.startswith('internvl3_2b_batch_results_'):
            files['internvl3_2b_batch'].append(f)
        elif name.startswith('internvl3_5_8b_batch_results_') or name.startswith('internvl3_8b_batch_results_'):
            files['internvl3_8b_batch'].append(f)
        elif name.startswith('internvl3_batch_results_'):
            # Treat generic internvl3_batch_results as 8B (most common case)
            files['internvl3_batch'].append(f)
        elif '_summary.csv' in name:
            files['batch_summary'].append(f)
    
    # Sort by timestamp (most recent first)
    for key in ['llama_batch', 'internvl3_batch', 'internvl3_2b_batch', 'internvl3_8b_batch']:
        files[key] = sorted(files[key], key=lambda x: x.name, reverse=True)
    
    return files

result_files = discover_result_files(CSV_DIR)

print("Available Result Files")
print("=" * 60)
print(f"\nper_field_metrics.csv: {'Found' if result_files['per_field_metrics'] else 'Not found'}")
print(f"\nLlama batch results: {len(result_files['llama_batch'])} files")
for f in result_files['llama_batch'][:3]:
    print(f"  - {f.name}")

print(f"\nInternVL3-8B batch results: {len(result_files['internvl3_8b_batch'])} files")
for f in result_files['internvl3_8b_batch'][:3]:
    print(f"  - {f.name}")

print(f"\nInternVL3 (generic, treated as 8B): {len(result_files['internvl3_batch'])} files")
for f in result_files['internvl3_batch'][:3]:
    print(f"  - {f.name}")

print(f"\nInternVL3-2B batch results: {len(result_files['internvl3_2b_batch'])} files")
for f in result_files['internvl3_2b_batch'][:3]:
    print(f"  - {f.name}")

Available Result Files

per_field_metrics.csv: Not found

Llama batch results: 2 files
  - llama_batch_results_20251210_003155.csv
  - llama_batch_results_20251210_001755.csv

InternVL3-8B batch results: 0 files

InternVL3 (generic, treated as 8B): 1 files
  - internvl3_batch_results_20251210_005902.csv

InternVL3-2B batch results: 1 files
  - internvl3_2b_batch_results_20251210_013149.csv


---
## 2. Option A: Use per_field_metrics.csv (Recommended)

This file already contains pre-computed field-level accuracy for each model.

In [3]:
def load_per_field_metrics(csv_path: Path) -> pd.DataFrame:
    """Load and display per-field metrics."""
    if not csv_path or not csv_path.exists():
        print("per_field_metrics.csv not found")
        return None
    
    df = pd.read_csv(csv_path)
    print(f"Loaded: {csv_path}")
    print(f"Shape: {df.shape}")
    print(f"Models: {df['model'].unique().tolist()}")
    print(f"Fields: {df['field'].nunique()}")
    
    return df

per_field_df = load_per_field_metrics(result_files['per_field_metrics'])
if per_field_df is not None:
    display(per_field_df.head(10))

per_field_metrics.csv not found


In [4]:
# Fields that are EXCLUSIVE to our model (competing model doesn't have these)
EXCLUSIVE_FIELDS = ["DOCUMENT_TYPE"]

# List fields where F1 scores are meaningful (for comparison)
# These are fields with multiple items where precision/recall matter
LIST_FIELDS = [
    "LINE_ITEM_DESCRIPTIONS", "LINE_ITEM_QUANTITIES", "LINE_ITEM_PRICES",
    "LINE_ITEM_TOTAL_PRICES", "TRANSACTION_DATES", "TRANSACTION_AMOUNTS_PAID",
]

def extract_model_accuracy_from_per_field(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """
    Extract field-level accuracy AND F1 scores for a specific model.
    
    The F1 scores in per_field_metrics.csv are POSITION-AGNOSTIC (set-based).
    This allows fair comparison with competing models using the same methodology.
    """
    model_df = df[df['model'] == model_name].copy()
    
    if len(model_df) == 0:
        print(f"Model '{model_name}' not found. Available: {df['model'].unique().tolist()}")
        return None
    
    # Determine subset based on field
    bank_fields = ['STATEMENT_DATE_RANGE', 'TRANSACTION_DATES', 'TRANSACTION_AMOUNTS_PAID']
    
    result = []
    for _, row in model_df.iterrows():
        subset = 'Bank statements' if row['field'] in bank_fields else 'Invoices & receipts'
        
        # Extract F1 score if available (position-agnostic from sklearn)
        f1_agnostic = row.get('f1_score', np.nan) if 'f1_score' in row.index else np.nan
        
        result.append({
            'Subset': subset,
            'Field': row['field'],
            'Accuracy': row['accuracy'],
            'F1_Agnostic': f1_agnostic,  # Position-agnostic F1 for fair comparison
            'Exclusive': row['field'] in EXCLUSIVE_FIELDS,
        })
    
    result_df = pd.DataFrame(result)
    
    # Summary
    exclusive_count = result_df['Exclusive'].sum()
    comparable_count = len(result_df) - exclusive_count
    f1_available = result_df['F1_Agnostic'].notna().sum()
    
    print(f"  Comparable fields: {comparable_count}")
    print(f"  Exclusive fields (not in competing model): {exclusive_count}")
    print(f"  Fields with F1 scores: {f1_available}")
    
    return result_df

# Show available models
if per_field_df is not None:
    print("\nAvailable models in per_field_metrics.csv:")
    for model in per_field_df['model'].unique():
        model_data = per_field_df[per_field_df['model'] == model]
        count = len(model_data)
        mean_acc = model_data['accuracy'].mean()
        mean_f1 = model_data['f1_score'].mean() if 'f1_score' in model_data.columns else np.nan
        f1_str = f", mean F1: {mean_f1:.1%}" if not np.isnan(mean_f1) else ""
        print(f"  - {model}: {count} fields, mean accuracy: {mean_acc:.1%}{f1_str}")

---
## 3. Option B: Aggregate from Batch Results

If per_field_metrics.csv is not available or you want fresh results from a specific batch.

In [5]:
# Schema fields by document type
INVOICE_RECEIPT_FIELDS = [
    "DOCUMENT_TYPE", "BUSINESS_ABN", "SUPPLIER_NAME", "BUSINESS_ADDRESS",
    "PAYER_NAME", "PAYER_ADDRESS", "INVOICE_DATE", "LINE_ITEM_DESCRIPTIONS",
    "LINE_ITEM_QUANTITIES", "LINE_ITEM_PRICES", "LINE_ITEM_TOTAL_PRICES",
    "IS_GST_INCLUDED", "GST_AMOUNT", "TOTAL_AMOUNT",
]

BANK_STATEMENT_FIELDS = [
    "STATEMENT_DATE_RANGE", "LINE_ITEM_DESCRIPTIONS",  # LINE_ITEM_DESCRIPTIONS = transaction descriptions
    "TRANSACTION_DATES", "TRANSACTION_AMOUNTS_PAID",
]

# List fields where F1 is meaningful (multiple items)
LIST_FIELDS = [
    "LINE_ITEM_DESCRIPTIONS", "LINE_ITEM_QUANTITIES", "LINE_ITEM_PRICES",
    "LINE_ITEM_TOTAL_PRICES", "TRANSACTION_DATES", "TRANSACTION_AMOUNTS_PAID",
]

# Fields that are EXCLUSIVE to our model (competing model doesn't have these)
EXCLUSIVE_FIELDS = ["DOCUMENT_TYPE"]

# Document type mapping (normalize variations)
DOC_TYPE_MAP = {
    'receipt': 'invoice_receipt',
    'invoice': 'invoice_receipt', 
    'bank_statement': 'bank_statement',
    'RECEIPT': 'invoice_receipt',
    'INVOICE': 'invoice_receipt',
    'BANK_STATEMENT': 'bank_statement',
}

# Ground truth paths
GROUND_TRUTH_PATHS = {
    'inv_rec': Path("../evaluation_data/inv_rec/ground_truth_inv_rec.csv"),
    'bank': Path("../evaluation_data/bank/ground_truth_bank.csv"),
    'synthetic': Path("../evaluation_data/synthetic/ground_truth_synthetic.csv"),
}

def load_ground_truth() -> pd.DataFrame:
    """Load and combine all ground truth files."""
    dfs = []
    for name, path in GROUND_TRUTH_PATHS.items():
        if path.exists():
            df = pd.read_csv(path)
            df['source'] = name
            dfs.append(df)
            print(f"  Loaded {name}: {len(df)} rows")
    
    if dfs:
        combined = pd.concat(dfs, ignore_index=True)
        # Normalize image_file to stem for matching
        combined['image_stem'] = combined['image_file'].apply(lambda x: Path(str(x)).stem)
        print(f"  Total ground truth: {len(combined)} rows")
        return combined
    return None

print("Loading ground truth for F1 computation...")
ground_truth_df = load_ground_truth()

Loading ground truth for F1 computation...
  Loaded inv_rec: 6 rows
  Loaded bank: 15 rows
  Loaded synthetic: 9 rows
  Total ground truth: 30 rows


In [6]:
import re
import sys
from pathlib import Path

# Add project root to path for imports
PROJECT_ROOT = Path("..").resolve()
sys.path.insert(0, str(PROJECT_ROOT))

from common.evaluation_metrics import calculate_field_accuracy_f1

def load_batch_results(csv_path: Path) -> pd.DataFrame:
    """Load a batch results file."""
    df = pd.read_csv(csv_path)
    print(f"Loaded: {csv_path.name}")
    print(f"  Images: {len(df)}")
    print(f"  Document types: {df['document_type'].value_counts().to_dict()}")
    if 'overall_accuracy' in df.columns:
        print(f"  Mean overall accuracy: {df['overall_accuracy'].mean():.1%}")

    # Add image_stem for ground truth matching
    if 'image_file' in df.columns:
        df['image_stem'] = df['image_file'].apply(lambda x: Path(str(x)).stem)
    elif 'image_name' in df.columns:
        df['image_stem'] = df['image_name'].apply(lambda x: Path(str(x)).stem)

    return df


# Schema fields by document type
INVOICE_RECEIPT_FIELDS = [
    "DOCUMENT_TYPE", "BUSINESS_ABN", "SUPPLIER_NAME", "BUSINESS_ADDRESS",
    "PAYER_NAME", "PAYER_ADDRESS", "INVOICE_DATE", "LINE_ITEM_DESCRIPTIONS",
    "LINE_ITEM_QUANTITIES", "LINE_ITEM_PRICES", "LINE_ITEM_TOTAL_PRICES",
    "IS_GST_INCLUDED", "GST_AMOUNT", "TOTAL_AMOUNT",
]

BANK_STATEMENT_FIELDS = [
    "STATEMENT_DATE_RANGE", "LINE_ITEM_DESCRIPTIONS",
    "TRANSACTION_DATES", "TRANSACTION_AMOUNTS_PAID",
]

# Document type mapping
DOC_TYPE_MAP = {
    'receipt': 'invoice_receipt',
    'invoice': 'invoice_receipt',
    'bank_statement': 'bank_statement',
    'RECEIPT': 'invoice_receipt',
    'INVOICE': 'invoice_receipt',
    'BANK_STATEMENT': 'bank_statement',
}


def compute_field_accuracy_from_batch(df: pd.DataFrame, ground_truth: pd.DataFrame = None) -> pd.DataFrame:
    """
    Compute field-level F1 metrics from batch results using calculate_field_accuracy_f1.

    This function uses the SAME F1 calculation as model_comparison_reporter_v2.ipynb
    to ensure consistent metrics across all notebooks.
    """
    results = []

    # Normalize document types
    df = df.copy()
    df['doc_type_normalized'] = df['document_type'].map(DOC_TYPE_MAP).fillna('unknown')

    # Split by document type
    inv_rec_df = df[df['doc_type_normalized'] == 'invoice_receipt']
    bank_df = df[df['doc_type_normalized'] == 'bank_statement']

    print(f"  Invoice/Receipt images: {len(inv_rec_df)}")
    print(f"  Bank statement images: {len(bank_df)}")

    # Merge with ground truth if available
    has_gt = False
    if ground_truth is not None and 'image_stem' in df.columns:
        df = df.merge(ground_truth, on='image_stem', how='left', suffixes=('', '_gt'))
        has_gt = True
        matched = df['image_file_gt'].notna().sum()
        print(f"  Matched with ground truth: {matched}/{len(df)}")
    else:
        print("  No ground truth - using extraction rate as proxy")

    # Re-split after merge
    inv_rec_df = df[df['doc_type_normalized'] == 'invoice_receipt']
    bank_df = df[df['doc_type_normalized'] == 'bank_statement']

    def compute_f1_for_field(subset_df, field):
        """Compute F1 using calculate_field_accuracy_f1 for consistency."""
        if not has_gt or f'{field}_gt' not in subset_df.columns:
            return np.nan, np.nan, np.nan

        f1_scores = []
        precision_scores = []
        recall_scores = []

        for _, row in subset_df.iterrows():
            if pd.notna(row.get(f'{field}_gt')):
                ext = str(row[field]) if pd.notna(row[field]) else 'NOT_FOUND'
                gt = str(row[f'{field}_gt']) if pd.notna(row[f'{field}_gt']) else 'NOT_FOUND'

                # Use calculate_field_accuracy_f1 for consistent F1 calculation
                metrics = calculate_field_accuracy_f1(ext, gt, field, debug=False)

                f1_scores.append(metrics['f1_score'])
                precision_scores.append(metrics['precision'])
                recall_scores.append(metrics['recall'])

        if f1_scores:
            return np.mean(f1_scores), np.mean(precision_scores), np.mean(recall_scores)
        return np.nan, np.nan, np.nan

    # Process Invoice & Receipt fields
    for field in INVOICE_RECEIPT_FIELDS:
        if field not in df.columns:
            continue
        if len(inv_rec_df) == 0:
            continue

        valid = inv_rec_df[field].notna() & (inv_rec_df[field] != 'NOT_FOUND') & (inv_rec_df[field] != '')
        coverage = valid.mean() if len(inv_rec_df) > 0 else np.nan

        f1, precision, recall = compute_f1_for_field(inv_rec_df, field)

        results.append({
            'Subset': 'Invoices & receipts',
            'Field': field,
            'Accuracy': coverage,
            'F1': f1,
            'Precision': precision,
            'Recall': recall,
            'Exclusive': field in EXCLUSIVE_FIELDS,
            'extracted_count': int(valid.sum()),
            'total_count': len(inv_rec_df),
        })

    # Process Bank Statement fields
    for field in BANK_STATEMENT_FIELDS:
        if field not in df.columns:
            continue
        if len(bank_df) == 0:
            continue

        valid = bank_df[field].notna() & (bank_df[field] != 'NOT_FOUND') & (bank_df[field] != '')
        coverage = valid.mean() if len(bank_df) > 0 else np.nan

        f1, precision, recall = compute_f1_for_field(bank_df, field)

        results.append({
            'Subset': 'Bank statements',
            'Field': field,
            'Accuracy': coverage,
            'F1': f1,
            'Precision': precision,
            'Recall': recall,
            'Exclusive': field in EXCLUSIVE_FIELDS,
            'extracted_count': int(valid.sum()),
            'total_count': len(bank_df),
        })

    result_df = pd.DataFrame(results)

    # Summary
    exclusive_count = result_df['Exclusive'].sum()
    comparable_count = len(result_df) - exclusive_count

    print(f"\n  F1 Score (using calculate_field_accuracy_f1): {result_df['F1'].mean():.1%}")
    print(f"  Precision: {result_df['Precision'].mean():.1%}")
    print(f"  Recall: {result_df['Recall'].mean():.1%}")
    print(f"\n  Comparable fields: {comparable_count}")
    print(f"  Exclusive fields: {exclusive_count}")

    return result_df

In [7]:
# Example: Load most recent batch results for each model
print("Most recent batch results by model:")
print("=" * 60)

batch_dfs = {}

if result_files['llama_batch']:
    latest = result_files['llama_batch'][0]
    batch_dfs['Llama-11B'] = load_batch_results(latest)
    print()

# InternVL3-8B: Check both explicit 8B files and generic internvl3 files
if result_files['internvl3_8b_batch']:
    latest = result_files['internvl3_8b_batch'][0]
    batch_dfs['InternVL3-8B'] = load_batch_results(latest)
    print()
elif result_files['internvl3_batch']:
    # Use generic internvl3_batch_results as 8B fallback
    latest = result_files['internvl3_batch'][0]
    print("Using generic internvl3_batch_results as InternVL3-8B:")
    batch_dfs['InternVL3-8B'] = load_batch_results(latest)
    print()

if result_files['internvl3_2b_batch']:
    latest = result_files['internvl3_2b_batch'][0]
    batch_dfs['InternVL3-2B'] = load_batch_results(latest)
    print()

print(f"\nModels loaded: {list(batch_dfs.keys())}")

Most recent batch results by model:
Loaded: llama_batch_results_20251210_003155.csv
  Images: 9
  Document types: {'bank_statement': 3, 'receipt': 3, 'invoice': 3}
  Mean overall accuracy: 9764.3%

Using generic internvl3_batch_results as InternVL3-8B:
Loaded: internvl3_batch_results_20251210_005902.csv
  Images: 9
  Document types: {'bank_statement': 3, 'receipt': 3, 'invoice': 3}
  Mean overall accuracy: 8485.7%

Loaded: internvl3_2b_batch_results_20251210_013149.csv
  Images: 9
  Document types: {'bank_statement': 3, 'receipt': 3, 'invoice': 3}
  Mean overall accuracy: 7136.5%


Models loaded: ['Llama-11B', 'InternVL3-8B', 'InternVL3-2B']


---
## 4. Select Model and Export

Choose which model's accuracy to export as `current_model_accuracy.csv`

In [8]:
# =============================================================================
# CONFIGURATION - Select model and data source
# =============================================================================

# Choose model to export
SELECTED_MODEL = "Llama-11B"  # Options: "Llama-11B", "InternVL3-8B", "InternVL3-2B"

# Choose data source
USE_PER_FIELD_METRICS = False  # True = use per_field_metrics.csv (recommended)
                               # False = aggregate from batch results (coverage only)

# Output filename
OUTPUT_FILENAME = "current_model_accuracy.csv"

In [9]:
def export_model_accuracy(model_name: str, use_per_field: bool,
                          per_field_df: pd.DataFrame, batch_dfs: dict,
                          ground_truth_df: pd.DataFrame,
                          output_path: Path) -> pd.DataFrame:
    """Export field-level accuracy and F1 for selected model."""

    if use_per_field and per_field_df is not None:
        print(f"Using per_field_metrics.csv for {model_name}")
        result_df = extract_model_accuracy_from_per_field(per_field_df, model_name)
        if result_df is None:
            return None
    elif model_name in batch_dfs:
        print(f"Aggregating from batch results for {model_name}")
        if ground_truth_df is not None:
            print("Computing F1 against ground truth (using calculate_field_accuracy_f1)...")
        result_df = compute_field_accuracy_from_batch(batch_dfs[model_name], ground_truth_df)
    else:
        print(f"No data available for {model_name}")
        return None

    # Determine which columns to save
    save_cols = ['Subset', 'Field', 'Accuracy']

    # Add F1/Precision/Recall if available
    for col in ['F1', 'Precision', 'Recall', 'F1_Agnostic']:
        if col in result_df.columns and result_df[col].notna().any():
            save_cols.append(col)

    if 'Exclusive' in result_df.columns:
        save_cols.append('Exclusive')

    result_df[save_cols].to_csv(output_path, index=False)
    print(f"\nSaved: {output_path}")
    print(f"Fields: {len(result_df)}")
    print(f"Mean accuracy: {result_df['Accuracy'].mean():.1%}")

    # Print F1 summary
    if 'F1' in save_cols:
        print(f"Mean F1: {result_df['F1'].mean():.1%}")
    if 'Precision' in save_cols:
        print(f"Mean Precision: {result_df['Precision'].mean():.1%}")
    if 'Recall' in save_cols:
        print(f"Mean Recall: {result_df['Recall'].mean():.1%}")

    return result_df

# Export
output_path = OUTPUT_DIR / OUTPUT_FILENAME
exported_df = export_model_accuracy(
    SELECTED_MODEL,
    USE_PER_FIELD_METRICS,
    per_field_df,
    batch_dfs,
    ground_truth_df,
    output_path
)

if exported_df is not None:
    print("\n" + "=" * 60)
    print("Exported Data:")
    print("=" * 60)
    display(exported_df)

Aggregating from batch results for Llama-11B
Computing F1 against ground truth...
  Invoice/Receipt images: 6
  Bank statement images: 3
  Matched with ground truth: 18/18

  F1 Scores Summary:
    F1_Smart (field-appropriate): 96.0%  ← RECOMMENDED
    ----------------------------------------
    Binary (exact full match):    90.7%
    Position-Aware (positional):  96.1%
    Position-Agnostic (set):      96.0%
    Fuzzy Agnostic (lenient):     97.6%

  Smart F1 Method Selection:
    binary: DOCUMENT_TYPE, BUSINESS_ABN, IS_GST_INCLUDED, GST_AMOUNT, TOTAL_AMOUNT
    aware: LINE_ITEM_QUANTITIES, LINE_ITEM_PRICES, LINE_ITEM_TOTAL_PRICES, TRANSACTION_AMOUNTS_PAID
    agnostic: LINE_ITEM_DESCRIPTIONS, LINE_ITEM_DESCRIPTIONS, TRANSACTION_DATES
    fuzzy: SUPPLIER_NAME, BUSINESS_ADDRESS, PAYER_NAME, PAYER_ADDRESS, INVOICE_DATE, STATEMENT_DATE_RANGE

  Comparable fields: 17
  Exclusive fields: 1

Saved: ../output/current_model_accuracy.csv
Fields: 18
Mean accuracy: 100.0%

F1 Scores:
  F1_Smart

Unnamed: 0,Subset,Field,Accuracy,F1_Binary,F1_Aware,F1_Agnostic,F1_Fuzzy,F1_Smart,Smart_Method,Exclusive,extracted_count,total_count
0,Invoices & receipts,DOCUMENT_TYPE,1.0,1.0,1.0,1.0,1.0,1.0,binary,True,12,12
1,Invoices & receipts,BUSINESS_ABN,1.0,1.0,1.0,1.0,1.0,1.0,binary,False,12,12
2,Invoices & receipts,SUPPLIER_NAME,1.0,1.0,1.0,1.0,1.0,1.0,fuzzy,False,12,12
3,Invoices & receipts,BUSINESS_ADDRESS,1.0,0.833333,0.833333,0.833333,0.833333,0.833333,fuzzy,False,12,12
4,Invoices & receipts,PAYER_NAME,1.0,1.0,1.0,1.0,1.0,1.0,fuzzy,False,12,12
5,Invoices & receipts,PAYER_ADDRESS,1.0,1.0,1.0,1.0,1.0,1.0,fuzzy,False,12,12
6,Invoices & receipts,INVOICE_DATE,1.0,0.833333,0.833333,0.833333,0.833333,0.833333,fuzzy,False,12,12
7,Invoices & receipts,LINE_ITEM_DESCRIPTIONS,1.0,1.0,1.0,1.0,1.0,1.0,agnostic,False,12,12
8,Invoices & receipts,LINE_ITEM_QUANTITIES,1.0,1.0,1.0,1.0,1.0,1.0,aware,False,12,12
9,Invoices & receipts,LINE_ITEM_PRICES,1.0,1.0,1.0,1.0,1.0,1.0,aware,False,12,12


---
## 5. Compare Multiple Models (Optional)

Quick comparison of all available models from per_field_metrics.csv

In [10]:
if per_field_df is not None:
    # Pivot to compare models
    comparison = per_field_df.pivot(index='field', columns='model', values='accuracy')
    comparison['best_model'] = comparison.idxmax(axis=1)
    
    print("\nField-Level Accuracy Comparison (from per_field_metrics.csv)")
    print("=" * 80)
    
    # Format as percentages
    display_df = comparison.copy()
    for col in display_df.columns:
        if col != 'best_model':
            display_df[col] = display_df[col].apply(lambda x: f"{x:.1%}" if pd.notna(x) else "N/A")
    
    display(display_df)
    
    # Summary
    print("\nModel Summary:")
    for model in per_field_df['model'].unique():
        model_data = per_field_df[per_field_df['model'] == model]
        wins = (comparison['best_model'] == model).sum()
        print(f"  {model}: mean={model_data['accuracy'].mean():.1%}, best on {wins} fields")

---
## 6. Export All Models for Comparison

Export accuracy files for all available models.

In [11]:
def export_all_models(per_field_df: pd.DataFrame, output_dir: Path):
    """Export accuracy CSV for each model in per_field_metrics.csv"""
    if per_field_df is None:
        print("No per_field_metrics.csv available")
        return
    
    exported = []
    for model in per_field_df['model'].unique():
        # Create safe filename
        safe_name = model.lower().replace('-', '_').replace(' ', '_')
        filename = f"{safe_name}_accuracy.csv"
        output_path = output_dir / filename
        
        result_df = extract_model_accuracy_from_per_field(per_field_df, model)
        if result_df is not None:
            result_df.to_csv(output_path, index=False)
            exported.append((model, output_path, result_df['Accuracy'].mean()))
            print(f"Exported: {filename} (mean accuracy: {result_df['Accuracy'].mean():.1%})")
    
    return exported

print("Exporting all models...")
print("=" * 60)
all_exports = export_all_models(per_field_df, OUTPUT_DIR)

Exporting all models...
No per_field_metrics.csv available


---
## Summary

Files created:
- `current_model_accuracy.csv` - Selected model for comparison notebook
- `{model}_accuracy.csv` - Individual files for each model

To use in comparison notebook:
```python
CURRENT_MODEL_CSV = Path("../output/current_model_accuracy.csv")
# Or use specific model file:
CURRENT_MODEL_CSV = Path("../output/internvl3_8b_accuracy.csv")
```