# Harvest Model Accuracy from Evaluation Results

This notebook scans `output/csv/` for evaluation results and generates the `current_model_accuracy.csv` file for use in model comparison.

## Data Sources
1. **per_field_metrics.csv** - Pre-computed field-level metrics by model
2. **Model batch results** - Per-image results that can be aggregated

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import re

# Paths
CSV_DIR = Path("../output/csv")
OUTPUT_DIR = Path("../output")

print(f"Scanning: {CSV_DIR.resolve()}")

Scanning: /Users/tod/Desktop/LMM_POC/output/csv


---
## 1. Discover Available Result Files

In [2]:
def discover_result_files(csv_dir: Path) -> dict:
    """Discover all available result files in the CSV directory."""
    files = {
        'per_field_metrics': None,
        'llama_batch': [],
        'internvl3_batch': [],  # Generic internvl3 (treated as 8B)
        'internvl3_2b_batch': [],
        'internvl3_8b_batch': [],
        'batch_summary': [],
    }
    
    for f in csv_dir.glob("*.csv"):
        name = f.name
        
        if name == 'per_field_metrics.csv':
            files['per_field_metrics'] = f
        elif name.startswith('llama_batch_results_'):
            files['llama_batch'].append(f)
        elif name.startswith('internvl3_2b_batch_results_'):
            files['internvl3_2b_batch'].append(f)
        elif name.startswith('internvl3_5_8b_batch_results_') or name.startswith('internvl3_8b_batch_results_'):
            files['internvl3_8b_batch'].append(f)
        elif name.startswith('internvl3_batch_results_'):
            # Treat generic internvl3_batch_results as 8B (most common case)
            files['internvl3_batch'].append(f)
        elif '_summary.csv' in name:
            files['batch_summary'].append(f)
    
    # Sort by timestamp (most recent first)
    for key in ['llama_batch', 'internvl3_batch', 'internvl3_2b_batch', 'internvl3_8b_batch']:
        files[key] = sorted(files[key], key=lambda x: x.name, reverse=True)
    
    return files

result_files = discover_result_files(CSV_DIR)

print("Available Result Files")
print("=" * 60)
print(f"\nper_field_metrics.csv: {'Found' if result_files['per_field_metrics'] else 'Not found'}")
print(f"\nLlama batch results: {len(result_files['llama_batch'])} files")
for f in result_files['llama_batch'][:3]:
    print(f"  - {f.name}")

print(f"\nInternVL3-8B batch results: {len(result_files['internvl3_8b_batch'])} files")
for f in result_files['internvl3_8b_batch'][:3]:
    print(f"  - {f.name}")

print(f"\nInternVL3 (generic, treated as 8B): {len(result_files['internvl3_batch'])} files")
for f in result_files['internvl3_batch'][:3]:
    print(f"  - {f.name}")

print(f"\nInternVL3-2B batch results: {len(result_files['internvl3_2b_batch'])} files")
for f in result_files['internvl3_2b_batch'][:3]:
    print(f"  - {f.name}")

Available Result Files

per_field_metrics.csv: Not found

Llama batch results: 2 files
  - llama_batch_results_20251210_003155.csv
  - llama_batch_results_20251210_001755.csv

InternVL3-8B batch results: 0 files

InternVL3 (generic, treated as 8B): 1 files
  - internvl3_batch_results_20251210_005902.csv

InternVL3-2B batch results: 1 files
  - internvl3_2b_batch_results_20251210_013149.csv


---
## 2. Option A: Use per_field_metrics.csv (Recommended)

This file already contains pre-computed field-level accuracy for each model.

In [3]:
def load_per_field_metrics(csv_path: Path) -> pd.DataFrame:
    """Load and display per-field metrics."""
    if not csv_path or not csv_path.exists():
        print("per_field_metrics.csv not found")
        return None
    
    df = pd.read_csv(csv_path)
    print(f"Loaded: {csv_path}")
    print(f"Shape: {df.shape}")
    print(f"Models: {df['model'].unique().tolist()}")
    print(f"Fields: {df['field'].nunique()}")
    
    return df

per_field_df = load_per_field_metrics(result_files['per_field_metrics'])
if per_field_df is not None:
    display(per_field_df.head(10))

per_field_metrics.csv not found


In [4]:
# Fields that are EXCLUSIVE to our model (competing model doesn't have these)
EXCLUSIVE_FIELDS = ["DOCUMENT_TYPE"]

# List fields where F1 scores are meaningful (for comparison)
# These are fields with multiple items where precision/recall matter
LIST_FIELDS = [
    "LINE_ITEM_DESCRIPTIONS", "LINE_ITEM_QUANTITIES", "LINE_ITEM_PRICES",
    "LINE_ITEM_TOTAL_PRICES", "TRANSACTION_DATES", "TRANSACTION_AMOUNTS_PAID",
]

def extract_model_accuracy_from_per_field(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """
    Extract field-level accuracy AND F1 scores for a specific model.
    
    The F1 scores in per_field_metrics.csv are POSITION-AGNOSTIC (set-based).
    This allows fair comparison with competing models using the same methodology.
    """
    model_df = df[df['model'] == model_name].copy()
    
    if len(model_df) == 0:
        print(f"Model '{model_name}' not found. Available: {df['model'].unique().tolist()}")
        return None
    
    # Determine subset based on field
    bank_fields = ['STATEMENT_DATE_RANGE', 'TRANSACTION_DATES', 'TRANSACTION_AMOUNTS_PAID']
    
    result = []
    for _, row in model_df.iterrows():
        subset = 'Bank statements' if row['field'] in bank_fields else 'Invoices & receipts'
        
        # Extract F1 score if available (position-agnostic from sklearn)
        f1_agnostic = row.get('f1_score', np.nan) if 'f1_score' in row.index else np.nan
        
        result.append({
            'Subset': subset,
            'Field': row['field'],
            'Accuracy': row['accuracy'],
            'F1_Agnostic': f1_agnostic,  # Position-agnostic F1 for fair comparison
            'Exclusive': row['field'] in EXCLUSIVE_FIELDS,
        })
    
    result_df = pd.DataFrame(result)
    
    # Summary
    exclusive_count = result_df['Exclusive'].sum()
    comparable_count = len(result_df) - exclusive_count
    f1_available = result_df['F1_Agnostic'].notna().sum()
    
    print(f"  Comparable fields: {comparable_count}")
    print(f"  Exclusive fields (not in competing model): {exclusive_count}")
    print(f"  Fields with F1 scores: {f1_available}")
    
    return result_df

# Show available models
if per_field_df is not None:
    print("\nAvailable models in per_field_metrics.csv:")
    for model in per_field_df['model'].unique():
        model_data = per_field_df[per_field_df['model'] == model]
        count = len(model_data)
        mean_acc = model_data['accuracy'].mean()
        mean_f1 = model_data['f1_score'].mean() if 'f1_score' in model_data.columns else np.nan
        f1_str = f", mean F1: {mean_f1:.1%}" if not np.isnan(mean_f1) else ""
        print(f"  - {model}: {count} fields, mean accuracy: {mean_acc:.1%}{f1_str}")

---
## 3. Option B: Aggregate from Batch Results

If per_field_metrics.csv is not available or you want fresh results from a specific batch.

In [5]:
# Schema fields by document type
INVOICE_RECEIPT_FIELDS = [
    "DOCUMENT_TYPE", "BUSINESS_ABN", "SUPPLIER_NAME", "BUSINESS_ADDRESS",
    "PAYER_NAME", "PAYER_ADDRESS", "INVOICE_DATE", "LINE_ITEM_DESCRIPTIONS",
    "LINE_ITEM_QUANTITIES", "LINE_ITEM_PRICES", "LINE_ITEM_TOTAL_PRICES",
    "IS_GST_INCLUDED", "GST_AMOUNT", "TOTAL_AMOUNT",
]

BANK_STATEMENT_FIELDS = [
    "STATEMENT_DATE_RANGE", "LINE_ITEM_DESCRIPTIONS",  # LINE_ITEM_DESCRIPTIONS = transaction descriptions
    "TRANSACTION_DATES", "TRANSACTION_AMOUNTS_PAID",
]

# List fields where F1 is meaningful (multiple items)
LIST_FIELDS = [
    "LINE_ITEM_DESCRIPTIONS", "LINE_ITEM_QUANTITIES", "LINE_ITEM_PRICES",
    "LINE_ITEM_TOTAL_PRICES", "TRANSACTION_DATES", "TRANSACTION_AMOUNTS_PAID",
]

# Fields that are EXCLUSIVE to our model (competing model doesn't have these)
EXCLUSIVE_FIELDS = ["DOCUMENT_TYPE"]

# Document type mapping (normalize variations)
DOC_TYPE_MAP = {
    'receipt': 'invoice_receipt',
    'invoice': 'invoice_receipt', 
    'bank_statement': 'bank_statement',
    'RECEIPT': 'invoice_receipt',
    'INVOICE': 'invoice_receipt',
    'BANK_STATEMENT': 'bank_statement',
}

# Ground truth paths
GROUND_TRUTH_PATHS = {
    'inv_rec': Path("../evaluation_data/inv_rec/ground_truth_inv_rec.csv"),
    'bank': Path("../evaluation_data/bank/ground_truth_bank.csv"),
    'synthetic': Path("../evaluation_data/synthetic/ground_truth_synthetic.csv"),
}

def load_ground_truth() -> pd.DataFrame:
    """Load and combine all ground truth files."""
    dfs = []
    for name, path in GROUND_TRUTH_PATHS.items():
        if path.exists():
            df = pd.read_csv(path)
            df['source'] = name
            dfs.append(df)
            print(f"  Loaded {name}: {len(df)} rows")
    
    if dfs:
        combined = pd.concat(dfs, ignore_index=True)
        # Normalize image_file to stem for matching
        combined['image_stem'] = combined['image_file'].apply(lambda x: Path(str(x)).stem)
        print(f"  Total ground truth: {len(combined)} rows")
        return combined
    return None

print("Loading ground truth for F1 computation...")
ground_truth_df = load_ground_truth()

Loading ground truth for F1 computation...
  Loaded inv_rec: 6 rows
  Loaded bank: 15 rows
  Loaded synthetic: 9 rows
  Total ground truth: 30 rows


In [6]:
import re

def load_batch_results(csv_path: Path) -> pd.DataFrame:
    """Load a batch results file."""
    df = pd.read_csv(csv_path)
    print(f"Loaded: {csv_path.name}")
    print(f"  Images: {len(df)}")
    print(f"  Document types: {df['document_type'].value_counts().to_dict()}")
    if 'overall_accuracy' in df.columns:
        print(f"  Mean overall accuracy: {df['overall_accuracy'].mean():.1%}")
    
    # Add image_stem for ground truth matching
    if 'image_file' in df.columns:
        df['image_stem'] = df['image_file'].apply(lambda x: Path(str(x)).stem)
    elif 'image_name' in df.columns:
        df['image_stem'] = df['image_name'].apply(lambda x: Path(str(x)).stem)
    
    return df


def normalize_value(value: str, field_name: str = "") -> str:
    """Normalize a value for comparison."""
    if pd.isna(value):
        return ""
    s = str(value).strip().lower()
    if s == 'not_found':
        return 'NOT_FOUND'
    
    # For monetary fields, normalize to just digits and decimal
    if any(x in field_name.upper() for x in ['AMOUNT', 'PRICE', 'TOTAL', 'GST']):
        s = re.sub(r'[^\d.]', '', s)
    
    # For ABN/numeric IDs, keep only digits
    if any(x in field_name.upper() for x in ['ABN', 'BSB', 'NUMBER']):
        s = re.sub(r'\D', '', s)
    
    # General normalization: remove extra spaces
    s = re.sub(r'\s+', ' ', s)
    s = s.strip()
    
    return s


def compute_token_similarity(s1: str, s2: str) -> float:
    """Compute token-level F1 similarity between two strings."""
    t1 = set(s1.lower().split())
    t2 = set(s2.lower().split())
    
    if not t1 or not t2:
        return 0.0
    
    overlap = len(t1 & t2)
    precision = overlap / len(t1)
    recall = overlap / len(t2)
    
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)


# =============================================================================
# Field Type Categories for Smart F1 Selection
# =============================================================================

# Fields requiring strict binary matching (exact full value)
BINARY_FIELDS = [
    "BUSINESS_ABN", "GST_AMOUNT", "TOTAL_AMOUNT", 
    "IS_GST_INCLUDED", "DOCUMENT_TYPE",
]

# Fields requiring position-aware matching (ordered lists)
POSITION_AWARE_FIELDS = [
    "LINE_ITEM_QUANTITIES", "LINE_ITEM_PRICES", "LINE_ITEM_TOTAL_PRICES",
    "TRANSACTION_AMOUNTS_PAID",
]

# Fields using position-agnostic matching (unordered sets)
POSITION_AGNOSTIC_FIELDS = [
    "LINE_ITEM_DESCRIPTIONS", "TRANSACTION_DATES",
]

# Fields using fuzzy matching (names, addresses, free text)
FUZZY_FIELDS = [
    "SUPPLIER_NAME", "BUSINESS_ADDRESS", "PAYER_NAME", "PAYER_ADDRESS",
    "INVOICE_DATE", "STATEMENT_DATE_RANGE",
]


def get_smart_f1_method(field_name: str) -> str:
    """Determine the most appropriate F1 method for a field."""
    if field_name in BINARY_FIELDS:
        return "binary"
    elif field_name in POSITION_AWARE_FIELDS:
        return "aware"
    elif field_name in POSITION_AGNOSTIC_FIELDS:
        return "agnostic"
    elif field_name in FUZZY_FIELDS:
        return "fuzzy"
    else:
        # Default to position-agnostic for unknown fields
        return "agnostic"


def compute_f1_binary(extracted: str, ground_truth: str, field_name: str = "") -> float:
    """
    Binary F1 - entire field value must match exactly.
    
    - Strictest possible matching
    - No item-level comparison for lists
    - Entire string must match after normalization
    - Returns 1.0 or 0.0 only
    """
    if pd.isna(extracted) or pd.isna(ground_truth):
        return np.nan
    
    ext_str = str(extracted).strip()
    gt_str = str(ground_truth).strip()
    
    # Handle NOT_FOUND
    if ext_str.upper() == 'NOT_FOUND' and gt_str.upper() == 'NOT_FOUND':
        return 1.0
    if ext_str.upper() == 'NOT_FOUND' or gt_str.upper() == 'NOT_FOUND':
        return 0.0
    
    # Normalize entire string and compare
    ext_norm = normalize_value(ext_str, field_name)
    gt_norm = normalize_value(gt_str, field_name)
    
    return 1.0 if ext_norm == gt_norm else 0.0


def compute_f1_position_agnostic(extracted: str, ground_truth: str, field_name: str = "") -> float:
    """
    Position-Agnostic F1 with EXACT item matching.
    
    - Order doesn't matter (set-based)
    - Items must match exactly after normalization
    - No partial credit
    """
    if pd.isna(extracted) or pd.isna(ground_truth):
        return np.nan
    
    ext_str = str(extracted).strip()
    gt_str = str(ground_truth).strip()
    
    # Handle NOT_FOUND
    if ext_str.upper() == 'NOT_FOUND' and gt_str.upper() == 'NOT_FOUND':
        return 1.0
    if ext_str.upper() == 'NOT_FOUND' or gt_str.upper() == 'NOT_FOUND':
        return 0.0
    
    # Check if list field
    is_list = '|' in ext_str or '|' in gt_str or field_name in LIST_FIELDS
    
    if is_list:
        ext_items = {normalize_value(item.strip(), field_name)
                     for item in ext_str.split('|') if item.strip()}
        gt_items = {normalize_value(item.strip(), field_name)
                    for item in gt_str.split('|') if item.strip()}
        
        if not ext_items and not gt_items:
            return 1.0
        if not ext_items or not gt_items:
            return 0.0
        
        # Set intersection = exact matches
        tp = len(ext_items & gt_items)
        precision = tp / len(ext_items)
        recall = tp / len(gt_items)
        
        if precision + recall == 0:
            return 0.0
        return 2 * precision * recall / (precision + recall)
    else:
        # Scalar: exact match
        return 1.0 if normalize_value(ext_str, field_name) == normalize_value(gt_str, field_name) else 0.0


def compute_f1_fuzzy_agnostic(extracted: str, ground_truth: str, field_name: str = "") -> float:
    """
    Fuzzy Position-Agnostic F1 with token-level matching.
    
    - Order doesn't matter (set-based)
    - Items can partially match using token overlap
    - Partial credit for similar items (threshold 0.5)
    """
    if pd.isna(extracted) or pd.isna(ground_truth):
        return np.nan
    
    ext_str = str(extracted).strip()
    gt_str = str(ground_truth).strip()
    
    # Handle NOT_FOUND
    if ext_str.upper() == 'NOT_FOUND' and gt_str.upper() == 'NOT_FOUND':
        return 1.0
    if ext_str.upper() == 'NOT_FOUND' or gt_str.upper() == 'NOT_FOUND':
        return 0.0
    
    # Check if list field
    is_list = '|' in ext_str or '|' in gt_str or field_name in LIST_FIELDS
    
    if is_list:
        ext_items = [normalize_value(item.strip(), field_name)
                     for item in ext_str.split('|') if item.strip()]
        gt_items = [normalize_value(item.strip(), field_name)
                    for item in gt_str.split('|') if item.strip()]
        
        if not ext_items and not gt_items:
            return 1.0
        if not ext_items or not gt_items:
            return 0.0
        
        # Greedy matching: for each extracted, find best GT match
        matched_gt = set()
        tp = 0.0
        
        for ext_item in ext_items:
            best_score = 0.0
            best_idx = -1
            for i, gt_item in enumerate(gt_items):
                if i in matched_gt:
                    continue
                # Exact match = 1.0, else token similarity
                if ext_item == gt_item:
                    score = 1.0
                else:
                    score = compute_token_similarity(ext_item, gt_item)
                if score > best_score:
                    best_score = score
                    best_idx = i
            
            if best_score >= 0.5:  # Threshold for match
                tp += best_score  # Partial credit
                if best_idx >= 0:
                    matched_gt.add(best_idx)
        
        precision = tp / len(ext_items)
        recall = tp / len(gt_items)
        
        if precision + recall == 0:
            return 0.0
        return 2 * precision * recall / (precision + recall)
    else:
        # Scalar: token similarity
        ext_norm = normalize_value(ext_str, field_name)
        gt_norm = normalize_value(gt_str, field_name)
        if ext_norm == gt_norm:
            return 1.0
        return compute_token_similarity(ext_norm, gt_norm)


def compute_f1_position_aware(extracted: str, ground_truth: str, field_name: str = "") -> float:
    """
    Position-Aware F1 with exact positional matching.
    
    - Items must match at the SAME position
    - Strict positional matching
    - No partial credit
    """
    if pd.isna(extracted) or pd.isna(ground_truth):
        return np.nan
    
    ext_str = str(extracted).strip()
    gt_str = str(ground_truth).strip()
    
    # Handle NOT_FOUND
    if ext_str.upper() == 'NOT_FOUND' and gt_str.upper() == 'NOT_FOUND':
        return 1.0
    if ext_str.upper() == 'NOT_FOUND' or gt_str.upper() == 'NOT_FOUND':
        return 0.0
    
    # Check if list field
    is_list = '|' in ext_str or '|' in gt_str or field_name in LIST_FIELDS
    
    if is_list:
        ext_items = [normalize_value(item.strip(), field_name)
                     for item in ext_str.split('|') if item.strip()]
        gt_items = [normalize_value(item.strip(), field_name)
                    for item in gt_str.split('|') if item.strip()]
        
        if not ext_items and not gt_items:
            return 1.0
        if not ext_items or not gt_items:
            return 0.0
        
        # Position-aware: compare at same index
        tp = 0
        for i in range(min(len(ext_items), len(gt_items))):
            if ext_items[i] == gt_items[i]:
                tp += 1
        
        precision = tp / len(ext_items)
        recall = tp / len(gt_items)
        
        if precision + recall == 0:
            return 0.0
        return 2 * precision * recall / (precision + recall)
    else:
        # Scalar: exact match
        return 1.0 if normalize_value(ext_str, field_name) == normalize_value(gt_str, field_name) else 0.0


def compute_f1_smart(extracted: str, ground_truth: str, field_name: str) -> tuple[float, str]:
    """
    Smart F1 - automatically selects the most appropriate F1 method based on field type.
    
    Returns: (f1_score, method_used)
    
    Field type mappings:
    - Binary: ABN, amounts, GST, document type (strict validation)
    - Position-Aware: quantities, prices, amounts (ordered lists)
    - Position-Agnostic: descriptions, dates (unordered sets)
    - Fuzzy: names, addresses (free text)
    """
    method = get_smart_f1_method(field_name)
    
    if method == "binary":
        return compute_f1_binary(extracted, ground_truth, field_name), "binary"
    elif method == "aware":
        return compute_f1_position_aware(extracted, ground_truth, field_name), "aware"
    elif method == "agnostic":
        return compute_f1_position_agnostic(extracted, ground_truth, field_name), "agnostic"
    else:  # fuzzy
        return compute_f1_fuzzy_agnostic(extracted, ground_truth, field_name), "fuzzy"


def compute_field_accuracy_from_batch(df: pd.DataFrame, ground_truth: pd.DataFrame = None) -> pd.DataFrame:
    """
    Compute field-level accuracy AND five F1 variants from batch results.
    
    F1 Variants (from strictest to most lenient):
    - F1_Binary: Entire field must match exactly (strictest)
    - F1_Aware: Position-aware with exact item matching at same position
    - F1_Agnostic: Position-agnostic with exact item matching (set-based)
    - F1_Fuzzy: Position-agnostic with fuzzy/token matching (most lenient)
    - F1_Smart: Automatically selects best method per field type (RECOMMENDED)
    """
    results = []
    
    # Normalize document types
    df = df.copy()
    df['doc_type_normalized'] = df['document_type'].map(DOC_TYPE_MAP).fillna('unknown')
    
    # Split by document type
    inv_rec_df = df[df['doc_type_normalized'] == 'invoice_receipt']
    bank_df = df[df['doc_type_normalized'] == 'bank_statement']
    
    print(f"  Invoice/Receipt images: {len(inv_rec_df)}")
    print(f"  Bank statement images: {len(bank_df)}")
    
    # Merge with ground truth if available
    if ground_truth is not None and 'image_stem' in df.columns:
        df = df.merge(ground_truth, on='image_stem', how='left', suffixes=('', '_gt'))
        has_gt = True
        matched = df['image_file_gt'].notna().sum()
        print(f"  Matched with ground truth: {matched}/{len(df)}")
    else:
        has_gt = False
        print("  No ground truth - using extraction rate as proxy")
    
    # Re-split after merge
    inv_rec_df = df[df['doc_type_normalized'] == 'invoice_receipt']
    bank_df = df[df['doc_type_normalized'] == 'bank_statement']
    
    def compute_all_f1(subset_df, field):
        """Compute all five F1 variants for a field."""
        f1_binary, f1_agnostic, f1_fuzzy, f1_aware, f1_smart = np.nan, np.nan, np.nan, np.nan, np.nan
        smart_method = get_smart_f1_method(field)
        
        if has_gt and f'{field}_gt' in subset_df.columns:
            scores_binary, scores_agnostic, scores_fuzzy, scores_aware, scores_smart = [], [], [], [], []
            
            for _, row in subset_df.iterrows():
                if pd.notna(row.get(f'{field}_gt')):
                    ext = row[field]
                    gt = row[f'{field}_gt']
                    
                    s0 = compute_f1_binary(ext, gt, field)
                    s1 = compute_f1_position_agnostic(ext, gt, field)
                    s2 = compute_f1_fuzzy_agnostic(ext, gt, field)
                    s3 = compute_f1_position_aware(ext, gt, field)
                    s4, _ = compute_f1_smart(ext, gt, field)
                    
                    if not np.isnan(s0): scores_binary.append(s0)
                    if not np.isnan(s1): scores_agnostic.append(s1)
                    if not np.isnan(s2): scores_fuzzy.append(s2)
                    if not np.isnan(s3): scores_aware.append(s3)
                    if not np.isnan(s4): scores_smart.append(s4)
            
            if scores_binary: f1_binary = np.mean(scores_binary)
            if scores_agnostic: f1_agnostic = np.mean(scores_agnostic)
            if scores_fuzzy: f1_fuzzy = np.mean(scores_fuzzy)
            if scores_aware: f1_aware = np.mean(scores_aware)
            if scores_smart: f1_smart = np.mean(scores_smart)
        
        return f1_binary, f1_agnostic, f1_fuzzy, f1_aware, f1_smart, smart_method
    
    # Process Invoice & Receipt fields
    for field in INVOICE_RECEIPT_FIELDS:
        if field not in df.columns:
            continue
        if len(inv_rec_df) == 0:
            continue
        
        valid = inv_rec_df[field].notna() & (inv_rec_df[field] != 'NOT_FOUND') & (inv_rec_df[field] != '')
        coverage = valid.mean() if len(inv_rec_df) > 0 else np.nan
        
        f1_binary, f1_agnostic, f1_fuzzy, f1_aware, f1_smart, smart_method = compute_all_f1(inv_rec_df, field)
        
        results.append({
            'Subset': 'Invoices & receipts',
            'Field': field,
            'Accuracy': coverage,
            'F1_Binary': f1_binary,
            'F1_Aware': f1_aware,
            'F1_Agnostic': f1_agnostic,
            'F1_Fuzzy': f1_fuzzy,
            'F1_Smart': f1_smart,
            'Smart_Method': smart_method,
            'Exclusive': field in EXCLUSIVE_FIELDS,
            'extracted_count': int(valid.sum()),
            'total_count': len(inv_rec_df),
        })
    
    # Process Bank Statement fields
    for field in BANK_STATEMENT_FIELDS:
        if field not in df.columns:
            continue
        if len(bank_df) == 0:
            continue
        
        valid = bank_df[field].notna() & (bank_df[field] != 'NOT_FOUND') & (bank_df[field] != '')
        coverage = valid.mean() if len(bank_df) > 0 else np.nan
        
        f1_binary, f1_agnostic, f1_fuzzy, f1_aware, f1_smart, smart_method = compute_all_f1(bank_df, field)
        
        results.append({
            'Subset': 'Bank statements',
            'Field': field,
            'Accuracy': coverage,
            'F1_Binary': f1_binary,
            'F1_Aware': f1_aware,
            'F1_Agnostic': f1_agnostic,
            'F1_Fuzzy': f1_fuzzy,
            'F1_Smart': f1_smart,
            'Smart_Method': smart_method,
            'Exclusive': field in EXCLUSIVE_FIELDS,
            'extracted_count': int(valid.sum()),
            'total_count': len(bank_df),
        })
    
    result_df = pd.DataFrame(results)
    
    # Summary
    exclusive_count = result_df['Exclusive'].sum()
    comparable_count = len(result_df) - exclusive_count
    
    print(f"\n  F1 Scores Summary:")
    print(f"    F1_Smart (field-appropriate): {result_df['F1_Smart'].mean():.1%}  ← RECOMMENDED")
    print(f"    ----------------------------------------")
    print(f"    Binary (exact full match):    {result_df['F1_Binary'].mean():.1%}")
    print(f"    Position-Aware (positional):  {result_df['F1_Aware'].mean():.1%}")
    print(f"    Position-Agnostic (set):      {result_df['F1_Agnostic'].mean():.1%}")
    print(f"    Fuzzy Agnostic (lenient):     {result_df['F1_Fuzzy'].mean():.1%}")
    print(f"\n  Smart F1 Method Selection:")
    for method in ['binary', 'aware', 'agnostic', 'fuzzy']:
        fields = result_df[result_df['Smart_Method'] == method]['Field'].tolist()
        if fields:
            print(f"    {method}: {', '.join(fields)}")
    print(f"\n  Comparable fields: {comparable_count}")
    print(f"  Exclusive fields: {exclusive_count}")
    
    return result_df

In [7]:
# Example: Load most recent batch results for each model
print("Most recent batch results by model:")
print("=" * 60)

batch_dfs = {}

if result_files['llama_batch']:
    latest = result_files['llama_batch'][0]
    batch_dfs['Llama-11B'] = load_batch_results(latest)
    print()

# InternVL3-8B: Check both explicit 8B files and generic internvl3 files
if result_files['internvl3_8b_batch']:
    latest = result_files['internvl3_8b_batch'][0]
    batch_dfs['InternVL3-8B'] = load_batch_results(latest)
    print()
elif result_files['internvl3_batch']:
    # Use generic internvl3_batch_results as 8B fallback
    latest = result_files['internvl3_batch'][0]
    print("Using generic internvl3_batch_results as InternVL3-8B:")
    batch_dfs['InternVL3-8B'] = load_batch_results(latest)
    print()

if result_files['internvl3_2b_batch']:
    latest = result_files['internvl3_2b_batch'][0]
    batch_dfs['InternVL3-2B'] = load_batch_results(latest)
    print()

print(f"\nModels loaded: {list(batch_dfs.keys())}")

Most recent batch results by model:
Loaded: llama_batch_results_20251210_003155.csv
  Images: 9
  Document types: {'bank_statement': 3, 'receipt': 3, 'invoice': 3}
  Mean overall accuracy: 9764.3%

Using generic internvl3_batch_results as InternVL3-8B:
Loaded: internvl3_batch_results_20251210_005902.csv
  Images: 9
  Document types: {'bank_statement': 3, 'receipt': 3, 'invoice': 3}
  Mean overall accuracy: 8485.7%

Loaded: internvl3_2b_batch_results_20251210_013149.csv
  Images: 9
  Document types: {'bank_statement': 3, 'receipt': 3, 'invoice': 3}
  Mean overall accuracy: 7136.5%


Models loaded: ['Llama-11B', 'InternVL3-8B', 'InternVL3-2B']


---
## 4. Select Model and Export

Choose which model's accuracy to export as `current_model_accuracy.csv`

In [8]:
# =============================================================================
# CONFIGURATION - Select model and data source
# =============================================================================

# Choose model to export
SELECTED_MODEL = "Llama-11B"  # Options: "Llama-11B", "InternVL3-8B", "InternVL3-2B"

# Choose data source
USE_PER_FIELD_METRICS = False  # True = use per_field_metrics.csv (recommended)
                               # False = aggregate from batch results (coverage only)

# Output filename
OUTPUT_FILENAME = "current_model_accuracy.csv"

In [9]:
def export_model_accuracy(model_name: str, use_per_field: bool, 
                          per_field_df: pd.DataFrame, batch_dfs: dict,
                          ground_truth_df: pd.DataFrame,
                          output_path: Path) -> pd.DataFrame:
    """Export field-level accuracy AND all F1 variants for selected model."""
    
    if use_per_field and per_field_df is not None:
        print(f"Using per_field_metrics.csv for {model_name}")
        result_df = extract_model_accuracy_from_per_field(per_field_df, model_name)
        if result_df is None:
            return None
    elif model_name in batch_dfs:
        print(f"Aggregating from batch results for {model_name}")
        if ground_truth_df is not None:
            print("Computing F1 against ground truth...")
        result_df = compute_field_accuracy_from_batch(batch_dfs[model_name], ground_truth_df)
    else:
        print(f"No data available for {model_name}")
        return None
    
    # Determine which columns to save
    save_cols = ['Subset', 'Field', 'Accuracy']
    
    # Add F1 columns if they have data (F1_Smart first as recommended)
    for f1_col in ['F1_Smart', 'F1_Binary', 'F1_Aware', 'F1_Agnostic', 'F1_Fuzzy']:
        if f1_col in result_df.columns and result_df[f1_col].notna().any():
            save_cols.append(f1_col)
    
    # Add Smart_Method to show which method was used
    if 'Smart_Method' in result_df.columns:
        save_cols.append('Smart_Method')
    
    if 'Exclusive' in result_df.columns:
        save_cols.append('Exclusive')
    
    result_df[save_cols].to_csv(output_path, index=False)
    print(f"\nSaved: {output_path}")
    print(f"Fields: {len(result_df)}")
    print(f"Mean accuracy: {result_df['Accuracy'].mean():.1%}")
    
    # Print F1 summaries
    print(f"\nF1 Scores:")
    if 'F1_Smart' in save_cols:
        print(f"  F1_Smart (field-appropriate): {result_df['F1_Smart'].mean():.1%}  ← RECOMMENDED")
        print(f"  ----------------------------------------")
    if 'F1_Binary' in save_cols:
        print(f"  Binary (exact full match):    {result_df['F1_Binary'].mean():.1%}")
    if 'F1_Aware' in save_cols:
        print(f"  Position-Aware (positional):  {result_df['F1_Aware'].mean():.1%}")
    if 'F1_Agnostic' in save_cols:
        print(f"  Position-Agnostic (set):      {result_df['F1_Agnostic'].mean():.1%}")
    if 'F1_Fuzzy' in save_cols:
        print(f"  Fuzzy Agnostic (lenient):     {result_df['F1_Fuzzy'].mean():.1%}")
    
    return result_df

# Export
output_path = OUTPUT_DIR / OUTPUT_FILENAME
exported_df = export_model_accuracy(
    SELECTED_MODEL, 
    USE_PER_FIELD_METRICS, 
    per_field_df, 
    batch_dfs,
    ground_truth_df,  # Pass ground truth for F1 computation
    output_path
)

if exported_df is not None:
    print("\n" + "=" * 60)
    print("Exported Data:")
    print("=" * 60)
    display(exported_df)

Aggregating from batch results for Llama-11B
Computing F1 against ground truth...
  Invoice/Receipt images: 6
  Bank statement images: 3
  Matched with ground truth: 18/18

  F1 Scores Summary:
    F1_Smart (field-appropriate): 96.0%  ← RECOMMENDED
    ----------------------------------------
    Binary (exact full match):    90.7%
    Position-Aware (positional):  96.1%
    Position-Agnostic (set):      96.0%
    Fuzzy Agnostic (lenient):     97.6%

  Smart F1 Method Selection:
    binary: DOCUMENT_TYPE, BUSINESS_ABN, IS_GST_INCLUDED, GST_AMOUNT, TOTAL_AMOUNT
    aware: LINE_ITEM_QUANTITIES, LINE_ITEM_PRICES, LINE_ITEM_TOTAL_PRICES, TRANSACTION_AMOUNTS_PAID
    agnostic: LINE_ITEM_DESCRIPTIONS, LINE_ITEM_DESCRIPTIONS, TRANSACTION_DATES
    fuzzy: SUPPLIER_NAME, BUSINESS_ADDRESS, PAYER_NAME, PAYER_ADDRESS, INVOICE_DATE, STATEMENT_DATE_RANGE

  Comparable fields: 17
  Exclusive fields: 1

Saved: ../output/current_model_accuracy.csv
Fields: 18
Mean accuracy: 100.0%

F1 Scores:
  F1_Smart

Unnamed: 0,Subset,Field,Accuracy,F1_Binary,F1_Aware,F1_Agnostic,F1_Fuzzy,F1_Smart,Smart_Method,Exclusive,extracted_count,total_count
0,Invoices & receipts,DOCUMENT_TYPE,1.0,1.0,1.0,1.0,1.0,1.0,binary,True,12,12
1,Invoices & receipts,BUSINESS_ABN,1.0,1.0,1.0,1.0,1.0,1.0,binary,False,12,12
2,Invoices & receipts,SUPPLIER_NAME,1.0,1.0,1.0,1.0,1.0,1.0,fuzzy,False,12,12
3,Invoices & receipts,BUSINESS_ADDRESS,1.0,0.833333,0.833333,0.833333,0.833333,0.833333,fuzzy,False,12,12
4,Invoices & receipts,PAYER_NAME,1.0,1.0,1.0,1.0,1.0,1.0,fuzzy,False,12,12
5,Invoices & receipts,PAYER_ADDRESS,1.0,1.0,1.0,1.0,1.0,1.0,fuzzy,False,12,12
6,Invoices & receipts,INVOICE_DATE,1.0,0.833333,0.833333,0.833333,0.833333,0.833333,fuzzy,False,12,12
7,Invoices & receipts,LINE_ITEM_DESCRIPTIONS,1.0,1.0,1.0,1.0,1.0,1.0,agnostic,False,12,12
8,Invoices & receipts,LINE_ITEM_QUANTITIES,1.0,1.0,1.0,1.0,1.0,1.0,aware,False,12,12
9,Invoices & receipts,LINE_ITEM_PRICES,1.0,1.0,1.0,1.0,1.0,1.0,aware,False,12,12


---
## 5. Compare Multiple Models (Optional)

Quick comparison of all available models from per_field_metrics.csv

In [10]:
if per_field_df is not None:
    # Pivot to compare models
    comparison = per_field_df.pivot(index='field', columns='model', values='accuracy')
    comparison['best_model'] = comparison.idxmax(axis=1)
    
    print("\nField-Level Accuracy Comparison (from per_field_metrics.csv)")
    print("=" * 80)
    
    # Format as percentages
    display_df = comparison.copy()
    for col in display_df.columns:
        if col != 'best_model':
            display_df[col] = display_df[col].apply(lambda x: f"{x:.1%}" if pd.notna(x) else "N/A")
    
    display(display_df)
    
    # Summary
    print("\nModel Summary:")
    for model in per_field_df['model'].unique():
        model_data = per_field_df[per_field_df['model'] == model]
        wins = (comparison['best_model'] == model).sum()
        print(f"  {model}: mean={model_data['accuracy'].mean():.1%}, best on {wins} fields")

---
## 6. Export All Models for Comparison

Export accuracy files for all available models.

In [11]:
def export_all_models(per_field_df: pd.DataFrame, output_dir: Path):
    """Export accuracy CSV for each model in per_field_metrics.csv"""
    if per_field_df is None:
        print("No per_field_metrics.csv available")
        return
    
    exported = []
    for model in per_field_df['model'].unique():
        # Create safe filename
        safe_name = model.lower().replace('-', '_').replace(' ', '_')
        filename = f"{safe_name}_accuracy.csv"
        output_path = output_dir / filename
        
        result_df = extract_model_accuracy_from_per_field(per_field_df, model)
        if result_df is not None:
            result_df.to_csv(output_path, index=False)
            exported.append((model, output_path, result_df['Accuracy'].mean()))
            print(f"Exported: {filename} (mean accuracy: {result_df['Accuracy'].mean():.1%})")
    
    return exported

print("Exporting all models...")
print("=" * 60)
all_exports = export_all_models(per_field_df, OUTPUT_DIR)

Exporting all models...
No per_field_metrics.csv available


---
## Summary

Files created:
- `current_model_accuracy.csv` - Selected model for comparison notebook
- `{model}_accuracy.csv` - Individual files for each model

To use in comparison notebook:
```python
CURRENT_MODEL_CSV = Path("../output/current_model_accuracy.csv")
# Or use specific model file:
CURRENT_MODEL_CSV = Path("../output/internvl3_8b_accuracy.csv")
```