# Llama Universal Extraction - Method 1

**Strategy:** Extract all 17 fields from every document regardless of type.

**Key Differences from Two-Stage:**
- No document type classification step
- Single universal prompt for all documents
- Evaluation uses the ground-truth document type to select which fields are relevant for scoring (Option C)
- Tracks false positives (irrelevant fields extracted)

**Output:** `llama_universal_batch_results_*.csv`, compatible with `model_comparison.ipynb`

## Imports

In [None]:
#Cell 1
# Enable autoreload for module changes
%load_ext autoreload
%autoreload 2

import os
os.environ['EVALUATION_METHOD'] = 'order_aware_f1'  # or 'f1', 'kieval', 'order_aware_f1', 'correlation'

# Standard library imports
import warnings
import yaml
from datetime import datetime
from pathlib import Path

# Third-party imports
import numpy as np
import pandas as pd
from IPython.display import display, Image
from rich import print as rprint
from rich.console import Console

# Project-specific imports
from common.batch_analytics import BatchAnalytics
from common.batch_reporting import BatchReporter
from common.batch_visualizations import BatchVisualizer
from common.evaluation_metrics import load_ground_truth
from common.extraction_parser import discover_images
from common.gpu_optimization import emergency_cleanup
from common.llama_model_loader_robust import load_llama_model_robust

# Reaching these lines means every import above succeeded.
print("‚úÖ All imports loaded successfully")
print("‚úÖ Universal extraction mode")
# Suppress all warning categories from this point on (no category filter is given).
warnings.filterwarnings('ignore')

## Pre-emptive Memory Cleanup

In [None]:
#Cell 2
# Initialize console for rich output (later cells use it for rule/header lines)
console = Console()

# Pre-emptive V100 Memory Cleanup - Run FIRST to prevent OOM errors
rprint("[bold red]üßπ PRE-EMPTIVE V100 MEMORY CLEANUP[/bold red]")
rprint("[yellow]Clearing any existing model caches before loading...[/yellow]")

# Emergency cleanup to ensure clean slate.
# NOTE(review): emergency_cleanup comes from common.gpu_optimization; what it
# actually frees is not visible from this notebook - confirm in that module.
emergency_cleanup(verbose=True)

rprint("[green]‚úÖ Memory cleanup complete - ready for model loading[/green]")

## Configuration

In [None]:
#Cell 3
# Environment-specific base paths
ENVIRONMENT_BASES = {
    'sandbox': '/home/jovyan/nfs_share/tod',
    'efs': '/efs/shared/PoC_data'
}
# Switch environments here; DATA_DIR, GROUND_TRUTH and OUTPUT_BASE below are
# derived from this value (MODEL_PATH is an independent absolute path).
base_data_path = ENVIRONMENT_BASES['sandbox']

# Single configuration dictionary read by the remaining cells of the notebook.
CONFIG = {
    # Model settings
    'MODEL_PATH': "/home/jovyan/nfs_share/models/Llama-3.2-11B-Vision-Instruct",

    # Batch settings
    'DATA_DIR': f'{base_data_path}/evaluation_data',
    'GROUND_TRUTH': f'{base_data_path}/evaluation_data/ground_truth.csv',
    'OUTPUT_BASE': f'{base_data_path}/LMM_POC/output',
    'MAX_IMAGES': None,  # None for all, or set limit
    'DOCUMENT_TYPES': None,  # None for all

    # EXTRACTION METHOD - Universal
    'EXTRACTION_METHOD': 'universal',  # Extract all 17 fields for every document
    'ENABLE_MATH_ENHANCEMENT': False,

    # Inference and evaluation mode
    'INFERENCE_ONLY': False,  # Set to True to skip evaluation

    # Verbosity control
    'VERBOSE': True,
    'SHOW_PROMPTS': True,

    # GPU optimization settings (all forwarded to load_llama_model_robust)
    'USE_QUANTIZATION': False,
    'DEVICE_MAP': 'auto',
    'MAX_NEW_TOKENS': 2000,
    'TORCH_DTYPE': 'bfloat16',
    'LOW_CPU_MEM_USAGE': True,

    # Image preprocessing settings
    'ENABLE_PREPROCESSING': True,
    'PREPROCESSING_MODE': 'adaptive',  # 'light', 'moderate', 'aggressive', 'adaptive'
    'SAVE_PREPROCESSED': False,
    'PREPROCESSED_DIR': None,
}

# Make GROUND_TRUTH conditional based on INFERENCE_ONLY mode
# (with no ground-truth path, the loading cell skips ground truth entirely)
if CONFIG['INFERENCE_ONLY']:
    CONFIG['GROUND_TRUTH'] = None

# Echo the effective settings so a run log records what was used.
print("‚úÖ Configuration set up successfully")
print(f"üìÇ Data directory: {CONFIG['DATA_DIR']}")
print(f"üìä Ground truth: {CONFIG['GROUND_TRUTH']}")
print(f"ü§ñ Model path: {CONFIG['MODEL_PATH']}")
print(f"üìÅ Output base: {CONFIG['OUTPUT_BASE']}")
print(f"üî¨ Extraction method: {CONFIG['EXTRACTION_METHOD']} (all 17 fields)")
print(f"üéØ Mode: {'Inference-only' if CONFIG['INFERENCE_ONLY'] else 'Evaluation mode'}")
print(f"üîß Preprocessing: {'Enabled (' + CONFIG['PREPROCESSING_MODE'] + ')' if CONFIG['ENABLE_PREPROCESSING'] else 'Disabled'}")

## Output Directory Setup

In [None]:
#Cell 4
# Resolve the output root; a relative path is anchored at the current working directory
OUTPUT_BASE = Path(CONFIG['OUTPUT_BASE'])
OUTPUT_BASE = OUTPUT_BASE if OUTPUT_BASE.is_absolute() else Path.cwd() / OUTPUT_BASE

# One timestamp per run, embedded in the names of the files written later
BATCH_TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")

# Logical name -> directory; everything except 'base' is a child of OUTPUT_BASE
OUTPUT_DIRS = {'base': OUTPUT_BASE}
OUTPUT_DIRS.update({
    key: OUTPUT_BASE / folder
    for key, folder in (
        ('batch', 'batch_results'),
        ('csv', 'csv'),
        ('visualizations', 'visualizations'),
        ('reports', 'reports'),
    )
})

# Create the whole tree up front; existing directories are left alone
for dir_path in OUTPUT_DIRS.values():
    dir_path.mkdir(parents=True, exist_ok=True)

print(f"‚úÖ Output directories created at: {OUTPUT_BASE}")

## Model Loading

In [None]:
#Cell 5
# Load model once for entire batch
rprint("[bold green]Loading model with robust multi-GPU detection...[/bold green]")

# Every loader option is taken from CONFIG (Cell 3). The call returns the model
# together with its processor; both are passed to the batch-processing cell.
model, processor = load_llama_model_robust(
    model_path=CONFIG['MODEL_PATH'],
    use_quantization=CONFIG['USE_QUANTIZATION'],
    device_map=CONFIG['DEVICE_MAP'],
    max_new_tokens=CONFIG['MAX_NEW_TOKENS'],
    torch_dtype=CONFIG['TORCH_DTYPE'],
    low_cpu_mem_usage=CONFIG['LOW_CPU_MEM_USAGE'],
    verbose=CONFIG['VERBOSE']
)

rprint("[bold green]‚úÖ Model ready for universal extraction[/bold green]")

## Universal Extraction Prompt

In [None]:
#Cell 6
# Load universal extraction prompt from YAML
prompt_file = Path('prompts/universal.yaml')

# Read explicitly as UTF-8: without an encoding argument open() falls back to
# the locale's default, which can mis-decode non-ASCII characters in the prompt.
with open(prompt_file, 'r', encoding='utf-8') as f:
    prompt_data = yaml.safe_load(f)

# Expected layout: prompts -> universal -> prompt (a KeyError here means the
# YAML file does not follow that structure).
UNIVERSAL_PROMPT = prompt_data['prompts']['universal']['prompt']

rprint("[green]‚úÖ Universal extraction prompt loaded from prompts/universal.yaml[/green]")

In [None]:
#Cell 6.25
# Load universal field list from field_definitions.yaml
field_defs_path = Path('config/field_definitions.yaml')

# Read explicitly as UTF-8 so the result does not depend on the locale default.
with open(field_defs_path, 'r', encoding='utf-8') as f:
    field_defs = yaml.safe_load(f)

# Get universal fields from YAML (19 total)
all_universal_fields = field_defs['document_fields']['universal']['fields']

# Remove fields no longer extracted (TRANSACTION_AMOUNTS_RECEIVED, ACCOUNT_BALANCE)
# Final 17 fields for universal extraction
EXCLUDED_FIELDS = ['TRANSACTION_AMOUNTS_RECEIVED', 'ACCOUNT_BALANCE']
UNIVERSAL_FIELDS = [f for f in all_universal_fields if f not in EXCLUDED_FIELDS]

rprint(f"[green]‚úÖ Loaded {len(UNIVERSAL_FIELDS)} universal fields (excluded {len(EXCLUDED_FIELDS)} fields)[/green]")
rprint(f"[cyan]Excluded: {', '.join(EXCLUDED_FIELDS)}[/cyan]")
# Clamp at zero so a list shorter than five fields never reports a negative remainder.
rprint(f"[dim]Fields: {', '.join(UNIVERSAL_FIELDS[:5])}... (and {max(len(UNIVERSAL_FIELDS) - 5, 0)} more)[/dim]")

In [None]:
#Cell 6.5
# Print the prompt between two rules unless SHOW_PROMPTS is switched off
# (a missing key counts as "show").
if not CONFIG.get('SHOW_PROMPTS', True):
    rprint("[dim]Prompt display disabled (set CONFIG['SHOW_PROMPTS'] = True to view)[/dim]")
else:
    console.rule("[bold cyan]Universal Extraction Prompt[/bold cyan]")
    print(UNIVERSAL_PROMPT)
    console.rule("[bold cyan]End of Prompt[/bold cyan]")

## Image Discovery and Ground Truth Loading

In [None]:
#Cell 7
# Discover images
data_dir = Path(CONFIG['DATA_DIR'])
if not data_dir.is_absolute():
    data_dir = Path.cwd() / data_dir

all_images = discover_images(str(data_dir))

# Image preprocessing (if enabled)
if CONFIG['ENABLE_PREPROCESSING']:
    import tempfile
    from common.image_preprocessing import (
        enhance_statement_quality,
        enhance_for_llama,
        preprocess_statement_for_llama,
        adaptive_enhance,
        preprocess_recommended
    )

    # CONFIG['PREPROCESSING_MODE'] selects one of these; an unknown mode raises KeyError.
    preprocess_functions = {
        'light': enhance_statement_quality,
        'moderate': enhance_for_llama,
        'aggressive': preprocess_statement_for_llama,
        'adaptive': adaptive_enhance,
        'recommended': preprocess_recommended
    }

    preprocess_fn = preprocess_functions[CONFIG['PREPROCESSING_MODE']]
    preprocessed_images = []

    rprint(f"[cyan]üîß Preprocessing {len(all_images)} images (mode: {CONFIG['PREPROCESSING_MODE']})[/cyan]")

    # Persist the processed copies only when asked; otherwise use a throw-away temp directory.
    if CONFIG['SAVE_PREPROCESSED']:
        preprocessed_dir = Path(CONFIG['PREPROCESSED_DIR'] or 'preprocessed_images')
        preprocessed_dir.mkdir(parents=True, exist_ok=True)
    else:
        preprocessed_dir = Path(tempfile.mkdtemp(prefix='preprocessed_'))

    for img_path in all_images:
        # The original file name is kept so ground-truth lookups by name/stem still work.
        original_filename = Path(img_path).name
        try:
            preprocessed_img = preprocess_fn(img_path)
            preprocessed_path = preprocessed_dir / original_filename
            preprocessed_img.save(preprocessed_path)
            preprocessed_images.append(str(preprocessed_path))
        except Exception as e:
            # Best effort: fall back to the untouched original image.
            rprint(f"[yellow]‚ö†Ô∏è  Preprocessing failed for {original_filename}: {e}[/yellow]")
            preprocessed_images.append(img_path)

    all_images = preprocessed_images
    rprint(f"[green]‚úÖ Preprocessing complete[/green]")

# Load ground truth
ground_truth = {}
if not CONFIG['INFERENCE_ONLY'] and CONFIG['GROUND_TRUTH']:
    ground_truth_path = Path(CONFIG['GROUND_TRUTH'])
    if not ground_truth_path.is_absolute():
        ground_truth_path = Path.cwd() / ground_truth_path

    ground_truth = load_ground_truth(str(ground_truth_path), verbose=CONFIG['VERBOSE'])
    rprint(f"[green]‚úÖ Ground truth loaded for {len(ground_truth)} images[/green]")
else:
    rprint("[cyan]üìã Running in inference-only mode (no ground truth required)[/cyan]")

# Apply filters
if CONFIG['DOCUMENT_TYPES'] and ground_truth:
    filtered = []
    for img in all_images:
        img_file = Path(img)
        # The rest of the notebook looks ground truth up by file stem (no
        # extension). Looking it up only by full file name here would silently
        # drop every image, so try the stem first and keep the full name as a
        # fallback for ground-truth files that do include extensions.
        gt_entry = ground_truth.get(img_file.stem)
        if gt_entry is None:
            gt_entry = ground_truth.get(img_file.name)
        if gt_entry is None:
            continue
        doc_type = gt_entry.get('DOCUMENT_TYPE', '').lower()
        # Substring match, case-insensitive (e.g. 'statement' matches 'bank_statement').
        if any(dt.lower() in doc_type for dt in CONFIG['DOCUMENT_TYPES']):
            filtered.append(img)
    all_images = filtered

if CONFIG['MAX_IMAGES']:
    all_images = all_images[:CONFIG['MAX_IMAGES']]

rprint(f"[bold green]Ready to process {len(all_images)} images with universal extraction[/bold green]")
for i, img in enumerate(all_images[:5], 1):
    print(f"  {i}. {Path(img).name}")
if len(all_images) > 5:
    print(f"  ... and {len(all_images) - 5} more")

In [None]:
#Cell 7.5
# Sanity check: do the ground-truth keys line up with the image file names?
if not ground_truth:
    rprint("[yellow]‚ö†Ô∏è  No ground truth loaded (inference-only mode)[/yellow]")
else:
    rprint("\n[bold yellow]üîç Ground Truth Debug Info[/bold yellow]")
    rprint(f"[cyan]Total ground truth entries: {len(ground_truth)}[/cyan]")
    rprint(f"[cyan]Total images to process: {len(all_images)}[/cyan]")

    # A small sample of keys from each side, for eyeballing the naming scheme
    gt_keys = list(ground_truth.keys())[:3]
    rprint(f"[cyan]Sample GT keys: {gt_keys}[/cyan]")

    img_names_full = [Path(img).name for img in all_images[:3]]
    img_names_no_ext = [Path(img).stem for img in all_images[:3]]
    rprint(f"[cyan]Sample image names (full): {img_names_full}[/cyan]")
    rprint(f"[cyan]Sample image names (no ext): {img_names_no_ext}[/cyan]")

    # Lookup is by stem (extension stripped); the full name is reported for readability
    missing_gt = [
        Path(img).name
        for img in all_images
        if Path(img).stem not in ground_truth
    ]

    if not missing_gt:
        rprint(f"[green]‚úÖ All {len(all_images)} images have ground truth entries (using stem lookup)[/green]")
    else:
        rprint(f"[red]‚ö†Ô∏è  WARNING: {len(missing_gt)} images missing from ground truth![/red]")
        rprint(f"[red]First 5 missing: {missing_gt[:5]}[/red]")

    console.rule()

## Universal Batch Processing

Process all images with the same universal prompt (no document type detection).

In [None]:
#Cell 8
import torch
from PIL import Image as PILImage
from tqdm import tqdm

# Import the correct parser
from common.extraction_parser import parse_extraction_response

# Universal batch processing function
def process_with_universal_prompt(image_paths, model, processor, prompt, ground_truth_data,
                                  verbose=False, max_new_tokens=2000):
    """
    Process images using universal extraction (all 17 fields).

    Every image is sent to the model with the same prompt (no document type
    detection). A failure on one image is reported and recorded as an error
    entry; it does not stop the batch.

    Args:
        image_paths: Iterable of image file paths.
        model: Loaded model; ``model.generate`` and ``model.device`` are used.
        processor: Matching processor; ``apply_chat_template``, ``__call__``
            and ``decode`` are used.
        prompt: Prompt text sent along with each image.
        ground_truth_data: Mapping of image stem (file name without extension)
            to that image's ground-truth dict. May be empty or None.
        verbose: When True, print one status line per processed image.
        max_new_tokens: Generation budget passed to ``model.generate``.
            Defaults to 2000, the value that was previously hard-coded.

    Returns:
        batch_results: List of result dictionaries. Successful entries hold
            'extracted_data', 'raw_response' and 'ground_truth'; failed
            entries hold 'error' instead.
        processing_times: List of processing times in seconds, one per image
            (0 for failed images).
    """
    import traceback

    # Tolerate None so callers without ground truth need not pass a dummy dict.
    ground_truth_data = ground_truth_data or {}

    batch_results = []
    processing_times = []

    for img_path in tqdm(image_paths, desc="Processing images"):
        start_time = datetime.now()
        img_name = Path(img_path).name

        # Ground truth keys don't have extensions (e.g., "invoice_001" not "invoice_001.jpg")
        img_name_no_ext = Path(img_path).stem

        try:
            # Create message for Llama
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": prompt}
                    ]
                }
            ]
            input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

            # Open the image in a context manager so the file handle is
            # released as soon as the processor has turned it into tensors.
            with PILImage.open(img_path) as image:
                inputs = processor(image, input_text, return_tensors="pt").to(model.device)

            # Generate (inference only, so no gradients are needed)
            with torch.no_grad():
                output = model.generate(**inputs, max_new_tokens=max_new_tokens)

            # Decode response
            response = processor.decode(output[0], skip_special_tokens=True)

            # Extract only the assistant's response (after "assistant\n\n")
            if "assistant\n\n" in response:
                response = response.split("assistant\n\n", 1)[1]

            # Parse the field-by-field response format (NOT JSON)
            extracted_data = parse_extraction_response(response)

            # Get ground truth for this image (using name WITHOUT extension)
            gt = ground_truth_data.get(img_name_no_ext, {})

            # Calculate processing time
            processing_time = (datetime.now() - start_time).total_seconds()
            processing_times.append(processing_time)

            batch_results.append({
                'image_name': img_name,  # Keep full filename for display
                'image_name_no_ext': img_name_no_ext,  # For ground truth lookup
                'image_path': img_path,
                'extracted_data': extracted_data,
                'raw_response': response,
                'processing_time': processing_time,
                'ground_truth': gt
            })

            if verbose:
                gt_found = "‚úÖ" if gt else "‚ùå"
                fields_found = sum(1 for v in extracted_data.values() if v != "NOT_FOUND")
                rprint(f"[green]{gt_found} {img_name}: {processing_time:.2f}s ({fields_found} fields)[/green]")

        except Exception as e:
            # Batch boundary: report the failure with its traceback, record an
            # error entry, and carry on with the next image.
            rprint(f"[red]‚ùå Error processing {img_name}: {e}[/red]")
            rprint(f"[dim]{traceback.format_exc()}[/dim]")
            batch_results.append({
                'image_name': img_name,
                'image_name_no_ext': img_name_no_ext,
                'image_path': img_path,
                'error': str(e),
                'processing_time': 0
            })
            processing_times.append(0)

    return batch_results, processing_times

# Process batch
rprint("[bold cyan]Starting universal batch extraction...[/bold cyan]")
# Run every discovered image through the model with the one universal prompt.
# ground_truth is an empty dict when no ground truth was loaded.
batch_results, processing_times = process_with_universal_prompt(
    all_images,
    model,
    processor,
    UNIVERSAL_PROMPT,
    ground_truth,
    verbose=CONFIG['VERBOSE']
)

rprint(f"[bold green]‚úÖ Processed {len(batch_results)} images[/bold green]")
# Failed images are recorded with a time of 0, so they pull this average down.
rprint(f"[cyan]Average time: {np.mean(processing_times):.2f}s[/cyan]")

In [None]:
# Quick look at the first result to confirm parsing and ground-truth lookup
if batch_results:
    first_result = batch_results[0]
    print(f"Image: {first_result['image_name']}")

    if 'error' in first_result:
        # Error records carry no 'extracted_data', 'ground_truth' or
        # 'raw_response'; show the error instead of raising KeyError.
        print(f"\nProcessing failed: {first_result['error']}")
    else:
        print(f"\nExtracted data:")
        for field, value in first_result.get('extracted_data', {}).items():
            print(f"  {field}: {value}")

        print(f"\nGround truth:")
        for field, value in first_result.get('ground_truth', {}).items():
            print(f"  {field}: {value}")

        print(f"\nRaw response (first 500 chars):")
        print(first_result.get('raw_response', '')[:500])

## Evaluation with Option C (Hybrid Strategy)

Evaluate only fields that should exist based on ground truth document type.

In [None]:
#Cell 9
from common.evaluation_metrics import calculate_field_accuracy_with_method

# Read the per-document-type field lists from the shared YAML definition file
field_defs_path = Path('config/field_definitions.yaml')
with open(field_defs_path, 'r') as f:
    field_defs = yaml.safe_load(f)

# Document type -> fields that are expected for that type.
# 'statement' is an alias that resolves to the bank_statement field list.
DOC_TYPE_FIELDS = {
    doc_type: field_defs['document_fields'][section]['fields']
    for doc_type, section in (
        ('invoice', 'invoice'),
        ('receipt', 'receipt'),
        ('bank_statement', 'bank_statement'),
        ('statement', 'bank_statement'),
    )
}

rprint(f"[green]‚úÖ Loaded document-specific field mappings from YAML[/green]")
rprint(f"[cyan]  Invoice: {len(DOC_TYPE_FIELDS['invoice'])} fields[/cyan]")
rprint(f"[cyan]  Receipt: {len(DOC_TYPE_FIELDS['receipt'])} fields[/cyan]")
rprint(f"[cyan]  Bank Statement: {len(DOC_TYPE_FIELDS['bank_statement'])} fields[/cyan]")

def evaluate_universal_extraction(batch_results, inference_only_mode=False, evaluation_method='order_aware_f1'):
    """
    Evaluate universal extraction using Option C strategy.

    Only the fields relevant to the ground-truth document type are scored
    (primary metric). Fields outside that set that were nevertheless extracted
    are counted as false positives (secondary metric).

    Args:
        batch_results: Result dicts produced by the batch-processing step.
            Each scored result gets an 'evaluation' key added in place.
        inference_only_mode: When True, nothing is scored; every result is
            marked as inference-only.
        evaluation_method: Method name forwarded to
            calculate_field_accuracy_with_method.

    Returns:
        List with the same result dicts, in input order. Results containing
        'error' are passed through unchanged. Results without ground truth
        (or all results in inference-only mode) get
        ``{'inference_only': True, 'reason': ...}`` as their evaluation.
    """
    evaluated_results = []

    for result in batch_results:
        # Failed extractions have nothing to score.
        if 'error' in result:
            evaluated_results.append(result)
            continue

        extracted_data = result['extracted_data']
        ground_truth = result.get('ground_truth', {})

        # Check global inference_only mode OR missing ground truth for this image
        if inference_only_mode or not ground_truth:
            result['evaluation'] = {
                'inference_only': True,
                'reason': 'global_config' if inference_only_mode else 'no_ground_truth_for_image'
            }
            evaluated_results.append(result)
            continue

        # Normalise the ground-truth document type. str() guards against
        # non-string cell values and strip() against padded ones; both would
        # otherwise crash or silently fall through to the invoice default.
        doc_type = str(ground_truth.get('DOCUMENT_TYPE', 'invoice')).strip().lower().replace(' ', '_')

        # Unknown document types are scored against the invoice field list.
        relevant_fields = DOC_TYPE_FIELDS.get(doc_type, DOC_TYPE_FIELDS['invoice'])
        # Sorted so that the reported false-positive list is deterministic
        # (plain set iteration order varies between interpreter runs).
        irrelevant_fields = sorted(set(UNIVERSAL_FIELDS) - set(relevant_fields))

        # Evaluate relevant fields only (PRIMARY METRIC)
        field_scores = {}
        total_f1 = 0.0
        fields_evaluated = 0
        fields_matched = 0

        for field in relevant_fields:
            extracted_value = extracted_data.get(field, "NOT_FOUND")
            gt_value = ground_truth.get(field, "NOT_FOUND")

            # Absent on both sides: nothing to score, and it must not dilute the average.
            if extracted_value == "NOT_FOUND" and gt_value == "NOT_FOUND":
                continue

            fields_evaluated += 1

            try:
                metrics = calculate_field_accuracy_with_method(
                    extracted_value, gt_value, field, method=evaluation_method
                )
            except Exception as exc:
                # Keep the batch going, but record why this field scored zero
                # instead of discarding the failure.
                metrics = {'f1_score': 0.0, 'precision': 0.0, 'recall': 0.0, 'error': str(exc)}

            field_scores[field] = metrics
            field_f1 = metrics.get('f1_score', 0.0)
            total_f1 += field_f1

            # A field counts as matched only when its F1 exceeds 0.9.
            if field_f1 > 0.9:
                fields_matched += 1

        # Primary accuracy is the mean F1 over the fields that were scored.
        primary_accuracy = (total_f1 / fields_evaluated) if fields_evaluated > 0 else 0.0

        # Track false positives (SECONDARY METRIC)
        false_positives = [
            field for field in irrelevant_fields
            if extracted_data.get(field, "NOT_FOUND") != "NOT_FOUND"
        ]
        false_positive_rate = len(false_positives) / len(irrelevant_fields) if irrelevant_fields else 0.0

        result['evaluation'] = {
            'overall_accuracy': primary_accuracy,
            'fields_evaluated': fields_evaluated,
            'fields_matched': fields_matched,
            'total_fields': len(relevant_fields),
            'field_scores': field_scores,
            'false_positive_count': len(false_positives),
            'false_positive_rate': false_positive_rate,
            'false_positive_fields': false_positives,
            'document_type': doc_type,
            'inference_only': False
        }

        evaluated_results.append(result)

    return evaluated_results

# Run evaluation
if not CONFIG['INFERENCE_ONLY']:
    rprint("[bold cyan]Evaluating with Option C (hybrid strategy)...[/bold cyan]")
    # Honour the EVALUATION_METHOD environment variable set in the import cell;
    # without this the evaluator always used its built-in default regardless
    # of what was configured there.
    batch_results = evaluate_universal_extraction(
        batch_results,
        inference_only_mode=CONFIG['INFERENCE_ONLY'],
        evaluation_method=os.environ.get('EVALUATION_METHOD', 'order_aware_f1'),
    )

    # Evaluations of the images that were actually scored (computed once and
    # reused for the count and both averages).
    scored_evaluations = [
        r['evaluation'] for r in batch_results
        if 'evaluation' in r and not r['evaluation'].get('inference_only')
    ]
    evaluated_count = len(scored_evaluations)
    # Images that went through evaluation but had no ground truth to score against
    skipped_count = sum(1 for r in batch_results if 'evaluation' in r and r['evaluation'].get('inference_only'))

    if evaluated_count > 0:
        avg_accuracy = np.mean([ev.get('overall_accuracy', 0) * 100 for ev in scored_evaluations])
        avg_false_positives = np.mean([ev.get('false_positive_count', 0) for ev in scored_evaluations])

        rprint(f"[green]‚úÖ Evaluation complete[/green]")
        rprint(f"[cyan]Evaluated: {evaluated_count} images[/cyan]")
        if skipped_count > 0:
            rprint(f"[yellow]Skipped: {skipped_count} images (no ground truth)[/yellow]")
        rprint(f"[cyan]Average accuracy: {avg_accuracy:.2f}%[/cyan]")
        rprint(f"[cyan]Average false positives: {avg_false_positives:.2f} fields/image[/cyan]")
    else:
        rprint(f"[red]‚ö†Ô∏è  No images evaluated - all {skipped_count} images missing ground truth[/red]")
else:
    rprint("[cyan]üìã Skipping evaluation (inference-only mode)[/cyan]")

## Generate Output CSV

In [None]:
#Cell 10
# Create model-specific CSV (compatible with model_comparison.ipynb)
csv_data = []

for result in batch_results:
    image_name = Path(result['image_path']).name
    extracted_data = result.get('extracted_data', {})
    evaluation = result.get('evaluation', {})
    processing_time = result.get('processing_time', 0)

    # Determine document type (from ground truth if available, else from extraction)
    doc_type = result.get('ground_truth', {}).get('DOCUMENT_TYPE',
                                                    extracted_data.get('DOCUMENT_TYPE', 'unknown')).lower()

    # Count fields
    found_fields = sum(1 for v in extracted_data.values() if v != "NOT_FOUND")
    field_coverage = (found_fields / len(UNIVERSAL_FIELDS) * 100) if UNIVERSAL_FIELDS else 0.0

    # An accuracy exists only for images that were actually scored. Failed
    # images and images without ground truth get no value (empty CSV cell)
    # rather than 0, so they do not drag down averages computed from this
    # column - the evaluation step excludes them from its average as well.
    if 'overall_accuracy' in evaluation:
        overall_accuracy = evaluation['overall_accuracy'] * 100
    else:
        overall_accuracy = None

    # Create row
    row = {
        'image_file': image_name,
        'image_name': image_name,
        'document_type': doc_type,
        'processing_time': processing_time,
        'field_count': evaluation.get('total_fields', len(UNIVERSAL_FIELDS)),
        'found_fields': found_fields,
        'field_coverage': field_coverage,
        'prompt_used': 'llama_universal',
        'timestamp': datetime.now().isoformat(),
        'overall_accuracy': overall_accuracy,
        'fields_extracted': evaluation.get('fields_evaluated', 0),
        'fields_matched': evaluation.get('fields_matched', 0),
        'total_fields': evaluation.get('total_fields', len(UNIVERSAL_FIELDS)),
        'false_positive_count': evaluation.get('false_positive_count', 0),
        'false_positive_rate': evaluation.get('false_positive_rate', 0),
        'inference_only': CONFIG['INFERENCE_ONLY']
    }

    # Add all field values
    for field in UNIVERSAL_FIELDS:
        row[field] = extracted_data.get(field, 'NOT_FOUND')

    csv_data.append(row)

# Create DataFrame and save
df = pd.DataFrame(csv_data)
if 'overall_accuracy' in df.columns:
    # Keep the column numeric even when every value is missing, so that
    # .mean() on it yields NaN instead of operating on an object column.
    df['overall_accuracy'] = pd.to_numeric(df['overall_accuracy'], errors='coerce')
csv_path = OUTPUT_DIRS['csv'] / f"llama_universal_batch_results_{BATCH_TIMESTAMP}.csv"
df.to_csv(csv_path, index=False)

rprint("[bold green]‚úÖ Universal extraction CSV exported:[/bold green]")
rprint(f"[cyan]üìÑ File: {csv_path}[/cyan]")
rprint(f"[cyan]üìä Structure: {len(df)} rows √ó {len(df.columns)} columns[/cyan]")

# Display sample
if CONFIG['INFERENCE_ONLY']:
    sample_cols = ['image_file', 'document_type', 'processing_time', 'found_fields',
                   'field_coverage', 'false_positive_count']
else:
    sample_cols = ['image_file', 'document_type', 'overall_accuracy', 'processing_time',
                   'found_fields', 'false_positive_count']

rprint("\n[bold blue]üìã Sample data:[/bold blue]")
display(df[sample_cols].head(3))

## Summary Statistics

In [None]:
#Cell 11
# Summary statistics
console.rule("[bold green]Universal Extraction Summary[/bold green]")

# batch_results holds one entry per image, including images that failed.
rprint(f"[bold green]‚úÖ Total images processed: {len(batch_results)}[/bold green]")
# processing_times holds a 0 for every failed image, which lowers this mean.
rprint(f"[cyan]Average processing time: {np.mean(processing_times):.2f}s[/cyan]")
# df is the per-image DataFrame built in the CSV cell (Cell 10).
rprint(f"[cyan]Average fields extracted: {df['found_fields'].mean():.1f} / {len(UNIVERSAL_FIELDS)}[/cyan]")
rprint(f"[cyan]Average field coverage: {df['field_coverage'].mean():.1f}%[/cyan]")

# Accuracy and false-positive figures are only meaningful when evaluation ran.
if not CONFIG['INFERENCE_ONLY']:
    rprint(f"[cyan]Average accuracy (relevant fields): {df['overall_accuracy'].mean():.2f}%[/cyan]")
    rprint(f"[cyan]Average false positives: {df['false_positive_count'].mean():.2f} fields/image[/cyan]")
    # false_positive_rate is stored as a fraction; scale to percent for display.
    rprint(f"[cyan]False positive rate: {df['false_positive_rate'].mean()*100:.1f}%[/cyan]")

rprint(f"\n[cyan]Output saved to: {OUTPUT_DIRS['csv']}[/cyan]")
rprint("[green]‚úÖ Ready for comparison with two-stage and oracle methods[/green]")