# InternVL3-2B Pure Multi-Turn Adaptive Extraction ⭐

**TRUE multi-turn chat using the official InternVL3 pattern** - exactly like Llama.

This notebook implements genuine multi-turn conversation using the official InternVL3 pattern:
1. Direct model loading
2. Manual prompt loading from YAML  
3. TRUE multi-turn via `return_history=True` (official pattern)
4. Manual response parsing
5. Full control over generation parameters

**Official Pattern** (from InternVL3 docs):
```python
# Load pixel_values ONCE
pixel_values = load_image(image_path)

# First turn: history=None
response, history = model.chat(tokenizer, pixel_values, prompt, 
                               generation_config, history=None, return_history=True)

# Second turn: pass history from first turn
response, history = model.chat(tokenizer, pixel_values, prompt,
                               generation_config, history=history, return_history=True)
```

**Key difference from hybrid**: Uses `return_history=True` to maintain conversation context across turns with the SAME `pixel_values` tensor (no reloading needed!).

Outputs are compatible with `model_comparison.ipynb`.

In [None]:
import gc
import json
import random
import time
import traceback
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import yaml
from PIL import Image
from rich import print as rprint
from rich.console import Console
from rich.progress import track
from transformers import AutoModel, AutoTokenizer

# Initialize console for rich output
console = Console()

def set_seed(seed=42):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # CUDA generators are only seeded when a GPU is actually present.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed the Python, NumPy and PyTorch generators once, right after the imports.
set_seed(42)
rprint("[green]✅ Imports loaded[/green]")

## Configuration

In [None]:
# Environment-specific base paths
ENVIRONMENT_BASES = {
    'sandbox': '/home/jovyan/nfs_share/tod',
    'efs': '/efs/shared/PoC_data'
}
# Switch environments by changing the key used here (or assign a custom path).
base_data_path = ENVIRONMENT_BASES['sandbox']

# Central settings dictionary read by every later cell.
CONFIG = {
    # Model settings
    'MODEL_PATH': '/home/jovyan/nfs_share/models/InternVL3-2B',

    # Data paths
    'DATA_DIR': f'{base_data_path}/evaluation_data',

    # Prompt files
    'PROMPT_FILE_DOCTYPE': f'{base_data_path}/LMM_POC/prompts/document_type_detection.yaml',
    'PROMPT_FILE_INVOICE': f'{base_data_path}/LMM_POC/prompts/internvl3_prompts.yaml',
    'PROMPT_FILE_RECEIPT': f'{base_data_path}/LMM_POC/prompts/internvl3_prompts.yaml',
    'PROMPT_FILE_BANK': f'{base_data_path}/LMM_POC/prompts/internvl3_prompts.yaml',

    # Output directory
    'OUTPUT_DIR': f'{base_data_path}/LMM_POC/output',

    # Token limits
    'MAX_NEW_TOKENS_DOCTYPE': 50,
    'MAX_NEW_TOKENS_STRUCTURE': 50,
    'MAX_NEW_TOKENS_EXTRACT': 2000,
    
    # Verbosity control
    'VERBOSE': True,  # Show stage-by-stage progress
    'SHOW_PROMPTS': True,  # Show actual prompts being used
}

# Create output directory
output_dir = Path(CONFIG['OUTPUT_DIR'])
csv_dir = output_dir / 'csv'
csv_dir.mkdir(parents=True, exist_ok=True)

# Timestamp for output files
TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")

# Reverse-lookup the environment label for display. Falls back to 'custom' so that
# pointing base_data_path at an unlisted directory does not raise IndexError.
environment_name = next(
    (name for name, base in ENVIRONMENT_BASES.items() if base == base_data_path),
    'custom',
)

rprint("[green]✅ Configuration loaded[/green]")
rprint(f"[cyan]  Environment: {environment_name}[/cyan]")
rprint(f"[cyan]  Base path: {base_data_path}[/cyan]")
rprint(f"[cyan]  Output directory: {output_dir}[/cyan]")
rprint(f"[cyan]  Timestamp: {TIMESTAMP}[/cyan]")
rprint(f"[cyan]  Verbosity: {'ON' if CONFIG['VERBOSE'] else 'OFF'}[/cyan]")

## Load Model

In [None]:
# Load InternVL3 model directly (no processor wrapper)
rprint("[bold green]🔧 Loading InternVL3 model...[/bold green]")

# Instantiate the checkpoint first, then switch it to inference mode.
model = AutoModel.from_pretrained(
    CONFIG['MODEL_PATH'],
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=False,  # V100 compatible
    trust_remote_code=True,
    device_map="auto",
)
model = model.eval()

# Slow (non-fast) tokenizer loaded from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(
    CONFIG['MODEL_PATH'],
    use_fast=False,
    trust_remote_code=True,
)

# Fix pad_token_id: reuse the EOS id when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

rprint("[green]✅ Model and tokenizer loaded[/green]")

# Display model info: memory currently allocated vs. total memory of device 0.
if torch.cuda.is_available():
    allocated = torch.cuda.memory_allocated() / 1e9
    total = torch.cuda.get_device_properties(0).total_memory / 1e9
    usage_pct = allocated / total * 100
    rprint(f"[blue]📊 GPU Memory: {allocated:.2f}GB / {total:.0f}GB ({usage_pct:.1f}%)[/blue]")

## Image Preprocessing

InternVL3-specific image loading with dynamic tiling:

In [None]:
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

# Per-channel mean and standard deviation handed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Compose the per-tile preprocessing pipeline.

    Args:
        input_size: Side length (pixels) every tile is resized to.

    Returns:
        A torchvision ``Compose`` that converts to RGB, resizes with bicubic
        interpolation, converts to a tensor and normalizes per channel.
    """
    def _ensure_rgb(img):
        # Leave images that are already RGB untouched.
        return img if img.mode == 'RGB' else img.convert('RGB')

    pipeline = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(pipeline)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tile grid whose aspect ratio is nearest to the image's.

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Iterable of (columns, rows) grid candidates, scanned in order.
        width: Source image width in pixels.
        height: Source image height in pixels.
        image_size: Side length of one square tile.

    Returns:
        The chosen (columns, rows) pair; (1, 1) when no candidate is supplied.
    """
    chosen = (1, 1)
    smallest_gap = float('inf')
    pixel_area = width * height

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        # Exact tie: move to the later candidate only if the image has more than
        # half the pixels that grid would hold.
        if gap == smallest_gap and pixel_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate

    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Split an image into a grid of square tiles.

    Args:
        image: Source image exposing ``size``, ``resize`` and ``crop``.
        min_num: Minimum number of tiles in a candidate grid.
        max_num: Maximum number of tiles in a candidate grid.
        image_size: Side length of each square tile.
        use_thumbnail: Append a whole-image thumbnail when more than one tile
            was produced.

    Returns:
        List of tile images in row-major order, optionally followed by the thumbnail.
    """
    src_w, src_h = image.size
    src_aspect = src_w / src_h

    # Every (columns, rows) grid whose tile count lies within [min_num, max_num],
    # ordered by tile count.
    grid_candidates = {
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if min_num <= cols * rows <= max_num
    }
    grid_candidates = sorted(grid_candidates, key=lambda grid: grid[0] * grid[1])

    grid = find_closest_aspect_ratio(src_aspect, grid_candidates, src_w, src_h, image_size)
    grid_cols, grid_rows = grid[0], grid[1]

    # Stretch the image onto a canvas made of whole tiles, then cut it up.
    canvas = image.resize((image_size * grid_cols, image_size * grid_rows))

    tiles = []
    for row in range(grid_rows):
        top = row * image_size
        for col in range(grid_cols):
            left = col * image_size
            tiles.append(canvas.crop((left, top, left + image_size, top + image_size)))

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles

def load_image(image_path, input_size=448, max_num=24, debug=None):
    """Load an image file and convert it into a model-ready tile tensor.

    The image is tiled with ``dynamic_preprocess`` (thumbnail enabled), every tile
    is passed through ``build_transform``, and the stacked result is cast to the
    dtype of the globally loaded ``model`` and placed on the same device as the
    parameters it was matched against. The official InternVL3 usage moves
    ``pixel_values`` to the GPU explicitly; without that step a model loaded onto
    a single GPU receives a CPU tensor.

    Args:
        image_path: Path of the image to open.
        input_size: Side length in pixels of each square tile.
        max_num: Upper bound on the number of tiles (thumbnail not counted).
        debug: Print diagnostics; falls back to CONFIG['VERBOSE'] when None.

    Returns:
        Tensor holding one transformed tile per entry along the first dimension.
    """
    # Use CONFIG VERBOSE if debug not specified
    if debug is None:
        debug = CONFIG.get('VERBOSE', False)

    if debug:
        rprint(f"[blue]🔍 LOAD_IMAGE: max_num={max_num}, input_size={input_size}[/blue]")

    # Context manager closes the underlying file as soon as the RGB copy exists.
    with Image.open(image_path) as source_image:
        image = source_image.convert('RGB')

    images = dynamic_preprocess(image, min_num=1, max_num=max_num,
                                image_size=input_size, use_thumbnail=True)
    transform = build_transform(input_size=input_size)
    pixel_values = torch.stack([transform(img) for img in images])

    # CRITICAL: Robust dtype detection (copied from working processor).
    # The device of the same reference parameter is recorded for placement below.
    target_device = None
    try:
        # Try vision model's embedding layer dtype (most reliable)
        if hasattr(model, 'vision_model') and hasattr(model.vision_model, 'embeddings'):
            reference_param = next(model.vision_model.embeddings.parameters())
            vision_dtype = reference_param.dtype
            target_device = reference_param.device
            pixel_values = pixel_values.to(dtype=vision_dtype)
            if debug:
                rprint(f"[blue]🔧 TENSOR_DTYPE: Using vision model dtype {vision_dtype}[/blue]")
        elif hasattr(model, 'dtype'):
            pixel_values = pixel_values.to(dtype=model.dtype)
            target_device = getattr(model, 'device', None)
            if debug:
                rprint(f"[blue]🔧 TENSOR_DTYPE: Using model.dtype {model.dtype}[/blue]")
        else:
            # Fall back to parameter dtype
            reference_param = next(model.parameters())
            model_dtype = reference_param.dtype
            target_device = reference_param.device
            pixel_values = pixel_values.to(dtype=model_dtype)
            if debug:
                rprint(f"[blue]🔧 TENSOR_DTYPE: Using parameter dtype {model_dtype}[/blue]")
    except Exception:
        # Deliberate best-effort: any failure while probing the model falls back
        # to bfloat16 (the dtype the 2B model is loaded with) instead of aborting.
        pixel_values = pixel_values.to(dtype=torch.bfloat16)
        if debug:
            rprint("[blue]🔧 TENSOR_DTYPE: Using bfloat16 (fallback)[/blue]")

    # Place the tensor next to the weights it will be fed to. 'meta' devices
    # (placeholders for offloaded weights) are skipped because they hold no data.
    if isinstance(target_device, torch.device) and target_device.type != 'meta':
        if pixel_values.device != target_device:
            pixel_values = pixel_values.to(target_device)
            if debug:
                rprint(f"[blue]🔧 TENSOR_DEVICE: Moved to {target_device}[/blue]")

    if debug:
        rprint(f"[blue]📐 TENSOR_SHAPE: {pixel_values.shape} (batch_size={pixel_values.shape[0]} tiles)[/blue]")
        rprint(f"[blue]📊 TENSOR_DTYPE: {pixel_values.dtype}[/blue]")

    return pixel_values

# Status message printed once the preprocessing helpers above have been defined.
rprint("[green]✅ Image preprocessing functions defined[/green]")

## Load All Prompts

Loading prompts for:
- Document type detection
- Invoice extraction
- Receipt extraction
- Bank statement extraction (flat and grouped variants)

In [None]:
# Load all prompts from YAML files

# Parsed YAML documents keyed by path. Re-created on every run of this cell, so
# re-running the cell always picks up edits made to the prompt files.
_prompt_yaml_cache = {}


def _load_prompt_yaml(path):
    """Parse the prompt YAML at *path*, reading each distinct file only once."""
    if path not in _prompt_yaml_cache:
        with open(path, 'r', encoding='utf-8') as f:
            _prompt_yaml_cache[path] = yaml.safe_load(f)
    return _prompt_yaml_cache[path]


# Document type detection prompt
doctype_data = _load_prompt_yaml(CONFIG['PROMPT_FILE_DOCTYPE'])
DOCTYPE_PROMPT = doctype_data['prompts']['detection']['prompt']

# Extraction prompts. Each document type honours its own CONFIG entry; with the
# default configuration all three entries point at the same file, which is then
# parsed a single time thanks to the cache above.
internvl3_data = _load_prompt_yaml(CONFIG['PROMPT_FILE_INVOICE'])
INVOICE_PROMPT = internvl3_data['prompts']['invoice']['prompt']

receipt_data = _load_prompt_yaml(CONFIG['PROMPT_FILE_RECEIPT'])
RECEIPT_PROMPT = receipt_data['prompts']['receipt']['prompt']

bank_data = _load_prompt_yaml(CONFIG['PROMPT_FILE_BANK'])
BANK_PROMPTS = {
    'flat': bank_data['prompts']['bank_statement_flat']['prompt'],
    'date_grouped': bank_data['prompts']['bank_statement_date_grouped']['prompt']
}

# Bank statement structure classification prompt
STRUCTURE_CLASSIFICATION_PROMPT = """Look at how dates are displayed in this bank statement's transaction list.

Answer with ONLY one word:
- FLAT (if dates appear as the FIRST COLUMN in a table row, like: "05/05/2025 | Purchase | $22.50")
- GROUPED (if dates appear as SECTION HEADERS above transactions, like: "Thu 05 Sep 2025" followed by indented transaction details below)

The key difference: FLAT has dates IN the table columns, GROUPED has dates AS headers ABOVE the rows.

Answer (one word only):"""

rprint("[green]✅ All prompts loaded[/green]")
rprint(f"[cyan]  Document type detection: {len(DOCTYPE_PROMPT)} chars[/cyan]")
rprint(f"[cyan]  Invoice extraction: {len(INVOICE_PROMPT)} chars[/cyan]")
rprint(f"[cyan]  Receipt extraction: {len(RECEIPT_PROMPT)} chars[/cyan]")
rprint(f"[cyan]  Bank flat extraction: {len(BANK_PROMPTS['flat'])} chars[/cyan]")
rprint(f"[cyan]  Bank grouped extraction: {len(BANK_PROMPTS['date_grouped'])} chars[/cyan]")

## Multi-Turn Chat Function

In [None]:
def chat_with_internvl(model, tokenizer, prompt, pixel_values, history=None,
                       max_new_tokens=2000, do_sample=False, debug=None):
    """
    Run one conversational turn against InternVL3 and return the updated history.

    OFFICIAL PATTERN: the SAME pixel_values are passed on every turn together
    with return_history=True.
    Based on: https://internvl.readthedocs.io/en/latest/internvl3.0/quick_start.html

    Args:
        model: InternVL3 model exposing ``chat``
        tokenizer: InternVL3 tokenizer (its ``eos_token_id`` is used for padding)
        prompt: Text prompt for this turn
        pixel_values: Preprocessed image tensor (REUSED across turns)
        history: History returned by the previous turn, or None on the first turn
        max_new_tokens: Maximum tokens to generate
        do_sample: Sample (temperature 0.6, top_p 0.9) instead of decoding greedily
        debug: Show debug output (uses CONFIG['VERBOSE'] if None)

    Returns:
        Tuple of (response, updated_history)
    """
    verbose = CONFIG.get('VERBOSE', False) if debug is None else debug

    def _show_preview(text):
        # Print at most the first 500 characters of text between two rules.
        rule = "[dim]" + "="*80 + "[/dim]"
        clipped = text[:500] + ("..." if len(text) > 500 else "")
        rprint(rule)
        rprint(f"[dim]{clipped}[/dim]")
        rprint(rule)

    if verbose:
        rprint(f"[magenta]💭 Generating with max_new_tokens={max_new_tokens}[/magenta]")
        if CONFIG.get('SHOW_PROMPTS', False):
            rprint(f"[yellow]📝 Prompt ({len(prompt)} chars):[/yellow]")
            _show_preview(prompt)

    # Sampling knobs are only populated when sampling is requested.
    if do_sample:
        temperature, top_p = 0.6, 0.9
    else:
        temperature, top_p = None, None

    generation_config = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "do_sample": do_sample,
        "top_p": top_p,
        "pad_token_id": tokenizer.eos_token_id,
    }

    # return_history=True is what makes the exchange multi-turn: the returned
    # history is fed back in by the caller on the following turn.
    turn_output = model.chat(
        tokenizer,
        pixel_values,
        prompt,
        generation_config=generation_config,
        history=history,
        return_history=True,
    )
    response, updated_history = turn_output

    if verbose:
        rprint(f"[magenta]📄 Model response ({len(response)} chars):[/magenta]")
        if CONFIG.get('SHOW_PROMPTS', False):
            _show_preview(response)

    return response, updated_history

# Status messages printed once chat_with_internvl has been defined.
rprint("[green]✅ Multi-turn chat function defined[/green]")
rprint("[cyan]💡 Using official InternVL3 multi-turn pattern (return_history=True)[/cyan]")

## Parser Functions

Functions to parse VLM responses:
- Document type classification
- Bank statement structure classification
- Field extraction parsing

In [None]:
def parse_document_type(response):
    """Map a free-text model answer onto one of the supported document types.

    Keywords are matched case-insensitively in priority order; the first hit
    wins. Answers containing none of them are treated as INVOICE.

    Args:
        response: Raw text returned by the model.

    Returns:
        "INVOICE", "RECEIPT" or "BANK_STATEMENT".
    """
    normalized = response.strip().upper()
    keyword_to_label = (
        ("INVOICE", "INVOICE"),
        ("RECEIPT", "RECEIPT"),
        ("BANK", "BANK_STATEMENT"),
        ("STATEMENT", "BANK_STATEMENT"),
    )
    for keyword, label in keyword_to_label:
        if keyword in normalized:
            return label
    return "INVOICE"  # Default fallback

def parse_structure_type(response):
    """Map a free-text model answer onto a bank-statement layout key.

    "FLAT" takes precedence over "GROUPED"/"DATE"; answers containing none of
    these keywords are treated as flat.

    Args:
        response: Raw text returned by the model.

    Returns:
        "flat" or "date_grouped".
    """
    normalized = response.strip().upper()
    if "FLAT" in normalized:
        return "flat"
    if any(keyword in normalized for keyword in ("GROUPED", "DATE")):
        return "date_grouped"
    return "flat"  # Default fallback

def parse_extraction(extraction_text):
    """Turn ``NAME: value`` lines of a model answer into a dictionary.

    Lines without a colon and lines starting with ``#`` are ignored. Each
    remaining line is split at its first colon; an empty value is stored as
    'NOT_FOUND'. When a name repeats, the last occurrence wins.

    Args:
        extraction_text: Raw multi-line text returned by the model.

    Returns:
        Dict mapping field name to field value (both stripped of whitespace).
    """
    fields = {}

    for raw_line in extraction_text.split('\n'):
        entry = raw_line.strip()
        if entry.startswith('#') or ':' not in entry:
            continue
        name, _, value = entry.partition(':')
        fields[name.strip()] = value.strip() or 'NOT_FOUND'

    return fields

# Status message printed once the three parser helpers above have been defined.
rprint("[green]✅ Parser functions defined[/green]")

## Discover Images

In [None]:
# Discover all images (no filtering by document type)
# Only top-level *.png files of DATA_DIR are collected, in sorted path order.
data_dir = Path(CONFIG['DATA_DIR'])
image_files = list(data_dir.glob("*.png"))
image_files.sort()

image_total = len(image_files)
rprint(f"[green]✅ Found {image_total} images to process[/green]")

rprint("\n[bold blue]Images to process:[/bold blue]")
for img in image_files:
    file_label = img.name
    rprint(f"[cyan]  - {file_label}[/cyan]")

## Multi-Stage Batch Processing

**Explicit multi-stage processing** with true multi-turn conversations:
- **Stage 0**: Document Type Classification (INVOICE/RECEIPT/BANK_STATEMENT)
- **Stage 1**: Structure Classification (for BANK_STATEMENT only: FLAT/GROUPED)
- **Stage 2**: Document-Type-Aware Extraction (using appropriate prompt)

Each image maintains a conversation history across all stages.

In [None]:
# Multi-stage adaptive extraction with TRUE multi-turn chat (official pattern)
# Accumulators filled by the loop below and read by the later save/summary cells.
results = []
processing_times = []
doctype_counts = {'INVOICE': 0, 'RECEIPT': 0, 'BANK_STATEMENT': 0}
structure_counts = {'flat': 0, 'date_grouped': 0}

rprint("\n[bold green]🚀 Starting multi-stage adaptive extraction...[/bold green]\n")

for idx, image_path in enumerate(track(image_files, description="Processing images"), 1):
    image_name = image_path.name
    # Bound before the try so the cleanup in `finally` can delete it unconditionally.
    pixel_values = None

    try:
        start_time = time.time()

        # Load image ONCE for all stages (official pattern)
        pixel_values = load_image(str(image_path))

        # Initialize conversation history (official pattern: history=None for first turn)
        history = None

        # ===================================================================
        # STAGE 0: Document Type Classification (Turn 1)
        # ===================================================================
        if CONFIG['VERBOSE']:
            rprint(f"\n[bold blue]Processing [{idx}/{len(image_files)}]: {image_name}[/bold blue]")
            rprint("[dim]Stage 0: Document type detection...[/dim]")

        doctype_answer, history = chat_with_internvl(
            model, tokenizer, DOCTYPE_PROMPT, pixel_values, history,
            max_new_tokens=CONFIG['MAX_NEW_TOKENS_DOCTYPE']
        )

        # Parse document type
        document_type = parse_document_type(doctype_answer)
        doctype_counts[document_type] += 1

        # ===================================================================
        # STAGE 1: Structure Classification (Turn 2 - only for BANK_STATEMENT)
        # ===================================================================
        structure_type = "N/A"
        structure_answer = "N/A"

        if document_type == "BANK_STATEMENT":
            if CONFIG['VERBOSE']:
                rprint("[dim]Stage 1: Bank statement structure classification...[/dim]")

            structure_answer, history = chat_with_internvl(
                model, tokenizer, STRUCTURE_CLASSIFICATION_PROMPT, pixel_values, history,
                max_new_tokens=CONFIG['MAX_NEW_TOKENS_STRUCTURE']
            )

            structure_type = parse_structure_type(structure_answer)
            structure_counts[structure_type] += 1
            extraction_prompt = BANK_PROMPTS[structure_type]
            prompt_key = f"internvl3_bank_statement_{structure_type}"

        elif document_type == "INVOICE":
            extraction_prompt = INVOICE_PROMPT
            prompt_key = "internvl3_invoice"

        elif document_type == "RECEIPT":
            extraction_prompt = RECEIPT_PROMPT
            prompt_key = "internvl3_receipt"

        else:
            # Without this guard an unexpected type would silently reuse the
            # prompt chosen for the previous image.
            raise ValueError(f"Unsupported document type: {document_type}")

        # ===================================================================
        # STAGE 2: Document-Type-Aware Extraction (Turn 2/3)
        # ===================================================================
        if CONFIG['VERBOSE']:
            rprint(f"[dim]Stage 2: Extraction using {prompt_key}...[/dim]")

        extraction_result, history = chat_with_internvl(
            model, tokenizer, extraction_prompt, pixel_values, history,
            max_new_tokens=CONFIG['MAX_NEW_TOKENS_EXTRACT']
        )

        # Parse extraction
        extracted_fields = parse_extraction(extraction_result)

        # Store results: metadata columns first, extracted fields after them.
        result = {
            'image_file': image_name,
            'document_type': document_type,
            'structure_type': structure_type,
            'prompt_used': prompt_key,
            'doctype_classification': doctype_answer.strip(),
            'structure_classification': structure_answer.strip() if isinstance(structure_answer, str) else structure_answer,
            'extraction_raw': extraction_result,
        }
        # Field names come from model output, so they must never overwrite a
        # metadata column or masquerade as the 'error' marker that the summary
        # cell uses to count failures. A colliding field is dropped here but
        # remains available in 'extraction_raw'.
        reserved_keys = set(result) | {'error'}
        for field_name, field_value in extracted_fields.items():
            if field_name not in reserved_keys:
                result[field_name] = field_value
        results.append(result)

        processing_time = time.time() - start_time
        processing_times.append(processing_time)

        structure_display = structure_type if structure_type != 'N/A' else 'direct'
        rprint(f"[green]✅ {image_name}: {document_type} ({structure_display}) - {processing_time:.2f}s[/green]")

    except Exception as e:
        # Top-level boundary for one image: record the failure and keep going.
        error_msg = f"{type(e).__name__}: {str(e)}"
        rprint(f"[red]❌ {image_name}: Error - {error_msg}[/red]")
        
        # Print full traceback for debugging
        if CONFIG.get('VERBOSE', True):
            rprint("[yellow]Full traceback:[/yellow]")
            traceback.print_exc()
        
        results.append({
            'image_file': image_name,
            'document_type': 'ERROR',
            'structure_type': 'ERROR',
            'error': error_msg
        })
        processing_times.append(0)

    finally:
        # Memory cleanup after each image: drop the tensor reference first so
        # the cache flush below can actually release it.
        del pixel_values

        # Clear GPU cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Periodic garbage collection every 3 images
        if idx % 3 == 0:
            gc.collect()

console.rule("[bold green]Batch Processing Complete[/bold green]")

# Display summary statistics
rprint("\n[bold blue]📊 Document Type Classification Summary:[/bold blue]")
rprint(f"[cyan]  Invoices: {doctype_counts['INVOICE']}[/cyan]")
rprint(f"[cyan]  Receipts: {doctype_counts['RECEIPT']}[/cyan]")
rprint(f"[cyan]  Bank Statements: {doctype_counts['BANK_STATEMENT']}[/cyan]")

if doctype_counts['BANK_STATEMENT'] > 0:
    rprint("\n[bold blue]📊 Bank Statement Structure Summary:[/bold blue]")
    rprint(f"[cyan]  Flat table: {structure_counts['flat']}[/cyan]")
    rprint(f"[cyan]  Date-grouped: {structure_counts['date_grouped']}[/cyan]")

## Save Results (Llama-Compatible Format)

In [None]:
# Convert results to DataFrame
df = pd.DataFrame(results)

# Save to CSV (compatible with model_comparison.ipynb)
csv_output = csv_dir / f"internvl3_pure_adaptive_results_{TIMESTAMP}.csv"
df.to_csv(csv_output, index=False)

row_total, column_total = len(df), len(df.columns)
rprint(f"[green]✅ CSV saved to: {csv_output}[/green]")
rprint(f"[cyan]  Rows: {row_total}[/cyan]")
rprint(f"[cyan]  Columns: {column_total}[/cyan]")

# Show column names to verify Llama-compatible structure
rprint("\n[bold blue]📋 CSV Columns (Llama-compatible):[/bold blue]")
core_cols = ['image_file', 'document_type', 'structure_type', 'prompt_used',
             'doctype_classification', 'structure_classification', 'extraction_raw']
rprint(f"[cyan]Core columns: {', '.join(core_cols)}[/cyan]")

# Everything that is neither a core column nor the error marker is an extracted field.
non_field_cols = {*core_cols, 'error'}
field_cols = [col for col in df.columns if col not in non_field_cols]
field_preview = ', '.join(field_cols[:5])
preview_suffix = '...' if len(field_cols) > 5 else ''
rprint(f"[cyan]Field columns ({len(field_cols)}): {field_preview}{preview_suffix}[/cyan]")

# Save detailed JSON results next to the CSV, under the same timestamp.
json_output = csv_dir / f"internvl3_pure_adaptive_results_{TIMESTAMP}.json"
json_output.write_text(json.dumps(results, indent=2))

rprint(f"[green]✅ JSON saved to: {json_output}[/green]")

## Display Sample Results

In [None]:
# Display sample results
console.rule("[bold blue]Sample Results[/bold blue]")

display_cols = ['image_file', 'document_type', 'structure_type', 'prompt_used']
# reindex (rather than df[display_cols]) tolerates missing columns: when every
# image failed, or none were found, 'prompt_used' is absent and plain column
# selection would raise KeyError. Missing columns are shown as NaN instead.
rprint(df.reindex(columns=display_cols).to_string(index=False))

## Summary Statistics

In [None]:
# Overall run summary: totals, classification counts, prompt usage, field coverage.
print("\n📊 DOCUMENT-TYPE-AWARE ADAPTIVE EXTRACTION SUMMARY")
print("="*80)
print(f"Total images processed: {len(results)}")
print(f"Successful extractions: {len([r for r in results if 'error' not in r])}")
print(f"Errors: {len([r for r in results if 'error' in r])}")

print("\nDocument Type Classification:")
print(f"  Invoices: {doctype_counts['INVOICE']}")
print(f"  Receipts: {doctype_counts['RECEIPT']}")
print(f"  Bank Statements: {doctype_counts['BANK_STATEMENT']}")

if doctype_counts['BANK_STATEMENT'] > 0:
    print("\nBank Statement Structure Classification:")
    print(f"  Flat table format: {structure_counts['flat']}")
    print(f"  Date-grouped format: {structure_counts['date_grouped']}")

# Tally how often each extraction prompt was used (error records carry none).
print("\nPrompts Used:")
prompt_usage = {}
for result in results:
    if 'prompt_used' in result:
        prompt = result['prompt_used']
        prompt_usage[prompt] = prompt_usage.get(prompt, 0) + 1

for prompt, count in sorted(prompt_usage.items()):
    print(f"  {prompt}: {count}")

print("="*80)

# Field extraction statistics
if len(df) > 0:
    field_cols = [col for col in df.columns if col not in [
        'image_file', 'document_type', 'structure_type', 'prompt_used',
        'doctype_classification', 'structure_classification', 'extraction_raw', 'error'
    ]]

    if field_cols:
        print("\n📈 Field Extraction Coverage:")
        for field in field_cols:
            column = df[field]
            # A value counts as found only when it is present AND is not the
            # 'NOT_FOUND' sentinel that parse_extraction stores for empty values;
            # counting the sentinel would inflate coverage.
            found_count = int((column.notna() & (column != 'NOT_FOUND')).sum())
            coverage = (found_count / len(df)) * 100
            print(f"  {field}: {found_count}/{len(df)} ({coverage:.1f}%)")

## View Individual Extraction

Change `image_to_view` to view detailed extraction for a specific image:

In [None]:
# View detailed extraction for specific image
image_to_view = "image_003.png"  # Change this

# First record whose file name matches, or None when the image was not processed.
result = next((r for r in results if r['image_file'] == image_to_view), None)

if result:
    print(f"\n🔍 Detailed Extraction: {image_to_view}")
    print("="*80)
    print(f"Document Type: {result['document_type']}")
    print(f"Structure Type: {result['structure_type']}")
    # Error records carry no 'prompt_used' key, so plain indexing would raise KeyError.
    print(f"Prompt Used: {result.get('prompt_used', 'N/A')}")
    if 'error' in result:
        # Surface the stored failure instead of showing only 'N/A' placeholders.
        print(f"Error: {result['error']}")
    print("\nDocument Type Classification Response:")
    print(result.get('doctype_classification', 'N/A'))
    print("\nStructure Classification Response:")
    print(result.get('structure_classification', 'N/A'))
    print("\nExtraction Result:")
    extraction_display = result.get('extraction_raw', 'N/A')
    # Keep the cell output readable for long raw answers.
    if len(extraction_display) > 1000:
        extraction_display = extraction_display[:1000] + "\n...[truncated]..."
    print(extraction_display)
    print("="*80)
else:
    print(f"Image {image_to_view} not found in results")