# InternVL3 Simple Key-Value Extraction

**Purpose:** A simplified single-image extraction demo for the InternVL3 vision-language model.

This stripped-down notebook:
- Loads a single image
- Displays the image
- Runs InternVL3 extraction
- Shows the extracted key-value pairs

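It is well suited to quick testing and to understanding the extraction process. The later cells are optional: a `quick_extract` helper for trying other images, and a batch run that processes every `.jpeg` file in the data directory and writes the results to CSV.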

In [None]:
# ============================================================================
# MINIMAL IMPORTS WITH COMPREHENSIVE FIELD METADATA
# ============================================================================

import sys
import warnings
from pathlib import Path

from IPython.display import Image as IPImage
from IPython.display import Markdown, display
from PIL import Image

# Add parent directory to Python path so the project packages (`common`,
# `models`) can be imported. The membership check keeps re-runs of this cell
# from stacking duplicate entries onto sys.path.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Import essential modules with comprehensive field metadata
from common.config import (
    DATA_DIR,
    DATE_FIELDS,
    EXTRACTION_FIELDS,
    FIELD_COUNT,
    FIELD_DESCRIPTIONS,
    # Field metadata - single source of truth
    FIELD_INSTRUCTIONS,
    FIELD_TYPES,
    INTERNVL3_MODEL_PATH,
    # Field groupings by type
    MONETARY_FIELDS,
    OUTPUT_DIR,
    TEXT_FIELDS,
)
from models.internvl3_processor import InternVL3Processor

# Silence all warnings so the notebook output stays readable
warnings.filterwarnings(action="ignore")

# Report what was loaded from the shared configuration: the total number of
# extraction fields and how many belong to each typed group.
for status_line in (
    "✅ Simple InternVL3 Key-Value Extractor Ready",
    f"📋 Will extract {FIELD_COUNT} fields from business documents",
    f"💰 Monetary fields: {len(MONETARY_FIELDS)}",
    f"📅 Date fields: {len(DATE_FIELDS)}",
    f"🔤 Text fields: {len(TEXT_FIELDS)}",
    "✨ All configurations loaded from schema - Phase 2 complete!",
):
    print(status_line)

In [None]:
# ============================================================================
# INITIALIZE INTERNVL3 MODEL
# ============================================================================

print("🚀 Initializing InternVL3 Model...")
print("=" * 50)

# Build the processor from the configured model path; later cells call
# `processor.process_single_image(...)` on it.
processor = InternVL3Processor(model_path=INTERNVL3_MODEL_PATH)

print("\n✅ Model loaded successfully!")
print(f"📍 Model path: {INTERNVL3_MODEL_PATH}")
print(f"🎯 Ready to extract {FIELD_COUNT} business fields")

# Show a short preview of the extraction fields: the first `preview_limit`
# names, followed by a count of the remainder.
preview_limit = 5
print("\n📋 Fields to extract:")
for i, field in enumerate(EXTRACTION_FIELDS, 1):
    print(f"  {i:2d}. {field}")
    if i == preview_limit:
        remaining = FIELD_COUNT - preview_limit
        # Only mention the remainder when there is one; otherwise a schema
        # with exactly five fields would report "... and 0 more fields".
        if remaining > 0:
            print(f"  ... and {remaining} more fields")
        break

In [None]:
# ============================================================================
# SELECT IMAGE TO PROCESS
# ============================================================================

# Document to process: the first synthetic invoice by default.
# CHANGE THIS PATH to test different images, for example:
# image_path = Path("/path/to/your/image.jpeg")
image_path = Path(DATA_DIR) / "synthetic_invoice_001.jpeg"

if image_path.exists():
    # Confirm the selection
    print(f"✅ Selected image: {image_path.name}")
    print(f"📁 Full path: {image_path}")
else:
    # Report the problem and list up to five .jpeg files found in DATA_DIR
    print(f"❌ Image not found: {image_path}")
    print(f"\n📁 Available images in {DATA_DIR}:")
    candidates = sorted(Path(DATA_DIR).glob("*.jpeg"))
    for img in candidates[:5]:
        print(f"   - {img.name}")

In [None]:
# ============================================================================
# DISPLAY THE IMAGE
# ============================================================================

if image_path.exists():
    # Open with PIL only to read metadata. The context manager closes the
    # underlying file handle as soon as the block exits, instead of leaving
    # it open for the lifetime of the kernel.
    with Image.open(image_path) as pil_image:
        width, height = pil_image.size

        print(f"📐 Image dimensions: {width} x {height} pixels")
        print(f"📄 Image type: {pil_image.format}")
        print(f"🎨 Image mode: {pil_image.mode}")

    # Render the file itself through IPython (independent of the PIL handle)
    print("\n🖼️ Document Image:")
    display(IPImage(str(image_path), width=600))  # Adjust width as needed
else:
    print("❌ Cannot display - image file not found")

In [None]:
# ============================================================================
# RUN EXTRACTION
# ============================================================================

print("🔍 Running InternVL3 Extraction...", "=" * 50, sep="\n")

# Run the processor on the selected image (passed as a string path)
result = processor.process_single_image(str(image_path))

# Unpack the parts of the result dictionary that this and later cells use
extracted_data = result["extracted_data"]
processing_time = result["processing_time"]
extracted_count = result["extracted_fields_count"]
raw_response = result["raw_response"]

# Short run report: elapsed time, populated-field count, raw response size
for report_line in (
    f"\n✅ Extraction completed in {processing_time:.2f} seconds",
    f"📊 Extracted {extracted_count}/{FIELD_COUNT} fields with values",
    f"📝 Response length: {len(raw_response)} characters",
):
    print(report_line)

In [None]:
# ============================================================================
# DISPLAY EXTRACTED KEY-VALUE PAIRS WITH FIELD METADATA
# ============================================================================

def _brief_description(field, limit=40):
    """Return FIELD_DESCRIPTIONS[field], cut to `limit` characters plus "..." when longer."""
    text = FIELD_DESCRIPTIONS[field]
    return text[:limit] + "..." if len(text) > limit else text


def _categorize_fields(names):
    """Split field names into display categories.

    Returns a list of ``(icon, label, members)`` tuples in the fixed order
    monetary, date, text, other. A name is placed in every typed category
    whose field list contains it; "other" collects the names found in none
    of them. Input order is preserved within each category.
    """
    typed = (
        ("💰", "MONETARY", MONETARY_FIELDS),
        ("📅", "DATE", DATE_FIELDS),
        ("🔤", "TEXT", TEXT_FIELDS),
    )
    grouped = [
        (icon, label, [name for name in names if name in members])
        for icon, label, members in typed
    ]
    untyped = [
        name
        for name in names
        if not any(name in members for _, _, members in typed)
    ]
    grouped.append(("🔍", "OTHER", untyped))
    return grouped


def _print_detailed_group(icon, heading, names, values=None):
    """Print one category section: heading, rule, then two lines per field.

    With `values` (a mapping of field name to extracted value) each field is
    shown with its value; with `values=None` the fields are reported as
    missing ("N/A"). Nothing is printed for an empty category.
    """
    if not names:
        return
    print(f"{icon} {heading}:")
    print("-" * 60)
    for name in names:
        # Resolve the metadata before printing anything for this field
        field_type = FIELD_TYPES[name]
        description = _brief_description(name)
        if values is None:
            print(f"❌ {name:<20} : N/A")
        else:
            print(f"{icon} {name:<20} : {values[name]}")
        print(f"   📝 {field_type:<15} | {description}")
        print()


display(Markdown("## 📋 Extracted Key-Value Pairs"))

# Partition the schema fields into extracted / missing and tally per field type
extracted_fields = {}
missing_fields = []
field_type_stats = {}

for field in EXTRACTION_FIELDS:
    value = extracted_data.get(field, "N/A")
    # A field counts as extracted when it is neither the "N/A" placeholder
    # nor an empty/falsy value.
    has_value = bool(value != "N/A" and value)
    if has_value:
        extracted_fields[field] = value
    else:
        missing_fields.append(field)

    # Track field type statistics
    stats = field_type_stats.setdefault(
        FIELD_TYPES[field], {"extracted": 0, "total": 0}
    )
    stats["total"] += 1
    if has_value:
        stats["extracted"] += 1

# Display extracted fields grouped by type
if extracted_fields:
    display(Markdown("### ✅ Successfully Extracted Fields:"))
    for icon, label, names in _categorize_fields(extracted_fields):
        _print_detailed_group(icon, f"{label} FIELDS", names, extracted_fields)

# Display missing fields grouped by type
if missing_fields:
    display(Markdown("### ❌ Fields Not Found:"))
    for icon, label, names in _categorize_fields(missing_fields):
        _print_detailed_group(icon, f"MISSING {label} FIELDS", names)

# Summary statistics with field type breakdown
display(Markdown("### 📊 Extraction Summary:"))
# Guard the division so an empty schema reports 0.0% instead of raising
extraction_rate = (len(extracted_fields) / FIELD_COUNT) * 100 if FIELD_COUNT else 0.0
print(f"Total fields expected:    {FIELD_COUNT}")
print(f"Fields with values:       {len(extracted_fields)}")
print(f"Missing fields:           {len(missing_fields)}")
print(f"Extraction rate:          {extraction_rate:.1f}%")
print(f"Processing time:          {processing_time:.2f} seconds")

print("\n📈 Field Type Performance:")
print("-" * 50)
for field_type, stats in field_type_stats.items():
    rate = (stats["extracted"] / stats["total"]) * 100 if stats["total"] > 0 else 0
    print(
        f"{field_type.upper():<15} : {stats['extracted']}/{stats['total']} ({rate:.1f}%)"
    )

In [None]:
# ============================================================================
# OPTIONAL: VIEW RAW RESPONSE
# ============================================================================

# Uncomment the lines below to see the raw model response

# display(Markdown("## 📝 Raw Model Response"))
# print("Raw output from InternVL3:")
# print("=" * 50)
# print(raw_response)
# print("=" * 50)

In [None]:
# ============================================================================
# QUICK TEST WITH DIFFERENT IMAGE (USING DYNAMIC FIELD METADATA)
# ============================================================================

# This cell allows you to quickly test another image without re-running everything


def _quick_field_groups(names):
    """Split field names into ``(icon, label, members)`` display categories.

    Categories come back in the fixed order monetary, date, text, other.
    A name appears in every typed category whose field list contains it;
    "other" holds the names found in none of them.
    """
    typed = (
        ("💰", "MONETARY", MONETARY_FIELDS),
        ("📅", "DATE", DATE_FIELDS),
        ("🔤", "TEXT", TEXT_FIELDS),
    )
    grouped = [
        (icon, label, [name for name in names if name in members])
        for icon, label, members in typed
    ]
    untyped = [
        name
        for name in names
        if not any(name in members for _, _, members in typed)
    ]
    grouped.append(("🔍", "OTHER", untyped))
    return grouped


def quick_extract(image_file):
    """Run extraction on one image and print a compact, type-grouped report.

    Args:
        image_file: Path (str or Path) of the image to process.

    Returns:
        The ``extracted_data`` mapping from the processor result, or ``None``
        when `image_file` does not exist (a message is printed instead).
    """
    print(f"\n🔍 Processing: {Path(image_file).name}")
    print("=" * 50)

    # Check if file exists
    if not Path(image_file).exists():
        print(f"❌ File not found: {image_file}")
        return None

    # Process image
    result = processor.process_single_image(str(image_file))

    # Extract key information
    extracted_data = result["extracted_data"]
    processing_time = result["processing_time"]
    extracted_count = result["extracted_fields_count"]

    print(f"\n✅ Extraction completed in {processing_time:.2f} seconds")
    print(f"📊 Extracted {extracted_count}/{FIELD_COUNT} fields with values\n")

    # Partition the schema fields into extracted / missing and tally per type
    extracted_fields = {}
    missing_fields = []
    field_type_stats = {}

    for field in EXTRACTION_FIELDS:
        value = extracted_data.get(field, "N/A")
        # Extracted means: not the "N/A" placeholder and not empty/falsy
        has_value = bool(value != "N/A" and value)
        if has_value:
            extracted_fields[field] = value
        else:
            missing_fields.append(field)

        stats = field_type_stats.setdefault(
            FIELD_TYPES[field], {"extracted": 0, "total": 0}
        )
        stats["total"] += 1
        if has_value:
            stats["extracted"] += 1

    # Display extracted fields grouped by type
    if extracted_fields:
        print("✅ Successfully Extracted Fields:")
        print("-" * 60)
        for position, (icon, label, names) in enumerate(
            _quick_field_groups(extracted_fields)
        ):
            if not names:
                continue
            # Only the first (monetary) heading is printed without a leading
            # blank line, matching the section layout of the report.
            lead = "" if position == 0 else "\n"
            print(f"{lead}{icon} {label}:")
            for name in names:
                print(
                    f"{icon} {name:<20} : {extracted_fields[name]} ({FIELD_TYPES[name]})"
                )

    # Display missing fields by type
    if missing_fields:
        print("\n❌ Fields Not Found:")
        print("-" * 60)
        for position, (icon, label, names) in enumerate(
            _quick_field_groups(missing_fields)
        ):
            if not names:
                continue
            lead = "" if position == 0 else "\n"
            print(f"{lead}{icon} MISSING {label}:")
            for name in names:
                print(f"❌ {name:<20} : N/A ({FIELD_TYPES[name]})")

    # Summary statistics; the guard reports 0.0% for an empty schema instead
    # of raising ZeroDivisionError.
    extraction_rate = (
        (len(extracted_fields) / FIELD_COUNT) * 100 if FIELD_COUNT else 0.0
    )
    print("\n📊 Extraction Summary:")
    print("-" * 60)
    print(f"Total fields expected:    {FIELD_COUNT}")
    print(f"Fields with values:       {len(extracted_fields)}")
    print(f"Missing fields:           {len(missing_fields)}")
    print(f"Extraction rate:          {extraction_rate:.1f}%")
    print(f"Processing time:          {processing_time:.2f} seconds")

    # Field type performance
    print("\n📈 Field Type Performance:")
    for field_type, stats in field_type_stats.items():
        rate = (stats["extracted"] / stats["total"]) * 100 if stats["total"] > 0 else 0
        print(
            f"   {field_type.upper():<12} : {stats['extracted']}/{stats['total']} ({rate:.1f}%)"
        )

    return extracted_data


# Example: Test with another invoice
# Uncomment and modify the path below to test
# test_result = quick_extract(Path(DATA_DIR) / "synthetic_invoice_002.jpeg")

In [None]:
# ============================================================================
# BATCH PROCESS ALL JPEGS AND CREATE CSV (WITH VRAM MANAGEMENT & FIELD METADATA)
# ============================================================================

import gc
from datetime import datetime

import pandas as pd
import torch

# Cell banner: title line followed by a 70-character rule
print(
    "🚀 Batch Processing All JPEG Images with VRAM Management & Field Metadata",
    "=" * 70,
    sep="\n",
)

# Import GPU optimization utilities
from common.gpu_optimization import (
    clear_model_caches,
    comprehensive_memory_cleanup,
    handle_memory_fragmentation,
)

# Find all JPEG files in DATA_DIR (only the ".jpeg" extension is matched)
jpeg_files = sorted(Path(DATA_DIR).glob("*.jpeg"))
print(f"📁 Found {len(jpeg_files)} JPEG files to process")

# Determine batch size based on model: the path is checked for the "8B" tag
is_8b = "8B" in str(INTERNVL3_MODEL_PATH)
batch_size = 1 if is_8b else 2  # Conservative batch sizes for V100
print(f"🎯 Using batch size: {batch_size} (Model: {'8B' if is_8b else '2B'})")
print("⚡ VRAM optimization: ENABLED")
print(f"📋 Field configuration: {FIELD_COUNT} extraction fields")
print()

# Alphabetical field order is shared by the CSV columns, the error rows and
# the reports below, so it is computed once.
field_order = sorted(EXTRACTION_FIELDS)
# Values that mean "nothing was extracted" for a field
missing_markers = ("N/A", "", None)

if jpeg_files:
    # One CSV row (dict) per image, plus extraction tallies per field type
    all_results = []
    global_field_type_stats = {
        field_type: {"extracted": 0, "total": 0}
        for field_type in set(FIELD_TYPES.values())
    }

    # Process images in batches for better memory management
    for batch_idx in range(0, len(jpeg_files), batch_size):
        batch_end = min(batch_idx + batch_size, len(jpeg_files))
        batch_files = jpeg_files[batch_idx:batch_end]
        batch_number = batch_idx // batch_size + 1

        print(
            f"\n[Batch {batch_number}] Processing images {batch_idx + 1}-{batch_end} of {len(jpeg_files)}"
        )

        # Pre-batch memory cleanup
        if batch_idx > 0:  # Skip cleanup before first batch
            # Project helper; its exact behavior is defined in
            # common.gpu_optimization (not visible here).
            handle_memory_fragmentation(threshold_gb=1.0, aggressive=True)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

        # Process each image in the batch
        for image_file in batch_files:
            print(f"  📄 Processing: {image_file.name}")

            try:
                result = processor.process_single_image(str(image_file))
                extracted_data = result["extracted_data"]

                # Row starts with the image name, then every schema field in
                # alphabetical order.
                row = {"image_name": image_file.name}
                # Counted over the schema fields only, so the figure printed
                # below always agrees with the per-type totals and can never
                # exceed FIELD_COUNT because of extra keys in extracted_data.
                extracted_count = 0

                for field in field_order:
                    value = extracted_data.get(field, "N/A")
                    is_missing = value in missing_markers
                    # Store "NOT_FOUND" rather than "N/A" to prevent pandas
                    # from converting the placeholder to NaN.
                    row[field] = "NOT_FOUND" if is_missing else value

                    # Update global statistics by field type
                    type_stats = global_field_type_stats[FIELD_TYPES[field]]
                    type_stats["total"] += 1
                    if not is_missing:
                        type_stats["extracted"] += 1
                        extracted_count += 1

                all_results.append(row)

                # Show quick stats
                print(f"     ✅ Extracted {extracted_count}/{FIELD_COUNT} fields")

            except torch.cuda.OutOfMemoryError:
                print("     ⚠️ OOM Error - Attempting recovery...")

                # Emergency cleanup
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()
                clear_model_caches(processor.model, processor.tokenizer)
                handle_memory_fragmentation(threshold_gb=0.5, aggressive=True)
                gc.collect()

                # Record the image with every field marked ERROR_OOM so the
                # reports below can tell it apart from genuinely missing data
                all_results.append(
                    {
                        "image_name": image_file.name,
                        **dict.fromkeys(field_order, "ERROR_OOM"),
                    }
                )

            except Exception as e:
                print(f"     ❌ Error: {e}")
                # NOTE(review): other failures are recorded as all NOT_FOUND,
                # which makes them indistinguishable from images where nothing
                # was extracted and lowers the per-field success rates below.
                all_results.append(
                    {
                        "image_name": image_file.name,
                        **dict.fromkeys(field_order, "NOT_FOUND"),
                    }
                )

        # Post-batch cleanup - CRITICAL for V100
        if torch.cuda.is_available():
            # Comprehensive cleanup after each batch
            comprehensive_memory_cleanup(processor.model, processor.tokenizer)

            # Extra aggressive cleanup for 8B model
            if is_8b:
                handle_memory_fragmentation(threshold_gb=0.5, aggressive=True)
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
                print(f"  🧹 Aggressive memory cleanup after batch {batch_number}")
            else:
                print(f"  🧹 Memory cleanup after batch {batch_number}")

        # Force garbage collection
        gc.collect()

    # Create DataFrame with alphabetical field ordering
    print("\n📊 Creating DataFrame with alphabetical field ordering...")
    df = pd.DataFrame(all_results)

    # Ensure columns are in correct order: image_name first, then all fields alphabetically
    column_order = ["image_name"] + field_order
    df = df[column_order]

    # Generate timestamp for unique filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f"internvl3_extraction_results_{timestamp}.csv"
    csv_path = Path(OUTPUT_DIR) / csv_filename

    # Ensure output directory exists
    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

    # Save main CSV; any remaining NaN cells are also written as NOT_FOUND
    df.to_csv(csv_path, index=False, na_rep="NOT_FOUND")

    # Create field metadata CSV for reference
    field_metadata = [
        {
            "field_name": field,
            "field_type": FIELD_TYPES[field],
            "description": FIELD_DESCRIPTIONS[field],
            "instruction": FIELD_INSTRUCTIONS[field],
        }
        for field in field_order
    ]

    metadata_df = pd.DataFrame(field_metadata)
    metadata_csv_path = Path(OUTPUT_DIR) / f"field_metadata_{timestamp}.csv"
    metadata_df.to_csv(metadata_csv_path, index=False)

    print("\n💾 Files saved:")
    print(f"   📊 Main results: {csv_filename}")
    print(f"   📋 Field metadata: field_metadata_{timestamp}.csv")
    print("   📝 Missing values shown as: NOT_FOUND (prevents pandas NaN conversion)")

    # Summary statistics with field metadata
    print("\n📈 Comprehensive Extraction Analysis:")
    print("=" * 70)
    print(f"Total images processed: {len(jpeg_files)}")
    print(
        f"Total columns in CSV: {len(df.columns)} (1 image_name + {FIELD_COUNT} fields)"
    )

    # Count OOM errors if any: rows containing at least one ERROR_OOM cell
    oom_count = df.apply(lambda row: (row == "ERROR_OOM").any(), axis=1).sum()
    if oom_count > 0:
        print(f"⚠️ OOM errors encountered: {oom_count} images")

    # Field type performance analysis (covers successfully processed images only)
    print("\n📊 Field Type Performance Analysis:")
    print("-" * 70)
    for field_type, stats in global_field_type_stats.items():
        if stats["total"] > 0:
            rate = (stats["extracted"] / stats["total"]) * 100
            print(
                f"{field_type.upper():<15} : {stats['extracted']:,}/{stats['total']:,} fields ({rate:.1f}%)"
            )

    # Individual field performance: success rate over rows that did not hit OOM
    individual_field_stats = {}
    for field in field_order:
        valid_rows = df[field].apply(lambda x: x not in ["NOT_FOUND", "ERROR_OOM", ""])
        extracted = valid_rows.sum()
        total_valid = df[field].apply(lambda x: x != "ERROR_OOM").sum()
        if total_valid > 0:
            individual_field_stats[field] = (extracted / total_valid) * 100
        else:
            individual_field_stats[field] = 0

    # Highest success rate first
    sorted_fields = sorted(
        individual_field_stats.items(), key=lambda x: x[1], reverse=True
    )

    print("\n🏆 Top 5 Best Performing Fields:")
    print("-" * 70)
    for field, rate in sorted_fields[:5]:
        field_type = FIELD_TYPES[field]
        print(f"   {field:<25} : {rate:.1f}% success | {field_type}")

    # The last five entries of the descending list, so the lowest rate is printed last
    print("\n⚠️ Top 5 Most Challenging Fields:")
    print("-" * 70)
    for field, rate in sorted_fields[-5:]:
        field_type = FIELD_TYPES[field]
        print(f"   {field:<25} : {rate:.1f}% success | {field_type}")

    # Display CSV structure
    print("\n📋 CSV Structure:")
    print("-" * 70)
    print("Column order: image_name + all fields in alphabetical order")
    print(f"Fields (alphabetical): {', '.join(field_order[:5])}...")
    print("Missing values: NOT_FOUND (readable, won't convert to NaN)")

    # Final memory cleanup
    if torch.cuda.is_available():
        print("\n🧹 Final memory cleanup...")
        comprehensive_memory_cleanup(processor.model, processor.tokenizer)
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

        # Device 0 total memory minus what PyTorch currently has allocated.
        # NOTE(review): this is an estimate; it presumably does not account
        # for cached-but-unused blocks or other processes on the GPU.
        free_memory = (
            torch.cuda.get_device_properties(0).total_memory
            - torch.cuda.memory_allocated()
        )
        print(f"✅ Free VRAM after processing: {free_memory / 1e9:.2f} GB")

    print(f"\n🎉 Batch processing completed! Check output directory: {OUTPUT_DIR}")

else:
    print("❌ No JPEG files found in DATA_DIR")
    print(f"   Please check: {DATA_DIR}")