In [None]:
"""
Cell 1: Environment Setup and Model Loading for Key-Value Extraction

Purpose:
- Import all required libraries for InternVL3-2B vision-language model
- Load the InternVL3-2B model following official documentation best practices
- Initialize model with proper dtype and settings for optimal inference
- Define global configuration variables for data paths

Key Components (Following Official InternVL3 Documentation):
- torch.bfloat16: Recommended precision for optimal performance
- use_flash_attn=True: Enable Flash Attention for better efficiency (recommended)
- low_cpu_mem_usage=True: Optimize CPU memory during loading
- trust_remote_code=True: Allow loading custom model code from HuggingFace
- .eval().cuda(): Set model to evaluation mode and move to GPU

Global Configuration:
- data_dir: Centralized data directory path for all image operations
- model_path: Local path to InternVL3-2B model files
- output_dir: Directory for saving extraction results

Official Requirements:
- transformers>=4.37.2
- Flash Attention support for optimal performance
- Proper dtype consistency throughout the pipeline
"""

from pathlib import Path
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import torchvision.transforms as T

# Check transformers version (should be >=4.37.2)
# NOTE: the version is only printed for the operator to inspect; nothing below
# enforces the minimum, so an older install will fail later rather than here.
import transformers
print(f"🔍 Transformers version: {transformers.__version__}")

# Global configuration variables
# These are notebook-wide globals read by later cells:
#   data_dir   - load_image() prefixes relative image filenames with it
#   model_path - local checkpoint directory used for both model and tokenizer
#   output_dir - Cell 5 writes the extraction text file into it
data_dir = "/home/jovyan/nfs_share/tod/huaifeng_data"
model_path = "/home/jovyan/nfs_share/models/InternVL3-2B" 
output_dir = "/home/jovyan/nfs_share/tod/output"

print(f"🗂️  Data directory: {data_dir}")
print(f"📁 Output directory: {output_dir}")
print(f"🔧 Loading InternVL3-2B model following official documentation from: {model_path}")

# Load model with official recommended settings (following InternVL documentation)
# The weights are loaded in bfloat16, so every image tensor passed to the model
# must be bfloat16 as well (load_image() performs that cast).
# .cuda() moves the model to the GPU unconditionally; there is no CPU fallback.
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,   # Official recommendation: use bfloat16
    low_cpu_mem_usage=True,       # Optimize CPU memory during loading
    use_flash_attn=True,          # Enable Flash Attention (recommended)
    trust_remote_code=True        # Allow custom model code execution
).eval().cuda()                   # Set to evaluation mode and move to GPU

# Load tokenizer with official settings
# The tokenizer is passed to model.chat() in Cell 4, and its eos_token_id is
# reused there as the pad token id for generation.
tokenizer = AutoTokenizer.from_pretrained(
    model_path, 
    trust_remote_code=True,  # Allow custom tokenizer code
    use_fast=False          # Use slower but more reliable tokenizer for structured tasks
)

print("✅ Model and tokenizer loaded successfully following official InternVL3 guidelines")

In [None]:
"""
Cell 2: Official InternVL3 Dynamic Image Processing Pipeline

Purpose:
- Implement official InternVL3 dynamic image preprocessing following documentation
- Support dynamic tiling with proper dtype consistency
- Handle document formats with optimal preprocessing for text extraction

Official Dynamic Preprocessing Features (from InternVL3 docs):
1. build_transform(): Official transformation pipeline with proper normalization
2. find_closest_aspect_ratio(): Aspect ratio optimization for multiple tiles
3. dynamic_preprocess(): Official dynamic tiling algorithm (1-12 tiles max)
4. load_image(): Complete preprocessing with proper dtype handling

Key Requirements from Documentation:
- Proper dtype consistency (bfloat16 throughout pipeline)
- ImageNet normalization constants
- BICUBIC interpolation for quality
- Dynamic tiling with thumbnail support
- Memory-safe processing with configurable max_num
"""

import math

# Official ImageNet normalization constants
# Per-channel (R, G, B) mean and standard deviation handed to T.Normalize in
# build_transform(); images are converted to RGB before normalization.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size=448):
    """
    Official InternVL3 image transformation pipeline

    Builds the per-tile transform applied to every PIL image produced by
    dynamic_preprocess(): force RGB, resize to a square, convert to a tensor,
    then normalize with the ImageNet statistics.

    Args:
        input_size: Edge length in pixels of the square each tile is resized
            to (default 448, matching load_image() and dynamic_preprocess()).

    Returns:
        torchvision.transforms.Compose: Official transformation pipeline
    """
    return T.Compose([
        # Normalize() below expects exactly three channels, so anything that is
        # not already RGB (grayscale, RGBA, palette, ...) is converted first.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        # Output is always (input_size, input_size); aspect ratio is not kept.
        T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """
    Official InternVL3 aspect ratio optimization algorithm

    Pick the (columns, rows) tile grid whose width/height ratio is nearest to
    the image's own aspect ratio.

    Args:
        aspect_ratio: width / height of the source image.
        target_ratios: Iterable of candidate (columns, rows) grids, visited in
            the order given.
        width: Source image width in pixels.
        height: Source image height in pixels.
        image_size: Edge length in pixels of one square tile.

    Returns:
        The selected (columns, rows) candidate, or (1, 1) when no candidate
        improves on the initial infinite distance (e.g. empty input).
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')

    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)

        if gap < smallest_gap:
            # Strictly closer ratio: always wins.
            smallest_gap, chosen = gap, candidate
            continue

        # Exact tie: switch to this candidate only when the source image
        # covers more than half of the pixel area this grid would occupy.
        is_tie = gap == smallest_gap
        if is_tie and image_area > 0.5 * image_size * image_size * cols * rows:
            chosen = candidate

    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """
    Official InternVL3 dynamic preprocessing algorithm

    Resize the image onto a grid of square tiles whose shape best matches the
    image's aspect ratio, then cut it into those tiles.

    Args:
        image: PIL image to split (its .size, .resize() and .crop() are used).
        min_num: Minimum number of tiles a candidate grid may contain.
        max_num: Maximum number of tiles a candidate grid may contain.
        image_size: Edge length in pixels of each square tile.
        use_thumbnail: When True and more than one tile was produced, append a
            whole-image thumbnail resized to (image_size, image_size).

    Returns:
        list: Tiles in row-major order, optionally followed by the thumbnail.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h

    # Enumerate every (columns, rows) grid whose tile count lies within
    # [min_num, max_num], then order the candidates by tile count.
    grid_options = set()
    for n in range(min_num, max_num + 1):
        for cols in range(1, n + 1):
            for rows in range(1, n + 1):
                if min_num <= cols * rows <= max_num:
                    grid_options.add((cols, rows))
    grid_options = sorted(grid_options, key=lambda grid: grid[0] * grid[1])

    # Grid whose aspect ratio is closest to the source image's.
    best_grid = find_closest_aspect_ratio(
        src_ratio, grid_options, src_w, src_h, image_size)

    canvas_w = image_size * best_grid[0]
    canvas_h = image_size * best_grid[1]
    tile_count = best_grid[0] * best_grid[1]

    # Stretch the image to exactly fill the grid, then slice it row by row.
    canvas = image.resize((canvas_w, canvas_h))
    tiles_per_row = canvas_w // image_size
    processed_images = []
    for idx in range(tile_count):
        row_idx, col_idx = divmod(idx, tiles_per_row)
        left = col_idx * image_size
        top = row_idx * image_size
        processed_images.append(
            canvas.crop((left, top, left + image_size, top + image_size)))
    assert len(processed_images) == tile_count

    # A single tile already shows the whole image, so no thumbnail is added.
    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    """
    Official InternVL3 image loading with proper dtype handling

    Opens the image, splits it into tiles with dynamic_preprocess() (a
    thumbnail tile is requested), applies build_transform() to every tile and
    stacks the results into a single tensor.

    Args:
        image_file: Path to image file as a str or pathlib.Path. Relative
            paths are resolved against the global data_dir; absolute paths are
            used unchanged.
        input_size: Target size for each tile
        max_num: Maximum number of tiles to generate (1-12 as per docs)

    Returns:
        torch.Tensor: Stacked tiles cast to torch.bfloat16 and moved to the
        CUDA device, one entry per tile along the first dimension.
    """
    # Handle both relative and absolute paths. Path.is_absolute() replaces the
    # former startswith('/') test so Path objects and platform-specific
    # absolute paths are recognised as well.
    image_path = Path(image_file)
    if not image_path.is_absolute():
        image_path = Path(data_dir) / image_path

    # convert() returns an independent in-memory image, so the underlying file
    # handle can be released as soon as the conversion is done.
    with Image.open(image_path) as source_image:
        image = source_image.convert('RGB')

    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = torch.stack([transform(tile) for tile in tiles])

    # CRITICAL: Ensure proper dtype for InternVL3 (must match model's bfloat16)
    return pixel_values.to(torch.bfloat16).cuda()

# Load and process document following official guidelines
# document_image is a filename relative to data_dir; it is also reported in
# the Cell 5 analysis output.
document_image = "image2.png"  # Configurable document filename
print(f"📄 Loading document from: {data_dir}/{document_image}")

# Load original image for analysis
# This open is only used to report size and aspect ratio; load_image() below
# opens the file again on its own to build the model input.
image_path = f"{data_dir}/{document_image}"
original_image = Image.open(image_path)
print(f"📷 Original document size: {original_image.size}")
print(f"📐 Document aspect ratio: {original_image.size[0]/original_image.size[1]:.2f}")

# Process with official dynamic preprocessing
# pixel_values is the global tensor that Cell 4 passes to model.chat().
print("🖼️  Processing with official InternVL3 dynamic preprocessing...")
pixel_values = load_image(document_image, max_num=12)
print(f"✅ Document processed into {pixel_values.shape[0]} tiles: {pixel_values.shape}")
print(f"🔍 Tensor dtype: {pixel_values.dtype} (should be torch.bfloat16)")
print("📋 Ready for InternVL3 key-value extraction")

In [4]:
"""
Cell 3: Structured Key-Value Extraction Prompt Configuration

Purpose:
- Define comprehensive prompt for extracting structured business document data
- Configure extraction parameters for consistent, standardized output
- Specify exact output format requirements for downstream processing

Extraction Specifications:
- 25 predefined fields covering common business document types
- Supports invoices, receipts, bank statements, and tax documents
- Handles missing fields gracefully with "N/A" placeholders
- Enforces plain text output without markdown formatting
- Ensures deterministic field ordering for automated processing

Field Categories:
1. Document metadata (type, dates)
2. Supplier/business information (name, address, contact)
3. Financial data (amounts, GST, totals)
4. Transaction details (quantities, prices, descriptions)
5. Banking information (account numbers, BSB, balances)

Output Quality Controls:
- Explicit formatting rules to prevent markdown artifacts
- Character limits and validation requirements
- Structured field validation for downstream systems
"""

# Comprehensive key-value extraction prompt optimized for business documents
# The prompt lists exactly 25 keys. Cells 4 and 5 hard-code that same count
# (25) in their completeness statistics, so adding or removing a key here
# requires updating those cells too.
# The \" sequences near the end of the prompt are ordinary escapes and reach
# the model as plain double quotes.
extraction_prompt = """Extract data from this business document. 
Output ALL fields below with their exact keys. 
Use "N/A" if field is not visible or not present.

REQUIRED OUTPUT FORMAT (output ALL lines exactly as shown):
DOCUMENT_TYPE: [value or N/A]
SUPPLIER: [value or N/A]
ABN: [11-digit Australian Business Number or N/A]
PAYER_NAME: [value or N/A]
PAYER_ADDRESS: [value or N/A]
PAYER_PHONE: [value or N/A]
PAYER_EMAIL: [value or N/A]
INVOICE_DATE: [value or N/A]
DUE_DATE: [value or N/A]
GST: [GST amount in dollars or N/A]
TOTAL: [total amount in dollars or N/A]
SUBTOTAL: [subtotal amount in dollars or N/A]
SUPPLIER_WEBSITE: [value or N/A]
QUANTITIES: [list of quantities or N/A]
PRICES: [individual prices in dollars or N/A]
BUSINESS_ADDRESS: [value or N/A]
BUSINESS_PHONE: [value or N/A]
BANK_NAME: [bank name from bank statements only or N/A]
BSB_NUMBER: [6-digit BSB from bank statements only or N/A]
BANK_ACCOUNT_NUMBER: [account number from bank statements only or N/A]
ACCOUNT_HOLDER: [value or N/A]
STATEMENT_PERIOD: [value or N/A]
OPENING_BALANCE: [opening balance amount in dollars or N/A]
CLOSING_BALANCE: [closing balance amount in dollars or N/A]
DESCRIPTIONS: [list of transaction descriptions or N/A]

CRITICAL: Output in PLAIN TEXT format only. Do NOT use markdown formatting.

CORRECT format: DOCUMENT_TYPE: TAX INVOICE
WRONG format: **DOCUMENT_TYPE:** TAX INVOICE
WRONG format: **DOCUMENT_TYPE: TAX INVOICE**
WRONG format: DOCUMENT_TYPE: **TAX INVOICE**

Use exactly: KEY: value (with colon and space)
Never use: **KEY:** or **KEY** or any asterisks
Never use bold, italic, or any markdown formatting

ABSOLUTELY CRITICAL: Output EXACTLY 25 lines using ONLY the keys listed above. 
Do NOT add extra fields like \"Balance\", \"Credit\", \"Debit\", \"Date\", \"Description\".
Do NOT include ANY fields not in the required list above.
Include ALL 25 keys listed above even if value is N/A.
STOP after exactly 25 lines."""

# Format prompt for InternVL3 with proper image token
# `question` is the text argument handed to model.chat() in Cell 4; the
# leading <image> line precedes the instructions.
question = f'<image>\n{extraction_prompt}'

print("📋 Structured key-value extraction prompt configured")
# Reports the length of the prompt alone, without the '<image>\n' prefix.
print(f"📄 Prompt length: {len(extraction_prompt)} characters")
print(f"🔍 Extracting 25 standardized business document fields")
print("⚙️ Configured for deterministic, structured output")

📋 Structured key-value extraction prompt configured
📄 Prompt length: 1912 characters
🔍 Extracting 25 standardized business document fields
⚙️ Configured for deterministic, structured output


In [5]:
"""
Cell 4: Key-Value Extraction Execution and Processing

Purpose:
- Execute structured field extraction using optimized generation parameters
- Process document with InternVL3 model for consistent key-value pairs
- Handle extraction errors gracefully with comprehensive error reporting

Generation Configuration:
- max_new_tokens=1000: Sufficient for 25 structured fields
- do_sample=False: Deterministic output for consistent field extraction
- pad_token_id=tokenizer.eos_token_id: Prevents padding warnings
- Temperature disabled: Ensures reproducible extraction results

Error Handling:
- Comprehensive exception catching with detailed error reporting
- Type-specific error identification for debugging
- Stack trace output for development troubleshooting
- Graceful failure with actionable error messages

Output Validation:
- Field count verification (should extract exactly 25 fields)
- Format validation for downstream processing
- Quality indicators for extraction success assessment
"""

# Generation configuration optimized for structured output
generation_config = dict(
    max_new_tokens=1000,                    # Adequate tokens for 25 structured fields
    do_sample=False,                        # Deterministic for consistent field extraction
    pad_token_id=tokenizer.eos_token_id     # Prevent pad_token_id warnings
    # Note: Temperature omitted since do_sample=False
)

print("🤖 Executing key-value extraction with InternVL3...")
print("⚙️ Using deterministic generation for consistent field extraction")

# NOTE: every failure inside this block is caught and reported, not re-raised.
# When model.chat() fails, `response` is therefore not (re)assigned: it stays
# undefined on a fresh kernel, or keeps the value from an earlier successful
# run. Cell 5 reads `response`, so re-check it after any failure here.
try:
    # Execute structured field extraction
    # pixel_values comes from Cell 2 and question from Cell 3.
    response = model.chat(tokenizer, pixel_values, question, generation_config)
    
    print("✅ Key-value extraction completed successfully!")
    print("\n" + "="*60)
    print("EXTRACTED BUSINESS DOCUMENT FIELDS:")
    print("="*60)
    print(response)
    print("="*60)
    
    # Basic validation of extraction results
    # Heuristic only: any line that contains ':' and does not start with '<'
    # counts as a field line; keys are not compared with the prompt's key list.
    lines = response.split('\n')
    field_lines = [line for line in lines if ':' in line and not line.strip().startswith('<')]
    print(f"\n📊 Extraction Statistics:")
    print(f"   • Total field lines extracted: {len(field_lines)}")
    print(f"   • Expected field count: 25")
    print(f"   • Extraction completeness: {len(field_lines)/25*100:.1f}%")
    
    # A count above 25 also lands in the "Partial extraction" branch, because
    # only an exact match of 25 is treated as complete.
    if len(field_lines) == 25:
        print("✅ Perfect field extraction - all 25 fields captured")
    elif len(field_lines) > 0:
        print("⚠️ Partial extraction - some fields may be missing")
    else:
        print("❌ No structured fields detected in response")
    
except Exception as e:
    print(f"❌ Error during key-value extraction: {e}")
    print(f"🔍 Error type: {type(e).__name__}")
    print("\n📋 Troubleshooting suggestions:")
    print("   • Check document image quality and readability")
    print("   • Verify model and tokenizer are properly loaded")
    print("   • Ensure sufficient GPU memory for processing")
    print("   • Validate document contains extractable text fields")
    
    # Imported here so the traceback module is only needed on the error path.
    import traceback
    print(f"\n🔧 Full error traceback:")
    traceback.print_exc()

🤖 Executing key-value extraction with InternVL3...
⚙️ Using deterministic generation for consistent field extraction
❌ Error during key-value extraction: Input type (float) and bias type (c10::BFloat16) should be the same
🔍 Error type: RuntimeError

📋 Troubleshooting suggestions:
   • Check document image quality and readability
   • Verify model and tokenizer are properly loaded
   • Ensure sufficient GPU memory for processing
   • Validate document contains extractable text fields

🔧 Full error traceback:


Traceback (most recent call last):
  File "/tmp/ipykernel_8676/2058508661.py", line 40, in <module>
    response = model.chat(tokenizer, pixel_values, question, generation_config)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jovyan/.cache/huggingface/modules/transformers_modules/InternVL3-2B/modeling_internvl_chat.py", line 291, in chat
    generation_output = self.generate(
                        ^^^^^^^^^^^^^^
  File "/home/jovyan/.conda/envs/unified_vision_processor/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/jovyan/.cache/huggingface/modules/transformers_modules/InternVL3-2B/modeling_internvl_chat.py", line 326, in generate
    vit_embeds = self.extract_feature(pixel_values)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jovyan/.cache/huggingface/modules/transformers_modules/InternVL3-2B/mode

In [None]:
"""
Cell 5: Results Saving and Analysis Pipeline

Purpose:
- Save extracted key-value pairs to persistent storage for further processing
- Perform quality analysis and validation of extraction results
- Generate extraction reports and statistics for workflow integration

File Operations:
- Creates output directory using global output_dir configuration
- Uses UTF-8 encoding for proper international character handling
- Saves to a fixed, descriptive filename (internvl3_keyvalue_extraction.txt); no timestamp is added
- Overwrites any existing file at that path in place (a plain open-for-write, not an atomic replace)

Quality Analysis Features:
- Field completeness assessment (target: 25 fields)
- Content validation for required field formats
- Data quality indicators for downstream processing
- Extraction confidence metrics and reporting

Error Handling:
- NameError: Handles case where response variable isn't defined
- FileSystem errors: Permission issues, disk space, path problems
- Encoding errors: Character set and formatting issues
- Provides actionable troubleshooting guidance for each error type

Integration Features:
- Structured output suitable for database import
- JSON-compatible field parsing for API integration
- Batch processing support for multiple document workflows
"""

# Configure output path using global output_dir variable
output_filename = "internvl3_keyvalue_extraction.txt"
output_path = Path(output_dir) / output_filename

print(f"💾 Saving extraction results to: {output_path}")

try:
    # Read `response` BEFORE touching the filesystem. If Cell 4 has not
    # produced a response, the NameError is raised here, so an existing
    # results file is not truncated by the open("w") below.
    extraction_text = response

    # Ensure output directory exists with proper permissions
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Write extraction results with UTF-8 encoding for international support
    with output_path.open("w", encoding="utf-8") as text_file:
        text_file.write(extraction_text)

    # Size on disk in bytes; reused for the validation check further down.
    file_size = output_path.stat().st_size

    print(f"✅ Key-value extraction results saved successfully!")
    print(f"📄 File location: {output_path}")
    print(f"📊 File size: {file_size} bytes")

    # Advanced extraction analysis and reporting
    # Same heuristic as Cell 4: a field line is any line that contains ':'
    # and does not start with '<'.
    lines = extraction_text.split('\n')
    field_lines = [line for line in lines if ':' in line and not line.strip().startswith('<')]

    print(f"\n📈 Detailed Extraction Analysis:")
    print(f"   • Document processed: {document_image}")
    print(f"   • Total response lines: {len(lines)}")
    print(f"   • Structured field lines: {len(field_lines)}")
    print(f"   • Field extraction rate: {len(field_lines)/25*100:.1f}%")

    # Field content analysis
    # The value is everything after the FIRST colon, so values that contain
    # colons themselves (URLs, times) are compared in full.
    non_na_fields = [
        line for line in field_lines
        if line.partition(':')[2].strip().upper() not in ('N/A', 'NA')
    ]
    print(f"   • Fields with content: {len(non_na_fields)}")
    print(f"   • Content coverage: {len(non_na_fields)/25*100:.1f}%")

    # File validation
    # 100 bytes is a rough lower bound for a response with any real content.
    if file_size > 100:
        print("✅ Output file validation: PASSED (sufficient content)")
    else:
        print("⚠️ Output file validation: WARNING (minimal content detected)")

    print(f"\n🔗 Integration ready: Results saved in structured format")
    print(f"📁 Output directory: {output_dir}")

except NameError:
    print("❌ Error: Extraction response not available")
    print("💡 Solution: Execute Cell 4 first to generate extraction results")
    print("🔄 Then re-run this cell to save the results")

except PermissionError:
    # Listed before OSError because PermissionError is a subclass of it.
    print(f"❌ Permission Error: Cannot write to {output_path}")
    print("💡 Solutions:")
    print("   • Check directory write permissions")
    print("   • Verify output_dir path is accessible")
    print("   • Try running with appropriate user permissions")

except OSError as e:
    print(f"❌ File System Error: {e}")
    print("💡 Solutions:")
    print("   • Check available disk space")
    print("   • Verify path validity and accessibility")
    print("   • Ensure parent directories exist")

except Exception as e:
    print(f"❌ Unexpected error during file operations: {e}")
    print(f"🔍 Error type: {type(e).__name__}")
    print("💡 Check system resources and file path configuration")
    print(f"🗂️ Configured output directory: {output_dir}")

    import traceback
    print(f"\n🔧 Full error details:")
    traceback.print_exc()