In [None]:
"""
Cell 1: Environment Setup and Model Loading for Llama Vision Key-Value Extraction

Purpose:
- Import all required libraries for Llama-3.2-11B-Vision-Instruct model
- Load the Llama Vision model optimized for structured key-value extraction
- Initialize model with proper configuration for document analysis
- Define global configuration variables for data paths

Key Components:
- torch.bfloat16: Memory-efficient 16-bit floating point for better performance
- device_map="auto": Automatic device mapping for optimal GPU utilization
- AutoProcessor: Handles both text and image processing for Llama Vision
- Optimized for multimodal document understanding

Global Configuration:
- data_dir: Centralized data directory path for all image operations
- model_path: Local path to Llama-3.2-11B-Vision-Instruct model files
- output_dir: Directory for saving extraction results

Specialized for Key-Value Extraction:
- Optimized for structured document processing
- Configured for business document analysis workflows
- Supports multimodal conversation format required by Llama Vision
"""

from pathlib import Path
import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

# Shared settings read by the later cells: where the input images live, where
# the local model weights are stored, and where results are written.
data_dir = "/home/jovyan/nfs_share/tod/huaifeng_data"
model_path = "/home/jovyan/nfs_share/models/Llama-3.2-11B-Vision-Instruct"
output_dir = "/home/jovyan/nfs_share/tod/output"

# Echo the active configuration before the model load starts.
print(
    f"🗂️  Data directory: {data_dir}",
    f"📁 Output directory: {output_dir}",
    f"🔧 Loading Llama-3.2-11B-Vision-Instruct model for key-value extraction from: {model_path}",
    sep="\n",
)

# Loading options: bfloat16 weights for memory efficiency and automatic
# device placement.
_model_load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = MllamaForConditionalGeneration.from_pretrained(model_path, **_model_load_options)

# The processor prepares both the text and the image inputs for the model.
processor = AutoProcessor.from_pretrained(model_path)

print("✅ Llama Vision model and processor loaded successfully for key-value extraction")

In [None]:
"""
Cell 2: Document Loading and Preprocessing for Llama Vision

Purpose:
- Load and preprocess business documents for key-value extraction
- Handle various document formats with proper image processing
- Prepare documents for Llama Vision's multimodal processing pipeline

Document Processing Features:
- Supports various image formats (PNG, JPG, PDF conversions)
- Handles different document orientations and sizes
- Maintains image quality for optimal text recognition
- Uses global data_dir for consistent path management

Llama Vision Specific:
- No complex tiling required (unlike InternVL3)
- Direct image processing through AutoProcessor
- Optimized for single-image document analysis
- Maintains original aspect ratios for better text recognition
"""

def load_document_image(image_path):
    """
    Open a document image, resolving relative paths against ``data_dir``.

    Args:
        image_path: Location of the document file, given as a string or any
            ``os.PathLike`` object. Absolute paths are used unchanged;
            relative paths are looked up inside the global ``data_dir``.

    Returns:
        PIL.Image: The image object returned by ``Image.open``.
    """
    # Path.is_absolute() works for str and Path inputs alike and does not
    # hard-code the POSIX "/" prefix the way a string startswith() check does.
    resolved_path = Path(image_path)
    if not resolved_path.is_absolute():
        resolved_path = Path(data_dir) / resolved_path

    return Image.open(resolved_path)

# Name of the document to process; resolved against the global data_dir.
document_image = "synthetic_invoice_014.png"
print(f"📄 Loading document from: {data_dir}/{document_image}")

# Open the document and report its basic properties.
image = load_document_image(document_image)
print(
    f"📷 Document loaded successfully: {image.size}",
    f"📐 Document aspect ratio: {image.width / image.height:.2f}",
    f"🖼️  Image format: {image.format}",
    f"🎨 Color mode: {image.mode}",
    "🔍 Document ready for Llama Vision key-value extraction",
    sep="\n",
)

In [None]:
"""
Cell 3: Advanced Key-Value Extraction Prompt Configuration

Purpose:
- Define comprehensive prompt for extracting structured business document data
- Configure extraction parameters for consistent, standardized output
- Leverage Llama Vision's advanced reasoning capabilities for document analysis

Enhanced Prompt Features:
- 25 predefined fields covering comprehensive business document types
- Advanced formatting constraints to prevent Llama's markdown tendencies
- Explicit instructions optimized for Llama Vision's capabilities
- Deterministic field ordering for automated downstream processing

Field Categories (Comprehensive Coverage):
1. Document metadata (type, dates, references)
2. Supplier/business information (name, address, contact details)
3. Financial data (amounts, GST, totals, subtotals)
4. Transaction details (quantities, prices, descriptions)
5. Banking information (account numbers, BSB, balances)

Advanced Output Quality Controls:
- Multiple examples of correct/incorrect formatting
- Explicit markdown prevention (critical for Llama models)
- Field count validation (exactly 25 lines)
- Structured validation for downstream processing systems
- Clear start/stop instructions to prevent conversation artifacts
"""

# Output fields in the exact order the model is asked to emit them. Each entry
# is (KEY, hint); the prompt renders it as "KEY: [hint or N/A]".
_FIELD_HINTS = (
    ("DOCUMENT_TYPE", "value"),
    ("SUPPLIER", "value"),
    ("ABN", "11-digit Australian Business Number"),
    ("PAYER_NAME", "value"),
    ("PAYER_ADDRESS", "value"),
    ("PAYER_PHONE", "value"),
    ("PAYER_EMAIL", "value"),
    ("INVOICE_DATE", "value"),
    ("DUE_DATE", "value"),
    ("GST", "GST amount in dollars"),
    ("TOTAL", "total amount in dollars"),
    ("SUBTOTAL", "subtotal amount in dollars"),
    ("SUPPLIER_WEBSITE", "value"),
    ("QUANTITIES", "list of quantities"),
    ("PRICES", "individual prices in dollars"),
    ("BUSINESS_ADDRESS", "value"),
    ("BUSINESS_PHONE", "value"),
    ("BANK_NAME", "bank name from bank statements only"),
    ("BSB_NUMBER", "6-digit BSB from bank statements only"),
    ("BANK_ACCOUNT_NUMBER", "account number from bank statements only"),
    ("ACCOUNT_HOLDER", "value"),
    ("STATEMENT_PERIOD", "value"),
    ("OPENING_BALANCE", "opening balance amount in dollars"),
    ("CLOSING_BALANCE", "closing balance amount in dollars"),
    ("DESCRIPTIONS", "list of transaction descriptions"),
)

# Instructions placed before the field template.
_PROMPT_HEADER = """Extract key-value data from this business document image.

CRITICAL INSTRUCTIONS:
- Output ONLY the structured data below
- Do NOT include any conversation text
- Do NOT repeat the user's request
- Do NOT include <image> tokens
- Start immediately with DOCUMENT_TYPE
- Stop immediately after DESCRIPTIONS

REQUIRED OUTPUT FORMAT - EXACTLY 25 LINES:"""

# Formatting rules placed after the field template.
_PROMPT_FOOTER = """FORMAT RULES:
- Use exactly: KEY: value (colon and space)
- NEVER use: **KEY:** or **KEY** or *KEY* or any formatting
- Plain text only - NO markdown, NO bold, NO italic
- Include ALL 25 keys even if value is N/A
- Output ONLY these 25 lines, nothing else

STOP after DESCRIPTIONS line. Do not add explanations or comments."""

# Assemble the prompt: header, one template line per field, a blank line,
# then the formatting rules.
extraction_prompt = "\n".join(
    [
        _PROMPT_HEADER,
        *(f"{key}: [{hint} or N/A]" for key, hint in _FIELD_HINTS),
        "",
        _PROMPT_FOOTER,
    ]
)

_prompt_length = len(extraction_prompt)
print("📋 Advanced key-value extraction prompt configured for Llama Vision")
print(f"📄 Prompt length: {_prompt_length} characters")
print("🔍 Extracting 25 standardized business document fields")
print("⚙️ Configured for deterministic, structured output with markdown prevention")

In [None]:
"""
Cell 4: Llama Vision Key-Value Extraction Execution

Purpose:
- Execute structured field extraction using Llama Vision's multimodal capabilities
- Process document with optimized generation parameters for structured output
- Handle extraction with comprehensive error reporting and validation

Llama Vision Processing Pipeline:
1. Create multimodal message structure (image + text prompt)
2. Apply chat template for proper formatting
3. Process inputs through AutoProcessor
4. Generate structured output with controlled parameters
5. Validate and analyze extraction results

Generation Configuration:
- max_new_tokens=1000: Sufficient for 25 structured fields
- do_sample=False: Deterministic for consistent extraction
- temperature=None, top_p=None: Explicitly unset to avoid warnings
- Proper device handling for GPU processing

Advanced Error Handling:
- Comprehensive exception catching with detailed diagnostics
- Llama-specific error identification and troubleshooting
- Memory management guidance for large vision models
- Output validation and quality assessment

Enhanced Output Processing:
- Intelligent parsing to extract only structured response
- Removes conversation history and prompt artifacts
- Cleans markdown formatting for plain text output
- Validates field structure and completeness
"""

# Number of "KEY: value" lines the extraction prompt asks the model to emit.
_EXPECTED_FIELD_COUNT = 25


def _clean_extraction_output(raw_text, max_fields=_EXPECTED_FIELD_COUNT):
    """
    Reduce a decoded model response to its structured ``KEY: value`` lines.

    Collection starts at the first line beginning with ``DOCUMENT_TYPE:`` and
    stops once ``max_fields`` lines have been kept. Asterisks are removed
    from every line *before* the checks, so a bolded first field such as
    ``**DOCUMENT_TYPE:** Invoice`` is still recognised.

    Args:
        raw_text: Decoded model response.
        max_fields: Maximum number of field lines to keep.

    Returns:
        str: The kept lines joined with newlines, or ``raw_text`` unchanged
        when no structured line was found.
    """
    # Echoes of the conversation that must never be treated as field lines.
    artifact_prefixes = ('user', '<image>', 'Extract key-value data')

    kept_lines = []
    in_response = False
    for raw_line in raw_text.split('\n'):
        # NOTE: this also removes asterisks that occur inside values.
        line = raw_line.replace('*', '').strip()

        if line.startswith('DOCUMENT_TYPE:'):
            in_response = True
        if line.startswith(artifact_prefixes):
            in_response = False
            continue
        if in_response and ':' in line and not line.startswith('<'):
            kept_lines.append(line)
            if len(kept_lines) >= max_fields:
                break

    return '\n'.join(kept_lines) if kept_lines else raw_text


# Create multimodal message structure for Llama Vision: a single user turn
# holding an image placeholder followed by the extraction instructions.
messageDataStructure = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {
                "type": "text",
                "text": extraction_prompt,
            },
        ],
    }
]

print("🤖 Executing key-value extraction with Llama Vision...")
print("⚙️ Using multimodal conversation format for optimal extraction")

# Drop any result left over from an earlier run of this cell so that a failure
# below cannot leave stale text behind for the save cell to pick up.
globals().pop("generatedOutput", None)

try:
    # Apply chat template for proper Llama formatting
    textInput = processor.apply_chat_template(
        messageDataStructure, add_generation_prompt=True
    )

    # The chat template already emits the begin-of-text token, so the
    # tokenizer must not add special tokens a second time.
    inputs = processor(
        image,
        textInput,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)

    print(f"🔧 Input tensors prepared on device: {model.device}")

    # Generate structured output with controlled parameters (no warnings)
    output = model.generate(
        **inputs,
        max_new_tokens=1000,  # Adequate for 25 structured fields
        do_sample=False,      # Deterministic for consistent extraction
        temperature=None,     # Explicitly unset to avoid warning
        top_p=None,           # Explicitly unset to avoid warning
        pad_token_id=processor.tokenizer.eos_token_id,  # Prevent warnings
    )

    # generate() returns prompt + completion; decode only the newly generated
    # tokens instead of searching the text for the word "assistant", which
    # breaks whenever the document itself contains that word.
    prompt_token_count = inputs["input_ids"].shape[-1]
    raw_response = processor.decode(
        output[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    # Keep only the structured field lines (falls back to the raw response).
    generatedOutput = _clean_extraction_output(raw_response)

    print("✅ Key-value extraction completed successfully!")
    print("\n" + "="*60)
    print("EXTRACTED BUSINESS DOCUMENT FIELDS (LLAMA VISION):")
    print("="*60)
    print(generatedOutput)
    print("="*60)

    # Advanced extraction validation and analysis
    lines = generatedOutput.split('\n')
    field_lines = [line for line in lines if ':' in line and not line.strip().startswith('<')]

    print(f"\n📊 Llama Vision Extraction Statistics:")
    print(f"   • Total response lines: {len(lines)}")
    print(f"   • Structured field lines: {len(field_lines)}")
    print(f"   • Expected field count: {_EXPECTED_FIELD_COUNT}")
    print(f"   • Extraction completeness: {len(field_lines)/_EXPECTED_FIELD_COUNT*100:.1f}%")

    # Quality assessment
    if len(field_lines) == _EXPECTED_FIELD_COUNT:
        print("✅ Perfect field extraction - all 25 fields captured")
    elif len(field_lines) > 20:
        print("✅ Near-complete extraction - minor fields may be missing")
    elif len(field_lines) > 0:
        print("⚠️ Partial extraction - significant fields may be missing")
    else:
        print("❌ No structured fields detected in response")

    # Check for markdown artifacts (common with Llama models). Each asterisk
    # is counted once; adding count('**') on top would count bold markers twice.
    markdown_count = generatedOutput.count('*')
    if markdown_count > 0:
        print(f"⚠️ Markdown artifacts detected: {markdown_count} instances")
        print("💡 Consider refining prompt to further suppress markdown formatting")
    else:
        print("✅ Clean plain text output - no markdown artifacts detected")

except torch.cuda.OutOfMemoryError:
    print("❌ GPU Memory Error: Insufficient VRAM for Llama Vision processing")
    print("💡 Solutions:")
    print("   • Reduce image resolution")
    print("   • Use CPU inference (slower but memory-efficient)")
    print("   • Clear GPU cache with torch.cuda.empty_cache()")

except Exception as e:
    # Notebook-level boundary: report the failure with full diagnostics
    # rather than letting the cell abort without guidance.
    print(f"❌ Error during Llama Vision extraction: {e}")
    print(f"🔍 Error type: {type(e).__name__}")
    print("\n📋 Troubleshooting suggestions:")
    print("   • Verify document image quality and readability")
    print("   • Check model and processor compatibility")
    print("   • Ensure sufficient system resources")
    print("   • Validate multimodal input format")

    import traceback
    print(f"\n🔧 Full error traceback:")
    traceback.print_exc()

In [None]:
"""
Cell 5: Advanced Results Management and Analysis Pipeline

Purpose:
- Save extracted key-value pairs with comprehensive analysis and reporting
- Perform advanced quality validation and extraction confidence assessment
- Generate detailed reports for workflow integration and process optimization

Advanced File Operations:
- Uses global output_dir for consistent file management
- Descriptive filename with model identification
- UTF-8 encoding with international character support
- Atomic file operations to prevent data corruption

Comprehensive Quality Analysis:
- Field completeness assessment (target: 25 fields)
- Content coverage analysis (non-N/A fields)
- Markdown artifact detection and reporting
- Data quality indicators for downstream processing
- Extraction confidence scoring and validation

Enhanced Error Handling:
- Specific error types with targeted solutions
- File system diagnostics and troubleshooting
- Memory management guidance for large documents
- Integration workflow validation

Advanced Integration Features:
- Structured output validation for database import
- Quality metrics for automated processing workflows
- Comparative analysis capabilities for model evaluation
- Batch processing readiness indicators
"""

from datetime import datetime

# Number of "KEY: value" lines the extraction prompt asks the model to emit.
_TARGET_FIELD_COUNT = 25

# Configure output path using global output_dir variable
output_filename = "llama_keyvalue_extraction.txt"
output_path = Path(output_dir) / output_filename

print(f"💾 Saving Llama Vision extraction results to: {output_path}")

try:
    # Read the extraction result BEFORE touching the file system: if the
    # extraction cell has not produced it, the NameError is raised here and an
    # existing results file is left untouched.
    extraction_text = generatedOutput

    # Build the complete file content (metadata header + extraction) in memory.
    header_lines = [
        "# Llama Vision Key-Value Extraction Results",
        f"# Document: {document_image}",
        "# Model: Llama-3.2-11B-Vision-Instruct",
        # Local timestamp with UTC offset, e.g. 2025-01-31T14:05:09+11:00
        f"# Extraction Date: {datetime.now().astimezone().isoformat(timespec='seconds')}",
        "# " + "="*50,
        "",
    ]
    file_content = "\n".join(header_lines) + "\n" + extraction_text

    # Ensure output directory exists
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to a sibling temporary file and rename it over the target, so a
    # failure part-way through never leaves a truncated results file behind.
    temp_path = output_path.with_name(output_path.name + ".tmp")
    temp_path.write_text(file_content, encoding="utf-8")
    temp_path.replace(output_path)

    print(f"✅ Llama Vision extraction results saved successfully!")
    print(f"📄 File location: {output_path}")
    print(f"📊 File size: {output_path.stat().st_size} bytes")

    # Advanced extraction analysis and reporting
    lines = extraction_text.split('\n')
    field_lines = [line for line in lines if ':' in line and not line.strip().startswith('<')]

    print(f"\n📈 Comprehensive Extraction Analysis:")
    print(f"   • Document processed: {document_image}")
    print(f"   • Model used: Llama-3.2-11B-Vision-Instruct")
    print(f"   • Total response lines: {len(lines)}")
    print(f"   • Structured field lines: {len(field_lines)}")
    print(f"   • Field extraction rate: {len(field_lines)/_TARGET_FIELD_COUNT*100:.1f}%")

    # Advanced content analysis: a field has content only when its value is
    # neither an N/A marker nor empty.
    empty_markers = {'N/A', 'NA', ''}
    non_na_fields = [
        line for line in field_lines
        if line.split(':', 1)[1].strip().upper() not in empty_markers
    ]
    print(f"   • Fields with content: {len(non_na_fields)}")
    print(f"   • Content coverage: {len(non_na_fields)/_TARGET_FIELD_COUNT*100:.1f}%")

    # Quality validation metrics. Measured on the extracted text only: the
    # metadata header by itself is already around 200 bytes, so using the
    # file size would rate an empty extraction as sufficient.
    content_size = len(extraction_text.encode("utf-8"))
    if content_size > 500:
        print("✅ Output file validation: EXCELLENT (comprehensive content)")
    elif content_size > 200:
        print("✅ Output file validation: GOOD (sufficient content)")
    else:
        print("⚠️ Output file validation: WARNING (minimal content detected)")

    # Markdown artifact analysis. Each asterisk is counted once; adding
    # count('**') on top would count bold markers twice.
    markdown_count = extraction_text.count('*')
    if markdown_count == 0:
        print("✅ Format validation: PERFECT (no markdown artifacts)")
    else:
        print(f"⚠️ Format validation: {markdown_count} markdown artifacts detected")

    # Integration readiness assessment
    if len(field_lines) >= 20 and markdown_count == 0:
        print("🚀 Integration Status: READY (high-quality structured output)")
    elif len(field_lines) >= 15:
        print("⚙️ Integration Status: USABLE (good quality with minor gaps)")
    else:
        print("🔧 Integration Status: NEEDS REVIEW (significant extraction issues)")

    print(f"\n🔗 Advanced Integration Features:")
    print(f"   • Database-ready structured format: ✅")
    print(f"   • API integration compatible: ✅")
    print(f"   • Batch processing ready: ✅")
    print(f"   • Quality metrics available: ✅")
    print(f"📁 Output directory: {output_dir}")

except NameError:
    print("❌ Error: Extraction response not available")
    print("💡 Solution: Execute Cell 4 first to generate extraction results")
    print("🔄 Then re-run this cell to save and analyze the results")
    print("📋 Ensure Llama Vision processing completed successfully")

except PermissionError:
    print(f"❌ Permission Error: Cannot write to {output_path}")
    print("💡 Advanced Solutions:")
    print("   • Check directory write permissions")
    print("   • Verify output_dir path is accessible")
    print("   • Try running with appropriate user permissions")
    print("   • Consider alternative output directory")

except OSError as e:
    print(f"❌ File System Error: {e}")
    print("💡 System Diagnostics:")
    print("   • Check available disk space")
    print("   • Verify path validity and accessibility")
    print("   • Ensure parent directories exist and are writable")
    print("   • Check file system permissions and quotas")

except Exception as e:
    # Notebook-level boundary: report anything unexpected with a traceback.
    print(f"❌ Unexpected error during file operations: {e}")
    print(f"🔍 Error type: {type(e).__name__}")
    print("💡 Advanced troubleshooting:")
    print("   • Check system resources and memory availability")
    print("   • Verify file path configuration and permissions")
    print("   • Review extraction output format and content")
    print(f"🗂️ Configured output directory: {output_dir}")
    print(f"📄 Target filename: {output_filename}")

    import traceback
    print(f"\n🔧 Detailed error analysis:")
    traceback.print_exc()