In [1]:
from pathlib import Path

import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

In [None]:
# Model loading - Simple InternVL3 approach (from official docs)
from pathlib import Path
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import torchvision.transforms as T

# Checkpoint directory (path taken from model_comparison.yaml) and the sample
# invoice image that the later cells query.
model_id = "/home/jovyan/nfs_share/models/InternVL3-2B"
imageName = "/home/jovyan/nfs_share/tod/datasets/synthetic_invoice_014.png"

print("🔧 Loading InternVL3 model...")

# Instantiate the checkpoint in bfloat16 (not float16), then switch the module
# to eval mode and move it onto the CUDA device.
model = AutoModel.from_pretrained(
    model_id,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
model = model.eval().cuda()

# The tokenizer is loaded from the same directory; the fast tokenizer is
# deliberately disabled for InternVL3.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)

print("✅ Model and tokenizer loaded successfully")

# Open the invoice and report its (width, height)
image = Image.open(imageName)
print("📷 Image loaded: {}".format(image.size))


In [None]:
# Simple image processing (from official InternVL3 docs)
def load_image(image, input_size=448, dtype=torch.bfloat16, device="cuda"):
    """Turn a PIL image into a single-tile, normalized batch tensor.

    Steps: convert to RGB when the image is in another mode, resize to a
    square of ``input_size`` pixels (aspect ratio is NOT preserved), convert
    to a tensor, normalize per channel, add a leading batch dimension, then
    cast/move to ``dtype``/``device``.

    Bicubic interpolation is used for the resize so the preprocessing matches
    the ``build_transform`` helper published with InternVL3; the torchvision
    default (bilinear) produces slightly different pixel values.

    Args:
        image: ``PIL.Image.Image`` to preprocess.
        input_size: Side length, in pixels, of the square output.
        dtype: Floating-point dtype of the returned tensor.
        device: Device the returned tensor is placed on. The default keeps
            the original behaviour and therefore requires a CUDA device.

    Returns:
        Tensor of shape ``(1, 3, input_size, input_size)``.
    """
    # Convert up front (instead of inside a T.Lambda) so the pipeline below
    # only ever sees 3-channel input.
    if image.mode != 'RGB':
        image = image.convert('RGB')

    transform = T.Compose([
        T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        # ImageNet channel mean / std
        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ])
    return transform(image).unsqueeze(0).to(device=device, dtype=dtype)

# Preprocess the image loaded earlier into the tensor passed to the model
print("🖼️  Processing image...")
pixel_values = load_image(image)
print(f"✅ Image processed: {pixel_values.shape}")

# Prompt in the official format: the text must start with the "<image>\n" prefix
question = '<image>\nHow much did Jessica pay?'
print(f"❓ Question: {question}")

# Decoding options: sampling enabled, at most 2000 newly generated tokens
generation_config = {"max_new_tokens": 2000, "do_sample": True}

# Ask the question through the model's chat API; any failure is reported
# (message, exception type, full traceback) instead of aborting the cell.
print("🤖 Generating response...")
try:
    response = model.chat(tokenizer, pixel_values, question, generation_config)
    print("✅ Response generated successfully!")
    print("\n" + "="*50, "RESPONSE:", response, "="*50, sep="\n")

except Exception as e:
    import traceback

    print(f"❌ Error during inference: {e}")
    print(f"Error type: {type(e).__name__}")
    traceback.print_exc()

In [None]:
# Save the generated response to a text file.
output_path = Path("/home/jovyan/nfs_share/tod/output/internvl3_output.txt")

try:
    # Look at `response` BEFORE touching the filesystem. Previously the file
    # was opened in "w" mode (truncating it) before `response` was evaluated,
    # so running this cell without a response wiped an earlier result file.
    if not isinstance(response, str):
        raise TypeError(
            f"response must be a string, got {type(response).__name__}"
        )

    # Ensure output directory exists
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Write response to file (UTF-8, replacing any previous content)
    output_path.write_text(response, encoding="utf-8")

    print(f"✅ Response saved to: {output_path}")
    print(f"📄 File size: {output_path.stat().st_size} bytes")

except NameError:
    # `response` only exists after the generation cell has run successfully.
    print("❌ Error: 'response' variable not defined.")
    print("💡 Please run Cell [3] first to generate the response.")

except Exception as e:
    print(f"❌ Error saving file: {e}")
    print(f"💡 Check if directory exists: {output_path.parent}")