In [1]:
from pathlib import Path

import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

In [2]:
# --- Inputs ---------------------------------------------------------------
# Directory of the InternVL3-8B checkpoint to load.
model_id = "/home/jovyan/nfs_share/models/InternVL3-8B"
# Path of the image the question will be asked about.
imageName = "/home/jovyan/nfs_share/tod/datasets/synthetic_invoice_014.png"

# --- Model and tokenizer --------------------------------------------------
# Weights are requested in bfloat16 with automatic device placement; any
# tensor fed to the model later has to use the same dtype.
# trust_remote_code=True is passed to both loaders (presumably because the
# checkpoint ships its own modeling code -- confirm against the model card).
model = AutoModel.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# --- Image ----------------------------------------------------------------
# Kept as a PIL image; conversion to a tensor happens in a later cell.
image = Image.open(imageName)

FlashAttention2 is not installed.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Free-text question about the image. It is handed unchanged to `model.chat`
# as the `question` argument in the inference cell below; no extra prompt
# template is built in this notebook.
question = "How much did Jessica pay?"

In [None]:
# Preprocess the image into a single 448x448 tile and ask InternVL3 the question.
# NOTE(review): the original comments say this mirrors `_build_transform` in a
# `vision_processor` module used in production; that module is not visible
# here, so treat the equivalence as an assumption to confirm.
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

# Per-channel normalisation statistics and the square input resolution.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
image_size = 448

# PIL image -> RGB -> 448x448 (bicubic) -> float tensor -> normalised tensor.
transform = T.Compose([
    T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
    T.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Add a leading batch dimension: [3, 448, 448] -> [1, 3, 448, 448].
pixel_values = transform(image).unsqueeze(0)

# The input must match the model's parameters in both device and dtype.
# The model was loaded with torch_dtype=torch.bfloat16, so casting to float16
# (or float32 on CPU) as before produces a dtype mismatch inside the vision
# encoder. Taking both values from the model keeps this cell correct even if
# the loading cell's dtype or device placement changes.
pixel_values = pixel_values.to(device=model.device, dtype=model.dtype)

print(f"Pixel values shape: {pixel_values.shape}")
print(f"Pixel values device: {pixel_values.device}")
print(f"Pixel values dtype: {pixel_values.dtype}")

# Greedy decoding (do_sample=False) for repeatable answers; EOS doubles as
# the padding token.
generation_config = {
    "max_new_tokens": 2000,
    "do_sample": False,
    "pad_token_id": tokenizer.eos_token_id,
}

# Run inference. On failure, report the error and re-raise: swallowing it
# would leave `response` undefined and make the save cell below fail with an
# unrelated NameError that hides the real cause.
try:
    response = model.chat(
        tokenizer=tokenizer,
        pixel_values=pixel_values,
        question=question,
        generation_config=generation_config,
    )
except Exception as e:
    print(f"❌ Error during InternVL3 inference: {e}")
    raise

print("✅ InternVL3 response generated successfully")
print("\n" + "="*50)
print(response)
print("="*50)

In [None]:
# Persist the model's answer as a UTF-8 text file.
output_path = Path("/home/jovyan/nfs_share/tod/output/internvl3_output.txt")

# Create the destination folder (and any missing parents) if needed.
destination_dir = output_path.parent
destination_dir.mkdir(parents=True, exist_ok=True)

# Overwrite any previous answer with the current `response`.
with open(output_path, "w", encoding="utf-8") as out_fh:
    out_fh.write(response)

print(f"✅ Response saved to {output_path}")