In [1]:
import math
import numpy as np
import torch
import torchvision.transforms as T
# from decord import VideoReader, cpu  # Commented out since not needed for basic usage
from PIL import Image
from torchvision.transforms.functional import InterpolationMode

import torch
from transformers import AutoConfig, AutoTokenizer, AutoModel

print("🔧 Loading InternVL3-2B model...")
model_path = "/home/jovyan/nfs_share/models/InternVL3-2B"
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=False,
    trust_remote_code=True).eval().cuda()

print("✅ Model loaded successfully")

🔧 Loading InternVL3-2B model...
FlashAttention2 is not installed.
✅ Model loaded successfully


In [2]:
# from transformers_modules.OpenGVLab.InternVL3.tokenization_internvl import InternVLTokenizer
# tokenizer = InternVLTokenizer.from_pretrained(model_path)

# # Look for the tokenizer class in the model directory
# import os
# import sys

# # Add the model directory to Python path
# model_path = "/home/jovyan/nfs_share/models/huggingface/hub/InternVL3-1B"
# sys.path.append(model_path)

# # Verify what's in the directory
# print(os.listdir(model_path))

# # Then try importing the tokenizer from there
# from tokenization_internvl import InternVLTokenizer
# tokenizer = InternVLTokenizer.from_pretrained(model_path)

In [3]:
# Fixed tokenizer loading for InternVL3-2B
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_path, 
    trust_remote_code=True, 
    use_fast=False  # Important for InternVL3
)
print("✅ Tokenizer loaded successfully")

✅ Tokenizer loaded successfully


In [4]:
model

InternVLChatModel(
  (vision_model): InternVisionModel(
    (embeddings): InternVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): InternVisionEncoder(
      (layers): ModuleList(
        (0): InternVisionEncoderLayer(
          (attn): InternAttention(
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (mlp): InternMLP(
            (act): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (drop_path1): Identity()


## [Quick Start](https://huggingface.co/OpenGVLab/InternVL3-1B#quick-start)

In [5]:
# Simple image processing (from official InternVL3 docs)
def load_image(image, input_size=448):
    """Simple image preprocessing following official InternVL3 docs"""
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size)),
        T.ToTensor(),
        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    ])
    return transform(image).unsqueeze(0).to(torch.bfloat16).cuda()

# Load and process image
imageName = "/home/jovyan/nfs_share/tod/datasets/synthetic_invoice_014.png"
image = Image.open(imageName)
print(f"📷 Image loaded: {image.size}")

print("🖼️  Processing image...")
pixel_values = load_image(image)
print(f"✅ Image processed: {pixel_values.shape}")

# Generation config
generation_config = dict(max_new_tokens=1024, do_sample=True)

# Test simple image conversation
question = '<image>\nPlease describe the image shortly.'
print(f"❓ Question: {question}")

print("🤖 Generating response...")
try:
    response = model.chat(tokenizer, pixel_values, question, generation_config)
    print("✅ Response generated successfully!")
    print("\n" + "="*50)
    print("RESPONSE:")
    print(response)
    print("="*50)
    
except Exception as e:
    print(f"❌ Error during inference: {e}")
    print(f"Error type: {type(e).__name__}")
    import traceback
    traceback.print_exc()

📷 Image loaded: (700, 900)
🖼️  Processing image...
✅ Image processed: torch.Size([1, 3, 448, 448])
❓ Question: <image>
Please describe the image shortly.
🤖 Generating response...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


✅ Response generated successfully!

RESPONSE:
The image is an invoice from Hyatt Hotels, formatted in a table layout, detailing the purchase of milk, apples, ground beef, and pasta. The quantities range from 1 to 3 items each. The total amount due is $31.33, including a subtotal of $28.48 and 10% GST added. Payment is specified for a bank draft with the account number partially masked. The invoice includes contact details for Hyatt Hotels and the recipient, along with billing address information and invoice specifics.


In [6]:
# Save response to file (optional)
from pathlib import Path

try:
    output_path = Path("/home/jovyan/nfs_share/tod/output/internvl3_ibm_output.txt")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    
    with output_path.open("w", encoding="utf-8") as text_file:
        text_file.write(response)
    
    print(f"✅ Response saved to: {output_path}")
    print(f"📄 File size: {output_path.stat().st_size} bytes")
    
except NameError:
    print("❌ Error: 'response' variable not defined.")
    print("💡 Please run the previous cell first to generate the response.")
    
except Exception as e:
    print(f"❌ Error saving file: {e}")
    print(f"💡 Check if directory exists: {output_path.parent}")

✅ Response saved to: /home/jovyan/nfs_share/tod/output/internvl3_ibm_output.txt
📄 File size: 477 bytes


In [7]:
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform



# set the max number of tiles in `max_num`
pixel_values = load_image('../huaifeng_data/image1.png', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)

# pure-text conversation (纯文本对话)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# single-image single-round conversation (单图单轮对话)
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')

# single-image multi-round conversation (单图多轮对话)
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, combined images (多图多轮对话，拼接图像)
pixel_values1 = load_image('../huaifeng_data/image1.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('../huaifeng_data/image2.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, separate images (多图多轮对话，独立图像)
pixel_values1 = load_image('../huaifeng_data/image1.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('../huaifeng_data/image2.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# batch inference, single image per sample (单图批处理)
pixel_values1 = load_image('../huaifeng_data/image1.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('../huaifeng_data/image2.png', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values,
                             num_patches_list=num_patches_list,
                             questions=questions,
                             generation_config=generation_config)
for question, response in zip(questions, responses):
    print(f'User: {question}\nAssistant: {response}')


TypeError: load_image() got an unexpected keyword argument 'max_num'