# InternVL3 Single Flat Extraction - BFloat16 Multi-GPU

**Model Loading**: bfloat16 precision with multi-GPU device mapping (no quantization)  
**Reference**: https://internvl.readthedocs.io/en/latest/internvl3.0/quick_start.html

In [None]:
from pathlib import Path
import random
import math

import numpy as np
import matplotlib.pyplot as plt

import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoConfig, AutoModel, AutoTokenizer

In [None]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    Args:
        seed: Integer handed unchanged to ``random.seed``,
            ``np.random.seed`` and ``torch.manual_seed``; when CUDA is
            available it is also handed to ``torch.cuda.manual_seed_all``.

    Returns:
        None.
    """
    # The three CPU-side seeders share the same call shape.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to do on a machine without CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every random generator once with the fixed value 42 and confirm it
# in the cell output.
set_seed(42)
print("✅ Random seed set to 42 for reproducibility")

In [None]:
def split_model(model_name, *, world_size=None, num_layers=None):
    """Create a device map that spreads the LLM layers across GPUs.

    Strategy:
    - GPU 0 also hosts the vision encoder, so it is budgeted as half a GPU
      and receives about half as many transformer layers as every other GPU.
    - ``vision_model``, ``mlp1``, the token embeddings, the final norm, the
      rotary embedding, the output heads and the last transformer layer are
      pinned to GPU 0.
    - Only layers ``0 .. num_layers - 1`` appear in the map; the ``ceil``
      rounding of the per-GPU share is never allowed to create entries for
      layers that do not exist.

    Args:
        model_name: Model identifier or local path passed to
            ``AutoConfig.from_pretrained``. Only read when ``num_layers``
            is not supplied.
        world_size: Number of GPUs to plan for. Defaults to
            ``torch.cuda.device_count()``.
        num_layers: Number of transformer layers to distribute. Defaults to
            ``config.llm_config.num_hidden_layers`` of ``model_name``.

    Returns:
        dict mapping module-name strings to integer GPU indices.

    Raises:
        RuntimeError: If ``world_size`` is smaller than 1 (for example when
            no CUDA device is visible).

    Reference: https://internvl.readthedocs.io/en/latest/internvl3.0/quick_start.html
    """
    if world_size is None:
        world_size = torch.cuda.device_count()
    if world_size < 1:
        raise RuntimeError(
            f"split_model needs at least one GPU, got world_size={world_size}"
        )

    if num_layers is None:
        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
        num_layers = config.llm_config.num_hidden_layers

    # Since the first GPU will be used for ViT, treat it as half a GPU.
    full_share = math.ceil(num_layers / (world_size - 0.5))
    layers_per_gpu = [full_share] * world_size
    layers_per_gpu[0] = math.ceil(full_share * 0.5)

    device_map = {}
    next_layer = 0
    for gpu_id, share in enumerate(layers_per_gpu):
        # The rounded-up shares can sum to more than num_layers; clamp so
        # no entry is emitted for a layer index the model does not have.
        stop = min(next_layer + share, num_layers)
        for layer_idx in range(next_layer, stop):
            device_map[f'language_model.model.layers.{layer_idx}'] = gpu_id
        next_layer = stop

    # Assign vision and embedding components to GPU 0
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.model.rotary_emb'] = 0
    device_map['language_model.lm_head'] = 0
    if num_layers > 0:
        # The last layer is pinned to GPU 0 alongside the norm and heads.
        device_map[f'language_model.model.layers.{num_layers - 1}'] = 0

    return device_map

# Cell-output confirmation that split_model is now defined.
print("✅ Multi-GPU device mapping function defined")

In [None]:
model_path = "/home/jovyan/shared_PTM/InternVL3-8B"

print(f"🔧 Loading InternVL3-8B model in bfloat16...")
print(f"📊 Available GPUs: {torch.cuda.device_count()}")

# Decide which GPU hosts each sub-module before any weights are loaded.
device_map = split_model(model_path)

print("\n📍 Device Map Summary:")
# Tally how many device-map entries were assigned to each GPU; only the
# GPU index of every entry matters here, not the module name.
gpu_counts = {}
for gpu_id in device_map.values():
    gpu_counts.setdefault(gpu_id, 0)
    gpu_counts[gpu_id] += 1

for gpu_id in sorted(gpu_counts):
    print(f"   GPU {gpu_id}: {gpu_counts[gpu_id]} components")

# Load the weights in bfloat16, placed according to device_map, then switch
# the model to evaluation mode.
model = AutoModel.from_pretrained(
    model_path,
    device_map=device_map,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    use_flash_attn=True,
    low_cpu_mem_usage=True,
)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False,
    trust_remote_code=True,
)

print("\n✅ Model loaded successfully!")

In [None]:
# Survey the dtypes and devices of the loaded parameters
print("🔍 Model Parameter Datatypes:")

dtype_counts = {}
param_count = 0

# param_count doubles as the 1-based running index and, after the loop,
# as the total number of parameter tensors (it stays 0 for an empty model).
for param_count, (name, param) in enumerate(model.named_parameters(), start=1):
    # Preview only the first five tensors
    if param_count <= 5:
        print(f"   {name}: {param.dtype} on {param.device}")

    dtype = str(param.dtype)
    dtype_counts[dtype] = dtype_counts.get(dtype, 0) + 1

print(f"\n📊 Dtype Distribution (Total {param_count} parameters):")
for dtype in sorted(dtype_counts):
    count = dtype_counts[dtype]
    print(f"   {dtype}: {count} parameters")

# Per-device allocator statistics, reported in units of 1e9 bytes
print("\n💾 GPU Memory Usage:")
for i in range(torch.cuda.device_count()):
    allocated, reserved = (
        torch.cuda.memory_allocated(i) / 1e9,
        torch.cuda.memory_reserved(i) / 1e9,
    )
    print(f"   GPU [{i}]: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

In [None]:
def build_transform(input_size):
    """Assemble the torchvision preprocessing pipeline for a single image.

    The pipeline, in order: convert any non-RGB image to RGB, resize to an
    ``input_size`` x ``input_size`` square with bicubic interpolation,
    convert to a tensor, then normalise each channel with the ImageNet
    mean/std constants defined below.

    Args:
        input_size: Edge length in pixels of the square output.

    Returns:
        A ``T.Compose`` object applying the four steps above.
    """
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)

    def ensure_rgb(img):
        # Images already in RGB pass through untouched.
        if img.mode == 'RGB':
            return img
        return img.convert('RGB')

    steps = [
        T.Lambda(ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=mean, std=std),
    ]
    return T.Compose(steps)


def load_image(image_file, input_size=448, max_num=12):
    """Load an image file and preprocess it into a model input tensor.

    The file is opened with PIL, converted to RGB, run through
    ``build_transform(input_size)``, given a leading batch dimension, cast
    to ``torch.bfloat16`` and moved to the current CUDA device.

    Args:
        image_file: Path or file object accepted by ``PIL.Image.open``.
        input_size: Edge length in pixels of the square the image is
            resized to.
        max_num: Accepted for signature compatibility but currently unused;
            the whole image is always resized as one piece.
            NOTE(review): presumably intended as a tile limit as in the
            InternVL reference loader — confirm whether tiling is wanted.

    Returns:
        The bfloat16 CUDA tensor produced by the steps above.
    """
    # The context manager releases the file handle as soon as the pixel
    # data has been copied into the converted RGB image.
    with Image.open(image_file) as source_image:
        image = source_image.convert('RGB')
    transform = build_transform(input_size=input_size)
    pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
    return pixel_values

# Cell-output confirmation that the preprocessing helpers are now defined.
print("✅ Image preprocessing functions defined")

In [None]:
imageName = "/home/jovyan/_LMM_POC/evaluation_data/image_008.png"

print("📂 Loading image...")
# Opened here only to report the original dimensions; the tensor fed to the
# model is produced separately by load_image() below.
image = Image.open(imageName)
print(f"✅ Image loaded: {image.size}")

# Turn the file into the tensor the model consumes
pixel_values = load_image(image_file=imageName, input_size=448)

# Report the tensor's shape, dtype, device and size (bytes / 1e6)
print(f"\n🔍 Pixel Values Tensor Info:")
print(f"   Shape: {pixel_values.shape}")
print(f"   Dtype: {pixel_values.dtype}")
print(f"   Device: {pixel_values.device}")
print(f"   Memory: {pixel_values.element_size() * pixel_values.nelement() / 1e6:.2f}MB")

In [None]:
# Prompt for a flat five-column transaction table:
# ["Date", "Description", "Withdrawal", "Deposit", "Balance"].
# The model is asked to emit the table first and then derive the date range,
# the dates, the descriptions and the withdrawal amounts from it.
# NOTE(review): STEP 1 asks for the table "formatted as markdown" while the
# CONVERSATION PROTOCOL section forbids markdown formatting, and no step
# produces the "DOCUMENT_TYPE" line the protocol requires — confirm intent.
prompt_text = """
You are an expert document analyzer specializing in bank statement extraction.
Extract structured data from this flat table bank statement for taxpayer expense claims.

CONVERSATION PROTOCOL:
- Start your response immediately with "DOCUMENT_TYPE: BANK_STATEMENT"
- Do NOT include conversational text like "I'll extract..." or "Based on the document..."
- Do NOT use bullet points, numbered lists, asterisks, or markdown formatting (no **, no ##, no 1., no -)
- Output ONLY the structured extraction data below
- End immediately after "TRANSACTION_AMOUNTS_PAID:" with no additional text
- NO explanations, NO comments, NO additional text

CRITICAL:
- The transaction table in the image has a "Date", a "Description", a "Withdrawal", a "Deposit" and a "Balance" column
- Specifically, it has a "Date" column, a "Description" column, a "Withdrawal" column, a "Deposit" column and a "Balance" column

ANTI-HALLUCINATION RULES:
- YOU MUST NOT GUESS values you are unsure of
- Rows may have missing values
- Rows NEVER HAVE REPEATED AMOUNTS, SO YOU MUST NOT REPEAT VALUES THAT YOU ARE UNSURE OF
- If a value is unclear or missing, use "NOT_FOUND" instead of guessing

STEP 1:
- Extract the Transaction Table formatted as markdown.

STEP 2:
- Extract the earliest and latest date in the "Date" column from the extracted Transaction Table in STEP 1
- Format as STATEMENT_DATE_RANGE: [ First date in "Date" column - Last date in "Date" column ]

STEP 3:
- Extract the "Date" column from the extracted Transaction Table in STEP 1
- Format as TRANSACTION_DATES: [ All "Date" column dates, each separated by " | " ] on a single line

STEP 4:
- Extract the "Description" column from the extracted Transaction Table in STEP 1
- Format as LINE_ITEM_DESCRIPTIONS: [ All "Description" column descriptions, each separated by " | " ] on a single line

STEP 5:
- Extract the "Withdrawal" column from the extracted Transaction Table in STEP 1, replacing missing values with "NOT_FOUND".
- Format as TRANSACTION_AMOUNTS_PAID: [ All "Withdrawal" column amounts each separated by " | " ] on a single line
"""

# Report the prompt size in characters (not tokens).
print(f"📝 Prompt length: {len(prompt_text)} characters")

In [None]:
# Greedy decoding (sampling disabled) with a 4000-token output budget
generation_config = {"max_new_tokens": 4000, "do_sample": False}

print("🤖 Generating response with InternVL3-8B...")

# Snapshot of allocated memory per GPU before the forward passes
print("\n💾 GPU Memory Before Generation:")
for i in range(torch.cuda.device_count()):
    allocated = torch.cuda.memory_allocated(i) / 1e9
    print(f"   GPU [{i}]: {allocated:.2f}GB")

# Single-turn query: one image tensor plus the extraction prompt
response = model.chat(
    tokenizer=tokenizer,
    pixel_values=pixel_values,
    question=prompt_text,
    generation_config=generation_config,
)

# Same snapshot after generation, for comparison with the one above
print("\n💾 GPU Memory After Generation:")
for i in range(torch.cuda.device_count()):
    allocated = torch.cuda.memory_allocated(i) / 1e9
    print(f"   GPU [{i}]: {allocated:.2f}GB")

print("\n✅ Response generated successfully!")

# Print the raw model output between two 60-character rules
print("\n" + "=" * 60)
print("EXTRACTION RESULT:")
print("=" * 60)
print(response)
print("=" * 60)

In [None]:
# Persist the raw model output as UTF-8 text in the working directory
output_path = Path("internvl3_grouped_bank_statement_output.txt")

output_path.write_text(response, encoding="utf-8")

# Confirm where the file went and how many bytes it occupies on disk
print(f"✅ Response saved to: {output_path}")
print(f"📁 File size: {output_path.stat().st_size} bytes")

In [None]:
# Closing report: dtypes in use, GPU count and per-device memory
print("📊 FINAL SUMMARY")
print("=" * 60)
# The model dtype line is a fixed label; the input dtype is read from the tensor.
print(f"Model Dtype: torch.bfloat16")
print(f"Input Tensor Dtype: {pixel_values.dtype}")
print(f"GPUs Used: {torch.cuda.device_count()}")

print("\n💾 Final GPU Memory:")
total_allocated = 0
for i in range(torch.cuda.device_count()):
    allocated, reserved = (
        torch.cuda.memory_allocated(i) / 1e9,
        torch.cuda.memory_reserved(i) / 1e9,
    )
    print(f"   GPU [{i}]: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")
    # Running total of allocated memory across all devices
    total_allocated = total_allocated + allocated

print(f"\n   Total Allocated: {total_allocated:.2f}GB")
print("=" * 60)