In [None]:
# Cell 1
from pathlib import Path
import random

import numpy as np
import matplotlib.pyplot as plt

import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

In [None]:
# Cell 2
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    The same value is handed, in order, to Python's ``random`` module,
    NumPy's legacy global RNG and PyTorch. When CUDA is available the
    seed is additionally applied to all CUDA devices.

    Args:
        seed: Integer seed shared by all generators (default 42).

    Returns:
        None.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to do on a machine without CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Single source of truth for the seed, so the value passed to set_seed and the
# value reported in the log message cannot drift apart.
SEED = 42
set_seed(SEED)
print(f"✅ Random seed set to {SEED} for reproducibility")

In [None]:
# Cell 3
# Location of the InternVL3-8B checkpoint; used for both the model and the tokenizer.
model_path = "/home/jovyan/shared_PTM/InternVL3-8B"

print("🔧 Loading InternVL3-8B model...")

# trust_remote_code=True lets transformers run the modelling code that ships
# with the checkpoint, so only point model_path at a source you trust.
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True,
    use_flash_attn=True,
)
# Inference only: switch to evaluation mode.
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
    use_fast=False,
)

print("✅ Model loaded successfully!")

# Per-GPU memory report after loading; byte counts are divided by 1e9 to print GB.
for i in range(torch.cuda.device_count()):
    allocated, reserved = (
        byte_count / 1e9
        for byte_count in (torch.cuda.memory_allocated(i), torch.cuda.memory_reserved(i))
    )
    print(f"    GPU [{i}]: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

In [None]:
# Cell 4
def build_transform(input_size):
    """Create the preprocessing pipeline applied to a PIL image for InternVL3.

    The pipeline converts the image to RGB when needed, resizes it to a
    square of ``input_size`` x ``input_size`` with bicubic interpolation
    (aspect ratio is not preserved), converts it to a tensor and
    normalises it with the ImageNet channel mean and standard deviation.

    Args:
        input_size: Target height and width in pixels.

    Returns:
        A ``torchvision.transforms.Compose`` callable.
    """
    imagenet_mean = (0.485, 0.456, 0.406)
    imagenet_std = (0.229, 0.224, 0.225)

    def _ensure_rgb(img):
        # Images already in RGB mode are passed through untouched.
        if img.mode == 'RGB':
            return img
        return img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=imagenet_mean, std=imagenet_std),
    ]
    return T.Compose(steps)


def load_image(image_file, input_size=448, max_num=12):
    """Load an image from disk and preprocess it into a model-ready tensor.

    Args:
        image_file: Path (or file object) accepted by ``PIL.Image.open``.
        input_size: Side length, in pixels, of the square the image is
            resized to by ``build_transform``.
        max_num: Kept for call compatibility only. This implementation
            ignores it: the whole image is resized into a single
            ``input_size`` x ``input_size`` input.

    Returns:
        A ``torch.bfloat16`` tensor on the current CUDA device with a
        leading batch dimension of 1 added by ``unsqueeze(0)``.
    """
    # Use a context manager so the underlying file handle is released as soon
    # as the pixels are decoded; convert() returns an independent image, so
    # it remains valid after the file is closed.
    with Image.open(image_file) as source_image:
        image = source_image.convert('RGB')

    transform = build_transform(input_size=input_size)
    pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
    return pixel_values

In [None]:
# Cell 5
imageName = "/home/jovyan/_LMM_POC/evaluation_data/image_008.png"

print("📂 Loading image...")
# Open inside a context manager so the file handle does not stay open for the
# lifetime of the kernel. load() decodes the pixels up front so `image`
# remains usable after the handle has been released.
with Image.open(imageName) as image:
    image.load()
    print(f"✅ Image loaded: {image.size}")

# Preprocess for InternVL3
pixel_values = load_image(imageName, input_size=448)

In [None]:
# Cell 6
# Prompt for a basic flat, five-column transaction table. The prompt text below
# names the columns "Date", "Description", "Withdrawal", "Deposit" and "Balance".
# NOTE(review): this comment previously listed "Credit" as the fourth column
# while the prompt itself says "Deposit" — confirm against the statement image.
# NOTE(review): the CONVERSATION PROTOCOL section forbids markdown formatting,
# yet STEP 1 asks for the table "formatted as markdown" — confirm this is intended.
# The newline right after the opening quotes and the one before the closing
# quotes are part of the string and are included in the length printed below.
prompt_text = """
You are an expert document analyzer specializing in bank statement extraction.
Extract structured data from this flat table bank statement for taxpayer expense claims.

CONVERSATION PROTOCOL:
- Start your response immediately with "DOCUMENT_TYPE: BANK_STATEMENT"
- Do NOT include conversational text like "I'll extract..." or "Based on the document..."
- Do NOT use bullet points, numbered lists, asterisks, or markdown formatting (no **, no ##, no 1., no -)
- Output ONLY the structured extraction data below
- End immediately after "TRANSACTION_AMOUNTS_PAID:" with no additional text
- NO explanations, NO comments, NO additional text

CRITICAL:
- The transaction table in the image has a "Date", a "Description", a "Withdrawal", a "Deposit" and a "Balance" column
- Specifically, it has a "Date" column, a "Description" column, a "Withdrawal" column, a "Deposit" column and a "Balance" column

ANTI-HALLUCINATION RULES:
- YOU MUST NOT GUESS values you are unsure of
- Rows may have missing values
- Rows NEVER HAVE REPEATED AMOUNTS, SO YOU MUST NOT REPEAT VALUES THAT YOU ARE UNSURE OF
- If a value is unclear or missing, use "NOT_FOUND" instead of guessing

STEP 1:
- Extract the Transaction Table formatted as markdown.

STEP 2:
- Extract the earliest and latest date in the "Date" column from the extracted Transaction Table in STEP 1
- Format as STATEMENT_DATE_RANGE: [ First date in "Date" column - Last date in "Date" column ]

STEP 3:
- Extract the "Date" column from the extracted Transaction Table in STEP 1
- Format as TRANSACTION_DATES: [ All "Date" column dates, each separated by " | " ] on a single line

STEP 4:
- Extract the "Description" column from the extracted Transaction Table in STEP 1
- Format as LINE_ITEM_DESCRIPTIONS: [ All "Description" column descriptions, each separated by " | " ] on a single line

STEP 5:
- Extract the "Withdrawal" column from the extracted Transaction Table in STEP 1, replacing missing values with "NOT_FOUND".
- Format as TRANSACTION_AMOUNTS_PAID: [ All "Withdrawal" column amounts each separated by " | " ] on a single line
"""

# Report the prompt size in characters (not tokens) as a quick sanity check.
print(f"📝 Prompt length: {len(prompt_text)} characters")

In [None]:
# Cell 7
# Sampling is disabled so repeated runs use the same decoding settings;
# the response is capped at 4000 newly generated tokens.
generation_config = {"max_new_tokens": 4000, "do_sample": False}

print("🤖 Generating response with InternVL3-8B...")

# Single-turn request: the preprocessed image plus the extraction prompt.
response = model.chat(
    tokenizer=tokenizer,
    pixel_values=pixel_values,
    question=prompt_text,
    generation_config=generation_config,
)

print("✅ Response generated successfully!")

# Show the raw model output framed by 60-character rules.
rule = "=" * 60
print("\n" + rule, "EXTRACTION RESULT:", rule, response, rule, sep="\n")

In [None]:
# Cell 8
# Persist the raw model response as UTF-8 text; the relative path resolves
# against the current working directory and any existing file is overwritten.
output_path = Path("internvl3_grouped_bank_statement_output.txt")
output_path.write_text(response, encoding="utf-8")

# Confirm where the file went and how large it is on disk.
print(f"✅ Response saved to: {output_path}")
print(f"📁 File size: {output_path.stat().st_size} bytes")