In [None]:
# Cell 1
from pathlib import Path
import random

import numpy as np
import matplotlib.pyplot as plt

import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

In [None]:
# Cell 2
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed unchanged to every generator (default 42).

    Returns:
        None.
    """
    # CPU-side generators first: stdlib `random`, NumPy's global RNG, PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing more to do on a machine without CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed once up front so the rest of the notebook starts from a known state.
set_seed(42)
print("✅ Random seed set to 42 for reproducibility")

In [None]:
# Cell 3
# Local directory holding the Llama-3.2-11B-Vision-Instruct checkpoint.
model_id = "/home/jovyan/shared_PTM/Llama-3.2-11B-Vision-Instruct"

print("🔧 Loading Llama-3.2-Vision model...")
# Weights are loaded in bfloat16; device_map="auto" leaves device placement
# to the library instead of pinning the model to one device here.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# The processor is loaded from the same checkpoint directory as the model.
processor = AutoProcessor.from_pretrained(model_id)

# Report memory use per visible CUDA device after loading, in units of 1e9
# bytes. The loop body never runs when no CUDA device is visible.
for i in range(torch.cuda.device_count()):
    allocated = torch.cuda.memory_allocated(i) / 1e9
    reserved = torch.cuda.memory_reserved(i) / 1e9
    # Single braces so the values are interpolated; doubled braces would
    # print the literal text "{allocated:.2f}" instead of the numbers.
    print(f"    GPU [{i}]: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

In [None]:
# Cell 4
def clean_llama_response(response: str) -> str:
    """Remove chat template artifacts and extract only the assistant's response.

    Args:
        response: Decoded model output, which may still contain the prompt and
            the Llama chat-template special tokens.

    Returns:
        The stripped text between the first assistant header and the next
        ``<|eot_id|>`` marker.

        If the assistant header is present but no ``<|eot_id|>`` follows it
        (for example because generation stopped at the token limit), the text
        after the header is returned instead of the whole input, so the prompt
        never leaks into the result.

        If there is no assistant header at all, the whole input is returned.
        In both fallback cases ``***`` sequences are removed and the result is
        stripped.
    """
    start_marker = "<|start_header_id|>assistant<|end_header_id|>"
    end_marker = "<|eot_id|>"

    _, header, after_header = response.partition(start_marker)
    if not header:
        # No chat template found: clean the raw text as-is.
        return response.replace("***", "").strip()

    answer, terminator, _ = after_header.partition(end_marker)
    if terminator:
        return answer.strip()

    # Header found but the answer was never terminated: keep only the
    # assistant's (partial) text rather than falling back to the full input.
    return answer.replace("***", "").strip()

In [None]:
# Cell 5
# Image the extraction is run on (hard-coded absolute path).
imageName = "/home/jovyan/_LMM_POC/evaluation_data/image_008.png"

print("📂 Loading image...")
# `image` is the PIL image object that a later cell passes to the processor.
image = Image.open(imageName)
print(f"✅ Image loaded: {image.size}")

In [None]:
# Cell 6
# Hard-coded extraction prompt for a flat, five-column transaction table.
# The column names the prompt tells the model to expect are
# "Date", "Description", "Withdrawal", "Deposit" and "Balance".
prompt_text = """
You are an expert document analyzer specializing in bank statement extraction.
Extract structured data from this flat table bank statement for taxpayer expense claims.

CONVERSATION PROTOCOL:
- Start your response immediately with "DOCUMENT_TYPE: BANK_STATEMENT"
- Do NOT include conversational text like "I'll extract..." or "Based on the document..."
- Do NOT use bullet points, numbered lists, asterisks, or markdown formatting (no **, no ##, no 1., no -)
- Output ONLY the structured extraction data below
- End immediately after "TRANSACTION_AMOUNTS_PAID:" with no additional text
- NO explanations, NO comments, NO additional text

CRITICAL:
- The transaction table in the image has a "Date", a "Description", a "Withdrawal", a "Deposit" and a "Balance" column
- Specifically, it has a "Date" column, a "Description" column, a "Withdrawal" column, a "Deposit" column and a "Balance" column

ANTI-HALLUCINATION RULES:
- YOU MUST NOT GUESS values you are unsure of
- Rows may have missing values
- Rows NEVER HAVE REPEATED AMOUNTS, SO YOU MUST NOT REPEAT VALUES THAT YOU ARE UNSURE OF
- If a value is unclear or missing, use "NOT_FOUND" instead of guessing

STEP 1:
- Extract the Transaction Table formatted as markdown.

STEP 2:
- Extract the earliest and latest date in the "Date" column from the extracted Transaction Table in STEP 1
- Format as STATEMENT_DATE_RANGE: [ First date in "Date" column - Last date in "Date" column ]

STEP 3:
- Extract the "Date" column from the extracted Transaction Table in STEP 1
- Format as TRANSACTION_DATES: [ All "Date" column dates, each separated by " | " ] on a single line

STEP 4:
- Extract the "Description" column from the extracted Transaction Table in STEP 1
- Format as LINE_ITEM_DESCRIPTIONS: [ All "Description" column descriptions, each separated by " | " ] on a single line

STEP 5:
- Extract the "Withdrawal" column from the extracted Transaction Table in STEP 1, replacing missing values with "NOT_FOUND".
- Format as TRANSACTION_AMOUNTS_PAID: [ All "Withdrawal" column amounts each separated by " | " ] on a single line
"""

In [None]:
# # Optional (disabled): load YAML and generate prompt_text dynamically instead of using the hard-coded prompt above
# import yaml
# import sys
# sys.path.insert(0, '/home/jovyan/nfs_share/tod/LMM_POC')
# from common.header_mapping import map_headers_smart, generate_flat_table_prompt

# # Assume headers were extracted earlier
# headers = "Date | Description | Withdrawal | Deposit | Balance"
# mapping = map_headers_smart(headers)

# with open('/home/jovyan/nfs_share/tod/LMM_POC/prompts/flat_table_extraction.yaml') as f:
#     config = yaml.safe_load(f)

# prompt_text = generate_flat_table_prompt(mapping, headers, config)

In [None]:
# Cell 7
# Build the single-turn chat payload for Llama: one user message whose content
# is an image placeholder followed by the text prompt. The image itself is not
# embedded here; only its slot in the message is declared.
image_part = {"type": "image"}
text_part = {"type": "text", "text": prompt_text}
messageDataStructure = [{"role": "user", "content": [image_part, text_part]}]

print(f"📝 Prompt: {prompt_text}")
print("🤖 Generating response with Llama-3.2-Vision...")

In [None]:
# Cell 8
# Render the chat messages into the model's prompt string, ending with the
# assistant header so that generation continues as the assistant.
textInput = processor.apply_chat_template(
    messageDataStructure, add_generation_prompt=True
)
# The rendered chat template already contains the special tokens (including
# the beginning-of-text token), so the processor must not add them a second
# time; add_special_tokens=False follows the official usage example.
inputs = processor(
    image,
    textInput,
    add_special_tokens=False,
    return_tensors="pt",
).to(model.device)

# Generate response with deterministic parameters: greedy decoding
# (do_sample=False), with the sampling-only settings explicitly unset.
output = model.generate(
    **inputs,
    max_new_tokens=4000,
    do_sample=False,
    temperature=None,
    top_p=None,
)
# Decode the first (only) sequence; special tokens are kept in the text so
# that the cleaning step can locate the assistant's answer by its markers.
generatedOutput = processor.decode(output[0])

# Clean the response to remove chat and markdown artifacts
cleanedOutput = clean_llama_response(generatedOutput)
# Remove "**" markdown bold markers. The pattern is the two asterisks only;
# a pattern with a leading quote character would match just the sequence
# quote-asterisk-asterisk and leave ordinary bold markers in place.
# NOTE: this drops every literal "**", including any that are part of the
# statement text itself.
cleanedOutput = cleanedOutput.replace("**", "")

print("✅ Response generated successfully!")
print("\n" + "=" * 60)
print("CLEANED EXTRACTION:")
print("=" * 60)
print(cleanedOutput)
print("=" * 60)

# Persist the cleaned extraction as UTF-8 text; the relative path resolves
# against the current working directory.
output_path = Path("llama_grouped_bank_statement_output.txt")
output_path.write_text(cleanedOutput, encoding="utf-8")

print(f"✅ Response saved to: {output_path}")
print(f"📁 File size: {output_path.stat().st_size} bytes")