In [1]:
from pathlib import Path

import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

In [12]:
# Update this path to your local Llama model
model_id = "/home/jovyan/nfs_share/models/Llama-3.2-11B-Vision-Instruct"

print("🔧 Loading Llama-3.2-Vision model...")
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

print("📷 Loading image...")
image = Image.open(imageName)
print(f"✅ Image loaded: {image.size}")

🔧 Loading Llama-3.2-Vision model...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

📷 Loading image...
✅ Image loaded: (900, 1320)


In [22]:
# Update this path to your test image
imageName = "/home/jovyan/nfs_share/tod/LMM_POC/evaluation_data/image_009.png"

In [39]:
question = """
1. Extract the "Transaction Table" only. 
2. YOU MUST NOT GUESS values you unsure of. 
3. Rows may have missing values
4. Rows NEVER HAVE REPEATED AMOUNTS, SO YOU MUST NOT REPEAT VALUES THAT YOU ARE UNSURE OF.
5. Format the answer as a markdown table.
"""
print(question)


1. Extract the "Transaction Table" only. 
2. YOU MUST NOT GUESS values you unsure of. 
3. Rows may have missing values
4. Rows NEVER HAVE REPEATED AMOUNTS, SO YOU MUST NOT REPEAT VALUES THAT YOU ARE UNSURE OF.
5. Format the answer as a markdown table.



In [40]:
# Visual Question Answering - ask a simple question about the image
# question = "Explain the structure of the transaction table"

# Create message structure for Llama
messageDataStructure = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {
                "type": "text",
                "text": question,
            },
        ],
    }
]

print(f"❓ Question: {question}")
print("🤖 Generating response with Llama-3.2-Vision...")

❓ Question: 
1. Extract the "Transaction Table" only. 
2. YOU MUST NOT GUESS values you unsure of. 
3. Rows may have missing values
4. Rows NEVER HAVE REPEATED AMOUNTS, SO YOU MUST NOT REPEAT VALUES THAT YOU ARE UNSURE OF.
5. Format the answer as a markdown table.

🤖 Generating response with Llama-3.2-Vision...


In [41]:
# Process the input
textInput = processor.apply_chat_template(
    messageDataStructure, add_generation_prompt=True
)
inputs = processor(image, textInput, return_tensors="pt").to(model.device)

# Generate response
output = model.generate(**inputs, max_new_tokens=2000)
generatedOutput = processor.decode(output[0])

print("✅ Response generated successfully!")
print("\n" + "=" * 60)
print("LLAMA-3.2-VISION RESPONSE:")
print("=" * 60)
print(generatedOutput)
print("=" * 60)

✅ Response generated successfully!

LLAMA-3.2-VISION RESPONSE:
<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>
1. Extract the "Transaction Table" only. 
2. YOU MUST NOT GUESS values you unsure of. 
3. Rows may have missing values
4. Rows NEVER HAVE REPEATED AMOUNTS, SO YOU MUST NOT REPEAT VALUES THAT YOU ARE UNSURE OF.
5. Format the answer as a markdown table.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>

| Date | Description | Withdrawal | Deposit | Balance |
| --- | --- | --- | --- | --- |
| 07/09/2025 | EFTPOS Cash Out PRICELINE PHARMACY MACKAY QLD | $322.18 |  | $48890.58 |
| 08/09/2025 | EFTPOS Purchase OFFICEWORKS BUSINESS ROCKHAMPTO... | $64.33 |  | $49212.76 |
| 04/09/2025 | Mortgage Repayment MORT 6103P16754533 NAB | $649.79 |  | $49277.09 |
| 03/09/2025 | OSKO Payment to MIKE CHEN 809845773133 | $1385.43 |  | $49226.68 |
| 03/09/2025 | Direct Debit 15395P41608293 MWF 42730 | $199.17 |  | $51312.57 |
| 02/09/2025 | Salary Paymen

In [5]:
# Optional: Save the response to a file
output_path = Path("llama_vqa_output.txt")

with output_path.open("w", encoding="utf-8") as text_file:
    text_file.write(generatedOutput)

print(f"✅ Response saved to: {output_path}")
print(f"📄 File size: {output_path.stat().st_size} bytes")

✅ Response saved to: llama_vqa_output.txt
📄 File size: 1804 bytes
