In [1]:
from pathlib import Path

import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

In [2]:
# Update this path to your local Llama model
model_id = "/home/jovyan/nfs_share/models/Llama-3.2-11B-Vision-Instruct"
# Update this path to your test image
imageName = "/home/jovyan/nfs_share/tod/LMM_POC/evaluation_data/image_003.png"

print("🔧 Loading Llama-3.2-Vision model...")
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

print("📷 Loading image...")
image = Image.open(imageName)
print(f"✅ Image loaded: {image.size}")

🔧 Loading Llama-3.2-Vision model...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

📷 Loading image...
✅ Image loaded: (800, 1200)


In [3]:
# Visual Question Answering - ask a simple question about the image
question = "Explain the structure of the transaction table"

# Create message structure for Llama
messageDataStructure = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {
                "type": "text",
                "text": question,
            },
        ],
    }
]

print(f"❓ Question: {question}")
print("🤖 Generating response with Llama-3.2-Vision...")

❓ Question: Explain the structure of the transaction table
🤖 Generating response with Llama-3.2-Vision...


In [4]:
# Process the input
textInput = processor.apply_chat_template(
    messageDataStructure, add_generation_prompt=True
)
inputs = processor(image, textInput, return_tensors="pt").to(model.device)

# Generate response
output = model.generate(**inputs, max_new_tokens=2000)
generatedOutput = processor.decode(output[0])

print("✅ Response generated successfully!")
print("\n" + "=" * 60)
print("LLAMA-3.2-VISION RESPONSE:")
print("=" * 60)
print(generatedOutput)
print("=" * 60)

✅ Response generated successfully!

LLAMA-3.2-VISION RESPONSE:
<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>Explain the structure of the transaction table<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The image presents a transaction table for the Commonwealth Bank of Australia, detailing financial transactions from August 7, 2025, to September 6, 2025. The table is structured into the following columns:

*   **Date**: Displays the date of each transaction.
*   **Transaction**: Lists the type of transaction, including debit or credit.
*   **Debit**: Shows the amount deducted from the account balance.
*   **Credit**: Shows the amount added to the account balance.
*   **Balance**: Displays the updated account balance after each transaction.

The table provides a clear and organized view of the account's activity during the specified period, allowing users to track their financial transactions and monitor their account balance.<|eot_id|>


In [5]:
# Optional: Save the response to a file
output_path = Path("llama_vqa_output.txt")

with output_path.open("w", encoding="utf-8") as text_file:
    text_file.write(generatedOutput)

print(f"✅ Response saved to: {output_path}")
print(f"📄 File size: {output_path.stat().st_size} bytes")

✅ Response saved to: llama_vqa_output.txt
📄 File size: 935 bytes
