# Llama Two Independent Prompts Test

This notebook tests what `llama_batch.ipynb` does: running TWO SEPARATE prompts on the SAME image.
This is NOT multi-turn — each prompt is sent as its own single-turn conversation, with no history shared between them.

In [1]:
from pathlib import Path
import random

import numpy as np
import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

def set_seed(seed=42):
    """Seed the random number generators used in this notebook.

    Applies the same seed to Python's ``random`` module, NumPy's global RNG
    and PyTorch (``torch.manual_seed``). When CUDA is available, every CUDA
    device is seeded as well.

    Args:
        seed: Integer seed handed to each generator (default 42).
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # Only touch the CUDA generators when a GPU is actually present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed once, before any model work, so repeated runs start from the same RNG state.
set_seed(seed=42)
print("✅ Random seed set")

✅ Random seed set


In [2]:
# Load the vision-language model and its matching processor from one checkpoint directory.
model_id = "/home/jovyan/nfs_share/models/Llama-3.2-11B-Vision-Instruct"
print("🔧 Loading model...")
# The two loads are independent of each other; both read from `model_id`.
processor = AutoProcessor.from_pretrained(model_id)
# Weights are requested in bfloat16; device placement is left to device_map="auto".
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("✅ Model loaded")

🔧 Loading model...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Model loaded


In [3]:
# Open the single test image; both prompt cells below reuse this same object.
imageName = "/home/jovyan/nfs_share/tod/LMM_POC/evaluation_data/image_008.png"
image = Image.open(fp=imageName)
# Report the dimensions as a quick sanity check that the file was readable.
print(f"✅ Image loaded: {image.size}")

✅ Image loaded: (900, 1320)


In [4]:
def clean_llama_response(response: str) -> str:
    """Extract the assistant's reply from a decoded Llama chat transcript.

    The decoded output of ``generate`` contains the prompt followed by the
    newly generated text. This helper returns only the text after the LAST
    assistant header, cut at the first ``<|eot_id|>`` that follows it.

    Edge cases:
      * No assistant header present: the whole input is returned, stripped.
      * Assistant header present but no closing ``<|eot_id|>`` (for example
        when generation stops at ``max_new_tokens``): everything after the
        header is returned, rather than the whole transcript with the prompt.
      * Several assistant headers (a transcript that includes history): the
        reply after the final header is returned, since that is the turn
        that was just generated.

    Args:
        response: Decoded model output, special tokens included.

    Returns:
        The assistant reply with surrounding whitespace removed.
    """
    start_marker = "<|start_header_id|>assistant<|end_header_id|>"
    end_marker = "<|eot_id|>"

    # Search from the right: the freshly generated turn follows the final
    # assistant header, any earlier headers belong to conversation history.
    header_idx = response.rfind(start_marker)
    if header_idx == -1:
        return response.strip()

    reply = response[header_idx + len(start_marker):]
    # partition() leaves `reply` whole when the end marker is absent, so a
    # truncated generation still yields the partial answer.
    reply, _, _ = reply.partition(end_marker)
    return reply.strip()

## Prompt 1: Document Type (Independent Query)

In [5]:
# First independent prompt: a single-turn conversation holding the image
# placeholder and one question.
prompt1 = "What type of document is this?"

messages1 = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": prompt1}
    ]
}]

print(f"💬 Prompt 1: {prompt1}")
textInput1 = processor.apply_chat_template(messages1, add_generation_prompt=True)
# The templated text already starts with the begin-of-text token (see the
# model card usage example), so add_special_tokens=False stops the tokenizer
# from prepending a second one.
inputs1 = processor(
    image, textInput1, add_special_tokens=False, return_tensors="pt"
).to(model.device)

# do_sample=False requests greedy decoding; at most 500 new tokens are generated.
output1 = model.generate(**inputs1, max_new_tokens=500, do_sample=False)
# The decoded sequence includes the prompt; clean_llama_response keeps only the reply.
response1 = clean_llama_response(processor.decode(output1[0]))

print("\n" + "="*60)
print("RESPONSE 1:")
print("="*60)
print(response1)
print("="*60)

💬 Prompt 1: What type of document is this?





RESPONSE 1:
This document is a bank statement, specifically a Commonwealth Bank statement, detailing the account holder's transactions from August 8, 2025, to September 7, 2025.


## Prompt 2: Transaction Count (Independent Query - Same Image)

In [6]:
# Second independent prompt (NOT continuing the conversation from prompt 1):
# the message list is built from scratch and holds only this question.
prompt2 = "How many transaction entries are visible on this bank statement?"

messages2 = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": prompt2}
    ]
}]

print(f"💬 Prompt 2: {prompt2}")
textInput2 = processor.apply_chat_template(messages2, add_generation_prompt=True)
# The templated text already starts with the begin-of-text token (see the
# model card usage example), so add_special_tokens=False stops the tokenizer
# from prepending a second one.
inputs2 = processor(
    image, textInput2, add_special_tokens=False, return_tensors="pt"
).to(model.device)

# do_sample=False requests greedy decoding; at most 500 new tokens are generated.
output2 = model.generate(**inputs2, max_new_tokens=500, do_sample=False)
# The decoded sequence includes the prompt; clean_llama_response keeps only the reply.
response2 = clean_llama_response(processor.decode(output2[0]))

print("\n" + "="*60)
print("RESPONSE 2:")
print("="*60)
print(response2)
print("="*60)

💬 Prompt 2: How many transaction entries are visible on this bank statement?

RESPONSE 2:
There are 40 transaction entries visible on this bank statement.


## Analysis

If BOTH prompts get different, relevant responses:
- ✅ The model CAN process the same image multiple times
- ✅ Different prompts produce different outputs
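- ❌ But multi-turn conversation (with history) doesn't work — note that multi-turn is not exercised in this notebook

This would indicate that the problem lies in multi-turn handling (conversation history), not in the basic vision capability.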