# Llama 3.2 Vision: Multi-Turn Bank Statement Markdown Extraction

**Protocol**: Extract bank statement tables in markdown format, then filter/analyze via multi-turn conversation

**No LangChain Dependencies** - Pure transformers + Llama multi-turn pattern

In [None]:
from pathlib import Path
import random

import numpy as np
import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

# Set Random Seed for Reproducibility

In [None]:
# Fix the random seed through the project helper so repeated runs of this
# notebook are comparable.
# NOTE(review): which RNGs set_seed() covers (random / numpy / torch) is
# defined in common.reproducibility and is not visible here - confirm there.
from common.reproducibility import set_seed
set_seed(42)
print("‚úÖ Random seed set to 42 for reproducibility")

# Load the model

In [None]:
# Update this path to your local Llama model
# (the commented-out line is an alternative location for the same checkpoint)
# model_id = "/home/jovyan/shared_PTM/Llama-3.2-11B-Vision-Instruct"
model_id = "/home/jovyan/nfs_share/models/Llama-3.2-11B-Vision-Instruct"

print("üîß Loading Llama-3.2-Vision model...")
# Plain transformers loading path, kept for reference only; the project
# loader below is what actually runs.
# model = MllamaForConditionalGeneration.from_pretrained(
#     model_id,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
# )
# processor = AutoProcessor.from_pretrained(model_id)

# Project loader that returns the (model, processor) pair used by every later
# cell. Imported here, next to its only use.
from common.llama_model_loader_robust import load_llama_model_robust

# NOTE(review): the meaning of each keyword (in particular what the loader
# does with max_new_tokens) is defined in common.llama_model_loader_robust -
# confirm there. The generate() calls later pass max_new_tokens explicitly.
model, processor = load_llama_model_robust(
    model_path=model_id,
    use_quantization=False,
    device_map='auto',
    max_new_tokens=2000,
    torch_dtype='bfloat16',
    low_cpu_mem_usage=True,
    verbose=True
)

# Add tie_weights() call
# Best-effort: a failure is reported as a warning and the notebook continues.
# NOTE(review): presumably re-ties shared input/output embeddings after
# loading - confirm this is still needed with the loader above.
try:
    model.tie_weights()
    print("‚úÖ Model weights tied successfully")
except Exception as e:
    print(f"‚ö†Ô∏è tie_weights() warning: {e}")

# Uncomment to display the processor in the cell output:
# processor

# Load the image

In [None]:
# Update this path to your test image
# (the commented-out line is an alternative sample image)
# imageName = "/home/jovyan/shared_PoC_data/evaluation_data/image_009.png"
imageName = "/home/jovyan/nfs_share/tod/LMM_POC/evaluation_data/image_008.png"
print("üìÅ Loading image...")
# Open the bank statement image with PIL; `image` is reused via `images` below.
image = Image.open(imageName)

# CRITICAL: Store as list for multi-turn compatibility
# Every later processor(...) call receives this same list as `images=`.
images = [image]

print(f"‚úÖ Image loaded: {image.size}")
print(f"‚úÖ Images list created with {len(images)} image(s)")

# Multi-Turn Bank Statement Protocol
- Turn 0: Identify actual table headers
- Turn 1: Extract full table using those headers
- Turn 2: Filter using the actual column names found

In [None]:
# TURN 0: Identify Table Headers
# First, identify the actual column headers used in this specific bank statement
# This cell only builds the prompt and the message list; generation happens in
# a later cell.

# Prompt text sent verbatim to the model (runtime data - edit with care).
prompt = """
Look at the transaction table in this bank statement image.

IMPORTANT STRUCTURAL NOTE:
Some bank statements show dates as section headings with multiple transactions underneath.
If you see this structure, remember that each transaction needs its explicit date in the final output.

What are the exact column header names used in the transaction table?

List each column header exactly as it appears, in order from left to right.
Do not interpret or rename them - use the EXACT text from the image.
"""

# Create message structure for Llama
# A single user message whose content is an image placeholder followed by the
# text prompt. The image itself is supplied separately to the processor.
messageDataStructure = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {
                "type": "text",
                "text": prompt,
            },
        ],
    }
]

print("üí¨ TURN 0: Identifying actual table headers")
print("ü§ñ Generating response with Llama-3.2-Vision...")

# Clean Response Function

In [None]:
def clean_llama_response(response: str) -> str:
    """Return only the assistant's reply from a decoded Llama chat transcript.

    The text after the LAST assistant header is returned, cut at the next
    ``<|eot_id|>`` if there is one, with surrounding whitespace stripped.
    Using the last header means a multi-turn transcript yields the most
    recent reply. If the end marker is missing (for example when generation
    stopped at the token limit), everything after the header is returned
    instead of the whole transcript.

    Args:
        response: Decoded model output, possibly including chat-template
            markers and the prompt.

    Returns:
        The cleaned assistant text, or ``response.strip()`` when no assistant
        header is present.

    Note: This function is kept for backwards compatibility, but when using
    the proper multi-turn pattern (trimming generate_ids), it's not needed.
    """
    start_marker = "<|start_header_id|>assistant<|end_header_id|>"
    end_marker = "<|eot_id|>"

    # rpartition splits on the last occurrence; `found` is empty when the
    # marker is absent.
    _, found, tail = response.rpartition(start_marker)
    if not found:
        return response.strip()

    # partition returns the whole tail when the end marker is absent, so a
    # truncated reply is still returned without the prompt and header.
    reply, _, _ = tail.partition(end_marker)
    return reply.strip()

# Process the prompt

In [None]:
# Process the input using the CORRECT multi-turn pattern
# Based on: https://medium.com/data-science/chat-with-your-images-using-multimodal-llms-60af003e8bfa

# Render the message list into chat-template text; add_generation_prompt=True
# leaves the assistant turn open for the model to complete.
textInput = processor.apply_chat_template(
    messageDataStructure, add_generation_prompt=True
)

# CRITICAL: Use named parameter 'images=' with list
inputs = processor(images=images, text=textInput, return_tensors="pt").to(model.device)

# Generate response with deterministic parameters (sampling disabled).
# NOTE(review): temperature/top_p are set to None presumably to silence the
# "sampling flags set but do_sample=False" warning - confirm.
output = model.generate(
    **inputs,
    max_new_tokens=2000,
    do_sample=False,
    temperature=None,
    top_p=None,
)

# CRITICAL: Trim input tokens from output (this is the key to clean responses!)
# Only the prompt is sliced off. The end-of-turn marker is removed by
# skip_special_tokens instead of by dropping the last token, because a response
# that stops at max_new_tokens does not end in <|eot_id|> and an unconditional
# [:-1] would cut off its last real token.
prompt_length = inputs['input_ids'].shape[1]
generate_ids = output[:, prompt_length:]
cleanedOutput = processor.decode(
    generate_ids[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

print("‚úÖ Response generated successfully!")
print("\n" + "=" * 60)
print("TURN 0 - IDENTIFIED TABLE HEADERS:")
print("=" * 60)
print(cleanedOutput)
print("=" * 60)

# CRITICAL: Parse the identified headers for use in subsequent turns
# Extract column names from the response: one candidate per non-blank line.
header_lines = [line.strip() for line in cleanedOutput.split('\n') if line.strip()]
identified_headers = []

# Characters stripped from the start of each line: list numbering, dashes,
# bullets, asterisks and spaces. The bullet is written as an escape so it
# survives re-encoding of this file (it was previously stored as mis-decoded
# bytes, so real bullets were never stripped).
LIST_MARKER_CHARS = '0123456789.-\u2022* '

# Look for numbered list or bullet points
for line in header_lines:
    # Remove common list markers
    cleaned = line.lstrip(LIST_MARKER_CHARS).strip()
    if len(cleaned) > 2:  # Ignore empty and very short strings
        identified_headers.append(cleaned)

print(f"\nüìã Parsed {len(identified_headers)} column headers:")
for i, header in enumerate(identified_headers, 1):
    print(f"  {i}. '{header}'")

# Store headers for use in subsequent turns
table_headers = identified_headers

# Save the raw model response (not the parsed list) for later inspection
output_path = Path("llama_table_headers.txt")
with output_path.open("w", encoding="utf-8") as text_file:
    text_file.write(cleanedOutput)

print(f"\n‚úÖ Table headers saved to: {output_path}")
print("üí° These LITERAL header names will be used in Turn 1 & 2 prompts")

## Multi-Turn Conversation Support

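Llama supports multi-turn conversations by maintaining a conversation history list: each user prompt and each assistant reply is appended to the list, and the whole list is passed through the chat template on every turn.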

## Pattern Matching: Map Generic Concepts to Actual Headers

Different bank statements use different column names. Use pattern matching to identify:
- Which header represents **Date**
- Which header represents **Description/Details**  
- Which header represents **Debit/Withdrawal**

In [None]:
# Pattern Matching: Map extracted headers to generic concepts
# This handles variety in bank statement column naming conventions

# Pattern keywords for each concept (in priority order)
# All keywords are lowercase because match_header compares them against
# lowercased headers; earlier entries in a list are tried first.
DATE_PATTERNS = ['date', 'day', 'transaction date', 'trans date']
DESCRIPTION_PATTERNS = [
    'description', 'details', 'transaction details', 'trans details',
    'particulars', 'narrative', 'transaction', 'trans'
]
DEBIT_PATTERNS = ['debit', 'withdrawal', 'withdrawals', 'paid', 'paid out', 'spent', 'dr']
CREDIT_PATTERNS = ['credit', 'deposit', 'deposits', 'received', 'cr']
BALANCE_PATTERNS = ['balance', 'bal', 'running balance']

def _starts_word(text, pattern):
    """Return True if *pattern* occurs in *text* at the start of a word.

    An occurrence counts when it sits at index 0 or is preceded by a
    non-alphanumeric character.
    """
    position = text.find(pattern)
    while position != -1:
        if position == 0 or not text[position - 1].isalnum():
            return True
        position = text.find(pattern, position + 1)
    return False


def match_header(headers, patterns, fallback=None):
    """Pick the header that best matches a list of lowercase keywords.

    Matching is case-insensitive and runs in two passes, each trying the
    patterns in the order given (earlier patterns have priority):

    1. the header, stripped of surrounding whitespace, equals the pattern;
    2. the pattern occurs in the header at the start of a word.

    The second pass requires a word start so that short abbreviations do not
    match inside unrelated words: 'cr' must not select "Description" and 'dr'
    must not select "Address", while 'debit' still selects "Debits ($)" and
    'date' still selects "Transaction Date".

    Args:
        headers: Header strings as extracted from the statement.
        patterns: Lowercase keywords in priority order.
        fallback: Value returned when nothing matches.

    Returns:
        The matching element of *headers* in its original form, or *fallback*.
    """
    normalized = [header.strip().lower() for header in headers]

    # Pass 1: exact match
    for pattern in patterns:
        for header, candidate in zip(headers, normalized):
            if pattern == candidate:
                return header

    # Pass 2: keyword found at a word start inside the header
    for pattern in patterns:
        for header, candidate in zip(headers, normalized):
            if _starts_word(candidate, pattern):
                return header

    return fallback

# Perform pattern matching on extracted headers
# Date and description fall back to the first / second extracted header when
# no keyword matches; debit, credit and balance fall back to generic names,
# which may not be present in this statement's table.
date_col = match_header(table_headers, DATE_PATTERNS, fallback=table_headers[0] if table_headers else 'Date')
desc_col = match_header(table_headers, DESCRIPTION_PATTERNS, fallback=table_headers[1] if len(table_headers) > 1 else 'Description')
debit_col = match_header(table_headers, DEBIT_PATTERNS, fallback='Debit')
credit_col = match_header(table_headers, CREDIT_PATTERNS, fallback='Credit')
balance_col = match_header(table_headers, BALANCE_PATTERNS, fallback='Balance')

# Report the mapping so a wrong match can be spotted before Turn 1 and Turn 2.
print("=" * 60)
print("PATTERN MATCHING RESULTS:")
print("=" * 60)
print(f"üìã Extracted Headers: {table_headers}")
print(f"\nüîç Mapped Columns:")
print(f"  Date       ‚Üí '{date_col}'")
print(f"  Description ‚Üí '{desc_col}'")
print(f"  Debit      ‚Üí '{debit_col}'")
print(f"  Credit     ‚Üí '{credit_col}'")
print(f"  Balance    ‚Üí '{balance_col}'")
print("=" * 60)
print("\n‚úÖ These literal column names will be used in Turn 1 and Turn 2")
print("üí° Adjust patterns above if matching fails for your bank statement format")

## üîë Key Multi-Turn Pattern for Llama 3.2 Vision

This notebook uses the **multi-turn conversation pattern** described in the Medium article:
[Chat with Your Images Using Llama 3.2-Vision Multimodal LLMs](https://medium.com/data-science/chat-with-your-images-using-multimodal-llms-60af003e8bfa)

### Critical Requirements:

1. **Images as List**: `images = [image]` (not just `image`)
2. **Named Parameter**: `processor(images=images, text=text, ...)` (not positional args)
3. **Trim Prompt Tokens**: decode only the tokens generated after the prompt, i.e. slice `output` (the result of `model.generate`) starting at index `inputs['input_ids'].shape[1]`
4. **Same Images Every Turn**: Pass the same `images` list for all turns

### Message Structure:

- **Turn 1**: `{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "..."}]}`
- **Turn 2+**: `{"role": "user", "content": [{"type": "text", "text": "..."}]}` (no image in content)
- **Assistant**: `{"role": "assistant", "content": [{"type": "text", "text": "..."}]}`

The model attends to the image only in the first turn, but the processor needs the images list for all turns because the chat template contains the `<|image|>` token.

In [None]:
# Store conversation history for multi-turn support
# Initialize with first exchange
# list.copy() is shallow: appending to conversation_history does not grow
# messageDataStructure, but the message dicts themselves are shared.
conversation_history = messageDataStructure.copy()

# Add assistant's response to history
# Assistant turns use the same content-list shape as user turns.
conversation_history.append({
    "role": "assistant",
    "content": [{"type": "text", "text": cleanedOutput}]
})

print("‚úÖ Conversation history initialized")
print(f"üìä Current conversation has {len(conversation_history)} messages (1 user + 1 assistant)")
print(f"üí° Pattern: Using working multi-turn approach from Medium article")

### Debug: View Conversation Context

This cell helps you see what's being sent to the model:

In [None]:
# Optional: Debug conversation structure
# Prints one header line per message, then one line per content part: text
# parts are previewed (first 100 characters, "..." appended when longer),
# any other part is shown by its type only.
divider = "=" * 60

print("üîç Current conversation structure:")
print(divider)
for position, message in enumerate(conversation_history, start=1):
    print(f"\nMessage {position} ({message['role']}):")
    for part in message['content']:
        kind = part['type']
        if kind != 'text':
            print(f"  [{kind}]")
            continue
        shown = part['text']
        if len(shown) > 100:
            shown = shown[:100] + "..."
        print(f"  [text]: {shown}")
print(divider)

### TURN 1: Extract Full Table in Markdown

Now that we know the actual column headers, extract the complete table:

In [None]:
# TURN 1: Extract Full Table with LITERAL Header Names
# Using the WORKING pattern from: https://medium.com/data-science/chat-with-your-images-using-multimodal-llms-60af003e8bfa

# Build the header string using LITERAL names from Turn 0
header_string = " | ".join(table_headers)

# Prompt text sent verbatim to the model (runtime data - edit with care).
# The separator row gets one "---" cell per header.
follow_up_prompt = f"""
Now extract the entire transaction table from the bank statement in markdown format.

Use these EXACT column headers in this order:
{header_string}

Format requirements:
- Standard markdown table syntax with | delimiters
- Header row: | {header_string} |
- Separator row: | {" | ".join(["---"] * len(table_headers))} |

CRITICAL EXTRACTION RULES:
1. Extract EVERY transaction as a separate row
2. Each transaction MUST have its explicit date in the date column
3. If multiple transactions share a date heading, repeat that date for each transaction row
4. Do NOT skip or combine any rows
5. Keep all amounts with decimal values intact
6. Do NOT add explanatory text - only output the markdown table

Example: If you see:
  01/06/2024
    Transaction A    $100
    Transaction B    $50
    
Output as TWO rows:
  | 01/06/2024 | Transaction A | $100 | ... |
  | 01/06/2024 | Transaction B | $50  | ... |
"""

# Append user's follow-up to conversation history (text only - NO image in content)
conversation_history.append({
    "role": "user",
    "content": [{"type": "text", "text": follow_up_prompt}]
})

print(f"üí¨ TURN 1: Extract full markdown table")
print(f"üìã Using literal headers: {table_headers}")
print("ü§ñ Generating follow-up response with Llama-3.2-Vision...")

# Process with updated conversation history
textInput = processor.apply_chat_template(
    conversation_history, add_generation_prompt=True
)

# CRITICAL: Use named parameter 'images=' and pass the SAME images list
inputs = processor(images=images, text=textInput, return_tensors="pt").to(model.device)

# Generate response (sampling disabled, so output is deterministic)
output = model.generate(
    **inputs,
    max_new_tokens=2000,
    do_sample=False,
    temperature=None,
    top_p=None,
)

# CRITICAL: Trim input tokens from output
# Only the prompt is sliced off. The end-of-turn marker is removed by
# skip_special_tokens instead of by dropping the last token: a long table that
# stops at max_new_tokens does not end in <|eot_id|>, and an unconditional
# [:-1] would cut off its last real token.
prompt_length = inputs['input_ids'].shape[1]
generate_ids = output[:, prompt_length:]
cleanedOutput2 = processor.decode(
    generate_ids[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

print("\n‚úÖ Follow-up response generated successfully!")
print("\n" + "=" * 60)
print("TURN 1 - FULL MARKDOWN TABLE:")
print("=" * 60)
print(cleanedOutput2)
print("=" * 60)

# Save the markdown table
output_path = Path("llama_markdown_table_extraction.txt")
with output_path.open("w", encoding="utf-8") as text_file:
    text_file.write(cleanedOutput2)

print(f"\n‚úÖ Markdown table saved to: {output_path}")

# Update conversation history with assistant's response
conversation_history.append({
    "role": "assistant",
    "content": [{"type": "text", "text": cleanedOutput2}]
})

print(f"\nüìä Conversation now has {len(conversation_history)} messages")
print("üí° Each transaction has explicit date, even if grouped by date heading")

### TURN 2: Filter Using Actual Column Names

Filter the extracted table using the specific column names identified in Turn 0:

In [None]:
# TURN 2: Filter using LITERAL column names from pattern matching

# Prompt text sent verbatim to the model (runtime data - edit with care).
# It asks for a three-column table restricted to rows with a debit value.
follow_up_prompt_3 = f"""
From the markdown table you just extracted, create a filtered version showing ONLY withdrawal/debit transactions.

Use these EXACT column names:
- {date_col}
- {desc_col}  
- {debit_col}

Filter rules:
- Only include rows where '{debit_col}' has a value (not empty)
- Exclude credit/deposit transactions
- Keep the markdown table format with header: | {date_col} | {desc_col} | {debit_col} |

Output only the filtered markdown table.
"""

# Append user's follow-up to conversation history
conversation_history.append({
    "role": "user",
    "content": [{"type": "text", "text": follow_up_prompt_3}]
})

print(f"üí¨ TURN 2: Filter using literal column names")
print(f"üìã Filter columns: '{date_col}' | '{desc_col}' | '{debit_col}'")
print("ü§ñ Generating follow-up response with Llama-3.2-Vision...")

# Process with updated conversation history
textInput = processor.apply_chat_template(
    conversation_history, add_generation_prompt=True
)

# Use named parameter 'images=' and pass the SAME images list
inputs = processor(images=images, text=textInput, return_tensors="pt").to(model.device)

# Generate response (sampling disabled, so output is deterministic)
output = model.generate(
    **inputs,
    max_new_tokens=2000,
    do_sample=False,
    temperature=None,
    top_p=None,
)

# Trim input tokens from output
# Only the prompt is sliced off. The end-of-turn marker is removed by
# skip_special_tokens instead of by dropping the last token: a response that
# stops at max_new_tokens does not end in <|eot_id|>, and an unconditional
# [:-1] would cut off its last real token.
prompt_length = inputs['input_ids'].shape[1]
generate_ids = output[:, prompt_length:]
cleanedOutput3 = processor.decode(
    generate_ids[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

print("\n‚úÖ Follow-up response generated successfully!")
print("\n" + "=" * 60)
print("TURN 2 - FILTERED WITHDRAWALS:")
print("=" * 60)
print(cleanedOutput3)
print("=" * 60)

# Save filtered results
output_path = Path("llama_filtered_withdrawals.txt")
with output_path.open("w", encoding="utf-8") as text_file:
    text_file.write(cleanedOutput3)

print(f"\n‚úÖ Filtered table saved to: {output_path}")

# Update conversation history with assistant's response
conversation_history.append({
    "role": "assistant",
    "content": [{"type": "text", "text": cleanedOutput3}]
})

print(f"\nüìä Conversation now has {len(conversation_history)} messages")
print(f"\n‚úÖ Complete Protocol:")
print(f"   Turn 0: Identify headers ‚Üí {table_headers}")
print(f"   Pattern Match: Date='{date_col}', Desc='{desc_col}', Debit='{debit_col}'")
print(f"   Turn 1: Extract with all headers")
print(f"   Turn 2: Filter using '{debit_col}' column")
print("\n‚úÖ No langchain dependencies - production ready!")

### Save Multi-Turn Conversation

In [None]:
# Save the entire conversation to a file
output_path = Path("llama_multiturn_conversation.txt")


def _render_conversation(history):
    """Yield the transcript text piece by piece, in file order.

    Layout: a banner, then for every message a ruled "MESSAGE n - ROLE"
    heading followed by its text parts (image parts become "[IMAGE]", other
    part types are omitted), then a footer with the message count.
    """
    banner = "=" * 60 + "\n"
    rule = "-" * 60

    yield banner
    yield "MULTI-TURN CONVERSATION WITH LLAMA-3.2-VISION\n"
    yield banner + "\n"

    for number, message in enumerate(history, start=1):
        yield f"\n{rule}\n"
        yield f"MESSAGE {number} - {message['role'].upper()}\n"
        yield f"{rule}\n\n"

        for part in message["content"]:
            kind = part["type"]
            if kind == "text":
                yield part["text"] + "\n"
            elif kind == "image":
                yield "[IMAGE]\n"

    yield "\n" + banner
    yield f"Total messages: {len(history)}\n"
    yield banner


# writelines consumes the generator lazily, so pieces are written in order as
# they are produced.
with output_path.open("w", encoding="utf-8") as text_file:
    text_file.writelines(_render_conversation(conversation_history))

print(f"‚úÖ Full conversation saved to: {output_path}")
print(f"üìä File size: {output_path.stat().st_size} bytes")
print(f"üí¨ Total messages in conversation: {len(conversation_history)}")