In [None]:
# Cell 1
from pathlib import Path
import random

import numpy as np
import matplotlib.pyplot as plt

import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

In [None]:
# Cell 2
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, NumPy's legacy global generator and
    PyTorch; the CUDA generators are seeded as well when CUDA is available.

    Args:
        seed: Integer seed handed to each generator (default 42).
    """
    seeders = [random.seed, np.random.seed, torch.manual_seed]
    if torch.cuda.is_available():
        seeders.append(torch.cuda.manual_seed_all)

    for apply_seed in seeders:
        apply_seed(seed)


# Seed once, before anything else runs in the notebook.
set_seed(42)
print("✅ Random seed set to 42 for reproducibility")

In [None]:
# Cell 3
# Alternative model location, kept commented out for reference:
# model_id = "/home/jovyan/shared_PTM/Llama-3.2-11B-Vision-Instruct"
model_id = "/home/jovyan/nfs_share/models/Llama-3.2-11B-Vision-Instruct"

print("🔧 Loading Llama-3.2-Vision model...")
# bfloat16 weights, with device_map="auto" deciding where the weights are placed.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# processor
processor = AutoProcessor.from_pretrained(model_id)

# Report per-GPU memory after loading. The loop body does not run when
# torch.cuda.device_count() is 0.
for i in range(torch.cuda.device_count()):
    allocated = torch.cuda.memory_allocated(i) / 1e9
    reserved = torch.cuda.memory_reserved(i) / 1e9
    # Single braces so the values are interpolated; doubled braces would print
    # the literal text "{allocated:.2f}".
    print(f"    GPU [{i}]: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

In [None]:
# Cell 4
import re


def clean_llama_response(response: str) -> str:
    """Remove chat template artifacts and extract only the assistant's response.

    Args:
        response: Decoded model output, possibly still containing the prompt
            and the Llama chat-template header/end-of-turn tokens.

    Returns:
        - The stripped text between the first assistant header and the next
          ``<|eot_id|>`` when both are present.
        - The stripped text after the assistant header, with ``***`` removed,
          when the header is present but ``<|eot_id|>`` is not (for example a
          generation cut off by the token limit). This keeps the prompt and
          template tokens out of the result.
        - The whole input, stripped and with ``***`` removed, when there is no
          assistant header.
    """
    start_marker = "<|start_header_id|>assistant<|end_header_id|>"
    end_marker = "<|eot_id|>"

    start_idx = response.find(start_marker)
    if start_idx == -1:
        # No template markers at all: treat the input as the bare answer.
        return response.replace("***", "").strip()

    answer = response[start_idx + len(start_marker):]
    end_idx = answer.find(end_marker)
    if end_idx != -1:
        return answer[:end_idx].strip()

    # Header found but no end-of-turn token: keep only what follows the header
    # rather than falling back to the full prompt + answer.
    return answer.replace("***", "").strip()


def clean_markdown_table(markdown_text: str) -> str:
    """
    Replace empty cells in markdown table with NOT_FOUND.

    Handles patterns like:
    - "|  |" → "| NOT_FOUND |"
    - "| |"  → "| NOT_FOUND |"
    - "|   |" → "| NOT_FOUND |"
    - "|  |  |" → "| NOT_FOUND | NOT_FOUND |" (adjacent empty cells)

    Only processes lines containing a pipe, skips header separator lines
    (| --- | --- |). All other lines are returned unchanged.

    Args:
        markdown_text: Text that may contain a markdown table.

    Returns:
        The same text with every whitespace-only cell filled with NOT_FOUND.
    """
    cleaned_lines = []

    for line in markdown_text.split('\n'):
        # Header separator lines (like "| --- | --- | --- |") and lines without
        # any pipe are passed through untouched.
        if re.match(r'^\|\s*-+\s*\|', line) or '|' not in line:
            cleaned_lines.append(line)
            continue

        # A pipe followed by whitespace and then another pipe is an empty cell.
        # The closing pipe is matched with a lookahead so it is not consumed and
        # can also open the next cell; consuming it would skip every second
        # cell in a run of adjacent empty cells.
        cleaned_lines.append(re.sub(r'\|\s+(?=\|)', '| NOT_FOUND ', line))

    return '\n'.join(cleaned_lines)


In [None]:
# Cell 5
# Alternative image location, kept commented out for reference:
# imageName = "/home/jovyan/_LMM_POC/evaluation_data/image_008.png"
imageName = "/home/jovyan/nfs_share/tod/LMM_POC/evaluation_data/image_008.png"

print("📂 Loading image...")
# `image` is a notebook-level name; the inference cell passes it to the processor.
image = Image.open(imageName)
# Print the size PIL reports, as a quick check that the expected file was opened.
print(f"✅ Image loaded: {image.size}")

In [None]:
# Cell 6
# Earlier, more prescriptive prompt for a flat 5-column
# ["Date", "Description", "Withdrawal", "Deposit", "Balance"] transaction table.
# Kept commented out for reference; it is not used by the code below.
# prompt_text = """
# You are an expert document analyzer specializing in bank statement extraction.
# Extract structured data from this flat table bank statement for taxpayer expense claims.

# CONVERSATION PROTOCOL:
# - Start your response immediately with "DOCUMENT_TYPE: BANK_STATEMENT"
# - Do NOT include conversational text like "I'll extract..." or "Based on the document..."
# - Do NOT use bullet points, numbered lists, asterisks, or markdown formatting (no **, no ##, no 1., no -)
# - Output ONLY the structured extraction data below
# - End immediately after "TRANSACTION_AMOUNTS_PAID:" with no additional text
# - NO explanations, NO comments, NO additional text

# CRITICAL:
# - The transaction table in the image has a "Date", a "Description", a "Withdrawal", a "Deposit" and a "Balance" column
# - Specifically, it has a "Date" column, a "Description" column, a "Withdrawal" column, a "Deposit" column and a "Balance" column

# ANTI-HALLUCINATION RULES:
# - YOU MUST NOT GUESS values you are unsure of
# - Rows may have missing values
# - Rows NEVER HAVE REPEATED AMOUNTS, SO YOU MUST NOT REPEAT VALUES THAT YOU ARE UNSURE OF
# - If a value is unclear or missing, use "NOT_FOUND" instead of guessing

# STEP 1:
# - Extract the Transaction Table formatted as markdown.

# """

# Active prompt: asks the model only to extract the transaction table as markdown.
prompt_text = """
You are an expert document analyzer specializing in bank statement extraction.

Step 1
  - Extract the Transaction Table formatted as markdown.
"""

In [None]:
# Cell 7
# Create message structure for Llama chat template:
# a single user turn holding one image placeholder followed by the text prompt.
messageDataStructure = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt_text},
        ],
    }
]

# Process the input
textInput = processor.apply_chat_template(
    messageDataStructure, add_generation_prompt=True
)
# The chat template has already produced the special tokens, so the processor is
# told not to add them again (this mirrors the Mllama usage example).
inputs = processor(
    image,
    textInput,
    add_special_tokens=False,
    return_tensors="pt",
).to(model.device)

# Generate response with deterministic parameters:
# greedy decoding, with the sampling-only settings explicitly unset.
output = model.generate(
    **inputs,
    max_new_tokens=4000,
    do_sample=False,
    temperature=None,
    top_p=None,
)
# The decoded text still contains the prompt; clean_llama_response extracts the answer.
generatedOutput = processor.decode(output[0])

# Clean the response to remove chat and markdown artifacts
cleanedOutput = clean_llama_response(generatedOutput)
# Remove "**" markdown bold markers. The pattern is the two asterisks only,
# not a quote followed by two asterisks.
cleanedOutput = cleanedOutput.replace("**", "")

# Clean markdown table: replace empty cells with NOT_FOUND
if '|' in cleanedOutput:  # Check if it contains a table
    cleanedOutput = clean_markdown_table(cleanedOutput)
    print("✅ Empty cells replaced with NOT_FOUND")

print("✅ Response generated successfully!")
print("\n" + "=" * 60)
print("CLEANED EXTRACTION:")
print("=" * 60)
print(cleanedOutput)
print("=" * 60)

# Save the cleaned response to a file
output_path = Path("llama_grouped_bank_statement_output.txt")

with output_path.open("w", encoding="utf-8") as text_file:
    text_file.write(cleanedOutput)

print(f"✅ Response saved to: {output_path}")
print(f"📁 File size: {output_path.stat().st_size} bytes")

In [None]:
# Cell 8
# Transform markdown table into structured extraction format
import re

# Header separator row. The optional colons also accept alignment-style
# separators such as "| :--- | ---: |".
SEPARATOR_ROW = re.compile(r'^\|\s*:?-+:?\s*\|')
MIN_COLUMNS = 5  # Date, Description, Withdrawal, Deposit, Balance


def _split_table_row(row: str) -> list:
    """Split one markdown table row into cells without shifting columns.

    Only the single leading and trailing pipe are removed. A blank interior
    cell is kept in position and reported as "NOT_FOUND", so the column
    indexes stay aligned with the table header.
    """
    if row.startswith('|'):
        row = row[1:]
    if row.endswith('|'):
        row = row[:-1]
    return [cell.strip() or "NOT_FOUND" for cell in row.split('|')]


# Parse markdown table from cleanedOutput
lines = cleanedOutput.split('\n')

# Find table rows (skip header and separator lines)
table_rows = []
found_separator = False

for line in lines:
    line = line.strip()
    if not line or '|' not in line:
        continue

    # Skip separator line (| --- | --- |)
    if SEPARATOR_ROW.match(line):
        found_separator = True
        continue

    # Only process rows after separator; everything before it is the header.
    if found_separator:
        cells = _split_table_row(line)
        if len(cells) >= MIN_COLUMNS:
            table_rows.append(cells)

print(f"📊 Parsed {len(table_rows)} transaction rows from markdown table\n")

# Extract columns (0=Date, 1=Description, 2=Withdrawal)
dates = [row[0] for row in table_rows]
descriptions = [row[1] for row in table_rows]
withdrawals = [row[2] for row in table_rows]

# STEP 2: Extract date range
if dates:
    first_date = dates[0]
    last_date = dates[-1]
    date_range = f"STATEMENT_DATE_RANGE: [ {first_date} - {last_date} ]"
else:
    date_range = "STATEMENT_DATE_RANGE: [ NOT_FOUND ]"

# STEP 3-5: When no rows were parsed, each field reports NOT_FOUND, the same
# way the date range does, instead of an empty "[  ]".

# STEP 3: Extract all dates
dates_str = " | ".join(dates) if dates else "NOT_FOUND"
transaction_dates = f"TRANSACTION_DATES: [ {dates_str} ]"

# STEP 4: Extract all descriptions
descriptions_str = " | ".join(descriptions) if descriptions else "NOT_FOUND"
line_item_descriptions = f"LINE_ITEM_DESCRIPTIONS: [ {descriptions_str} ]"

# STEP 5: Extract all withdrawals
withdrawals_str = " | ".join(withdrawals) if withdrawals else "NOT_FOUND"
transaction_amounts = f"TRANSACTION_AMOUNTS_PAID: [ {withdrawals_str} ]"

# Build the structured output once; the same text is displayed and saved.
structured_output = f"""DOCUMENT_TYPE: BANK_STATEMENT

{date_range}

{transaction_dates}

{line_item_descriptions}

{transaction_amounts}
"""

# Display structured output
print("=" * 60)
print("STRUCTURED EXTRACTION OUTPUT:")
print("=" * 60)
# The text already ends with a newline, so print must not add another one.
print(structured_output, end="")
print("=" * 60)

# Save structured output
structured_path = Path("llama_structured_extraction_output.txt")
with structured_path.open("w", encoding="utf-8") as f:
    f.write(structured_output)

print(f"\n✅ Structured output saved to: {structured_path}")
print(f"📁 File size: {structured_path.stat().st_size} bytes")