In [1]:
import os
from PIL import Image
import torch
from transformers import LayoutLMv3Processor
import pandas as pd  # For nice display of results

# Prevent tokenizer warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Setup paths
DEMO_PATH = "demo_documents"  # Directory containing sample documents

In [2]:
def load_and_process_document(image_path, processor):
    """Load and process a document with LayoutLMv3 processor"""
    # Load image
    image = Image.open(image_path).convert("RGB")
    
    # Process with LayoutLMv3
    encoding = processor(
        image,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )
    
    return encoding, image

def get_words_and_boxes(encoding, processor):
    """Extract words and their bounding boxes from processor output"""
    # Get tokens and boxes
    tokens = processor.tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
    boxes = encoding['bbox'][0].tolist()
    
    # Pair tokens with their boxes
    words_and_boxes = []
    current_word = []
    current_box = None
    
    for token, box in zip(tokens, boxes):
        # Skip special tokens
        if token in ['<s>', '</s>', '<pad>']:
            continue
            
        # Clean up token
        if token.startswith('Ġ'):
            # New word starts
            if current_word:
                words_and_boxes.append({
                    'word': ''.join(current_word),
                    'box': current_box
                })
            current_word = [token.replace('Ġ', '')]
            current_box = box
        else:
            # Continue current word
            current_word.append(token)
    
    # Add last word if exists
    if current_word:
        words_and_boxes.append({
            'word': ''.join(current_word),
            'box': current_box
        })
    
    return words_and_boxes

In [3]:
def find_amount_candidates(words_and_boxes):
    """Find potential amount values in the document"""
    amount_patterns = [
        lambda x: x.replace('.', '').replace(',', '').isdigit(),  # Numbers
        lambda x: x.startswith('CHF'),  # Swiss Francs
        lambda x: x.startswith('EUR'),  # Euros
        lambda x: x.startswith('$')     # Dollars
    ]
    
    candidates = []
    for item in words_and_boxes:
        word = item['word']
        box = item['box']
        
        # Check if word matches any pattern
        for pattern in amount_patterns:
            if pattern(word):
                candidates.append({
                    'value': word,
                    'box': box,
                    'type': 'amount'
                })
                break
    
    return candidates

def find_vendor_candidates(words_and_boxes):
    """Find potential vendor information"""
    vendor_keywords = ["Rechnung", "Invoice", "Von:", "From:", "Vendor:", "Company:"]
    
    candidates = []
    for idx, item in enumerate(words_and_boxes):
        word = item['word']
        box = item['box']
        
        # Check for keywords
        if word in vendor_keywords:
            # Look at next few words as potential vendor name
            next_words = words_and_boxes[idx+1:idx+4]
            candidates.append({
                'keyword': word,
                'box': box,
                'following_words': next_words,
                'type': 'vendor'
            })
    
    return candidates

In [5]:
def analyze_document(image_path, processor):
    """Analyze a document and display results"""
    # Process document
    encoding, image = load_and_process_document(image_path, processor)
    words_and_boxes = get_words_and_boxes(encoding, processor)
    
    # Find information
    amounts = find_amount_candidates(words_and_boxes)
    vendors = find_vendor_candidates(words_and_boxes)
    
    # Display results
    print(f"\nAnalyzing document: {os.path.basename(image_path)}")
    print("\nPotential Amounts:")
    for amount in amounts:
        print(f"- {amount['value']} (position: {amount['box']})")
    
    print("\nPotential Vendor Information:")
    for vendor in vendors:
        following = ' '.join([w['word'] for w in vendor['following_words']])
        print(f"- Found keyword '{vendor['keyword']}' followed by: {following}")
    
    return {
        'words_and_boxes': words_and_boxes,
        'amounts': amounts,
        'vendors': vendors
    }

# Initialize processor
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")



In [6]:
# First, let's list available documents
def list_demo_documents():
    """List all PNG files in demo directory"""
    if not os.path.exists(DEMO_PATH):
        print(f"Demo directory {DEMO_PATH} does not exist!")
        return []
        
    documents = [f for f in os.listdir(DEMO_PATH) if f.endswith('.png')]
    print("\nAvailable documents:")
    for idx, doc in enumerate(documents):
        print(f"{idx}: {doc}")
    return documents

# Demo usage
documents = list_demo_documents()
if documents:
    # Use first document as example
    image_path = os.path.join(DEMO_PATH, documents[0])
    results = analyze_document(image_path, processor)
else:
    print("No PNG documents found in demo directory!")


Available documents:
0: 15014330_Shiva_Siegen_320000220000492023_1.png
1: 15031152_Topmech_320000220010442023_1.png
2: 50001213_KSU_A-Technik_320000220006912023_1.png

Analyzing document: 15014330_Shiva_Siegen_320000220000492023_1.png

Potential Amounts:
- 134.506 (position: [454, 72, 583, 95])
- 1 (position: [273, 224, 281, 234])
- 8408 (position: [105, 246, 148, 256])
- 3. (position: [169, 353, 182, 363])
- 2023 (position: [249, 353, 292, 363])
- 33..731.906 (position: [142, 416, 242, 426])
- 03.01.2023 (position: [254, 491, 349, 501])
- 2 (position: [220, 566, 228, 576])
- 490.00 (position: [343, 565, 404, 576])
- 0076 (position: [386, 716, 429, 727])
- 1016 (position: [435, 716, 478, 727])
- 1184 (position: [484, 716, 528, 726])
- 4349 (position: [533, 715, 576, 726])
- 7 (position: [583, 716, 592, 726])
- 25.02.2023 (position: [207, 817, 302, 828])
- 6, (position: [835, 73, 845, 82])
- 5400 (position: [850, 73, 881, 81])
- 056 (position: [814, 86, 838, 94])
- 222 (position: [844,

In [7]:
def find_amount_candidates(words_and_boxes):
    """Find potential monetary amounts using context and patterns"""
    
    # Helper function to check if string matches money pattern
    def is_money_pattern(s):
        # Matches patterns like: 490.00, 1'234.56, 1,234.56
        import re
        money_pattern = r'^\d{1,3}([\',]\d{3})*(\.\d{2})?$'
        return bool(re.match(money_pattern, s))
    
    # Keywords that indicate monetary amounts
    amount_contexts = {
        'preceding': ['CHF', 'Fr.', 'EUR', 'USD', 'Total', 'Summe', 'Betrag'],
        'following': ['CHF', 'Fr.', 'EUR', 'USD']
    }
    
    candidates = []
    for idx, current in enumerate(words_and_boxes):
        word = current['word']
        box = current['box']
        
        # Skip if it's a postal code (typically 4 digits in Switzerland)
        if len(word) == 4 and word.isdigit():
            continue
            
        # Skip if it's a date (contains common date separators)
        if '.' in word and any(c.isdigit() for c in word) and len(word) > 4:
            # Check if it looks like a date (e.g., 25.02.2023)
            if sum(c == '.' for c in word) >= 1:
                continue
        
        # Check preceding word for context
        preceding_word = words_and_boxes[idx-1]['word'] if idx > 0 else ''
        following_word = words_and_boxes[idx+1]['word'] if idx < len(words_and_boxes)-1 else ''
        
        is_monetary = False
        context = None
        
        # Check if current word is a number and has context
        if is_money_pattern(word):
            # Check preceding context
            if preceding_word in amount_contexts['preceding']:
                is_monetary = True
                context = f"Preceded by {preceding_word}"
            
            # Check following context
            elif following_word in amount_contexts['following']:
                is_monetary = True
                context = f"Followed by {following_word}"
            
            # Check if it's a "round" amount ending in .00
            elif word.endswith('.00'):
                is_monetary = True
                context = "Ends with .00"
        
        # Add if it looks like a monetary amount
        if is_monetary:
            candidates.append({
                'value': word,
                'box': box,
                'context': context,
                'preceding_word': preceding_word,
                'following_word': following_word
            })
    
    return candidates

# Update the display in analyze_document function
def analyze_document(image_path, processor):
    """Analyze a document and display results"""
    # Process document
    encoding, image = load_and_process_document(image_path, processor)
    words_and_boxes = get_words_and_boxes(encoding, processor)
    
    # Find information
    amounts = find_amount_candidates(words_and_boxes)
    vendors = find_vendor_candidates(words_and_boxes)
    
    # Display results
    print(f"\nAnalyzing document: {os.path.basename(image_path)}")
    print("\nPotential Monetary Amounts:")
    for amount in amounts:
        print(f"- {amount['value']} ({amount['context']})")
        print(f"  Context: {amount['preceding_word']} {amount['value']} {amount['following_word']}")
        print(f"  Position: {amount['box']}")
    
    print("\nPotential Vendor Information:")
    for vendor in vendors:
        following = ' '.join([w['word'] for w in vendor['following_words']])
        print(f"- Found keyword '{vendor['keyword']}' followed by: {following}")
    
    return {
        'words_and_boxes': words_and_boxes,
        'amounts': amounts,
        'vendors': vendors
    }

In [8]:
# First, let's list available documents
def list_demo_documents():
    """List all PNG files in demo directory"""
    if not os.path.exists(DEMO_PATH):
        print(f"Demo directory {DEMO_PATH} does not exist!")
        return []
        
    documents = [f for f in os.listdir(DEMO_PATH) if f.endswith('.png')]
    print("\nAvailable documents:")
    for idx, doc in enumerate(documents):
        print(f"{idx}: {doc}")
    return documents

# Demo usage
documents = list_demo_documents()
if documents:
    # Use first document as example
    image_path = os.path.join(DEMO_PATH, documents[0])
    results = analyze_document(image_path, processor)
else:
    print("No PNG documents found in demo directory!")


Available documents:
0: 15014330_Shiva_Siegen_320000220000492023_1.png
1: 15031152_Topmech_320000220010442023_1.png
2: 50001213_KSU_A-Technik_320000220006912023_1.png

Analyzing document: 15014330_Shiva_Siegen_320000220000492023_1.png

Potential Monetary Amounts:

Potential Vendor Information:
- Found keyword 'Rechnung' followed by: PIN 33..731.906 Land:
