In [1]:
import os
import torch
from PIL import Image
from transformers import LayoutLMv3Processor, LayoutLMv3Model
import pandas as pd

# Prevent tokenizer warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Setup paths
DEMO_PATH = "demo_documents"

# Initialize processor and model
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
model = LayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base")

# Helper function to list documents
def list_demo_documents():
    """List all PNG files in demo directory"""
    if not os.path.exists(DEMO_PATH):
        print(f"Demo directory {DEMO_PATH} does not exist!")
        return []
        
    documents = [f for f in os.listdir(DEMO_PATH) if f.endswith('.png')]
    print("\nAvailable documents:")
    for idx, doc in enumerate(documents):
        print(f"{idx}: {doc}")
    return documents

In [2]:
def process_document_tokens(image_path, processor, model, device='cpu'):
    """
    Process document and get token embeddings and their positions
    """
    # Load and process image
    image = Image.open(image_path).convert("RGB")
    encoding = processor(
        image,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )
    
    # Move to device and get model outputs
    encoding = {k: v.to(device) for k, v in encoding.items()}
    outputs = model(**encoding)
    
    # Get tokens and their embeddings
    tokens = processor.tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
    embeddings = outputs.last_hidden_state[0]  # Remove batch dimension
    boxes = encoding['bbox'][0]
    
    # Create a list of token information
    token_info = []
    for token, embedding, box in zip(tokens, embeddings, boxes):
        # Skip special tokens
        if token in ['<s>', '</s>', '<pad>']:
            continue
            
        # Clean up token (remove tokenizer artifacts)
        clean_token = token.replace('Ġ', '')
        
        token_info.append({
            'token': clean_token,
            'embedding': embedding.detach().cpu(),  # Move back to CPU for inspection
            'box': box.cpu().tolist(),
            'embedding_norm': torch.norm(embedding).item()  # Magnitude of embedding
        })
    
    return token_info

# Example usage
def analyze_token_embeddings(image_path):
    """
    Analyze token embeddings of a document
    """
    # Process document
    token_info = process_document_tokens(image_path, processor, model)
    
    # Create DataFrame for easy viewing
    df = pd.DataFrame([
        {
            'token': info['token'],
            'embedding_norm': info['embedding_norm'],
            'position': f"({info['box'][0]}, {info['box'][1]})",
        }
        for info in token_info
    ])
    
    print(f"\nAnalyzing document: {os.path.basename(image_path)}")
    print("\nToken Information:")
    print(df.head(10))  # Show first 10 tokens
    
    return token_info, df

# Try it on a document
documents = list_demo_documents()
if documents:
    image_path = os.path.join(DEMO_PATH, documents[0])
    token_info, df = analyze_token_embeddings(image_path)


Available documents:
0: 15014330_Shiva_Siegen_320000220000492023_1.png
1: 15031152_Topmech_320000220010442023_1.png
2: 50001213_KSU_A-Technik_320000220006912023_1.png





Analyzing document: 15014330_Shiva_Siegen_320000220000492023_1.png

Token Information:
  token  embedding_norm   position
0     W       19.064436  (334, 59)
1   urd       19.429543  (334, 59)
2     e       19.717766  (334, 59)
3   ver       18.909647  (425, 56)
4    re       19.791399  (425, 56)
5   chn       19.387173  (425, 56)
6    et       19.013855  (425, 56)
7    an       19.598726  (571, 58)
8   Pin       20.020063  (337, 85)
9     :       20.155340  (337, 85)


In [3]:
def analyze_token_embeddings(image_path):
    """
    Enhanced analysis of token embeddings
    """
    token_info = process_document_tokens(image_path, processor, model)
    
    # Reconstruct words and their positions
    current_word = []
    current_position = None
    words = []
    
    for info in token_info:
        token = info['token']
        position = info['box']
        
        # New word starts with capital letter or after punctuation
        if (token[0].isupper() and current_word) or \
           (current_word and current_word[-1]['token'] in '.,;:!?'):
            # Save current word
            words.append({
                'word': ''.join(t['token'] for t in current_word),
                'position': current_position,
                'avg_embedding_norm': sum(t['embedding_norm'] for t in current_word) / len(current_word)
            })
            current_word = []
            current_position = None
        
        current_word.append(info)
        if not current_position:
            current_position = position
    
    # Add last word
    if current_word:
        words.append({
            'word': ''.join(t['token'] for t in current_word),
            'position': current_position,
            'avg_embedding_norm': sum(t['embedding_norm'] for t in current_word) / len(current_word)
        })
    
    # Create DataFrame
    df = pd.DataFrame(words)
    
    print(f"\nAnalyzing document: {os.path.basename(image_path)}")
    print("\nReconstructed Words:")
    print(df.head(15))
    
    # Find interesting patterns
    print("\nTokens with highest embedding norms (might be important):")
    high_norm = sorted(token_info, key=lambda x: x['embedding_norm'], reverse=True)[:5]
    for t in high_norm:
        print(f"Token: {t['token']}, Norm: {t['embedding_norm']:.2f}, Position: {t['box']}")
        
    # Look for number patterns
    print("\nPotential numeric values:")
    numbers = [t for t in token_info if any(c.isdigit() for c in t['token'])]
    for n in numbers[:5]:  # Show first 5
        print(f"Value: {n['token']}, Position: {n['box']}")
    
    return words, df

# Try the enhanced analysis
words, df = analyze_token_embeddings(image_path)




Analyzing document: 15014330_Shiva_Siegen_320000220000492023_1.png

Reconstructed Words:
                   word              position  avg_embedding_norm
0     Wurdeverrechnetan    [334, 59, 416, 74]           19.364068
1                  Pin:   [337, 85, 383, 100]           20.087702
2               22-134.    [396, 79, 444, 95]           19.596047
3             506patum:    [454, 72, 583, 95]           19.137784
4                C4-01.  [337, 109, 441, 130]           19.552668
5                   CS/  [448, 109, 472, 130]           19.571903
6                Visum:  [475, 114, 527, 129]           19.725094
7                 burwi  [531, 105, 570, 127]           17.644155
8                Kanton  [105, 142, 166, 151]           19.870708
9               Ztirich  [172, 141, 226, 151]           20.517123
10  Strassenverkehrsamt  [105, 162, 287, 172]           20.248379
11                 Frau  [105, 183, 143, 193]           20.654235
12               Sandra  [149, 182, 209, 193]       

In [4]:
def analyze_token_embeddings(image_path):
    """
    Further enhanced analysis with better word separation and pattern recognition
    """
    token_info = process_document_tokens(image_path, processor, model)
    
    # Improved word reconstruction
    words = []
    current_word = []
    current_position = None
    
    for info in token_info:
        token = info['token']
        position = info['box']
        
        # Better word separation conditions
        new_word = (
            (token[0].isupper() and current_word) or  # Capital letter
            (current_word and current_word[-1]['token'] in '.,;:!?') or  # Punctuation
            (position != current_position) or  # Position change
            (token in [':', '@', '/', '-'])  # Special characters
        )
        
        if new_word and current_word:
            words.append({
                'word': ' '.join(t['token'] for t in current_word),
                'position': current_position,
                'avg_embedding_norm': sum(t['embedding_norm'] for t in current_word) / len(current_word),
                'x': current_position[0],  # Add x coordinate for alignment analysis
                'y': current_position[1]   # Add y coordinate for vertical position
            })
            current_word = []
        
        current_word.append(info)
        current_position = position
    
    # Add last word
    if current_word:
        words.append({
            'word': ' '.join(t['token'] for t in current_word),
            'position': current_position,
            'avg_embedding_norm': sum(t['embedding_norm'] for t in current_word) / len(current_word),
            'x': current_position[0],
            'y': current_position[1]
        })
    
    # Create DataFrame
    df = pd.DataFrame(words)
    
    print(f"\nAnalyzing document: {os.path.basename(image_path)}")
    
    # Find aligned text blocks (same x coordinate)
    print("\nAligned Text Blocks:")
    x_positions = df.groupby('x')['word'].apply(list)
    for x, words_at_x in x_positions.items():
        if len(words_at_x) > 1:  # Show only if multiple words aligned
            print(f"\nAt x={x}:")
            print('\n'.join(words_at_x))
    
    # Look for amount patterns
    print("\nPotential Amounts:")
    amount_pattern = r'\d+[\.,]\d{2}'
    amounts = df[df['word'].str.contains(amount_pattern, na=False)]
    print(amounts[['word', 'position']].to_string())
    
    # Find high-importance tokens
    print("\nHigh Importance Words:")
    important = df.nlargest(5, 'avg_embedding_norm')
    print(important[['word', 'avg_embedding_norm', 'position']].to_string())
    
    return df

# Try the enhanced analysis
df = analyze_token_embeddings(image_path)




Analyzing document: 15014330_Shiva_Siegen_320000220000492023_1.png

Aligned Text Blocks:

At x=105:
K anton
Str ass en ver ke h rs am t
Fra u
Z ul ass ung
8 408
Bad en ,
Le ist ung s dat um
:
An sp re ch sp art ner
:
Total

At x=106:
PIN
Land
:
U bers etz ung
Sp es en
Bank verb ind ung
:

At x=142:
33 .. 7 31 .
9 06

At x=207:
25 .
02 .
20 23

At x=254:
03 .
01 .
20 23

At x=285:
IB
AN

At x=334:
W urd e
CH 26

At x=337:
Pin
:
pat um
:
C 4
- 01 .

At x=343:
490 .
00

At x=396:
22
-

At x=454:
134 .
506

At x=475:
/
Vis um
:

At x=758:
Shiva
F
Ã© h ren we g
Tele f on
:
Mobile
:

At x=759:
E
-
Mail
:

At x=760:
Fr .
Fr .

At x=788:
180 .
00
00 .
00
180 .
00

At x=807:
sh iva
@ sie gen .
ch

Potential Amounts:
Empty DataFrame
Columns: [word, position]
Index: []

High Importance Words:
       word  avg_embedding_norm              position
99   sh iva           21.960420  [807, 111, 912, 120]
77    Shiva           21.719246    [758, 47, 816, 59]
82    5 400           21.597796    [850, 73,

In [5]:
import torch
import torch.nn.functional as F
import numpy as np
from sklearn.cluster import DBSCAN
from collections import defaultdict

def calculate_semantic_groups(token_info):
    """
    Group tokens based on their embedding similarity and spatial proximity
    """
    # Stack all embeddings
    embeddings = torch.stack([info['embedding'] for info in token_info])
    
    # Calculate cosine similarity matrix
    similarity_matrix = F.cosine_similarity(
        embeddings.unsqueeze(1), 
        embeddings.unsqueeze(0), 
        dim=2
    )
    
    # Create feature matrix combining embeddings and spatial info
    features = []
    for info in token_info:
        # Normalize spatial coordinates to be on similar scale as embeddings
        x_center = (info['box'][0] + info['box'][2]) / 2 / 1000  # Normalize by page width
        y_center = (info['box'][1] + info['box'][3]) / 2 / 1000  # Normalize by page height
        features.append([x_center, y_center])
    
    # Use DBSCAN for clustering
    clustering = DBSCAN(eps=0.03, min_samples=2).fit(features)
    
    # Group tokens by cluster
    groups = defaultdict(list)
    for idx, label in enumerate(clustering.labels_):
        if label != -1:  # -1 represents noise in DBSCAN
            groups[label].append(token_info[idx])
    
    return groups, similarity_matrix

def analyze_document_semantics(image_path, processor, model, device='cpu'):
    """
    Analyze document using semantic grouping
    """
    # Get token information
    token_info = process_document_tokens(image_path, processor, model, device)
    
    # Get semantic groups
    groups, similarity_matrix = calculate_semantic_groups(token_info)
    
    print(f"\nAnalyzing document: {os.path.basename(image_path)}")
    
    # Analyze each semantic group
    print("\nSemantic Groups Found:")
    for group_id, tokens in groups.items():
        # Get average position for the group
        avg_x = np.mean([t['box'][0] for t in tokens])
        avg_y = np.mean([t['box'][1] for t in tokens])
        
        # Reconstruct text from tokens
        text = ' '.join(t['token'] for t in tokens)
        
        # Calculate average embedding norm for the group
        avg_norm = np.mean([t['embedding_norm'] for t in tokens])
        
        print(f"\nGroup {group_id}:")
        print(f"Text: {text}")
        print(f"Position: ({avg_x:.0f}, {avg_y:.0f})")
        print(f"Importance (avg norm): {avg_norm:.2f}")
        
        # Look for specific patterns in the group
        has_numbers = any(any(c.isdigit() for c in t['token']) for t in tokens)
        has_keywords = any(t['token'].lower() in ['total', 'summe', 'betrag', 'chf', 'fr'] for t in tokens)
        
        if has_numbers and has_keywords:
            print("→ Potential amount field")
        elif avg_y < 100:  # Near top of page
            print("→ Potential header information")
        elif avg_x < 200 and len(text) > 20:  # Left side, long text
            print("→ Potential address block")
    
    # Find highly similar token pairs
    print("\nStrongly Related Terms:")
    n_tokens = len(token_info)
    for i in range(n_tokens):
        for j in range(i+1, n_tokens):
            if similarity_matrix[i,j] > 0.8:  # High similarity threshold
                print(f"{token_info[i]['token']} ↔ {token_info[j]['token']}" 
                      f" (similarity: {similarity_matrix[i,j]:.2f})")
    
    return groups, similarity_matrix, token_info

# Try the semantic analysis
documents = list_demo_documents()
if documents:
    image_path = os.path.join(DEMO_PATH, documents[0])
    groups, similarity_matrix, token_info = analyze_document_semantics(
        image_path, processor, model
    )


Available documents:
0: 15014330_Shiva_Siegen_320000220000492023_1.png
1: 15031152_Topmech_320000220010442023_1.png
2: 50001213_KSU_A-Technik_320000220006912023_1.png





Analyzing document: 15014330_Shiva_Siegen_320000220000492023_1.png

Semantic Groups Found:

Group 0:
Text: W urd e
Position: (334, 59)
Importance (avg norm): 19.40
→ Potential header information

Group 1:
Text: ver re chn et
Position: (425, 56)
Importance (avg norm): 19.28
→ Potential header information

Group 2:
Text: Pin :
Position: (337, 85)
Importance (avg norm): 20.09
→ Potential header information

Group 3:
Text: 22 -
Position: (396, 79)
Importance (avg norm): 19.57
→ Potential header information

Group 4:
Text: 134 . 506
Position: (454, 72)
Importance (avg norm): 19.60
→ Potential header information

Group 5:
Text: pat um : C 4 - 01 .
Position: (337, 109)
Importance (avg norm): 19.34

Group 6:
Text: / Vis um :
Position: (475, 114)
Importance (avg norm): 19.82

Group 7:
Text: K anton
Position: (105, 142)
Importance (avg norm): 19.87

Group 8:
Text: Z t ir ich Str ass en ver ke h rs am t Sandra
Position: (127, 157)
Importance (avg norm): 20.28
→ Potential address block

Group 9:


In [6]:
def analyze_semantic_relationships(token_info):
    """Analyze semantic relationships between tokens using embeddings"""
    # Stack all embeddings
    embeddings = torch.stack([info['embedding'] for info in token_info])
    
    # Calculate similarity matrix
    similarity_matrix = F.cosine_similarity(
        embeddings.unsqueeze(1), 
        embeddings.unsqueeze(0), 
        dim=2
    )
    
    # Define semantic anchor points (important concepts in invoices)
    anchors = {
        'amount': ['total', 'summe', 'chf', 'fr.', 'betrag'],
        'date': ['datum', 'date', 'vom', 'per'],
        'vendor': ['firma', 'company', 'von:', 'from:', 'absender'],
        'address': ['strasse', 'street', 'weg', 'plz', 'ort']
    }
    
    # Find tokens most similar to each concept
    semantic_groups = {}
    for concept, keywords in anchors.items():
        # Find tokens containing keywords
        keyword_indices = []
        for idx, info in enumerate(token_info):
            if any(keyword in info['token'].lower() for keyword in keywords):
                keyword_indices.append(idx)
        
        if keyword_indices:
            # Calculate average similarity to keyword tokens
            concept_similarities = similarity_matrix[:, keyword_indices].mean(dim=1)
            
            # Find related tokens (high similarity)
            related_indices = torch.where(concept_similarities > 0.7)[0]
            
            # Group tokens with their similarity scores
            semantic_groups[concept] = [
                {
                    'token': token_info[idx]['token'],
                    'position': token_info[idx]['box'],
                    'similarity': concept_similarities[idx].item(),
                    'embedding_norm': token_info[idx]['embedding_norm']
                }
                for idx in related_indices
            ]
    
    return semantic_groups

def analyze_spatial_relationships(semantic_groups):
    """Analyze spatial relationships between semantic groups"""
    spatial_patterns = {}
    
    for concept, tokens in semantic_groups.items():
        # Calculate average position for the group
        positions = np.array([token['position'] for token in tokens])
        avg_position = positions.mean(axis=0)
        
        # Calculate spatial spread
        position_std = positions.std(axis=0)
        
        # Analyze alignment
        x_coords = positions[:, 0]
        alignments = {
            'left': np.abs(x_coords - x_coords.min()).mean(),
            'right': np.abs(x_coords - x_coords.max()).mean()
        }
        
        spatial_patterns[concept] = {
            'average_position': avg_position,
            'spread': position_std,
            'alignment': 'left' if alignments['left'] < alignments['right'] else 'right'
        }
    
    return spatial_patterns

def extract_document_structure(image_path, processor, model, device='cpu'):
    """Extract structured information from document using semantic understanding"""
    # Get token information
    token_info = process_document_tokens(image_path, processor, model, device)
    
    # Analyze semantic relationships
    semantic_groups = analyze_semantic_relationships(token_info)
    
    # Analyze spatial patterns
    spatial_patterns = analyze_spatial_relationships(semantic_groups)
    
    # Combine semantic and spatial information
    document_structure = {}
    
    for concept, tokens in semantic_groups.items():
        spatial_info = spatial_patterns[concept]
        
        # Sort tokens by similarity and position
        sorted_tokens = sorted(tokens, key=lambda x: (-x['similarity'], x['position'][1]))
        
        # Group nearby tokens
        grouped_tokens = []
        current_group = []
        last_y = None
        
        for token in sorted_tokens:
            current_y = token['position'][1]
            
            if last_y is None or abs(current_y - last_y) < 20:  # Threshold for vertical proximity
                current_group.append(token)
            else:
                if current_group:
                    grouped_tokens.append(current_group)
                current_group = [token]
            last_y = current_y
        
        if current_group:
            grouped_tokens.append(current_group)
        
        # Store structured information
        document_structure[concept] = {
            'groups': grouped_tokens,
            'spatial_pattern': spatial_info,
            'confidence': np.mean([t['similarity'] for t in tokens])
        }
    
    return document_structure

def print_document_analysis(document_structure):
    """Print structured analysis of document"""
    print("\nDocument Structure Analysis:")
    print("=" * 50)
    
    for concept, info in document_structure.items():
        print(f"\n{concept.upper()}")
        print("-" * 30)
        print(f"Confidence: {info['confidence']:.2f}")
        print(f"Alignment: {info['spatial_pattern']['alignment']}")
        
        print("\nContent Groups:")
        for idx, group in enumerate(info['groups']):
            tokens = [t['token'] for t in group]
            similarities = [t['similarity'] for t in group]
            print(f"\nGroup {idx+1} (avg similarity: {np.mean(similarities):.2f}):")
            print(f"Tokens: {' '.join(tokens)}")

# Try the enhanced analysis
documents = list_demo_documents()
if documents:
    image_path = os.path.join(DEMO_PATH, documents[0])
    document_structure = extract_document_structure(image_path, processor, model)
    print_document_analysis(document_structure)


Available documents:
0: 15014330_Shiva_Siegen_320000220000492023_1.png
1: 15031152_Topmech_320000220010442023_1.png
2: 50001213_KSU_A-Technik_320000220006912023_1.png





Document Structure Analysis:

AMOUNT
------------------------------
Confidence: 0.75
Alignment: left

Content Groups:

Group 1 (avg similarity: 1.00):
Tokens: Total

Group 2 (avg similarity: 0.81):
Tokens: Sp

Group 3 (avg similarity: 0.81):
Tokens: Bank

Group 4 (avg similarity: 0.78):
Tokens: U

Group 5 (avg similarity: 0.75):
Tokens: en

Group 6 (avg similarity: 0.75):
Tokens: Land

Group 7 (avg similarity: 0.73):
Tokens: es

Group 8 (avg similarity: 0.73):
Tokens: :

Group 9 (avg similarity: 0.73):
Tokens: Le

Group 10 (avg similarity: 0.73):
Tokens: bers Se etz ung

Group 11 (avg similarity: 0.72):
Tokens: ung

Group 12 (avg similarity: 0.72):
Tokens: An

Group 13 (avg similarity: 0.71):
Tokens: 00

Group 14 (avg similarity: 0.71):
Tokens: :

Group 15 (avg similarity: 0.71):
Tokens: 2

Group 16 (avg similarity: 0.70):
Tokens: au

Group 17 (avg similarity: 0.70):
Tokens: Iran
