In [1]:
import os
from collections import defaultdict

import numpy as np
import torch
import torch.nn.functional as F
from sklearn.cluster import DBSCAN

def calculate_semantic_groups(token_info, eps=0.03, min_samples=2, page_size=1000):
    """
    Cluster tokens by box-center proximity and compute embedding similarities.

    Only the normalized box centers are fed to DBSCAN; the embeddings are used
    solely for the returned cosine-similarity matrix and do not influence the
    cluster assignment.

    Args:
        token_info: Sequence of dicts. Each entry must provide 'embedding'
            (assumed to be a 1-D float tensor of a common length - confirm
            against the producer) and 'box' (four numbers; indices 0/2 are
            averaged for x and indices 1/3 for y).
        eps: DBSCAN neighbourhood radius, in the same units as the
            normalized centers.
        min_samples: DBSCAN minimum number of neighbours for a core point.
        page_size: Divisor applied to both center coordinates. The default
            of 1000 presumes boxes on a 0-1000 scale - TODO confirm.

    Returns:
        A ``(groups, similarity_matrix)`` tuple. ``groups`` is a
        ``defaultdict(list)`` mapping each DBSCAN label to the token dicts in
        that cluster (noise points, label -1, are dropped).
        ``similarity_matrix`` is an N x N tensor of pairwise cosine
        similarities. For empty input both are empty.
    """
    # torch.stack and DBSCAN both reject empty input, so short-circuit here.
    if not token_info:
        return defaultdict(list), torch.empty((0, 0))

    embeddings = torch.stack([info['embedding'] for info in token_info])

    # Normalize each row once and take a matrix product. This yields the same
    # cosine similarities as broadcasting (N, 1, D) against (1, N, D) but
    # avoids materializing an N x N x D intermediate.
    unit_embeddings = F.normalize(embeddings, p=2, dim=1, eps=1e-8)
    similarity_matrix = unit_embeddings @ unit_embeddings.T

    # Spatial features: the center of every box, scaled down by page_size.
    features = []
    for info in token_info:
        box = info['box']
        x_center = (box[0] + box[2]) / 2 / page_size
        y_center = (box[1] + box[3]) / 2 / page_size
        features.append([x_center, y_center])

    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(features)

    # Collect tokens per cluster label, skipping DBSCAN's noise label (-1).
    groups = defaultdict(list)
    for idx, label in enumerate(clustering.labels_):
        if label != -1:
            groups[label].append(token_info[idx])

    return groups, similarity_matrix

# Lower-cased tokens that, together with a digit, mark a group as an amount.
_AMOUNT_KEYWORDS = frozenset({'total', 'summe', 'betrag', 'chf', 'fr'})


def _guess_group_role(tokens, text, avg_x, avg_y):
    """Return the heuristic role line for a group, or None if nothing matches."""
    has_numbers = any(c.isdigit() for t in tokens for c in t['token'])
    has_keywords = any(t['token'].lower() in _AMOUNT_KEYWORDS for t in tokens)

    if has_numbers and has_keywords:
        return "→ Potential amount field"
    if avg_y < 100:  # Near top of page
        return "→ Potential header information"
    if avg_x < 200 and len(text) > 20:  # Left side, long text
        return "→ Potential address block"
    return None


def analyze_document_semantics(image_path, processor, model, device='cpu',
                               similarity_threshold=0.8):
    """
    Print a report of token groups and highly similar token pairs.

    Tokens are obtained from ``process_document_tokens`` and grouped with
    ``calculate_semantic_groups``. For each group the joined text, the mean
    of the first two box coordinates, the mean 'embedding_norm' and an
    optional heuristic role are printed, followed by every token pair whose
    similarity exceeds ``similarity_threshold``.

    Args:
        image_path: Path of the document image; its basename is printed.
        processor: Passed through to ``process_document_tokens``.
        model: Passed through to ``process_document_tokens``.
        device: Passed through to ``process_document_tokens``.
        similarity_threshold: Pairs with a similarity strictly greater than
            this value are listed.

    Returns:
        A ``(groups, similarity_matrix, token_info)`` tuple. When no tokens
        are produced, ``groups`` and ``similarity_matrix`` are empty.
    """
    token_info = process_document_tokens(image_path, processor, model, device)

    # Without tokens there is nothing to stack or cluster; report and stop.
    if not token_info:
        print(f"\nAnalyzing document: {os.path.basename(image_path)}")
        print("\nNo tokens found; skipping semantic analysis.")
        return defaultdict(list), torch.empty((0, 0)), token_info

    groups, similarity_matrix = calculate_semantic_groups(token_info)

    print(f"\nAnalyzing document: {os.path.basename(image_path)}")

    print("\nSemantic Groups Found:")
    for group_id, tokens in groups.items():
        # Position is the mean of box[0] / box[1] (the first two coordinates),
        # not of the box centers.
        avg_x = np.mean([t['box'][0] for t in tokens])
        avg_y = np.mean([t['box'][1] for t in tokens])

        text = ' '.join(t['token'] for t in tokens)
        avg_norm = np.mean([t['embedding_norm'] for t in tokens])

        print(f"\nGroup {group_id}:")
        print(f"Text: {text}")
        print(f"Position: ({avg_x:.0f}, {avg_y:.0f})")
        print(f"Importance (avg norm): {avg_norm:.2f}")

        role = _guess_group_role(tokens, text, avg_x, avg_y)
        if role is not None:
            print(role)

    print("\nStrongly Related Terms:")
    n_tokens = len(token_info)
    # Gather all (i, j) pairs with i < j in one shot instead of indexing the
    # tensor once per pair from Python; triu_indices is ordered by row and
    # then column, matching a nested "for i / for j > i" traversal.
    rows, cols = torch.triu_indices(
        n_tokens, n_tokens, offset=1, device=similarity_matrix.device
    )
    pair_values = similarity_matrix[rows, cols]
    keep = pair_values > similarity_threshold
    for i, j, value in zip(rows[keep].tolist(),
                           cols[keep].tolist(),
                           pair_values[keep].tolist()):
        print(f"{token_info[i]['token']} ↔ {token_info[j]['token']}"
              f" (similarity: {value:.2f})")

    return groups, similarity_matrix, token_info

# Try the semantic analysis
# NOTE(review): list_demo_documents, DEMO_PATH, processor and model are not
# defined in the visible part of this cell; they must already exist in the
# session before this runs (the recorded output shows a NameError for
# list_demo_documents, which suggests the defining cell was not executed).
documents = list_demo_documents()
# Only the first listed document is analyzed; nothing happens when the list
# is empty.
if documents:
    image_path = os.path.join(DEMO_PATH, documents[0])
    # device is left at analyze_document_semantics' default ('cpu').
    groups, similarity_matrix, token_info = analyze_document_semantics(
        image_path, processor, model
    )

NameError: name 'list_demo_documents' is not defined