# Debug Form Detection Issues

This notebook investigates why example forms are not being detected as forms when compared against themselves.

In [None]:
# Setup
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from transformers import CLIPProcessor, CLIPModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
import fitz  # PyMuPDF
import pandas as pd

In [None]:
# Load the CLIP checkpoint together with its matching preprocessor
model_name = "openai/clip-vit-large-patch14-336"
clip_model = CLIPModel.from_pretrained(model_name)
clip_processor = CLIPProcessor.from_pretrained(model_name)

# Backend preference: CUDA first, then Apple MPS, otherwise plain CPU
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"
device = torch.device(device_name)

# Move the weights onto the selected device and switch to inference mode
clip_model = clip_model.to(device)
clip_model.eval()

print(f"Model loaded on {device}")

In [None]:
# Test 1: Extract features from a single example form
example_forms_dir = "../../data/raw/_exampleforms/"
# os.listdir() returns entries in arbitrary order; sort so that "the first
# example" (and the "first 5" used by later cells) is the same on every run.
example_files = sorted(f for f in os.listdir(example_forms_dir) if f.endswith('.pdf'))
if not example_files:
    # Fail with a clear message instead of an IndexError on example_files[0]
    raise FileNotFoundError(f"No PDF files found in {example_forms_dir}")
test_file = example_files[0]

print(f"Testing with: {test_file}")

# Load the test image: render page 0 of the PDF into a PIL image.
# The context manager closes the document even if rendering raises.
pdf_path = os.path.join(example_forms_dir, test_file)
with fitz.open(pdf_path) as pdf:
    page = pdf[0]
    pix = page.get_pixmap()
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

# Extract features
inputs = clip_processor(images=img, return_tensors="pt").to(device)
with torch.no_grad():
    features1 = clip_model.get_image_features(**inputs)
features1 = features1.cpu().numpy()

print(f"Features shape: {features1.shape}")
print(f"Features norm before normalization: {np.linalg.norm(features1)}")

# Normalize each row to unit L2 length
features1_norm = features1 / np.linalg.norm(features1, axis=1, keepdims=True)
print(f"Features norm after normalization: {np.linalg.norm(features1_norm)}")

In [None]:
# Test 2: Compare the same image with itself
# Render page 0 of the same PDF a second time, independently of Test 1
pdf = fitz.open(pdf_path)
page = pdf[0]
pix = page.get_pixmap()
page_size = [pix.width, pix.height]
img2 = Image.frombytes("RGB", page_size, pix.samples)
pdf.close()

# Embed the second rendering and scale it to unit length
inputs2 = clip_processor(images=img2, return_tensors="pt").to(device)
with torch.no_grad():
    features2 = clip_model.get_image_features(**inputs2).cpu().numpy()
    row_norms = np.linalg.norm(features2, axis=1, keepdims=True)
    features2_norm = features2 / row_norms

# Cosine similarity between the two embeddings of the identical page
pairwise = cosine_similarity(features1_norm, features2_norm)
similarity = pairwise[0][0]
print(f"\nSimilarity of the same image with itself: {similarity:.6f}")
print(f"This should be 1.0 or very close to 1.0")

In [None]:
# Test 3: Load all example features as done in the main notebook
example_features = []

for example_file in example_files[:5]:  # Test with first 5
    example_path = os.path.join(example_forms_dir, example_file)

    try:
        # The context manager closes the document even when rendering fails;
        # since errors are caught below and the loop continues, a plain
        # open()/close() pair would leak the handle on every failure.
        with fitz.open(example_path) as pdf:
            page = pdf[0]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

        # Extract features and normalize each row to unit L2 length
        inputs = clip_processor(images=img, return_tensors="pt").to(device)
        with torch.no_grad():
            features = clip_model.get_image_features(**inputs)
        features = features.cpu().numpy()
        features = features / np.linalg.norm(features, axis=1, keepdims=True)
        example_features.append(features)

        print(f"Loaded {example_file}: shape {features.shape}")

    except Exception as e:
        # Best effort: report and skip the file. NOTE: a skipped file means
        # example_features[k] no longer corresponds to example_files[k].
        print(f"Error loading {example_file}: {e}")

print(f"\nTotal example features loaded: {len(example_features)}")

In [None]:
# Test 4: Check similarity between examples
print("Similarity matrix between first 5 examples:")
n_examples = min(5, len(example_features))
similarity_matrix = np.zeros((n_examples, n_examples))

# Fill every (row, col) cell with the pairwise cosine similarity
for row, col in np.ndindex(n_examples, n_examples):
    pair_sim = cosine_similarity(example_features[row], example_features[col])
    similarity_matrix[row, col] = pair_sim[0][0]

# Wrap in a labelled DataFrame so the printed table is easy to read
axis_labels = [f"Ex{k}" for k in range(n_examples)]
sim_df = pd.DataFrame(similarity_matrix, index=axis_labels, columns=axis_labels)
print(sim_df.round(3))
print("\nDiagonal should be 1.0 (same image with itself)")

In [None]:
# Test 5: Replicate the exact detection function
def detect_form_visual_clip_debug(image, clip_model, clip_processor, device,
                                 positive_features=None, negative_features=None,
                                 similarity_threshold=0.7, negative_threshold=0.7):
    """
    Debug version of the CLIP form detector with extra logging.

    Embeds ``image`` with CLIP, L2-normalizes the embedding and compares it
    with every reference embedding in ``positive_features`` using cosine
    similarity. The image is reported as a form when the best similarity is
    strictly greater than ``similarity_threshold``.

    Args:
        image: Image handed to ``clip_processor(images=...)``.
        clip_model: Model exposing ``get_image_features(**inputs)``.
        clip_processor: Callable that builds the model inputs; its result
            must support ``.to(device)``.
        device: Device the processed inputs are moved to.
        positive_features: Reference embeddings. May be an iterable of
            arrays (1-D or 2-D, the first row is used) or a single 2-D array
            with one embedding per row. ``None`` or empty means "no
            references", in which case the image is never reported as a form.
        negative_features: Accepted for signature compatibility; currently
            ignored so the positive path can be debugged in isolation.
        similarity_threshold: Strict lower bound on the best positive
            similarity for the image to count as a form.
        negative_threshold: Accepted for signature compatibility; unused.

    Returns:
        dict with keys ``is_form`` (plain ``bool``), ``confidence`` (left at
        its initial 0, this debug version never updates it),
        ``max_positive_similarity``, ``max_negative_similarity`` (left at 0),
        ``positive_similarities`` (list of floats) and
        ``negative_similarities`` (left empty). If feature extraction raises,
        the error is printed and the default result is returned.
    """
    result = {
        'is_form': False,
        'confidence': 0,
        'max_positive_similarity': 0,
        'max_negative_similarity': 0,
        'positive_similarities': [],
        'negative_similarities': []
    }

    # Extract visual features from current image
    try:
        inputs = clip_processor(images=image, return_tensors="pt").to(device)

        with torch.no_grad():
            features = clip_model.get_image_features(**inputs)
        features = features.cpu().numpy()
        print(f"  Raw features shape: {features.shape}")
        print(f"  Raw features norm: {np.linalg.norm(features)}")

        # Normalize features
        features = features / np.linalg.norm(features, axis=1, keepdims=True)
        print(f"  Normalized features norm: {np.linalg.norm(features)}")
    except Exception as e:
        # Deliberate best effort: log and fall back to the "not a form" default
        print(f"Error extracting features: {e}")
        return result

    # Materialize the references once. A bare truthiness test would raise
    # ValueError for a stacked ndarray, so check None / length explicitly.
    references = [] if positive_features is None else list(positive_features)

    # Check similarity to positive examples
    if references:
        print(f"  Checking against {len(references)} positive examples")
        for i, pos_feat in enumerate(references):
            # atleast_2d lets 1-D embeddings (e.g. rows of a stacked array)
            # through; float() keeps the result dict free of numpy scalars.
            sim = float(cosine_similarity(features, np.atleast_2d(pos_feat))[0][0])
            result['positive_similarities'].append(sim)
            print(f"    Example {i}: similarity = {sim:.4f}")

        result['max_positive_similarity'] = max(result['positive_similarities'])
        is_like_positive = result['max_positive_similarity'] > similarity_threshold
        print(f"  Max positive similarity: {result['max_positive_similarity']:.4f}")
        print(f"  Is like positive (>{similarity_threshold})? {is_like_positive}")
    else:
        is_like_positive = False
        print("  No positive features provided")

    # For now, ignore negative examples to isolate the issue
    is_not_like_negative = True

    # Decision: must be like positive AND not like negative.
    # bool() guarantees a plain Python bool regardless of the operand types.
    result['is_form'] = bool(is_like_positive and is_not_like_negative)
    print(f"  Final decision: {result['is_form']}")

    return result

In [None]:
# Test 6: Test detection on the first example form
print("Testing detection on the first example form:")
print(f"File: {example_files[0]}\n")

# Render page 0 of the first example PDF into a PIL image
first_example_path = os.path.join(example_forms_dir, example_files[0])
pdf = fitz.open(first_example_path)
page = pdf[0]
pix = page.get_pixmap()
test_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
pdf.close()

# Run the debug detector against the example embeddings (no negatives)
result = detect_form_visual_clip_debug(
    image=test_img,
    clip_model=clip_model,
    clip_processor=clip_processor,
    device=device,
    positive_features=example_features,
    negative_features=None,
    similarity_threshold=0.7,
    negative_threshold=0.7,
)

print(f"\nResult: {result['is_form']}")
print(f"This should be True since we're testing an example against itself!")

In [None]:
# Test 7: Check if the issue is with feature dimensions
print("Checking feature dimensions:")
for idx in range(min(3, len(example_features))):
    feat = example_features[idx]
    # Gather the summary statistics first, then report them
    lowest, highest = feat.min(), feat.max()
    average, spread = feat.mean(), feat.std()
    print(f"Example {idx}: shape = {feat.shape}, dtype = {feat.dtype}")
    print(f"  Min value: {lowest:.4f}, Max value: {highest:.4f}")
    print(f"  Mean: {average:.4f}, Std: {spread:.4f}")
    print()

In [None]:
# Test 8: Try different similarity thresholds
print("Testing different similarity thresholds:\n")

thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
tested_files = example_files[:5]

# The best similarity of each example does not depend on the threshold, so
# render and embed every PDF once instead of once per threshold.
best_similarities = []
for example_file in tested_files:
    # Load example; the context manager closes the document even on error
    with fitz.open(os.path.join(example_forms_dir, example_file)) as pdf:
        page = pdf[0]
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

    # Extract features and normalize each row to unit L2 length
    inputs = clip_processor(images=img, return_tensors="pt").to(device)
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
    features = features.cpu().numpy()
    features = features / np.linalg.norm(features, axis=1, keepdims=True)

    # Best similarity against the reference embeddings, floored at 0
    max_sim = 0
    for pos_feat in example_features:
        sim = cosine_similarity(features, pos_feat)[0][0]
        max_sim = max(max_sim, sim)
    best_similarities.append(max_sim)

for threshold in thresholds:
    detected = sum(1 for best in best_similarities if best > threshold)
    # Report against the number of files actually tested (may be fewer than 5)
    print(f"Threshold {threshold}: {detected}/{len(tested_files)} examples detected")