# Specific Form Type Detection using Donut

This notebook uses Donut to detect one specific type of form (the type represented by the example PDFs in `_exampleforms`),
not just any form. We use Donut's vision encoder to extract a feature vector for each page and compare pages to the examples by cosine similarity.

In [None]:
# Set environment variables
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

from transformers import DonutProcessor, VisionEncoderDecoderModel
from transformers import ViTModel, ViTImageProcessor
from PIL import Image
import pandas as pd
import fitz  # PyMuPDF
import torch
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import transformers
transformers.logging.set_verbosity_error()

# Paths
pdf_dir = "../../data/raw/_contracts/"
formpage_dir = "../../data/raw/_formpage/"
example_forms_dir = "../../data/raw/_exampleforms/"
nonexample_forms_dir = "../../data/raw/_nonexamples/"

# Extracted form pages are written here later, so make sure the folder exists.
os.makedirs(formpage_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so that slices such as
# pdf_files[:10] select the same documents on every run and on every machine.
pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.endswith('.pdf'))
print(f"Found {len(pdf_files)} PDF files to process")

In [None]:
# Load Donut model
print("Loading Donut model for feature extraction...")

# Base Donut checkpoint (not fine-tuned for any specific task)
model_name = "naver-clova-ix/donut-base"
processor = DonutProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

# Pick the compute backend in order of preference: CUDA, then MPS, then CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

model = model.to(device)
model.eval()  # switch the model to evaluation mode

print(f"Model loaded on {device}")

In [None]:
# Function to extract visual features using Donut's encoder
def extract_donut_features(image, processor, model, device):
    """
    Compute an L2-normalised feature vector for a document image using the
    model's encoder.

    Args:
        image: Input handed unchanged to `processor` (callers in this notebook
            pass PIL images).
        processor: Callable that, given the image and `return_tensors="pt"`,
            returns an object exposing a `pixel_values` tensor.
        model: Object exposing an `encoder` whose output has `last_hidden_state`.
        device: Torch device the pixel tensor is moved to before encoding.

    Returns:
        NumPy array holding the encoder's last hidden state averaged over
        dim 1, with each row scaled to unit L2 norm.
        (Presumably shaped (batch, hidden_dim) -- TODO confirm against the encoder.)
    """
    # Preprocess and move the pixel tensor to the target device
    inputs = processor(image, return_tensors="pt")
    batch = inputs.pixel_values.to(device)

    # Gradients are not needed for feature extraction
    with torch.no_grad():
        hidden = model.encoder(batch).last_hidden_state
        # Mean-pool over dim 1, then hand the result to NumPy
        pooled = hidden.mean(dim=1).cpu().numpy()

    # Scale every row to unit length
    row_norms = np.linalg.norm(pooled, axis=1, keepdims=True)
    return pooled / row_norms

In [None]:
# Load example forms and extract their features
print("Loading example forms (specific type we're looking for)...")

example_features = []  # one feature array per successfully loaded example
example_names = []     # file names, index-aligned with example_features

if os.path.exists(example_forms_dir):
    # Sorted so the example order (and any later slice of it) is reproducible;
    # os.listdir itself returns entries in arbitrary order.
    example_files = sorted(f for f in os.listdir(example_forms_dir) if f.endswith('.pdf'))
    print(f"Found {len(example_files)} example forms of the specific type")
    
    for example_file in tqdm(example_files, desc="Processing examples"):
        try:
            pdf_path = os.path.join(example_forms_dir, example_file)
            # The context manager closes the document even if rendering fails,
            # so a bad file cannot leak an open handle.
            with fitz.open(pdf_path) as pdf:
                pix = pdf[0].get_pixmap()  # First page only
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            
            # Extract features
            features = extract_donut_features(img, processor, model, device)
            example_features.append(features)
            example_names.append(example_file)
            
        except Exception as e:
            # Best effort: report the failing file and continue with the rest
            print(f"Error loading {example_file}: {e}")
    
    print(f"Successfully loaded {len(example_features)} example features")
else:
    print(f"No example forms found at {example_forms_dir}")

In [None]:
# Load non-examples (different types of documents)
print("\nLoading non-examples (other document types)...")

nonexample_features = []  # one feature array per successfully loaded non-example

if os.path.exists(nonexample_forms_dir):
    # Sorted so that the capped subset below is the same on every run;
    # os.listdir itself returns entries in arbitrary order.
    nonexample_files = sorted(f for f in os.listdir(nonexample_forms_dir) if f.endswith('.pdf'))
    print(f"Found {len(nonexample_files)} non-example documents")
    
    # Process first page of each non-example
    for nonexample_file in tqdm(nonexample_files[:10], desc="Processing non-examples"):  # Limit to 10
        try:
            pdf_path = os.path.join(nonexample_forms_dir, nonexample_file)
            # The context manager closes the document even if rendering fails,
            # so a bad file cannot leak an open handle.
            with fitz.open(pdf_path) as pdf:
                # Just use first page of non-examples
                pix = pdf[0].get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            
            features = extract_donut_features(img, processor, model, device)
            nonexample_features.append(features)
            
        except Exception as e:
            # Best effort: report the failing file and continue with the rest
            print(f"Error loading {nonexample_file}: {e}")
    
    print(f"Loaded {len(nonexample_features)} non-example features")

In [None]:
# Analyze similarity between examples
if len(example_features) > 1:
    print("\nAnalyzing similarity between example forms...")

    # Cosine similarity for every unordered pair of examples (i < j)
    n_examples = len(example_features)
    similarities = [
        cosine_similarity(example_features[i], example_features[j])[0][0]
        for i in range(n_examples)
        for j in range(i + 1, n_examples)
    ]

    # Summary statistics over all pairs
    avg_similarity = np.mean(similarities)
    min_similarity = np.min(similarities)
    max_similarity = np.max(similarities)

    print("Example-to-example similarity:")
    print(f"  Average: {avg_similarity:.3f}")
    print(f"  Min: {min_similarity:.3f}")
    print(f"  Max: {max_similarity:.3f}")

    # Suggested threshold: 90% of the lowest similarity seen between two examples
    suggested_threshold = min_similarity * 0.9
    print(f"\nSuggested threshold: {suggested_threshold:.3f}")

In [None]:
# Detection function for specific form type
def detect_specific_form_type(image, processor, model, device,
                             example_features, nonexample_features=None,
                             similarity_threshold=0.7, negative_threshold=0.8):
    """
    Decide whether a page image matches the form type defined by the examples.

    The page's features are compared by cosine similarity against every
    positive example and, when supplied, every non-example.

    Args:
        image: Page image forwarded to `extract_donut_features`.
        processor, model, device: Forwarded to `extract_donut_features`.
        example_features: Iterable of feature arrays for the target form type.
        nonexample_features: Optional collection of feature arrays for other
            document types; when empty or None, no negative check is applied.
        similarity_threshold: The best positive similarity must exceed this.
        negative_threshold: The page is rejected when its best similarity to a
            non-example exceeds this.

    Returns:
        dict with keys 'is_specific_form', 'confidence',
        'max_similarity_to_examples', 'avg_similarity_to_examples' and
        'max_similarity_to_nonexamples'.
    """
    query = extract_donut_features(image, processor, model, device)

    # Similarity of the page to each positive example
    pos_scores = [cosine_similarity(query, ref)[0][0] for ref in example_features]
    if pos_scores:
        max_positive_sim = max(pos_scores)
        avg_positive_sim = np.mean(pos_scores)
    else:
        max_positive_sim = 0
        avg_positive_sim = 0

    # Similarity of the page to each negative example, when any were given
    max_negative_sim = 0
    if nonexample_features:
        neg_scores = [cosine_similarity(query, ref)[0][0] for ref in nonexample_features]
        if neg_scores:
            max_negative_sim = max(neg_scores)

    # Accept only pages close enough to an example and not too close to a non-example
    rejected_by_negatives = nonexample_features and max_negative_sim > negative_threshold
    if rejected_by_negatives:
        is_specific_form = False
    else:
        is_specific_form = max_positive_sim > similarity_threshold

    # Confidence: positive-vs-negative margin mapped from [-1, 1] to [0, 1],
    # or simply the best positive similarity when there are no negatives
    if nonexample_features:
        confidence = ((max_positive_sim - max_negative_sim) + 1) / 2
    else:
        confidence = max_positive_sim

    return {
        'is_specific_form': is_specific_form,
        'confidence': confidence,
        'max_similarity_to_examples': max_positive_sim,
        'avg_similarity_to_examples': avg_positive_sim,
        'max_similarity_to_nonexamples': max_negative_sim
    }

In [None]:
# Test on example forms themselves
print("\nTesting detection on example forms (should all be detected)...")

# Use the data-driven threshold if the similarity-analysis cell produced one;
# otherwise fall back to the detection function's default of 0.7.
threshold = globals().get('suggested_threshold', 0.7)

for idx, example_file in enumerate(example_names[:5]):
    # Load the image; the context manager closes the PDF even if rendering fails
    pdf_path = os.path.join(example_forms_dir, example_file)
    with fitz.open(pdf_path) as pdf:
        pix = pdf[0].get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    
    # Leave-one-out: compare against the OTHER examples only. Comparing an
    # example against a reference set that contains itself always gives a
    # similarity of 1.0, which would make this check pass trivially.
    # example_names and example_features are index-aligned.
    reference_features = example_features[:idx] + example_features[idx + 1:]
    if not reference_features:
        # Only one example is available, so there is nothing to hold out
        reference_features = example_features
    
    # Test detection
    result = detect_specific_form_type(
        img, processor, model, device,
        reference_features, nonexample_features,
        similarity_threshold=threshold
    )
    
    print(f"\n{example_file}:")
    print(f"  Detected: {result['is_specific_form']}")
    print(f"  Max similarity: {result['max_similarity_to_examples']:.3f}")
    print(f"  Confidence: {result['confidence']:.3f}")

In [None]:
# Process PDFs to find specific form type
def process_pdfs_for_specific_form(pdf_files, pdf_dir, formpage_dir,
                                  processor, model, device,
                                  example_features, nonexample_features=None,
                                  similarity_threshold=0.7, max_files=None,
                                  negative_threshold=0.8, early_stop_similarity=0.95):
    """
    Scan PDFs page by page for the specific form type and extract the best match.

    For each file, every page is rendered and scored with
    `detect_specific_form_type`. The detected page with the highest similarity
    to the examples is saved as a single-page PDF in `formpage_dir` under the
    same file name.

    Args:
        pdf_files: File names (relative to `pdf_dir`) to process.
        pdf_dir: Directory containing the input PDFs.
        formpage_dir: Directory the extracted single-page PDFs are written to.
        processor, model, device: Forwarded to `detect_specific_form_type`.
        example_features: Feature arrays for the target form type.
        nonexample_features: Optional feature arrays for other document types.
        similarity_threshold: Minimum similarity to the examples for a match.
        max_files: If truthy, only the first `max_files` entries are processed.
        negative_threshold: Forwarded to `detect_specific_form_type`.
        early_stop_similarity: Stop scanning a document once a detected page
            exceeds this similarity.

    Returns:
        List with one dict per file. Successful files have the keys 'file',
        'total_pages', 'has_specific_form', 'best_match_page' (1-based, or
        None), 'best_similarity' and 'confidence'. Files that raised have only
        'file' and 'error'.
    """
    if max_files:
        pdf_files = pdf_files[:max_files]
    
    results = []
    
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        try:
            pdf_path = os.path.join(pdf_dir, pdf_file)
            
            # The context manager closes the document even when a page fails to
            # render or the extracted page cannot be saved.
            with fitz.open(pdf_path) as pdf_document:
                total_pages = len(pdf_document)
                best_match_page = None
                best_similarity = 0
                best_confidence = 0
                
                # Check each page
                for page_num in range(total_pages):
                    pix = pdf_document[page_num].get_pixmap()
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    
                    # Detect specific form type
                    detection = detect_specific_form_type(
                        img, processor, model, device,
                        example_features, nonexample_features,
                        similarity_threshold=similarity_threshold,
                        negative_threshold=negative_threshold
                    )
                    
                    # Track best matching page
                    similarity = detection['max_similarity_to_examples']
                    if detection['is_specific_form'] and similarity > best_similarity:
                        best_similarity = similarity
                        best_confidence = detection['confidence']
                        best_match_page = page_num
                        
                        # Stop early if very high similarity
                        if best_similarity > early_stop_similarity:
                            break
                
                # Extract best matching page if found
                if best_match_page is not None:
                    with fitz.open() as output_pdf:
                        output_pdf.insert_pdf(pdf_document, from_page=best_match_page, to_page=best_match_page)
                        output_pdf.save(os.path.join(formpage_dir, pdf_file))
            
            # Record the success row only after the page was written, so a
            # failed save yields a single error row instead of a success row
            # followed by an error row for the same file.
            results.append({
                'file': pdf_file,
                'total_pages': total_pages,
                'has_specific_form': best_match_page is not None,
                'best_match_page': best_match_page + 1 if best_match_page is not None else None,
                'best_similarity': best_similarity,
                'confidence': best_confidence
            })
            
        except Exception as e:
            # Best effort: record the failure and continue with the next file
            print(f"\nError processing {pdf_file}: {str(e)}")
            results.append({
                'file': pdf_file,
                'error': str(e)
            })
    
    return results

In [None]:
# Process a test batch
print("\nProcessing first 10 PDFs to find specific form type...")

results = process_pdfs_for_specific_form(
    pdf_files[:10],
    pdf_dir,
    formpage_dir,
    processor,
    model,
    device,
    example_features,
    nonexample_features,
    similarity_threshold=threshold
)

# Convert to DataFrame
results_df = pd.DataFrame(results)
print(f"\nProcessed {len(results_df)} PDFs")

# Rows for files that raised carry only 'file' and 'error', so their detection
# fields are missing. The column is absent entirely when every file failed or
# nothing was processed. `.eq(True)` maps missing values to False.
if 'has_specific_form' in results_df.columns:
    found_mask = results_df['has_specific_form'].eq(True)
else:
    found_mask = pd.Series(False, index=results_df.index)
print(f"Found {int(found_mask.sum())} PDFs with the specific form type")

# Show results
print("\nResults:")
for idx, row in results_df.iterrows():
    # Skip failed files. `'error' in row` would test the column label, which is
    # present on EVERY row as soon as one file failed, so test the value instead.
    if pd.notna(row.get('error')):
        continue
    if found_mask.loc[idx]:
        # The page column becomes float when it contains missing values; cast
        # so the page prints as "3" rather than "3.0".
        print(f"{row['file']}: Found on page {int(row['best_match_page'])} (similarity: {row['best_similarity']:.3f})")
    else:
        print(f"{row['file']}: Not found")

In [None]:
# Save results
# Write the test-batch results to CSV, creating the target folder if needed
output_path = '../../data/intermediate_products/donut_specific_form_detection.csv'
os.makedirs(os.path.split(output_path)[0], exist_ok=True)
results_df.to_csv(output_path, index=False)
print("\nResults saved to: {}".format(output_path))

In [None]:
# Full processing
# Disabled by default because it scans every page of every PDF in `pdf_dir`.
# Set the flag to True to run it. A flag is used instead of wrapping the code
# in a string literal so the code stays lintable and the cell produces no
# stray string output.
RUN_FULL_PROCESSING = False

if RUN_FULL_PROCESSING:
    print("\n=== PROCESSING ALL PDFs FOR SPECIFIC FORM TYPE ===")
    print(f"Looking for forms similar to the {len(example_features)} examples")
    print(f"Using similarity threshold: {threshold:.3f}")

    all_results = process_pdfs_for_specific_form(
        pdf_files,
        pdf_dir,
        formpage_dir,
        processor,
        model,
        device,
        example_features,
        nonexample_features,
        similarity_threshold=threshold
    )

    # Save results (create the folder first so this cell does not depend on
    # the test-batch save cell having been run)
    all_results_df = pd.DataFrame(all_results)
    all_results_path = '../../data/intermediate_products/donut_specific_form_all.csv'
    os.makedirs(os.path.dirname(all_results_path), exist_ok=True)
    all_results_df.to_csv(all_results_path, index=False)

    # Failed files have no 'has_specific_form' value, and the column is absent
    # if every file failed; count only rows that are explicitly True.
    if 'has_specific_form' in all_results_df.columns:
        found_count = int(all_results_df['has_specific_form'].eq(True).sum())
    else:
        found_count = 0

    print(f"\nProcessing complete!")
    print(f"Total processed: {len(all_results_df)}")
    print(f"Found specific form type in: {found_count} documents")

## Approach Summary

This notebook uses Donut's vision encoder to:
1. Extract visual features from your specific example forms
2. Compare new documents against these examples
3. Find pages that match your specific form type (not just any form)

### Key differences from generic form detection:
- **Not asking "is this a form?"** - Instead asking "is this like my example forms?"
- **Uses similarity matching** - Compares visual features to examples
- **Handles non-examples** - Can reject documents that are forms but not your type

### Advantages:
- No need to describe the form in words
- Works directly from your examples: pages are matched by similarity, so no training or fine-tuning is needed
- Can distinguish between different form types
- Uses Donut's document understanding capabilities