In [3]:
# Set environment variable to avoid tokenizer warnings
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from transformers import pipeline
from PIL import Image
import pandas as pd
import fitz  # PyMuPDF for PDF handling
from pdf2image import convert_from_path
import shutil

# Initialize the zero-shot classifier (for initial exploration)
checkpoint = "openai/clip-vit-large-patch14"
detector = pipeline(model=checkpoint, task="zero-shot-image-classification")

# Paths
pdf_dir = "../../data/raw/_contracts/"
formpage_dir = "../../data/raw/_formpage/"

# Create formpage directory if it doesn't exist
os.makedirs(formpage_dir, exist_ok=True)

# Get list of PDF files
pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith('.pdf')]

print(f"Found {len(pdf_files)} PDF files to process")

Device set to use mps:0


Found 193450 PDF files to process


In [5]:
# Use latest CLIP for pure visual form detection
import torch
from transformers import CLIPProcessor, CLIPModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load latest CLIP model for visual feature extraction
print("Loading CLIP for visual form detection...")
# Using the larger, more recent CLIP model
model_name = "openai/clip-vit-large-patch14-336"  # Higher resolution variant
clip_model = CLIPModel.from_pretrained(model_name)
clip_processor = CLIPProcessor.from_pretrained(model_name)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
clip_model = clip_model.to(device)
clip_model.eval()

print(f"Model loaded successfully on {device}!")

# Path to example forms
example_forms_dir = "../../data/raw/_exampleforms/"

# Load example forms (positive examples)
example_features = []
example_form_paths = []

if os.path.exists(example_forms_dir):
    example_files = [f for f in os.listdir(example_forms_dir) if f.endswith(('.jpg', '.jpeg', '.png', '.pdf'))]
    
    print(f"\nLoading {len(example_files)} example forms from {example_forms_dir}...")
    
    for example_file in example_files:
        example_path = os.path.join(example_forms_dir, example_file)
        
        try:
            if example_file.endswith('.pdf'):
                pdf = fitz.open(example_path)
                page = pdf[0]  # First page only
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                pdf.close()
            else:
                img = Image.open(example_path)
            
            # Extract visual features using CLIP
            inputs = clip_processor(images=img, return_tensors="pt").to(device)
            
            with torch.no_grad():
                features = clip_model.get_image_features(**inputs)
                features = features.cpu().numpy()
                # Normalize features for better similarity comparison
                features = features / np.linalg.norm(features, axis=1, keepdims=True)
                example_features.append(features)
            
            example_form_paths.append(example_file)
            print(f"  Loaded: {example_file}")
            
        except Exception as e:
            print(f"  Error loading {example_file}: {str(e)}")
    
    print(f"Successfully loaded {len(example_features)} example forms")
else:
    print(f"No example forms found at {example_forms_dir}")

Loading CLIP for visual form detection...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Model loaded successfully on mps!

Loading 32 example forms from ../../data/raw/_exampleforms/...
  Loaded: 25581-000.pdf
  Loaded: 99171-000.pdf
  Loaded: 13924-002.pdf
  Loaded: 1197-000.pdf
  Loaded: 67419-000.pdf
  Loaded: H286-20-001-000.pdf
  Loaded: F1-10-FSSA-DDRS-495-000.pdf
  Loaded: SCM63182-001.pdf
  Loaded: 87458-000.pdf
  Loaded: 87552-000.pdf
  Loaded: 83501-000.pdf
  Loaded: SCM63182-002.pdf
  Loaded: 3341-000.pdf
  Loaded: 83844-000.pdf
  Loaded: 88087-000.pdf
  Loaded: 83502-000.pdf
  Loaded: 83987-000.pdf
  Loaded: 9697-000.pdf
  Loaded: PCI-12-3045-001.pdf
  Loaded: F1-9-FSSA-DMHA-569-000.pdf
  Loaded: 3328-000.pdf
  Loaded: PCI-19-2041-001.pdf
  Loaded: 9693-000.pdf
  Loaded: 3370-000.pdf
  Loaded: F1-9-FSSADFR-402-000.pdf
  Loaded: 83796-000.pdf
  Loaded: 85211-000.pdf
  Loaded: 9699-000.pdf
  Loaded: H28-4-12-001.pdf
  Loaded: 9691-001.pdf
  Loaded: 85212-000.pdf
  Loaded: 87462-001.pdf
Successfully loaded 32 example forms


In [7]:
# Load negative examples (non-forms) - ALL PAGES
nonexample_forms_dir = "../../data/raw/_nonexamples/"

nonexample_features = []
nonexample_paths = []

if os.path.exists(nonexample_forms_dir):
    nonexample_files = [f for f in os.listdir(nonexample_forms_dir) if f.endswith(('.jpg', '.jpeg', '.png', '.pdf'))]
    
    print(f"\nLoading negative examples from {nonexample_forms_dir}...")
    
    for nonexample_file in nonexample_files:
        nonexample_path = os.path.join(nonexample_forms_dir, nonexample_file)
        
        try:
            if nonexample_file.endswith('.pdf'):
                pdf = fitz.open(nonexample_path)
                pages_loaded = 0
                
                # Load ALL pages from non-examples
                for page_num in range(len(pdf)):
                    page = pdf[page_num]
                    pix = page.get_pixmap()
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    
                    # Extract visual features using CLIP
                    inputs = clip_processor(images=img, return_tensors="pt").to(device)
                    
                    with torch.no_grad():
                        features = clip_model.get_image_features(**inputs)
                        features = features.cpu().numpy()
                        # Normalize features
                        features = features / np.linalg.norm(features, axis=1, keepdims=True)
                        nonexample_features.append(features)
                    
                    pages_loaded += 1
                
                pdf.close()
                print(f"  Loaded: {nonexample_file} ({pages_loaded} pages)")
                
            else:
                img = Image.open(nonexample_path)
                
                inputs = clip_processor(images=img, return_tensors="pt").to(device)
                
                with torch.no_grad():
                    features = clip_model.get_image_features(**inputs)
                    features = features.cpu().numpy()
                    # Normalize features
                    features = features / np.linalg.norm(features, axis=1, keepdims=True)
                    nonexample_features.append(features)
                
                print(f"  Loaded: {nonexample_file}")
                
        except Exception as e:
            print(f"  Error loading {nonexample_file}: {str(e)}")
    
    print(f"\nTotal features loaded:")
    print(f"  Positive examples (forms): {len(example_features)}")
    print(f"  Negative examples (non-forms): {len(nonexample_features)}")
else:
    print(f"No negative examples found at {nonexample_forms_dir}")


Loading negative examples from ../../data/raw/_nonexamples/...
  Loaded: 25581-000.pdf (25 pages)
  Loaded: 0000000000000000000062223-001.pdf (5 pages)
  Loaded: 1197-000.pdf (15 pages)
  Loaded: 67419-000.pdf (26 pages)
  Loaded: 104473-000.pdf (42 pages)
  Loaded: 104303-001.pdf (3 pages)
  Loaded: 104324-000.pdf (15 pages)
  Loaded: 104324-002.pdf (6 pages)
  Loaded: 104468-000.pdf (45 pages)
  Loaded: 104428-000.pdf (15 pages)
  Loaded: 104306-000.pdf (40 pages)

Total features loaded:
  Positive examples (forms): 32
  Negative examples (non-forms): 237


In [9]:
# Using all negative examples without balancing
# More negative examples = better filtering of false positives
print(f"\nUsing all {len(nonexample_features)} negative examples for comprehensive filtering")


Using all 237 negative examples for comprehensive filtering


In [11]:
# Simple visual form detection function using CLIP
def detect_form_visual_clip(image, clip_model, clip_processor, device,
                           positive_features=None, negative_features=None,
                           similarity_threshold=0.7, negative_threshold=0.7):
    """
    Pure visual form detection using CLIP features
    No text detection or OCR - just visual similarity
    """
    result = {
        'is_form': False,
        'confidence': 0,
        'max_positive_similarity': 0,
        'max_negative_similarity': 0,
        'positive_similarities': [],
        'negative_similarities': []
    }
    
    # Extract visual features from current image
    try:
        inputs = clip_processor(images=image, return_tensors="pt").to(device)
        
        with torch.no_grad():
            features = clip_model.get_image_features(**inputs)
            features = features.cpu().numpy()
            # Normalize features
            features = features / np.linalg.norm(features, axis=1, keepdims=True)
    except Exception as e:
        print(f"Error extracting features: {e}")
        return result
    
    # Check similarity to positive examples
    if positive_features:
        for pos_feat in positive_features:
            sim = cosine_similarity(features, pos_feat)[0][0]
            result['positive_similarities'].append(sim)
        
        result['max_positive_similarity'] = max(result['positive_similarities'])
        is_like_positive = result['max_positive_similarity'] > similarity_threshold
    else:
        is_like_positive = False
    
    # Check similarity to negative examples
    if negative_features:
        for neg_feat in negative_features:
            sim = cosine_similarity(features, neg_feat)[0][0]
            result['negative_similarities'].append(sim)
        
        result['max_negative_similarity'] = max(result['negative_similarities'])
        is_not_like_negative = result['max_negative_similarity'] < negative_threshold
    else:
        is_not_like_negative = True
    
    # Decision: must be like positive AND not like negative
    result['is_form'] = is_like_positive and is_not_like_negative
    
    # Confidence score
    if result['is_form']:
        # High positive similarity, low negative similarity
        result['confidence'] = result['max_positive_similarity'] * (1 - result['max_negative_similarity'] * 0.5)
    else:
        result['confidence'] = 0
    
    return result

In [13]:
def detect_form_with_negatives(image, detector, model, processor, 
                              positive_features=None, negative_features=None,
                              similarity_threshold=0.7, negative_threshold=0.7,
                              logic_mode='AND'):
    """
    Advanced form detection using positive and negative examples
    
    Args:
        image: PIL Image to analyze
        detector: Zero-shot classifier pipeline
        model: CLIP model for feature extraction
        processor: CLIP processor
        positive_features: List of feature vectors from example forms
        negative_features: List of feature vectors from non-form examples
        similarity_threshold: Min similarity to positive examples
        negative_threshold: Max allowed similarity to negative examples
        logic_mode: 'AND' or 'OR' for combining conditions
    
    Returns:
        dict with detection results
    """
    # Zero-shot classification
    predictions = detector(image, candidate_labels=[
        "Standardized contract document",
        "Other document"
    ])
    
    form_score = next(p['score'] for p in predictions if 'form' in p['label'])
    prose_score = next(p['score'] for p in predictions if 'prose' in p['label'])
    
    result = {
        'form_score': form_score,
        'prose_score': prose_score,
        'is_form_zeroshot': form_score > prose_score,
        'positive_similarities': [],
        'negative_similarities': [],
        'max_positive_similarity': 0,
        'max_negative_similarity': 0,
        'is_form_positive': False,
        'is_not_like_negative': True,
        'logic_mode': logic_mode
    }
    
    # Extract features from current image if we have examples
    if positive_features or negative_features:
        inputs = processor(images=image, return_tensors="pt")
        with torch.no_grad():
            image_features = model.get_image_features(**inputs)
            image_features = image_features.cpu().numpy()
    
    # Check similarity to positive examples
    if positive_features:
        for pos_feat in positive_features:
            similarity = cosine_similarity(image_features, pos_feat)[0][0]
            result['positive_similarities'].append(similarity)
        
        result['max_positive_similarity'] = max(result['positive_similarities'])
        result['is_form_positive'] = result['max_positive_similarity'] > similarity_threshold
    
    # Check similarity to negative examples
    if negative_features:
        for neg_feat in negative_features:
            similarity = cosine_similarity(image_features, neg_feat)[0][0]
            result['negative_similarities'].append(similarity)
        
        result['max_negative_similarity'] = max(result['negative_similarities'])
        # If too similar to a non-form, it's probably not a form
        result['is_not_like_negative'] = result['max_negative_similarity'] < negative_threshold
    
    # Combined decision logic
    if positive_features and negative_features:
        # Full contrastive learning: must be like positive AND not like negative
        if logic_mode == 'AND':
            result['is_form'] = (result['is_form_zeroshot'] and 
                               result['is_form_positive'] and 
                               result['is_not_like_negative'])
        else:  # OR logic
            result['is_form'] = ((result['is_form_zeroshot'] or result['is_form_positive']) 
                               and result['is_not_like_negative'])
        
        # Confidence considers all factors
        # Higher when similar to positive, lower when similar to negative
        positive_contrib = result['max_positive_similarity'] if positive_features else form_score
        negative_penalty = result['max_negative_similarity'] if negative_features else 0
        result['confidence'] = positive_contrib * (1 - negative_penalty * 0.5)
        
    elif positive_features:
        # Only positive examples
        if logic_mode == 'AND':
            result['is_form'] = result['is_form_zeroshot'] and result['is_form_positive']
        else:
            result['is_form'] = result['is_form_zeroshot'] or result['is_form_positive']
        result['confidence'] = min(form_score, result['max_positive_similarity'])
        
    elif negative_features:
        # Only negative examples
        result['is_form'] = result['is_form_zeroshot'] and result['is_not_like_negative']
        result['confidence'] = form_score * (1 - result['max_negative_similarity'] * 0.5)
        
    else:
        # No examples at all
        result['is_form'] = result['is_form_zeroshot']
        result['confidence'] = form_score
    
    return result

In [15]:
# Process PDFs with CLIP visual detection
results_contrastive = []

# Set thresholds
SIMILARITY_THRESHOLD = 0.7  # Min similarity to positive examples
NEGATIVE_THRESHOLD = 0.7    # Max similarity to negative examples
CONFIDENCE_THRESHOLD = 0.90 # Stop early threshold

print(f"\nProcessing with CLIP visual detection:")
print(f"  Positive examples: {len(example_features)}")
print(f"  Negative examples: {len(nonexample_features)}")
print(f"  Similarity threshold: {SIMILARITY_THRESHOLD}")
print(f"  Negative threshold: {NEGATIVE_THRESHOLD}")
print("-" * 50)

for pdf_file in pdf_files[:10]:  # Test with first 10
    try:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_document = fitz.open(pdf_path)
        
        best_form_page = None
        best_confidence = 0
        best_details = None
        
        # Check each page
        for page_num in range(len(pdf_document)):
            # Convert page to image
            page = pdf_document[page_num]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            
            # CLIP visual detection
            detection = detect_form_visual_clip(
                img, clip_model, clip_processor, device,
                example_features if example_features else None,
                nonexample_features if nonexample_features else None,
                SIMILARITY_THRESHOLD, NEGATIVE_THRESHOLD
            )
            
            # If this is a form and has higher confidence than current best
            if detection['is_form'] and detection['confidence'] > best_confidence:
                best_confidence = detection['confidence']
                best_form_page = page_num
                best_details = detection
                
                # Stop early if confidence is very high
                if best_confidence > CONFIDENCE_THRESHOLD:
                    break
        
        # Store results
        result = {
            'file': pdf_file,
            'total_pages': len(pdf_document),
            'has_form': best_form_page is not None,
            'best_form_page': best_form_page + 1 if best_form_page is not None else None,
            'best_form_confidence': best_confidence
        }
        
        if best_details:
            result['max_positive_sim'] = best_details.get('max_positive_similarity', 0)
            result['max_negative_sim'] = best_details.get('max_negative_similarity', 0)
        
        results_contrastive.append(result)
        
        # Extract best form page if found
        if best_form_page is not None:
            # Create a new PDF with just the best form page
            output_pdf = fitz.open()
            output_pdf.insert_pdf(pdf_document, from_page=best_form_page, to_page=best_form_page)
            
            # Save to formpage directory
            output_path = os.path.join(formpage_dir, pdf_file)
            output_pdf.save(output_path)
            output_pdf.close()
            
            print(f"✓ {pdf_file}: Form on page {best_form_page + 1}")
            print(f"    Confidence: {best_confidence:.3f}")
            if best_details:
                print(f"    Positive similarity: {best_details.get('max_positive_similarity', 0):.3f}")
                print(f"    Negative similarity: {best_details.get('max_negative_similarity', 0):.3f}")
        else:
            print(f"✗ {pdf_file}: No form pages found")
        
        pdf_document.close()
        
    except Exception as e:
        print(f"Error processing {pdf_file}: {str(e)}")
        results_contrastive.append({
            'file': pdf_file,
            'error': str(e)
        })

# Summary
results_contrastive_df = pd.DataFrame(results_contrastive)
print("\n" + "-" * 50)
print(f"CLIP Visual Detection Summary:")
print(f"Total PDFs: {len(results_contrastive_df)}")
print(f"PDFs with forms: {results_contrastive_df['has_form'].sum()}")

# Save results
results_contrastive_df.to_csv('../../data/intermediate_products/zeroshot_form_contract_clip.csv', index=False)
print(f"\nResults saved to ../../data/intermediate_products/zeroshot_form_contract_clip.csv")


Processing with CLIP visual detection:
  Positive examples: 32
  Negative examples: 237
  Similarity threshold: 0.7
  Negative threshold: 0.7
--------------------------------------------------
✗ 25581-000.pdf: No form pages found
✗ 99171-000.pdf: No form pages found
✗ 13924-002.pdf: No form pages found
✗ 0000000000000000000062223-001.pdf: No form pages found
✗ 0000000000000000000057475-003.pdf: No form pages found
✗ 0000000000000000000061824-000.pdf: No form pages found
✗ 1197-000.pdf: No form pages found
✗ 67419-000.pdf: No form pages found
✗ 0000000000000000000058079-000.pdf: No form pages found
✗ 104473-000.pdf: No form pages found

--------------------------------------------------
CLIP Visual Detection Summary:
Total PDFs: 10
PDFs with forms: 0

Results saved to ../../data/intermediate_products/zeroshot_form_contract_clip.csv


In [58]:
# Diagnostic Analysis - Identify what examples would help most
print("=== EXAMPLE SELECTION HELPER ===\n")

# First, let's analyze the detection patterns
diagnostic_results = []

print("Analyzing detection patterns to identify gaps...")
print("-" * 50)

# Sample more files for better diagnosis
sample_size = min(30, len(pdf_files))  # Analyze up to 30 files
sample_files = pdf_files[:sample_size]

for i, pdf_file in enumerate(sample_files):
    if i % 10 == 0:
        print(f"Processing {i}/{sample_size}...")
    
    try:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_document = fitz.open(pdf_path)
        
        # Analyze each page
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            
            # Get detection results
            detection = detect_form_with_negatives(
                img, detector, model, processor,
                example_features if example_features else None,
                nonexample_features if nonexample_features else None,
                SIMILARITY_THRESHOLD, NEGATIVE_THRESHOLD,
                'AND'
            )
            
            # Store diagnostic info
            diag = {
                'file': pdf_file,
                'page': page_num + 1,
                'form_score': detection['form_score'],
                'prose_score': detection['prose_score'],
                'max_positive_sim': detection.get('max_positive_similarity', 0),
                'max_negative_sim': detection.get('max_negative_similarity', 0),
                'is_form_zeroshot': detection['is_form_zeroshot'],
                'is_form_positive': detection.get('is_form_positive', False),
                'is_not_like_negative': detection.get('is_not_like_negative', True),
                'final_decision': detection['is_form'],
                'confidence': detection['confidence']
            }
            diagnostic_results.append(diag)
        
        pdf_document.close()
        
    except Exception as e:
        print(f"Error processing {pdf_file}: {str(e)}")

# Convert to DataFrame for analysis
diag_df = pd.DataFrame(diagnostic_results)

print(f"\nAnalyzed {len(diag_df)} pages from {sample_size} PDFs")
print("=" * 50)

=== EXAMPLE SELECTION HELPER ===

Analyzing detection patterns to identify gaps...
--------------------------------------------------
Processing 0/30...
Processing 10/30...
Processing 20/30...

Analyzed 540 pages from 30 PDFs


In [61]:
diag_df

Unnamed: 0,file,page,form_score,prose_score,max_positive_sim,max_negative_sim,is_form_zeroshot,is_form_positive,is_not_like_negative,final_decision,confidence
0,25581-000.pdf,1,0.985007,0.014993,0.935717,0.597216,True,True,True,True,0.656304
1,25581-000.pdf,2,0.208603,0.791397,0.604241,0.574646,False,False,True,False,0.430628
2,25581-000.pdf,3,0.171370,0.828630,0.760405,0.730141,False,True,False,False,0.482803
3,25581-000.pdf,4,0.065456,0.934544,0.759050,0.736878,False,True,False,False,0.479387
4,25581-000.pdf,5,0.071361,0.928639,0.703448,0.702655,False,True,False,False,0.456307
...,...,...,...,...,...,...,...,...,...,...,...
535,23954-000.pdf,13,0.187815,0.812185,0.690877,0.774992,False,False,False,False,0.423165
536,23954-000.pdf,14,0.001926,0.998074,0.677222,0.771712,False,False,False,False,0.415912
537,23954-000.pdf,15,0.137368,0.862632,0.593657,0.712278,False,False,False,False,0.382233
538,23954-000.pdf,16,0.103761,0.896239,0.720409,0.832327,False,True,False,False,0.420601


In [63]:
# Identify patterns in missed forms and false positives
print("\n=== DETECTION PATTERN ANALYSIS ===\n")

# 1. Find pages that zero-shot thinks are forms but don't match examples
zeroshot_only = diag_df[
    (diag_df['is_form_zeroshot'] == True) & 
    (diag_df['is_form_positive'] == False) &
    (diag_df['max_positive_sim'] > 0)  # Has examples but no match
]

print(f"1. POTENTIAL MISSED FORMS (Zero-shot detected but no example match):")
print(f"   Found {len(zeroshot_only)} pages")
if len(zeroshot_only) > 0:
    print(f"   Average similarity to examples: {zeroshot_only['max_positive_sim'].mean():.3f}")
    print(f"   These pages have form-like structure but don't match your examples")
    print(f"\n   Top candidates to add as examples:")
    top_missed = zeroshot_only.nlargest(5, 'form_score')[['file', 'page', 'form_score', 'max_positive_sim']]
    for _, row in top_missed.iterrows():
        print(f"   - {row['file']}, page {row['page']} (form_score: {row['form_score']:.3f}, similarity: {row['max_positive_sim']:.3f})")

# 2. Find borderline cases
borderline = diag_df[
    (diag_df['max_positive_sim'] > 0.6) & 
    (diag_df['max_positive_sim'] < 0.7)
]

print(f"\n2. BORDERLINE CASES (Almost matching examples):")
print(f"   Found {len(borderline)} pages")
if len(borderline) > 0:
    print(f"   These are close to the threshold - adding similar examples would help")
    print(f"\n   Top borderline cases:")
    top_borderline = borderline.nlargest(5, 'max_positive_sim')[['file', 'page', 'max_positive_sim', 'form_score']]
    for _, row in top_borderline.iterrows():
        print(f"   - {row['file']}, page {row['page']} (similarity: {row['max_positive_sim']:.3f})")

# 3. Find false negatives blocked by negative examples
blocked_by_negatives = diag_df[
    (diag_df['is_form_zeroshot'] == True) & 
    (diag_df['is_form_positive'] == True) &
    (diag_df['is_not_like_negative'] == False)
]

print(f"\n3. BLOCKED BY NEGATIVE EXAMPLES:")
print(f"   Found {len(blocked_by_negatives)} pages")
if len(blocked_by_negatives) > 0:
    print(f"   These look like forms but are too similar to non-examples")
    print(f"   Consider if your negative examples are too broad")

# 4. Summary recommendations
print(f"\n=== RECOMMENDATIONS ===")
print(f"\n1. DIVERSITY GAPS:")
if len(example_features) > 0:
    avg_similarity = diag_df[diag_df['max_positive_sim'] > 0]['max_positive_sim'].mean()
    print(f"   - Average similarity to best example: {avg_similarity:.3f}")
    if avg_similarity < 0.5:
        print(f"   - Your examples may not represent the forms in this dataset well")
        print(f"   - Consider adding more diverse examples")

print(f"\n2. OPTIMAL EXAMPLES TO ADD:")
if len(zeroshot_only) > 0:
    print(f"   - Add {min(5, len(zeroshot_only))} examples from the 'missed forms' list above")
    print(f"   - These would expand your example coverage")

if len(borderline) > 0:
    print(f"   - Add 2-3 examples from the 'borderline cases'")
    print(f"   - These would strengthen detection of edge cases")

print(f"\n3. THRESHOLD ADJUSTMENTS:")
if len(borderline) > len(zeroshot_only):
    print(f"   - Consider lowering similarity threshold from {SIMILARITY_THRESHOLD} to {SIMILARITY_THRESHOLD - 0.05}")
if len(blocked_by_negatives) > 0:
    print(f"   - Consider raising negative threshold from {NEGATIVE_THRESHOLD} to {NEGATIVE_THRESHOLD + 0.05}")

# Save detailed diagnostics
diag_df.to_csv('../../data/intermediate_products/form_detection_diagnostics.csv', index=False)
print(f"\n4. DETAILED DIAGNOSTICS saved to: ../../data/intermediate_products/form_detection_diagnostics.csv")
print(f"   Review this file to manually inspect specific cases")


=== DETECTION PATTERN ANALYSIS ===

1. POTENTIAL MISSED FORMS (Zero-shot detected but no example match):
   Found 79 pages
   Average similarity to examples: 0.614
   These pages have form-like structure but don't match your examples

   Top candidates to add as examples:
   - 104473-000.pdf, page 34 (form_score: 1.000, similarity: 0.621)
   - 104473-000.pdf, page 38 (form_score: 1.000, similarity: 0.625)
   - 104473-000.pdf, page 20 (form_score: 0.999, similarity: 0.654)
   - 104473-000.pdf, page 40 (form_score: 0.999, similarity: 0.574)
   - 104473-000.pdf, page 19 (form_score: 0.998, similarity: 0.649)

2. BORDERLINE CASES (Almost matching examples):
   Found 203 pages
   These are close to the threshold - adding similar examples would help

   Top borderline cases:
   - 0000000000000000000068702-000.pdf, page 15 (similarity: 0.700)
   - 23954-000.pdf, page 7 (similarity: 0.699)
   - 39719-000.pdf, page 16 (similarity: 0.699)
   - 45703-000.pdf, page 14 (similarity: 0.698)
   - 561

In [None]:
# Extract specific pages for manual review
def extract_diagnostic_pages(results_list, output_dir="../../data/intermediate_products/diagnostic_pages/"):
    """Extract specific pages identified in diagnostics for manual review"""
    os.makedirs(output_dir, exist_ok=True)
    
    extracted = []
    for item in results_list:
        try:
            pdf_path = os.path.join(pdf_dir, item['file'])
            pdf_document = fitz.open(pdf_path)
            
            # Extract the specific page
            page_num = item['page'] - 1  # Convert to 0-based
            output_pdf = fitz.open()
            output_pdf.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
            
            # Save with descriptive name
            output_name = f"{item['file'].replace('.pdf', '')}_page{item['page']}_score{item.get('form_score', 0):.2f}.pdf"
            output_path = os.path.join(output_dir, output_name)
            output_pdf.save(output_path)
            output_pdf.close()
            pdf_document.close()
            
            extracted.append(output_name)
        except Exception as e:
            print(f"Error extracting {item['file']} page {item['page']}: {e}")
    
    return extracted

# Extract the most useful examples
print(f"\n=== EXTRACTING DIAGNOSTIC PAGES ===")

if len(zeroshot_only) > 0:
    print(f"\nExtracting top missed forms for review...")
    missed_forms = zeroshot_only.nlargest(min(10, len(zeroshot_only)), 'form_score').to_dict('records')
    extracted_missed = extract_diagnostic_pages(missed_forms, 
                                               "../../data/intermediate_products/diagnostic_pages/missed_forms/")
    print(f"Extracted {len(extracted_missed)} missed form candidates")

if len(borderline) > 0:
    print(f"\nExtracting borderline cases for review...")
    borderline_cases = borderline.nlargest(min(10, len(borderline)), 'max_positive_sim').to_dict('records')
    extracted_borderline = extract_diagnostic_pages(borderline_cases,
                                                   "../../data/intermediate_products/diagnostic_pages/borderline/")
    print(f"Extracted {len(extracted_borderline)} borderline cases")

print(f"\n✓ Review these extracted pages to identify which ones are actually forms")
print(f"✓ Add the true forms to your examples directory")
print(f"✓ This will improve detection accuracy for similar forms")

In [None]:
# Main processing loop with simplified visual detection
print("\n=== SIMPLIFIED VISUAL FORM DETECTION ===")
print(f"Using {len(example_features)} positive examples")
print(f"Using {len(nonexample_features)} negative examples")
print(f"Similarity threshold: {SIMILARITY_THRESHOLD}")
print(f"Negative threshold: {NEGATIVE_THRESHOLD}")
print("-" * 50)

In [None]:
# Process PDFs with CLIP visual detection
results_visual = []

# Process a subset of files
for pdf_file in pdf_files[:20]:  # Test first 20
    try:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_document = fitz.open(pdf_path)
        
        best_form_page = None
        best_confidence = 0
        page_results = []
        
        # Check each page
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            
            # Convert to image
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            
            # Visual detection with CLIP
            detection = detect_form_visual_clip(
                img, clip_model, clip_processor, device,
                example_features, nonexample_features,
                SIMILARITY_THRESHOLD, NEGATIVE_THRESHOLD
            )
            
            page_results.append({
                'page': page_num + 1,
                'is_form': detection['is_form'],
                'confidence': detection['confidence'],
                'pos_sim': detection['max_positive_similarity'],
                'neg_sim': detection['max_negative_similarity']
            })
            
            # If this is a form with higher confidence than current best
            if detection['is_form'] and detection['confidence'] > best_confidence:
                best_confidence = detection['confidence']
                best_form_page = page_num
                
                # Stop early if confidence is very high
                if best_confidence > CONFIDENCE_THRESHOLD:
                    break
        
        # Store results
        result = {
            'file': pdf_file,
            'total_pages': len(pdf_document),
            'has_form': best_form_page is not None,
            'form_page': best_form_page + 1 if best_form_page is not None else None,
            'confidence': best_confidence,
            'page_details': page_results
        }
        results_visual.append(result)
        
        # Extract best form page if found
        if best_form_page is not None:
            output_pdf = fitz.open()
            output_pdf.insert_pdf(pdf_document, from_page=best_form_page, to_page=best_form_page)
            
            output_path = os.path.join(formpage_dir, pdf_file)
            output_pdf.save(output_path)
            output_pdf.close()
            
            print(f"✓ {pdf_file}: Form on page {best_form_page + 1} (conf: {best_confidence:.3f})")
        else:
            print(f"✗ {pdf_file}: No form found")
        
        pdf_document.close()
        
    except Exception as e:
        print(f"Error processing {pdf_file}: {str(e)}")
        results_visual.append({
            'file': pdf_file,
            'error': str(e)
        })

# Summary
results_visual_df = pd.DataFrame(results_visual)
print(f"\n{'='*50}")
print(f"CLIP Visual Detection Summary:")
print(f"Total PDFs: {len(results_visual_df)}")
print(f"Forms found: {results_visual_df['has_form'].sum()}")

# Save results
results_visual_df.to_csv('../../data/intermediate_products/clip_visual_detection_results.csv', index=False)
print(f"\nResults saved to: ../../data/intermediate_products/clip_visual_detection_results.csv")

In [None]:
# Diagnostic: Test detection on the example forms themselves
print("=== DIAGNOSTIC: Testing detection on example forms ===")
print("This should have very high similarity scores...\n")

if example_features and os.path.exists(example_forms_dir):
    # Test first few example forms
    test_files = [f for f in os.listdir(example_forms_dir) if f.endswith('.pdf')][:3]
    
    for test_file in test_files:
        print(f"\nTesting on example form: {test_file}")
        test_path = os.path.join(example_forms_dir, test_file)
        
        try:
            pdf = fitz.open(test_path)
            page = pdf[0]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            
            # Test visual detection with CLIP
            result = detect_form_visual_clip(
                img, clip_model, clip_processor, device,
                example_features, nonexample_features,
                SIMILARITY_THRESHOLD, NEGATIVE_THRESHOLD
            )
            
            print(f"  Visual detection: {result['is_form']}")
            print(f"  Confidence: {result['confidence']:.3f}")
            print(f"  Max positive similarity: {result['max_positive_similarity']:.3f}")
            print(f"  Max negative similarity: {result['max_negative_similarity']:.3f}")
            print(f"  Is above threshold ({SIMILARITY_THRESHOLD})? {result['max_positive_similarity'] > SIMILARITY_THRESHOLD}")
            print(f"  Is below negative threshold ({NEGATIVE_THRESHOLD})? {result['max_negative_similarity'] < NEGATIVE_THRESHOLD}")
            
            pdf.close()
            
        except Exception as e:
            print(f"  Error: {e}")

print("\n" + "="*50)

In [None]:
# Full processing with CLIP visual detection
# Uncomment below to process all PDFs (this will take a long time)

"""
print("\\n=== PROCESSING ALL PDFs WITH CLIP VISUAL DETECTION ===")
print(f"Total PDFs to process: {len(pdf_files)}")
print("This may take a while...")

results_all_visual = []
processed = 0

for i, pdf_file in enumerate(pdf_files):
    if i % 100 == 0 and i > 0:
        print(f"Processed {i}/{len(pdf_files)} PDFs...")
    
    try:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_document = fitz.open(pdf_path)
        
        best_form_page = None
        best_confidence = 0
        
        # Check each page
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            
            # Visual detection with CLIP
            detection = detect_form_visual_clip(
                img, clip_model, clip_processor, device,
                example_features, nonexample_features,
                SIMILARITY_THRESHOLD, NEGATIVE_THRESHOLD
            )
            
            # If this is a form with higher confidence
            if detection['is_form'] and detection['confidence'] > best_confidence:
                best_confidence = detection['confidence']
                best_form_page = page_num
                
                # Stop early if very confident
                if best_confidence > CONFIDENCE_THRESHOLD:
                    break
        
        # Store results
        result = {
            'file': pdf_file,
            'total_pages': len(pdf_document),
            'has_form': best_form_page is not None,
            'form_page': best_form_page + 1 if best_form_page is not None else None,
            'confidence': best_confidence
        }
        results_all_visual.append(result)
        
        # Extract best form page if found
        if best_form_page is not None:
            output_pdf = fitz.open()
            output_pdf.insert_pdf(pdf_document, from_page=best_form_page, to_page=best_form_page)
            output_path = os.path.join(formpage_dir, pdf_file)
            output_pdf.save(output_path)
            output_pdf.close()
        
        pdf_document.close()
        processed += 1
        
    except Exception as e:
        results_all_visual.append({
            'file': pdf_file,
            'error': str(e)
        })

# Save full results
results_all_visual_df = pd.DataFrame(results_all_visual)
results_all_visual_df.to_csv('../../data/intermediate_products/clip_visual_detection_all_pdfs.csv', index=False)

print(f"\\nProcessing complete!")
print(f"Total processed: {processed}")
print(f"Forms found: {results_all_visual_df['has_form'].sum()}")
print(f"Results saved to: ../../data/intermediate_products/clip_visual_detection_all_pdfs.csv")
"""

In [None]:
def detect_form_hybrid(image, detector, model, processor, example_features=None, 
                      form_threshold=0.5, similarity_threshold=0.7):
    """
    Hybrid form detection requiring BOTH zero-shot classification AND example similarity
    
    Args:
        image: PIL Image to analyze
        detector: Zero-shot classifier pipeline
        model: CLIP model for feature extraction
        processor: CLIP processor
        example_features: List of feature vectors from example forms
        form_threshold: Threshold for zero-shot form classification
        similarity_threshold: Threshold for example similarity
    
    Returns:
        dict with detection results
    """
    # Zero-shot classification
    predictions = detector(image, candidate_labels=[
        "multi-column data form with entry boxes",
        "prose in single column layout"
    ])
    
    form_score = next(p['score'] for p in predictions if 'form' in p['label'])
    prose_score = next(p['score'] for p in predictions if 'prose' in p['label'])
    
    result = {
        'form_score': form_score,
        'prose_score': prose_score,
        'is_form_zeroshot': form_score > prose_score,
        'similarity_scores': [],
        'max_similarity': 0,
        'is_form_example': False
    }
    
    # Example-based detection if examples are available
    if example_features:
        # Extract features from current image
        inputs = processor(images=image, return_tensors="pt")
        with torch.no_grad():
            image_features = model.get_image_features(**inputs)
            image_features = image_features.cpu().numpy()
        
        # Calculate similarity with each example
        for example_feat in example_features:
            similarity = cosine_similarity(image_features, example_feat)[0][0]
            result['similarity_scores'].append(similarity)
        
        result['max_similarity'] = max(result['similarity_scores']) if result['similarity_scores'] else 0
        result['is_form_example'] = result['max_similarity'] > similarity_threshold
        
        # STRICTER: Require BOTH conditions to be true
        result['is_form'] = result['is_form_zeroshot'] AND result['is_form_example']
        
        # Confidence is the minimum of both scores (since both must be high)
        # Normalize similarity to 0-1 range for fair comparison
        normalized_similarity = result['max_similarity']
        result['confidence'] = min(form_score, normalized_similarity)
        
    else:
        # If no examples, only use zero-shot
        result['is_form'] = result['is_form_zeroshot']
        result['confidence'] = form_score
    
    return result

In [None]:
# Process PDFs with hybrid detection - Find highest probability form
results_hybrid = []

# Set thresholds
SIMILARITY_THRESHOLD = 0.7  # Adjust based on your examples
CONFIDENCE_THRESHOLD = 0.90  # Stop early if we find a very confident match

print(f"Processing with hybrid detection (examples available: {len(example_features) > 0})")

for pdf_file in pdf_files[:10]:  # Test with first 10
    try:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_document = fitz.open(pdf_path)
        
        best_form_page = None
        best_confidence = 0
        
        # Check each page
        for page_num in range(len(pdf_document)):
            # Convert page to image
            page = pdf_document[page_num]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            
            # Hybrid detection
            detection = detect_form_hybrid(
                img, detector, model, processor, 
                example_features if example_features else None,
                0.5, SIMILARITY_THRESHOLD
            )
            
            # If this is a form and has higher confidence than current best
            if detection['is_form'] and detection['confidence'] > best_confidence:
                best_confidence = detection['confidence']
                best_form_page = page_num
                
                # Stop early if confidence is very high
                if best_confidence > CONFIDENCE_THRESHOLD:
                    break
        
        # Store results
        result = {
            'file': pdf_file,
            'total_pages': len(pdf_document),
            'has_form': best_form_page is not None,
            'best_form_page': best_form_page + 1 if best_form_page is not None else None,
            'best_form_confidence': best_confidence
        }
        results_hybrid.append(result)
        
        # Extract best form page if found
        if best_form_page is not None:
            # Create a new PDF with just the best form page
            output_pdf = fitz.open()
            output_pdf.insert_pdf(pdf_document, from_page=best_form_page, to_page=best_form_page)
            
            # Save to formpage directory
            output_path = os.path.join(formpage_dir, pdf_file)
            output_pdf.save(output_path)
            output_pdf.close()
            
            print(f"✓ {pdf_file}: Best form on page {best_form_page + 1} (confidence: {best_confidence:.3f})")
        else:
            print(f"✗ {pdf_file}: No form pages found")
        
        pdf_document.close()
        
    except Exception as e:
        print(f"Error processing {pdf_file}: {str(e)}")
        results_hybrid.append({
            'file': pdf_file,
            'error': str(e)
        })

# Summary
results_hybrid_df = pd.DataFrame(results_hybrid)
print(f"\nHybrid Detection Summary:")
print(f"Total PDFs: {len(results_hybrid_df)}")
print(f"PDFs with forms: {results_hybrid_df['has_form'].sum()}")
if example_features:
    print(f"Using {len(example_features)} example forms for similarity matching")

In [None]:
# Compare detection modes: OR vs AND logic
print("Detection Logic Comparison:")
print("-" * 50)
print("OR Logic (Original): Page is a form if:")
print("  - Zero-shot says it's a form (form_score > prose_score)")
print("  - OR it looks like an example (similarity > 0.7)")
print("  → More permissive, catches more forms but may have false positives")
print()
print("AND Logic (Stricter): Page is a form if:")
print("  - Zero-shot says it's a form (form_score > prose_score)")
print("  - AND it looks like an example (similarity > 0.7)")
print("  → More restrictive, higher precision but may miss some forms")
print()
print("Note: If no examples are provided, both modes use only zero-shot classification")
print("-" * 50)

In [None]:
def detect_form_hybrid_configurable(image, detector, model, processor, example_features=None, 
                                   form_threshold=0.5, similarity_threshold=0.7, 
                                   logic_mode='AND'):
    """
    Configurable hybrid form detection with choice of AND/OR logic
    
    Args:
        logic_mode: 'AND' requires both conditions, 'OR' requires either condition
    """
    # Zero-shot classification
    predictions = detector(image, candidate_labels=[
        "multi-column data form with entry boxes",
        "prose in single column layout"
    ])
    
    form_score = next(p['score'] for p in predictions if 'form' in p['label'])
    prose_score = next(p['score'] for p in predictions if 'prose' in p['label'])
    
    result = {
        'form_score': form_score,
        'prose_score': prose_score,
        'is_form_zeroshot': form_score > prose_score,
        'similarity_scores': [],
        'max_similarity': 0,
        'is_form_example': False,
        'logic_mode': logic_mode
    }
    
    # Example-based detection if examples are available
    if example_features:
        # Extract features from current image
        inputs = processor(images=image, return_tensors="pt")
        with torch.no_grad():
            image_features = model.get_image_features(**inputs)
            image_features = image_features.cpu().numpy()
        
        # Calculate similarity with each example
        for example_feat in example_features:
            similarity = cosine_similarity(image_features, example_feat)[0][0]
            result['similarity_scores'].append(similarity)
        
        result['max_similarity'] = max(result['similarity_scores']) if result['similarity_scores'] else 0
        result['is_form_example'] = result['max_similarity'] > similarity_threshold
        
        # Apply chosen logic
        if logic_mode == 'AND':
            result['is_form'] = result['is_form_zeroshot'] and result['is_form_example']
            # Confidence is minimum of both (both must be high)
            result['confidence'] = min(form_score, result['max_similarity'])
        else:  # OR logic
            result['is_form'] = result['is_form_zeroshot'] or result['is_form_example']
            # Confidence is weighted average
            result['confidence'] = 0.5 * form_score + 0.5 * result['max_similarity']
        
    else:
        # If no examples, only use zero-shot
        result['is_form'] = result['is_form_zeroshot']
        result['confidence'] = form_score
    
    return result

In [33]:
results_hybrid

[{'file': '25581-000.pdf',
  'total_pages': 26,
  'form_pages': [0,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   15,
   17,
   18,
   20,
   21,
   22,
   23,
   25],
  'has_form': True,
  'first_form_page': 1,
  'page_details': [{'page': 1,
    'form_score': 0.9850072264671326,
    'prose_score': 0.014992760494351387,
    'max_similarity': 0.9357165,
    'is_form': True,
    'confidence': 0.9603618681430817,
    'detection_method': 'hybrid'},
   {'page': 2,
    'form_score': 0.20860306918621063,
    'prose_score': 0.7913969159126282,
    'max_similarity': 0.6042408,
    'is_form': False,
    'confidence': 0.406421922147274,
    'detection_method': 'hybrid'},
   {'page': 3,
    'form_score': 0.1713697463274002,
    'prose_score': 0.828630268573761,
    'max_similarity': 0.7604046,
    'is_form': True,
    'confidence': 0.4658871665596962,
    'detection_method': 'hybrid'},
   {'page': 4,
    'form_score': 0.06545615196228027,
    'prose_score': 0.93454

In [None]:
# Alternative: Enhanced zero-shot with detailed descriptions
# This approach uses more specific descriptions without needing example images

def detect_form_enhanced_zeroshot(image, detector):
    """
    Enhanced zero-shot detection with more detailed form descriptions
    """
    # More detailed labels that better describe government forms
    detailed_labels = [
        "government form with fields, checkboxes, and blank spaces to fill in",
        "data collection form with labeled entry boxes and structured layout", 
        "official form with sections for entering information",
        "standardized form with fields for names, dates, and other data",
        "legal contract or agreement with continuous prose text",
        "document with paragraphs of text and legal language",
        "narrative document without form fields"
    ]
    
    predictions = detector(image, candidate_labels=detailed_labels)
    
    # Aggregate scores for form-like vs prose-like
    form_score = 0
    prose_score = 0
    
    for pred in predictions:
        if any(word in pred['label'].lower() for word in ['form', 'fields', 'boxes', 'fill', 'data collection']):
            form_score += pred['score']
        else:
            prose_score += pred['score']
    
    # Normalize scores
    total = form_score + prose_score
    if total > 0:
        form_score /= total
        prose_score /= total
    
    return {
        'form_score': form_score,
        'prose_score': prose_score,
        'is_form': form_score > prose_score,
        'predictions': predictions[:3]  # Top 3 predictions for debugging
    }

# Test enhanced zero-shot on a few files
print("\nTesting enhanced zero-shot detection:")
for pdf_file in pdf_files[:3]:  # Just test 3 files
    pdf_path = os.path.join(pdf_dir, pdf_file)
    pdf_document = fitz.open(pdf_path)
    
    # Just check first page
    page = pdf_document[0]
    pix = page.get_pixmap()
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    
    result = detect_form_enhanced_zeroshot(img, detector)
    print(f"\n{pdf_file}:")
    print(f"  Form score: {result['form_score']:.3f}")
    print(f"  Prose score: {result['prose_score']:.3f}")
    print(f"  Is form: {result['is_form']}")
    print(f"  Top prediction: {result['predictions'][0]['label']} ({result['predictions'][0]['score']:.3f})")
    
    pdf_document.close()

In [None]:
# Process each PDF file - Extract highest probability form page
results = []
CONFIDENCE_THRESHOLD = 0.90  # Stop if we find a form with this confidence

for pdf_file in pdf_files[:10]:  # Process first 10 files as a test
    try:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        
        # Open the PDF
        pdf_document = fitz.open(pdf_path)
        
        best_form_page = None
        best_confidence = 0
        
        # Check each page
        for page_num in range(len(pdf_document)):
            # Convert page to image
            page = pdf_document[page_num]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            
            # Run zero-shot classification
            predictions = detector(img, candidate_labels=[
                "multi-column data form with entry boxes",
                "prose in single column layout"
            ])
            
            # Get scores
            form_score = next(p['score'] for p in predictions if 'form' in p['label'])
            prose_score = next(p['score'] for p in predictions if 'prose' in p['label'])
            
            # If this is a form page and has higher confidence than current best
            if form_score > prose_score and form_score > best_confidence:
                best_confidence = form_score
                best_form_page = page_num
                
                # Stop early if confidence is very high
                if best_confidence > CONFIDENCE_THRESHOLD:
                    break
        
        # Store results
        result = {
            'file': pdf_file,
            'total_pages': len(pdf_document),
            'has_form': best_form_page is not None,
            'best_form_page': best_form_page + 1 if best_form_page is not None else None,
            'best_form_confidence': best_confidence
        }
        results.append(result)
        
        # If we found a form page, extract it
        if best_form_page is not None:
            # Create a new PDF with just the best form page
            output_pdf = fitz.open()
            output_pdf.insert_pdf(pdf_document, from_page=best_form_page, to_page=best_form_page)
            
            # Save to formpage directory with same basename
            output_path = os.path.join(formpage_dir, pdf_file)
            output_pdf.save(output_path)
            output_pdf.close()
            
            print(f"✓ {pdf_file}: Best form on page {best_form_page + 1} (confidence: {best_confidence:.3f})")
        else:
            print(f"✗ {pdf_file}: No form pages found")
        
        # Close the PDF
        pdf_document.close()
        
    except Exception as e:
        print(f"Error processing {pdf_file}: {str(e)}")
        results.append({
            'file': pdf_file,
            'error': str(e)
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print(f"\nProcessed {len(results_df)} PDFs")
print(f"Found {results_df['has_form'].sum()} PDFs with forms")

In [None]:
# Save results
os.makedirs('../../data/intermediate_products', exist_ok=True)
results_df.to_csv('../../data/intermediate_products/zeroshot_form_contract_fullpdf.csv', index=False)
print(f"Results saved to ../../data/intermediate_products/zeroshot_form_contract_fullpdf.csv")

# Display summary
print("\nSummary:")
print(f"Total PDFs processed: {len(results_df)}")
print(f"PDFs with forms: {results_df['has_form'].sum()}")
print(f"PDFs without forms: {(~results_df['has_form']).sum()}")
if 'error' in results_df.columns:
    print(f"PDFs with errors: {results_df['error'].notna().sum()}")

In [None]:
# Full processing - uncomment to process all PDFs
# WARNING: This will take a long time for many PDFs

# # Process ALL PDF files
# results_all = []
# 
# for i, pdf_file in enumerate(pdf_files):
#     if i % 100 == 0:
#         print(f"Processing PDF {i+1}/{len(pdf_files)}...")
#     
#     try:
#         pdf_path = os.path.join(pdf_dir, pdf_file)
#         
#         # Open the PDF
#         pdf_document = fitz.open(pdf_path)
#         
#         form_pages = []
#         
#         # Check each page
#         for page_num in range(len(pdf_document)):
#             # Convert page to image
#             page = pdf_document[page_num]
#             pix = page.get_pixmap()
#             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
#             
#             # Run zero-shot classification
#             predictions = detector(img, candidate_labels=[
#                 "multi-column data form with entry boxes",
#                 "prose in single column layout"
#             ])
#             
#             # Get scores
#             form_score = next(p['score'] for p in predictions if 'form' in p['label'])
#             prose_score = next(p['score'] for p in predictions if 'prose' in p['label'])
#             
#             # If this page is likely a form
#             if form_score > prose_score:
#                 form_pages.append(page_num)
#         
#         # Store results
#         result = {
#             'file': pdf_file,
#             'total_pages': len(pdf_document),
#             'form_pages': form_pages,
#             'has_form': len(form_pages) > 0,
#             'first_form_page': form_pages[0] + 1 if form_pages else None
#         }
#         results_all.append(result)
#         
#         # If the PDF has at least one form page, extract the first form page
#         if form_pages:
#             first_form_page_num = form_pages[0]
#             
#             # Create a new PDF with just the first form page
#             output_pdf = fitz.open()
#             output_pdf.insert_pdf(pdf_document, from_page=first_form_page_num, to_page=first_form_page_num)
#             
#             # Save to formpage directory
#             output_path = os.path.join(formpage_dir, pdf_file)
#             output_pdf.save(output_path)
#             output_pdf.close()
#         
#         # Close the PDF
#         pdf_document.close()
#         
#     except Exception as e:
#         results_all.append({
#             'file': pdf_file,
#             'error': str(e)
#         })
# 
# # Save full results
# results_all_df = pd.DataFrame(results_all)
# results_all_df.to_csv('../../data/intermediate_products/zeroshot_form_contract_fullpdf_all.csv', index=False)