# Find Most Similar Pages Using CLIP

This notebook uses CLIP embeddings to find pages in the non-examples folder that are most similar to the administrative forms in the examples folder.

## Why CLIP?
- Better visual discrimination than document-specific models
- Trained on diverse image-text pairs
- Should show clearer separation between forms and non-forms

## 1. Setup and Imports

In [None]:
import os
# Disable tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
import numpy as np
import pandas as pd
from pathlib import Path
from PIL import Image
from pdf2image import convert_from_path
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple, Dict
import json

from transformers import CLIPProcessor, CLIPModel

# Select the compute device: CUDA when PyTorch reports one, CPU otherwise
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

## 2. Define Paths and Load Model

In [None]:
# Define paths
# The project root defaults to the original checkout location but can be
# overridden with the CHEHALIS_BASE_PATH environment variable, so the notebook
# also runs on machines where the repository lives somewhere else.
DEFAULT_BASE_PATH = '/Users/admin-tascott/Documents/GitHub/chehalis'
BASE_PATH = Path(os.environ.get('CHEHALIS_BASE_PATH', DEFAULT_BASE_PATH)).expanduser()
EXAMPLE_FORMS_PATH = BASE_PATH / 'data' / 'raw' / '_exampleforms'
NON_EXAMPLES_PATH = BASE_PATH / 'data' / 'raw' / '_nonexamples'

# Check if paths exist (diagnostic only; the following cells glob these folders)
print(f"Example forms path exists: {EXAMPLE_FORMS_PATH.exists()}")
print(f"Non-examples path exists: {NON_EXAMPLES_PATH.exists()}")

# Load CLIP model and its matching image processor
print("\nLoading CLIP model...")
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

# Use only the vision model for embeddings; switch it to eval mode for inference
vision_model = model.vision_model.to(device)
vision_model.eval()

print("CLIP model loaded successfully")

## 3. Helper Functions

In [None]:
def pdf_to_images(pdf_path: Path, dpi: int = 150) -> List[Image.Image]:
    """
    Convert a PDF into a list of PIL images.

    Args:
        pdf_path: Location of the PDF to convert.
        dpi: Resolution forwarded to ``convert_from_path``.

    Returns:
        Whatever ``convert_from_path`` returns, or an empty list when the
        conversion raises (the error is printed instead of propagated).
    """
    try:
        return convert_from_path(pdf_path, dpi=dpi)
    except Exception as e:
        # Best-effort: report the failure and let the caller continue with no pages
        print(f"Error converting {pdf_path}: {e}")
    return []

def preprocess_image(image: Image.Image) -> Image.Image:
    """
    Make sure an image is in RGB mode before it is handed to CLIP.

    No resizing happens here: an image that is already RGB is returned as-is
    (the very same object); any other mode goes through ``image.convert('RGB')``.
    """
    # NOTE(review): scaling to CLIP's input resolution is presumably done by the
    # CLIPProcessor call in extract_embedding — confirm.
    return image if image.mode == 'RGB' else image.convert('RGB')

@torch.no_grad()
def extract_embedding(image: Image.Image) -> np.ndarray:
    """
    Compute the CLIP vision-model embedding of one image.

    The image is converted to RGB, run through the CLIP processor and then
    through ``vision_model`` with gradient tracking disabled.

    Args:
        image: The page image to embed.

    Returns:
        The embedding of the single input image as a NumPy array (the first
        and only row of the batch).
    """
    # NOTE(review): vision_model is model.vision_model, so these features are
    # taken without CLIPModel's projection head; presumably that is what
    # get_image_features() would add — confirm whether this is intended.
    rgb_image = preprocess_image(image)

    # The processor returns a batch of size one; move it to the model's device
    pixel_values = processor(images=rgb_image, return_tensors="pt").pixel_values.to(device)

    outputs = vision_model(pixel_values=pixel_values)

    # Prefer the pooled output; otherwise average the last hidden state over dim 1
    pooled = getattr(outputs, 'pooler_output', None)
    if pooled is None:
        pooled = outputs.last_hidden_state.mean(dim=1)

    return pooled.cpu().numpy()[0]

## 4. Process Example Forms

In [None]:
# Extract embeddings for all example form pages
example_embeddings = []
example_metadata = []

print("Processing example forms...")
# glob() yields files in no guaranteed order; sorting keeps the embedding row
# indices (and the metadata aligned with them) reproducible between runs.
example_files = sorted(EXAMPLE_FORMS_PATH.glob('*.pdf'))
print(f"Found {len(example_files)} example PDF files")

for pdf_path in tqdm(example_files, desc="Example PDFs"):
    images = pdf_to_images(pdf_path)
    total_pages = len(images)

    # Page numbers are stored 1-based
    for page_num, image in enumerate(images, start=1):
        example_embeddings.append(extract_embedding(image))

        # One metadata record per embedding row, in the same order
        example_metadata.append({
            'file_path': str(pdf_path),
            'filename': pdf_path.name,
            'page_num': page_num,
            'total_pages': total_pages
        })

# Fail here with a clear message instead of letting an empty array surface
# later as a shape error in the similarity computation.
if not example_embeddings:
    raise RuntimeError(
        f"No example pages were embedded from {EXAMPLE_FORMS_PATH}; "
        "check that the folder exists and contains readable PDF files."
    )

# Convert to numpy array
example_embeddings = np.array(example_embeddings)
print(f"\nExtracted embeddings for {len(example_embeddings)} example form pages")
print(f"Embedding shape: {example_embeddings.shape}")

## 5. Process Non-Example Documents

In [None]:
# Extract embeddings for non-example pages
non_example_embeddings = []
non_example_metadata = []
non_example_images = []  # Downsized copies kept only for visualization

print("\nProcessing non-example documents...")
# glob() yields files in no guaranteed order; sort first so the max_pdfs
# subset below is the same on every run and every machine.
non_example_files = sorted(NON_EXAMPLES_PATH.glob('*.pdf'))

# Limit number of PDFs for memory management
max_pdfs = 50  # Adjust as needed
non_example_files = non_example_files[:max_pdfs]
print(f"Processing {len(non_example_files)} non-example PDF files")

# Largest size kept per page for the plots; matches the 400x400 display bound
# used by the visualization cell.
thumbnail_size = (400, 400)

for pdf_path in tqdm(non_example_files, desc="Non-example PDFs"):
    images = pdf_to_images(pdf_path)
    total_pages = len(images)

    # Page numbers are stored 1-based
    for page_num, image in enumerate(images, start=1):
        # The embedding is computed from the full-resolution page
        non_example_embeddings.append(extract_embedding(image))

        # One metadata record per embedding row, in the same order
        non_example_metadata.append({
            'file_path': str(pdf_path),
            'filename': pdf_path.name,
            'page_num': page_num,
            'total_pages': total_pages
        })

        # Keep a small copy instead of the full-resolution render:
        # thumbnail() shrinks in place, so work on a copy of the page.
        thumb = preprocess_image(image).copy()
        thumb.thumbnail(thumbnail_size, Image.Resampling.LANCZOS)
        non_example_images.append(thumb)

# Fail here with a clear message instead of letting an empty array surface
# later as a shape error in the similarity computation.
if not non_example_embeddings:
    raise RuntimeError(
        f"No non-example pages were embedded from {NON_EXAMPLES_PATH}; "
        "check that the folder exists and contains readable PDF files."
    )

# Convert to numpy array
non_example_embeddings = np.array(non_example_embeddings)
print(f"\nExtracted embeddings for {len(non_example_embeddings)} non-example pages")
print(f"Embedding shape: {non_example_embeddings.shape}")

## 6. Compute Similarity Scores

In [None]:
# Cosine similarity of every non-example page (rows) against every example page (columns)
print("\nComputing similarity scores...")
similarity_matrix = cosine_similarity(non_example_embeddings, example_embeddings)

# For each non-example page: which example page matches best, and how well
most_similar_example_idx = np.argmax(similarity_matrix, axis=1)
max_similarities = np.max(similarity_matrix, axis=1)

# Summary statistics over the per-page best scores
print(f"\nSimilarity Statistics:")
print(f"Mean similarity: {max_similarities.mean():.4f}")
print(f"Std deviation: {max_similarities.std():.4f}")
print(f"Min similarity: {max_similarities.min():.4f}")
print(f"Max similarity: {max_similarities.max():.4f}")

# The single non-example page with the highest best-match score
most_similar_idx = np.argmax(max_similarities)
most_similar_score = max_similarities[most_similar_idx]
most_similar_example = most_similar_example_idx[most_similar_idx]

best_page_meta = non_example_metadata[most_similar_idx]
best_example_meta = example_metadata[most_similar_example]

print(f"\nMost similar non-example page:")
print(f"  File: {best_page_meta['filename']}")
print(f"  Page: {best_page_meta['page_num']}")
print(f"  Similarity score: {most_similar_score:.4f}")
print(f"  Most similar to example: {best_example_meta['filename']}, page {best_example_meta['page_num']}")

## 7. Analyze High Similarity Pages

In [None]:
# Collect every non-example page whose best-match score exceeds the threshold
high_similarity_threshold = 0.9
high_similarity_indices = np.flatnonzero(max_similarities > high_similarity_threshold)
n_high = len(high_similarity_indices)

print(f"\nFound {n_high} non-example pages with similarity > {high_similarity_threshold}")
print(f"That's {n_high / len(max_similarities) * 100:.1f}% of all non-example pages!")

if n_high > 0:
    print("\nHigh similarity non-example pages:")
    # argsort is ascending, so reverse it to rank from highest to lowest score
    ascending_order = np.argsort(max_similarities[high_similarity_indices])
    sorted_indices = high_similarity_indices[ascending_order[::-1]]

    # Only the first 20 are listed to keep the output readable
    max_listed = 20
    for idx in sorted_indices[:max_listed]:
        page_meta = non_example_metadata[idx]
        matched_example = example_metadata[most_similar_example_idx[idx]]
        print(f"\n  File: {page_meta['filename']}, Page: {page_meta['page_num']}")
        print(f"  Similarity: {max_similarities[idx]:.4f}")
        print(f"  Similar to: {matched_example['filename']}, page {matched_example['page_num']}")

    if n_high > max_listed:
        print(f"\n  ... and {n_high - max_listed} more pages with similarity > {high_similarity_threshold}")

## 8. Find Top K Most Similar Pages

In [None]:
# Rank the K non-example pages with the highest best-match score
K = 10
ascending_order = np.argsort(max_similarities)
top_k_indices = ascending_order[-K:][::-1]  # last K of an ascending sort, best first

print(f"\nTop {K} most similar non-example pages:")
results = []

for position, idx in enumerate(top_k_indices, start=1):
    page_meta = non_example_metadata[idx]
    matched_example = example_metadata[most_similar_example_idx[idx]]
    score = max_similarities[idx]

    # One table row per ranked page
    results.append({
        'rank': position,
        'filename': page_meta['filename'],
        'page': page_meta['page_num'],
        'similarity_score': score,
        'similar_to_example': matched_example['filename'],
        'similar_to_page': matched_example['page_num']
    })

    print(f"\n{position}. File: {page_meta['filename']}, Page: {page_meta['page_num']}")
    print(f"   Similarity: {score:.4f}")
    print(f"   Most similar to: {matched_example['filename']}, page {matched_example['page_num']}")

# Show the same ranking as a table
results_df = pd.DataFrame(results)
display(results_df)

## 9. Visualize Most Similar Pages

In [None]:
# Visualize top 5 most similar non-example pages.
# Top row: the non-example page; bottom row: the example page it matched best.
fig, axes = plt.subplots(2, 5, figsize=(20, 10))
fig.suptitle('Top 5 Most Similar Non-Example Pages (Using CLIP)', fontsize=16)

# Hide every axis up front so a slot that receives no image (fewer than five
# pages, or an example PDF that failed to render) stays blank instead of
# showing an empty framed plot.
for ax in axes.flat:
    ax.axis('off')

max_display_size = (400, 400)
# Rendered pages per example PDF, so a PDF matched by several panels is
# converted only once within this cell.
example_pages_cache = {}

for i, idx in enumerate(top_k_indices[:5]):
    # Non-example page. thumbnail() resizes in place and never enlarges, so
    # work on a copy to leave the image stored in non_example_images untouched.
    ax_non = axes[0, i]
    display_img = non_example_images[idx].copy()
    display_img.thumbnail(max_display_size, Image.Resampling.LANCZOS)
    ax_non.imshow(display_img)
    ax_non.set_title(f"Non-Example\n{non_example_metadata[idx]['filename']}\nPage {non_example_metadata[idx]['page_num']}\nScore: {max_similarities[idx]:.3f}")

    # Most similar example page
    similar_example_idx = most_similar_example_idx[idx]
    example_meta = example_metadata[similar_example_idx]

    # Load the example image for visualization (example pages are not kept in memory)
    example_pdf_path = Path(example_meta['file_path'])
    if example_pdf_path not in example_pages_cache:
        example_pages_cache[example_pdf_path] = pdf_to_images(example_pdf_path)
    example_images = example_pages_cache[example_pdf_path]

    # page_num is 1-based; skip the panel if the page could not be rendered
    if example_meta['page_num'] <= len(example_images):
        # Copy before shrinking so the cached page keeps its original size
        example_image = preprocess_image(example_images[example_meta['page_num'] - 1]).copy()
        example_image.thumbnail(max_display_size, Image.Resampling.LANCZOS)

        ax_ex = axes[1, i]
        ax_ex.imshow(example_image)
        ax_ex.set_title(f"Similar Example\n{example_meta['filename']}\nPage {example_meta['page_num']}")

plt.tight_layout()
plt.show()

## 10. Analyze Similarity Distribution

In [None]:
# Histogram of the per-page best-match scores, with reference lines for the
# top score and the 0.9 threshold
dist_fig, dist_ax = plt.subplots(figsize=(10, 6))
dist_ax.hist(max_similarities, bins=50, alpha=0.7, edgecolor='black')
dist_ax.axvline(x=most_similar_score, color='red', linestyle='--', label=f'Most similar: {most_similar_score:.3f}')
dist_ax.axvline(x=0.9, color='orange', linestyle='--', label='Threshold: 0.9')
dist_ax.set_xlabel('Maximum Cosine Similarity Score')
dist_ax.set_ylabel('Number of Non-Example Pages')
dist_ax.set_title('Distribution of Similarity Scores (CLIP)\n(Non-Example Pages vs Most Similar Example Form)')
dist_ax.legend()
dist_ax.grid(True, alpha=0.3)
plt.show()

# Percentiles of the same scores
print("\nPercentile Statistics:")
percentile_levels = [50, 75, 90, 95, 99]
for p in percentile_levels:
    print(f"{p}th percentile: {np.percentile(max_similarities, p):.4f}")

# Page counts above / below a few fixed cut-offs
print(f"\nPages with similarity > 0.9: {(max_similarities > 0.9).sum()}")
print(f"Pages with similarity > 0.8: {(max_similarities > 0.8).sum()}")
print(f"Pages with similarity > 0.7: {(max_similarities > 0.7).sum()}")
print(f"Pages with similarity < 0.5: {(max_similarities < 0.5).sum()}")

## 11. Save Results

In [None]:
# One record per non-example page: its best score and the example page that produced it
all_results = [
    {
        'non_example_file': non_example_metadata[row]['filename'],
        'non_example_page': non_example_metadata[row]['page_num'],
        'non_example_total_pages': non_example_metadata[row]['total_pages'],
        'similarity_score': max_similarities[row],
        'most_similar_example_file': example_metadata[most_similar_example_idx[row]]['filename'],
        'most_similar_example_page': example_metadata[most_similar_example_idx[row]]['page_num']
    }
    for row in range(len(max_similarities))
]

# Highest scores first, then write the full table to CSV in the working directory
results_df = pd.DataFrame(all_results).sort_values('similarity_score', ascending=False)
results_df.to_csv('clip_similarity_results.csv', index=False)

print(f"\nResults saved to clip_similarity_results.csv")
print(f"\nTop 10 most similar pages:")
display(results_df.head(10))

## 12. Identify Potential Misclassified Pages

In [None]:
# Non-example pages scoring above this cut-off are candidates for mislabelled forms
threshold = 0.85  # Adjust based on your results
above_threshold = results_df['similarity_score'].gt(threshold)
potential_misclassified = results_df.loc[above_threshold]

print(f"\nPotential misclassified pages (similarity > {threshold}):")
print(f"Found {len(potential_misclassified)} pages")

if not potential_misclassified.empty:
    print("\nThese non-example pages are very similar to the administrative forms:")
    for row in potential_misclassified.itertuples(index=False):
        print(f"\n- File: {row.non_example_file}, Page: {row.non_example_page}")
        print(f"  Similarity: {row.similarity_score:.4f}")
        print(f"  Similar to: {row.most_similar_example_file}, page {row.most_similar_example_page}")

    # Persist the candidate list next to the full results file
    potential_misclassified.to_csv('potential_misclassified_pages_clip.csv', index=False)
    print("\nSaved to potential_misclassified_pages_clip.csv")

## Summary

This notebook uses CLIP to find similar pages. If you're still seeing high similarity scores (>0.9) for many non-example pages, consider:

1. **Visual Inspection**: Look at the high-scoring pages - they might actually be forms
2. **Different Models**: Try other vision models like DINOv2 or newer CLIP variants
3. **Feature Engineering**: Extract specific features (text density, layout structure)
4. **Fine-tuning**: Fine-tune CLIP on your specific document types