# OCR Processing for Contract PDFs

This notebook performs OCR on PDF files in `data/raw/_contracts` using HuggingFace's TrOCR model.

TrOCR is chosen because:
- Optimized for printed text (perfect for contracts)
- Good balance of speed and accuracy
- Works well on MPS (Mac) devices
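- Recognizes one small text region at a time (the model card targets single text-line images), so full pages are split into horizontal strips before OCR rather than relying on the model to understand page layout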

In [None]:
# Install required packages if needed
# !pip install transformers torch torchvision
# !pip install pymupdf pdf2image pillow
# !pip install tqdm

In [None]:
import os
import json
import fitz  # PyMuPDF
from PIL import Image
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from transformers import AutoProcessor, AutoModelForVision2Seq
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Input and output locations (relative paths, resolved against the current
# working directory of the notebook kernel).
contracts_dir = "../../data/raw/_contracts/"
output_dir = "../../data/intermediate_products/ocr_results/"

# Make sure the results folder exists before any cell tries to write into it.
os.makedirs(output_dir, exist_ok=True)

print(
    f"Contracts directory: {contracts_dir}",
    f"Output directory: {output_dir}",
    sep="\n",
)

In [None]:
# Pick the compute device: CUDA first, then Apple MPS, otherwise plain CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# On MPS, request CPU fallback for operators the backend does not implement.
# NOTE(review): PyTorch may read this variable only while `torch` is being
# imported; confirm that setting it here (after the import) takes effect.
if device.type == "mps":
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

In [None]:
# Load the TrOCR processor and model weights from the Hugging Face hub.
# The processor is used later to turn images into pixel tensors and to decode
# generated token ids back into text.
print("Loading TrOCR model...")

# Option 1 (active): TrOCR base checkpoint for printed text
model_name = "microsoft/trocr-base-printed"
processor = TrOCRProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

# Option 2: Alternative - Donut model (better for complex layouts)
# model_name = "naver-clova-ix/donut-base"
# processor = AutoProcessor.from_pretrained(model_name)
# model = AutoModelForVision2Seq.from_pretrained(model_name)

# Move the weights to the selected device and switch to evaluation mode
# (both calls return the module itself, so they can be chained).
model = model.to(device).eval()

print(f"Model loaded: {model_name}")

In [None]:
def pdf_page_to_image(pdf_path, page_num, dpi=300):
    """
    Render one page of a PDF to an RGB PIL image.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Index of the page to render (0 is the first page).
        dpi: Target resolution. The page is scaled by ``dpi / 72``, so higher
            values give sharper text but larger images and slower OCR.

    Returns:
        A PIL ``Image`` in "RGB" mode.

    Raises:
        Whatever PyMuPDF raises for an unreadable file or an invalid page
        index. The document is closed before the exception propagates.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        page = pdf_document[page_num]

        # Increase resolution for better OCR
        zoom = dpi / 72
        # alpha=False keeps the buffer at 3 bytes per pixel, matching "RGB"
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally:
        # Always release the file handle, even when rendering fails
        pdf_document.close()

def ocr_image(image, processor, model, device, max_length=512):
    """
    Run TrOCR on one image and return the recognized text.

    Args:
        image: Image to recognize (presumably an RGB PIL image; whatever the
            processor accepts).
        processor: Callable that converts the image to ``pixel_values`` and
            offers ``batch_decode`` for the generated token ids.
        model: Model exposing ``generate``.
        device: Device the pixel tensor is moved to before generation.
        max_length: Passed to ``model.generate`` as the generation limit.

    Returns:
        The first decoded string, with special tokens removed.
    """
    # Preprocess, then move the tensor next to the model
    encoded = processor(image, return_tensors="pt")
    inputs = encoded.pixel_values.to(device)

    # Pure inference: gradients are not needed
    with torch.no_grad():
        token_ids = model.generate(inputs, max_length=max_length)

    # One input image yields one decoded sequence
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

In [None]:
def split_image_into_chunks(image, chunk_height=500, overlap=50):
    """
    Split an image into full-width horizontal strips that overlap vertically.

    Consecutive strips start ``chunk_height - overlap`` pixels apart, so text
    cut by one strip boundary appears whole in the neighbouring strip. The
    last strip ends at the bottom of the image and may be shorter than
    ``chunk_height``. No extra strip is produced once the bottom edge has
    been reached, so the tail of the page is not recognized twice.

    Args:
        image: Object with a ``size`` of ``(width, height)`` and a
            ``crop((left, top, right, bottom))`` method (e.g. a PIL image).
        chunk_height: Height of each strip in pixels; must be positive.
        overlap: Rows shared by consecutive strips; must satisfy
            ``0 <= overlap < chunk_height``.

    Returns:
        List of ``(chunk, y_start, y_end)`` tuples ordered top to bottom.
        Empty when the image height is 0.

    Raises:
        ValueError: If ``chunk_height`` or ``overlap`` is out of range (an
            overlap >= chunk_height would never advance down the page).
    """
    if chunk_height <= 0:
        raise ValueError("chunk_height must be positive")
    if not 0 <= overlap < chunk_height:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_height")

    width, height = image.size
    step = chunk_height - overlap
    chunks = []

    y = 0
    while y < height:
        y_end = min(y + chunk_height, height)
        chunks.append((image.crop((0, y, width, y_end)), y, y_end))
        if y_end == height:
            # Bottom reached: another step would only re-cut the overlap band
            break
        y += step

    return chunks

def ocr_full_page(image, processor, model, device):
    """
    OCR a whole page image strip by strip and join the results.

    The page is cut with ``split_image_into_chunks`` (default sizes), each
    strip goes through ``ocr_image``, and strips whose text is empty or
    whitespace-only are dropped.

    NOTE(review): the TrOCR checkpoints are described as single text-line
    recognizers; a strip containing several lines may be recognized poorly.
    Confirm the quality on sample pages.

    Returns:
        The non-blank strip texts joined with newlines, top to bottom.
    """
    strip_texts = (
        ocr_image(strip, processor, model, device)
        for strip, _top, _bottom in split_image_into_chunks(image)
    )
    return "\n".join(text for text in strip_texts if text.strip())

In [None]:
# Test OCR on a single page
# os.listdir returns names in arbitrary order, so sort the listing to make
# "the first PDF" the same file on every run. The extension check ignores case
# so that files named *.PDF are found as well.
pdf_files = sorted(f for f in os.listdir(contracts_dir) if f.lower().endswith('.pdf'))
print(f"Found {len(pdf_files)} PDF files")

if pdf_files:
    test_file = pdf_files[0]
    print(f"\nTesting OCR on: {test_file}")

    # Convert first page to image (200 DPI keeps the test quick)
    test_img = pdf_page_to_image(os.path.join(contracts_dir, test_file), 0, dpi=200)
    print(f"Page size: {test_img.size}")

    # Try OCR on a small region first: the top strip, at most 500 px tall
    test_region = test_img.crop((0, 0, test_img.width, min(500, test_img.height)))
    print("\nOCR on top region:")
    test_text = ocr_image(test_region, processor, model, device)
    # Show at most the first 200 characters
    print(test_text[:200] + "..." if len(test_text) > 200 else test_text)

In [None]:
def process_pdf_with_ocr(pdf_path, processor, model, device, max_pages=None):
    """
    OCR every page of a PDF (optionally only the first ``max_pages``).

    Each page is rendered at 200 DPI with ``pdf_page_to_image`` and recognized
    with ``ocr_full_page``. A failure on one page is recorded in that page's
    entry and does not stop the remaining pages.

    Args:
        pdf_path: Path of the PDF to process.
        processor: TrOCR processor.
        model: TrOCR model.
        device: torch device.
        max_pages: Upper bound on the number of pages; a falsy value
            (``None`` or 0) means all pages.

    Returns:
        Dict with ``filename`` (base name of ``pdf_path``), ``num_pages``
        (number of pages actually processed) and ``pages``, which maps the
        1-based page number to ``{"text", "status"}``. Failed pages have
        empty text, status ``"error"`` and an extra ``"error"`` message.
    """
    # Open the document only long enough to count its pages
    document = fitz.open(pdf_path)
    page_total = len(document)
    document.close()

    page_count = min(page_total, max_pages) if max_pages else page_total

    pages = {}
    for index in range(page_count):
        try:
            rendered = pdf_page_to_image(pdf_path, index, dpi=200)
            recognized = ocr_full_page(rendered, processor, model, device)
            entry = {"text": recognized, "status": "success"}
        except Exception as exc:
            entry = {"text": "", "status": "error", "error": str(exc)}
        # Pages are keyed by their human-friendly 1-based number
        pages[index + 1] = entry

    return {
        "filename": os.path.basename(pdf_path),
        "num_pages": page_count,
        "pages": pages,
    }

In [None]:
# Run the full OCR pipeline on the first PDF, capped at three pages
if pdf_files:
    test_pdf = os.path.join(contracts_dir, pdf_files[0])
    print(f"Processing full PDF: {pdf_files[0]}")

    results = process_pdf_with_ocr(test_pdf, processor, model, device, max_pages=3)

    print(
        f"\nProcessed {len(results['pages'])} pages",
        "\nSample from page 1:",
        sep="\n",
    )
    # Preview at most the first 300 characters of page 1
    page1_text = results['pages'][1]['text']
    if len(page1_text) > 300:
        print(page1_text[:300] + "...")
    else:
        print(page1_text)

In [None]:
# Batch processing function with CLOBBER option
def batch_process_pdfs(pdf_dir, output_dir, processor, model, device,
                      max_files=None, max_pages_per_pdf=None,
                      CLOBBER=False):
    """
    Process multiple PDFs and save OCR results

    Each ``<name>.pdf`` in ``pdf_dir`` is run through ``process_pdf_with_ocr``
    and written to ``<name>_ocr.json`` in ``output_dir``. A failure on one
    file is printed and the batch continues with the next file.

    Args:
        pdf_dir: Directory containing PDF files (extension matched without
            regard to case)
        output_dir: Directory to save OCR results
        processor: TrOCR processor
        model: TrOCR model
        device: torch device
        max_files: Limit number of files to process (None for all)
        max_pages_per_pdf: Limit pages per PDF (None for all)
        CLOBBER: If True, process all files. If False, only process files
            that have no ``_ocr.json`` result in ``output_dir`` yet.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    ocr_suffix = '_ocr.json'

    # Sorted because os.listdir order is arbitrary; this makes a `max_files`
    # limit select the same files on every run.
    all_pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf'))

    if CLOBBER:
        # Process all files
        pdf_files = all_pdf_files
        print(f"CLOBBER=True: Processing ALL {len(pdf_files)} PDFs...")
    else:
        # Compare by stem (name without extension / result suffix) so only
        # the trailing extension matters, not other ".pdf" text in the name.
        processed_stems = {f[:-len(ocr_suffix)]
                           for f in os.listdir(output_dir)
                           if f.endswith(ocr_suffix)}
        pdf_files = [f for f in all_pdf_files
                     if os.path.splitext(f)[0] not in processed_stems]

        print(f"CLOBBER=False: Found {len(all_pdf_files)} total PDFs")
        print(f"  Already processed: {len(processed_stems)}")
        print(f"  New files to process: {len(pdf_files)}")

    if max_files:
        pdf_files = pdf_files[:max_files]
        print(f"  Limited to {max_files} files")

    if not pdf_files:
        print("No files to process!")
        return

    print(f"\nProcessing {len(pdf_files)} PDFs...")

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        output_path = os.path.join(output_dir, os.path.splitext(pdf_file)[0] + ocr_suffix)
        # Write to a temporary name first: a half-written `_ocr.json` would
        # otherwise be mistaken for a finished result on the next
        # CLOBBER=False run.
        temp_path = output_path + '.tmp'

        try:
            pdf_path = os.path.join(pdf_dir, pdf_file)
            results = process_pdf_with_ocr(pdf_path, processor, model, device, max_pages_per_pdf)

            # Save results, then move them into place in one step
            with open(temp_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
            os.replace(temp_path, output_path)

        except Exception as e:
            print(f"\nError processing {pdf_file}: {e}")

        finally:
            # After a successful os.replace the temp file is gone; this only
            # cleans up after a failed or interrupted write.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    print("\nBatch processing complete!")

In [None]:
# Smoke-test the batch function: at most 5 new PDFs, first 2 pages of each
print(
    "Processing a test batch of 5 PDFs (first 2 pages each)...\n"
    "Using CLOBBER=False to only process new files\n"
)

batch_process_pdfs(
    contracts_dir, output_dir, processor, model, device,
    max_files=5,
    max_pages_per_pdf=2,
    CLOBBER=False,  # skip PDFs that already have a result file
)

In [None]:
# Example: Using CLOBBER to control processing behavior

# Count the OCR result files that already exist
existing_files = [name for name in os.listdir(output_dir) if name.endswith('_ocr.json')]
print(f"Currently {len(existing_files)} files in OCR output directory")

# Run the same small job twice (3 files, first page only): first incrementally
# with CLOBBER=False, then again with CLOBBER=True to force reprocessing.
for banner, clobber_mode in (
    ("\n--- Testing CLOBBER=False (incremental mode) ---", False),
    ("\n--- Testing CLOBBER=True (reprocess mode) ---", True),
):
    print(banner)
    batch_process_pdfs(
        contracts_dir, output_dir, processor, model, device,
        max_files=3,
        max_pages_per_pdf=1,
        CLOBBER=clobber_mode,
    )

In [None]:
# Alternative: Faster batch processing using PyMuPDF's built-in text extraction first
def hybrid_ocr_extraction(pdf_path, processor, model, device, ocr_threshold=100):
    """
    Try PyMuPDF text extraction first, fall back to OCR if needed
    This is much faster for PDFs with embedded text

    Args:
        pdf_path: Path of the PDF to process.
        processor: TrOCR processor.
        model: TrOCR model.
        device: torch device.
        ocr_threshold: A page whose embedded text (stripped) is shorter than
            this many characters is rendered at 200 DPI and OCR'd instead.

    Returns:
        Dict with ``filename``, ``num_pages`` and ``pages``, which maps the
        1-based page number to ``{"text", "method"}``. ``method`` is
        ``"extracted"``, ``"ocr"`` or ``"error"``. When OCR fails, ``text``
        is empty and the message is stored under ``"error"`` (the same shape
        ``process_pdf_with_ocr`` uses), so error messages never end up in
        the recognized text.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        results = {
            "filename": os.path.basename(pdf_path),
            "num_pages": len(pdf_document),
            "pages": {}
        }

        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]

            # Try direct text extraction first
            text = page.get_text()
            entry = {"text": text, "method": "extracted"}

            # If text is too short, use OCR
            if len(text.strip()) < ocr_threshold:
                try:
                    img = pdf_page_to_image(pdf_path, page_num, dpi=200)
                    entry = {
                        "text": ocr_full_page(img, processor, model, device),
                        "method": "ocr"
                    }
                except Exception as e:
                    # Keep the failure out of the text field
                    entry = {"text": "", "method": "error", "error": str(e)}

            results["pages"][page_num + 1] = entry

        return results
    finally:
        # Close the document even if extraction raises part-way through
        pdf_document.close()

def batch_process_pdfs_hybrid(pdf_dir, output_dir, processor, model, device,
                             max_files=None, CLOBBER=False, ocr_threshold=100):
    """
    Batch process PDFs using hybrid extraction (text extraction + OCR fallback)

    Each ``<name>.pdf`` in ``pdf_dir`` is run through ``hybrid_ocr_extraction``
    and written to ``<name>_ocr.json`` in ``output_dir``. A failure on one
    file is printed and the batch continues with the next file.

    Args:
        pdf_dir: Directory containing PDF files (extension matched without
            regard to case)
        output_dir: Directory to save OCR results
        processor: TrOCR processor
        model: TrOCR model
        device: torch device
        max_files: Limit number of files to process (None for all)
        CLOBBER: If True, process all files. If False, only process files
            that have no ``_ocr.json`` result in ``output_dir`` yet.
        ocr_threshold: Min text length to consider extraction successful

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    ocr_suffix = '_ocr.json'

    # Sorted because os.listdir order is arbitrary; this makes a `max_files`
    # limit select the same files on every run.
    all_pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf'))

    if CLOBBER:
        pdf_files = all_pdf_files
        print(f"CLOBBER=True: Processing ALL {len(pdf_files)} PDFs with hybrid extraction...")
    else:
        # Compare by stem (name without extension / result suffix) so only
        # the trailing extension matters, not other ".pdf" text in the name.
        processed_stems = {f[:-len(ocr_suffix)]
                           for f in os.listdir(output_dir)
                           if f.endswith(ocr_suffix)}
        pdf_files = [f for f in all_pdf_files
                     if os.path.splitext(f)[0] not in processed_stems]

        print(f"CLOBBER=False: Found {len(all_pdf_files)} total PDFs")
        print(f"  Already processed: {len(processed_stems)}")
        print(f"  New files to process: {len(pdf_files)}")

    if max_files:
        pdf_files = pdf_files[:max_files]
        print(f"  Limited to {max_files} files")

    if not pdf_files:
        print("No files to process!")
        return

    print(f"\nProcessing {len(pdf_files)} PDFs with hybrid extraction...")

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        output_path = os.path.join(output_dir, os.path.splitext(pdf_file)[0] + ocr_suffix)
        # Write to a temporary name first: a half-written `_ocr.json` would
        # otherwise be mistaken for a finished result on the next
        # CLOBBER=False run.
        temp_path = output_path + '.tmp'

        try:
            pdf_path = os.path.join(pdf_dir, pdf_file)
            results = hybrid_ocr_extraction(pdf_path, processor, model, device, ocr_threshold)

            with open(temp_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
            os.replace(temp_path, output_path)

        except Exception as e:
            print(f"\nError with {pdf_file}: {e}")

        finally:
            # After a successful os.replace the temp file is gone; this only
            # cleans up after a failed or interrupted write.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    print("\nHybrid batch processing complete!")

# Quick check of the hybrid path on the first PDF: report how page 1 was read
if pdf_files:
    print("Testing hybrid extraction...")
    test_results = hybrid_ocr_extraction(
        os.path.join(contracts_dir, pdf_files[0]), processor, model, device
    )
    first_page = test_results['pages'][1]
    print(f"Page 1 extracted using: {first_page['method']}")

In [None]:
# Full processing with CLOBBER option
# Set CLOBBER based on your needs:
#   - CLOBBER=True: Process ALL files (overwrites existing OCR results)
#   - CLOBBER=False: Only process new files not in output directory

CLOBBER = False  # Change this to True to reprocess all files

# The runs below are disabled; uncomment ONE of them to execute it.
# They are kept as comments rather than bare triple-quoted strings, because a
# string block is still evaluated as an expression and Jupyter echoes the last
# one as the cell's output.

# Option 1: Standard OCR processing (slower but works on all PDFs)
# print(f"Starting full OCR processing with CLOBBER={CLOBBER}")
# batch_process_pdfs(
#     contracts_dir,
#     output_dir,
#     processor,
#     model,
#     device,
#     max_files=None,  # Process all files
#     max_pages_per_pdf=None,  # Process all pages
#     CLOBBER=CLOBBER
# )

# Option 2: Hybrid processing (RECOMMENDED - much faster)
# print(f"Starting hybrid extraction with CLOBBER={CLOBBER}")
# batch_process_pdfs_hybrid(
#     contracts_dir,
#     output_dir,
#     processor,
#     model,
#     device,
#     max_files=None,  # Process all files
#     CLOBBER=CLOBBER,
#     ocr_threshold=100  # Min text length to consider extraction successful
# )

# Example: Process only new files (incremental update)
# print("Incremental update - processing only new files...")
# batch_process_pdfs_hybrid(
#     contracts_dir,
#     output_dir,
#     processor,
#     model,
#     device,
#     CLOBBER=False  # Only new files
# )

# Example: Reprocess everything
# print("Full reprocessing - updating all OCR results...")
# batch_process_pdfs_hybrid(
#     contracts_dir,
#     output_dir,
#     processor,
#     model,
#     device,
#     CLOBBER=True  # Reprocess all files
# )

In [None]:
# Utility: Check OCR results
def check_ocr_results(output_dir):
    """
    Summary statistics of OCR results

    Reads every ``*_ocr.json`` file in ``output_dir`` and prints how many
    pages were extracted directly, needed OCR, or failed. Both result shapes
    written in this notebook are understood:

    - hybrid results carry ``method`` (``"extracted"``, ``"ocr"``, ``"error"``)
    - pure-OCR results carry ``status`` instead; ``"success"`` is counted as
      an OCR page and ``"error"`` as an error page.

    Pages with neither a known ``method`` nor a known ``status`` only count
    towards the total. Percentages are reported as 0.0 when there are no
    pages at all, so an empty result folder does not raise.

    Args:
        output_dir: Directory containing the ``*_ocr.json`` result files.

    Returns:
        None. The summary is printed.
    """
    json_files = [f for f in os.listdir(output_dir) if f.endswith('_ocr.json')]

    # `status` values from the pure-OCR pipeline, expressed as methods
    status_to_method = {'success': 'ocr', 'error': 'error'}

    total_pages = 0
    counts = {'ocr': 0, 'extracted': 0, 'error': 0}

    for json_file in json_files:
        # Results are written as UTF-8 with ensure_ascii=False, so read them
        # back as UTF-8 rather than with the platform default encoding.
        with open(os.path.join(output_dir, json_file), 'r', encoding='utf-8') as f:
            data = json.load(f)

        for page_data in data['pages'].values():
            total_pages += 1
            method = page_data.get('method')
            if method is None:
                method = status_to_method.get(page_data.get('status'), 'unknown')
            if method in counts:
                counts[method] += 1

    def pct(count):
        # Guard against division by zero when no pages were found
        return count / total_pages * 100 if total_pages else 0.0

    extracted_pages = counts['extracted']
    ocr_pages = counts['ocr']
    error_pages = counts['error']

    print("OCR Results Summary:")
    print(f"  Total files: {len(json_files)}")
    print(f"  Total pages: {total_pages}")
    print(f"  Pages with extracted text: {extracted_pages} ({pct(extracted_pages):.1f}%)")
    print(f"  Pages requiring OCR: {ocr_pages} ({pct(ocr_pages):.1f}%)")
    print(f"  Error pages: {error_pages} ({pct(error_pages):.1f}%)")

# Summarize results, but only when the output folder exists and is not empty
if os.path.exists(output_dir):
    if os.listdir(output_dir):
        check_ocr_results(output_dir)

## Notes on Performance

1. **TrOCR** is optimized for printed text and works well on contracts
2. **Hybrid approach** is much faster - uses built-in text extraction when possible
3. **Chunk processing** improves accuracy on full pages
4. **Lower DPI** (200) provides good balance of speed and quality

### Alternative Models to Consider:

- **Donut**: Better for complex layouts but slower
- **LayoutLMv3**: Can extract text AND understand structure
- **PaddleOCR**: Very fast but not from HuggingFace

### Tips for Large-Scale Processing:

1. Use the hybrid approach (try extraction first)
2. Process in batches to avoid memory issues
3. Save progress frequently
4. Consider parallel processing for pure extraction
