# 02 – Full OCR-NLP Pipeline Demo

This notebook demonstrates the complete pipeline: PDF → OCR → NLP → Entity Recognition → JSON Output.

In [None]:
import sys
import json
from pathlib import Path
from pprint import pprint

# Add the parent directory to the path so we can import our modules
sys.path.append('..')

from src.pipeline import PipelineBuilder

# Notebook-relative locations: RAW_DIR holds the input documents,
# OUTPUT_DIR receives whatever the pipeline writes.
RAW_DIR, OUTPUT_DIR = Path("../data/raw"), Path("../outputs")

# Make sure the output folder is there; an already-existing folder is not an error.
OUTPUT_DIR.mkdir(exist_ok=True)

## 1. Build the Pipeline

First, we'll create a pipeline with our desired configuration.

In [None]:
# Regular expressions for domain-specific entities, keyed by the label handed to
# the 'custom' extractor. Each label maps to a list of alternative patterns.
CUSTOM_ENTITY_PATTERNS = {
    # The top-level domain is letters only. The earlier class `[A-Z|a-z]` also
    # accepted a literal '|', because '|' is not alternation inside [...].
    'EMAIL': [r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'],
    # NOTE(review): this class admits whitespace, so a run of 10+ spaces or
    # newlines also matches; consider requiring digits if that shows up as noise.
    'PHONE': [r'\+?[\d\-\(\)\s]{10,20}'],
    'URL': [r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'],
    # Two alternatives: an "Invoice"/"INV" prefix, or a bare '#'. Both capture the digits.
    'INVOICE_NUMBER': [r'(?:Invoice|INV)\s*#?\s*(\d+)', r'#\s*(\d+)'],
    'AMOUNT': [r'\$\s*[\d,]+\.?\d*']
}

# Create a pipeline with custom configuration.
# NOTE(review): with_entity_extractor is called twice; presumably the extractors
# accumulate rather than replace each other — confirm in PipelineBuilder.
pipeline = (
    PipelineBuilder()
    .with_ocr_engine('tesseract', lang='eng')
    .with_ocr_dpi(300)
    .with_language('en')
    .with_entity_extractor('spacy', model_name='en_core_web_sm')
    # Add custom patterns for domain-specific entities
    .with_entity_extractor('custom', patterns=CUSTOM_ENTITY_PATTERNS)
    .build()
)

print("✅ Pipeline initialized")

## 2. Process a Sample Document

Let's process a sample document through the pipeline.

In [None]:
# Pick one document to run through the pipeline: prefer a PDF from RAW_DIR,
# otherwise fall back to a PNG from ../data/samples.
# Candidates are sorted because glob() makes no ordering guarantee; sorting keeps
# the chosen file the same across runs and machines.
sample_path = None  # stays None when no candidate exists; later cells check this

sample_pdfs = sorted(RAW_DIR.glob("*.pdf"))

if sample_pdfs:
    sample_path = sample_pdfs[0]
    print(f"Using sample PDF: {sample_path}")
else:
    # If no PDFs found, use the sample image from the first notebook
    sample_files = sorted(Path("../data/samples").glob("*.png"))
    if sample_files:
        sample_path = sample_files[0]
        print(f"Using sample image: {sample_path}")
    else:
        print("No sample files found. Please add a PDF or image to the data/raw or data/samples directory.")

In [None]:
# Start from an empty result so that cells which inspect `result` still run
# (and simply find nothing) when no sample document was available.
result = {}

if sample_path:
    # Process the document; the returned mapping is indexed below for the
    # entity count and the total processing time.
    result = pipeline.process_document(
        sample_path,
        output_dir=OUTPUT_DIR,
        preprocess_ocr=True,
        clean_text=True
    )

    print(f"✅ Processed {sample_path.name}")
    print(f"Found {result['entity_extraction']['count']} entities")
    print(f"Total processing time: {result['total_processing_time']:.2f} seconds")

## 3. Examine the Extracted Entities

Let's look at the entities that were extracted from the document.

In [None]:
# Group the extracted entity texts by label and print them.
# `sample_path` is tested first so that `result` is never touched when no
# document was processed (in that case `result` may not be defined at all).
if sample_path and 'entities' in result and result['entities']:
    # Group entities by type: label -> list of entity texts, in extraction order
    entities_by_type = {}
    for entity in result['entities']:
        entities_by_type.setdefault(entity['label'], []).append(entity['text'])

    # Print entities by type
    print("Extracted Entities:")
    for entity_type, entity_texts in entities_by_type.items():
        print(f"\n{entity_type}:")
        for entity_text in entity_texts:
            print(f"  - {entity_text}")
else:
    print("No entities found in the document.")

## 4. Visualize the JSON Output

Let's build a simplified JSON summary of the result, save it to the outputs directory, and preview the first few extracted entities.

In [None]:
# Condense the full result into a small summary, write it to disk, then preview it
if sample_path:
    simplified_output = dict(
        document=result["document"]["name"],
        entities=result["entities"],
        word_count=result["text_analysis"]["word_count"],
        processing_time=result["total_processing_time"],
    )

    # Persist the summary next to the other pipeline outputs
    simplified_path = OUTPUT_DIR / f"{sample_path.stem}_simplified.json"
    with open(simplified_path, 'w', encoding='utf-8') as out_file:
        json.dump(simplified_output, out_file, indent=2, ensure_ascii=False)

    print(f"\nSaved simplified output to: {simplified_path}")

    # Preview: at most five entities, followed by a count of the ones not shown
    extracted = result["entities"]
    print("\nSample of extracted entities:")
    for item in extracted[:5]:
        print(f"- {item['text']} ({item['label']})")

    remainder = len(extracted) - 5
    if remainder > 0:
        print(f"... and {remainder} more entities")

## 5. Batch Processing

The pipeline can also process multiple documents in batch mode. The demo below runs only when `data/raw` contains at least two PDF, PNG, or JPG files.

In [None]:
# Find all documents in the raw directory: PDFs first, then PNGs, then JPGs.
# Each glob is sorted because glob() makes no ordering guarantee; this keeps the
# batch order (and therefore the summary below) reproducible.
all_docs = [
    doc_path
    for pattern in ("*.pdf", "*.png", "*.jpg")
    for doc_path in sorted(RAW_DIR.glob(pattern))
]

if len(all_docs) > 1:
    print(f"Found {len(all_docs)} documents for batch processing")

    # Process all documents
    batch_results = pipeline.process_batch(
        all_docs,
        output_dir=OUTPUT_DIR,
        preprocess_ocr=True,
        clean_text=True
    )

    print("\nBatch Processing Summary:")
    # The loop variable is `batch_result`, not `result`, so the single-document
    # `result` from the earlier cell is not overwritten if those cells are re-run.
    for position, batch_result in enumerate(batch_results, start=1):
        # .get() with fallbacks: an entry missing one of these keys still gets a summary line
        doc_name = batch_result.get('document', {}).get('name', f"Document {position}")
        entity_count = batch_result.get('entity_extraction', {}).get('count', 0)
        processing_time = batch_result.get('total_processing_time', 0)
        print(f"- {doc_name}: {entity_count} entities in {processing_time:.2f} seconds")
else:
    print("Not enough documents for batch processing demonstration.")

## 6. Summary

In this notebook, we've demonstrated:

1. Building a configurable OCR-NLP pipeline
2. Processing documents to extract text and entities
3. Generating structured JSON output
4. Batch processing capabilities

The pipeline can be extended with additional entity types, custom preprocessing steps, and domain-specific extractors for different document types.