# PDF Processing Pipeline Demo

This notebook demonstrates how to detect the type of a PDF, classify elements from digital PDFs, and route those elements to placeholder processors.

In [None]:
import sys
sys.path.append('../src')

from classifiers.pdf import PdfTypeDetector, DigitalElementClassifier
from routers.pdf import route_elements
from pathlib import Path

# Sample input for the demo. The path is relative, so it is resolved against
# the working directory the notebook is run from.
pdf_path = Path('../data/digital.pdf')

# Ask the detector what kind of PDF this is; detect() is given the path as a
# plain string rather than a Path object.
detector = PdfTypeDetector()
pdf_type = detector.detect(str(pdf_path))
# Bare expression as the last line of the cell so the notebook displays the
# detected type. The next cell compares this value against 'digital'.
pdf_type

In [None]:
# Only PDFs detected as 'digital' are classified in this demo; every other
# detected type falls back to an empty page list so later cells still run.
if pdf_type != 'digital':
    pages = []
else:
    classifier = DigitalElementClassifier()
    pages = classifier.classify(str(pdf_path))

# Last expression in the cell: show the classification result in the notebook
pages

In [None]:
# Patch processor modules with simple stand-ins so the demo can run end-to-end
from processors.pdf import text_processor, table_processor, image_processor

def show_text(blocks):
    """Print how many text blocks were received.

    Stand-in for the real text processor: it only reports the size of
    ``blocks`` (anything that supports ``len()``) and returns ``None``.
    """
    message = f'Text blocks: {len(blocks)}'
    print(message)

def show_table(table):
    """Print the received table value after a ``Table bbox:`` label.

    Stand-in for the real table processor; returns ``None``.
    NOTE(review): the label suggests ``table`` is a bounding box, but the
    value is printed as-is whatever it is -- confirm against the router.
    """
    label = 'Table bbox:'
    print(label, table)

def show_image(image):
    """Print the received image value after an ``Image:`` label.

    Stand-in for the real image processor; the value is printed as-is and
    the function returns ``None``.
    """
    label = 'Image:'
    print(label, image)

# Install the demo stand-ins on the processor modules. This must happen before
# the routing call below.
# NOTE(review): the patch only takes effect if the router looks these
# functions up on the modules at call time -- confirm against routers.pdf.
setattr(text_processor, 'process_text', show_text)
setattr(table_processor, 'process_table', show_table)
setattr(image_processor, 'process_image', show_image)

# Hand the pages collected in the previous cell to the router
route_elements(pages)