# PDF Processing Pipeline Demo

This notebook demonstrates how to detect the type of a PDF, classify elements from digital PDFs, and route those elements to placeholder processors.

In [1]:
import sys
# '../src' is a relative entry, so it is resolved against the kernel's current
# working directory. Presumably this is what makes the `extraction` package
# importable when the notebook is run from its own folder — TODO confirm.
sys.path.append('../src')

from extraction.classifiers.pdf import PDFTypeDetector, DigitalElementClassifier
from extraction.routers.pdf import ElementRouter
from pathlib import Path

# Sample input for the demo; like the src entry above, this path is relative
# to the working directory.
pdf_path = Path('../data/digital.pdf')

# detect() is handed the path as a plain string. The recorded run returned
# 'digital'; the next cell branches on that value.
detector = PDFTypeDetector()
pdf_type = detector.detect(str(pdf_path))
pdf_type

'digital'

In [2]:
# Only PDFs reported as 'digital' are classified in this demo; any other
# detected type falls back to an empty page list so later cells still run.
if pdf_type != 'digital':
    pages = []
else:
    classifier = DigitalElementClassifier()
    pages = classifier.classify(str(pdf_path))

# Display the classified pages as the cell output.
pages

[{'text': [{'text': 'Hello',
    'x0': 72.0,
    'x1': 126.672,
    'top': 60.96799999999999,
    'doctop': 60.96799999999999,
    'bottom': 84.968,
    'upright': True,
    'height': 24.000000000000014,
    'width': 54.672,
    'direction': 'ltr'},
   {'text': 'World',
    'x0': 133.344,
    'x1': 196.00800000000004,
    'top': 60.96799999999999,
    'doctop': 60.96799999999999,
    'bottom': 84.968,
    'upright': True,
    'height': 24.000000000000014,
    'width': 62.664000000000044,
    'direction': 'ltr'}],
  'tables': [],
  'images': []}]

In [3]:
# Patch processor modules with simple stand-ins so the demo can run end-to-end
from extraction.processors.pdf import text_processor, table_processor, image_processor

def show_text(blocks):
    """Demo stand-in for text processing: report how many blocks arrived.

    Prints ``Text blocks: <n>`` where ``n`` is ``len(blocks)``; returns None.
    """
    block_count = len(blocks)
    print(f'Text blocks: {block_count}')

def show_table(table):
    """Demo stand-in for table processing: echo the value it was handed.

    Prints the argument unchanged after a ``Table bbox:`` label; returns None.
    """
    label = 'Table bbox:'
    print(label, table)

def show_image(image):
    """Demo stand-in for image processing: echo the value it was handed.

    Prints the argument unchanged after an ``Image:`` label; returns None.
    """
    label = 'Image:'
    print(label, image)

# Rebind each processor module's entry point to its demo stand-in. These
# assignments must stay ahead of the router calls below.
# NOTE(review): the rebinding mutates the imported modules in place, so it
# lasts until the kernel restarts or the modules are reloaded.
text_processor.process_text = show_text
table_processor.process_table = show_table
image_processor.process_image = show_image

# Route the elements collected in the previous step (`pages`).
# The recorded run printed only "Text blocks: 2" — presumably because the
# page's 'tables' and 'images' lists were empty; verify against ElementRouter.
router = ElementRouter()
router.route_elements(pages)

Text blocks: 2
