# PDF Processing Pipeline Demo

This notebook demonstrates how to detect the type of a PDF, classify elements from digital PDFs, and route those elements to placeholder processors.

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../src')

from extraction.classifiers.pdf import PDFTypeDetector, DigitalElementClassifier
from extraction.routers.pdf import ElementRouter
from pathlib import Path

# Input document for the demo. Exactly one `pdf_path` assignment should be
# active; the commented-out lines are alternative sample files that can be
# swapped in by uncommenting them instead.
# pdf_path = Path('../data/digital.pdf')
pdf_path = Path('../data/Ripples of Consciousenss.pdf')
# pdf_path = Path('../data/0d45add2d94d80a0eb85e41e22aa43a0.pdf')

# Ask the detector for the PDF type. The path is passed as a plain string.
# The resulting label is compared against 'digital' in the next cell to decide
# whether element classification runs.
detector = PDFTypeDetector()
pdf_type = detector.detect(str(pdf_path))
# Bare last expression so the notebook displays the detected type.
pdf_type

'digital'

In [2]:
# Element classification is only run when the detector reported 'digital'.
if pdf_type == 'digital':
    classifier = DigitalElementClassifier()
    results = classifier.classify(str(pdf_path))
else:
    # Fall back to an empty mapping with the same three keys the classifier
    # returned for the sample PDF ('text', 'tables', 'images'). A bare list
    # here would break the following cells, which call results.keys(),
    # results.items() and index results['text'] / results['images'].
    results = {'text': [], 'tables': [], 'images': []}

In [4]:
# Summarise the classifier output: which element kinds are present and how
# many elements were found for each kind.
print(results.keys())

for key, value in results.items():
    print(f'key {key}, length {len(value)}')

text = results['text']

# Show one sample text block in full. The index is guarded because a document
# with fewer text blocks would otherwise raise IndexError and abort the cell
# before the listings below are printed.
sample_index = 3
if len(text) > sample_index:
    print(text[sample_index])

# Full listing of every text block, then every image element.
for text_block in text:
    print(text_block)

for image in results['images']:
    print(image)

dict_keys(['text', 'tables', 'images'])
key text, length 12
key tables, length 0
key images, length 3
{'id': 'text_p1_0', 'kind': 'text', 'page_range': (1, 1), 'bboxes_per_page': [{'page': 1, 'bbox': (45.5811, 30.696643999999992, 562.5753007999999, 41.71437380000009)}], 'metadata': {'font': '', 'font_size': 0.0, 'is_heading': False, 'word_count': 11, 'column_span': None}, 'text': 'Spotlights Trends in Cognitive Sciences November 2013, Vol. 17, No. 11'}
{'id': 'text_p0_0', 'kind': 'text', 'page_range': (0, 0), 'bboxes_per_page': [{'page': 0, 'bbox': (52.6677, 32.000643999999966, 106.38871800000001, 42.959644000000026)}], 'metadata': {'font': '', 'font_size': 0.0, 'is_heading': False, 'word_count': 1, 'column_span': None}, 'text': 'Spotlights'}
{'id': 'text_p0_1', 'kind': 'text', 'page_range': (0, 0), 'bboxes_per_page': [{'page': 0, 'bbox': (49.6063, 78.37723470000003, 569.6674065999999, 233.05773999999997)}], 'metadata': {'font': '', 'font_size': 0.0, 'is_heading': False, 'word_count': 

In [None]:
# Patch processor modules with simple stand-ins so the demo can run end-to-end
from extraction.processors.pdf import text_processor, table_processor, image_processor

def show_text(blocks):
    """Stand-in text processor: print how many text blocks were received."""
    block_count = len(blocks)
    print(f'Text blocks: {block_count}')

def show_table(table):
    """Stand-in table processor: print the received table after a fixed label."""
    label = 'Table bbox:'
    print(label, table)

def show_image(image):
    """Stand-in image processor: print the received image element after a fixed label."""
    label = 'Image:'
    print(label, image)

# Replace the real processor entry points with the print-only stand-ins
# defined earlier in this cell.
# NOTE(review): this only takes effect if ElementRouter looks these functions
# up on the processor modules at call time - confirm against the router.
text_processor.process_text = show_text
table_processor.process_table = show_table
image_processor.process_image = show_image

# Route the elements collected in the classification step. `results` is the
# mapping produced by that cell; the name `pages` is not defined anywhere in
# this notebook, so passing it raised NameError on a fresh kernel.
# NOTE(review): assumes route_elements accepts the classifier's result
# mapping - confirm against ElementRouter's signature.
router = ElementRouter()
router.route_elements(results)