# PDF Processing Pipeline Demo

This notebook demonstrates how to detect the type of a PDF, classify elements from digital PDFs, and route those elements to placeholder processors.

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../src')

from extraction.classifiers.pdf import PDFTypeDetector, DigitalElementClassifier
from extraction.routers.pdf import ElementRouter
from pathlib import Path

# Sample PDF used throughout this demo.
# Alternative input: Path('../data/digital.pdf')
# NOTE(review): the file name is spelled 'Consciousenss' — presumably this
# matches the file on disk; verify before "correcting" it.
pdf_path = Path('../data/Ripples of Consciousenss.pdf')

# Detect the PDF type; the classification cell branches on this value.
detector = PDFTypeDetector()
pdf_type = detector.detect(str(pdf_path))
# Bare last expression so the notebook displays the detected type.
pdf_type

'digital'

In [None]:
# Classify page elements only when the PDF was detected as digital.
# The inspection cell treats `results` as a mapping (it calls .keys()/.items()
# and indexes 'text'), so the fallback is an empty mapping with the same keys
# the classifier output shows, rather than a bare list that would raise
# AttributeError on `.keys()`.
if pdf_type == 'digital':
    classifier = DigitalElementClassifier()
    results = classifier.classify(str(pdf_path))
else:
    results = {'text': [], 'tables': [], 'images': []}



TEST


{'text': [{'text': 'Spotlights\nSpotlights\nSpotlights',
   'bbox': (45.5811,
    30.696643999999992,
    106.38871800000001,
    42.959644000000026),
   'font': 'CFEHAB+AdvP6EC5',
   'size': 10.95900000000006,
   'x0': 45.5811,
   'y0': 30.696643999999992,
   'x1': 106.38871800000001,
   'y1': 42.959644000000026,
   'char_count': 30},
  {'text': 'Trendsin Cognitive Sciences',
   'bbox': (337.833, 33.74417380000011, 441.74910800000004, 41.71437380000009),
   'font': 'CFFEPL+AdvP6ECA',
   'size': 7.970199999999977,
   'x0': 337.833,
   'y0': 33.74417380000011,
   'x1': 441.74910800000004,
   'y1': 41.71437380000009,
   'char_count': 25},
  {'text': 'November 2013, Vol.17, No.11',
   'bbox': (448.4409, 33.74417380000011, 562.5753007999999, 41.71437380000009),
   'font': 'CFEHBC+AdvP6EC0',
   'size': 7.970199999999977,
   'x0': 448.4409,
   'y0': 33.74417380000011,
   'x1': 562.5753007999999,
   'y1': 41.71437380000009,
   'char_count': 25},
  {'text': 'Trendsin Cognitive Sciences',
   'b

In [3]:
# Overview of the classifier output: element categories and their counts.
print(results.keys())

for key, value in results.items():
    print(f'key {key}, length {len(value)}')

text = results['text']

# Inspect one sample block; guard the index so a document with fewer blocks
# does not raise IndexError.
SAMPLE_INDEX = 3
if len(text) > SAMPLE_INDEX:
    print(text[SAMPLE_INDEX])

# Preview only the first few blocks instead of dumping every one of them,
# which floods the cell output.
PREVIEW_COUNT = 5
for text_block in text[:PREVIEW_COUNT]:
    print(len(text_block))  # number of fields in the block dict
    print(text_block)



dict_keys(['text', 'tables', 'images'])
key text, length 147
key tables, length 0
key images, length 3
{'text': 'Trendsin Cognitive Sciences', 'bbox': (344.9196, 34.36777380000001, 448.835708, 42.337973799999986), 'font': 'CFFEPL+AdvP6ECA', 'size': 7.970199999999977, 'x0': 344.9196, 'y0': 34.36777380000001, 'x1': 448.835708, 'y1': 42.337973799999986, 'char_count': 25}
9
{'text': 'Spotlights\nSpotlights\nSpotlights', 'bbox': (45.5811, 30.696643999999992, 106.38871800000001, 42.959644000000026), 'font': 'CFEHAB+AdvP6EC5', 'size': 10.95900000000006, 'x0': 45.5811, 'y0': 30.696643999999992, 'x1': 106.38871800000001, 'y1': 42.959644000000026, 'char_count': 30}
9
{'text': 'Trendsin Cognitive Sciences', 'bbox': (337.833, 33.74417380000011, 441.74910800000004, 41.71437380000009), 'font': 'CFFEPL+AdvP6ECA', 'size': 7.970199999999977, 'x0': 337.833, 'y0': 33.74417380000011, 'x1': 441.74910800000004, 'y1': 41.71437380000009, 'char_count': 25}
9
{'text': 'November 2013, Vol.17, No.11', 'bbox': (44

In [None]:
# Patch processor modules with simple stand-ins so the demo can run end-to-end
from extraction.processors.pdf import text_processor, table_processor, image_processor

def show_text(blocks):
    """Stand-in text processor: report how many text blocks were received."""
    message = f'Text blocks: {len(blocks)}'
    print(message)

def show_table(table):
    """Stand-in table processor: echo the received table object with a label."""
    label = 'Table bbox:'
    print(label, table)

def show_image(image):
    """Stand-in image processor: echo the received image object with a label."""
    label = 'Image:'
    print(label, image)

# Swap the real processors for the stand-ins by rebinding the module attributes.
text_processor.process_text = show_text
table_processor.process_table = show_table
image_processor.process_image = show_image

# Route the elements collected by the classification cell. The notebook never
# defines `pages`, so a fresh-kernel run raised NameError here; `results` is the
# only collection of classified elements available.
# NOTE(review): confirm ElementRouter.route_elements accepts the classifier's
# result mapping as-is.
router = ElementRouter()
router.route_elements(results)