# PDF Processing Pipeline Demo

This notebook demonstrates how to detect the type of a PDF, classify elements from digital PDFs, and route those elements to placeholder processors.

In [7]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
from pydoc import Doc
sys.path.append('../src')
sys.path.append(os.path.abspath('..'))  # This gets the absolute path to root

from extraction.classifiers.pdf import PDFTypeDetector, PDFDigitalElementClassifier, PDFRouter, route_pdf_format
from extraction.routers.pdf import ElementRouter
from pathlib import Path

from extraction.utils.document_manager import DocumentManager


# Input selection: keep exactly one `doc_name` assignment active.
# doc_name = "Ripples of Consciousenss.pdf"
# doc_name = "0d45add2d94d80a0eb85e41e22aa43a0.pdf"
doc_name = "2016 Atasoy Connectome Harmonics.pdf"

# Folder holding the raw input PDFs (a relative path, so it is resolved
# against the current working directory).
uprocessed_doc_path = "../data/unprocessed_documents/"

# Full path of the selected PDF.
pdf_path = Path(uprocessed_doc_path, doc_name)


# Detect if PDF is digital or scanned
detector = PDFTypeDetector()
digital_or_scanned = detector.detect(str(pdf_path))
print("digital_or_scanned:", digital_or_scanned)

# Format analysis and routing are only performed for digital PDFs. Both names
# are reset on every run so that a non-digital PDF neither raises a NameError
# on a fresh kernel nor silently reuses the previous document's values.
pdf_format_stats = None
route_decision = None

# Detect type of PDF formatting
if digital_or_scanned == 'digital':
    pdf_format_detector = PDFRouter()
    pdf_format_stats = pdf_format_detector.analyze(str(pdf_path))
    print("pdf_format", pdf_format_stats)

    # Turn the format statistics into a routing decision.
    route_decision = route_pdf_format(pdf_format_stats)
    print(route_decision.policy)
else:
    print("PDF was not detected as digital; no routing decision was made")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
digital_or_scanned: digital
pdf_format {'sandwich_ratio': 0.0, 'vector_to_raster': 638.2941176470588, 'narrow_block_ratio': 0.8151260504201681, 'figure_token_pages': 0.7}
journal


In [8]:
# if pdf_type == 'digital':
#     classifier = DigitalElementClassifier()
#     results = classifier.classify(str(pdf_path))
# else:
#     results = []


if digital_or_scanned == 'digital':

    # Create a document ID from the PDF filename plus the current Unix
    # timestamp, so repeated runs on the same PDF get distinct IDs.
    pdf_name = pdf_path.stem
    document_id = f"{pdf_name}_{int(time.time())}"

    print(f"📋 Processing document: {document_id}")
    print(f"📄 PDF: {pdf_path.name}")

    # Create document structure and copy PDF
    doc_manager = DocumentManager()
    doc_path = doc_manager.create_document(document_id, str(pdf_path))

    # Exactly one branch runs, and every branch assigns `results`, so later
    # cells never see an undefined or stale value.
    if route_decision.policy == 'journal':
        # Classify with document_id so images are saved to the right place
        classifier = PDFDigitalElementClassifier()
        results = classifier.classify(str(pdf_path), document_id=document_id)

        # Save extracted elements to organized folders
        doc_manager.save_elements(document_id, results)

        print(f"✅ Document processed and saved to: {doc_path}")
    elif route_decision.policy == 'sandwich':
        # Report and skip instead of calling exit(): inside a notebook,
        # exit() shuts the kernel down and discards all state.
        print("sandwich detected. PDF extraction not implemented")
        results = []
    else:
        # Any other policy has no extractor here; skip saving rather than
        # failing on an unassigned `results`.
        print(f"No extractor for routing policy {route_decision.policy!r}; nothing was extracted")
        results = []
else:
    print("Document has been detected as scanned")
    results = []

📋 Processing document: 2016 Atasoy Connectome Harmonics_1756240455
📄 PDF: 2016 Atasoy Connectome Harmonics.pdf
📄 Copied 2016 Atasoy Connectome Harmonics.pdf to /Users/daylight/Desktop/pdf_data_extraction/pdf-data-extraction/data/documents/2016 Atasoy Connectome Harmonics_1756240455/raw/2016 Atasoy Connectome Harmonics.pdf
NO CAMELOT 
✅ Document processed and saved to: /Users/daylight/Desktop/pdf_data_extraction/pdf-data-extraction/data/documents/2016 Atasoy Connectome Harmonics_1756240455


In [6]:
# print(results.keys())

# for key, value in results.items():
#     print(f'key {key}, length {len(value)}')

# for text_block in results['text']:
#     # print(text_block)

#     try:
#         print(text_block['text'])
#     except:
#         # print("~~~~~~~~~~~ ERROR ~~~~~~~~ Could not print text from text bloc")
#         continue


# for image in results['images']:
#     print(image)


# Text Concatenation Function for PDF Data Extraction


import json


# print(doc_name)

# The processed-document folder is pinned to one specific earlier run.
# NOTE: the commented-out alternative below would not be a safe way to drop
# the extension, because str.strip('.pdf') removes any of the characters
# '.', 'p', 'd', 'f' from both ends of the name, not the '.pdf' suffix.
# processed_doc_path = Path('../data/documents') / doc_name.strip('.pdf')
processed_doc_path = Path('../data/documents', "0d45add2d94d80a0eb85e41e22aa43a0_1756235959")
print(processed_doc_path)

# JSON file with the extracted text blocks for that document.
json_text_path = processed_doc_path / 'processed/elements/text_blocks.json'
print(json_text_path)

# Load the text blocks into memory.
with json_text_path.open('r', encoding='utf-8') as f:
    text_blocks = json.load(f)


def concatenate_text_blocks(text_blocks):
    """
    Join the text of every block into a single newline-separated string.

    Blocks without a 'text' key, or whose text is empty/falsy, are skipped.
    A block that raises while being inspected or appended is reported on
    stdout and skipped, so one malformed entry does not abort the run.

    Args:
        text_blocks: Iterable of text block dictionaries.

    Returns:
        str: The block texts, one per line, with leading and trailing
        whitespace stripped from the combined result.
    """
    pieces = []

    for block in text_blocks:
        try:
            # Guard clause: nothing to add for missing or empty text.
            if 'text' not in block or not block['text']:
                continue
            pieces.append(block['text'] + "\n")
        except Exception as e:
            print(f"Error processing text block: {e}")

    return "".join(pieces).strip()


# Build the combined text and report its size.
# The banner is printed before concatenating so that any per-block error
# messages printed during concatenation appear beneath it.
print("=" * 50, "CONCATENATED TEXT:", "=" * 50, sep="\n")
full_text = concatenate_text_blocks(text_blocks)
print(full_text)
print(f"\nTotal text length: {len(full_text)} characters")
print(f"Number of text blocks processed: {len(text_blocks)}")

# Optional: Save to file
# with open('extracted_text.txt', 'w', encoding='utf-8') as f:
#     f.write(full_text)


../data/documents/0d45add2d94d80a0eb85e41e22aa43a0_1756235959
../data/documents/0d45add2d94d80a0eb85e41e22aa43a0_1756235959/processed/elements/text_blocks.json
CONCATENATED TEXT:
02736320 Registered number: 5013557 number: Charity
PRE-ECLAMPSIA LIMITED ACTION ON guarantee) company limited by (A
UNAUDITED
TRUSTEES' STATEMENTS REPORT FINANCIAL AND
2016 31 DECEMBER FOR YEAR ENDED THE
ACTION PRE-ECLAMPSIA ON LIMITED company limited guarantee) (A by
CONTENTS
Page
Reference and administrative of details the trustees its charity, advisers and
Trustees' 2-9 report
examiner's Independent 10-11 report
of Statement financial activities 12
sheet Balance 13
to Notes the statements financial 14-25
PRE-ECLAMPSIA LIMITED ACTION ON guarantee) limited company by (A
ADVISERS TRUSTEES ITS OF REFERENCE THE DETAILS CHARITY, ADMINISTRATIVE AND AND 2016 31 DECEMBER FOR ENDED THE YEAR
Trustees Janet Trustee Ms Bray, Trustee Professor Nelson-Piercy, Catherine Professor Sherman, Chairman Andrew Trustee Turner, D