# PDF Processing Pipeline Demo

This notebook demonstrates how to detect the type of a PDF, classify elements from digital PDFs, and route those elements to placeholder processors.

In [10]:
%load_ext autoreload
%autoreload 2

from pydoc import Doc
import sys
import os
sys.path.append('../src')
sys.path.append(os.path.abspath('..'))  # This gets the absolute path to root

from extraction.classifiers.pdf import PDFScannedOrDigitalDetector, PDFDigitalJournalElementClassifier, PDFRouter, route_pdf_format
from extraction.routers.pdf import ElementRouter
from pathlib import Path

from extraction.utils.document_manager import DocumentManager


# doc_name = "Ripples of Consciousenss.pdf"
# doc_name = "0d45add2d94d80a0eb85e41e22aa43a0.pdf"
# Input document: file name plus the folder holding not-yet-processed PDFs.
doc_name = "2016 Atasoy Connectome Harmonics.pdf"

uprocessed_doc_path = "../data/unprocessed_documents/"

pdf_path = Path(uprocessed_doc_path) / doc_name


# Detect if PDF is digital or scanned
detector = PDFScannedOrDigitalDetector()
digital_or_scanned = detector.detect(str(pdf_path))
print("digital_or_scanned:", digital_or_scanned)

# Detect type of PDF formatting.
# Format analysis and routing only run for digital PDFs. Both results are set to
# None otherwise so the names always exist after this cell (previously the routing
# call sat outside the `if` and raised NameError for a scanned PDF).
if digital_or_scanned == 'digital':
    pdf_format_detector = PDFRouter()
    pdf_format_stats = pdf_format_detector.analyze(str(pdf_path))
    print("pdf_format", pdf_format_stats)

    route_decision = route_pdf_format(pdf_format_stats)
    print(route_decision.policy)
else:
    pdf_format_stats = None
    route_decision = None

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
digital_or_scanned: digital
pdf_format {'sandwich_ratio': 0.0, 'vector_to_raster': 638.2941176470588, 'narrow_block_ratio': 0.8151260504201681, 'figure_token_pages': 0.7}
journal


In [11]:
# if pdf_type == 'digital':
#     classifier = DigitalElementClassifier()
#     results = classifier.classify(str(pdf_path))
# else:
#     results = []

from extraction.classifiers.pdf import PDFSandwichElementClassifier


import time  # local import: only used to timestamp the document ID below

if digital_or_scanned == 'digital':

    # Fail fast on a routing policy this cell cannot handle, before anything is
    # copied to disk. Previously an unknown policy left `results` undefined and
    # the save step raised NameError after the document folder was created.
    policy = route_decision.policy
    if policy not in ('journal', 'sandwich'):
        raise ValueError(f"Unsupported PDF routing policy: {policy!r}")

    # Create a document ID from the PDF filename plus the current Unix time
    # (whole seconds), so each run gets its own ID.
    pdf_name = pdf_path.stem
    document_id = f"{pdf_name}_{int(time.time())}"

    print(f"📋 Processing document: {document_id}")
    print(f"📄 PDF: {pdf_path.name}")

    # Create document structure and copy PDF
    doc_manager = DocumentManager()
    doc_path = doc_manager.create_document(document_id, str(pdf_path))

    if policy == 'journal':
        print("Extracting elements from digital journal pdf")

        # Now classify with document_id so images are saved to the right place
        classifier = PDFDigitalJournalElementClassifier()
        results = classifier.classify(str(pdf_path), document_id=document_id)
    else:  # policy == 'sandwich' (validated above)
        print("Extracting elements from digital sandwich pdf")

        # NOTE(review): unlike the journal path, document_id is not passed here —
        # confirm whether the sandwich classifier should receive it too.
        classifier = PDFSandwichElementClassifier()
        results = classifier.classify(str(pdf_path))

    # Save extracted elements to organized folders
    doc_manager.save_elements(document_id, results)

    print(f"✅ Document processed and saved to: {doc_path}")
else:
    print("Document has been detected as scanned")
    results = []

📋 Processing document: 2016 Atasoy Connectome Harmonics_1756242506
📄 PDF: 2016 Atasoy Connectome Harmonics.pdf
📄 Copied 2016 Atasoy Connectome Harmonics.pdf to /Users/daylight/Desktop/pdf_data_extraction/pdf-data-extraction/data/documents/2016 Atasoy Connectome Harmonics_1756242506/raw/2016 Atasoy Connectome Harmonics.pdf
Extracting elements from digital journal pdf
_HAVE_CAMELOT 
✅ Document processed and saved to: /Users/daylight/Desktop/pdf_data_extraction/pdf-data-extraction/data/documents/2016 Atasoy Connectome Harmonics_1756242506


In [12]:
# print(results.keys())

# for key, value in results.items():
#     print(f'key {key}, length {len(value)}')

# for text_block in results['text']:
#     # print(text_block)

#     try:
#         print(text_block['text'])
#     except:
#         # print("~~~~~~~~~~~ ERROR ~~~~~~~~ Could not print text from text bloc")
#         continue


# for image in results['images']:
#     print(image)


# Text Concatenation Function for PDF Data Extraction

import json


# processed_doc_path = Path('../data/documents') / doc_name.strip('.pdf')
# Locate the folder written for this run by reusing the document ID generated
# when the PDF was processed. The ID carries a fresh timestamp on every run, so
# pinning the ID of one past run would read stale data (or fail) after a re-run.
# Requires the processing cell to have run on a digital PDF so `document_id` exists.
processed_doc_path = Path('../data/documents') / document_id
print(processed_doc_path)
json_text_path = processed_doc_path / 'processed' / 'elements' / 'text_blocks.json'

print(json_text_path)

# Load the extracted text blocks saved for this document.
with open(json_text_path, 'r', encoding='utf-8') as f:
    text_blocks = json.load(f)


def concatenate_text_blocks(text_blocks):
    """
    Join the 'text' value of every block into a single string.

    Blocks that have no 'text' key, or whose 'text' value is empty/falsy, are
    skipped. A block that raises while being read (for example a non-dict
    entry or a non-string 'text' value) is reported with a printed message
    and skipped as well.

    Args:
        text_blocks: Iterable of text block dictionaries

    Returns:
        str: The block texts, each followed by one space, with leading and
        trailing whitespace stripped from the combined result. Newlines
        inside a block's text are kept as-is.
    """
    pieces = []

    for block in text_blocks:
        try:
            has_text = 'text' in block and block['text']
            if has_text:
                # Each text gets a trailing space as the separator between blocks.
                pieces.append(block['text'] + " ")
        except Exception as e:
            print(f"Error processing text block: {e}")
            continue

    return "".join(pieces).strip()


# Usage example:
# Add this to your notebook after the existing text processing loop

# Header: a 50-character rule above and below the title.
banner = "="*50
print(banner, "CONCATENATED TEXT:", banner, sep="\n")

# Build the full text, then show it on one line by replacing newlines with spaces.
full_text = concatenate_text_blocks(text_blocks)
single_line_text = full_text.replace('\n', ' ')
print(single_line_text)

# Summary counts (the length is of the text before newline replacement).
print(f"\nTotal text length: {len(full_text)} characters")
print(f"Number of text blocks processed: {len(text_blocks)}")

# Optional: Save to file
# with open('extracted_text.txt', 'w', encoding='utf-8') as f:
#     f.write(full_text)


../data/documents/2016 Atasoy Connectome Harmonics_1756242506
../data/documents/2016 Atasoy Connectome Harmonics_1756242506/processed/elements/text_blocks.json
CONCATENATED TEXT:
ARTICLE OPEN Received 29 Oct 2015 | Accepted 2 Dec 2015 | Published 21 Jan 2016 DOI: 10.1038/ncomms10340 Human brain networks function in connectome-specific harmonic waves 1 2 1 Selen Atasoy , Isaac Donnelly & Joel Pearson A key characteristic of human brain activity is coherent, spatially distributed oscillations forming behaviour-dependent brain networks. However, a fundamental principle underlying these networks remains unknown. Here we report that functional networks of the human brain are predicted by harmonic patterns, ubiquitous throughout nature, steered by the anatomy of the human cerebral cortex, the human connectome. We introduce a new technique extending the Fourier basis to the human connectome. In this new frequency-specific representation of cortical activity, that we call ‘connectome harmonics