# PDF Processing Pipeline Demo

This notebook demonstrates how to detect the type of a PDF, classify elements from digital PDFs, and route those elements to placeholder processors.

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
from pathlib import Path

sys.path.append('../src')
sys.path.append(os.path.abspath('..'))  # This gets the absolute path to root

from extraction.classifiers.pdf import PDFTypeDetector, DigitalElementClassifier
from extraction.routers.pdf import ElementRouter
from extraction.utils.document_manager import DocumentManager

# Candidate input documents. Keep exactly one assignment active; the others stay
# commented out so the selected file is unambiguous (previously two were active
# and the first was silently overwritten by the second).
# pdf_path = Path('../data/digital.pdf')
# pdf_path = Path('../data/unprocessed_documents/Ripples of Consciousenss.pdf')
pdf_path = Path('../data/unprocessed_documents/2016 Atasoy Connectome Harmonics.pdf')
# pdf_path = Path('../data/0d45add2d94d80a0eb85e41e22aa43a0.pdf')

# Detect the PDF type. The result is compared against 'digital' further down to
# decide whether the document gets classified.
detector = PDFTypeDetector()
pdf_type = detector.detect(str(pdf_path))
pdf_type

'digital'

In [2]:
# Process the selected PDF when it was detected as digital: register it with the
# DocumentManager, classify its elements, and save them under the document's ID.
if pdf_type == 'digital':
    # Build a document ID from the PDF's base name plus the current Unix time
    # (whole seconds) so that repeated runs get distinct IDs.
    pdf_name = pdf_path.stem
    document_id = f"{pdf_name}_{int(time.time())}"

    print(f"📋 Processing document: {document_id}")
    print(f"📄 PDF: {pdf_path.name}")

    # Create document structure and copy PDF
    doc_manager = DocumentManager()
    doc_path = doc_manager.create_document(document_id, str(pdf_path))

    # Classify with document_id so images are saved to the right place
    classifier = DigitalElementClassifier()
    results = classifier.classify(str(pdf_path), document_id=document_id)

    # Save extracted elements to organized folders
    doc_manager.save_elements(document_id, results)

    print(f"✅ Document processed and saved to: {doc_path}")
else:
    # Non-digital PDFs are not processed here. Use an empty dict (not a list):
    # the following cell calls results.keys() / results.items(), which a list
    # does not support. Lookups of specific keys such as results['text'] will
    # still fail for non-digital PDFs.
    results = {}

📋 Processing document: 2016 Atasoy Connectome Harmonics_1756174013
📄 PDF: 2016 Atasoy Connectome Harmonics.pdf
📄 Copied 2016 Atasoy Connectome Harmonics.pdf to /Users/daylight/Desktop/pdf_data_extraction/pdf-data-extraction/data/documents/2016 Atasoy Connectome Harmonics_1756174013/raw/2016 Atasoy Connectome Harmonics.pdf
✅ Document processed and saved to: /Users/daylight/Desktop/pdf_data_extraction/pdf-data-extraction/data/documents/2016 Atasoy Connectome Harmonics_1756174013


In [None]:
# Overview of the classifier output: which keys it contains and how many
# entries are stored under each one.
print(results.keys())

for category, elements in results.items():
    print(f'key {category}, length {len(elements)}')



# for text_block in results['text']:
#     # print(text_block)

#     try:
#         print(text_block['text'])
#     except:
#         # print("~~~~~~~~~~~ ERROR ~~~~~~~~ Could not print text from text bloc")
#         continue


# for image in results['images']:
#     print(image)


# Text Concatenation Function for PDF Data Extraction


def concatenate_text_blocks(text_blocks):
    """
    Concatenate all text entries from text blocks into one big string.

    Each block that contains a non-empty string under the 'text' key
    contributes one newline-separated segment. Blocks without a 'text' key or
    with an empty value are skipped silently. Malformed blocks (for example
    None, or a 'text' value that is not a string) are reported with a printed
    message and skipped, so one bad block never aborts the whole run.

    Args:
        text_blocks: Iterable of text block dictionaries, or None.

    Returns:
        str: Concatenated text from all blocks with leading and trailing
        whitespace stripped; an empty string when there is nothing to join.
    """
    if text_blocks is None:
        return ""

    pieces = []
    for text_block in text_blocks:
        try:
            if 'text' not in text_block:
                continue
            text = text_block['text']
        except Exception as e:
            # Best-effort: a block that does not support membership tests or
            # key lookup is reported and skipped rather than raising.
            print(f"Error processing text block: {e}")
            continue

        if not text:
            continue
        if not isinstance(text, str):
            # Explicit type check instead of relying on a failed string
            # concatenation to reject non-text values.
            print(
                "Error processing text block: "
                f"expected str for 'text', got {type(text).__name__}"
            )
            continue
        pieces.append(text)

    # Collect and join once instead of growing a string with += in the loop.
    return "\n".join(pieces).strip()


# Join every extracted text block into one string, print it under a banner,
# and report basic size statistics.
separator = "=" * 50
print(separator)
print("CONCATENATED TEXT:")
print(separator)

text_blocks = results['text']
full_text = concatenate_text_blocks(text_blocks)
print(full_text)

print(f"\nTotal text length: {len(full_text)} characters")
print(f"Number of text blocks processed: {len(text_blocks)}")

# Optional: persist the concatenated text to disk
# with open('extracted_text.txt', 'w', encoding='utf-8') as f:
#     f.write(full_text)
