<a href="https://colab.research.google.com/github/sunnygupta3535/Notebook/blob/main/pdf_processor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PDF Processor (RAG Pipeline)

This notebook handles the PDF processing part of the RAG pipeline using Docling.
It scans a directory for PDF files, converts them to Markdown, and splits them by page.

In [1]:
# Install Docling into the *kernel's* environment (%pip, not !pip) and pin the
# release this notebook was last run with so re-runs stay reproducible.
%pip install -q docling==2.64.1

Collecting docling
  Downloading docling-2.64.1-py3-none-any.whl.metadata (11 kB)
Collecting docling-core<3.0.0,>=2.50.1 (from docling-core[chunking]<3.0.0,>=2.50.1->docling)
  Downloading docling_core-2.54.1-py3-none-any.whl.metadata (7.6 kB)
Collecting docling-parse<5.0.0,>=4.7.0 (from docling)
  Downloading docling_parse-4.7.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting docling-ibm-models<4,>=3.9.1 (from docling)
  Downloading docling_ibm_models-3.10.3-py3-none-any.whl.metadata (7.3 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting rapidocr<4.0.0,>=3.3 (from docling)
  Downloading

In [2]:
import os
import logging
from pathlib import Path
from typing import Optional

from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import PdfFormatOption

# Configure logging
# force=True replaces any handlers already attached to the root logger.
# Without it, basicConfig() silently does nothing when the host (e.g. Colab)
# has pre-installed a root handler, and the INFO progress messages emitted by
# this notebook never show up.
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s: %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

In [3]:
# Configuration: input and output folders, relative to the working directory
DATA_DIR, OUTPUT_DIR = Path("data"), Path("output")

# Echo the absolute locations so it is obvious where files are read and written
print(
    f"Data Directory: {DATA_DIR.resolve()}",
    f"Output Directory: {OUTPUT_DIR.resolve()}",
    sep="\n",
)

Data Directory: /content/data
Output Directory: /content/output


In [4]:
class PDFProcessor:
    """Process PDFs using Docling and export to markdown.

    For every PDF the processor writes, under ``<output_dir>/<pdf stem>/``:

    * ``<stem>_full.md``      - the whole document as Markdown
    * ``<stem>_page_NNN.md``  - one Markdown file per page (1-based, zero-padded)

    NOTE: the per-PDF folder is derived from the file stem only, so two PDFs
    with the same file name in different sub-directories share one folder.
    """

    def __init__(self, output_dir: Optional[str] = None):
        """
        Create the output directory and build the Docling converter.

        Args:
            output_dir: Directory for the generated Markdown (a str or a Path).
                Falls back to the module-level ``OUTPUT_DIR`` when omitted.
        """
        self.output_dir = Path(output_dir or OUTPUT_DIR)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Configure PDF pipeline options
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True  # Enable OCR for scanned PDFs

        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )

    def process_pdf(self, pdf_path: str) -> list[dict]:
        """
        Process a single PDF and generate page-wise markdown files.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            List of dicts, one per page, with the keys ``page_num`` (1-based
            position), ``file_path``, ``content`` and ``source_pdf``.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        # Convert PDF
        logger.info(f"Converting {pdf_path.name}...")
        result = self.converter.convert(pdf_path)

        # Get the document
        doc = result.document

        # Export full document to markdown
        full_md = doc.export_to_markdown()

        # Create output directory for this PDF
        pdf_output_dir = self.output_dir / pdf_path.stem
        pdf_output_dir.mkdir(parents=True, exist_ok=True)

        # Save full markdown
        full_md_path = pdf_output_dir / f"{pdf_path.stem}_full.md"
        full_md_path.write_text(full_md, encoding='utf-8')

        # Split by pages and save individual page files
        pages = self._split_by_pages(doc, full_md)

        page_files = []
        for i, page_content in enumerate(pages, 1):
            page_path = pdf_output_dir / f"{pdf_path.stem}_page_{i:03d}.md"
            page_path.write_text(page_content, encoding='utf-8')

            page_files.append({
                'page_num': i,
                'file_path': str(page_path),
                'content': page_content,
                'source_pdf': str(pdf_path),
            })

        return page_files

    @staticmethod
    def _page_object_to_text(page) -> str:
        """Render one page *object* (legacy, sequence-style ``pages``) as text."""
        if hasattr(page, 'export_to_markdown'):
            return page.export_to_markdown()
        if hasattr(page, 'text'):
            return page.text
        return str(page) if page else ""

    def _split_by_pages(self, doc, full_md: str) -> list[str]:
        """
        Split document content by pages.

        Args:
            doc: Converted document. Expected to offer ``pages`` and
                ``export_to_markdown(page_no=...)`` as Docling documents do.
            full_md: Markdown of the whole document, used as the fallback.

        Returns:
            One Markdown string per page, in page order. If per-page export is
            unavailable or fails, ``full_md`` split on ``'\\n---\\n'`` (or the
            whole text as a single page when that separator is absent).
        """
        pages: list[str] = []

        try:
            doc_pages = getattr(doc, 'pages', None)
            if doc_pages:
                if hasattr(doc_pages, 'keys'):
                    # Docling exposes ``pages`` as a mapping of page number ->
                    # page metadata. Iterating it yields the integer keys, so
                    # the content must be requested from the document per page
                    # (the old code wrote just the page number into each file).
                    pages = [
                        doc.export_to_markdown(page_no=page_no)
                        for page_no in sorted(doc_pages.keys())
                    ]
                else:
                    # Sequence of page objects: render each one directly.
                    pages = [self._page_object_to_text(page) for page in doc_pages]
        except Exception as e:
            logger.warning(f"Error extracting pages: {e}")
            # Discard partial results so a half-extracted document is not
            # passed off as complete; the fallback below takes over.
            pages = []

        if not pages:
            if '\n---\n' in full_md:
                pages = full_md.split('\n---\n')
            else:
                pages = [full_md]

        return pages

    def process_all_pdfs(self, data_dir: Optional[str] = None) -> list[dict]:
        """
        Process all PDFs in the data directory.

        Args:
            data_dir: Directory searched recursively for ``*.pdf`` files (a str
                or a Path). Falls back to the module-level ``DATA_DIR``.

        Returns:
            The concatenated page dicts of every PDF that converted
            successfully. A PDF that fails is logged and skipped.
        """
        data_dir = Path(data_dir or DATA_DIR)

        all_pages = []
        # Sorted so the processing order (and the result order) is stable
        # across runs; glob() alone returns files in arbitrary order.
        pdf_files = sorted(data_dir.glob("**/*.pdf"))

        logger.info(f"Found {len(pdf_files)} PDF files to process in {data_dir}")

        for pdf_path in pdf_files:
            logger.info(f"Processing: {pdf_path.name}")
            try:
                pages = self.process_pdf(pdf_path)
                all_pages.extend(pages)
                logger.info(f"  → Generated {len(pages)} page(s)")
            except Exception as e:
                # Best effort: one broken PDF must not abort the whole batch.
                logger.error(f"  ✗ Error processing {pdf_path.name}: {e}")

        return all_pages

In [6]:
# Hugging Face token (Colab secret).
# Previously the secret was fetched and the value thrown away. Export it as
# HF_TOKEN instead, the environment variable huggingface_hub reads, and
# tolerate running outside Colab where google.colab is not installed.
try:
    from google.colab import userdata
except ImportError:
    userdata = None  # not running inside Colab

if userdata is not None:
    hf_token = userdata.get('HF_TOKEN')
    if hf_token:
        # setdefault: never override a token the user already exported.
        os.environ.setdefault("HF_TOKEN", hf_token)

# Run Processing
if __name__ == "__main__":
    processor = PDFProcessor(output_dir=OUTPUT_DIR)

    # Ensure data directory exists
    if not DATA_DIR.exists():
        print(f"Warning: Data directory {DATA_DIR} does not exist!")
        print("Please create it and place PDF files there.")
    else:
        results = processor.process_all_pdfs(data_dir=DATA_DIR)
        print(f"\nTotal pages generated: {len(results)}")

[32m[INFO] 2025-12-10 10:36:21,670 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-12-10 10:36:21,671 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2025-12-10 10:36:21,714 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-12-10 10:36:21,715 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-12-10 10:36:21,947 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-12-10 10:36:21,949 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2025-12-10 10:36:21,952 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2025-12-10 10:36:21,953 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages


Total pages generated: 307


In [7]:
# Verify Output: count the generated Markdown files and preview the first few
output_files = [*OUTPUT_DIR.glob("**/*.md")]
print(f"Found {len(output_files)} markdown files in output directory:")

preview, remainder = output_files[:5], output_files[5:]
for md_file in preview:
    print(f" - {md_file.name}")
if remainder:
    # More files exist than were listed above
    print("...")

Found 312 markdown files in output directory:
 - Cyber_Crime_Law_and_Practice_page_061.md
 - Cyber_Crime_Law_and_Practice_page_079.md
 - Cyber_Crime_Law_and_Practice_page_138.md
 - Cyber_Crime_Law_and_Practice_page_062.md
 - Cyber_Crime_Law_and_Practice_page_056.md
...
