## Data Extraction with Docling

In this notebook, we extract content from PDFs into structured formats:

- **Markdown**: Full document text with page breaks for chunking
- **Images**: Save pages containing large charts/diagrams (>500x500 pixels)
- **Tables**: Extract with 2 paragraphs of context + page number metadata

**Output Structure:**
```
data/rag-data/markdown/{company}/{document}.md
data/rag-data/images/{company}/{document}/page_5.png
data/rag-data/tables/{company}/{document}/table_1_page_5.md
```

### 1. Setup and Configuration

In [1]:
from pathlib import Path
from typing import List, Tuple

from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

In [2]:
# Directory paths (relative strings, resolved against the current working directory)
# Input PDFs, e.g. data/rag-data/pdfs/apple/apple 8-k q4 2023.pdf
DATA_DIR = "data/rag-data/pdfs"
# Output root for exported markdown: markdown/{company}/{document}.md
OUTPUT_MD_DIR = "data/rag-data/markdown"
# Output root for page images: images/{company}/{document}/
OUTPUT_IMAGES_DIR = "data/rag-data/images"
# Output root for extracted tables: tables/{company}/{document}/
OUTPUT_TABLES_DIR = "data/rag-data/tables"

### Metadata Extraction

In [3]:
def extract_metadata_from_filename(filename: str) -> dict:
    """
    Extract metadata from filename.

    Expected format: CompanyName DocType [Quarter] Year.pdf
    Examples:
        - Amazon 10-Q Q1 2024.pdf
        - Microsoft 10-K 2023.pdf

    Args:
        filename: File name or stem. A trailing ``.pdf`` / ``.md`` extension
            (any letter case) is removed; the same text appearing elsewhere
            in the name is left untouched.

    Returns:
        Dict with keys ``company_name``, ``doc_type``, ``fiscal_quarter``
        (``None`` unless the name has exactly four tokens) and ``fiscal_year``.

    Raises:
        IndexError: If the name has fewer than two whitespace-separated tokens.

    Note:
        The company name is taken to be the first token only, so multi-word
        company names are not supported by this naming scheme.
    """
    known_extensions = ('.pdf', '.md')

    # Strip only *trailing* extensions. A blanket str.replace would also eat
    # '.pdf' / '.md' occurring inside a token and would miss '.PDF'.
    stem = filename.strip()
    while stem.lower().endswith(known_extensions):
        stem = stem[:stem.rfind('.')]

    parts = stem.split()

    return {
        'company_name': parts[0],
        'doc_type': parts[1],
        # The quarter is optional; it is present only in the four-token form.
        'fiscal_quarter': parts[2] if len(parts) == 4 else None,
        'fiscal_year': parts[-1]
    }

# Quick check: a three-token name carries no quarter, so fiscal_quarter is None
extract_metadata_from_filename('apple 10-k 2023.pdf')

{'company_name': 'apple',
 'doc_type': '10-k',
 'fiscal_quarter': None,
 'fiscal_year': '2023'}

### Extract Markdown

In [4]:
def convert_pdf_to_docling(pdf_file: Path):
    """Convert a single PDF with Docling and return the conversion result.

    A fresh ``DocumentConverter`` is built on every call. Its PDF pipeline
    is configured to generate both page images and picture images, with
    ``images_scale`` set to 2.

    Args:
        pdf_file: Path of the PDF to convert.

    Returns:
        Whatever ``DocumentConverter.convert`` returns for ``pdf_file``.
    """
    pdf_options = PdfPipelineOptions()
    pdf_options.generate_page_images = True
    pdf_options.generate_picture_images = True
    pdf_options.images_scale = 2

    # Only the PDF input format gets a custom pipeline configuration.
    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    return converter.convert(pdf_file)

In [8]:
# Sample document used to walk through the pipeline on a single file.
# The path is built from DATA_DIR with `/` so it resolves on every OS; a
# backslash-separated literal is treated as one file name on POSIX, which
# would corrupt `pdf_file.stem` and the metadata derived from it.
pdf_file = Path(DATA_DIR) / "apple" / "apple 8-k q4 2023.pdf"

meta_data = extract_metadata_from_filename(pdf_file.stem)
company_name = meta_data['company_name']

# Per-document output folders:
#   markdown/{company}/, images/{company}/{document}/, tables/{company}/{document}/
md_dir = Path(OUTPUT_MD_DIR) / company_name
images_dir = Path(OUTPUT_IMAGES_DIR) / company_name / pdf_file.stem
tables_dir = Path(OUTPUT_TABLES_DIR) / company_name / pdf_file.stem

for dir_path in [md_dir, images_dir, tables_dir]:
    dir_path.mkdir(parents=True, exist_ok=True)

# NOTE: despite its name, `doc_converter` holds the value returned by
# convert_pdf_to_docling (the conversion result), not a converter object.
# The name is kept because other cells may reference it.
doc_converter = convert_pdf_to_docling(pdf_file)
# NOTE(review): the placeholder ends in `--!>` rather than `-->`. Kept as-is
# because downstream chunking presumably splits on this exact string; confirm
# before changing it.
markdown_text = doc_converter.document.export_to_markdown(page_break_placeholder="<!-- page break --!>")

[32m[INFO] 2026-02-11 13:54:07,807 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m


[32m[INFO] 2026-02-11 13:54:07,817 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\ASUSZ\anaconda3\envs\venv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-02-11 13:54:07,818 [RapidOCR] main.py:53: Using C:\Users\ASUSZ\anaconda3\envs\venv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-02-11 13:54:07,914 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-02-11 13:54:07,916 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\ASUSZ\anaconda3\envs\venv\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2026-02-11 13:54:07,917 [RapidOCR] main.py:53: Using C:\Users\ASUSZ\anaconda3\envs\venv\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2026-02-11 13:54:07,974 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-02-11 13:54:07,986 [RapidOCR] download_file.py:60: File exi

In [9]:
# Save the exported markdown as {document}.md in the company's markdown folder.
# write_text returns the number of characters written, shown as the cell output.
(md_dir / f"{pdf_file.stem}.md").write_text(markdown_text, encoding="utf-8")

28575