In [2]:
from pathlib import Path
import matplotlib.pyplot as plt

from docling.document_converter import DocumentConverter
from docling.document_converter import PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.pipeline_options import TableFormerMode

from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.document import PictureItem
from docling_core.types.doc.document import TableItem
from docling_core.types.doc.document import TextItem
from docling_core.types.doc.document import DoclingDocument
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

# Folder that receives exported artifacts; exist_ok keeps re-runs of this cell from failing.
output_dir = Path('Output')
output_dir.mkdir(exist_ok=True)

# PDF pipeline configuration: all boolean switches are supplied at construction time.
pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    do_table_structure=True,
    do_code_enrichment=True,
    do_formula_enrichment=True,
    do_picture_classification=True,
)
# The table-structure mode lives on the nested table_structure_options object,
# so it is assigned after construction.
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

In [3]:
# Location of the PDF handed to the converter.
docling_paper = 'https://arxiv.org/pdf/2501.17887'

# Accept PDF input only, pairing the pipeline options configured earlier
# with the pypdfium2 backend.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            backend=PyPdfiumDocumentBackend,
            pipeline_options=pipeline_options,
        ),
    },
    allowed_formats=[InputFormat.PDF],
)

# Run the conversion; keep the full result and its document for the following cells.
result = converter.convert(docling_paper)
doc = result.document

2025-11-06 17:31:08,527 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-06 17:31:08,529 - INFO - Going to convert document batch...
2025-11-06 17:31:08,529 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 0c59f2efc0f49e77e021b96930f51155
2025-11-06 17:31:08,534 - INFO - Loading plugin 'docling_defaults'
2025-11-06 17:31:08,535 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-06 17:31:08,555 - INFO - Accelerator device: 'cuda:0'
2025-11-06 17:31:09,119 - INFO - Loading plugin 'docling_defaults'
2025-11-06 17:31:09,121 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-06 17:31:09,125 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2025-11-06 17:31:09,126 - INFO - easyocr cannot be used because it is not installed.
2025-11-06 17:31:09,211 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-11-06 17:31:09,220 [RapidOCR] base.py:22: Using engine_name: to

In [53]:
# Export the whole document to Markdown, then preview only its first 6000 characters.
md_out = doc.export_to_markdown()
print(md_out[:6000])

## Docling: An Efficient Open-Source Toolkit for AI-driven Document Conversion

Nikolaos Livathinos * , Christoph Auer * , Maksym Lysak, Ahmed Nassar, Michele Dolfi, Panagiotis Vagenas, Cesar Berrospi, Matteo Omenetti, Kasper Dinkla, Yusik Kim, Shubham Gupta, Rafael Teixeira de Lima, Valery Weber, Lucas Morin, Ingmar Meijer, Viktor Kuropiatnyk, Peter W. J. Staar

IBM Research, Ruschlikon, Switzerland ¨ ¨

Please send correspondence to: deepsearch-core@zurich.ibm.com

## Abstract

We introduce Docling, an easy-to-use, self-contained, MITlicensed, open-source toolkit for document conversion, that can parse several types of popular document formats into a unified, richly structured representation. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. Docling is released as a Python package and can be used as a Python API or as a CLI tool. 

In [4]:
# Headline figures for the converted document.
print(f"Document title: {doc.name}")
print(f"Number of pages: {len(doc.pages)}")
print(f"Number of tables: {len(doc.tables)}")
print(f"Number of pictures: {len(doc.pictures)}")

# Outline of the first 20 items yielded by iterate_items(), indented by their level.
print('Document structure')
for i, (item, level) in enumerate(doc.iterate_items()):
    if i >= 20:
        # Nothing past the 20th item is printed, so stop here instead of
        # walking the remainder of the document for no output.
        break
    item_type = type(item).__name__
    # Items without a `text` attribute get a placeholder; previews are capped at 200 characters.
    text_preview = item.text[:200] if hasattr(item, 'text') else 'N/A'
    print(f'{"  " * level}- {item_type}: {text_preview}')

    

Document title: 2501.17887v1
Number of pages: 8
Number of tables: 1
Number of pictures: 6
Document structure
  - SectionHeaderItem: Docling: An Efficient Open-Source Toolkit for AI-driven Document Conversion
  - TextItem: Nikolaos Livathinos * , Christoph Auer * , Maksym Lysak, Ahmed Nassar, Michele Dolfi, Panagiotis Vagenas, Cesar Berrospi, Matteo Omenetti, Kasper Dinkla, Yusik Kim, Shubham Gupta, Rafael Teixeira de L
  - TextItem: IBM Research, Ruschlikon, Switzerland ¨ ¨
  - TextItem: Please send correspondence to: deepsearch-core@zurich.ibm.com
  - SectionHeaderItem: Abstract
  - TextItem: We introduce Docling, an easy-to-use, self-contained, MITlicensed, open-source toolkit for document conversion, that can parse several types of popular document formats into a unified, richly structur
  - TextItem: Repository — https://github.com/DS4SD/docling
  - SectionHeaderItem: 1 Introduction
  - TextItem: Converting documents back into a unified machineprocessable format has been a major ch

In [55]:
# Write the converted document as JSON into the output folder.
doc.save_as_json(output_dir.joinpath('docling_paper.json'))