In [1]:
# Document handed to the converter
DOC_SOURCE: str = "https://arxiv.org/pdf/2311.18481"

# Marker strings bounding the excerpt that gets printed:
# the excerpt begins at the first marker and ends just before the second.
start_cue: str = "Copyright © 2024"
stop_cue: str = "Application of NLP to ESG"

In [2]:
from rich.console import Console
from rich.panel import Panel

# A wide console keeps Markdown tables from being wrapped when rendered
console = Console(width=210)


def print_in_console(text):
    """Print ``text`` framed in a rich ``Panel`` on the shared wide console.

    Args:
        text: Renderable content (here: serialized document text) to frame.
    """
    framed = Panel(text)
    console.print(framed)

In [3]:
from docling.document_converter import DocumentConverter

# Converter created without any options passed
converter = DocumentConverter()
# Convert DOC_SOURCE and keep only the resulting document; the serializer cells read `doc`
doc = converter.convert(source=DOC_SOURCE).document

2025-09-27 22:40:41,128 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-27 22:40:46,055 - INFO - Going to convert document batch...
2025-09-27 22:40:46,058 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e647edf348883bed75367b22fbe60347
2025-09-27 22:40:53,526 - INFO - Loading plugin 'docling_defaults'
2025-09-27 22:40:53,530 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-09-27 22:40:53,541 - INFO - Loading plugin 'docling_defaults'
2025-09-27 22:40:53,550 - INFO - Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-09-27 22:40:54,378 - INFO - Accelerator device: 'cpu'
2025-09-27 22:40:55,936 - INFO - Accelerator device: 'cpu'
2025-09-27 22:40:57,236 - INFO - Accelerator device: 'cpu'
2025-09-27 22:40:57,733 - INFO - Processing document 2311.18481v1.pdf
2025-09-27 22:41:07,683 - INFO - Finished converting document 2311.18481v1.pdf in 27.30 sec.


In [4]:
from docling_core.transforms.serializer.html import HTMLDocSerializer

# Serialize the whole document to HTML
serializer = HTMLDocSerializer(doc=doc)
ser_result = serializer.serialize()
ser_text = ser_result.text

# Only the span between the two cues is printed, to keep the output brief.
# NOTE: str.find returns -1 when a cue is absent, which would silently skew the slice.
excerpt_begin = ser_text.find(start_cue)
excerpt_end = ser_text.find(stop_cue)
print_in_console(ser_text[excerpt_begin:excerpt_end])

In [5]:
from docling_core.transforms.serializer.markdown import MarkdownDocSerializer

# Serialize the whole document to Markdown
serializer = MarkdownDocSerializer(doc=doc)
ser_result = serializer.serialize()
ser_text = ser_result.text

# Print just the span between the two cues.
# NOTE: str.find returns -1 when a cue is absent, which would silently skew the slice.
excerpt_begin = ser_text.find(start_cue)
excerpt_end = ser_text.find(stop_cue)
print_in_console(ser_text[excerpt_begin:excerpt_end])

In [6]:
from docling_core.transforms.chunker.hierarchical_chunker import TripletTableSerializer
from docling_core.transforms.serializer.markdown import MarkdownParams

# Markdown serializer customised with a triplet table serializer and
# an overridden picture placeholder string.
triplet_tables = TripletTableSerializer()
markdown_params = MarkdownParams(
    image_placeholder="",
    # ...
)
serializer = MarkdownDocSerializer(
    doc=doc,
    table_serializer=triplet_tables,
    params=markdown_params,
)
ser_result = serializer.serialize()
ser_text = ser_result.text

# Print just the span between the two cues.
# NOTE: str.find returns -1 when a cue is absent, which would silently skew the slice.
excerpt_begin = ser_text.find(start_cue)
excerpt_end = ser_text.find(stop_cue)
print_in_console(ser_text[excerpt_begin:excerpt_end])

In [8]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionVlmOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

# Model and prompt used for the picture-description step
picture_description_options = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
    prompt="Describe this picture in three to five sentences. Be precise and concise.",
)

# PDF pipeline with picture description switched on and picture images generated
pipeline_options = PdfPipelineOptions(
    do_picture_description=True,
    picture_description_options=picture_description_options,
    generate_picture_images=True,
    images_scale=2,
)

# Re-convert the same source with the customised pipeline;
# this rebinds `converter` and `doc` for the cells that follow.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})
doc = converter.convert(source=DOC_SOURCE).document

2025-09-27 22:47:05,838 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-27 22:47:05,840 - INFO - Going to convert document batch...
2025-09-27 22:47:05,840 - INFO - Initializing pipeline for StandardPdfPipeline with options hash aa9f55aff62f306a7a681d0256fe95ca
2025-09-27 22:47:06,125 - INFO - Accelerator device: 'cpu'
2025-09-27 22:47:06,383 - INFO - Accelerator device: 'cpu'
2025-09-27 22:47:07,500 - INFO - Accelerator device: 'cpu'
2025-09-27 22:47:08,560 - INFO - Accelerator device: 'cpu'
2025-09-27 22:47:08,795 - INFO - Processing document 2311.18481v1.pdf
2025-09-27 22:48:59,543 - INFO - Finished converting document 2311.18481v1.pdf in 114.08 sec.


In [9]:
from typing import Any, Optional

from docling_core.transforms.serializer.base import (
    BaseDocSerializer,
    SerializationResult,
)
from docling_core.transforms.serializer.common import create_ser_result
from docling_core.transforms.serializer.markdown import (
    MarkdownParams,
    MarkdownPictureSerializer,
)
from docling_core.types.doc.document import (
    DoclingDocument,
    PictureDescriptionData,
    PictureItem,
)
from docling_core.types.doc.base import ImageRefMode
from typing_extensions import override


class AnnotationPictureSerializer(MarkdownPictureSerializer):
    """Markdown picture serializer that also emits picture descriptions.

    The output starts with whatever ``MarkdownPictureSerializer`` produces for
    the picture, followed by one HTML comment per ``PictureDescriptionData``
    annotation attached to the item.
    """

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        separator: Optional[str] = None,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize ``item`` together with its description annotations.

        Args:
            item: Picture to serialize.
            doc_serializer: Document-level serializer, forwarded to the parent.
            doc: Document the picture belongs to, forwarded to the parent.
            separator: String placed between the parent output and each
                annotation; a newline is used when it is ``None`` or empty.
            **kwargs: Extra options forwarded unchanged to the parent.

        Returns:
            A serialization result whose text is the joined parts and whose
            span source is ``item``.
        """
        text_parts: list[str] = []

        # reusing the existing result:
        parent_res = super().serialize(
            item=item,
            doc_serializer=doc_serializer,
            doc=doc,
            **kwargs,
        )
        text_parts.append(parent_res.text)

        # appending annotations:
        for annotation in item.annotations:
            if isinstance(annotation, PictureDescriptionData):
                # Emit the description as an HTML comment so it travels with the
                # Markdown text without being rendered as visible content.
                text_parts.append(f"<!-- Picture description: {annotation.text} -->")

        text_res = (separator or "\n").join(text_parts)
        return create_ser_result(text=text_res, span_source=item)

In [10]:
# Markdown serializer that routes pictures through AnnotationPictureSerializer,
# using placeholder image mode with an overridden placeholder string.
annotation_pictures = AnnotationPictureSerializer()
annotation_params = MarkdownParams(
    image_mode=ImageRefMode.PLACEHOLDER,
    image_placeholder="",
)
serializer = MarkdownDocSerializer(
    doc=doc,
    picture_serializer=annotation_pictures,
    params=annotation_params,
)
ser_result = serializer.serialize()
ser_text = ser_result.text

# Print just the span between the two cues.
# NOTE: str.find returns -1 when a cue is absent, which would silently skew the slice.
excerpt_begin = ser_text.find(start_cue)
excerpt_end = ser_text.find(stop_cue)
print_in_console(ser_text[excerpt_begin:excerpt_end])