In [1]:
!docling --version

Docling version: 2.23.0
Docling Core version: 2.19.1
Docling IBM Models version: 3.3.2
Docling Parse version: 3.3.1
Python: cpython-312 (3.12.9)
Platform: Windows-11-10.0.26100-SP0


In [1]:
import json
import logging
from pathlib import Path
import yaml
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling_core.types.doc import PictureItem
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.document_converter import DocumentConverter
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
import pandas as pd
import time
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
    AcceleratorDevice,
    AcceleratorOptions,
    granite_picture_description,
    smolvlm_picture_description,
)
from docling.datamodel.settings import settings


In [2]:
# Scale factor assigned to the PDF pipeline's ``images_scale`` option, i.e. the
# resolution multiplier used when page/picture images are rendered.
IMAGE_RESOLUTION_SCALE: float = 3.0

# Logger for this notebook/module; the level is configured inside main().
_log = logging.getLogger(__name__)

In [3]:
def _build_converter():
    """Build the DocumentConverter used by main().

    PDFs go through StandardPdfPipeline on the pypdfium2 backend with
    full-page Tesseract (CLI) OCR, table-structure recognition with cell
    matching, and page/picture image generation at IMAGE_RESOLUTION_SCALE.
    DOCX files go through SimplePipeline.

    Returns:
        A configured ``DocumentConverter`` instance.
    """
    accelerator_options = AcceleratorOptions(
        num_threads=8, device=AcceleratorDevice.AUTO
    )

    pipeline_options = PdfPipelineOptions()
    # Optional picture description (disabled). To enable, uncomment:
    # pipeline_options.do_picture_description = True
    # pipeline_options.picture_description_options = smolvlm_picture_description
    # pipeline_options.picture_description_options.prompt = (
    #     "Describe the image in three sentences. Be concise and accurate."
    # )
    pipeline_options.accelerator_options = accelerator_options
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    # Page and picture images must be generated for the PNG exports to work.
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    # Alternative OCR engine: EasyOcrOptions(force_full_page_ocr=True)
    pipeline_options.ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)

    return DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
            InputFormat.PPTX,
            InputFormat.ASCIIDOC,
            InputFormat.MD,
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=pipeline_options,
            ),
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline),
        },
    )


def _save_png(image, target: Path) -> bool:
    """Write ``image`` to ``target`` as PNG.

    The image is checked *before* the file is opened so that a missing image
    neither raises AttributeError nor leaves an empty file behind.

    Args:
        image: A PIL image, or None when no image is available.
        target: Destination file path.

    Returns:
        True if the file was written, False if ``image`` was None.
    """
    if image is None:
        _log.warning(f"No image available for {target.name}. Skipping.")
        return False
    with target.open("wb") as fp:
        image.save(fp, format="PNG")
    return True


def _export_images(conv_res, output_dir: Path, doc_filename: str) -> None:
    """Save page images plus one PNG per table and per picture of a conversion result."""
    document = conv_res.document

    # Page images; ``page.image`` is optional, so guard against a missing one.
    for page_no, page in document.pages.items():
        page_image = page.image.pil_image if page.image is not None else None
        _save_png(page_image, output_dir / f"{doc_filename}-page-{page_no}.png")

    # Figures and tables. Counters advance even when an image is missing so the
    # numbering still reflects the element's position in the document.
    table_counter = 0
    picture_counter = 0
    for element, _level in document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            _save_png(
                element.get_image(document),
                output_dir / f"{doc_filename}-table-{table_counter}.png",
            )

        if isinstance(element, PictureItem):
            picture_counter += 1
            _save_png(
                element.get_image(document),
                output_dir / f"{doc_filename}-picture-{picture_counter}.png",
            )


def _export_document(conv_res, output_dir: Path, doc_filename: str) -> None:
    """Write the converted document as Markdown, HTML, plain text, JSON and YAML."""
    document = conv_res.document

    # Markdown with embedded pictures
    document.save_as_markdown(
        output_dir / f"{doc_filename}-with-images.md",
        image_mode=ImageRefMode.EMBEDDED,
    )
    # Markdown with externally referenced pictures
    document.save_as_markdown(
        output_dir / f"{doc_filename}-with-image-refs.md",
        image_mode=ImageRefMode.REFERENCED,
    )
    # HTML with externally referenced pictures
    document.save_as_html(
        output_dir / f"{doc_filename}-with-image-refs.html",
        image_mode=ImageRefMode.REFERENCED,
    )

    with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
        fp.write(document.export_to_markdown())

    with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
        fp.write(document.export_to_text())

    # Build the dict once and reuse it for both structured formats.
    doc_dict = document.export_to_dict()

    with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
        fp.write(json.dumps(doc_dict, indent=4))

    with (output_dir / f"{doc_filename}.yaml").open("w", encoding="utf-8") as fp:
        fp.write(yaml.safe_dump(doc_dict))


def _export_tables(conv_res, output_dir: Path, doc_filename: str) -> None:
    """Print every table as Markdown and save it as CSV and HTML (files are numbered from 1)."""
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df: pd.DataFrame = table.export_to_dataframe()
        print(f"## Table {table_ix}")
        print(table_df.to_markdown())

        # Save the table as CSV
        table_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
        _log.info(f"Saving CSV table to {table_csv_filename}")
        table_df.to_csv(table_csv_filename)

        # Save the table as HTML
        table_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
        _log.info(f"Saving HTML table to {table_html_filename}")
        with table_html_filename.open("w", encoding="utf-8") as fp:
            fp.write(table.export_to_html())


def main():
    """Convert the configured input documents and export their contents.

    For every existing input path the document is converted and the following
    artifacts are written to ``outputs/<input stem>/``: page images, table and
    picture images (PNG), Markdown (embedded and referenced images), HTML,
    plain text, JSON, YAML, and one CSV/HTML file per table. Missing input
    files are logged and skipped. The total elapsed time is logged at the end.
    """
    logging.basicConfig(level=logging.INFO)

    # Define input paths and output root directory
    input_paths = [
        # Path("inputs/Improving_machine-learning_models.pdf"),
        # Path("inputs/tutorial_open-source_large_language_models.pdf"),
        # Path("inputs/Finite_Element_Modeling_Shape_Changes_Plant_Cells.pdf"),
        Path("inputs/wang_.pdf"),
    ]
    output_root_dir = Path("outputs")

    doc_converter = _build_converter()

    # perf_counter is monotonic, so the measurement is immune to clock changes.
    start_time = time.perf_counter()

    for input_path in input_paths:
        if not input_path.exists():
            _log.warning(f"Input file {input_path} does not exist. Skipping.")
            continue

        # Convert document
        conv_res = doc_converter.convert(input_path)

        # Create output directory for this input
        doc_filename = conv_res.input.file.stem
        output_dir = output_root_dir / doc_filename
        output_dir.mkdir(parents=True, exist_ok=True)

        _export_images(conv_res, output_dir, doc_filename)
        _export_document(conv_res, output_dir, doc_filename)
        _export_tables(conv_res, output_dir, doc_filename)

    elapsed = time.perf_counter() - start_time
    _log.info(f"All documents converted, images, and tables exported in {elapsed:.2f} seconds.")


In [4]:
# Entry point: run the conversion when this cell/script is executed directly
# (i.e. when __name__ is "__main__"), but not when the code is imported.
if __name__ == "__main__":
    main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.models.tesseract_ocr_cli_model:command: tesseract --list-langs
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document Advanced_edited.pdf
INFO:docling.models.tesseract_ocr_cli_model:command: tesseract -l fra+deu+spa+eng C:\Users\livMatS\AppData\Local\Temp\tmpv_jcbchj.png stdout tsv
INFO:docling.models.tesseract_ocr_cli_model:command: tesseract -l fra+deu+spa+eng C:\Users\livMatS\AppData\Local\Temp\tmptxvg3_xi.png stdout tsv
INFO:docling.models.tesseract_ocr_cli_model:command: tesseract -l fra+deu+spa+eng C:\Users\livMatS\AppData\Local\Temp\tmpscc62q1a.png stdout tsv
INFO:docling.models.tesseract_ocr_cli_model:command: tesseract -l fra+deu+spa+eng C:\Users\livMatS\AppData\Local\Temp\tmpk56evju9.png stdout tsv
INFO:docling.models.tesseract_ocr_cli_model:command: tesseract -l fra+deu+s