# Multi-format conversion

### Check and install libraries

In [None]:
!pip install ipywidgets
!docling --version
!pip install pyarrow
!pip install -qU docling transformers
!pip install -q --progress-bar off --no-warn-conflicts llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus llama-index-readers-file python-dotenv
!pip install milvus

### Converting PDF to Markdown, JSON, and YAML formats

In [11]:
import json
import logging
from pathlib import Path
import yaml
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.document_converter import DocumentConverter
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
import pandas as pd
import time


In [12]:
# Module-level logger used by the `main` functions defined in the cells below.
_log = logging.getLogger(__name__)

In [3]:
def main():
    """Convert the sample PDFs and export each one as Markdown, JSON, and YAML.

    Every input is converted with a customized ``DocumentConverter``; the
    exports are written to ``outputs/<input stem>/<input stem>.{md,json,yaml}``.
    The per-document confirmation is printed only after all three files have
    been written.
    """
    input_paths = [
        Path("inputs/Improving_machine-learning_models.pdf"),
        Path("inputs/tutorial_open-source_large_language_models.pdf"),
    ]

    ## for defaults use:
    # doc_converter = DocumentConverter()

    ## to customize use:
    doc_converter = (
        DocumentConverter(  # all of the below is optional, has internal defaults.
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.IMAGE,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.ASCIIDOC,
                InputFormat.MD,
            ],  # whitelist formats, non-matching files are ignored.
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
                ),
                InputFormat.DOCX: WordFormatOption(
                    pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
                ),
            },
        )
    )

    conv_results = doc_converter.convert_all(input_paths)

    for res in conv_results:
        # Create an output directory for each input file
        stem = res.input.file.stem
        out_dir = Path("outputs") / stem
        out_dir.mkdir(parents=True, exist_ok=True)

        # Only build the (potentially large) debug dump when it will actually be logged.
        if _log.isEnabledFor(logging.DEBUG):
            _log.debug(res.document._export_to_indented_text(max_text_len=16))

        # Serialize the document once and reuse the dict for both JSON and YAML.
        doc_dict = res.document.export_to_dict()

        # Export Docling document format to markdown, JSON, and YAML in the respective folder
        with (out_dir / f"{stem}.md").open("w", encoding="utf-8") as fp:
            fp.write(res.document.export_to_markdown())

        with (out_dir / f"{stem}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(doc_dict, indent=4))

        with (out_dir / f"{stem}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(doc_dict))

        # Report success only once the files exist on disk.
        print(
            f"Document {res.input.file.name} converted."
            f"\nSaved markdown output to: {str(out_dir)}"
        )


In [4]:
# Run the conversion when this cell executes as the main module.
# NOTE(review): several cells in this notebook define `main`; this call uses
# whichever definition was executed most recently.
if __name__ == "__main__":
    main()

Document Improving_machine-learning_models.pdf converted.
Saved markdown output to: outputs\Improving_machine-learning_models
Document tutorial_open-source_large_language_models.pdf converted.
Saved markdown output to: outputs\tutorial_open-source_large_language_models


### Converting PDF to Markdown, JSON, and YAML formats + Exporting Tables

In [6]:
def main():
    """Convert the sample PDF, export it as Markdown/JSON/YAML, and export its tables.

    For each existing input file the document exports are written to
    ``outputs/<stem>/<stem>.{md,json,yaml}``. Every table in the converted
    document is printed as Markdown and saved as
    ``<stem>-table-<n>.csv`` and ``<stem>-table-<n>.html`` (``n`` starts at 1).
    Missing input files are logged and skipped. The total elapsed time is
    logged at INFO level.
    """
    logging.basicConfig(level=logging.INFO)

    # Define input paths and output directory
    input_paths = [
        Path("inputs/Improving_machine-learning_models.pdf"),
        # Path("inputs/tutorial_open-source_large_language_models.pdf"),
    ]
    output_root_dir = Path("outputs")

    # Configure DocumentConverter
    doc_converter = DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
            InputFormat.PPTX,
            InputFormat.ASCIIDOC,
            InputFormat.MD,
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
            ),
            InputFormat.DOCX: WordFormatOption(
                pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
            ),
        },
    )

    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments the way time.time() differences are.
    start_time = time.perf_counter()

    for input_path in input_paths:
        if not input_path.exists():
            _log.warning(f"Input file {input_path} does not exist. Skipping.")
            continue

        # Convert document
        conv_res = doc_converter.convert(input_path)

        # Create output directory for this input
        doc_filename = conv_res.input.file.stem
        output_dir = output_root_dir / doc_filename
        output_dir.mkdir(parents=True, exist_ok=True)

        # Serialize the document once and reuse the dict for both JSON and YAML.
        doc_dict = conv_res.document.export_to_dict()

        # Save document outputs in various formats
        with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
            fp.write(conv_res.document.export_to_markdown())

        with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(doc_dict, indent=4))

        with (output_dir / f"{doc_filename}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(doc_dict))

        # Export tables if present
        for table_ix, table in enumerate(conv_res.document.tables):
            table_df: pd.DataFrame = table.export_to_dataframe()
            print(f"## Table {table_ix}")
            print(table_df.to_markdown())

            # Save the table as CSV
            table_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
            _log.info(f"Saving CSV table to {table_csv_filename}")
            table_df.to_csv(table_csv_filename)

            # Save the table as HTML
            table_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
            _log.info(f"Saving HTML table to {table_html_filename}")
            with table_html_filename.open("w", encoding="utf-8") as fp:
                fp.write(table.export_to_html())

    elapsed_seconds = time.perf_counter() - start_time
    _log.info(f"All documents converted and tables exported in {elapsed_seconds:.2f} seconds.")

In [7]:
# Run the conversion when this cell executes as the main module.
# NOTE(review): several cells in this notebook define `main`; this call uses
# whichever definition was executed most recently.
if __name__ == "__main__":
    main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document Improving_machine-learning_models.pdf
INFO:docling.document_converter:Finished converting document Improving_machine-learning_models.pdf in 41.75 sec.
INFO:__main__:Saving CSV table to outputs\Improving_machine-learning_models\Improving_machine-learning_models-table-1.csv
INFO:__main__:Saving HTML table to outputs\Improving_machine-learning_models\Improving_machine-learning_models-table-1.html
INFO:__main__:Saving CSV table to outputs\Improving_machine-learning_models\Improving_machine-learning_models-table-2.csv
INFO:__main__:Saving HTML table to outputs\Improving_machine-learning_models\Improving_machine-learning_models-table-2.html
INFO:__main__:Saving CSV table to outputs\Imp

## Table 0
|    | Property    | Units                 | mean    |   std |    MAE |   err (%) |
|---:|:------------|:----------------------|:--------|------:|-------:|----------:|
|  0 | Gap         | eV                    | 0.17    |  0.65 | 0.054  |      33   |
|  1 | Eform       | eVatom -  1           | -  0.48 |  0.96 | 0.06   |      13   |
|  2 | Ehull       | eVatom -  1           | 0.30    |  0.48 | 0.058  |      19   |
|  3 | E tot /atom | eVatom -  1           | -  5.2  |  1.9  | 0.063  |       1.2 |
|  4 | Mag/vol     | μBÅ -  3              | 0.0083  |  0.02 | 0.0024 |      29   |
|  5 | Vol/atom    | Å3atom -  1           | 24      |  9.1  | 0.62   |       2.5 |
|  6 | DOS/atom    | states (eV atom) -  1 | 0.79    |  0.63 | 0.12   |      15   |
## Table 1
|    | Property    | Units                 | mean    |   std |    MAE |   err (%) |
|---:|:------------|:----------------------|:--------|------:|-------:|----------:|
|  0 | Gap         | eV                    | 0.13    |

### Converting PDF to Markdown, JSON, and YAML formats + Exporting Tables + Image export

In [8]:
# Value assigned to `PdfPipelineOptions.images_scale` by the image-export `main` below.
IMAGE_RESOLUTION_SCALE = 2.0

In [13]:
def main():
    """Convert the sample PDFs and export documents, tables, and images.

    For each existing input file, the following is written to ``outputs/<stem>/``:

    * one PNG per page (``<stem>-page-<page_no>.png``),
    * one PNG per table and per picture (``<stem>-table-<n>.png``,
      ``<stem>-picture-<n>.png``, numbered from 1 in document order),
    * Markdown with embedded images, Markdown and HTML with referenced images,
    * plain Markdown, JSON, and YAML exports,
    * each table as CSV and HTML (``<stem>-table-<n>.csv`` / ``.html``).

    Missing input files are logged and skipped. Pages or items for which no
    image is available are logged and skipped instead of aborting the run.
    The total elapsed time is logged at INFO level.
    """
    logging.basicConfig(level=logging.INFO)

    # Define input paths and output root directory
    input_paths = [
        Path("inputs/Improving_machine-learning_models.pdf"),
        Path("inputs/tutorial_open-source_large_language_models.pdf"),
    ]
    output_root_dir = Path("outputs")

    # Configure PDF pipeline options
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    # Configure DocumentConverter
    doc_converter = DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
            InputFormat.PPTX,
            InputFormat.ASCIIDOC,
            InputFormat.MD,
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=pipeline_options,
            ),
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline),
        },
    )

    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments the way time.time() differences are.
    start_time = time.perf_counter()

    for input_path in input_paths:
        if not input_path.exists():
            _log.warning(f"Input file {input_path} does not exist. Skipping.")
            continue

        # Convert document
        conv_res = doc_converter.convert(input_path)
        document = conv_res.document

        # Create output directory for this input
        doc_filename = conv_res.input.file.stem
        output_dir = output_root_dir / doc_filename
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save page images
        for page_no, page in document.pages.items():
            # Guard against pages without a rendered image so that no empty
            # PNG is left behind and the remaining exports still run.
            if page.image is None:
                _log.warning(f"No image available for page {page_no} of {doc_filename}. Skipping.")
                continue
            page_image_filename = output_dir / f"{doc_filename}-page-{page_no}.png"
            with page_image_filename.open("wb") as fp:
                page.image.pil_image.save(fp, format="PNG")

        # Save images of figures and tables
        table_counter = 0
        picture_counter = 0
        for element, _level in document.iterate_items():
            if isinstance(element, TableItem):
                # Count every table (even one without an image) so the PNG
                # numbering stays aligned with the table's position in the document.
                table_counter += 1
                table_image = element.get_image(document)
                if table_image is None:
                    _log.warning(f"No image available for table {table_counter} of {doc_filename}. Skipping.")
                    continue
                table_image_filename = (
                    output_dir / f"{doc_filename}-table-{table_counter}.png"
                )
                with table_image_filename.open("wb") as fp:
                    table_image.save(fp, "PNG")

            elif isinstance(element, PictureItem):
                picture_counter += 1
                picture_image = element.get_image(document)
                if picture_image is None:
                    _log.warning(f"No image available for picture {picture_counter} of {doc_filename}. Skipping.")
                    continue
                picture_image_filename = (
                    output_dir / f"{doc_filename}-picture-{picture_counter}.png"
                )
                with picture_image_filename.open("wb") as fp:
                    picture_image.save(fp, "PNG")

        # Save markdown with embedded pictures
        md_embedded_filename = output_dir / f"{doc_filename}-with-images.md"
        document.save_as_markdown(md_embedded_filename, image_mode=ImageRefMode.EMBEDDED)

        # Save markdown with externally referenced pictures
        md_referenced_filename = output_dir / f"{doc_filename}-with-image-refs.md"
        document.save_as_markdown(md_referenced_filename, image_mode=ImageRefMode.REFERENCED)

        # Save HTML with externally referenced pictures
        html_referenced_filename = output_dir / f"{doc_filename}-with-image-refs.html"
        document.save_as_html(html_referenced_filename, image_mode=ImageRefMode.REFERENCED)

        # Serialize the document once and reuse the dict for both JSON and YAML.
        doc_dict = document.export_to_dict()

        # Save document outputs in various formats
        with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
            fp.write(document.export_to_markdown())

        with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(doc_dict, indent=4))

        with (output_dir / f"{doc_filename}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(doc_dict))

        # Export tables if present
        for table_ix, table in enumerate(document.tables):
            table_df: pd.DataFrame = table.export_to_dataframe()
            print(f"## Table {table_ix}")
            print(table_df.to_markdown())

            # Save the table as CSV
            table_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
            _log.info(f"Saving CSV table to {table_csv_filename}")
            table_df.to_csv(table_csv_filename)

            # Save the table as HTML
            table_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
            _log.info(f"Saving HTML table to {table_html_filename}")
            with table_html_filename.open("w", encoding="utf-8") as fp:
                fp.write(table.export_to_html())

    elapsed_seconds = time.perf_counter() - start_time
    _log.info(f"All documents converted, images, and tables exported in {elapsed_seconds:.2f} seconds.")


In [14]:
# Run the conversion when this cell executes as the main module.
# NOTE(review): several cells in this notebook define `main`; this call uses
# whichever definition was executed most recently.
if __name__ == "__main__":
    main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document Improving_machine-learning_models.pdf
INFO:docling.document_converter:Finished converting document Improving_machine-learning_models.pdf in 44.97 sec.
INFO:__main__:Saving CSV table to outputs\Improving_machine-learning_models\Improving_machine-learning_models-table-1.csv
INFO:__main__:Saving HTML table to outputs\Improving_machine-learning_models\Improving_machine-learning_models-table-1.html
INFO:__main__:Saving CSV table to outputs\Improving_machine-learning_models\Improving_machine-learning_models-table-2.csv
INFO:__main__:Saving HTML table to outputs\Improving_machine-learning_models\Improving_machine-learning_models-table-2.html
INFO:__main__:Saving CSV table to outputs\Imp

## Table 0
|    | Property    | Units                 | mean    |   std |    MAE |   err (%) |
|---:|:------------|:----------------------|:--------|------:|-------:|----------:|
|  0 | Gap         | eV                    | 0.17    |  0.65 | 0.054  |      33   |
|  1 | Eform       | eVatom -  1           | -  0.48 |  0.96 | 0.06   |      13   |
|  2 | Ehull       | eVatom -  1           | 0.30    |  0.48 | 0.058  |      19   |
|  3 | E tot /atom | eVatom -  1           | -  5.2  |  1.9  | 0.063  |       1.2 |
|  4 | Mag/vol     | μBÅ -  3              | 0.0083  |  0.02 | 0.0024 |      29   |
|  5 | Vol/atom    | Å3atom -  1           | 24      |  9.1  | 0.62   |       2.5 |
|  6 | DOS/atom    | states (eV atom) -  1 | 0.79    |  0.63 | 0.12   |      15   |
## Table 1
|    | Property    | Units                 | mean    |   std |    MAE |   err (%) |
|---:|:------------|:----------------------|:--------|------:|-------:|----------:|
|  0 | Gap         | eV                    | 0.13    |

INFO:docling.document_converter:Finished converting document tutorial_open-source_large_language_models.pdf in 72.67 sec.
INFO:__main__:Saving CSV table to outputs\tutorial_open-source_large_language_models\tutorial_open-source_large_language_models-table-1.csv
INFO:__main__:Saving HTML table to outputs\tutorial_open-source_large_language_models\tutorial_open-source_large_language_models-table-1.html
INFO:__main__:Saving CSV table to outputs\tutorial_open-source_large_language_models\tutorial_open-source_large_language_models-table-2.csv
INFO:__main__:Saving HTML table to outputs\tutorial_open-source_large_language_models\tutorial_open-source_large_language_models-table-2.html
INFO:__main__:Saving CSV table to outputs\tutorial_open-source_large_language_models\tutorial_open-source_large_language_models-table-3.csv
INFO:__main__:Saving HTML table to outputs\tutorial_open-source_large_language_models\tutorial_open-source_large_language_models-table-3.html
INFO:__main__:All documents conv

## Table 0
|    | Text Construct Factor                                                       |
|---:|:----------------------------------------------------------------------------|
|  0 | Go straight for the goal. Achievement-Striving Conscientiousness            |
|  1 | Plunge into tasks with all my heart. Achievement-Striving Conscientiousness |
|  2 | Remain calm under pressure. Vulnerability Neuroticism                       |
## Table 1
|    | Text Label                                       |
|---:|:-------------------------------------------------|
|  0 | Broken leg. A broken leg (leg fracture)... 49.33 |
|  1 | Bulimia. Bulimia is an eating disorder... 34.18  |
|  2 | Hyperacusis. Hyperacusis is when... 53.82        |
## Table 2
|    | Excerpt BT_easiness                              |
|---:|:-------------------------------------------------|
|  0 | An honest and poor old woman was... -0.05        |
|  1 | Our plate illustrates the residence of... -2.98  |
|  2 | Just as wilde