In [None]:
!docling --version


In [7]:
from pathlib import Path
import json
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


In [8]:
def main():
    input_doc = Path("s13.pdf")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    ocr_options = TesseractCliOcrOptions(
        force_full_page_ocr=True,
        tesseract_cmd="C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
    )
    pipeline_options.ocr_options = ocr_options

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    doc = converter.convert(input_doc).document
    md = doc.export_to_markdown()

    # Save Markdown
    output_path_md = Path("output.md")
    with output_path_md.open("w", encoding="utf-8") as f:
        f.write(md)

    # Save Plain Text
    text_content = doc.get_text()
    output_path_txt = Path("output.txt")
    with output_path_txt.open("w", encoding="utf-8") as f:
        f.write(text_content)

    # Save Tables
    table_data = doc.get_tables()
    output_path_tables = Path("tables.json")
    with output_path_tables.open("w", encoding="utf-8") as f:
        json.dump(table_data, f)

    print("Markdown, text, and tables have been saved!")


In [None]:
# Run the function
if __name__ == "__main__":
    main()

In [4]:
from pathlib import Path
import json
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


def main():
    # Path to the input PDF document
    input_doc = Path("test_light.pdf")

    # Initialize pipeline options
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    # OCR configuration (Using Tesseract as OCR engine)
    ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True, tesseract_cmd="C:\\Program Files\\Tesseract-OCR\\tesseract.exe")
    pipeline_options.ocr_options = ocr_options

    # Initialize the document converter
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # Convert the input PDF document
    doc = converter.convert(input_doc).document

    # 1. Extracted Text (OCR Text)
    ocr_text = doc.ocr_text  # OCR extracted text from the document
    output_path_ocr = Path("ocr_text.txt")
    with output_path_ocr.open("w", encoding="utf-8") as f:
        f.write(ocr_text)

    # 2. Markdown Output
    md = doc.export_to_markdown()  # Convert document to markdown
    output_path_md = Path("output.md")
    with output_path_md.open("w", encoding="utf-8") as f:
        f.write(md)

    # 3. Plain Text Output (if available)
    if hasattr(doc, 'text'):  # Check if text is available
        text_content = doc.text  # This may be available if the document has a text attribute
        output_path_txt = Path("output.txt")
        with output_path_txt.open("w", encoding="utf-8") as f:
            f.write(text_content)

    # 4. Table Data (Extract tables from document)
    if hasattr(doc, 'tables'):  # Check if tables are available
        table_data = doc.tables  # Get table data from the document
        output_path_tables = Path("tables.json")
        with output_path_tables.open("w", encoding="utf-8") as f:
            json.dump(table_data, f, indent=4)

    # 5. Metadata (Extract metadata like title, author, etc.)
    if hasattr(doc, 'metadata'):  # Check if metadata is available
        metadata = doc.metadata  # Extract document metadata
        output_path_metadata = Path("metadata.json")
        with output_path_metadata.open("w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=4)

    # 6. Images (Extract images if any from the document)
    if hasattr(doc, 'images'):  # Check if images are available
        images = doc.images  # Extract images from the document
        output_path_images = Path("images.json")
        with output_path_images.open("w", encoding="utf-8") as f:
            json.dump(images, f, indent=4)

    # 7. Document Structure (Get structured content such as headings, paragraphs, etc.)
    if hasattr(doc, 'sections'):  # Check if sections are available
        document_structure = doc.sections  # Extract document structure (headings, paragraphs)
        output_path_structure = Path("structure.json")
        with output_path_structure.open("w", encoding="utf-8") as f:
            json.dump(document_structure, f, indent=4)

    # Print confirmation
    print("All document features have been captured and saved!")


# Run the function
if __name__ == "__main__":
    main()


AttributeError: 'DoclingDocument' object has no attribute 'ocr_text'

In [2]:
import datetime
import logging
import time
from pathlib import Path
import pandas as pd
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.export import generate_multimodal_pages
from docling.utils.utils import create_hash

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
_log = logging.getLogger(__name__)

In [4]:
IMAGE_RESOLUTION_SCALE = 2.0

In [7]:
def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("s13.pdf")
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)

    rows = []
    for (
        content_text,
        content_md,
        content_dt,
        page_cells,
        page_segments,
        page,
    ) in generate_multimodal_pages(conv_res):

        dpi = page._default_image_scale * 72

        rows.append(
            {
                "document": conv_res.input.file.name,
                "hash": conv_res.input.document_hash,
                "page_hash": create_hash(
                    conv_res.input.document_hash + ":" + str(page.page_no - 1)
                ),
                "image": {
                    "width": page.image.width,
                    "height": page.image.height,
                    "bytes": page.image.tobytes(),
                },
                "cells": page_cells,
                "contents": content_text,
                "contents_md": content_md,
                "contents_dt": content_dt,
                "segments": page_segments,
                "extra": {
                    "page_num": page.page_no + 1,
                    "width_in_points": page.size.width,
                    "height_in_points": page.size.height,
                    "dpi": dpi,
                },
            }
        )

    # Generate one parquet from all documents
    df = pd.json_normalize(rows)
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
    df.to_parquet(output_filename)

    end_time = time.time() - start_time

    _log.info(
        f"Document converted and multimodal pages generated in {end_time:.2f} seconds."
    )

    # This block demonstrates how the file can be opened with the HF datasets library
    # from datasets import Dataset
    # from PIL import Image
    # multimodal_df = pd.read_parquet(output_filename)

    # # Convert pandas DataFrame to Hugging Face Dataset and load bytes into image
    # dataset = Dataset.from_pandas(multimodal_df)
    # def transforms(examples):
    #     examples["image"] = Image.frombytes('RGB', (examples["image.width"], examples["image.height"]), examples["image.bytes"], 'raw')
    #     return examples
    # dataset = dataset.map(transforms)

In [10]:
if __name__ == "__main__":
    main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document s13.pdf
INFO:docling.document_converter:Finished converting document s13.pdf in 66.22 sec.
INFO:__main__:Document converted and multimodal pages generated in 67.36 seconds.


In [9]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-19.0.0-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Downloading pyarrow-19.0.0-cp310-cp310-win_amd64.whl (25.3 MB)
   ---------------------------------------- 0.0/25.3 MB ? eta -:--:--
   ----------------- ---------------------- 11.3/25.3 MB 54.2 MB/s eta 0:00:01
   -------------------------------------- - 24.4/25.3 MB 57.1 MB/s eta 0:00:01
   ---------------------------------------- 25.3/25.3 MB 55.2 MB/s eta 0:00:00
Installing collected packages: pyarrow
Successfully installed pyarrow-19.0.0


In [2]:
import json
import logging
from pathlib import Path
import yaml
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

In [6]:
_log = logging.getLogger(__name__)

In [11]:
def main():
    input_paths = [
        Path("s13.pdf"),
        # Path("tests/data/html/wiki_duck.html"),
        # Path("tests/data/docx/word_sample.docx"),
        # Path("tests/data/docx/lorem_ipsum.docx"),
        # Path("tests/data/pptx/powerpoint_sample.pptx"),
        # Path("tests/data/2305.03393v1-pg9-img.png"),
        # Path("tests/data/2206.01062.pdf"),
        # Path("tests/data/test_01.asciidoc"),
        # Path("tests/data/test_01.asciidoc"),
    ]

    ## for defaults use:
    # doc_converter = DocumentConverter()

    ## to customize use:

    doc_converter = (
        DocumentConverter(  # all of the below is optional, has internal defaults.
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.IMAGE,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.ASCIIDOC,
                InputFormat.MD,
            ],  # whitelist formats, non-matching files are ignored.
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
                ),
                InputFormat.DOCX: WordFormatOption(
                    pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
                ),
            },
        )
    )

    conv_results = doc_converter.convert_all(input_paths)

    for res in conv_results:
        out_path = Path("scratch")
        print(
            f"Document {res.input.file.name} converted."
            f"\nSaved markdown output to: {str(out_path)}"
        )
        _log.debug(res.document._export_to_indented_text(max_text_len=16))
        # Export Docling document format to markdowndoc:
        with (out_path / f"{res.input.file.stem}.md").open("w", encoding="utf-8") as fp:
            fp.write(res.document.export_to_markdown())

        with (out_path / f"{res.input.file.stem}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(res.document.export_to_dict()))

        with (out_path / f"{res.input.file.stem}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(res.document.export_to_dict()))

In [12]:
if __name__ == "__main__":
    main()

Document s13.pdf converted.
Saved markdown output to: scratch


In [13]:
import logging
import time
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)
IMAGE_RESOLUTION_SCALE = 2.0

In [14]:
def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("s13.pdf")
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
    # with the image field
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images
    for page_no, page in conv_res.document.pages.items():
        page_no = page.page_no
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-table-{table_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

    # Save markdown with embedded pictures
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

    # Save markdown with externally referenced pictures
    md_filename = output_dir / f"{doc_filename}-with-image-refs.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

    # Save HTML with externally referenced pictures
    html_filename = output_dir / f"{doc_filename}-with-image-refs.html"
    conv_res.document.save_as_html(html_filename, image_mode=ImageRefMode.REFERENCED)

    end_time = time.time() - start_time

    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")

In [15]:
if __name__ == "__main__":
    main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document s13.pdf
INFO:docling.document_converter:Finished converting document s13.pdf in 70.00 sec.
INFO:__main__:Document converted and figures exported in 74.34 seconds.


In [16]:
import json
import logging
import time
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions
_log = logging.getLogger(__name__)

In [17]:
def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("s13.pdf")

    ###########################################################################

    # The following sections contain a combination of PipelineOptions
    # and PDF Backends for various configurations.
    # Uncomment one section at the time to see the differences in the output.

    # PyPdfium without EasyOCR
    # --------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = False

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(
    #             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
    #         )
    #     }
    # )

    # PyPdfium with EasyOCR
    # -----------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(
    #             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
    #         )
    #     }
    # )

    # Docling Parse without EasyOCR
    # -------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with EasyOCR
    # ----------------------
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.ocr_options.lang = ["es"]
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=4, device=AcceleratorDevice.AUTO
    )

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # Docling Parse with EasyOCR (CPU only)
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.ocr_options.use_gpu = False  # <-- set this.
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with Tesseract
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractOcrOptions()

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with Tesseract CLI
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractCliOcrOptions()

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with ocrmac(Mac only)
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = OcrMacOptions()

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    ###########################################################################

    start_time = time.time()
    conv_result = doc_converter.convert(input_doc_path)
    end_time = time.time() - start_time

    _log.info(f"Document converted in {end_time:.2f} seconds.")

    ## Export results
    output_dir = Path("scratch")
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_result.input.file.stem

    # Export Deep Search document JSON format:
    with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
        fp.write(json.dumps(conv_result.document.export_to_dict()))

    # Export Text format:
    with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_text())

    # Export Markdown format:
    with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_markdown())

    # Export Document Tags format:
    with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_document_tokens())

In [18]:
if __name__ == "__main__":
    main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document s13.pdf
INFO:docling.document_converter:Finished converting document s13.pdf in 77.66 sec.
INFO:__main__:Document converted in 77.65 seconds.


In [19]:
def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("s13.pdf")
    output_dir = Path("scratch")

    doc_converter = DocumentConverter()

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)

    doc_filename = conv_res.input.file.stem

    # Export tables
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df: pd.DataFrame = table.export_to_dataframe()
        print(f"## Table {table_ix}")
        print(table_df.to_markdown())

        # Save the table as csv
        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
        _log.info(f"Saving CSV table to {element_csv_filename}")
        table_df.to_csv(element_csv_filename)

        # Save the table as html
        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
        _log.info(f"Saving HTML table to {element_html_filename}")
        with element_html_filename.open("w") as fp:
            fp.write(table.export_to_html())

    end_time = time.time() - start_time

    _log.info(f"Document converted and tables exported in {end_time:.2f} seconds.")

In [20]:
if __name__ == "__main__":
    main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document s13.pdf
INFO:docling.document_converter:Finished converting document s13.pdf in 69.95 sec.
INFO:__main__:Saving CSV table to scratch\s13-table-1.csv
INFO:__main__:Saving HTML table to scratch\s13-table-1.html
INFO:__main__:Saving CSV table to scratch\s13-table-2.csv
INFO:__main__:Saving HTML table to scratch\s13-table-2.html
INFO:__main__:Saving CSV table to scratch\s13-table-3.csv
INFO:__main__:Saving HTML table to scratch\s13-table-3.html
INFO:__main__:Document converted and tables exported in 69.98 seconds.


## Table 0
|    | Text                                 | Construct            | Factor            |
|---:|:-------------------------------------|:---------------------|:------------------|
|  0 | Go straight for the goal.            | Achievement-Striving | Conscientiousness |
|  1 | Plunge into tasks with all my heart. | Achievement-Striving | Conscientiousness |
|  2 | Remain calm under pressure.          | Vulnerability        | Neuroticism       |
## Table 1
|    | Text                                       |   Label |
|---:|:-------------------------------------------|--------:|
|  0 | Broken leg. A broken leg (leg fracture)... |   49.33 |
|  1 | Bulimia. Bulimia is an eating disorder...  |   34.18 |
|  2 | Hyperacusis. Hyperacusis is when...        |   53.82 |
## Table 2
|    | Excerpt                                    |   BT_easiness |
|---:|:-------------------------------------------|--------------:|
|  0 | An honest and poor old woman was...        |         -0.05 |
|  1 | O

In [21]:
!pip install -qU docling transformers

In [22]:
from docling.document_converter import DocumentConverter

DOC_SOURCE = "scratch/s13.md"

doc = DocumentConverter().convert(source=DOC_SOURCE).document

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document s13.md
INFO:docling.document_converter:Finished converting document s13.md in 3.56 sec.


In [27]:
from transformers import AutoTokenizer

from docling.chunking import HybridChunker

EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
MAX_TOKENS = 128  # set to a small number for illustrative purposes

tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)

chunker = HybridChunker(
    tokenizer=tokenizer,  # instance or model name, defaults to "sentence-transformers/all-MiniLM-L6-v2"
    max_tokens=MAX_TOKENS,  # optional, by default derived from `tokenizer`
    merge_peers=True,  # optional, defaults to True
)
chunk_iter = chunker.chunk(dl_doc=doc)
chunks = list(chunk_iter)

In [28]:
for i, chunk in enumerate(chunks):
    print(f"=== {i} ===")
    txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))
    print(f"chunk.text ({txt_tokens} tokens):\n{repr(chunk.text)}")

    ser_txt = chunker.serialize(chunk=chunk)
    ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))
    print(f"chunker.serialize(chunk) ({ser_tokens} tokens):\n{repr(ser_txt)}")

    print()

=== 0 ===
chunk.text (42 tokens):
'Zak Hussain 1,2 · Marcel Binz 3,4 · Rui Mata 1 · Dirk U. Wulff 1,2\nAccepted: 27 May 2024\n©The Author(s) 2024'
chunker.serialize(chunk) (55 tokens):
'A tutorial on open-source large language models for behavioral science\nZak Hussain 1,2 · Marcel Binz 3,4 · Rui Mata 1 · Dirk U. Wulff 1,2\nAccepted: 27 May 2024\n©The Author(s) 2024'

=== 1 ===
chunk.text (127 tokens):
'Large language models (LLMs) have the potential to revolutionize behavioral science by accelerating and improving the research cycle, from conceptualization to data analysis. Unlike closed-source solutions, open-source frameworks for LLMs can enable transparency, reproducibility, and adherence to data protection standards, which gives them a crucial advantage for use in behavioral science. To help researchers harness the promise of LLMs, this tutorial offers a primer on the open-source Hugging Face ecosystem and demonstrates several applications that advance conceptual and empirical wor

In [29]:
output_file = "chunk_output.txt"

with open(output_file, "w", encoding="utf-8") as f:
    for i, chunk in enumerate(chunks):
        f.write(f"=== {i} ===\n")
        
        txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))
        f.write(f"chunk.text ({txt_tokens} tokens):\n{repr(chunk.text)}\n\n")
        
        ser_txt = chunker.serialize(chunk=chunk)
        ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))
        f.write(f"chunker.serialize(chunk) ({ser_tokens} tokens):\n{repr(ser_txt)}\n\n")

        f.write("\n")
        
print(f"Output saved to {output_file}")


Output saved to chunk_output.txt


In [30]:
!pip install -q --progress-bar off --no-warn-conflicts llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus llama-index-readers-file python-dotenv

In [1]:
import os
from pathlib import Path
from tempfile import mkdtemp
from warnings import filterwarnings

from dotenv import load_dotenv


def _get_env_from_colab_or_os(key):
    try:
        from google.colab import userdata

        try:
            return userdata.get(key)
        except userdata.SecretNotFoundError:
            pass
    except ImportError:
        pass
    return os.getenv(key)


load_dotenv()

filterwarnings(action="ignore", category=UserWarning, module="pydantic")
filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
# https://github.com/huggingface/transformers/issues/5486:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

EMBED_MODEL = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
MILVUS_URI = str(Path(mkdtemp()) / "docling.db")
GEN_MODEL = HuggingFaceInferenceAPI(
    token=_get_env_from_colab_or_os("YOUR_TOKEN_HERE"),
    model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
)
SOURCE = "https://arxiv.org/pdf/2408.09869"  # Docling Technical Report
QUERY = "Which are the main AI models in Docling?"

embed_dim = len(EMBED_MODEL.get_text_embedding("hi"))

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader
# from llama_index.vector_stores.milvus import MilvusVectorStore
# from llama_index.vector_stores.simple import SimpleVectorStore  # Alternative to Milvus

reader = DoclingReader()
node_parser = MarkdownNodeParser()

# vector_store = MilvusVectorStore(
#     uri=str(Path(mkdtemp()) / "docling.db"),  # or set as needed
#     dim=embed_dim,
#     overwrite=True,
# )
# Use SimpleVectorStore as the vector store
# Use the default in-memory vector store
storage_context = StorageContext.from_defaults()

index = VectorStoreIndex.from_documents(
    documents=reader.load_data(SOURCE),
    transformations=[node_parser],
    storage_context=storage_context,
    embed_model=EMBED_MODEL,
)


result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)
print(f"Q: {QUERY}\nA: {result.response.strip()}\n\nSources:")
display([(n.text, n.metadata) for n in result.source_nodes])

Q: Which are the main AI models in Docling?
A: 1. A layout analysis model, an accurate object-detector for page elements. 2. TableFormer, a state-of-the-art table structure recognition model.

Sources:


[('## 3.2 AI models\n\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',
  {'header_path': '/Docling Technical Report/'}),
 ("## 5 Applications\n\nThanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowled

In [None]:
from huggingface_hub import login

# Log in to Hugging Face using your access token
login(token="YOUR_TOKEN_HERE")


from llama_index.node_parser.docling import DoclingNodeParser

from llama_index.readers.docling import DoclingReader

reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
node_parser = DoclingNodeParser()

# vector_store = MilvusVectorStore(
#     uri=str(Path(mkdtemp()) / "docling.db"),  # or set as needed
#     dim=embed_dim,
#     overwrite=True,
# )
storage_context = StorageContext.from_defaults()
index = VectorStoreIndex.from_documents(
    documents=reader.load_data(SOURCE),
    transformations=[node_parser],
    storage_context=storage_context,
    embed_model=EMBED_MODEL,
)
result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)
print(f"Q: {QUERY}\nA: {result.response.strip()}\n\nSources:")
display([(n.text, n.metadata) for n in result.source_nodes])

Q: Which are the main AI models in Docling?
A: 1. A layout analysis model, an accurate object-detector for page elements. 2. TableFormer, a state-of-the-art table structure recognition model.

Sources:


[('As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',
  {'schema_name': 'docling_core.transforms.chunker.DocMeta',
   'version': '1.0.0',
   'doc_items': [{'self_ref': '#/texts/50',
     'parent': {'$ref': '#/body'},
     'children': [],
     'label': 'text',
     'prov': [{'page_no': 3,
       'bbox': {'l': 108.0,
        't': 404.87298583984375,
        'r': 504.00299072265625,
        'b': 330.8659973144531,
        'coord_origin': '

In [4]:
!pip install milvus

Collecting milvus
  Downloading milvus-2.2.16-py3-none-win_amd64.whl.metadata (7.0 kB)
Downloading milvus-2.2.16-py3-none-win_amd64.whl (38.6 MB)
   ---------------------------------------- 0.0/38.6 MB ? eta -:--:--
   -------------- ------------------------- 13.6/38.6 MB 65.9 MB/s eta 0:00:01
   ------------------------ --------------- 23.6/38.6 MB 57.3 MB/s eta 0:00:01
   ------------------------------------ --- 35.1/38.6 MB 55.8 MB/s eta 0:00:01
   ---------------------------------------- 38.6/38.6 MB 52.1 MB/s eta 0:00:00
Installing collected packages: milvus
Successfully installed milvus-2.2.16


In [10]:
from huggingface_hub import login

# Log in to Hugging Face using your access token
