In [10]:
from pathlib import Path
import pandas as pd
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
# from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

In [12]:
# PDF pipeline settings, passed to the PDF format option when the converter is built.
# OCR and table-structure recovery are switched on, and both page images and
# picture images are generated.
# NOTE(review): images_scale=2.0 presumably doubles the default render
# resolution — confirm against the docling pipeline-options reference.
pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    do_table_structure=True,
    images_scale=2.0,
    generate_page_images=True,
    generate_picture_images=True,
)

In [13]:
# Converter shared by the cells that follow. Every argument here is optional;
# DocumentConverter falls back to its internal defaults when one is omitted.
doc_converter = DocumentConverter(
    # Whitelist of accepted input formats; non-matching files are ignored.
    allowed_formats=[
        InputFormat.PDF,
        InputFormat.IMAGE,
        InputFormat.DOCX,
        InputFormat.HTML,
        InputFormat.PPTX,
    ],
    format_options={
        # PDF: use the pipeline options configured earlier and pick the
        # pypdfium2 backend as an alternative to the default one.
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=PyPdfiumDocumentBackend,
        ),
        # DOCX: SimplePipeline is the default for office formats and HTML.
        InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline),
    },
)

In [16]:
from docling.datamodel.document import ConversionResult

# Remote sample PDF that is downloaded and converted; requires network access.
SAMPLE_PDF_URL = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"

# Run the conversion; later cells read the result via `conv_result`.
conv_result: ConversionResult = doc_converter.convert(SAMPLE_PDF_URL)


In [17]:
def create_folder_if_not_exists(folder_path):
    """Make sure ``folder_path`` exists as a directory.

    Missing parent directories are created as well, and a directory that is
    already present is left as-is without raising.

    Args:
        folder_path: Location of the directory, as a string or path object.

    Returns:
        None.
    """
    target_dir = Path(folder_path)
    target_dir.mkdir(parents=True, exist_ok=True)

# Per-document output folder under Samples/, named after the stem of the
# converted input file; created up front so later cells can write into it.
output_dir = Path(f"Samples/Docling_{conv_result.input.file.stem}")
create_folder_if_not_exists(output_dir)

In [18]:
# Export the converted document as "<input stem>.md" inside the output folder.
# NOTE(review): ImageRefMode.REFERENCED presumably stores images as separate
# files linked from the Markdown rather than embedding them — confirm in the
# docling-core documentation.
doc_filename = conv_result.input.file.stem
file_name = output_dir.joinpath(f"{doc_filename}.md")

conv_result.document.save_as_markdown(
    file_name,
    image_mode=ImageRefMode.REFERENCED,
)
    

In [7]:

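# NOTE: this whole cell is intentionally disabled. When uncommented it saves every
# table and picture of the converted document as its own PNG file in `output_dir`.
# from IPython.display import display, HTML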
# import base64
# table_counter = 0
# picture_counter = 0
# for element, _level in conv_result.document.iterate_items():
#         if isinstance(element, TableItem):
#             table_counter += 1
#             element_table_filename = (
#                 output_dir / f"{doc_filename}-table-{table_counter}.PNG"
#             )
#             with element_table_filename.open("wb") as fp:
#                 element.get_image(conv_result.document).save(fp, "PNG")
#             # table_df: pd.DataFrame = element.export_to_dataframe()
#             # with element_table_filename.open("wb") as fp:
#             #     fp.write(table_df.to_markdown().encode("utf-8"))

#         if isinstance(element, PictureItem):
#             picture_counter += 1
#             element_image_filename = (
#                 output_dir / f"{doc_filename}-picture-{picture_counter}.png"
#             )
#             #html_code = f'<img src="data:image/png;base64,{base64.b64encode(image_data).decode('utf-8')}" alt="Base64 Image" />'
#             #display(HTML(html_code))
#             with element_image_filename.open("wb") as fp:
#                 element.get_image(conv_result.document).save(fp, "PNG")
                