In [2]:
from pathlib import Path
import pandas as pd
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.images_scale = 2.0
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

In [4]:
doc_converter = (
    DocumentConverter(  # all of the below is optional, has internal defaults.
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
            InputFormat.PPTX,
        ],  # whitelist formats, non-matching files are ignored.
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options, # pipeline options go here.
                backend=PyPdfiumDocumentBackend # optional: pick an alternative backend
            ),
            InputFormat.DOCX: WordFormatOption(
                pipeline_cls=SimplePipeline # default for office formats and HTML
            ),
        },
    )
)

In [5]:
from docling.datamodel.document import ConversionResult
conv_result = doc_converter.convert("https://en.wikipedia.org/wiki/Large_language_model")


detected nested tables: skipping for now
detected nested tables: skipping for now


In [6]:
def create_folder_if_not_exists(folder_path):
    Path(folder_path).mkdir(parents=True, exist_ok=True)

# Example usage:
output_dir = Path("Samples/" + conv_result.input.file.stem)
create_folder_if_not_exists(output_dir)

In [7]:
# Specify the file name
doc_filename = conv_result.input.file.stem
file_name = output_dir / f"{doc_filename}.md"

# Open the file in write mode
with open(file_name, "w") as file:
    # Write the Markdown content to the file
    file.write(conv_result.document.export_to_markdown())

In [13]:
from IPython.display import display, HTML
picture_counter = 0
for element, _level in conv_result.document.iterate_items():
    if isinstance(element, PictureItem):
        picture_counter += 1
        print(f"Processing Picture {picture_counter}")
        print(element)
        image = element.get_image(conv_result.document)
        print(element._image_to_base64)
        html_code = f'<img src="data:image/png;base64,{element._image_to_base64}" alt="Base64 Image" />'
        display(HTML(html_code))
        if image is None:
            print(f"Picture {picture_counter}: get_image returned None")
        else:
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                image.save(fp, "PNG")


Processing Picture 1
self_ref='#/pictures/0' parent=RefItem(cref='#/body') children=[] label=<DocItemLabel.PICTURE: 'picture'> prov=[] captions=[] references=[] footnotes=[] image=None annotations=[]
<bound method PictureItem._image_to_base64 of PictureItem(self_ref='#/pictures/0', parent=RefItem(cref='#/body'), children=[], label=<DocItemLabel.PICTURE: 'picture'>, prov=[], captions=[], references=[], footnotes=[], image=None, annotations=[])>


Picture 1: get_image returned None
Processing Picture 2
self_ref='#/pictures/1' parent=RefItem(cref='#/body') children=[] label=<DocItemLabel.PICTURE: 'picture'> prov=[] captions=[] references=[] footnotes=[] image=None annotations=[]
<bound method PictureItem._image_to_base64 of PictureItem(self_ref='#/pictures/1', parent=RefItem(cref='#/body'), children=[], label=<DocItemLabel.PICTURE: 'picture'>, prov=[], captions=[], references=[], footnotes=[], image=None, annotations=[])>


Picture 2: get_image returned None
Processing Picture 3
self_ref='#/pictures/2' parent=RefItem(cref='#/body') children=[] label=<DocItemLabel.PICTURE: 'picture'> prov=[] captions=[] references=[] footnotes=[] image=None annotations=[]
<bound method PictureItem._image_to_base64 of PictureItem(self_ref='#/pictures/2', parent=RefItem(cref='#/body'), children=[], label=<DocItemLabel.PICTURE: 'picture'>, prov=[], captions=[], references=[], footnotes=[], image=None, annotations=[])>


Picture 3: get_image returned None
Processing Picture 4
self_ref='#/pictures/3' parent=RefItem(cref='#/texts/131') children=[] label=<DocItemLabel.PICTURE: 'picture'> prov=[] captions=[RefItem(cref='#/texts/132')] references=[] footnotes=[] image=None annotations=[]
<bound method PictureItem._image_to_base64 of PictureItem(self_ref='#/pictures/3', parent=RefItem(cref='#/texts/131'), children=[], label=<DocItemLabel.PICTURE: 'picture'>, prov=[], captions=[RefItem(cref='#/texts/132')], references=[], footnotes=[], image=None, annotations=[])>


Picture 4: get_image returned None
Processing Picture 5
self_ref='#/pictures/4' parent=RefItem(cref='#/texts/131') children=[] label=<DocItemLabel.PICTURE: 'picture'> prov=[] captions=[RefItem(cref='#/texts/133')] references=[] footnotes=[] image=None annotations=[]
<bound method PictureItem._image_to_base64 of PictureItem(self_ref='#/pictures/4', parent=RefItem(cref='#/texts/131'), children=[], label=<DocItemLabel.PICTURE: 'picture'>, prov=[], captions=[RefItem(cref='#/texts/133')], references=[], footnotes=[], image=None, annotations=[])>


Picture 5: get_image returned None
Processing Picture 6
self_ref='#/pictures/5' parent=RefItem(cref='#/texts/131') children=[] label=<DocItemLabel.PICTURE: 'picture'> prov=[] captions=[RefItem(cref='#/texts/136')] references=[] footnotes=[] image=None annotations=[]
<bound method PictureItem._image_to_base64 of PictureItem(self_ref='#/pictures/5', parent=RefItem(cref='#/texts/131'), children=[], label=<DocItemLabel.PICTURE: 'picture'>, prov=[], captions=[RefItem(cref='#/texts/136')], references=[], footnotes=[], image=None, annotations=[])>


Picture 6: get_image returned None
Processing Picture 7
self_ref='#/pictures/6' parent=RefItem(cref='#/texts/165') children=[] label=<DocItemLabel.PICTURE: 'picture'> prov=[] captions=[RefItem(cref='#/texts/167')] references=[] footnotes=[] image=None annotations=[]
<bound method PictureItem._image_to_base64 of PictureItem(self_ref='#/pictures/6', parent=RefItem(cref='#/texts/165'), children=[], label=<DocItemLabel.PICTURE: 'picture'>, prov=[], captions=[RefItem(cref='#/texts/167')], references=[], footnotes=[], image=None, annotations=[])>


Picture 7: get_image returned None
Processing Picture 8
self_ref='#/pictures/7' parent=RefItem(cref='#/texts/178') children=[] label=<DocItemLabel.PICTURE: 'picture'> prov=[] captions=[RefItem(cref='#/texts/179')] references=[] footnotes=[] image=None annotations=[]
<bound method PictureItem._image_to_base64 of PictureItem(self_ref='#/pictures/7', parent=RefItem(cref='#/texts/178'), children=[], label=<DocItemLabel.PICTURE: 'picture'>, prov=[], captions=[RefItem(cref='#/texts/179')], references=[], footnotes=[], image=None, annotations=[])>


Picture 8: get_image returned None
Processing Picture 9
self_ref='#/pictures/8' parent=RefItem(cref='#/texts/218') children=[] label=<DocItemLabel.PICTURE: 'picture'> prov=[] captions=[RefItem(cref='#/texts/219')] references=[] footnotes=[] image=None annotations=[]
<bound method PictureItem._image_to_base64 of PictureItem(self_ref='#/pictures/8', parent=RefItem(cref='#/texts/218'), children=[], label=<DocItemLabel.PICTURE: 'picture'>, prov=[], captions=[RefItem(cref='#/texts/219')], references=[], footnotes=[], image=None, annotations=[])>


Picture 9: get_image returned None
Processing Picture 10
self_ref='#/pictures/9' parent=RefItem(cref='#/texts/428') children=[] label=<DocItemLabel.PICTURE: 'picture'> prov=[] captions=[] references=[] footnotes=[] image=None annotations=[]
<bound method PictureItem._image_to_base64 of PictureItem(self_ref='#/pictures/9', parent=RefItem(cref='#/texts/428'), children=[], label=<DocItemLabel.PICTURE: 'picture'>, prov=[], captions=[], references=[], footnotes=[], image=None, annotations=[])>


Picture 10: get_image returned None
