In [19]:
from pathlib import Path
import pandas as pd
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
# from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

In [20]:
# PDF pipeline settings, supplied in one constructor call. The picture/page
# image flags are switched on because later cells save element images through
# get_image().
pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    do_table_structure=True,
    images_scale=2.0,
    generate_page_images=True,
    generate_picture_images=True,
)

In [21]:
# Every argument below is optional; the converter has internal defaults.

# Whitelisted input formats: files of any other format are ignored.
accepted_formats = [
    InputFormat.PDF,
    InputFormat.IMAGE,
    InputFormat.DOCX,
    InputFormat.HTML,
    InputFormat.PPTX,
]

# PDF: use the pipeline options configured earlier, with the pypdfium2
# backend picked as an alternative backend.
pdf_format_option = PdfFormatOption(
    pipeline_options=pipeline_options,
    backend=PyPdfiumDocumentBackend,
)

# DOCX: SimplePipeline is the default for office formats and HTML.
docx_format_option = WordFormatOption(pipeline_cls=SimplePipeline)

doc_converter = DocumentConverter(
    allowed_formats=accepted_formats,
    format_options={
        InputFormat.PDF: pdf_format_option,
        InputFormat.DOCX: docx_format_option,
    },
)

In [22]:
from docling.datamodel.document import ConversionResult

# Run the configured converter on the sample PDF; the following cells read
# the outcome from `conv_result`.
source_pdf = "Samples/wikitest2.pdf"
conv_result: ConversionResult = doc_converter.convert(source_pdf)


In [23]:
def create_folder_if_not_exists(folder_path):
    """Make sure ``folder_path`` exists as a directory.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        folder_path: Directory to create, as a string or ``Path``.
    """
    target_dir = Path(folder_path)
    target_dir.mkdir(exist_ok=True, parents=True)

# Output folder for this document: "Samples/Docling_" followed by the stem of
# the converted input file.
output_dir = Path(f"Samples/Docling_{conv_result.input.file.stem}")
create_folder_if_not_exists(output_dir)

In [24]:
# Name the Markdown export after the source document. `doc_filename` is also
# reused by the image-export cell below.
doc_filename = conv_result.input.file.stem
file_name = output_dir / f"{doc_filename}.md"

# Write explicitly as UTF-8. Without `encoding`, open() falls back to the
# locale's default encoding, which can raise UnicodeEncodeError (or write
# mojibake) for non-ASCII characters in the exported text.
with open(file_name, "w", encoding="utf-8") as file:
    file.write(conv_result.document.export_to_markdown())
    

In [None]:
from io import BytesIO

from IPython.display import display, HTML
import base64

# Save every table and picture of the converted document as a PNG in
# `output_dir`, and preview each picture inline in the notebook.
table_counter = 0
picture_counter = 0
for element, _level in conv_result.document.iterate_items():
    if isinstance(element, TableItem):
        table_counter += 1
        element_table_filename = (
            output_dir / f"{doc_filename}-table-{table_counter}.PNG"
        )
        # Fetch the image before opening the file so that a missing image
        # does not leave an empty file behind.
        # NOTE(review): get_image() is Optional in docling-core - confirm for
        # the installed version.
        table_image = element.get_image(conv_result.document)
        if table_image is None:
            continue
        with element_table_filename.open("wb") as fp:
            table_image.save(fp, "PNG")
        # Alternative: element.export_to_dataframe() gives the table as a
        # pandas DataFrame if a textual export is preferred over an image.

    elif isinstance(element, PictureItem):
        picture_counter += 1
        element_image_filename = (
            output_dir / f"{doc_filename}-picture-{picture_counter}.png"
        )
        picture_image = element.get_image(conv_result.document)
        if picture_image is None:
            continue

        # Encode the picture as PNG once and reuse the bytes for both the
        # saved file and the inline preview.
        png_buffer = BytesIO()
        picture_image.save(png_buffer, "PNG")
        image_data = png_buffer.getvalue()

        with element_image_filename.open("wb") as fp:
            fp.write(image_data)

        # Build the Base64 string outside the f-string: nesting the same
        # quote type inside an f-string is a SyntaxError before Python 3.12.
        encoded_image = base64.b64encode(image_data).decode("utf-8")
        html_code = f'<img src="data:image/png;base64,{encoded_image}" alt="Base64 Image" />'
        display(HTML(html_code))

In [8]:
import base64
import os
from docling.document_converter import DocumentConverter

def convert_pdf_to_markdown_with_images(pdf_path, markdown_output):
    """Convert a PDF to Markdown with its pictures embedded as Base64 data URIs.

    Text items are written as paragraphs and pictures as inline
    ``data:image/png;base64,...`` images, in the order ``iterate_items()``
    yields them. Other item kinds (e.g. tables) are not written.

    Args:
        pdf_path: Source document, passed unchanged to
            ``DocumentConverter.convert``.
        markdown_output: Path of the Markdown file to write (UTF-8).

    Returns:
        None. The Markdown is written to ``markdown_output``.
    """
    # Local imports keep this cell runnable on its own; it only imports
    # base64, os and DocumentConverter at the top.
    from io import BytesIO

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import PdfFormatOption
    from docling_core.types.doc import PictureItem, TextItem

    # Step 1: Initialize the DocumentConverter with picture images enabled.
    # NOTE(review): the default pipeline is understood not to keep picture
    # images, so get_image() would return None - confirm against the
    # installed docling version.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_picture_images = True
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )

    # Step 2: Convert the PDF and get the converted document object.
    result = converter.convert(pdf_path)
    document = result.document

    # Step 3: Process each item. iterate_items() yields (item, level) tuples,
    # so dispatch on the item's class rather than comparing it to a string.
    markdown_parts = []
    for element, _level in document.iterate_items():
        if isinstance(element, PictureItem):
            picture_image = element.get_image(document)
            if picture_image is None:
                # No image available for this picture; nothing to embed.
                continue
            # b64encode needs bytes, so serialize the image to PNG first.
            png_buffer = BytesIO()
            picture_image.save(png_buffer, "PNG")
            base64_string = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
            markdown_parts.append(f"![Image](data:image/png;base64,{base64_string})")
        elif isinstance(element, TextItem):
            markdown_parts.append(element.text)

    # Each part is followed by a blank line, as Markdown paragraphs require.
    markdown_content = "".join(f"{part}\n\n" for part in markdown_parts)

    # Step 4: Save the final Markdown content. UTF-8 is explicit so the result
    # does not depend on the platform's default encoding.
    with open(markdown_output, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

# Demo run on the sample PDF; the Markdown with embedded images is written to
# output.md.
pdf_path = 'Samples/wikitest1.pdf'  # input PDF
markdown_output = 'output.md'  # destination Markdown file
convert_pdf_to_markdown_with_images(
    pdf_path=pdf_path,
    markdown_output=markdown_output,
)













































































































































































































































































































































































