In [None]:
! pip install --quiet docling

In [None]:
import os

import logging
import time
from pathlib import Path

from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption


from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend


In [None]:
# Scale factor handed to the PDF pipeline for rendered images.
IMAGE_RESOLUTION_SCALE = 2.0

# Shared PDF pipeline configuration, read as a module-level global by
# convert_doc_to_markdown(): both page images and picture images are generated
# at the scale defined above.
pipeline_options = PdfPipelineOptions(
    images_scale=IMAGE_RESOLUTION_SCALE,
    generate_page_images=True,
    generate_picture_images=True,
)


def convert_doc_to_markdown(source, default_backend=True, output_dir='/kaggle/working/'):
    """Convert a PDF to Markdown with embedded images and save it to disk.

    The document is converted with a docling ``DocumentConverter`` configured
    from the module-level ``pipeline_options``. The Markdown is written to
    ``<output_dir>/<stem of source>.md`` and also returned.

    Args:
        source: Path (str or path-like) of the document to convert.
        default_backend: When True, let docling choose its own PDF backend
            (noted by the original author as dlparse_v2 -- version dependent,
            verify). When False, force ``PyPdfiumDocumentBackend``.
        output_dir: Directory the ``.md`` file is written to. It is created
            if it does not exist. Defaults to the Kaggle working directory.

    Returns:
        The Markdown text, with images embedded (``ImageRefMode.EMBEDDED``).

    Raises:
        Whatever ``DocumentConverter.convert`` or the file write raises; the
        caller is expected to handle conversion failures.
    """
    print(f'Processing: {source}')
    input_doc_path = Path(source)

    # Build the PDF format option once; only the backend differs between modes.
    format_option_kwargs = {'pipeline_options': pipeline_options}
    if not default_backend:
        format_option_kwargs['backend'] = PyPdfiumDocumentBackend
    doc_converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(**format_option_kwargs)}
    )

    # Only the conversion itself is timed (not the export or the file write).
    # perf_counter is monotonic, so the difference cannot go negative.
    start_time = time.perf_counter()
    conv_res = doc_converter.convert(input_doc_path)
    elapsed_seconds = time.perf_counter() - start_time

    print('DUMPING MD')
    md_dump = conv_res.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)

    print('SAVING MD')
    # Path.stem drops only the final suffix, regardless of case. The previous
    # str.replace(".pdf", "") removed ".pdf" anywhere in the name and missed
    # upper-case ".PDF".
    md_path = Path(output_dir) / f"{input_doc_path.stem}.md"
    md_path.parent.mkdir(parents=True, exist_ok=True)

    # The original author noted that document.save_as_markdown() failed here
    # with a 'suffix' error, so the exported text is written manually.
    with md_path.open("w", encoding="utf-8") as markdown_file:
        markdown_file.write(md_dump)

    print(f'TIME TAKEN to convert doc to MD: {elapsed_seconds}')

    return md_dump

# Convert a whole directory of PDFs to Markdown

In [None]:
root_dir = '/kaggle/input/dir'
failed_sources = []  # inputs that neither backend managed to convert

# Sort the listing so the processing order is reproducible across runs
# (os.listdir returns entries in arbitrary order).
for source in sorted(os.path.join(root_dir, name) for name in os.listdir(root_dir)):
    if not os.path.isfile(source):
        # Sub-directories cannot be converted; skip them instead of failing
        # once per backend.
        continue
    try:
        md_content = convert_doc_to_markdown(source)
    except Exception as default_error:
        # Best-effort batch run: fall back to the alternative PDF backend
        # before giving up on this file.
        print(f'Error with default for {source}:\n{default_error} trying the second method')
        try:
            md_content = convert_doc_to_markdown(source, default_backend=False)
        except Exception as fallback_error:
            print(f'Error with second method as well:\n{fallback_error} skipping.')
            failed_sources.append(source)

# Surface the failures in one place so they are not lost in the per-file log.
if failed_sources:
    print(f'Could not convert {len(failed_sources)} file(s): {failed_sources}')

In [None]:
# Deliberate stop: raising SystemExit here halts a "Run All" so the scratch
# cells below only execute when run by hand.
import sys

raise SystemExit(0)

# Convert a single PDF to Markdown

In [None]:
source = "/kaggle/input/something.pdf"

# Uses docling's default PDF backend; pass default_backend=False to force the
# pypdfium2 backend instead.
md_content = convert_doc_to_markdown(source)

In [None]:
# Scratch cell: preview a name of the form "<stem>_<number of entries in the input dir>".
sanitized_stem = 'dummy'
n_input_entries = len(os.listdir('/kaggle/input/maths-books'))
f"{sanitized_stem}_{n_input_entries}"

In [None]:
# Scratch cell: the last three characters of a Markdown filename are its extension.
sample_name = 'somethunig.md'
sample_name[-3:]