In [1]:
!pip install -q -U docling==2.13.0

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Converter shared by every call in this kernel session; built on first use.
_PDF_CONVERTER = None

# Element labels handed to `export_to_markdown` to select what is exported.
_MD_EXPORT_LABELS = [
    'caption',
    'checkbox_selected',
    'checkbox_unselected',
    'code',
    'document_index',
    'footnote',
    'form',
    'formula',
    'key_value_region',
    'list_item',
    'page_footer',
    'page_header',
    'paragraph',
    'picture',
    'reference',
    'section_header',
    'table',
    'text',
    'title'
]


def _get_pdf_converter():
    """Return the shared docling DocumentConverter, creating it on first use."""
    global _PDF_CONVERTER
    if _PDF_CONVERTER is None:
        # NOTE(review): assumes a DocumentConverter can be reused across
        # documents and that building one is the expensive part -- confirm
        # against the docling documentation for the pinned version.
        pipeline_options = PdfPipelineOptions(do_table_structure=True)
        _PDF_CONVERTER = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )
    return _PDF_CONVERTER


def convert_pdf_to_md(pdf_filepath):
    """Use docling to convert a PDF document to a markdown document.

    Args:
        pdf_filepath: Path of the PDF file to convert.

    Returns:
        The markdown text produced by exporting the converted document with
        the labels listed in ``_MD_EXPORT_LABELS``.

    Any exception raised by docling during conversion or export propagates
    to the caller.
    """
    # Reuse one converter instead of rebuilding it for every file, so a batch
    # run only pays the construction cost once.
    conversion_result = _get_pdf_converter().convert(pdf_filepath)
    return conversion_result.document.export_to_markdown(labels=_MD_EXPORT_LABELS)

In [None]:
def batch_convert_pdf_to_md(pdf_dir, output_md_dir):
    """Batch convert all PDF files in a directory to markdown files.

    Every regular file directly inside ``pdf_dir`` whose extension is ``.pdf``
    (matched case-insensitively) is converted with ``convert_pdf_to_md`` and
    written to ``output_md_dir`` under the same base name with a ``.md``
    extension. Subdirectories are not searched.

    Args:
        pdf_dir: Directory containing the source PDF files.
        output_md_dir: Directory that receives the markdown files; created
            if it does not exist.

    Returns:
        None. A failure on one file is printed and does not stop the batch.
    """
    # Create the output directory if it does not exist
    os.makedirs(output_md_dir, exist_ok=True)

    # Gather PDF file names: sorted so runs are reproducible, matched
    # case-insensitively so "REPORT.PDF" is not skipped, and restricted to
    # regular files so a directory named "x.pdf" is not fed to the converter.
    pdf_files = sorted(
        name
        for name in os.listdir(pdf_dir)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(pdf_dir, name))
    )

    # Process each PDF file
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs", unit="file"):
        pdf_filepath = os.path.join(pdf_dir, pdf_file)

        try:
            # convert the PDF to markdown
            md_content = convert_pdf_to_md(pdf_filepath)

            # Swap only the trailing extension; str.replace would also rewrite
            # any ".pdf" that appears in the middle of the file name.
            md_filename = os.path.splitext(pdf_file)[0] + '.md'
            md_filepath = os.path.join(output_md_dir, md_filename)
            with open(md_filepath, 'w', encoding='utf-8') as f:
                f.write(md_content)
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Failed to process {pdf_filepath}: {e}")

In [4]:
# Input and output locations, given as paths relative to the working directory.
PDF_DIR = 'course_materials/emgt605/modules_1_8_pdf'
OUTPUT_DIR = 'course_materials/emgt605/modules_1_8_md'

# Convert every PDF found in PDF_DIR and write the markdown into OUTPUT_DIR.
batch_convert_pdf_to_md(pdf_dir=PDF_DIR, output_md_dir=OUTPUT_DIR)