# PDF to Markdown Conversion

This notebook compares different methods for converting complex PDF contracts into Markdown.

**Methods Evaluated:**
1. **MarkItDown**: Microsoft's tool for converting various formats to text.
2. **Dolphin / GOT-OCR2.0**: Two VLM-based approaches for high-fidelity OCR (Dolphin via its demo script; GOT-OCR2.0 via Hugging Face Transformers).
3. **Docling**: A specialized document conversion library (IBM), including custom pipeline tweaks for table structure and heading detection.


### MarkItDown Library

In [None]:
from markitdown import MarkItDown
import os
import glob

# Batch-convert every PDF under `input_folder` with MarkItDown and write one
# "<name>_parsed.txt" and one "<name>_parsed.md" per PDF into `output_folder`.
md = MarkItDown()

# Input folder (where your PDFs are stored)
input_folder = "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Phase 1/Contracts_Initial"

# Output folder (where parsed files will go)
output_folder = "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Phase 1/Parsed Contracts"

# Make sure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Find all PDFs recursively. The extension is compared case-insensitively so
# ".pdf", ".PDF" and mixed-case spellings such as ".Pdf" are each picked up
# exactly once, regardless of whether the platform's glob matching is
# case-sensitive. Sorting makes the processing order reproducible.
files = sorted(
    path
    for path in glob.glob(os.path.join(input_folder, "**", "*"), recursive=True)
    if path.lower().endswith(".pdf") and os.path.isfile(path)
)

# Maps each source PDF path to its converted "text" and "markdown" content.
parsed_docs = {}

for file in files:
    result = md.convert(file)

    # NOTE(review): in recent MarkItDown releases `text_content` appears to be
    # an alias of `markdown`, in which case the .txt and .md outputs are
    # identical - confirm against the installed version.
    parsed_docs[file] = {
        "text": result.text_content,
        "markdown": result.markdown
    }

    # Get base filename only (no folders, no extension)
    # NOTE(review): PDFs that share a file name in different subfolders map to
    # the same output names, so the later one overwrites the earlier one.
    base = os.path.splitext(os.path.basename(file))[0]

    # Save outputs into Parsed Contracts
    for key, extension in (("text", "txt"), ("markdown", "md")):
        with open(os.path.join(output_folder, f"{base}_parsed.{extension}"), "w", encoding="utf-8") as f:
            f.write(parsed_docs[file][key])

print(f"Processed {len(files)} files. Results saved in {output_folder}")


In [None]:
# Display the conversion result currently bound to `result`
# (presumably the one from the last PDF handled by the MarkItDown loop).
result

In [None]:
# List the attribute names available on the conversion result object.
dir(result)

### Dolphin 

In [None]:
import os
import glob
import subprocess
import sys

# Run Dolphin's "demo_page_hf.py" once per PDF found under `input_folder`.

# Change to the Dolphin directory (the demo script is addressed relative to it)
# and make that directory importable from this kernel.
os.chdir("/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Dolphin")
dolphin_dir = os.getcwd()
if dolphin_dir not in sys.path:  # re-running the cell must not pile up duplicates
    sys.path.append(dolphin_dir)

input_folder = "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Phase 1/Contracts_Initial"
output_folder = "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Phase 1/Dolphin_Parsed"
model_path = "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Dolphin/hf_model"  # adjust if different

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Find all PDFs (case insensitive): the extension is lower-cased before the
# comparison, so every spelling of ".pdf" is matched exactly once.
files = sorted(
    path
    for path in glob.glob(os.path.join(input_folder, "**", "*"), recursive=True)
    if path.lower().endswith(".pdf") and os.path.isfile(path)
)

print(f"Found {len(files)} PDF files to process.")

failed_files = []
for file in files:
    print(f"Processing {file} ...")
    # Run Dolphin per file. `sys.executable` is the interpreter running this
    # kernel, so the script sees the same environment as the notebook instead
    # of whichever "python" happens to be first on PATH.
    completed = subprocess.run(
        [
            sys.executable, "demo_page_hf.py",
            "--model_path", model_path,
            "--input_path", file,
            "--save_dir", output_folder,
        ],
        cwd=dolphin_dir,  # do not depend on the kernel's working directory staying put
    )
    # A non-zero exit code used to go unnoticed; record it and keep going so
    # one bad document does not stop the rest of the batch.
    if completed.returncode != 0:
        print(f"Dolphin exited with code {completed.returncode} for {file}")
        failed_files.append(file)

if failed_files:
    print(f"{len(failed_files)} of {len(files)} files failed:")
    for file in failed_files:
        print(f"  {file}")


### Docling

In [5]:
import os
import glob
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel import vlm_model_specs

In [2]:
# List the names defined in docling's `vlm_model_specs` module
# (used here to see which predefined VLM model options are available).
dir(vlm_model_specs)

['AcceleratorDevice',
 'AnyUrl',
 'ApiVlmOptions',
 'DOLPHIN_TRANSFORMERS',
 'Enum',
 'GEMMA3_12B_MLX',
 'GEMMA3_27B_MLX',
 'GOT2_TRANSFORMERS',
 'GRANITEDOCLING_MLX',
 'GRANITEDOCLING_TRANSFORMERS',
 'GRANITEDOCLING_VLLM',
 'GRANITE_VISION_OLLAMA',
 'GRANITE_VISION_TRANSFORMERS',
 'GRANITE_VISION_VLLM',
 'InferenceFramework',
 'InlineVlmOptions',
 'NU_EXTRACT_2B_TRANSFORMERS',
 'PHI4_TRANSFORMERS',
 'PIXTRAL_12B_MLX',
 'PIXTRAL_12B_TRANSFORMERS',
 'QWEN25_VL_3B_MLX',
 'ResponseFormat',
 'SMOLDOCLING_MLX',
 'SMOLDOCLING_TRANSFORMERS',
 'SMOLDOCLING_VLLM',
 'SMOLVLM256_MLX',
 'SMOLVLM256_TRANSFORMERS',
 'SMOLVLM256_VLLM',
 'TransformersModelType',
 'TransformersPromptStyle',
 'VlmModelType',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_log',
 'logging']

In [6]:
# Convert every PDF under `input_folder` with Docling's VLM pipeline and save
# a Markdown and a JSON export per document into `output_folder`.
import json  # imported once here instead of on every loop iteration

pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.GRANITEDOCLING_MLX  # or a model appropriate to your hardware
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
input_folder = "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Phase 1/Contracts_Initial"
output_folder = "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Phase 1/Docling_Vlm_parsed"
os.makedirs(output_folder, exist_ok=True)

# Match the extension case-insensitively so this method is evaluated on the
# same set of documents as the other methods in this notebook (which also pick
# up ".PDF"). Sorting makes the processing order reproducible.
pdf_files = sorted(
    path
    for path in glob.glob(os.path.join(input_folder, "**", "*"), recursive=True)
    if path.lower().endswith(".pdf") and os.path.isfile(path)
)

for filepath in pdf_files:
    result = converter.convert(filepath)
    # NOTE(review): PDFs that share a file name in different subfolders map to
    # the same output names, so the later one overwrites the earlier one.
    base = os.path.splitext(os.path.basename(filepath))[0]

    # Build both exports before opening the output files, so a failing export
    # does not leave an empty file behind.
    markdown_text = result.document.export_to_markdown()
    document_dict = result.document.export_to_dict()

    # Save Markdown
    with open(os.path.join(output_folder, f"{base}.md"), "w", encoding="utf-8") as f:
        f.write(markdown_text)

    # Save JSON
    with open(os.path.join(output_folder, f"{base}.json"), "w", encoding="utf-8") as f:
        json.dump(document_dict, f, indent=2)

print(f"Processed {len(pdf_files)} PDFs with Docling.")


2025-11-03 10:54:55,829 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-03 10:54:55,907 - INFO - Going to convert document batch...
2025-11-03 10:54:55,908 - INFO - Initializing pipeline for VlmPipeline with options hash 0b46d09deed704fae432713af4b78bcd
2025-11-03 10:54:55,941 - INFO - Loading plugin 'docling_defaults'
2025-11-03 10:54:55,943 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-03 10:55:07,033 - INFO - Processing document NMA-Alberta-Province.pdf
2025-11-03 11:07:27,035 - INFO - Finished converting document NMA-Alberta-Province.pdf in 751.23 sec.
2025-11-03 11:07:27,290 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-03 11:07:27,307 - INFO - Going to convert document batch...
2025-11-03 11:07:27,308 - INFO - Processing document NWR Edmonton Exchanger.pdf
2025-11-03 11:23:49,273 - INFO - Finished converting document NWR Edmonton Exchanger.pdf in 981.94 sec.
2025-11-03 11:23:49,602 - INFO - detected formats: [<InputFormat.PDF: 'pdf

Processed 6 PDFs with Docling.


### Dolphin_New (GOT-OCR2.0 via Hugging Face Transformers)

In [None]:
import os, glob

# Sanity check: count the PDFs under `input_folder` and show the first few.
input_folder = "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Phase 1/Contracts_Initial"

# The extension is compared case-insensitively, so every spelling of ".pdf"
# is counted exactly once; sorting keeps the preview stable between runs.
files = sorted(
    path
    for path in glob.glob(os.path.join(input_folder, "**", "*"), recursive=True)
    if path.lower().endswith(".pdf") and os.path.isfile(path)
)
print("PDFs found:", len(files))
print(files[:5])


In [None]:
import os
import fitz  # PyMuPDF
from PIL import Image
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
import torch


# ========== CONFIGURATION ==========
# Settings read by the functions defined later in this cell.
input_folder = "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Phase 1/Contracts_Initial"  # walked recursively for PDFs
output_folder = "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Phase 1/Dolphin_New"  # one .md file per PDF is written here
model_name = "ucaslcl/GOT-OCR2_0"  # model identifier handed to from_pretrained below
device = "cpu"  # force CPU mode
dpi = 150       # lower DPI = faster rendering
# ===================================


# ========== LOAD MODEL ==========
# NOTE(review): trust_remote_code=True allows Python code shipped with the
# model repository to run locally - confirm the source is trusted and consider
# pinning a revision.
# NOTE(review): confirm that this model's remote code really supports CPU
# execution; the `device` value is also passed to model.chat later in this cell.
print("Loading GOT-OCR2.0 model (CPU mode)...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device)
model.eval()  # put the model into evaluation mode before running OCR
print("Model loaded.\n")
# ===================================


def pdf_to_images(pdf_path, dpi=150):
    """Render every page of a PDF to a PIL image.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).
        dpi: Target resolution. Pages are scaled by ``dpi / 72`` in both
            directions (72 is presumably the PDF base resolution in points
            per inch), so a higher value gives larger images.

    Returns:
        A list of ``(page_num, image)`` tuples in page order, where
        ``page_num`` is the zero-based page index and ``image`` is the
        rendered page as an RGB ``PIL.Image``.

    Raises:
        Whatever PyMuPDF or PIL raise while opening or rendering; the
        document is closed before the exception propagates.
    """
    # The scale matrix is identical for every page, so build it once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    pages = []
    # The context manager closes the document even if rendering a page fails,
    # so the file handle is not leaked on errors.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            # alpha=False keeps the pixmap at three channels, matching the
            # "RGB" mode handed to Image.frombytes below.
            pix = doc[page_num].get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            pages.append((page_num, img))
    return pages


def process_pdf(pdf_path):
    """Convert PDF pages to markdown using GOT-OCR2.0.

    Each page is rendered with ``pdf_to_images`` at the module-level ``dpi``,
    written to a temporary PNG, and passed to ``model.chat`` by file path.

    Args:
        pdf_path: Path of the PDF to convert.

    Returns:
        One string with a ``## Page N`` section (1-based) per successfully
        processed page, each followed by a ``---`` separator. Pages whose OCR
        call raised are reported on stdout and left out of the result.
    """
    import tempfile  # local import: only this function needs it

    print(f"Processing {os.path.basename(pdf_path)}")
    pages = pdf_to_images(pdf_path, dpi)
    all_md = []

    # The page PNGs go into a private temporary directory rather than the
    # current working directory: that directory may not be writable, an earlier
    # cell changes it, and a fixed name such as "temp_page_0.png" could
    # overwrite (and then delete) an unrelated file or clash with a second run.
    with tempfile.TemporaryDirectory(prefix="got_ocr_pages_") as tmp_dir:
        for page_num, img in tqdm(pages, desc="Pages", leave=False):
            temp_path = os.path.join(tmp_dir, f"temp_page_{page_num}.png")
            img.save(temp_path)

            try:
                # The target device is passed straight to model.chat.
                # NOTE(review): confirm the model's remote `chat` implementation
                # accepts a `device` keyword; if it does not, the resulting
                # error is caught below and every page is reported as failed.
                with torch.no_grad():
                    md_text = model.chat(
                        tokenizer,
                        temp_path,
                        ocr_type="format",  # markdown-like
                        device=device,
                    )
            except Exception as e:
                # Report the 1-based page number so it matches the "## Page N"
                # headings used in the output.
                print(f"⚠️ Error on page {page_num + 1}: {e}")
            else:
                all_md.append(f"## Page {page_num+1}\n\n{md_text}\n\n---\n\n")
            finally:
                # Remove each PNG as soon as it is no longer needed to keep
                # disk usage low; the directory itself is removed on exit.
                if os.path.exists(temp_path):
                    os.remove(temp_path)

    return "".join(all_md)


def main():
    """Run GOT-OCR2.0 over every PDF below ``input_folder``.

    Creates ``output_folder`` if needed, walks ``input_folder`` recursively for
    files whose name ends in ".pdf" (any letter case), converts each one with
    ``process_pdf`` and writes the result to ``<pdf name without extension>.md``
    in ``output_folder``. Prints a warning and returns early when no PDF is
    found.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Collect the full path of every PDF, in the order os.walk yields them.
    pdf_files = []
    for root, _, names in os.walk(input_folder):
        for name in names:
            if name.lower().endswith(".pdf"):
                pdf_files.append(os.path.join(root, name))

    if len(pdf_files) == 0:
        print("⚠️ No PDFs found.")
        return

    for pdf_path in pdf_files:
        markdown = process_pdf(pdf_path)

        # Output file: same base name as the PDF, with a ".md" extension.
        stem, _extension = os.path.splitext(os.path.basename(pdf_path))
        out_path = os.path.join(output_folder, stem + ".md")

        with open(out_path, "w", encoding="utf-8") as handle:
            handle.write(markdown)
        print(f"✅ Saved: {out_path}\n")


# Start the batch conversion when this code is executed as the top-level module.
if __name__ == "__main__":
    main()


### Docling: fixing the "Appendix F" heading issue with a pipeline tweak

In [7]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.datamodel.base_models import InputFormat
import os, glob, json

# Convert every PDF under `input_folder` with Docling's standard PDF pipeline
# (accurate table-structure mode), try to turn "APPENDIX ..." banners that were
# detected as pictures into headings, and save Markdown and JSON exports.
input_folder = "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Phase 1/Contracts_Initial"
output_folder = "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Phase 1/Docling_Tweak"
os.makedirs(output_folder, exist_ok=True)

# Configure pipeline options
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_classification = False  # disable picture classification
pipeline_options.generate_page_images = False        # optionally disable generating full page images
# Table processing adjustments
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

# Create converter with these options
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

# Process files. The extension is matched case-insensitively so this run covers
# the same documents as the other methods in this notebook (which also pick up
# ".PDF"); sorting makes the processing order reproducible.
pdf_files = sorted(
    path
    for path in glob.glob(os.path.join(input_folder, "**", "*"), recursive=True)
    if path.lower().endswith(".pdf") and os.path.isfile(path)
)
for filepath in pdf_files:
    result = converter.convert(filepath)
    # NOTE(review): PDFs that share a file name in different subfolders map to
    # the same output names, so the later one overwrites the earlier one.
    base = os.path.splitext(os.path.basename(filepath))[0]

    # Post-processing: detect header banner treated as picture and convert
    # NOTE(review): this only fires for picture items that expose an `ocr_text`
    # attribute; `getattr(..., None)` silently skips items without one, so
    # confirm that Docling's picture items carry OCR text under that name -
    # otherwise this loop never changes the document.
    # NOTE(review): the replacement instantiates the picture's own class with
    # heading-style fields and edits `doc.pictures` / `doc.texts` directly. The
    # original author marked it as a simplified example to adjust; verify it
    # against the docling-core document API before relying on it.
    doc = result.document
    for pic in list(doc.pictures):  # iterate over a copy; the list is modified below
        text = getattr(pic, "ocr_text", None)
        if text and text.strip().upper().startswith("APPENDIX"):
            # Convert this picture item into a heading
            # Simplified example: remove from pictures, add to texts as heading
            doc.pictures.remove(pic)
            doc.texts.append(
                type(pic)(  # using class of picture? adjust accordingly
                    text=text,
                    label="HEADING",
                    heading_level=2,
                    bbox=pic.bbox,
                    parent=pic.parent,
                    provenance=pic.provenance
                )
            )

    # Build both exports before opening the output files, so a failing export
    # does not leave an empty file behind.
    markdown_text = doc.export_to_markdown()
    document_dict = doc.export_to_dict()

    # Save Markdown
    with open(os.path.join(output_folder, f"{base}.md"), "w", encoding="utf-8") as f:
        f.write(markdown_text)
    # Save JSON
    with open(os.path.join(output_folder, f"{base}.json"), "w", encoding="utf-8") as f:
        json.dump(document_dict, f, indent=2)

print(f"Processed {len(pdf_files)} PDFs with adjusted pipeline.")


2025-11-03 12:08:40,020 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-03 12:08:40,026 - INFO - Going to convert document batch...
2025-11-03 12:08:40,027 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f9730ffaa6e7f8d4fb0c98c8df3f18cb
2025-11-03 12:08:40,077 - INFO - Loading plugin 'docling_defaults'
2025-11-03 12:08:40,085 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-03 12:08:46,424 - INFO - Auto OCR model selected ocrmac.
2025-11-03 12:08:46,434 - INFO - Accelerator device: 'mps'
2025-11-03 12:08:54,317 - INFO - Accelerator device: 'mps'
2025-11-03 12:08:55,021 - INFO - Processing document NMA-Alberta-Province.pdf


len(pages)=1, 0-0
len(valid_pages)=1
len(valid_page_images)=1
len(pages)=4, 1-4
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 5-8
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 9-12
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 13-16
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 17-20
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 21-24
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 25-28
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 29-32
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 33-36
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 37-40
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=2, 41-42
len(valid_pages)=2
len(valid_page_images)=2


2025-11-03 12:09:19,005 - INFO - Finished converting document NMA-Alberta-Province.pdf in 38.99 sec.
2025-11-03 12:09:19,450 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-03 12:09:19,472 - INFO - Going to convert document batch...
2025-11-03 12:09:19,473 - INFO - Processing document NWR Edmonton Exchanger.pdf


len(pages)=1, 0-0
len(valid_pages)=1
len(valid_page_images)=1
len(pages)=3, 1-3
len(valid_pages)=3
len(valid_page_images)=3
len(pages)=4, 4-7
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 8-11
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 12-15
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 16-19
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 20-23
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 24-27
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 28-31
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 32-35
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 36-39
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 40-43
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 44-47
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 48-51
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 52-55
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 56-59
len(valid_pages)=4
len(vali

2025-11-03 12:11:09,556 - INFO - Finished converting document NWR Edmonton Exchanger.pdf in 110.11 sec.
2025-11-03 12:11:10,333 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-03 12:11:10,362 - INFO - Going to convert document batch...
2025-11-03 12:11:10,362 - INFO - Processing document C1000776 CO#8 Combined - Labour Rates update June 1, 2025.pdf


len(pages)=1, 0-0
len(valid_pages)=1
len(valid_page_images)=1
len(pages)=1, 1-1
len(valid_pages)=1
len(valid_page_images)=1
len(pages)=4, 2-5
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 6-9
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=1, 10-10
len(valid_pages)=1
len(valid_page_images)=1


2025-11-03 12:12:16,122 - INFO - Finished converting document C1000776 CO#8 Combined - Labour Rates update June 1, 2025.pdf in 65.79 sec.
2025-11-03 12:12:17,245 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-03 12:12:17,254 - INFO - Going to convert document batch...
2025-11-03 12:12:17,254 - INFO - Processing document C1000776 CO#6 Combined - Labour and Equipment Rates and Extension to Feb 21 2027.pdf


len(pages)=1, 0-0
len(valid_pages)=1
len(valid_page_images)=1
len(pages)=4, 1-4
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 5-8
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 9-12
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=1, 13-13
len(valid_pages)=1
len(valid_page_images)=1


2025-11-03 12:13:43,038 - INFO - Finished converting document C1000776 CO#6 Combined - Labour and Equipment Rates and Extension to Feb 21 2027.pdf in 85.80 sec.
2025-11-03 12:13:44,303 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-03 12:13:44,312 - INFO - Going to convert document batch...
2025-11-03 12:13:44,312 - INFO - Processing document Boilermakers Collective-Agreement.pdf


len(pages)=1, 0-0
len(valid_pages)=1
len(valid_page_images)=1
len(pages)=4, 1-4
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 5-8
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 9-12
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 13-16
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 17-20
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 21-24
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 25-28
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 29-32
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 33-36
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 37-40
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 41-44
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 45-48
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 49-52
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 53-56
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 57-60
len(valid_pages)=4
len(vali

2025-11-03 12:14:17,598 - INFO - Finished converting document Boilermakers Collective-Agreement.pdf in 33.30 sec.
2025-11-03 12:14:17,890 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-03 12:14:17,896 - INFO - Going to convert document batch...
2025-11-03 12:14:17,896 - INFO - Processing document Pipefitters-Collective-Agreement_April_2025-CLEAN_updated_May-22-onfile.pdf


len(pages)=1, 0-0
len(valid_pages)=1
len(valid_page_images)=1
len(pages)=3, 1-3
len(valid_pages)=3
len(valid_page_images)=3
len(pages)=4, 4-7
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 8-11
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 12-15
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 16-19
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 20-23
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 24-27
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 28-31
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 32-35
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 36-39
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 40-43
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 44-47
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 48-51
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 52-55
len(valid_pages)=4
len(valid_page_images)=4
len(pages)=4, 56-59
len(valid_pages)=4
len(vali

2025-11-03 12:15:36,114 - INFO - Finished converting document Pipefitters-Collective-Agreement_April_2025-CLEAN_updated_May-22-onfile.pdf in 78.23 sec.


Processed 6 PDFs with adjusted pipeline.
