## 1) Import Libraries

In [None]:
import time
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
from pathlib import Path
from IPython.display import display

from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions
)

## 2) Example via document

In [None]:
# Convert the sample PDF with a DocumentConverter created without arguments
# and report how long the cell took.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; it is not affected by system clock adjustments.
start_time = time.perf_counter()
source = "documents/PDF-IAS-2025-10-CompleteIssue.pdf"

converter = DocumentConverter()
result = converter.convert(source)

# NOTE: despite its name, end_time holds the elapsed seconds.
end_time = time.perf_counter() - start_time

print(f'Total time take for parsing {end_time:.2f} seconds.')

In [None]:
print(result.document.export_to_markdown())

## 3) Advanced Features of docling

### 3.1) Parser Pipeline

In [None]:
IMAGE_RESOLUTION_SCALE = 2.0

input_doc_path = Path(source)

# PDF pipeline options: images_scale set to IMAGE_RESOLUTION_SCALE, with
# generation of page images and picture images switched on.
pipeline_options = PdfPipelineOptions(
    images_scale=IMAGE_RESOLUTION_SCALE,
    generate_page_images=True,
    generate_picture_images=True,
)

# Converter that applies the options above to PDF inputs.
doc_converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
)

In [None]:
# Run the image-enabled converter and report the elapsed time.
# perf_counter() is a monotonic clock intended for interval measurement.
start_time = time.perf_counter()
conv_res = doc_converter.convert(input_doc_path)
end_time = time.perf_counter() - start_time  # elapsed seconds, not a timestamp
print(f'Total time take for parsing {end_time:.2f} seconds.')

### 3.2) Extract metadata

In [None]:
# Print basic facts about the converted document.
# The element counts are read straight from the document's collections, so the
# whole document does not have to be serialised to a dict for each count.
print('Document Name: ', conv_res.document.origin.filename)
print('Document Type: ', conv_res.document.origin.mimetype)
print('Number of Pages: ', len(conv_res.document.pages))
print('Number of Text Elements: ', len(conv_res.document.texts))
print('Number of Text Tables: ', len(conv_res.document.tables))
print('Number of Text Images: ', len(conv_res.document.pictures))

#### 3.2.1) Iterate through text units

In [None]:
# Serialise the document once; the cells below read 'tables' and 'pictures'
# from the same dict.
results_body = conv_res.document.dict()
texts = results_body['texts']

# One summary row per text element: its reference, the page number of its
# first provenance entry, and the first 500 characters of its text.
dict_list = [
    {
        'text_reference': entry['self_ref'],
        'page': entry['prov'][0]['page_no'],
        'text_content(first 500 chars)': entry['text'][:500],
    }
    for entry in texts
]

In [None]:
df_text = pd.DataFrame(dict_list)

df_text

#### 3.2.2) Iterate through Tables

In [None]:
tables = results_body['tables']
dict_list = []

# One summary row per table: reference, page of the first provenance entry,
# caption reference and a 100-character preview of the table data.
for t in tables:
    ref = t['self_ref']
    page = t['prov'][0]['page_no']

    # Use the 'cref' of the first caption entry when there is one,
    # otherwise fall back to a readable marker.
    table_captions = t['captions']
    if table_captions and 'cref' in table_captions[0]:
        table_caption = table_captions[0]['cref']
    else:
        table_caption = 'No Caption'
    table_data = str(t['data'])[:100]

    dict_list.append({
        'table_reference': ref,
        'page': page,
        'table_captions': table_caption,  # resolved caption, not the raw list
        'table_data(first 100 chars)': table_data,
    })

In [None]:
df_table = pd.DataFrame(dict_list)

df_table

#### 3.2.3) Iterate through Images

In [None]:
images = results_body['pictures']
dict_list = []

# One summary row per picture: reference, page of the first provenance entry,
# caption reference and a 100-character preview of the image payload.
for picture in images:
    ref = picture['self_ref']
    page = picture['prov'][0]['page_no']

    # Use the 'cref' of the first caption entry when there is one,
    # otherwise fall back to a readable marker.
    image_captions = picture['captions']
    if image_captions and 'cref' in image_captions[0]:
        image_caption = image_captions[0]['cref']
    else:
        image_caption = 'No Caption'

    # The preview must come from the picture being iterated, not from a
    # variable left over from another cell.
    image_data = str(picture['image'])[:100]

    dict_list.append({
        'image_reference': ref,
        'page': page,
        'image_captions': image_caption,  # resolved caption, not the raw list
        'image_data(first 100 chars)': image_data,
    })

In [None]:
df_image = pd.DataFrame(dict_list)

df_image

### 3.3) Iterate through the pages of the document

In [None]:
## Display the pages

def display_images(images, images_per_row=5, figsize=(15,8)):
    """Show a collection of images in a grid of matplotlib subplots.

    Args:
        images: A list of images accepted by ``Axes.imshow``, or a dict whose
            values are such images (the keys are ignored).
        images_per_row: Number of grid columns.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        None. The figure is shown with ``plt.show()``; nothing is drawn when
        ``images`` is empty.
    """
    # handle dictionary or list input
    if isinstance(images, dict):
        images = list(images.values())

    # an empty collection would request a grid with zero rows, which
    # matplotlib rejects, so there is simply nothing to show
    if not images:
        return

    # calculate number of rows needed
    num_rows = math.ceil(len(images) / images_per_row)

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so that
    # flatten() is always available
    fig, axes = plt.subplots(num_rows, images_per_row, figsize=figsize, squeeze=False)
    axes = axes.flatten()  # flatten axes for easier iteration

    # plot images
    for ax, image in zip(axes, images):
        ax.imshow(image)
        ax.axis('off')

    # turn off unused axes
    for ax in axes[len(images):]:
        ax.axis('off')

    # adjust layout and display
    plt.tight_layout()
    plt.show()

In [None]:
# Collect the PIL image of every page, keyed by page number, then plot them.
page_images = {}
for page_no, page in conv_res.document.pages.items():
    page_images[page_no] = page.image.pil_image

display_images(page_images, images_per_row=5, figsize=(15,8))

#### Save the page images to directory

In [None]:
dir_path = './pages'
os.makedirs(dir_path, exist_ok=True)

# Write every page image to ./pages/<page_no>.png
for page in conv_res.document.pages.values():
    page_image_path = os.path.join(dir_path, f"{page.page_no}.png")
    with open(page_image_path, 'wb') as fp:
        page.image.pil_image.save(fp, format='PNG')

### 3.4) Extract all the images from the document

In [None]:
dir_path = './images'
os.makedirs(dir_path, exist_ok=True)

images_list = []

# Only PictureItem elements are exported; they are numbered from 1 in
# iteration order and saved as ./images/<n>.png
picture_items = (
    item
    for item, _level in conv_res.document.iterate_items()
    if isinstance(item, PictureItem)
)
for image_number, element in enumerate(picture_items, start=1):
    element_image_filename = os.path.join(dir_path, str(image_number)) + '.png'
    with open(element_image_filename, 'wb') as fp:
        image = element.get_image(conv_res.document)
        image.save(fp, 'PNG')
        images_list.append(image)  # kept in memory for plotting

In [None]:
display_images(images_list, images_per_row=5, figsize=(15,8))

### 3.5) Extract all tables from the document

In [None]:
dir_path = './tables'
dir_paths = ['./tables/images', './tables/CSVs', './tables/HTMLs']

# Create the output folders. A plain loop is used because makedirs is called
# only for its side effect.
for path in dir_paths:
    os.makedirs(path, exist_ok=True)

table_list = []
table_number = 1

# Export every TableItem three ways (PNG, CSV, HTML), numbered from 1 in
# iteration order.
for element, _level in conv_res.document.iterate_items():
    if not isinstance(element, TableItem):
        continue

    # saving tables as images
    element_table_filename = os.path.join(dir_path, 'images', str(table_number)) + '.png'
    with open(element_table_filename, 'wb') as fp:
        table_image = element.get_image(conv_res.document)
        table_image.save(fp, 'PNG')
        table_list.append(table_image)  # kept in memory for plotting

    # saving tables as CSV files
    table_df: pd.DataFrame = element.export_to_dataframe()
    element_table_filename = os.path.join(dir_path, 'CSVs', str(table_number)) + '.csv'
    table_df.to_csv(element_table_filename)

    # saving tables as HTML files; the encoding is explicit so the output
    # does not depend on the platform's default encoding
    table_html = element.export_to_html()
    element_table_filename = os.path.join(dir_path, 'HTMLs', str(table_number)) + '.html'
    with open(element_table_filename, 'w', encoding='utf-8') as fp:
        fp.write(table_html)

    table_number += 1

In [None]:
display_images(table_list, images_per_row=5, figsize=(15,8))

## OCR pipeline

## Using Visual Language Model / Multi-Modal on images

In [None]:
import base64
import re
import textwrap
from io import BytesIO
from pathlib import Path

import numpy as np

from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
    smolvlm_picture_description
)

from IPython.display import HTML, display
from PIL import Image

### Convert PDF to markdown

In [None]:
# Pipeline options for the picture-description run: page images at scale 1.0,
# OCR enabled through EasyOCR with force_full_page_ocr=True, and picture
# description enabled using the imported smolvlm_picture_description options.
pipeline_options = PdfPipelineOptions(
    generate_page_images=True,
    images_scale=1.00,
    do_ocr=True,
    do_picture_description=True,
    ocr_options=EasyOcrOptions(force_full_page_ocr=True),
    picture_description_options=smolvlm_picture_description,
)

In [None]:
# Rebind `converter` to one that applies the current `pipeline_options` to PDF inputs.
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

In [None]:
# Convert the same source file again with the current `converter`.
result = converter.convert(source)

# Keep the converted document under a short name; the cells below read it.
document = result.document


In [None]:
print(document.export_to_markdown())

### Visual Inspection

In [None]:
def image_to_base64(image: Image.Image) -> str:
    """Return *image* saved as PNG and encoded as a base64 text string."""
    with BytesIO() as png_buffer:
        image.save(png_buffer, format='PNG')
        png_bytes = png_buffer.getvalue()

    encoded = base64.b64encode(png_bytes)
    return encoded.decode('utf-8')

def display_page(content: str, image: Image.Image):
    """Render a page image and its text side by side in the notebook output.

    Args:
        content: Text shown in the right-hand column (for example a page's
            markdown export). It is HTML-escaped, so it is displayed literally.
        image: Image shown in the left-hand column, embedded as a base64 PNG.
    """
    import html  # local import: only needed for escaping here

    # Escape the text so characters such as '<' and '&' are displayed as-is
    # instead of being interpreted as markup.
    safe_content = html.escape(content)

    html_template = f"""
    <div style="display: flex; align-items: flex-start; gap: 40px; font-family: monospace;">
        <div style="flex: 1; max-width: 45%;">
            <img src="data:image/png;base64,{image_to_base64(image)}" style="width: 100%; height: auto; padding: 5px;">
        </div>
        <div style="flex: 1; max-width: 45%; white-space: pre-wrap; padding: 10px;">
            <div style="word-wrap: break-word; max-width: 120ch;">
                {safe_content}
            </div>
        </div>
    </div>
    """

    display(HTML(html_template))


In [None]:
n_pages = len(document.pages)
n_pages

In [None]:
# One (markdown text, page image) pair per page, for page numbers 1..n_pages.
pages = [
    (
        document.export_to_markdown(page_no=page_num),
        document.pages[page_num].image.pil_image,
    )
    for page_num in range(1, n_pages + 1)
]

In [None]:
display_page(*pages[1])

### Image Annotations

In [None]:
document.pictures[0].annotations

In [None]:
len(document.pictures)

In [None]:
annotations = []

# Annotations are numbered across all pictures, pictures from 1.
annotationscount = 1

for picturecount, picture in enumerate(document.pictures, start=1):
    print(f'processing for picture: {picturecount}')

    for annotation in picture.annotations:
        print(f'processing for annotation: {annotationscount}')
        print(annotation.text)
        annotations.append(annotation.text)
        annotationscount += 1

In [None]:
len(annotations)

In [None]:
annotations

In [None]:
assert len(annotations) == len(document.pictures)

In [None]:
def replace_occurrences(text, target, replacements):
    """Replace successive occurrences of *target* in *text*, one per replacement.

    The first replacement takes the place of the first occurrence, the second
    replacement the next occurrence after it, and so on. Occurrences left over
    once the replacements run out stay in the text unchanged.

    The search continues after each replaced occurrence, so inserted text is
    never searched again: a replacement that itself contains *target* does not
    use up a later replacement.

    Args:
        text: String to search.
        target: Substring to replace.
        replacements: Iterable of strings, in the order they should be inserted.

    Returns:
        The text with the replacements applied.

    Raises:
        ValueError: If there are more replacements than remaining occurrences.
    """
    pieces = []
    cursor = 0  # index in `text` from which the next occurrence is searched

    for replacement in replacements:
        found_at = text.find(target, cursor)
        if found_at == -1:
            raise ValueError(
                f"No more occurrences of '{target}' found in the text for replacement ({replacement})"
            )
        pieces.append(text[cursor:found_at])
        pieces.append(replacement)
        cursor = found_at + len(target)

    # whatever follows the last replaced occurrence is kept verbatim
    pieces.append(text[cursor:])
    return "".join(pieces)

In [None]:
IMAGE_PLACEHOLDER = "<!-- image_placeholder -->"
PAGE_BREAK_PLACEHOLDER = "<!-- page_break -->"
# Export the markdown with explicit markers for images and page breaks.
# NOTE: idea for the IDD business case - tag each image placeholder with the
# section it belongs to, then pass that context to the VLM to enrich its prompt
# (e.g. "this image is regarding Firm AUM for the manager") so that it gives
# better annotations when it analyses the image.
text = document.export_to_markdown(
    image_placeholder=IMAGE_PLACEHOLDER, page_break_placeholder=PAGE_BREAK_PLACEHOLDER,
)

In [None]:
print(text)

In [None]:
print(replace_occurrences(text, IMAGE_PLACEHOLDER, annotations))

### Document Processing Pipeline

In [None]:
def process_document(document_path: Path, converter: DocumentConverter, n_pages: int = -1) -> str:
    """Convert a document to markdown with image placeholders replaced by annotation text.

    Args:
        document_path: Document to convert; passed to ``converter.convert``.
        converter: Converter used to parse the document.
        n_pages: Number of leading pages to keep. ``-1`` (the default) returns
            the whole text.

    Returns:
        The markdown text, with pages separated by ``PAGE_BREAK_PLACEHOLDER``.

    Raises:
        AssertionError: If the number of annotation texts differs from the
            number of pictures.
        ValueError: Raised by ``replace_occurrences`` when the markdown has
            fewer image placeholders than there are annotations.
    """
    result = converter.convert(document_path)
    document = result.document

    # Gather the text of every annotation of every picture, in document order.
    annotations = []
    for picture in document.pictures:
        annotations.extend(annotation.text for annotation in picture.annotations)

    assert len(annotations) == len(document.pictures), (
        f"expected one annotation per picture, got {len(annotations)} "
        f"annotations for {len(document.pictures)} pictures"
    )

    text = document.export_to_markdown(
        page_break_placeholder=PAGE_BREAK_PLACEHOLDER,
        image_placeholder=IMAGE_PLACEHOLDER,
    )
    # Pass the whole list of annotation texts, one per placeholder.
    text = replace_occurrences(text, IMAGE_PLACEHOLDER, annotations)
    if n_pages == -1:
        return text

    # Keep only the first n_pages pages, re-joined with the page-break marker.
    return PAGE_BREAK_PLACEHOLDER.join(text.split(PAGE_BREAK_PLACEHOLDER)[:n_pages])

In [None]:
document_text = process_document(source, converter, n_pages=4)

In [None]:
# Count whitespace-separated words. An empty-string separator is rejected by
# str.split (ValueError: empty separator).
print(len(document_text.split()))

In [None]:
print(document_text)