## 1) Import Libraries

In [1]:
import time
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
from pathlib import Path
from IPython.display import display

from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions
)

  from .autonotebook import tqdm as notebook_tqdm


## 2) Example via document

In [2]:
# Convert the sample PDF with docling's default pipeline and report how long
# the conversion (including converter construction) takes.
# perf_counter() is a monotonic clock intended for measuring durations, so the
# result is unaffected by system clock adjustments during the long-running parse.
start_time = time.perf_counter()
source = "documents/PDF-IAS-2025-10-CompleteIssue.pdf"

converter = DocumentConverter()
result = converter.convert(source)

# Duration in seconds (not a timestamp).
elapsed_seconds = time.perf_counter() - start_time

print(f'Total time take for parsing {elapsed_seconds:.2f} seconds.')

2025-09-28 14:39:55,833 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-28 14:39:55,868 - INFO - Going to convert document batch...
2025-09-28 14:39:55,868 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e647edf348883bed75367b22fbe60347
2025-09-28 14:39:55,880 - INFO - Loading plugin 'docling_defaults'
2025-09-28 14:39:55,880 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-09-28 14:39:55,894 - INFO - Loading plugin 'docling_defaults'
2025-09-28 14:39:55,903 - INFO - Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-09-28 14:39:56,034 - INFO - Accelerator device: 'cpu'
2025-09-28 14:39:57,745 - INFO - Accelerator device: 'cpu'
2025-09-28 14:39:58,919 - INFO - Accelerator device: 'cpu'
2025-09-28 14:39:59,336 - INFO - Processing document PDF-IAS-2025-10-CompleteIssue.pdf
2025-09-28 14:42:06,436 - INFO - Finished converting document PDF-IAS-2025-10-CompleteIssue.pdf in 130.61 sec.


Total time take for parsing 130.61 seconds.


In [3]:
# Render the converted document as Markdown and print the full text.
markdown_text = result.document.export_to_markdown()
print(markdown_text)

<!-- image -->

October 2025

Charles Schwab Corporation (SCHW) Nvidia Corporation (NVDA) ► New to Service

## INVESTMENT COMMENTS

Markets  have  staged  a  strong  recovery from  the  April  lows,  with  the  S&amp;P  500  up more  than  30%  and  hovering  around  alltime  highs.  Though  there  has  been  a  recent broadening out in anticipation of interest rate cuts at the Fed's September meeting, the market has been largely driven by narrow leadership tied to the AI theme. Oracle's  most  recent  quarterly  earnings  announcement  simply  underscored  this,  as the company highlighted key contract wins related to its role as a provider of AI computing  capacity  that  are  projected  to  send revenue in its cloud-computing business up 700%  over  the  next  three  years.  Shares surged  more  than  35%  and  Oracle  added approximately  $250  billion  to  its  market capitalization. AI remains very much at the forefront of investors' minds.

However, the July and August labor rep

## 3) Advanced Features of docling

### 3.1) Parser Pipeline

In [None]:
# Scale factor applied to rendered page/picture images.
IMAGE_RESOLUTION_SCALE = 2.0

input_doc_path = Path(source)

# Ask the PDF pipeline to keep page renders and cropped picture images so they
# can be displayed and saved in later cells.
pipeline_options = PdfPipelineOptions(
    images_scale=IMAGE_RESOLUTION_SCALE,
    generate_page_images=True,
    generate_picture_images=True,
)

pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

In [None]:
# Convert the document with the image-generating pipeline and time it.
# perf_counter() is a monotonic clock intended for measuring durations.
start_time = time.perf_counter()
conv_res = doc_converter.convert(input_doc_path)

# Duration in seconds (not a timestamp).
elapsed_seconds = time.perf_counter() - start_time
print(f'Total time take for parsing {elapsed_seconds:.2f} seconds.')

### 3.2) Extract metadata

In [None]:
# Summarise the converted document.
# The item collections are read directly from the document object instead of
# serialising the whole document to a dict three times just to count entries.
conv_doc = conv_res.document

print('Document Name: ', conv_doc.origin.filename)
print('Document Type: ', conv_doc.origin.mimetype)
print('Number of Pages: ', len(conv_doc.pages))
print('Number of Text Elements: ', len(conv_doc.texts))
print('Number of Tables: ', len(conv_doc.tables))
print('Number of Images: ', len(conv_doc.pictures))

#### 3.2.1) Iterate through text units

In [None]:
# Serialise the document once; later cells reuse `results_body` for tables and
# pictures. model_dump() is the pydantic v2 replacement for the deprecated .dict().
results_body = conv_res.document.model_dump()

dict_list = []

texts = results_body['texts']
for t in texts:
    # Some items may carry no provenance; record the page as None rather than
    # failing with an IndexError on prov[0].
    prov = t.get('prov') or []
    page = prov[0]['page_no'] if prov else None

    dict_list.append({
        'text_reference': t['self_ref'],
        'page': page,
        'text_content(first 500 chars)': t['text'][:500],
    })

In [None]:
# Tabulate the collected text records; the bare name renders the frame.
df = pd.DataFrame(data=dict_list)

df

#### 3.2.2) Iterate through Tables

In [None]:
# Collect one summary record per table from the serialised document.
tables = results_body['tables']
dict_list = []

for t in tables:
    # Guard against tables without provenance instead of indexing prov[0] blindly.
    prov = t.get('prov') or []
    page = prov[0]['page_no'] if prov else None

    # Resolve the first caption reference, falling back to a placeholder.
    table_captions = t.get('captions') or []
    if table_captions and 'cref' in table_captions[0]:
        table_caption = table_captions[0]['cref']
    else:
        table_caption = 'No Caption'

    dict_list.append({
        'table_reference': t['self_ref'],
        'page': page,
        # Store the resolved caption; previously it was computed but the raw
        # caption list was stored instead.
        'table_captions': table_caption,
        'table_data(first 100 chars)': str(t['data'])[:100],
    })

In [None]:
# Tabulate the collected table records; the bare name renders the frame.
df = pd.DataFrame(data=dict_list)

df

#### 3.2.3) Iterate through Images

In [None]:
# Collect one summary record per picture from the serialised document.
images = results_body['pictures']
dict_list = []

for picture in images:
    # Guard against pictures without provenance instead of indexing prov[0] blindly.
    prov = picture.get('prov') or []
    page = prov[0]['page_no'] if prov else None

    # Resolve the first caption reference, falling back to a placeholder.
    image_captions = picture.get('captions') or []
    if image_captions and 'cref' in image_captions[0]:
        image_caption = image_captions[0]['cref']
    else:
        image_caption = 'No Caption'

    # Read the image payload from the current picture. The previous code used
    # `t`, a loop variable left over from an earlier cell.
    image_data = str(picture['image'])[:100]

    dict_list.append({
        'image_reference': picture['self_ref'],
        'page': page,
        'image_captions': image_caption,
        'image_data(first 100 chars)': image_data,
    })

In [None]:
# Tabulate the collected picture records; the bare name renders the frame.
df = pd.DataFrame(data=dict_list)

df

### 3.3) Iterate through the pages of the document

In [None]:
## Display the pages

def display_images(images, images_per_row=5, figsize=(15,8)):
    """Show a collection of images in a grid of matplotlib subplots.

    Args:
        images: Images accepted by ``Axes.imshow``, given either as an
            iterable or as a dict (only the values are plotted).
        images_per_row: Number of grid columns; must be at least 1.
        figsize: Size of the whole figure, passed to ``plt.subplots``.

    Raises:
        ValueError: If ``images_per_row`` is smaller than 1.

    Returns:
        None. The figure is shown with ``plt.show()``. When there is nothing
        to plot, a short message is printed and no figure is created.
    """
    if images_per_row < 1:
        raise ValueError('images_per_row must be at least 1')

    # handle dictionary or any other iterable input
    if isinstance(images, dict):
        images = list(images.values())
    else:
        images = list(images)

    # plt.subplots cannot build a grid with zero rows, so stop early
    if not images:
        print('No images to display.')
        return

    # calculate number of rows needed
    num_rows = math.ceil(len(images) / images_per_row)

    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a 1x1 grid (where subplots would otherwise return a bare Axes)
    fig, axes = plt.subplots(num_rows, images_per_row, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    # plot images
    for ax, image in zip(axes, images):
        ax.imshow(image)
        ax.axis('off')

    # turn off unused axes
    for ax in axes[len(images):]:
        ax.axis('off')

    # adjust layout and display
    plt.tight_layout()
    plt.show()

In [None]:
# Map each page number to its rendered PIL image, then show the pages in a grid.
page_images = dict(
    (number, doc_page.image.pil_image)
    for number, doc_page in conv_res.document.pages.items()
)
display_images(page_images, images_per_row=5, figsize=(15,8))

#### Save the page images to directory

In [None]:
# Write every rendered page to ./pages as "<page number>.png".
dir_path = './pages'
os.makedirs(dir_path, exist_ok=True)

for page in conv_res.document.pages.values():
    page_no = page.page_no
    target_path = os.path.join(dir_path, f"{page_no}.png")
    with open(target_path, 'wb') as fp:
        page.image.pil_image.save(fp, format='PNG')

### 3.4) Extract all the images from the document

In [None]:
# Save every picture found in the document to ./images as "<n>.png" and keep
# the images in memory for display in the next cell.
dir_path = './images'
os.makedirs(dir_path, exist_ok=True)

images_list = []
image_number = 1

for element, _level in conv_res.document.iterate_items():
    if not isinstance(element, PictureItem):
        continue

    # Fetch the image before opening the output file: get_image() may return
    # None, and opening first would leave an empty .png behind before failing.
    image = element.get_image(conv_res.document)
    if image is not None:
        element_image_filename = os.path.join(dir_path, str(image_number)) + '.png'
        with open(element_image_filename, 'wb') as fp:
            image.save(fp, 'PNG')
        images_list.append(image)

    # Numbering follows picture order in the document, including skipped ones.
    image_number += 1

In [None]:
display_images(images_list, images_per_row=5, figsize=(15,8))

### 3.5) Extract all tables from the document

In [None]:
# Export every table in the document three ways: as a cropped image, as CSV,
# and as HTML. Files are numbered by table order in the document.
dir_path = './tables'
dir_paths = ['./tables/images', './tables/CSVs', './tables/HTMLs']

# A plain loop, since the calls are made only for their side effect.
for path in dir_paths:
    os.makedirs(path, exist_ok=True)

table_list = []
table_number = 1

for element, _level in conv_res.document.iterate_items():
    if not isinstance(element, TableItem):
        continue

    # saving tables as images
    # Fetch the image before opening the file: get_image() may return None,
    # and opening first would leave an empty .png behind before failing.
    table_image = element.get_image(conv_res.document)
    if table_image is not None:
        element_table_filename = os.path.join(dir_path, 'images', str(table_number)) + '.png'
        with open(element_table_filename, 'wb') as fp:
            table_image.save(fp, 'PNG')
        table_list.append(table_image)

    # saving tables as CSV files
    # Passing the owning document avoids the deprecated doc-less export.
    table_df: pd.DataFrame = element.export_to_dataframe(doc=conv_res.document)
    element_table_filename = os.path.join(dir_path, 'CSVs', str(table_number)) + '.csv'
    table_df.to_csv(element_table_filename)

    # saving tables as HTML files
    table_html = element.export_to_html(doc=conv_res.document)
    element_table_filename = os.path.join(dir_path, 'HTMLs', str(table_number)) + '.html'
    # Explicit UTF-8 so non-ASCII cell text does not depend on the platform default.
    with open(element_table_filename, 'w', encoding='utf-8') as fp:
        fp.write(table_html)

    table_number += 1

In [None]:
display_images(table_list, images_per_row=5, figsize=(15,8))

## 4) OCR pipeline

In [None]:
def OCR_parsing(doc_path):
    """Convert a PDF with full-page EasyOCR and table-structure recognition.

    Args:
        doc_path: Path to the PDF to convert (anything accepted by ``Path``).

    Returns:
        The result object returned by ``DocumentConverter.convert``.

    Prints the time spent building the converter and converting the document.
    """
    input_doc = Path(doc_path)

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    # OCR every page in full rather than relying on the PDF's embedded text.
    pipeline_options.ocr_options = EasyOcrOptions(force_full_page_ocr=True)
    pipeline_options.generate_page_images = True

    # perf_counter() is a monotonic clock intended for measuring durations.
    start_time = time.perf_counter()

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    result = converter.convert(input_doc)
    elapsed_seconds = time.perf_counter() - start_time
    # ".2f" without the space flag: the old "{: .2f}" spec printed a double
    # space before the number, unlike the notebook's other timing messages.
    print(f'Total time take for parsing {elapsed_seconds:.2f} seconds.')
    return result

In [None]:
# Run the OCR pipeline on the sample document and print its Markdown export
# beneath a heading line.
OCR_result = OCR_parsing(source)
extract = OCR_result.document.export_to_markdown()

print('Docling Extract', extract, sep='\n')

In [None]:
# Show the original page renders (two per row, large) for comparison with the
# OCR extract.
page_images = dict(
    (number, doc_page.image.pil_image)
    for number, doc_page in OCR_result.document.pages.items()
)
print('Actual Document')
display_images(page_images, images_per_row=2, figsize=(100,50))