# Desired Chunking Strategies

- input file containing the cues for splitting the text
- add descripting context to each of the contents
- add metadata for main category such as Business, Staff etc.
- add finegrain metadata to talk about specific, like firm leadership etc. 
- need to calculate the tokens to show

## 1) Import Libraries

In [1]:
import time
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
from pathlib import Path
from IPython.display import display

from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions
)

  from .autonotebook import tqdm as notebook_tqdm


## 2) Example via document

In [2]:
start_time = time.time()
source = "documents/CC&L Q Global.pdf"

converter = DocumentConverter()
result = converter.convert(source)

end_time = time.time() - start_time

print(f'Total time take for parsing {end_time:.2f} seconds.')

2025-09-30 18:53:05,382 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 18:53:09,185 - INFO - Going to convert document batch...
2025-09-30 18:53:09,185 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e647edf348883bed75367b22fbe60347
2025-09-30 18:53:09,204 - INFO - Loading plugin 'docling_defaults'
2025-09-30 18:53:09,212 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-09-30 18:53:09,227 - INFO - Loading plugin 'docling_defaults'
2025-09-30 18:53:09,237 - INFO - Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-09-30 18:53:10,005 - INFO - Accelerator device: 'cpu'
2025-09-30 18:53:12,010 - INFO - Accelerator device: 'cpu'
2025-09-30 18:53:13,867 - INFO - Accelerator device: 'cpu'
2025-09-30 18:53:14,472 - INFO - Processing document CC&L Q Global.pdf
2025-09-30 18:57:54,280 - INFO - Finished converting document CC&L Q Global.pdf in 288.92 sec.


Total time take for parsing 288.93 seconds.


In [None]:
result.document.save_as_markdown('markdown_saved')

In [None]:
print(result.document.export_to_markdown())

## Q Global Equity)

## Aon Investment Manager Research Due Diligence Questionnaire (

## Please read these instructions before completing this questionnaire.

Should any questions not be applicable, please indicate as such by responding with 'n/a' or 'not applicable'. All market value information should be stated in millions ($USD) unless indicated otherwise. Please note the difference between individual or organizational accounts. Please enter responses to the questions in the spaces provided and/or in an attached document.

If you are responding with information on more than one product, please copy section one to four. Please provide your Due Diligence Questionnaire answer file in a Word document format (not PDF).

In the case of multiple product submission, please clearly label each product at the top of this page.

Any supporting materials must be clearly referenced to the appropriate question and appropriately labeled.

Information and supplemental attachments that are strictly 

In [16]:
mkdown = result.document.export_to_markdown()

In [18]:
# Marker string to split on
marker = "Please provide copies of the following:"

# Split into two chunks
if marker in mkdown:
    part1, part2 = mkdown.split(marker, 1)  # split only once
    # Add back the marker to the second chunk if you want it preserved
    part2 = marker + part2  
else:
    part1, part2 = mkdown, ""  # fallback if marker not found

# Create dataframe with two rows
df_chunk = pd.DataFrame({
    "Chunk": [part1.strip(), part2.strip()]
})

In [22]:
df_chunk

Unnamed: 0,Chunk
0,## Q Global Equity)\n\n## Aon Investment Manag...
1,Please provide copies of the following:\n\n| P...


In [21]:
# export
df_chunk.to_csv("df_chunk.csv", index=False)


## 3) Advanced Features of docling

### 3.1) Parser Pipeline

In [4]:
IMAGE_RESOLUTION_SCALE = 2.0

input_doc_path = Path(source)

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

In [5]:
start_time = time.time()
conv_res = doc_converter.convert(input_doc_path)
end_time = time.time() - start_time
print(f'Total time take for parsing {end_time:.2f} seconds.')

2025-09-30 18:57:54,423 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-30 18:57:54,429 - INFO - Going to convert document batch...
2025-09-30 18:57:54,431 - INFO - Initializing pipeline for StandardPdfPipeline with options hash ce2db4bc6b59e8bf84cfaffa1879c953
2025-09-30 18:57:54,433 - INFO - Accelerator device: 'cpu'
2025-09-30 18:57:56,167 - INFO - Accelerator device: 'cpu'
2025-09-30 18:57:57,389 - INFO - Accelerator device: 'cpu'
2025-09-30 18:57:57,775 - INFO - Processing document CC&L Q Global.pdf
2025-09-30 19:02:38,913 - INFO - Finished converting document CC&L Q Global.pdf in 284.49 sec.


Total time take for parsing 284.49 seconds.


### 3.2) Extract metadata

In [6]:
print('Document Name: ', conv_res.document.origin.filename)
print('Document Type: ', conv_res.document.origin.mimetype)
print('Number of Pages: ', len(conv_res.document.pages.keys()))
print('Number of Text Elements: ', len(conv_res.document.dict()['texts']))
print('Number of Text Tables: ', len(conv_res.document.dict()['tables']))
print('Number of Text Images: ', len(conv_res.document.dict()['pictures']))

Document Name:  CC&L Q Global.pdf
Document Type:  application/pdf
Number of Pages:  51
Number of Text Elements:  756
Number of Text Tables:  36
Number of Text Images:  11


C:\Users\xenow\AppData\Local\Temp\ipykernel_23792\1348740120.py:4: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  print('Number of Text Elements: ', len(conv_res.document.dict()['texts']))
C:\Users\xenow\AppData\Local\Temp\ipykernel_23792\1348740120.py:5: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  print('Number of Text Tables: ', len(conv_res.document.dict()['tables']))
C:\Users\xenow\AppData\Local\Temp\ipykernel_23792\1348740120.py:6: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration

#### 3.2.1) Iterate through text units

In [7]:
results_body = conv_res.document.dict()

dict_list = []

texts = results_body['texts']


C:\Users\xenow\AppData\Local\Temp\ipykernel_23792\4010605772.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  results_body = conv_res.document.dict()


In [8]:
texts

[{'self_ref': '#/texts/0',
  'parent': {'cref': '#/body'},
  'children': [],
  'content_layer': <ContentLayer.BODY: 'body'>,
  'label': <DocItemLabel.SECTION_HEADER: 'section_header'>,
  'prov': [{'page_no': 1,
    'bbox': {'l': 265.08,
     't': 694.3190649804687,
     'r': 379.288,
     'b': 681.4460649804688,
     'coord_origin': <CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>},
    'charspan': (0, 16)}],
  'orig': 'Q Global Equity)',
  'text': 'Q Global Equity)',
  'formatting': None,
  'hyperlink': None,
  'level': 1},
 {'self_ref': '#/texts/1',
  'parent': {'cref': '#/body'},
  'children': [],
  'content_layer': <ContentLayer.BODY: 'body'>,
  'label': <DocItemLabel.SECTION_HEADER: 'section_header'>,
  'prov': [{'page_no': 1,
    'bbox': {'l': 70.08,
     't': 710.8940649804688,
     'r': 303.818,
     'b': 681.4430649804688,
     'coord_origin': <CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>},
    'charspan': (0, 61)}],
  'orig': 'Aon Investment Manager Research Due Diligence Questionnaire (',
  

In [9]:
for t in texts:
    ref = t['self_ref']
    text_content = t['text']
    page = t['prov'][0]['page_no']
    #dict_list.append({'text_reference':ref, 'page': page, 'text_content(first 500 chars)':text_content[:500]})
    dict_list.append({'text_reference':ref, 'page': page, 'text' :text_content})

In [None]:
df = pd.DataFrame(dict_list)
df

Unnamed: 0,text_reference,page,text
0,#/texts/0,1,Q Global Equity)
1,#/texts/1,1,Aon Investment Manager Research Due Diligence ...
2,#/texts/2,1,Please read these instructions before completi...
3,#/texts/3,1,"Should any questions not be applicable, please..."
4,#/texts/4,1,If you are responding with information on more...
...,...,...,...
751,#/texts/751,51,"6.2. Have any questions been added, removed, o..."
752,#/texts/752,51,"Yes, question 4.29 was altered to reflect '>=1..."
753,#/texts/753,51,Aon
754,#/texts/754,51,51


In [11]:
# Export df to Excel
df.to_excel("text_df.xlsx", index=False)

#### 3.2.2) Iterate through Tables

In [None]:
tables = results_body['tables']
dict_list = []

for t in tables:
    ref = t['self_ref']
    page = t['prov'][0]['page_no']

    table_captions = t['captions']
    if(len(table_captions) and 'cref' in table_captions[0].keys()):
        table_caption = table_captions[0]['cref']
    else:
        table_caption = 'No Caption'
    table_data = str(t['data'])[:100]

    dict_list.append({'table_reference':ref, 'page': page, 'table_captions': table_captions,'table_data(first 100 chars)':table_data})

In [None]:
df = pd.DataFrame(dict_list)

df

#### 3.2.3) Iterate through Images

In [None]:
images = results_body['pictures']
dict_list = []

for i in images:
    ref = i['self_ref']
    page = i['prov'][0]['page_no']

    image_captions = i['captions']

    if(len(image_captions) and 'cref' in image_captions[0].keys()):
        image_caption = image_captions[0]['cref']
    else:
        image_caption = 'No Caption'
    image_data = str(t['image'])[:100]

    dict_list.append({'image_reference':ref, 'page': page, 'image_captions': image_captions,'image_data(first 100 chars)':image_data})

In [None]:
df = pd.DataFrame(dict_list)

df

### 3.3) Iterate though the pages of the document

In [None]:
## Display the pages

def display_images(images, images_per_row=5, figsize=(15,8)):

    # handle dictionary or list input
    if isinstance(images, dict):
        images = list(images.values())

    # calculate number of rows needed
    num_rows = math.ceil(len(images)/images_per_row)

    # create subplots
    fig, axes = plt.subplots(num_rows, images_per_row, figsize=figsize)
    axes = axes.flatten() #flatten axes for easier iteration

    # plot images
    for ax, image in zip(axes, images):
        ax.imshow(image)
        ax.axis('off')

    # turn off unused axes
    for ax in axes[len(images):]:
        ax.axis('off')

    # adjust layout and display
    plt.tight_layout()
    plt.show()

In [None]:
page_images = {page_no: page.image.pil_image for page_no, page in conv_res.document.pages.items()}
display_images(page_images, images_per_row=5, figsize=(15,8))

#### Save the page images to directory

In [None]:
dir_path = './pages'
os.makedirs(dir_path, exist_ok=True)

for page_no, page in conv_res.document.pages.items():
    page_no = page.page_no
    page_image_filename = f"{page_no}.png"
    with open(os.path.join(dir_path, page_image_filename), 'wb') as fp: # use the built-in open() function
        page.image.pil_image.save(fp, format='PNG')

### 3.4 Extract all the images from the document

In [None]:
dir_path = './images'
os.makedirs(dir_path, exist_ok=True)

images_list = []
image_number = 1

for element, _level in conv_res.document.iterate_items():
    if isinstance(element, PictureItem):
        element_image_filename = os.path.join(dir_path, str(image_number)) + '.png'
        with open(element_image_filename, 'wb') as fp:
            image = element.get_image(conv_res.document)
            image.save(fp, 'PNG')
            images_list.append(image)
        image_number += 1

In [None]:
display_images(images_list, images_per_row=5, figsize=(15,8))

### 3.5 Extract all tables from the document

In [None]:
dir_path = './tables'
dir_paths = ['./tables/images', './tables/CSVs', './tables/HTMLs']

[os.makedirs(path, exist_ok=True) for path in dir_paths]

table_list = []
table_number = 1

for element, _level in conv_res.document.iterate_items():
    if isinstance(element, TableItem):
        
        # saving tables as images
        element_table_filename = os.path.join(dir_path, 'images', str(table_number)) + '.png'
        with open(element_table_filename, 'wb') as fp:
            table_image = element.get_image(conv_res.document)
            table_image.save(fp,'PNG')
            table_list.append(table_image)

        # saving tables as CSV files
        table_df: pd.DataFrame = element.export_to_dataframe()
        element_table_filename = os.path.join(dir_path, 'CSVs', str(table_number)) + '.csv'
        table_df.to_csv(element_table_filename)

        # saving tables as HTML files
        table_html = element.export_to_html()
        element_table_filename = os.path.join(dir_path, 'HTMLs', str(table_number)) + '.html'
        with open(element_table_filename, 'w') as fp:
            fp.write(table_html)

        table_number += 1 

In [None]:
display_images(table_list, images_per_row=5, figsize=(15,8))

## OCR pipeline

In [None]:
def OCR_parsing(doc_path):
    input_doc = Path(doc_path)
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    ocr_options = EasyOcrOptions(force_full_page_ocr=True)
    pipeline_options.ocr_options = ocr_options
    pipeline_options.generate_page_images = True

    start_time = time.time()

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    result = converter.convert(input_doc)
    end_time = time.time() - start_time
    print(f'Total time take for parsing {end_time: .2f} seconds.')
    return result

In [None]:
OCR_result = OCR_parsing(source)

extract = OCR_result.document.export_to_markdown()

print('Docling Extract')
print(extract)

In [None]:
page_images = {page_no: page.image.pil_image for page_no, page in OCR_result.document.pages.items()}
print('Actual Document')
display_images(page_images, images_per_row=2, figsize=(100,50))