## Libraries

In [1]:
import time
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
from pathlib import Path
from IPython.display import display

from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
source = "documents/CC&L Q Global.pdf"

## Parser Pipeline

In [3]:
IMAGE_RESOLUTION_SCALE = 2.0

input_doc_path = Path(source)

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

In [4]:
start_time = time.time()
result = doc_converter.convert(input_doc_path)
end_time = time.time() - start_time
print(f'Total time take for parsing {end_time:.2f} seconds.')

2025-10-03 18:07:35,631 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-03 18:07:35,658 - INFO - Going to convert document batch...
2025-10-03 18:07:35,658 - INFO - Initializing pipeline for StandardPdfPipeline with options hash ce2db4bc6b59e8bf84cfaffa1879c953
2025-10-03 18:07:35,666 - INFO - Loading plugin 'docling_defaults'
2025-10-03 18:07:35,668 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-10-03 18:07:35,677 - INFO - Loading plugin 'docling_defaults'
2025-10-03 18:07:35,681 - INFO - Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-10-03 18:07:35,757 - INFO - Accelerator device: 'cpu'
2025-10-03 18:07:37,095 - INFO - Accelerator device: 'cpu'
2025-10-03 18:07:38,125 - INFO - Accelerator device: 'cpu'
2025-10-03 18:07:38,386 - INFO - Processing document CC&L Q Global.pdf
2025-10-03 18:10:13,326 - INFO - Finished converting document CC&L Q Global.pdf in 157.69 sec.


Total time take for parsing 157.70 seconds.


In [6]:
results_markdown = result.document.export_to_markdown()
print(results_markdown)

## Q Global Equity)

## Aon Investment Manager Research Due Diligence Questionnaire (

## Please read these instructions before completing this questionnaire.

Should any questions not be applicable, please indicate as such by responding with 'n/a' or 'not applicable'. All market value information should be stated in millions ($USD) unless indicated otherwise. Please note the difference between individual or organizational accounts. Please enter responses to the questions in the spaces provided and/or in an attached document.

If you are responding with information on more than one product, please copy section one to four. Please provide your Due Diligence Questionnaire answer file in a Word document format (not PDF).

In the case of multiple product submission, please clearly label each product at the top of this page.

Any supporting materials must be clearly referenced to the appropriate question and appropriately labeled.

Information and supplemental attachments that are strictly 

In [20]:
image_count = results_markdown.count('<!-- image -->')
for i in range(1, image_count + 1):
    results_markdown = results_markdown.replace('<!-- image -->', f'<!-- image{i} -->', 1)

print(results_markdown)

## Q Global Equity)

## Aon Investment Manager Research Due Diligence Questionnaire (

## Please read these instructions before completing this questionnaire.

Should any questions not be applicable, please indicate as such by responding with 'n/a' or 'not applicable'. All market value information should be stated in millions ($USD) unless indicated otherwise. Please note the difference between individual or organizational accounts. Please enter responses to the questions in the spaces provided and/or in an attached document.

If you are responding with information on more than one product, please copy section one to four. Please provide your Due Diligence Questionnaire answer file in a Word document format (not PDF).

In the case of multiple product submission, please clearly label each product at the top of this page.

Any supporting materials must be clearly referenced to the appropriate question and appropriately labeled.

Information and supplemental attachments that are strictly 

## Metadata

In [8]:
print('Document Name: ', result.document.origin.filename)
print('Document Type: ', result.document.origin.mimetype)
print('Number of Pages: ', len(result.document.pages.keys()))
print('Number of Text Elements: ', len(result.document.dict()['texts']))
print('Number of Text Tables: ', len(result.document.dict()['tables']))
print('Number of Text Images: ', len(result.document.dict()['pictures']))

Document Name:  CC&L Q Global.pdf
Document Type:  application/pdf
Number of Pages:  51
Number of Text Elements:  756
Number of Text Tables:  36
Number of Text Images:  11


C:\Users\Jasper\AppData\Local\Temp\ipykernel_42036\1621722229.py:4: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  print('Number of Text Elements: ', len(result.document.dict()['texts']))
C:\Users\Jasper\AppData\Local\Temp\ipykernel_42036\1621722229.py:5: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  print('Number of Text Tables: ', len(result.document.dict()['tables']))
C:\Users\Jasper\AppData\Local\Temp\ipykernel_42036\1621722229.py:6: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/

## Save Images

In [21]:
dir_path = './images'
os.makedirs(dir_path, exist_ok=True)

images_list = []
image_number = 1

for element, _level in result.document.iterate_items():
    if isinstance(element, PictureItem):
        element_image_filename = os.path.join(dir_path, f'image{image_number}.png')
        with open(element_image_filename, 'wb') as fp:
            image = element.get_image(result.document)
            image.save(fp, 'PNG')
            images_list.append(image)
        image_number += 1

## Chunking

In [27]:
import pandas as pd

# Load the Excel file into a DataFrame
breakpointdf = pd.read_excel("inputfiles/BreakpointTest.xlsx")

In [28]:
breakpointdf

Unnamed: 0,Start,End
0,Aon Investment Manager Research,Thank you and please contact us if you have an...
1,Section One: Firm Information and AUM,"Firm’s ownership structure, including:"
2,"1.2 Firm's ownership structure, including:",1.2.4 Any affiliated companies or joint ventur...
3,1.2.4 Any affiliated companies or joint ventur...,


In [29]:
# Loop through each row in the dataframe
for idx, row in breakpointdf.iterrows():
    start_str = row['Start']
    end_str = row['End']
    
    # Find the start position
    start_pos = results_markdown.find(start_str)
    
    if start_pos == -1:
        # Start string not found
        breakpointdf.at[idx, 'Chunk'] = ''
        continue
    
    # Check if End is null/NaN
    if pd.isna(end_str):
        # Extract from start to the end of the document
        chunk = results_markdown[start_pos:]
    else:
        # Find the end position (search after the start position)
        end_pos = results_markdown.find(end_str, start_pos + len(start_str))
        
        if end_pos == -1:
            # End string not found, extract to the end
            chunk = results_markdown[start_pos:]
        else:
            # Extract from start (inclusive) to end (exclusive)
            chunk = results_markdown[start_pos:end_pos]
    
    breakpointdf.at[idx, 'Chunk'] = chunk

In [30]:
breakpointdf

Unnamed: 0,Start,End,Chunk
0,Aon Investment Manager Research,Thank you and please contact us if you have an...,Aon Investment Manager Research Due Diligence ...
1,Section One: Firm Information and AUM,"Firm’s ownership structure, including:",Section One: Firm Information and AUM\n\n## 1....
2,"1.2 Firm's ownership structure, including:",1.2.4 Any affiliated companies or joint ventur...,"1.2 Firm's ownership structure, including:\n\n..."
3,1.2.4 Any affiliated companies or joint ventur...,,1.2.4 Any affiliated companies or joint ventur...
