## Libraries

In [1]:
import re
import time
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
from pathlib import Path
from IPython.display import display

from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ============================================================================
# CONFIGURATION
# ============================================================================
# Path of the PDF that Step 1 wraps in a Path and hands to the converter.
SOURCE_PDF = "documents/Antipodes.pdf"
# NOTE(review): not referenced in the visible part of this notebook — confirm
# whether a later cell still needs it.
BREAKPOINT_EXCEL = "inputfiles/BreakpointTest.xlsx"
# Output directory for extracted pictures (same value as the directory used by
# the image-export cell).
IMAGE_OUTPUT_DIR = './images'
# Assigned to PdfPipelineOptions.images_scale in Step 1.
IMAGE_RESOLUTION_SCALE = 2.0

In [3]:
# ============================================================================
# STEP 1: DOCUMENT CONVERSION
# ============================================================================
# Converts SOURCE_PDF with docling and keeps the conversion result in `result`,
# which the export, image-saving and chunking cells below all read.
# The captured run took about 129 s on CPU, so avoid re-running needlessly.
print("Step 1: Converting PDF document...")
input_doc_path = Path(SOURCE_PDF)

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_page_images = True
# Presumably required for PictureItem.get_image() in the image-export cell —
# TODO confirm against the docling pipeline options documentation.
pipeline_options.generate_picture_images = True
pipeline_options.do_table_structure = True

# Optional: uncomment the four lines below to enable EasyOCR on the full page.
# Left disabled for this run.
#pipeline_options.do_ocr = True
#ocr_options = EasyOcrOptions()
#ocr_options.force_full_page_ocr = True  # Force OCR on entire page
#pipeline_options.ocr_options = ocr_options

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

start_time = time.time()
result = doc_converter.convert(input_doc_path)
# NOTE: despite its name, `end_time` holds the elapsed duration in seconds,
# not a timestamp.
end_time = time.time() - start_time
print(f'✓ Document parsed in {end_time:.2f} seconds.\n')

2025-10-04 20:57:33,322 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-04 20:57:33,365 - INFO - Going to convert document batch...
2025-10-04 20:57:33,366 - INFO - Initializing pipeline for StandardPdfPipeline with options hash ce2db4bc6b59e8bf84cfaffa1879c953
2025-10-04 20:57:33,380 - INFO - Loading plugin 'docling_defaults'
2025-10-04 20:57:33,384 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-10-04 20:57:33,405 - INFO - Loading plugin 'docling_defaults'
2025-10-04 20:57:33,410 - INFO - Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-10-04 20:57:33,509 - INFO - Accelerator device: 'cpu'


Step 1: Converting PDF document...


2025-10-04 20:57:34,793 - INFO - Accelerator device: 'cpu'
2025-10-04 20:57:35,834 - INFO - Accelerator device: 'cpu'
2025-10-04 20:57:36,085 - INFO - Processing document Antipodes.pdf
2025-10-04 20:59:42,307 - INFO - Finished converting document Antipodes.pdf in 128.99 sec.


✓ Document parsed in 128.99 seconds.



In [4]:
# Export the converted document to one markdown string (`results_markdown`),
# which every header/image/chunk step below searches by text position.
# The placeholder is passed as `page_break_placeholder`; presumably docling
# inserts it between pages — confirm against the docling export API.
PAGE_BREAK_PLACEHOLDER = "<page_break>"
results_markdown = result.document.export_to_markdown(page_break_placeholder=PAGE_BREAK_PLACEHOLDER)
# Alternative export without page-break markers:
#results_markdown = result.document.export_to_markdown()

In [5]:
print(results_markdown)

<!-- image -->

## Aon Investment Manager Research Due Diligence Questionnaire ( Antipodes Global Long Strategy

## Please read these instructions before completing this questionnaire.

Should any questions not be applicable, please indicate as such by responding with 'n/a' or 'not applicable'. All market value information should be stated in millions ($USD) unless indicated otherwise. Please note the difference between individual or organizational accounts. Please enter responses to the questions in the spaces provided and/or in an attached document.

If you are responding with information on more than one product, please copy sections one to four. Please provide your Due Diligence Questionnaire answer file in a Word document format (not PDF).

In the case of multiple product submission, please clearly label each product at the top of this page.

Any supporting materials must be clearly referenced to the appropriate question and appropriately labeled.

Information and supplemental att

## Extract all Questionnaire Headers

In [6]:
def extract_headers_with_numbering(markdown_text):
    """
    Collect the numbered headers and numbered list items of a markdown text.

    Two kinds of lines are considered, after stripping surrounding whitespace:

    * markdown headers (lines starting with '#'), with the leading '#'
      characters removed;
    * bullet items of the form "- 1.2.3 text" / "- 1.2. text", with the
      leading dash removed.

    A candidate is kept only when it starts with a digit and does not contain
    the substring "Section" (case-sensitive). The numbering is preserved in
    the returned text.

    Parameters:
    - markdown_text: the markdown document as a single string

    Returns:
    - list of str, in document order
    """
    header_prefix = re.compile(r'^#+\s*')
    numbered_bullet = re.compile(r'^-\s+\d+(\.\d+)*\.?\s+')
    bullet_prefix = re.compile(r'^-\s+')

    def _is_numbered_item(text):
        # Non-empty, begins with a digit, and is not a "Section ..." title.
        return bool(text) and text[0].isdigit() and "Section" not in text

    collected = []
    for raw_line in markdown_text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue

        if stripped.startswith('#'):
            # e.g. "## 1.1. Please complete ..." -> "1.1. Please complete ..."
            candidate = header_prefix.sub('', stripped).strip()
        elif numbered_bullet.match(stripped):
            # e.g. "- 1.1.1 Firm name" -> "1.1.1 Firm name"
            candidate = bullet_prefix.sub('', stripped).strip()
        else:
            continue

        if _is_numbered_item(candidate):
            collected.append(candidate)

    return collected

In [7]:
headers_questions_clean = extract_headers_with_numbering(results_markdown)

print(f"Found {len(headers_questions_clean)} headers/questions:\n")
for item in headers_questions_clean:
    print(item)

Found 123 headers/questions:

1.1. Please complete the following information:
1.1.1 Firm name
1.1.2 Firm's web site address.
1.1.3 Locations and number of offices.
1.1.4 Location of investment, trading, administration, and operations functions.
1.1.5 Primary contact individual (include telephone / fax numbers and email address).
1.1.6 Firm inception (and date of assets first managed if different).
1.1.7 Total assets under management for the firm and AUM broken down by asset classes.
1.2 Firm's ownership structure, including:
1.2.1 Legal structure.
1.2.2 Ownership structure including economic and voting interests broken down by each individual and entity (please include each individual's relationship to firm).
1.2.3 Changes over the past three years and planned changes to the ownership structure.
1.2.4 Any affiliated companies or joint ventures including any deals currently in process, and any planned deals.
1.2.5 Detail any corporate structure changes over the past three years, includi

In [8]:
import pandas as pd
import re

def classify_into_main_sections(headers_list):
    """
    Build a dataframe that pairs every header with its main section number.

    The main section is the leading run of digits of the header, kept as a
    string (e.g. "1.2.3 Legal structure." -> "1", "12.4 Foo" -> "12").
    Headers that do not start with a digit are dropped.

    Parameters:
    - headers_list: iterable of header strings

    Returns:
    - DataFrame with the columns 'Main Section' and 'Headers'. Both columns
      are always present, even when no header matched, so downstream column
      access does not raise KeyError on an empty result.
    """
    leading_number = re.compile(r'^(\d+)')

    records = []
    for header in headers_list:
        match = leading_number.match(header)
        if match:
            records.append({
                'Main Section': match.group(1),
                'Headers': header
            })

    # Explicit columns: pd.DataFrame([]) would otherwise have no columns at all.
    return pd.DataFrame(records, columns=['Main Section', 'Headers'])

# Create the dataframe
df_headers = classify_into_main_sections(headers_questions_clean)



In [9]:
df_headers

Unnamed: 0,Main Section,Headers
0,1,1.1. Please complete the following information:
1,1,1.1.1 Firm name
2,1,1.1.2 Firm's web site address.
3,1,1.1.3 Locations and number of offices.
4,1,"1.1.4 Location of investment, trading, adminis..."
...,...,...
118,6,6.23. Has a portfolio manager shifted the port...
119,6,6.24. Discuss how ESG integration adds value t...
120,6,6.25. How do you report on responsible investm...
121,7,7.1 Will your firm notify us within a reasona...


### Classify Main Sections

In [10]:
# Hand-maintained lookup: main section number (string) -> readable section name.
section_mapping = {
    '1': 'Firm Information and AUM',
    '2': 'Firm Wide Professional Staff',
    '3': 'Compensation',
    '4': 'Product Information',
    '5': 'Risk & Fees',
    '6': 'Responsible Investment',
    '7': 'Concluding Section'
}

# Attach the section name and put it between the section number and the header.
# Section numbers missing from the lookup end up as NaN in the new column.
df_headers = df_headers.assign(
    **{'Main Section Name': df_headers['Main Section'].map(section_mapping)}
)[['Main Section', 'Main Section Name', 'Headers']]

In [11]:
df_headers

Unnamed: 0,Main Section,Main Section Name,Headers
0,1,Firm Information and AUM,1.1. Please complete the following information:
1,1,Firm Information and AUM,1.1.1 Firm name
2,1,Firm Information and AUM,1.1.2 Firm's web site address.
3,1,Firm Information and AUM,1.1.3 Locations and number of offices.
4,1,Firm Information and AUM,"1.1.4 Location of investment, trading, adminis..."
...,...,...,...
118,6,Responsible Investment,6.23. Has a portfolio manager shifted the port...
119,6,Responsible Investment,6.24. Discuss how ESG integration adds value t...
120,6,Responsible Investment,6.25. How do you report on responsible investm...
121,7,Concluding Section,7.1 Will your firm notify us within a reasona...


## Reannotate Images by their headers

In [14]:
def identify_images_with_headers(markdown_text, df_headers):
    """
    Locate every '<!-- image -->' placeholder and attribute it to a header.

    Each header in df_headers['Headers'] is located by its first occurrence in
    markdown_text (headers that do not occur are ignored). An image belongs to
    the header whose position is the closest one strictly before the image;
    images that precede every header are attributed to the pseudo-header
    'Intro' at position 0.

    Parameters:
    - markdown_text: the markdown document as a single string
    - df_headers: dataframe with a 'Headers' column of header strings

    Returns:
    - DataFrame with one row per image and the columns Image_Number (1-based,
      document order), Image_Position, Belongs_To_Header, Header_Position and
      Distance_From_Header (character offsets). The columns are present even
      when the document has no images.
    """
    columns = [
        'Image_Number',
        'Image_Position',
        'Belongs_To_Header',
        'Header_Position',
        'Distance_From_Header',
    ]

    # (position, header) for every header that occurs in the markdown.
    located_headers = []
    for header in df_headers['Headers'].tolist():
        pos = markdown_text.find(header)
        if pos != -1:
            located_headers.append((pos, header))
    # Stable sort on position only, so headers found at the same offset keep
    # their original relative order.
    located_headers.sort(key=lambda item: item[0])

    image_pattern = r'<!-- image -->'
    image_data = []

    for idx, match in enumerate(re.finditer(image_pattern, markdown_text), 1):
        image_pos = match.start()

        # Walk backwards to find the last header that starts before the image.
        preceding = next(
            (item for item in reversed(located_headers) if item[0] < image_pos),
            None,
        )
        if preceding is None:
            # Image appears before any known header.
            header_pos, header = 0, 'Intro'
        else:
            header_pos, header = preceding

        image_data.append({
            'Image_Number': idx,
            'Image_Position': image_pos,
            'Belongs_To_Header': header,
            'Header_Position': header_pos,
            'Distance_From_Header': image_pos - header_pos
        })

    # Explicit columns keep the schema stable for documents without images.
    return pd.DataFrame(image_data, columns=columns)


In [15]:
# Create the dataframe
images_df = identify_images_with_headers(results_markdown, df_headers)
images_df

Unnamed: 0,Image_Number,Image_Position,Belongs_To_Header,Header_Position,Distance_From_Header
0,1,0,Intro,0,0
1,2,12011,1.3 Please complete the following tables:,10649,1362
2,3,14856,1.3 Please complete the following tables:,10649,4207
3,4,39076,4.1 Include the names of all investment profes...,35041,4035
4,5,52566,4.4 How are analysts' responsibilities divided...,51070,1496
5,6,58157,4.9 Describe in detail the investment process ...,57860,297
6,7,99516,5.16 Please provide the following information ...,99381,135
7,8,133918,6.21. Do you actively engage or collaborate wi...,131758,2160


In [16]:
def replace_image_tags_with_named_images(markdown_text, images_df):
    """
    Replace each '<!-- image -->' placeholder with a descriptive image name.

    The replacement text has the form 'Image X - Y.Z' (no surrounding angle
    brackets), where X is the running number of the image within its header
    and Y.Z is the leading numeric part of that header (e.g. 1.3, 4.1).
    Headers without a leading number (e.g. 'Intro') are used verbatim.

    Parameters:
    - markdown_text: the markdown document the positions in images_df refer to
    - images_df: dataframe with at least the columns 'Image_Position'
      (character offset of a placeholder) and 'Belongs_To_Header'

    Returns:
    - (modified_markdown, mapping_df): the markdown with placeholders replaced
      and a copy of images_df sorted by position with the extra columns
      'Image_Number_In_Header', 'Header_Number' and 'Replacement_Text'.
      For an empty images_df the markdown is returned unchanged.
    """
    image_tag = '<!-- image -->'
    added_columns = ['Image_Number_In_Header', 'Header_Number', 'Replacement_Text']

    if images_df.empty:
        # Nothing to replace; still return the documented columns so callers
        # can read 'Replacement_Text' without a KeyError.
        missing = [col for col in added_columns if col not in images_df.columns]
        return markdown_text, images_df.reindex(columns=[*images_df.columns, *missing])

    def extract_header_number(header):
        # "1.3 Please complete..." -> "1.3"; no leading number -> header as is.
        match = re.match(r'^(\d+(?:\.\d+)*)', header)
        return match.group(1) if match else header

    images_df_sorted = images_df.sort_values('Image_Position').copy()

    # Running 1-based count of images within each header, in document order.
    images_df_sorted['Image_Number_In_Header'] = (
        images_df_sorted.groupby('Belongs_To_Header').cumcount() + 1
    )
    images_df_sorted['Header_Number'] = (
        images_df_sorted['Belongs_To_Header'].apply(extract_header_number)
    )
    images_df_sorted['Replacement_Text'] = [
        f"Image {number} - {header_number}"
        for number, header_number in zip(
            images_df_sorted['Image_Number_In_Header'],
            images_df_sorted['Header_Number'],
        )
    ]

    # Single forward pass over the ascending positions: copy the text between
    # placeholders and substitute each placeholder with its name. Offsets stay
    # valid because they always index the untouched original string.
    pieces = []
    cursor = 0
    for pos, replacement in zip(
        images_df_sorted['Image_Position'], images_df_sorted['Replacement_Text']
    ):
        pieces.append(markdown_text[cursor:pos])
        pieces.append(replacement)
        cursor = pos + len(image_tag)
    pieces.append(markdown_text[cursor:])

    return ''.join(pieces), images_df_sorted


In [17]:
# Apply the replacement
results_markdown_with_named_images, images_mapping_df = replace_image_tags_with_named_images(
    results_markdown, 
    images_df
)

In [18]:
images_mapping_df

Unnamed: 0,Image_Number,Image_Position,Belongs_To_Header,Header_Position,Distance_From_Header,Image_Number_In_Header,Header_Number,Replacement_Text
0,1,0,Intro,0,0,1,Intro,Image 1 - Intro
1,2,12011,1.3 Please complete the following tables:,10649,1362,1,1.3,Image 1 - 1.3
2,3,14856,1.3 Please complete the following tables:,10649,4207,2,1.3,Image 2 - 1.3
3,4,39076,4.1 Include the names of all investment profes...,35041,4035,1,4.1,Image 1 - 4.1
4,5,52566,4.4 How are analysts' responsibilities divided...,51070,1496,1,4.4,Image 1 - 4.4
5,6,58157,4.9 Describe in detail the investment process ...,57860,297,1,4.9,Image 1 - 4.9
6,7,99516,5.16 Please provide the following information ...,99381,135,1,5.16,Image 1 - 5.16
7,8,133918,6.21. Do you actively engage or collaborate wi...,131758,2160,1,6.21,Image 1 - 6.21


In [19]:
print(results_markdown_with_named_images)

Image 1 - Intro

## Aon Investment Manager Research Due Diligence Questionnaire ( Antipodes Global Long Strategy

## Please read these instructions before completing this questionnaire.

Should any questions not be applicable, please indicate as such by responding with 'n/a' or 'not applicable'. All market value information should be stated in millions ($USD) unless indicated otherwise. Please note the difference between individual or organizational accounts. Please enter responses to the questions in the spaces provided and/or in an attached document.

If you are responding with information on more than one product, please copy sections one to four. Please provide your Due Diligence Questionnaire answer file in a Word document format (not PDF).

In the case of multiple product submission, please clearly label each product at the top of this page.

Any supporting materials must be clearly referenced to the appropriate question and appropriately labeled.

Information and supplemental at

#### Extract images and save into the images folder

In [21]:
# Save every picture of the converted document as a PNG named after its
# 'Replacement_Text' (e.g. "Image 1 - 1.3.png"), so a file can be matched to
# the name inserted into the markdown.
# NOTE(review): pairing is purely positional — this assumes iterate_items()
# yields pictures in the same order as the '<!-- image -->' placeholders in the
# exported markdown. Confirm against the docling export behaviour.
dir_path = IMAGE_OUTPUT_DIR  # configured once in the CONFIGURATION cell
os.makedirs(dir_path, exist_ok=True)

images_list = []
image_counter = 0  # number of PictureItems seen; index into images_mapping_df

for element, _level in result.document.iterate_items():
    if not isinstance(element, PictureItem):
        continue

    if image_counter < len(images_mapping_df):
        # The replacement text is used verbatim as the file stem.
        filename = images_mapping_df.iloc[image_counter]['Replacement_Text']
    else:
        # Fallback if the document has more pictures than mapped placeholders.
        filename = f'image_{image_counter + 1}'
    element_image_filename = os.path.join(dir_path, filename) + '.png'
    # Advance even when the picture is skipped below, to stay aligned with the
    # placeholder order.
    image_counter += 1

    # Fetch the image BEFORE opening the file: get_image() may return None,
    # and opening first would leave an empty .png behind.
    image = element.get_image(result.document)
    if image is None:
        print(f"Skipped (no image data): {element_image_filename}")
        continue

    with open(element_image_filename, 'wb') as fp:
        image.save(fp, 'PNG')
    images_list.append(image)
    print(f"Saved: {element_image_filename}")

print(f"\n✓ Total images saved: {len(images_list)}")

Saved: ./images\Image 1 - Intro.png
Saved: ./images\Image 1 - 1.3.png
Saved: ./images\Image 2 - 1.3.png
Saved: ./images\Image 1 - 4.1.png
Saved: ./images\Image 1 - 4.4.png
Saved: ./images\Image 1 - 4.9.png
Saved: ./images\Image 1 - 5.16.png
Saved: ./images\Image 1 - 6.21.png

✓ Total images saved: 8


## Chunking

In [22]:
def create_granular_chunks(df_headers, markdown_text):
    """
    Cut the markdown into one chunk per header.

    A chunk starts at the first occurrence of its header and runs up to (but
    excluding) the next header in df_headers, searched for after the current
    header. When there is no next header, or the next header cannot be found,
    the chunk runs to the end of the document. Chunks are whitespace-stripped.

    Rows are paired with their successor by position, so the result does not
    depend on the dataframe's index labels (e.g. after filtering).

    Parameters:
    - df_headers: dataframe with a 'Headers' column, in document order
    - markdown_text: the markdown document as a single string

    Returns:
    - list of str with one entry per row of df_headers; an empty string for a
      header that does not occur in markdown_text
    """
    headers = df_headers['Headers'].tolist()
    chunks = []

    for idx, current_header in enumerate(headers):
        next_header = headers[idx + 1] if idx + 1 < len(headers) else None

        current_pos = markdown_text.find(current_header)
        if current_pos == -1:
            # Header not found in markdown.
            chunks.append("")
            continue

        # Look for the next header only after the current one, so a header
        # text that also appears earlier in the document cannot end the chunk.
        next_pos = -1
        if next_header:
            next_pos = markdown_text.find(next_header, current_pos + len(current_header))

        if next_pos != -1:
            chunk_text = markdown_text[current_pos:next_pos]
        else:
            # Last header, or next header missing: take the rest of the document.
            chunk_text = markdown_text[current_pos:]

        chunks.append(chunk_text.strip())

    return chunks



In [23]:
# One text chunk per header row.
# NOTE(review): this chunks `results_markdown`, not
# `results_markdown_with_named_images`, so chunks keep the generic
# '<!-- image -->' placeholder — confirm that this is intended.
chunks = create_granular_chunks(df_headers, results_markdown)

# Header table plus its chunk text (df_headers itself is left untouched).
granular_chunk_df = df_headers.assign(chunk=chunks)

# Rough token estimate from the word count (1 token ≈ 0.75 words, i.e. words * 1.33).
granular_chunk_df['approximate_token_count'] = granular_chunk_df['chunk'].map(
    lambda text: int(len(text.split()) * 1.33) if text else 0
)



In [24]:
granular_chunk_df

Unnamed: 0,Main Section,Main Section Name,Headers,chunk,approximate_token_count
0,1,Firm Information and AUM,1.1. Please complete the following information:,1.1. Please complete the following information...,9
1,1,Firm Information and AUM,1.1.1 Firm name,1.1.1 Firm name\n\nAntipodes Partners Limited\...,9
2,1,Firm Information and AUM,1.1.2 Firm's web site address.,1.1.2 Firm's web site address.\n\nwww.antipode...,9
3,1,Firm Information and AUM,1.1.3 Locations and number of offices.,1.1.3 Locations and number of offices.\n\n3 of...,34
4,1,Firm Information and AUM,"1.1.4 Location of investment, trading, adminis...","1.1.4 Location of investment, trading, adminis...",51
...,...,...,...,...,...
118,6,Responsible Investment,6.23. Has a portfolio manager shifted the port...,6.23. Has a portfolio manager shifted the port...,441
119,6,Responsible Investment,6.24. Discuss how ESG integration adds value t...,6.24. Discuss how ESG integration adds value t...,85
120,6,Responsible Investment,6.25. How do you report on responsible investm...,6.25. How do you report on responsible investm...,73
121,7,Concluding Section,7.1 Will your firm notify us within a reasona...,7.1 Will your firm notify us within a reasona...,31


### Alternative: reduce the number of chunks by merging consecutive granular chunks within the same "Main Section"

In [25]:
def combine_granular_chunks(df, chunk_size=1000):
    """
    Merge consecutive granular chunks until they reach a target token count.

    Rules:
    1. Rows are only merged with rows of the same Main Section.
    2. A row whose approximate_token_count is already >= chunk_size is kept
       unchanged and never merged into a neighbour.
    3. Otherwise consecutive rows are appended until the running total reaches
       chunk_size, the section ends, or the next row is itself >= chunk_size.

    Merged rows join their headers with ' | ' and their chunk texts with a
    blank line; the token count is the sum of the merged rows.

    Parameters:
    - df: granular chunk dataframe with the columns 'Main Section',
      'Main Section Name', 'Headers', 'chunk' and 'approximate_token_count'
    - chunk_size: minimum token count targeted for a combined chunk

    Returns:
    - New dataframe with the same five columns, one row per combined chunk
    """
    merged_rows = []

    # sort=False keeps the sections in their order of first appearance.
    for _, section_rows in df.groupby('Main Section', sort=False):
        records = section_rows.to_dict('records')
        start = 0

        while start < len(records):
            head = records[start]
            total_tokens = head['approximate_token_count']
            stop = start + 1

            if total_tokens >= chunk_size:
                # Already large enough: pass through untouched.
                headers_value = head['Headers']
                chunk_value = head['chunk']
            else:
                # Greedily absorb following rows while still below the target.
                while stop < len(records) and total_tokens < chunk_size:
                    candidate_tokens = records[stop]['approximate_token_count']
                    if candidate_tokens >= chunk_size:
                        # A big row stays on its own; stop in front of it.
                        break
                    total_tokens += candidate_tokens
                    stop += 1

                members = records[start:stop]
                headers_value = ' | '.join([member['Headers'] for member in members])
                chunk_value = '\n\n'.join([member['chunk'] for member in members])

            merged_rows.append({
                'Main Section': head['Main Section'],
                'Main Section Name': head['Main Section Name'],
                'Headers': headers_value,
                'chunk': chunk_value,
                'approximate_token_count': total_tokens
            })

            # Continue with the first row that was not consumed.
            start = stop

    return pd.DataFrame(merged_rows)


In [26]:
# Combine chunks with different chunk sizes
combined_granular_df = combine_granular_chunks(granular_chunk_df, chunk_size=1000)


In [27]:
combined_granular_df

Unnamed: 0,Main Section,Main Section Name,Headers,chunk,approximate_token_count
0,1,Firm Information and AUM,1.1. Please complete the following information...,1.1. Please complete the following information...,1733
1,1,Firm Information and AUM,1.4. If there has been a significant drop or i...,1.4. If there has been a significant drop or i...,468
2,2,Firm Wide Professional Staff,2.1. List the total number of employees in you...,2.1. List the total number of employees in you...,1040
3,2,Firm Wide Professional Staff,2.5 If there is a dedicated responsible invest...,2.5 If there is a dedicated responsible invest...,527
4,3,Compensation,3.1 Describe in detail the compensation struct...,3.1 Describe in detail the compensation struct...,833
5,4,Product Information,4.1 Include the names of all investment profes...,4.1 Include the names of all investment profes...,1058
6,4,Product Information,4.2 Describe the structure of the investment m...,4.2 Describe the structure of the investment m...,1051
7,4,Product Information,4.5 Provide a description of succession-planni...,4.5 Provide a description of succession-planni...,1002
8,4,Product Information,4.9 Describe in detail the investment process ...,4.9 Describe in detail the investment process ...,1570
9,4,Product Information,4.10 Please complete the following table. | 4....,4.10 Please complete the following table.\n\n|...,1135


In [28]:
# Summary statistics for the granular and the combined chunk tables.
for title, frame in (
    ("=== Original Granular Chunks ===", granular_chunk_df),
    ("\n=== Combined Granular Chunks ===", combined_granular_df),
):
    token_counts = frame['approximate_token_count']
    print(title)
    print(f"Total rows: {len(frame)}")
    print(f"Average tokens: {token_counts.mean():.0f}")
    print(f"Min tokens: {token_counts.min()}")
    print(f"Max tokens: {token_counts.max()}")

# Per-section breakdown of the combined chunks.
print("\n=== Combined Chunks by Main Section ===")
for section in combined_granular_df['Main Section'].unique():
    in_section = combined_granular_df['Main Section'] == section
    section_df = combined_granular_df[in_section]
    print(f"\nSection {section}: {len(section_df)} chunks")
    print(section_df[['Headers', 'approximate_token_count']])

# How the chunk count and average size react to the target chunk size.
print("\n=== Comparison of Different Chunk Sizes ===")
for size in [500, 1000, 1500, 2000]:
    temp_df = combine_granular_chunks(granular_chunk_df, chunk_size=size)
    average_tokens = temp_df['approximate_token_count'].mean()
    print(f"Chunk size {size}: {len(temp_df)} chunks (avg tokens: {average_tokens:.0f})")

=== Original Granular Chunks ===
Total rows: 123
Average tokens: 192
Min tokens: 7
Max tokens: 1570

=== Combined Granular Chunks ===
Total rows: 24
Average tokens: 983
Min tokens: 52
Max tokens: 1733

=== Combined Chunks by Main Section ===

Section 1: 2 chunks
                                             Headers  approximate_token_count
0  1.1. Please complete the following information...                     1733
1  1.4. If there has been a significant drop or i...                      468

Section 2: 2 chunks
                                             Headers  approximate_token_count
2  2.1. List the total number of employees in you...                     1040
3  2.5 If there is a dedicated responsible invest...                      527

Section 3: 1 chunks
                                             Headers  approximate_token_count
4  3.1 Describe in detail the compensation struct...                      833

Section 4: 8 chunks
                                              Head

In [29]:
combined_granular_df.to_excel('combined_granular_chunks.xlsx', index=False)


### Main Section Chunking

In [30]:
def create_main_section_chunks(df_headers, markdown_text):
    """
    Create one chunk of text per Main Section.

    A section's chunk starts at the first occurrence of the section's first
    header and runs up to (but excluding) the first header of the section that
    follows it in df_headers. The last section, or a section whose successor's
    first header cannot be found, runs to the end of the document.

    The following section is taken from the order of first appearance in
    df_headers rather than by adding 1 to the section number, so gaps in the
    numbering (e.g. 1, 2, 4) and non-numeric section ids are handled.

    Parameters:
    - df_headers: dataframe with the columns 'Main Section',
      'Main Section Name' and 'Headers', in document order
    - markdown_text: the markdown document as a single string

    Returns:
    - DataFrame with the columns 'Main Section', 'Main Section Name' and
      'chunk' (always present, even for empty input). A section whose first
      header is not found gets an empty chunk.
    """
    columns = ['Main Section', 'Main Section Name', 'chunk']
    chunks_data = []

    # sort=False keeps the sections in their order of first appearance.
    sections = list(df_headers.groupby('Main Section', sort=False))

    for idx, (main_section, group) in enumerate(sections):
        # The name is the same on every row of the group; take the first.
        main_section_name = group['Main Section Name'].iloc[0]

        first_header = group['Headers'].iloc[0]
        start_pos = markdown_text.find(first_header)

        if start_pos == -1:
            # First header not found.
            chunks_data.append({
                'Main Section': main_section,
                'Main Section Name': main_section_name,
                'chunk': ""
            })
            continue

        # The section ends where the next section's first header starts.
        end_pos = -1
        if idx + 1 < len(sections):
            next_section_first_header = sections[idx + 1][1]['Headers'].iloc[0]
            end_pos = markdown_text.find(next_section_first_header, start_pos)

        if end_pos != -1:
            chunk_text = markdown_text[start_pos:end_pos].strip()
        else:
            # Last section, or next section not found: take the rest of the document.
            chunk_text = markdown_text[start_pos:].strip()

        chunks_data.append({
            'Main Section': main_section,
            'Main Section Name': main_section_name,
            'chunk': chunk_text
        })

    return pd.DataFrame(chunks_data, columns=columns)


In [31]:

# One chunk per main section, cut from the exported markdown.
mainSection_chunk_df = create_main_section_chunks(df_headers, results_markdown)

# Rough token estimate from the word count (1 token ≈ 0.75 words, i.e. words * 1.33).
mainSection_chunk_df['approximate_token_count'] = mainSection_chunk_df['chunk'].map(
    lambda text: int(len(text.split()) * 1.33) if text else 0
)

# Show the resulting table.
mainSection_chunk_df


Unnamed: 0,Main Section,Main Section Name,chunk,approximate_token_count
0,1,Firm Information and AUM,1.1. Please complete the following information...,2211
1,2,Firm Wide Professional Staff,2.1. List the total number of employees in you...,1569
2,3,Compensation,3.1 Describe in detail the compensation struct...,835
3,4,Product Information,4.1 Include the names of all investment profes...,9165
4,5,Risk & Fees,5.1 Who is responsible for the product's risk ...,2590
5,6,Responsible Investment,6.1. Are you a signatory to the Principles for...,7223
6,7,Concluding Section,7.1 Will your firm notify us within a reasona...,53


### Hybrid Chunking from Docling

- for more control and using openai models see: https://docling-project.github.io/docling/examples/hybrid_chunking/#configuring-tokenization

In [32]:
from docling.chunking import HybridChunker

In [33]:
chunker = HybridChunker()
# chunk() returns a one-shot iterator. Materialise it as a list so the preview
# cell below can be re-run (and the chunks reused) without re-chunking; the
# name `chunk_iter` is kept so existing cells keep working.
chunk_iter = list(chunker.chunk(dl_doc=result.document))

Token indices sequence length is longer than the specified maximum sequence length for this model (1058 > 512). Running this sequence through the model will result in indexing errors


In [34]:
# Preview each hybrid chunk: the first 300 characters of its raw text and of
# the text returned by chunker.contextualize(), both shown as reprs.
for i, chunk in enumerate(chunk_iter):
    print(f"=== {i} ===")
    raw_preview = f'{chunk.text[:300]}…'
    print(f"chunk.text:\n{raw_preview!r}")

    enriched_text = chunker.contextualize(chunk=chunk)
    enriched_preview = f'{enriched_text[:300]}…'
    print(f"chunker.contextualize(chunk):\n{enriched_preview!r}")

    print()

=== 0 ===
chunk.text:
"Should any questions not be applicable, please indicate as such by responding with 'n/a' or 'not applicable'. All market value information should be stated in millions ($USD) unless indicated otherwise. Please note the difference between individual or organizational accounts. Please enter responses …"
chunker.contextualize(chunk):
"Please read these instructions before completing this questionnaire.\nShould any questions not be applicable, please indicate as such by responding with 'n/a' or 'not applicable'. All market value information should be stated in millions ($USD) unless indicated otherwise. Please note the difference b…"

=== 1 ===
chunk.text:
'Yes, Item/Document = Most recently updated presentation book for institutional investors and consultants.. Yes, Item/Document = Current Personnel Organizational Chart (providing names and roles of key individuals).. Yes, Item/Document = Provide the names of all investment professionals in the attach…'
chunker.cont