In [9]:
import time
import pandas as pd
import os
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import PictureItem
from docling.datamodel.pipeline_options import PdfPipelineOptions


In [10]:
# ============================================================================
# CONFIGURATION
# ============================================================================
SOURCE_PDF = "documents/Antipodes.pdf"
BREAKPOINT_EXCEL = "inputfiles/BreakpointTest.xlsx"
IMAGE_OUTPUT_DIR = './images'
IMAGE_RESOLUTION_SCALE = 2.0



In [11]:
# ============================================================================
# STEP 1: DOCUMENT CONVERSION
# ============================================================================
print("Step 1: Converting PDF document...")
input_doc_path = Path(SOURCE_PDF)

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

start_time = time.time()
result = doc_converter.convert(input_doc_path)
end_time = time.time() - start_time
print(f'✓ Document parsed in {end_time:.2f} seconds.\n')

# Export to markdown
results_markdown = result.document.export_to_markdown()

2025-10-04 00:11:21,186 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-04 00:11:21,199 - INFO - Going to convert document batch...
2025-10-04 00:11:21,201 - INFO - Initializing pipeline for StandardPdfPipeline with options hash ce2db4bc6b59e8bf84cfaffa1879c953
2025-10-04 00:11:21,202 - INFO - Accelerator device: 'cpu'


Step 1: Converting PDF document...


2025-10-04 00:11:22,773 - INFO - Accelerator device: 'cpu'
2025-10-04 00:11:23,870 - INFO - Accelerator device: 'cpu'
2025-10-04 00:11:24,165 - INFO - Processing document Antipodes.pdf
2025-10-04 00:13:39,390 - INFO - Finished converting document Antipodes.pdf in 138.21 sec.


✓ Document parsed in 138.23 seconds.



In [12]:
# ============================================================================
# STEP 2: LOAD BREAKPOINT DATA
# ============================================================================
print("Step 2: Loading breakpoint data...")
breakpointdf = pd.read_excel(BREAKPOINT_EXCEL)
print(f'✓ Loaded {len(breakpointdf)} breakpoints.\n')



Step 2: Loading breakpoint data...
✓ Loaded 4 breakpoints.



In [13]:
# ============================================================================
# STEP 3: IMAGE EXTRACTION AND RENAMING
# ============================================================================
print("Step 3: Processing and saving images...")

# Create image output directory
os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)

# Build mapping: determine which images belong to which section
image_mapping = {}
global_image_counter = 1

for idx, row in breakpointdf.iterrows():
    start_str = row['Start']
    end_str = row['End']
    short_desc = row['Short Description']
    
    # Find chunk boundaries in original markdown
    start_pos = results_markdown.find(start_str)
    if start_pos == -1:
        continue
    
    if pd.isna(end_str):
        end_pos = len(results_markdown)
    else:
        end_pos = results_markdown.find(end_str, start_pos + len(start_str))
        if end_pos == -1:
            end_pos = len(results_markdown)
    
    # Count images in this section
    chunk = results_markdown[start_pos:end_pos]
    image_count = chunk.count('<!-- image -->')
    
    # Map global image numbers to section-specific names
    for i in range(1, image_count + 1):
        image_mapping[global_image_counter] = f"{short_desc}_{i}"
        global_image_counter += 1

print(f"  Found {len(image_mapping)} images across {len(breakpointdf)} sections")

# Extract and save images with new names, update markdown placeholders
results_markdown_updated = results_markdown
image_number = 1

for element, _level in result.document.iterate_items():
    if isinstance(element, PictureItem):
        # Get new image name from mapping
        new_image_name = image_mapping.get(image_number, f"image{image_number}")
        image_filepath = os.path.join(IMAGE_OUTPUT_DIR, f'{new_image_name}.png')
        
        # Save image
        with open(image_filepath, 'wb') as fp:
            image = element.get_image(result.document)
            image.save(fp, 'PNG')
        
        # Update markdown placeholder
        results_markdown_updated = results_markdown_updated.replace(
            '<!-- image -->',
            f'<!-- {new_image_name} -->',
            1  # Replace only first occurrence
        )
        
        image_number += 1

print(f'✓ Saved {image_number - 1} images to {IMAGE_OUTPUT_DIR}\n')



Step 3: Processing and saving images...
  Found 14 images across 4 sections
✓ Saved 8 images to ./images



In [14]:
print(results_markdown_updated)

<!-- Ownership_1 -->

## Aon Investment Manager Research Due Diligence Questionnaire ( Antipodes Global Long Strategy

## Please read these instructions before completing this questionnaire.

Should any questions not be applicable, please indicate as such by responding with 'n/a' or 'not applicable'. All market value information should be stated in millions ($USD) unless indicated otherwise. Please note the difference between individual or organizational accounts. Please enter responses to the questions in the spaces provided and/or in an attached document.

If you are responding with information on more than one product, please copy sections one to four. Please provide your Due Diligence Questionnaire answer file in a Word document format (not PDF).

In the case of multiple product submission, please clearly label each product at the top of this page.

Any supporting materials must be clearly referenced to the appropriate question and appropriately labeled.

Information and supplement

In [15]:
# ============================================================================
# STEP 4: EXTRACT CHUNKS WITH UPDATED IMAGE PLACEHOLDERS
# ============================================================================
print("Step 4: Extracting chunks with updated image references...")

breakpointdf['Chunk'] = ''

for idx, row in breakpointdf.iterrows():
    start_str = row['Start']
    end_str = row['End']
    
    # Find start position
    start_pos = results_markdown_updated.find(start_str)
    if start_pos == -1:
        breakpointdf.at[idx, 'Chunk'] = ''
        print(f"  Warning: Start string not found for row {idx}")
        continue
    
    # Find end position
    if pd.isna(end_str):
        end_pos = len(results_markdown_updated)
    else:
        end_pos = results_markdown_updated.find(end_str, start_pos + len(start_str))
        if end_pos == -1:
            end_pos = len(results_markdown_updated)
    
    # Extract chunk (start inclusive, end exclusive)
    chunk = results_markdown_updated[start_pos:end_pos]
    breakpointdf.at[idx, 'Chunk'] = chunk

print(f'✓ Extracted {len(breakpointdf)} chunks.\n')



Step 4: Extracting chunks with updated image references...
✓ Extracted 4 chunks.



In [16]:
# ============================================================================
# STEP 5: DISPLAY RESULTS
# ============================================================================
print("=" * 70)
print("PROCESSING COMPLETE")
print("=" * 70)
print(f"\nChunk Summary:")
for idx, row in breakpointdf.iterrows():
    chunk_length = len(row['Chunk'])
    image_count = row['Chunk'].count('<!--')
    print(f"  {row['Short Description']}: {chunk_length} chars, {image_count} images")

print(f"\n✓ All processing complete!")
print(f"✓ Images saved to: {IMAGE_OUTPUT_DIR}")
print(f"✓ Chunks stored in dataframe")

# Optional: Save results
# breakpointdf.to_excel("output_with_chunks.xlsx", index=False)
# with open("output_markdown.md", "w", encoding="utf-8") as f:
#     f.write(results_markdown_updated)

PROCESSING COMPLETE

Chunk Summary:
  Intro: 5084 chars, 0 images
  Ownership: 139795 chars, 7 images
  Affiliation: 822 chars, 0 images
  Ending: 137656 chars, 7 images

✓ All processing complete!
✓ Images saved to: ./images
✓ Chunks stored in dataframe
