# Docling testing script

Extract structured data from a PDF using Docling.

In [3]:
from pathlib import Path
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.settings import settings

## File path

In [2]:
# Path of the PDF that is passed to the converter.
# NOTE(review): absolute, machine-specific location (looks like a Windows
# drive mounted under /mnt/c) — adjust before running on another machine.
file_path = "/mnt/c/Users/User/Downloads/codes/sample pdf/pwc-my-centrestage-budget-2025.pdf"

## Extract data from PDF using docling

In [4]:
# Turn on page-image and picture-image generation for the PDF pipeline.
# The cells further down read page.image and call get_image() on tables and
# pictures, which presumably depend on these two switches.
pipeline_options = PdfPipelineOptions(
    generate_page_images=True,
    generate_picture_images=True,
)

# Apply the pipeline options to PDF inputs only.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

# Enable profiling so that the conversion result carries pipeline timings.
settings.debug.profile_pipeline_timings = True

In [5]:
# Convert the PDF at file_path; the following cells read result.timings and
# result.document from this conversion result.
result = converter.convert(file_path)

In [6]:
# Fetch the timing record stored under "pipeline_total" and report its
# `times` attribute, which prints as a list of values.
pipeline_total_timing = result.timings["pipeline_total"]
doc_conversion_secs = pipeline_total_timing.times
print(f"Conversion secs: {doc_conversion_secs}")

Conversion secs: [20.717543965999994]


## Create folders

In [5]:
# Directory that receives every file written by the cells below.
output_dir = Path("extracted_images/docling")
# Create it together with any missing parents; an already existing directory
# is not an error.
output_dir.mkdir(parents=True, exist_ok=True)

## Save pages as images

Manually save each page as an image.

In [None]:
# Write every page of the converted document to its own PNG in output_dir.
# Only the page objects are iterated: each one carries its own page number,
# so the dictionary key does not need to be unpacked (it was previously
# unpacked and then immediately overwritten).
for page in result.document.pages.values():
    page_no = page.page_no
    page_image_filename = output_dir / f"pwc-my-centrestage-budget-2025-{page_no}.png"
    # NOTE(review): page.image is presumably populated because
    # generate_page_images is enabled in the pipeline options — confirm.
    with page_image_filename.open("wb") as fp:
        page.image.pil_image.save(fp, format="PNG")

## Save tables and figures as images

Manually save the tables and figures found in the pages as images.

In [None]:
# Export each table and each picture of the document as a separate PNG in
# output_dir. Tables and pictures are numbered independently, starting at 1,
# in the order iterate_items() yields them.
table_counter = 0
picture_counter = 0
for item, _level in result.document.iterate_items():
    # Collect the file name(s) this item has to be written to.
    image_names = []
    if isinstance(item, TableItem):
        table_counter += 1
        image_names.append(f"pwc-my-centrestage-budget-2025-table-{table_counter}.png")
    if isinstance(item, PictureItem):
        picture_counter += 1
        image_names.append(f"pwc-my-centrestage-budget-2025-picture-{picture_counter}.png")

    for image_name in image_names:
        image_path = output_dir / image_name
        with image_path.open("wb") as image_file:
            item.get_image(result.document).save(image_file, "PNG")

## Save to markdown file

This function also saves the figures as images and references them from the markdown file.

In [6]:
# Save the document as a markdown file inside output_dir, using the
# REFERENCED image mode.
md_filename = output_dir.joinpath(
    "pwc-my-centrestage-budget-2025-picture-with-image-refs.md"
)
result.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

## Print markdown directly

This function does NOT save the figures as images or reference them; each figure appears only as an `<!-- image -->` placeholder.

In [8]:
# Render the document as a single markdown string, using an HTML comment as
# the page-break placeholder so the text can be split into pages afterwards.
md = result.document.export_to_markdown(
    page_break_placeholder="<!-- page break -->",
)
print(md)

<!-- image -->

Centre Stage:

## Budget 2025 Overview

<!-- image -->

<!-- image -->

<!-- page break -->

<!-- image -->

Foreword Page 3

Budget snapshot

Economic snapshot

Page 4

Key Budget measures

Page 8

Page 5

Tax proposal highlights Page 17

Economic outlook

Page 6

<!-- page break -->

<!-- image -->

## Foreword

'Budget 2025 represents the government's day in the sun, having navigated through the economic challenges of the past few years. These measures provide a strong foundation for economic resilience, support for families and streamlined tax administration.'

Malaysia is currently buoyed by the recent strong economic growth, manageable inflation, and a robust Malaysian Ringgit against the US Dollar. Budget 2025 reflects the government's readiness to implement sustainable strategies to further fuel the economy while addressing the cost of living for the B40 and M40 groups.

## Tax reliefs focusing on care

## Expanding the revenue base

Budget 2025 avoids major new

In [12]:
# Strip the "<!-- image -->" placeholders from the exported markdown.
# The first replace removes each placeholder together with the blank line
# that follows it. The second removes any placeholder that is not followed by
# a blank line (for example one at the very end of the text), which the first
# pattern cannot match and would otherwise leave behind.
md_remove_img = md.replace("<!-- image -->\n\n", "").replace("<!-- image -->", "")
print(md_remove_img)

Centre Stage:

## Budget 2025 Overview

<!-- page break -->

Foreword Page 3

Budget snapshot

Economic snapshot

Page 4

Key Budget measures

Page 8

Page 5

Tax proposal highlights Page 17

Economic outlook

Page 6

<!-- page break -->

## Foreword

'Budget 2025 represents the government's day in the sun, having navigated through the economic challenges of the past few years. These measures provide a strong foundation for economic resilience, support for families and streamlined tax administration.'

Malaysia is currently buoyed by the recent strong economic growth, manageable inflation, and a robust Malaysian Ringgit against the US Dollar. Budget 2025 reflects the government's readiness to implement sustainable strategies to further fuel the economy while addressing the cost of living for the B40 and M40 groups.

## Tax reliefs focusing on care

## Expanding the revenue base

Budget 2025 avoids major new taxes, while introducing measures to expand Malaysia's revenue base. The scope 

In [20]:
# Split the markdown into one string per page.
# The split uses the bare page-break marker: the blank lines around it are
# removed by strip() anyway, and a marker that is not followed by a blank line
# (for example at the very end of the text) is still treated as a page
# boundary instead of leaking into the page content.
md_split = [pg.strip() for pg in md_remove_img.split("<!-- page break -->")]

# Print the pages with 1-based page numbers.
for page_number, pg in enumerate(md_split, start=1):
    print(f"PAGE {page_number}:\n")
    print(pg)

PAGE 1:

Centre Stage:

## Budget 2025 Overview
PAGE 2:

Foreword Page 3

Budget snapshot

Economic snapshot

Page 4

Key Budget measures

Page 8

Page 5

Tax proposal highlights Page 17

Economic outlook

Page 6
PAGE 3:

## Foreword

'Budget 2025 represents the government's day in the sun, having navigated through the economic challenges of the past few years. These measures provide a strong foundation for economic resilience, support for families and streamlined tax administration.'

Malaysia is currently buoyed by the recent strong economic growth, manageable inflation, and a robust Malaysian Ringgit against the US Dollar. Budget 2025 reflects the government's readiness to implement sustainable strategies to further fuel the economy while addressing the cost of living for the B40 and M40 groups.

## Tax reliefs focusing on care

## Expanding the revenue base

Budget 2025 avoids major new taxes, while introducing measures to expand Malaysia's revenue base. The scope of the Sales and 