In [None]:
import sys
import logging
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from typing import Iterator

from docling.document_converter import DocumentConverter
from docling.document_converter import PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.pipeline_options import TableFormerMode

from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.document import PictureItem
from docling_core.types.doc.document import TableItem
from docling_core.types.doc.document import TextItem
from docling_core.types.doc.document import DoclingDocument
from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
from docling_core.transforms.chunker.base import BaseChunker
from docling_core.transforms.chunker.base import BaseChunk
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

# PDF pipeline settings: OCR, table-structure recovery in ACCURATE mode,
# code and formula enrichment, picture classification, and generation of
# picture images.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
pipeline_options.do_code_enrichment = True
pipeline_options.do_formula_enrichment = True
pipeline_options.do_picture_classification = True
pipeline_options.generate_picture_images = True

# The converter accepts PDF input only and uses the pypdfium2 backend
# together with the pipeline options configured above.
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF],
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=PyPdfiumDocumentBackend,
        ),
    },
)

# Show INFO-level log records (conversion progress) in the notebook output.
logging.basicConfig(level=logging.INFO)

# Directory that receives exported artifacts; created on first run.
output_dir = Path('Output')
output_dir.mkdir(exist_ok=True)

In [8]:
# Source PDF to convert. The commented-out URL is an alternative input
# (presumably the Docling paper on arXiv, judging by the variable name).
#docling_paper = 'https://arxiv.org/pdf/2501.17887'
sample_doc_url = 'https://midwestfoodbank.org/images/AR_2020_WEB2.pdf'

# Convert the document at the URL with the converter built in the setup cell.
# The recorded run logged roughly 21 seconds for this file.
result = converter.convert(sample_doc_url)
# Keep the converted document; the cells below export and inspect it.
doc = result.document

2025-11-06 19:08:52,752 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-06 19:08:52,759 - INFO - Going to convert document batch...
2025-11-06 19:08:52,759 - INFO - Processing document AR_2020_WEB2.pdf
[32m[INFO] 2025-11-06 19:08:53,592 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/resources/fonts/FZYTK.TTF[0m
[32m[INFO] 2025-11-06 19:08:55,531 [RapidOCR] download_file.py:82: Download size: 3.09MB[0m
[32m[INFO] 2025-11-06 19:08:56,250 [RapidOCR] download_file.py:95: Successfully saved to: /home/torben/Huginn/.venv/lib/python3.12/site-packages/rapidocr/models/FZYTK.TTF[0m
2025-11-06 19:09:13,178 - INFO - Finished converting document AR_2020_WEB2.pdf in 20.75 sec.


In [10]:
# Export the converted document to Markdown and preview the first
# 2000 characters of the result.
md_out = doc.export_to_markdown()
print(md_out[:2000])

<!-- image -->

## ANNUAL REPORT 2020

<!-- image -->

bridging the gap between poverty and prosperity

<!-- image -->

## A message from Co-Founder, President, and CEO, David Kieser

No one could have predicted the events of 2020. The global COVID-19 pandemic created a dynamic year. With the help of volunteers, donors, staff, and most importantly, the blessings of God, Midwest Food Bank responded nimbly to the changing landscape.

All MFB locations remained open and responsive to the need of our nonprofit partners. We enacted safety protocols and reduced volunteer numbers to maintain social distancing guidelines. To allow partner agencies to receive food from MFB safely, we altered our distribution model. Community, business, and donor support funded operations and helped with food purchases. More details on our response to the pandemic are on page 14.

Noteworthy in 2020:

- MFB distributed a record amount of food, 37% more than in 2019.
- In 2020, we sent a record number of family f

In [11]:
# Headline counts for the converted document.
print(f"Document title: {doc.name}")
print(f"Number of pages: {len(doc.pages)}")
print(f"Number of tables: {len(doc.tables)}")
print(f"Number of pictures: {len(doc.pictures)}")

# Preview limits: how many items to list and how much of each item's text
# to show.
MAX_PREVIEW_ITEMS = 20
MAX_PREVIEW_CHARS = 200

print('Document structure')
for i, (item, level) in enumerate(doc.iterate_items()):
    if i >= MAX_PREVIEW_ITEMS:
        # Stop as soon as the preview is full instead of walking the rest of
        # the document without printing anything.
        break
    item_type = type(item).__name__
    # Items without a `text` attribute (e.g. the PictureItem rows in the
    # recorded output) are shown as 'N/A'.
    text_preview = item.text[:MAX_PREVIEW_CHARS] if hasattr(item, 'text') else 'N/A'
    # Indent by nesting level so the hierarchy is visible in the listing.
    print(f'{"  " * level}- {item_type}: {text_preview}')

    

Document title: AR_2020_WEB2
Number of pages: 20
Number of tables: 2
Number of pictures: 53
Document structure
  - PictureItem: N/A
  - SectionHeaderItem: ANNUAL REPORT 2020
  - PictureItem: N/A
  - TextItem: bridging the gap between poverty and prosperity
  - PictureItem: N/A
  - SectionHeaderItem: A message from Co-Founder, President, and CEO, David Kieser
  - TextItem: No one could have predicted the events of 2020. The global COVID-19 pandemic created a dynamic year. With the help of volunteers, donors, staff, and most importantly, the blessings of God, Midwest Foo
  - TextItem: All MFB locations remained open and responsive to the need of our nonprofit partners. We enacted safety protocols and reduced volunteer numbers to maintain social distancing guidelines. To allow partn
  - TextItem: Noteworthy in 2020:
    - ListItem: MFB distributed a record amount of food, 37% more than in 2019.
    - ListItem: In 2020, we sent a record number of family food boxes in Disaster Relief semi l

In [12]:
# Persist the converted document as JSON inside the output directory.
# Alternative output name, kept for reference:
#doc.save_as_json(output_dir / 'docling_paper.json')
doc.save_as_json(output_dir.joinpath('annual_report.json'))
