In [1]:
import os

from dotenv import load_dotenv
from pydantic import BaseModel

# Kept ahead of the document_ai imports, matching the original ordering.
load_dotenv()

from document_ai import DocumentProcessor
from document_ai.extractor import PDFExtractor
from document_ai.formatter import PDFFormatter
from document_ai.llm import OpenAILLM
from document_ai.parser import DigitalPDFParser
from document_ai.schemas import Document
from document_ai.schemas import Mode

In [2]:
# Structured result requested from the LLM: this model is passed as
# `response_format` to `processor.extract(...)` further down.
class EndingBalance(BaseModel):
    # The statement's ending balance as a plain number.
    ending_balance: float
    # Page on which the balance was found.
    # NOTE(review): the saved output shows 0 here, which suggests zero-based
    # page numbering -- confirm against the formatter's output.
    page_number_citation: int
    # Line number(s) on that page supporting the value; the processor is
    # configured with include_line_numbers=True so the LLM can cite them.
    line_number_citations: list[int]
    

In [3]:
# LLM client used by the extractor, constructed with no explicit arguments.
# NOTE(review): presumably reads its credentials from the environment that
# load_dotenv() populated in the first cell -- confirm in document_ai.llm.
llm = OpenAILLM()

In [4]:
# Location of the bank statement to process.
# The path used to be hard-coded to one developer's home directory, which
# breaks the notebook on any other machine. It can now be overridden with the
# DOCUMENT_AI_STATEMENT_PDF environment variable (for example from the .env
# file loaded in the first cell). When the variable is unset or empty, the
# original path is used, so existing behaviour is unchanged.
statement_uri = os.environ.get("DOCUMENT_AI_STATEMENT_PDF") or (
    "/Users/zeel/Public/ms/open_source/document_ai/data/Bank_eStmt_2025-07-14.pdf"
)

# Wire the pipeline together: parser -> formatter -> LLM-backed extractor,
# all operating on a single digital PDF document.
processor = DocumentProcessor(
    parser=DigitalPDFParser(),
    formatter=PDFFormatter(),
    extractor=PDFExtractor(llm),
    document=Document(
        document_type="digital_pdf",
        uri=statement_uri,
    ),
    # Line numbers are requested because the response schema asks the LLM to
    # cite the line(s) it took the value from.
    mode=Mode(paginated=False, include_line_numbers=True),
)


In [5]:
# Parse the configured document. Later cells read
# processor.document.content.pages, so parsing has to run first.
# NOTE(review): presumably parse() is what populates document.content -- confirm.
processor.parse()
# Formatted representation of the parsed document; it is interpolated into
# the user prompt further down.
llm_input = processor.formatted_input_for_llm()

In [13]:
# Inspection only: show the parsed lines (text + bounding box) of the first page.
# NOTE(review): the rendered output includes the account holder's name and
# address -- clear or redact this cell's output before sharing the notebook.
processor.document.content.pages[0].lines

[Line(text='P.O. Box 15284 Customer service information', bounding_box=BoundingBox(x0=0.06470588235294118, top=0.10988135363636356, x1=0.8684772147058823, bottom=0.12555682818181813)),
 Line(text='Wilmington, DE 19850', bounding_box=BoundingBox(x0=0.06470588235294118, top=0.12109347484848476, x1=0.18169214196078434, bottom=0.12993180818181824)),
 Line(text='Customer service: 1.800.432.1000', bounding_box=BoundingBox(x0=0.6529411764705881, top=0.15628910090909087, x1=0.8773563370588234, bottom=0.16828410090909088)),
 Line(text='En Español: 1.800.688.6086', bounding_box=BoundingBox(x0=0.6529411764705881, top=0.17992546454545455, x1=0.8357859005882352, bottom=0.19192046454545456)),
 Line(text='ZEEL DIPENKUMAR THUMAR bankofamerica.com', bounding_box=BoundingBox(x0=0.07352941176470588, top=0.19982575757575766, x1=0.7804466152941176, bottom=0.21373864636363635)),
 Line(text='230 CAMPBELL AVE', bounding_box=BoundingBox(x0=0.07352941176470588, top=0.21346212121212121, x1=0.20114705882352943, 

In [7]:
# Role instruction sent as the system message.
system_prompt = "You are a helpful assistant that extracts information from a document."

# Task instruction followed by the formatted document text. The pieces are
# adjacent string literals, so they concatenate into one prompt string.
user_prompt = (
    "Extract the ending balance from bank statement along with the page number and line number its mentioned in.\n"
    "Document:\n"
    f"{llm_input}\n"
)

In [8]:
# Collect the extraction settings in one place, then run the extraction.
# The response is requested in the shape of the EndingBalance model.
extraction_request = {
    "model": "gpt-5-mini",
    "reasoning": {"effort": "low"},
    "response_format": EndingBalance,
    "system_prompt": system_prompt,
    "user_prompt": user_prompt,
}
response = processor.extract(**extraction_request)

In [9]:
# Display the extracted result as a plain dict (value plus page/line citations).
response.model_dump()

{'ending_balance': 111.61,
 'page_number_citation': 0,
 'line_number_citations': [18]}