In [37]:
import os
from typing import Any

from dotenv import load_dotenv
from pydantic import BaseModel

# Kept ahead of the project imports, as in the original ordering, so the
# variables from .env are in the environment before document_ai is imported.
load_dotenv()

from document_ai import DocumentProcessor
from document_ai.extractor import PDFExtractor
from document_ai.formatter import PDFFormatter
from document_ai.llm import OpenAILLM
from document_ai.parser import DigitalPDFParser
from document_ai.schemas import Document
from document_ai.schemas import Mode

In [38]:
# LLM client handed to PDFExtractor in the next cell; built with no arguments.
# NOTE(review): presumably reads its API credentials from the environment
# populated by load_dotenv() above — confirm against OpenAILLM.
llm = OpenAILLM()

In [39]:
# Directory containing the sample documents. Override it by setting
# DOCUMENT_AI_DATA_DIR (in the shell or in .env) so the notebook runs on other
# machines; the default is the original author's local checkout, so behaviour
# is unchanged when the variable is not set.
DATA_DIR = os.environ.get(
    "DOCUMENT_AI_DATA_DIR",
    "/Users/zeel/Public/ms/open_source/document_ai/data",
)
STATEMENT_URI = os.path.join(DATA_DIR, "Bank_eStmt_2025-07-14.pdf")

# Wire a parser, a formatter and an LLM-backed extractor around one document.
processor = DocumentProcessor(
    parser=DigitalPDFParser(),
    formatter=PDFFormatter(),
    extractor=PDFExtractor(llm),
    document=Document(
        document_type="digital_pdf",
        uri=STATEMENT_URI,
    ),
    # mode=Mode(paginated=False, include_line_numbers=True)
)


In [40]:
# Parse the document held by the processor; the return value is not used here.
# NOTE(review): presumably this must run before format_document_for_llm() — confirm.
processor.parse()
# TODO: Implement generator for this
# The result is joined with "\n\n" in the next cell, so it is used there as an
# iterable of strings.
llm_input = processor.format_document_for_llm()

In [41]:
# Collapse the formatted chunks into a single prompt string, with a blank line
# between chunks. The isinstance guard keeps this cell idempotent: on a re-run
# `llm_input` is already a str, and str.join over a str would insert "\n\n"
# between every single character.
if not isinstance(llm_input, str):
    llm_input = "\n\n".join(llm_input)

In [42]:
# processor.document.content.pages[0].lines

In [43]:
# system_prompt = """You are a helpful assistant that extracts information from a document."""
# llm_input = "\n\n".join(llm_input)
# user_prompt = f"""Extract the ending balance from the bank statement, along with the page number and line number where it is mentioned.
# Document:
# {llm_input}
# """

In [44]:
# Response schema passed to processor.extract() as `response_format`.
# Despite its name it carries both the starting and the ending balance, each
# paired with a citation field.
class EndingBalance(BaseModel):
    # Ending balance as a number.
    ending_balance: float
    # Declared as Any here. The schema echoed in this cell's output describes
    # it as a list of {"page": int, "lines": [int, ...]} objects.
    # NOTE(review): presumably the library rewrites *_citation fields — confirm.
    ending_balance_citation: Any
    # Starting balance as a number.
    start_balance: float
    # Citation for the starting balance; same shape as the ending-balance one
    # in the echoed schema.
    start_balance_citation: Any


# Request settings pulled out of the call so they are easy to spot and tweak.
extraction_model = "gpt-5-mini"
reasoning_options = {"effort": "low"}

# Custom `system_prompt=` / `user_prompt=` overrides stay disabled, matching
# the commented-out prompt cell earlier in the notebook.
response = processor.extract(
    llm_input=llm_input,
    response_format=EndingBalance,
    reasoning=reasoning_options,
    model=extraction_model,
)

{'$defs': {'Citation': {'description': 'Citation dict for page and line number references.', 'properties': {'page': {'title': 'Page', 'type': 'integer'}, 'lines': {'items': {'type': 'integer'}, 'title': 'Lines', 'type': 'array'}}, 'required': ['page', 'lines'], 'title': 'Citation', 'type': 'object'}}, 'properties': {'ending_balance': {'title': 'Ending Balance', 'type': 'number'}, 'ending_balance_citation': {'description': 'This is used to cite the page number and line number where the information is mentioned in the document.\nFor example:\n[{"page": 1, "lines": [10, 11]}, {"page": 2, "lines": [20]}]', 'items': {'$ref': '#/$defs/Citation'}, 'title': 'Ending Balance Citation', 'type': 'array'}, 'start_balance': {'title': 'Start Balance', 'type': 'number'}, 'start_balance_citation': {'description': 'This is used to cite the page number and line number where the information is mentioned in the document.\nFor example:\n[{"page": 1, "lines": [10, 11]}, {"page": 2, "lines": [20]}]', 'items':

In [45]:
# Show the extracted fields as a plain dict; as the cell's last expression it
# is displayed as the cell output.
response.model_dump()

{'ending_balance': 111.61,
 'ending_balance_citation': [{'page': 0, 'lines': [18]}],
 'start_balance': 610.52,
 'start_balance_citation': [{'page': 0, 'lines': [13]}]}

In [46]:
# Print the exact text passed as `llm_input`, with its page tags and per-line
# numbers, so the cited page/line values above can be checked by eye.
print(llm_input)

<page number=0>
0: P.O. Box 15284 Customer service information
1: Wilmington, DE 19850
2: Customer service: 1.800.432.1000
3: En Español: 1.800.688.6086
4: ZEEL DIPENKUMAR THUMAR bankofamerica.com
5: 230 CAMPBELL AVE
6: WEST HAVEN, CT 06516-5338 Bank of America, N.A.
7: P.O. Box 25118
8: Tampa, FL 33622-5118
9: Your Adv SafeBalance Banking
10: for June 11, 2025 to July 14, 2025 Account number: 3850 3327 1835
11: ZEEL DIPENKUMAR THUMAR
12: Account summary
13: Beginning balance on June 11, 2025 $610.52
14: Deposits and other additions 0.00
15: ATM and debit card subtractions -418.91
16: Other subtractions -80.00
17: Service fees -0.00
18: Ending balance on July 14, 2025 $111.61
19: Invest in their future - open a 529 plan
20: The future starts now at merrilledge.com/529plan
21: Scan the code to learn more.
22: When you use the QRC feature, certain information is collected from your mobile device for business
23: purposes. Merrill Lynch, Pierce, Fenner & Smith Incorporated (also referred 