In [11]:
from dotenv import load_dotenv

load_dotenv()

True

In [12]:
from langchain_core.output_parsers import JsonOutputParser
from openai import OpenAI
from pydantic import BaseModel, Field

from doc_intelligence.base import Document
from doc_intelligence.formatter import DigitalPDFFormatter
from doc_intelligence.parser import DigitalPDFParser
from doc_intelligence.pydantic_to_json_instance_schema import (
    pydantic_to_json_instance_schema,
    stringify_schema,
)

In [3]:
import os

# Location of the sample statement: <parent of the working directory>/data/...
# Built with os.path so the platform's separator is respected; splitting the
# working directory on a literal "/" only works for POSIX-style paths.
pdf_path = os.path.join(
    os.path.dirname(os.getcwd()), "data", "Bank_eStmt_2025-07-14.pdf"
)

In [4]:
# Wrap the statement's path in the project's Document type; this object is
# what gets handed to parser.parse() further down.
document = Document(uri=pdf_path)

In [13]:
# One-time setup of the objects the rest of the notebook relies on.
# NOTE(review): OpenAI() is constructed without arguments — presumably it picks
# up credentials from the environment prepared by load_dotenv(); confirm.
client = OpenAI()
# Turns the model's text output into a Python dict (see json_parser.parse below).
json_parser = JsonOutputParser()
# Parses the Document into the `pdf` object used by the formatter and bbox step.
parser = DigitalPDFParser()
# Renders the parsed PDF as text that is embedded in the prompt.
formatter = DigitalPDFFormatter()

In [6]:
# Parse the document. The result is consumed twice later on: by the formatter
# (to build the prompt text) and by enrich_citations_with_bboxes.
pdf = parser.parse(document)

In [7]:
# Render the parsed PDF as plain text for the prompt. With
# include_line_numbers=True each line is prefixed with its index inside a
# <page number=N> section (see the printed output), which gives the model
# page/line coordinates to cite.
formatted_pdf_content = formatter.format_document_for_llm(
    pdf, include_line_numbers=True
)

In [8]:
print(formatted_pdf_content)

<page number=0>
0: P.O. Box 15284 Customer service information
1: Wilmington, DE 19850
2: Customer service: 1.800.432.1000
3: En Español: 1.800.688.6086
4: ZEEL DIPENKUMAR THUMAR bankofamerica.com
5: 230 CAMPBELL AVE
6: WEST HAVEN, CT 06516-5338 Bank of America, N.A.
7: P.O. Box 25118
8: Tampa, FL 33622-5118
9: Your Adv SafeBalance Banking
10: for June 11, 2025 to July 14, 2025 Account number: 3850 3327 1835
11: ZEEL DIPENKUMAR THUMAR
12: Account summary
13: Beginning balance on June 11, 2025 $610.52
14: Deposits and other additions 0.00
15: ATM and debit card subtractions -418.91
16: Other subtractions -80.00
17: Service fees -0.00
18: Ending balance on July 14, 2025 $111.61
19: Invest in their future - open a 529 plan
20: The future starts now at merrilledge.com/529plan
21: Scan the code to learn more.
22: When you use the QRC feature, certain information is collected from your mobile device for business
23: purposes. Merrill Lynch, Pierce, Fenner & Smith Incorporated (also referred 

In [9]:
# Extraction target for this notebook: the one field to pull from the statement.
# Documented with comments rather than a docstring.
# NOTE(review): a docstring on a pydantic model may surface in generated
# schemas — confirm before adding one.
class Balance(BaseModel):
    # Required float; the description shows up next to the field in the
    # printed schema ("# desc: ending balance").
    ending_balance: float = Field(..., description="ending balance")

In [10]:
# Build the textual output schema that is pasted into the prompt. With
# citation=True each field appears as {"value": ..., "citations": [{"page",
# "lines"}]} (see the printed schema below).
schema = stringify_schema(pydantic_to_json_instance_schema(Balance, citation=True))

In [11]:
print(schema)

{
    "ending_balance": {
        "value": <number>,  # desc: ending balance
        "citations": [{"page": <integer>, "lines": [<integer>]}]
    }
}


In [12]:
# Ask the model to fill in the output schema from the line-numbered document
# text. The prompt embeds both the formatted document and the schema string.
# The f-string opens with exactly three quotes: a fourth quote would become
# the first character of the prompt sent to the model.
response = client.responses.create(
    model="gpt-5.2",
    input=f"""Your job is to extract the structured data described in the schema from the document given below.

DOCUMENT:
{formatted_pdf_content}

OUTPUT SCHEMA:
{schema}

Generate output in JSON format.
""",
)

In [13]:
# Parse the model's text output into a Python dict (displayed in the next cell).
response_dict = json_parser.parse(response.output_text)

In [14]:
response_dict

{'ending_balance': {'value': 111.61,
  'citations': [{'page': 0, 'lines': [18]}]}}

In [15]:
from doc_intelligence.utils import enrich_citations_with_bboxes

In [16]:
# Resolve the cited page/line references against the parsed PDF. In the
# displayed result each citation carries "bboxes" (x0, top, x1, bottom) in
# place of "lines".
# NOTE(review): the coordinates look normalised to the 0-1 range — confirm.
enrich_citations_with_bboxes(response_dict, pdf)

{'ending_balance': {'value': 111.61,
  'citations': [{'page': 0,
    'bboxes': [{'x0': 0.058823529411764705,
      'top': 0.6095707475757575,
      'x1': 0.5635455037254902,
      'bottom': 0.6221969596969696}]}]}}

In [1]:
from doc_intelligence.utils import strip_citations

In [4]:
import json

# JSON text matching the bbox-enriched extraction result shown earlier in the
# notebook; presumably kept here so the cells below can run without repeating
# the parsing and model-call steps.
enriched_response_json = """{"ending_balance": {"value": 111.61,
  "citations": [{"page": 0,
    "bboxes": [{"x0": 0.058823529411764705,
      "top": 0.6095707475757575,
      "x1": 0.5635455037254902,
      "bottom": 0.6221969596969696}]}]}}"""

# Decode the text into the nested dict used by strip_citations below.
response_dict = json.loads(enriched_response_json)

In [9]:
def _cited(value, page, lines):
    """Wrap ``value`` in the value/citations envelope with a single citation."""
    return {"value": value, "citations": [{"page": page, "lines": list(lines)}]}


# Hand-written sample covering the three shapes of interest: a list of cited
# values, a single cited value, and cited values nested inside a plain dict.
data = {
    "ids": [_cited(amount, 1, [1]) for amount in (101.09, 205.09, 309.09)],
    "name": _cited("Zeel", 1, [1, 2]),
    "address": {
        "street": _cited("742 Evergreen Terrace", 2, [1]),
        "city": _cited("Springfield", 2, [1]),
        "zipcode": _cited("62704", 2, [2]),
    },
}

# Drop the value/citations envelopes: per the displayed result, each cited
# entry collapses to its bare value while lists and nested dicts keep their shape.
strip_citations(data)

{'ids': [101.09, 205.09, 309.09],
 'name': 'Zeel',
 'address': {'street': '742 Evergreen Terrace',
  'city': 'Springfield',
  'zipcode': '62704'}}

In [5]:
# Same call on the extraction result: the displayed output keeps only the
# field's value and drops the citation/bbox data.
strip_citations(response_dict)

{'ending_balance': 111.61}

In [None]:
# Sanity check: the Balance model defined earlier in this notebook is a
# pydantic model. The class is deliberately not redefined here — a second
# definition would silently shadow the first — and BaseModel comes from the
# notebook's import cell.
issubclass(Balance, BaseModel)

True