In [1]:
from dotenv import load_dotenv
load_dotenv()
from document_ai.llm import OpenAILLM
from pydantic import BaseModel
from document_ai import DocumentProcessor
from document_ai.parser import DigitalPDFParser
from document_ai.formatter import PDFFormatter
from document_ai.schemas import Document
from document_ai.extractor import PDFExtractor
from document_ai.schemas import Mode
from typing import Any

In [2]:
# Build the LLM client that is handed to the PDF extractor.
# NOTE(review): presumably picks up its API credentials from the environment
# populated by load_dotenv() in the imports cell — confirm.
llm = OpenAILLM()

In [3]:
# Location of the statement PDF, relative to the notebook's working directory.
# The same path is reused at the end of the notebook to render the page image.
uri="../data/Bank_eStmt_2025-07-14.pdf"
# Wire a parser, a formatter and the LLM-backed extractor around one document.
processor = DocumentProcessor(
    parser=DigitalPDFParser(),
    formatter=PDFFormatter(),
    extractor=PDFExtractor(llm),
    document=Document(
        document_type="digital_pdf",
        uri=uri
    ),
    # No explicit mode is passed; the override below is kept for reference only.
    # mode=Mode(paginated=False, include_line_numbers=True)
)


In [4]:
# Parse the PDF.
# NOTE(review): presumably this populates processor.document.content, which is
# read later when citations are enriched with bounding boxes — confirm.
processor.parse()
# TODO: Implement generator for this
# Produce the text chunks for the LLM; the result is an iterable of strings
# (it is joined with "\n\n" in the following cell).
llm_input = processor.format_document_for_llm()

In [5]:
# Collapse the formatted chunks into a single prompt string.
# The isinstance guard keeps this cell idempotent: an unguarded join applied to
# an already-joined string would iterate over its characters and insert "\n\n"
# between every one of them if the cell were run a second time.
if not isinstance(llm_input, str):
    llm_input = "\n\n".join(llm_input)

In [6]:
# processor.document.content.pages[0].lines

In [7]:
# system_prompt = """You are a helpful assistant that extracts information from a document."""
# llm_input = "\n\n".join(llm_input)
# user_prompt = f"""Extract the ending balance from bank statement along with the page number and line number its mentioned in.
# Document:
# {llm_input}
# """

In [8]:
# Structured-output schema handed to processor.extract() as `response_format`.
# Documented with `#` comments on purpose: pydantic copies a model's docstring
# into the generated JSON schema description, so a docstring here would change
# the schema that is sent along with the request.
class EndingBalance(BaseModel):
    # Balance at the end of the statement period (111.61 in the run below).
    ending_balance: float
    # Declared as Any; in the returned data this is a list of dicts with
    # "page", "lines" and "bboxes" keys.
    ending_balance_citation: Any
    # Balance at the start of the statement period (610.52 in the run below).
    start_balance: float
    # Same shape as ending_balance_citation.
    start_balance_citation: Any


# Run the structured extraction over the joined document text.
# The result is used as a pydantic model below (.model_dump() is called on it).
response = processor.extract(
    model="gpt-5-mini",
    reasoning={"effort": "low"},
    response_format=EndingBalance,
    # Custom prompts are disabled.
    # NOTE(review): presumably the extractor falls back to its own default
    # prompts when these are omitted — confirm.
    # system_prompt=system_prompt,
    # user_prompt=user_prompt,
    llm_input=llm_input,
)

{'$defs': {'Citation': {'description': 'Citation dict for page and line number references.', 'properties': {'page': {'title': 'Page', 'type': 'integer'}, 'lines': {'items': {'type': 'integer'}, 'title': 'Lines', 'type': 'array'}}, 'required': ['page', 'lines'], 'title': 'Citation', 'type': 'object'}}, 'properties': {'ending_balance': {'title': 'Ending Balance', 'type': 'number'}, 'ending_balance_citation': {'description': 'This is used to cite the page number and line number where the information is mentioned in the document.\nFor example:\n[{"page": 1, "lines": [10, 11]}, {"page": 2, "lines": [20]}]', 'items': {'$ref': '#/$defs/Citation'}, 'title': 'Ending Balance Citation', 'type': 'array'}, 'start_balance': {'title': 'Start Balance', 'type': 'number'}, 'start_balance_citation': {'description': 'This is used to cite the page number and line number where the information is mentioned in the document.\nFor example:\n[{"page": 1, "lines": [10, 11]}, {"page": 2, "lines": [20]}]', 'items':

In [9]:
# Show the extraction result as a plain dict (values plus their citations).
response.model_dump()

{'ending_balance': 111.61,
 'ending_balance_citation': [{'page': 0,
   'lines': [18],
   'bboxes': [{'x0': 0.058823529411764705,
     'top': 0.6095707475757575,
     'x1': 0.5635455037254902,
     'bottom': 0.6221969596969696}]}],
 'start_balance': 610.52,
 'start_balance_citation': [{'page': 0,
   'lines': [13],
   'bboxes': [{'x0': 0.058823529411764705,
     'top': 0.49401637363636364,
     'x1': 0.5639691831372549,
     'bottom': 0.5060113736363636}]}]}

In [10]:
# Dump the exact text that was sent to the model. Each line carries the index
# that citations refer to (e.g. line 18 on page 0 is the ending balance).
# NOTE(review): the captured output contains the account holder's name, postal
# address and account number — clear or redact it before sharing the notebook.
print(llm_input)

<page number=0>
0: P.O. Box 15284 Customer service information
1: Wilmington, DE 19850
2: Customer service: 1.800.432.1000
3: En Español: 1.800.688.6086
4: ZEEL DIPENKUMAR THUMAR bankofamerica.com
5: 230 CAMPBELL AVE
6: WEST HAVEN, CT 06516-5338 Bank of America, N.A.
7: P.O. Box 25118
8: Tampa, FL 33622-5118
9: Your Adv SafeBalance Banking
10: for June 11, 2025 to July 14, 2025 Account number: 3850 3327 1835
11: ZEEL DIPENKUMAR THUMAR
12: Account summary
13: Beginning balance on June 11, 2025 $610.52
14: Deposits and other additions 0.00
15: ATM and debit card subtractions -418.91
16: Other subtractions -80.00
17: Service fees -0.00
18: Ending balance on July 14, 2025 $111.61
19: Invest in their future - open a 529 plan
20: The future starts now at merrilledge.com/529plan
21: Scan the code to learn more.
22: When you use the QRC feature, certain information is collected from your mobile device for business
23: purposes. Merrill Lynch, Pierce, Fenner & Smith Incorporated (also referred 

In [11]:
from document_ai.utils import enrich_citations_with_bboxes
# Attach bounding boxes to every citation, using the parsed document content.
# The return value is a plain dict: it is indexed with string keys further down.
# NOTE(review): the response.model_dump() output earlier in the notebook already
# shows "bboxes" on each citation, so this step may be redundant — confirm.
enriched = enrich_citations_with_bboxes(response, processor.document.content)

In [12]:
# Inspect the enriched result (balances plus citations with bounding boxes).
enriched

{'ending_balance': 111.61,
 'ending_balance_citation': [{'page': 0,
   'lines': [18],
   'bboxes': [{'x0': 0.058823529411764705,
     'top': 0.6095707475757575,
     'x1': 0.5635455037254902,
     'bottom': 0.6221969596969696}]}],
 'start_balance': 610.52,
 'start_balance_citation': [{'page': 0,
   'lines': [13],
   'bboxes': [{'x0': 0.058823529411764705,
     'top': 0.49401637363636364,
     'x1': 0.5639691831372549,
     'bottom': 0.5060113736363636}]}]}

In [13]:
from document_ai.utils import add_appropriate_citation_type
from typing_extensions import TypedDict
# Shape of one enriched citation entry. Documented with `#` comments rather
# than a docstring so that nothing extra ends up in any schema derived from it.
class Citation(TypedDict):
    page: int  # page index, matching the <page number=N> tags in the LLM input
    lines: list[int]  # cited line indices on that page
    bboxes: list[dict[str, Any]]  # boxes keyed by "x0", "top", "x1", "bottom"
# Each citation field holds a list of Citation entries.
CitationType = list[Citation]
# Build the model used to validate the enriched dict.
# NOTE(review): presumably this swaps the Any-typed *_citation fields of
# EndingBalance for CitationType — confirm against document_ai.utils.
final_response_format = add_appropriate_citation_type(EndingBalance, CitationType)

In [14]:
# Validate the enriched dict by instantiating the citation-typed model from it.
final_response_format(**enriched)

EndingBalance(ending_balance=111.61, ending_balance_citation=[{'page': 0, 'lines': [18], 'bboxes': [{'x0': 0.058823529411764705, 'top': 0.6095707475757575, 'x1': 0.5635455037254902, 'bottom': 0.6221969596969696}]}], start_balance=610.52, start_balance_citation=[{'page': 0, 'lines': [13], 'bboxes': [{'x0': 0.058823529411764705, 'top': 0.49401637363636364, 'x1': 0.5639691831372549, 'bottom': 0.5060113736363636}]}])

In [15]:
import fitz  # PyMuPDF
from PIL import Image, ImageDraw
import io

def display_page_with_bbox(pdf_path, page_number, normalized_bbox):
    """
    Render one PDF page to an image and outline a bounding box on it in red.

    The PDF is always closed before returning or raising, including when the
    page number is rejected or rendering fails.

    Args:
        pdf_path (str): Path to the PDF file
        page_number (int): Page number (0-indexed)
        normalized_bbox (tuple): Normalized bounding box as (x0, top, x1, bottom)
                                 where values are between 0 and 1; it must
                                 unpack into exactly four numbers

    Returns:
        PIL.Image: The page image with red bbox annotation

    Raises:
        ValueError: If page_number is outside the document's page range
    """
    zoom = 2.0  # 2x zoom for better quality

    doc = fitz.open(pdf_path)
    try:
        # Check if page number is valid
        if page_number < 0 or page_number >= len(doc):
            raise ValueError(f"Invalid page number. PDF has {len(doc)} pages (0-indexed)")

        page = doc[page_number]

        # Page dimensions, used to de-normalize the bbox
        page_rect = page.rect
        page_width = page_rect.width
        page_height = page_rect.height

        # Convert normalized bbox to page coordinates
        x0_norm, top_norm, x1_norm, bottom_norm = normalized_bbox
        x0 = x0_norm * page_width
        top = top_norm * page_height
        x1 = x1_norm * page_width
        bottom = bottom_norm * page_height

        # Render the full page and keep it as in-memory PNG bytes, so the
        # resulting image does not depend on the document staying open
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        img_data = pix.tobytes("png")
    finally:
        # Release the file handle on every path, not only on success
        doc.close()

    img = Image.open(io.BytesIO(img_data))
    draw = ImageDraw.Draw(img)

    # The pixmap was rendered at `zoom`x, so page coordinates scale by the same factor
    rect_coords = [
        x0 * zoom,
        top * zoom,
        x1 * zoom,
        bottom * zoom
    ]

    # Draw red rectangle with 3-pixel width
    draw.rectangle(rect_coords, outline="red", width=3)

    return img


# Example usage: outline the line cited for the ending balance.
if __name__ == "__main__":
    # First citation for the ending balance, and the first box attached to it
    citation = enriched["ending_balance_citation"][0]
    bbox = citation["bboxes"][0]

    pdf_path = uri
    # Render the page the citation points at rather than assuming page 0;
    # citation pages are 0-indexed, as is display_page_with_bbox
    page_number = citation["page"]
    x0 = bbox["x0"]
    top = bbox["top"]
    x1 = bbox["x1"]
    bottom = bbox["bottom"]
    normalized_bbox = (x0, top, x1, bottom)  # (x0, top, x1, bottom) normalized

    try:
        # Get page with bbox annotation
        image = display_page_with_bbox(pdf_path, page_number, normalized_bbox)

        # Display the image
        image.show()

        # Optionally save the image
        # image.save("page_with_bbox.png")

        print(f"Successfully rendered page {page_number} with bbox annotation")
        print(f"Normalized bbox: {normalized_bbox}")
        print(f"Image size: {image.size}")

    except FileNotFoundError:
        print(f"Error: PDF file '{pdf_path}' not found")
    except Exception as e:
        # Best-effort demo cell: report the failure instead of raising
        print(f"Error: {e}")

Successfully rendered page 0 with bbox annotation
Normalized bbox: (0.058823529411764705, 0.6095707475757575, 0.5635455037254902, 0.6221969596969696)
Image size: (1224, 1584)
