In [1]:
import os
os.chdir(r'/Users/yashgourav/expense_manager')
print(f'current working directory set to : {os.getcwd()}')

current working directory set to : /Users/yashgourav/expense_manager


In [2]:
import os.path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError




In [3]:
SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly"]

# The ID and range of a sample spreadsheet.
SAMPLE_SPREADSHEET_ID = "1YLVz3i6R9sYXW3oWfUEtFTMkrXyY6ONB4FohhTf4B8w"
SAMPLE_RANGE_NAME = "taxonomy!A1:C10"

In [12]:
def main():
  """Shows basic usage of the Sheets API.
  Prints values from a sample spreadsheet.
  """
  creds = None
  if os.path.exists("token.json"):
    creds = Credentials.from_authorized_user_file("token.json", SCOPES)
  if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
      creds.refresh(Request())
    else:
      flow = InstalledAppFlow.from_client_secrets_file(
          "credentials.json", SCOPES
      )
      creds = flow.run_local_server(port=0)
    with open("token.json", "w") as token:
      token.write(creds.to_json())

  try:
    service = build("sheets", "v4", credentials=creds)

    # Call the Sheets API
    print('Calling sheets API')

    spreadsheet = service.spreadsheets().get(spreadsheetId=SAMPLE_SPREADSHEET_ID).execute()

    sheet_name = spreadsheet["properties"]["title"]

    print("Spreadsheet name:", sheet_name)
    sheet = service.spreadsheets()
    result = (
        sheet.values()
        .get(spreadsheetId=SAMPLE_SPREADSHEET_ID, range=SAMPLE_RANGE_NAME)
        .execute()
    )
    print(result)
    values = result.get("values", [])

    if not values:
      print("No data found.")
      return

    print("Name, Major:")
    for row in values:
      # Print columns A and E, which correspond to indices 0 and 4.
      print(f"{row[0]}, {row[1]}")
  except HttpError as err:
    print(err)


main()

Calling sheets API
Spreadsheet name: Taxonomy Updated
{'range': 'taxonomy!A1:C10', 'majorDimension': 'ROWS', 'values': [['Category', 'Sub Category-I', 'Sub Category II'], ['Food Items', 'Fruits and Vegetables', 'Fruits'], ['Food Items', 'Fruits and Vegetables', 'Fruits'], ['Food Items', 'Fruits and Vegetables', 'Fruits'], ['Food Items', 'Fruits and Vegetables', 'Vegetables'], ['Food Items', 'Fruits and Vegetables', 'Vegetables'], ['Food Items', 'Fruits and Vegetables', 'Vegetables'], ['Food Items', 'Fruits and Vegetables', 'Vegetables'], ['Food Items', 'Fruits and Vegetables', 'Vegetables'], ['Food Items', 'Fruits and Vegetables', 'Vegetables']]}
Name, Major:
Category, Sub Category-I
Food Items, Fruits and Vegetables
Food Items, Fruits and Vegetables
Food Items, Fruits and Vegetables
Food Items, Fruits and Vegetables
Food Items, Fruits and Vegetables
Food Items, Fruits and Vegetables
Food Items, Fruits and Vegetables
Food Items, Fruits and Vegetables
Food Items, Fruits and Vegetables


In [4]:
from src.integration.gsheet_handler import GSheetHandler

SPREADSHEET_ID = "1YLVz3i6R9sYXW3oWfUEtFTMkrXyY6ONB4FohhTf4B8w"
cred = 'credentials.json'
gsheet_handler = GSheetHandler(sheet_id=SPREADSHEET_ID, credential_file=cred)

2025-12-10 05:00:15,447 - src.integration.gsheet_handler - INFO - [60] - Successfully authenticated and loaded Google Sheet.


In [14]:
df = gsheet_handler.load_sheet_as_df(worksheet_name='taxonomy')
df = df.iloc[:, :3]
df.rename(columns={
    'Category':'category',
    'Sub Category-I':'sub_category_i',
    'Sub Category II':'sub_category_ii',
}, inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(inplace=True, drop=True)

2025-12-10 05:18:53,690 - src.integration.gsheet_handler - INFO - [83] - Loaded worksheet 'taxonomy' into DataFrame.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [15]:
df

Unnamed: 0,category,sub_category_i,sub_category_ii
0,Food Items,Fruits and Vegetables,Fruits
1,Food Items,Fruits and Vegetables,Vegetables
2,Food Items,"Grains, Flours & Pulses",Rice
3,Food Items,"Grains, Flours & Pulses",Flours
4,Food Items,"Grains, Flours & Pulses",Oats & Cereals
...,...,...,...
168,Utilities and Bills,Household Bills,Utilities
169,Utilities and Bills,Communication,Connectivity
170,Utilities and Bills,Subscriptions,Digital Services
171,Utilities and Bills,Housing,Accommodation


In [1]:
def html_tag(tag):
	
	def wrap_text(msg):
		print(f'<{tag}><{msg}></{tag}>')
		
	return wrap_text

print_h1 = html_tag('h1')
print_h1('Test Headline!')
print_h1('Another Headline')

print_p = html_tag('p')
print_p('Test paragraph')

<h1><Test Headline!></h1>
<h1><Another Headline></h1>
<p><Test paragraph></p>


Exercise 1: The API Rate Limiter (State Management)
Goal: Create a closure that prevents a function from being called too many times.

The Task: Write a function create_limiter(limit) that returns a closure. This closure should wrap an LLM call. If the number of calls exceeds the limit, it should return "Rate limit exceeded" instead of calling the model.

In [None]:

def create_limiter(limit):
    counter = 0
    
    def limiter(func, *args, **kwargs):
        nonlocal counter
        if counter < limit:
            counter += 1
            return func(*args, **kwargs)
        else:
            return 'Rate limit exceeded'
        
    return limiter

mock_gen = lambda p: f"Response for {p}"
my_limiter = create_limiter(2)

print(my_limiter(mock_gen, "Hi")) # Response for Hi
print(my_limiter(mock_gen, "Hi")) # Response for Hi
print(my_limiter(mock_gen, "Hi")) # Rate limit exceeded

Exercise 2: The Prompt Template Factory (Configuration)
Goal: Use closures to "pre-fill" function arguments.

The Task: In LLM work, we often have "System Prompts" (e.g., "You are a helpful lawyer"). Create a closure make_expert(role) that returns a function. This returned function should take a user_input string and return a combined prompt: [{role}]: {user_input}.

In [16]:
def make_expert(role):
    def expert(prompt):
        return f'[{role}]: {prompt}'
    return expert

# Expected Usage:
lawyer_bot = make_expert("Legal Advisor")
coder_bot = make_expert("Senior Python Developer")

print(lawyer_bot("Is this contract valid?")) 
print(coder_bot("How to implement a binary search in Python?"))

[Legal Advisor]: Is this contract valid?
[Senior Python Developer]: How to implement a binary search in Python?


Exercise 3: The Response Timer (Performance Tracking)
Goal: Create a "Decorator-style" closure that measures execution time.

The Task: Write a closure time_tracker(func) that captures how long an LLM takes to respond. It should print the time taken and then return the actual response from the function.

In [17]:
import time

def time_tracker(func):
    def wrapper(*args, **kwargs):
        # 1. Start timer
        start_time = time.time()
        print(f">>> Starting execution of {func.__name__}...")

        # 2. Call the actual function
        result = func(*args, **kwargs)

        # 3. End timer
        end_time = time.time()
        
        # 4. Calculate and print duration
        duration = end_time - start_time
        print(f">>> Function '{func.__name__}' finished.")
        print(f">>> Time Taken: {duration:.4f} seconds")
        
        return result
    return wrapper

@time_tracker
def call_llm(prompt):
    print(f"Querying LLM with prompt: '{prompt}'")
    time.sleep(1.5)  # Simulate the delay of a real API call
    return "This is the LLM response."

# --- HOW TO CHECK IT ---
# Just call the function as you normally would!
response = call_llm("Explain Quantum Physics")

print(f"\nFinal Result returned to main script: {response}")

>>> Starting execution of call_llm...
Querying LLM with prompt: 'Explain Quantum Physics'
>>> Function 'call_llm' finished.
>>> Time Taken: 1.5058 seconds

Final Result returned to main script: This is the LLM response.


In [18]:
from abc import ABC, abstractmethod

class Model(ABC):

    @abstractmethod
    def fit(self, X, y):
        pass

    @abstractmethod
    def predict(self, X):
        pass

In [21]:
class BadModel(Model):
    def fit(self, X, y):
        pass

    def predict(self, X):
        pass

In [22]:
BadModel()

<__main__.BadModel at 0x1050699c0>

In [40]:
from openai import OpenAI

client = OpenAI()

response = client.responses.create(
  model="gpt-4.1",
  input="Tell me a three sentence bedtime story about a unicorn."
)

print(response)


Response(id='resp_0e4bee289b3bfdbe006950001a5b908196b8194bcef87c9e17', created_at=1766850586.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-4.1-2025-04-14', object='response', output=[ResponseOutputMessage(id='msg_0e4bee289b3bfdbe006950001c2f30819681f66070ff555bb6', content=[ResponseOutputText(annotations=[], text='Once upon a time, a sparkling unicorn named Luna soared over a sleepy forest each night, leaving trails of twinkling stardust behind her. One evening, she discovered a lost baby owl and guided it home with her shimmering light. The animals in the forest slept peacefully, knowing Luna was watching over them with magic and love.', type='output_text', logprobs=[])], role='assistant', status='completed', type='message')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[], top_p=1.0, background=False, conversation=None, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, prompt_cache_key=No

In [43]:
response.__dict__
print(response.output_text)

Once upon a time, a sparkling unicorn named Luna soared over a sleepy forest each night, leaving trails of twinkling stardust behind her. One evening, she discovered a lost baby owl and guided it home with her shimmering light. The animals in the forest slept peacefully, knowing Luna was watching over them with magic and love.


In [38]:
from openai import OpenAI

client = OpenAI()

response = client.responses.create(
    model="gpt-4o-mini",
    input=[
        {"role": "user", "content": "knock knock."},
        {"role": "assistant", "content": "Who's there?"},
        {"role": "user", "content": "Orange."},
    ],
)

print(response.output_text)

Orange who?


In [44]:
import openai
conversation = openai.conversations.create()


In [46]:
conversation.id

'conv_695000a6a8208194aa3b96b32b80aa640c771fe8b7eb4429'

In [47]:

response = openai.responses.create(
  model="gpt-4.1",
  input=[{"role": "user", "content": "What are the 5 Ds of dodgeball?"}],
  conversation=conversation.id
)


In [48]:
response.output_text

'The "5 Ds of Dodgeball" are a fun concept popularized by the comedy film **"Dodgeball: A True Underdog Story"** (2004). According to the character Patches O\'Houlihan, the five Ds are:\n\n1. **Dodge**\n2. **Duck**\n3. **Dip**\n4. **Dive**\n5. **Dodge** (yes, "dodge" is included twice for comedic effect)\n\nThese "5 Ds" emphasize the importance of agility and quick movements to avoid being hit by a dodgeball in the game.'

In [48]:
from pathlib import Path

img_path = 'artifacts/images/17000186820251224442777.png'

file_path = Path(img_path)

# Check if it exists (returns True for files OR folders)
if file_path.exists():
    # specifically check if it's a file (not a folder)
    if file_path.is_file():
        print("File exists!")
else:
    print("File does not exist.")

File exists!


In [17]:
import pytesseract
py_result = pytesseract.image_to_string('artifacts/images/17000186820251224442777.png')
py_result

"L$DL\n\nSwords - IE9513674T\n\nEUR\nTomatoes 6 pack 3.87 A\n3 x 1.29\nLidl Plus Offers -1.80\nLemons 0.99 A\nLidl Plus Offers -0.50\n\nMixed Peppers 2.09 A\nPink Lady Kids Apples 3.39 A\nFairtrade Org. Yellow Bananas 2.35 A\nRed Onions 0.99 A\nNatural Yogurt 5.67 A\n3 x 1.89\nIrish Protein Milk 1.55 A\nLight Milk 2L 2.35 A\nHighProtein Pudding Choco 200g 7.92 A\n8 x 0.99\nSwiss Style Muesli no Sugar 2.79 A\nSparkling Water Lemon&Lime 0.59 C\n0.25 Deposit 0.25 A\nSalted Peanuts 1.18 C\n2 x 0.59\nTOTAL 33.68\nDebit Payment 33.68\nDeposits paid\n1 x 0.25 0.25\nTotal Deposits paid 0.25\nTOTAL SAVINGS 2.30\nA 0.0% VAT 31.91 0.00\nC 23.0% VAT 1.77 0.33\nTRN-ID: 1E018608442777358251\n***CUSTOMER COPY***\nDate: 24/12/25 Time: 11:14:34\nMID: ***71253 TID: ****8875\nTRNS NO: IE018608442777358251\nVisa Debit RRKKKKKKKKKKAS 21\nA0000000031010\nContactless SALE\n€33.68\nVerified by Cardholder Device\nAPPROVED AUTH CODE 443129\nPLEASE RETAIN RECEIPT\nPLEASE DEBIT ACCOUNT WITH TOTAL SHOWN\n' Total S

In [5]:
def rapid_ocr_to_text(image_path: str) -> str:
    from rapidocr_onnxruntime import RapidOCR
    import re

    engine = RapidOCR()
    result, _ = engine(image_path)

    if result is None:
        return ""

    lines = [entry[1] for entry in result if isinstance(entry, (list, tuple))]
    text = "\n".join(lines)

    # Clean for LLM
    text = re.sub(r"\[\d\.\d{2}\]\s*", "", text).strip()

    return text

text = rapid_ocr_to_text("artifacts/images/17000186820251224442777.png")

In [6]:
text

'Swords-IE9513674T\nEUR\nTomatoes 6 pack\n3.87 A\n3x 1.29\nLidl Plus Offers\n-1.80\nLemons\n0.99A\nLidl Plus offers\n-0.50\nMixedPeppers\n2.09 A\nPink Lady Kids Apples\n3.39A\nFairtrade Org.Yellow Bananas\n2.35 A\nRed Onions\n0.99A\nNatural Yogurt\n5.67 A\n3 x 1.89\nIrish Protein Milk\n1.55 A\nLight Milk 2L\n2.35 A\nHighProtein Pudding Choco 200g\n7.92 A\n8x0.99\nSwiss Style Muesli no Sugar\n2.79 A\nSparkling Water Lemon&Lime\n0.59C\n0.25 Deposit\n0.25 A\nSalted Peanuts\n1.18 C\n2x 0.59\nTOTAL\n33.68\nDebit Payment\n33.68\nDeposits paid\n1x0.25\n0.25\nTotal Deposits paid\n0.25\nTOTALSAVINGS\n2.30\nA\n0.0%VAT\n31.91\n0.00\nC23.0%VAT\n1.77\n0.33\nTRN-ID:IE018608442777358251\n***CUSTOMER COPY***\nDate:24/12/25\nTime: 11:14:34\nMID:***71253\nTID:****8875\nTRNS NO:\nIE018608442777358251\nVisa Debit\n************4521\nA0000000031010\nContactless\nSALE\n33.68\nVerifiedbyCardholderDevice\nAPPROVED\nAUTHC0DE 443129\nPLEASE RETAIN RECEIPT\nPLEASE DEBIT ACCOUNT WITH TOTAL SHOWN\nTotal Savings wit

In [89]:
from typing import List, Optional
from datetime import date, time
from pydantic import BaseModel, Field, ConfigDict


class Price(BaseModel):
    amount: float = Field(..., description="Price amount before discount")
    discount: str = Field("EUR", description="discount amount")

class ParsedItem(BaseModel):
    item: str = Field(..., description="Item name")
    item_type: str = Field(..., description="generic item type, e.g., apple, milk, etc.")
    item_count: int = Field(..., description="Number of items purchased") 
    price: Price

class ParserResponse(BaseModel):
    date: Optional[date]
    time: Optional[time]
    shop: Optional[str]

    parsed_items: List[ParsedItem]
    model_config = ConfigDict(extra="forbid")


In [99]:
from openai import OpenAI

client = OpenAI()
prompt = f"""
Parse the following receipt text into a structured format. Take care of item type field as there are different names for Apples like Pink Lady Kids Apples,
or Fairtrade Org. Yellow Bananas for Banana. Receipt OCR Extracted text: {text}
"""
response = client.responses.parse(
  model="gpt-4.1",
  input=[
    {"role": "user", "content": prompt}
    ],
  text_format=ParserResponse
)

In [93]:
response.__dict__

{'id': 'resp_0875d78fa02617c90069513a1aa5c08193aca47fe16d8a7905',
 'created_at': 1766930970.0,
 'error': None,
 'incomplete_details': None,
 'instructions': None,
 'metadata': {},
 'model': 'gpt-4.1-2025-04-14',
 'object': 'response',
 'output': [ParsedResponseOutputMessage[ParserResponse](id='msg_0875d78fa02617c90069513a1b0cdc81938659ff552a92f975', content=[ParsedResponseOutputText[ParserResponse](annotations=[], text='{\n  "date": "2025-12-24",\n  "time": "11:14:34Z",\n  "shop": "Lidl Swords-Dublin Road, K67 E6A2 Dublin",\n  "parsed_items": [\n    {\n      "item": "Tomatoes 6 pack",\n      "item_type": "tomato",\n      "item_count": 3,\n      "price": {\n        "amount": 3.87,\n        "discount": "-1.80 Lidl Plus Offers"\n      }\n    },\n    {\n      "item": "Lemons",\n      "item_type": "lemon",\n      "item_count": 1,\n      "price": {\n        "amount": 0.99,\n        "discount": "-0.50 Lidl Plus Offers"\n      }\n    },\n    {\n      "item": "Mixed Peppers",\n      "item_type"

In [98]:
response.__dict__

{'id': 'resp_0cbe39eef05204d7006952770bff4c81969c308e32e1acc0e7',
 'created_at': 1767012108.0,
 'error': None,
 'incomplete_details': None,
 'instructions': None,
 'metadata': {},
 'model': 'gpt-4.1-2025-04-14',
 'object': 'response',
 'output': [ParsedResponseOutputMessage[NoneType](id='msg_0cbe39eef05204d7006952770db65081969be3e1961418e668', content=[ParsedResponseOutputText[NoneType](annotations=[], text='Certainly! Here is the structured version of your receipt, with an "item type" field included. I have done my best to group similar items, normalizing fruit names like "Pink Lady Kids Apples" as "Apple" and "Fairtrade Org. Yellow Bananas" as "Banana".\n\n---\n\n### Receipt Structured Data\n\n**Store**: Lidl, Swords-DublinRoad  \n**Address**: Dublin Road, Miltonsfields, K67 E6A2, Dublin  \n**Date**: 2024-12-25  \n**Time**: 11:14:34  \n**Total**: €33.68  \n**Total Savings**: €2.30  \n**Payment Type**: Debit Card (Visa Debit)  \n**Deposits Paid**: €0.25  \n\n#### Items\n\n| Item Name 

In [96]:
response.usage.total_tokens

1640

In [100]:
response.output_text

'{\n  "date": "2025-12-24",\n  "time": "11:14:34+00:00",\n  "shop": "Lidl, Swords-DublinRoad, Dublin Road Miltonsfields, K67 E6A2, Dublin",\n  "parsed_items": [\n    {\n      "item": "Tomatoes 6 pack",\n      "item_type": "tomato",\n      "item_count": 3,\n      "price": {\n        "amount": 3.87,\n        "discount": "1.80 (Lidl Plus Offers)"\n      }\n    },\n    {\n      "item": "Lemons",\n      "item_type": "lemon",\n      "item_count": 1,\n      "price": {\n        "amount": 0.99,\n        "discount": "0.50 (Lidl Plus Offers)"\n      }\n    },\n    {\n      "item": "MixedPeppers",\n      "item_type": "pepper",\n      "item_count": 1,\n      "price": {\n        "amount": 2.09,\n        "discount": "0"\n      }\n    },\n    {\n      "item": "Pink Lady Kids Apples",\n      "item_type": "apple",\n      "item_count": 1,\n      "price": {\n        "amount": 3.39,\n        "discount": "0"\n      }\n    },\n    {\n      "item": "Fairtrade Org.Yellow Bananas",\n      "item_type": "banana",

In [101]:
import json
parsed_data = json.loads(response.output_text)

parsed_data

{'date': '2025-12-24',
 'time': '11:14:34+00:00',
 'shop': 'Lidl, Swords-DublinRoad, Dublin Road Miltonsfields, K67 E6A2, Dublin',
 'parsed_items': [{'item': 'Tomatoes 6 pack',
   'item_type': 'tomato',
   'item_count': 3,
   'price': {'amount': 3.87, 'discount': '1.80 (Lidl Plus Offers)'}},
  {'item': 'Lemons',
   'item_type': 'lemon',
   'item_count': 1,
   'price': {'amount': 0.99, 'discount': '0.50 (Lidl Plus Offers)'}},
  {'item': 'MixedPeppers',
   'item_type': 'pepper',
   'item_count': 1,
   'price': {'amount': 2.09, 'discount': '0'}},
  {'item': 'Pink Lady Kids Apples',
   'item_type': 'apple',
   'item_count': 1,
   'price': {'amount': 3.39, 'discount': '0'}},
  {'item': 'Fairtrade Org.Yellow Bananas',
   'item_type': 'banana',
   'item_count': 1,
   'price': {'amount': 2.35, 'discount': '0'}},
  {'item': 'Red Onions',
   'item_type': 'onion',
   'item_count': 1,
   'price': {'amount': 0.99, 'discount': '0'}},
  {'item': 'Natural Yogurt',
   'item_type': 'yogurt',
   'item_co

In [103]:
text

'Swords-IE9513674T\nEUR\nTomatoes 6 pack\n3.87 A\n3x 1.29\nLidl Plus Offers\n-1.80\nLemons\n0.99A\nLidl Plus offers\n-0.50\nMixedPeppers\n2.09 A\nPink Lady Kids Apples\n3.39A\nFairtrade Org.Yellow Bananas\n2.35 A\nRed Onions\n0.99A\nNatural Yogurt\n5.67 A\n3 x 1.89\nIrish Protein Milk\n1.55 A\nLight Milk 2L\n2.35 A\nHighProtein Pudding Choco 200g\n7.92 A\n8x0.99\nSwiss Style Muesli no Sugar\n2.79 A\nSparkling Water Lemon&Lime\n0.59C\n0.25 Deposit\n0.25 A\nSalted Peanuts\n1.18 C\n2x 0.59\nTOTAL\n33.68\nDebit Payment\n33.68\nDeposits paid\n1x0.25\n0.25\nTotal Deposits paid\n0.25\nTOTALSAVINGS\n2.30\nA\n0.0%VAT\n31.91\n0.00\nC23.0%VAT\n1.77\n0.33\nTRN-ID:IE018608442777358251\n***CUSTOMER COPY***\nDate:24/12/25\nTime: 11:14:34\nMID:***71253\nTID:****8875\nTRNS NO:\nIE018608442777358251\nVisa Debit\n************4521\nA0000000031010\nContactless\nSALE\n33.68\nVerifiedbyCardholderDevice\nAPPROVED\nAUTHC0DE 443129\nPLEASE RETAIN RECEIPT\nPLEASE DEBIT ACCOUNT WITH TOTAL SHOWN\nTotal Savings wit

In [2]:
from src.llm.openai_client import OpenAIClient
from src.agents.parser import parse_receipt

llm = OpenAIClient(model_name="gpt-5.1")


2025-12-30 09:01:01,527 - src.llm.openai_client - INFO - [32] - Initializing OpenAIClient for model: gpt-5.1


In [3]:
import json
def rapid_ocr_to_text(image_path: str) -> str:
    from rapidocr_onnxruntime import RapidOCR
    import re

    engine = RapidOCR()
    result, _ = engine(image_path)

    if result is None:
        return ""

    lines = [entry[1] for entry in result if isinstance(entry, (list, tuple))]
    text = "\n".join(lines)

    # Clean for LLM
    text = re.sub(r"\[\d\.\d{2}\]\s*", "", text).strip()

    return text

text = rapid_ocr_to_text("artifacts/images/17000186820251224442777.png")

In [4]:
result = parse_receipt(
    text=text,
    llm_client=llm
)

json_dict = result.model_dump()
print(json.dumps(json_dict, indent=2))

2025-12-29 15:34:29,066 - src.agents.parser - INFO - [52] - Initiating receipt parsing logic.
2025-12-29 15:34:29,071 - src.llm.base - INFO - [75] - LLM Dispatch: Provider=OpenAIClient, Model=gpt-4.1
2025-12-29 15:34:29,072 - src.llm.openai_client - INFO - [58] - Starting generation request for model 'gpt-4.1'
2025-12-29 15:34:29,073 - src.llm.openai_client - INFO - [76] - Applying response model: ModelMetaclass
2025-12-29 15:34:36,658 - src.llm.openai_client - INFO - [110] - Successfully received response. Tokens used: 1640
2025-12-29 15:34:36,660 - src.llm.base - INFO - [99] - Structured output parsing successful.
2025-12-29 15:34:36,660 - src.llm.base - INFO - [109] - Generation complete. Latency: 7589.10ms. Total Tokens: 1640
2025-12-29 15:34:36,660 - src.agents.parser - INFO - [70] - Successfully parsed receipt into ParserResponse object.


TypeError: Object of type date is not JSON serializable

In [5]:
json_dict

{'date': datetime.date(2412, 12, 25),
 'time': datetime.time(11, 14, 34, tzinfo=TzInfo(0)),
 'shop': 'Lidl',
 'parsed_items': [{'item': 'Tomatoes 6 pack',
   'item_type': 'Tomato',
   'item_count': 3,
   'price': {'amount': 3.87, 'discount': 1.8}},
  {'item': 'Lemons',
   'item_type': 'Lemon',
   'item_count': 1,
   'price': {'amount': 0.99, 'discount': 0.5}},
  {'item': 'MixedPeppers',
   'item_type': 'Pepper',
   'item_count': 1,
   'price': {'amount': 2.09, 'discount': 0.0}},
  {'item': 'Pink Lady Kids Apples',
   'item_type': 'Apple',
   'item_count': 1,
   'price': {'amount': 3.39, 'discount': 0.0}},
  {'item': 'Fairtrade Org.Yellow Bananas',
   'item_type': 'Banana',
   'item_count': 1,
   'price': {'amount': 2.35, 'discount': 0.0}},
  {'item': 'Red Onions',
   'item_type': 'Onion',
   'item_count': 1,
   'price': {'amount': 0.99, 'discount': 0.0}},
  {'item': 'Natural Yogurt',
   'item_type': 'Yogurt',
   'item_count': 3,
   'price': {'amount': 5.67, 'discount': 0.0}},
  {'item'

In [10]:
import pytesseract
py_result = pytesseract.image_to_string('artifacts/images/17000186820251224442777.png')
py_result = pytesseract.image_to_string('artifacts/images/dunnes.jpg')

py_result

In [5]:
import json
def rapid_ocr_to_text(image_path: str) -> list:
    from rapidocr_onnxruntime import RapidOCR
    import re

    engine = RapidOCR()
    result, _ = engine(image_path)

    if result is None:
        return ""

    lines = [entry[1] for entry in result if isinstance(entry, (list, tuple))]
    # text = "\n".join(lines)

    # Clean for LLM
    # text = re.sub(r"\[\d\.\d{2}\]\s*", "", text).strip()

    return lines

text = rapid_ocr_to_text("artifacts/images/dunnes.jpeg")

In [6]:
str(text)

"['SUPERMARKET', '5ONDMS', 'Swords Shopping Centre', 'Phone:018402884', 'www.jcsupermarket.ie', 'SLICED JALAPENOS', '2.65', '*SAVER DEAL!', '-0.66', 'WHOLEWHEAT PENNE', '1.05', 'DS CHICK BAO BUN', '3.49', 'CHICKEN PIECES', '2.79', 'DS SPAGHETTI', '0.75', 'OEPGUACAMOLE', '3.49', '*SAVERDEAL！', '-0.87', 'WWHEAT NOODLE', '1.39', '*SAVER DEAL！', '-0.70', 'G/F ROLLS', '4.59', 'STIR FRY', '1.15', 'DUNNES BOLOGNESE', '0.80', 'LINDT', '9.00', '*SAVER DEAL！', '2.00', 'VALUEclubVOUCHER', '3.00', '2223793769031', 'VALUEclubVOUCHER', '3.00', '2223793770037', 'BAL', '20.92', '2488060679251228180709', 'VCCard No', '6017890100236375290', 'SALE VOUCHER', 'SWORDS SHOPPING CENTRE', 'OPERATOR 2806', 'TERMINAL806', 'GOODS', '20.92', '18:07:46', '28/12/25', 'MID:***47205', 'TID:****6799', '251228180708065716', 'A0000000041010', 'DEBIT MASTERCARD', '****x**x**xx379700', 'Contactless', 'Sale', 'EUR20.92', 'Verification Not Required', 'APPROVED', 'AUTHC0DE 766907', 'PLEASE RETAIN RECEIPT', 'Your account will 

In [9]:
result_llm = parse_receipt(
    text=str(text),
    llm_client=llm
)

json_dict = result_llm.model_dump()
json_dict

2025-12-30 09:02:42,236 - src.agents.parser - INFO - [53] - Initiating receipt parsing logic.
2025-12-30 09:02:42,242 - src.llm.base - INFO - [75] - LLM Dispatch: Provider=OpenAIClient, Model=gpt-5.1
2025-12-30 09:02:42,243 - src.llm.openai_client - INFO - [58] - Starting generation request for model 'gpt-5.1'
2025-12-30 09:02:42,243 - src.llm.openai_client - INFO - [76] - Applying response model: ModelMetaclass
2025-12-30 09:02:47,488 - src.llm.openai_client - INFO - [110] - Successfully received response. Tokens used: 1448
2025-12-30 09:02:47,490 - src.llm.base - INFO - [99] - Structured output parsing successful.
2025-12-30 09:02:47,491 - src.llm.base - INFO - [109] - Generation complete. Latency: 5247.94ms. Total Tokens: 1448
2025-12-30 09:02:47,492 - src.agents.parser - INFO - [71] - Successfully parsed receipt into ParserResponse object.


{'date': datetime.date(2805, 12, 25),
 'time': datetime.time(18, 7, 46, tzinfo=TzInfo(-74880)),
 'shop': 'SUPERMARKET',
 'parsed_items': [{'item': 'SLICED JALAPENOS',
   'item_type': 'Jalapeno',
   'item_count': 1,
   'price': {'amount': 2.65, 'discount': 0.66}},
  {'item': 'WHOLEWHEAT PENNE',
   'item_type': 'Pasta',
   'item_count': 1,
   'price': {'amount': 1.05, 'discount': 0.0}},
  {'item': 'DS CHICK BAO BUN',
   'item_type': 'Bao bun',
   'item_count': 1,
   'price': {'amount': 3.49, 'discount': 0.0}},
  {'item': 'CHICKEN PIECES',
   'item_type': 'Chicken',
   'item_count': 1,
   'price': {'amount': 2.79, 'discount': 0.0}},
  {'item': 'DS SPAGHETTI',
   'item_type': 'Pasta',
   'item_count': 1,
   'price': {'amount': 0.75, 'discount': 0.0}},
  {'item': 'OEP GUACAMOLE',
   'item_type': 'Guacamole',
   'item_count': 1,
   'price': {'amount': 3.49, 'discount': 0.87}},
  {'item': 'WWHEAT NOODLE',
   'item_type': 'Noodles',
   'item_count': 1,
   'price': {'amount': 1.39, 'discount': 