In [14]:
import boto3
import json
import re
import pandas as pd
from datetime import datetime, timezone
import ast
from io import StringIO
import yaml
from openai import OpenAI
from tabkeeper import *



In [3]:
# Load configuration from the YAML file.
# The encoding is pinned to UTF-8 (the usual encoding for YAML files) so that
# parsing does not depend on the platform's locale default.
with open("config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

In [4]:
# Initialize BonEngine with the config loaded from config.yaml.
bon_engine = BonEngine(config)

# Print the initialized object to verify construction succeeded.
# Only the default object repr is shown (see output), so this says nothing
# about the engine's internal state.
print(bon_engine)

<tabkeeper.BonEngine object at 0x7f9fcdb87a10>


In [5]:
# Import receipt names from S3.
# After this call the names are available as bon_engine.receipts.
bon_engine.o_import_receipt_names()

# Print the list of receipts to verify (the output shows entries of the
# form 'receipts/<file>.JPEG').
print("Receipts:", bon_engine.receipts)

Receipts: ['receipts/IMG_6536.JPEG', 'receipts/uid1234567_aldi002.JPEG', 'receipts/uid1234567_aldi003.JPEG', 'receipts/uid1234567_aldi004.JPEG', 'receipts/uid1234567_aldi005.JPEG', 'receipts/uid1234567_aldi006.JPEG', 'receipts/uid1234567_aldi007.JPEG', 'receipts/uid1234567_aldi008.JPEG', 'receipts/uid1234567_aldi009.JPEG', 'receipts/uid1234567_aldi010.JPEG', 'receipts/uid1234567_aldi011.JPEG', 'receipts/uid1234567_lidl001.JPEG', 'receipts/uid1234567_lidl002.JPEG', 'receipts/uid1234567_lidl003.JPEG', 'receipts/uid1234567_lidl004.JPEG', 'receipts/uid1234567_rewe001.JPEG', 'receipts/uid1234567_rewe002.JPEG', 'receipts/uid1234567_rewe003.JPEG', 'receipts/uid1234567_rewe004.JPEG', 'receipts/uid1234567_rewe005.JPEG', 'receipts/uid1234567_rewe006.JPEG', 'receipts/uid1234567_rewe007.JPEG', 'receipts/uid1234567_rewe008.JPEG', 'receipts/uid1234567_rewe009.JPEG', 'receipts/uid1234567_rewe010.JPEG', 'receipts/uid1234567_rewe011.JPEG', 'receipts/uid1234567_rewe012.JPEG', 'receipts/uid1234567_rewe01

In [6]:
# Check whether previously saved data already exists at bon_engine.s3_path.
#
# Only the read itself sits inside the `try`, and only a missing file counts as
# "no existing data". Any other failure (credentials, network, malformed CSV,
# an error inside o_sync) is left to propagate: reporting it as "no data" would
# silently reprocess every receipt and, with existing_df set to None, skip the
# later stacking step before the results are saved.
try:
    # Attempt to read the existing CSV file from S3
    bon_engine.existing_df = pd.read_csv(bon_engine.s3_path)
except FileNotFoundError:
    # No previous data: process all receipts
    print("No existing data found. Processing all receipts.")
    bon_engine.existing_df = None
else:
    print("Existing data found. Processing new receipts only.")

    # Sync the receipts to process only new ones
    bon_engine.o_sync()
    print("Sync complete.")

    # Informational only: nothing here stops the following cells from running
    # when there is nothing new to process.
    if not bon_engine.receipts:
        print("No new receipts to process.")

No existing data found. Processing all receipts.


In [7]:
# Convert the receipt images to text using AWS Textract.
# One "Processing receipt: <name>" line is logged per receipt (see output).
bon_engine.o_image_to_text()

# Optional check -- uncomment to dump the raw response:
# print("Raw Text Response:", bon_engine.receipts_raw_text)

Processing receipt: receipts/IMG_6536.JPEG
Processing receipt: receipts/uid1234567_aldi002.JPEG
Processing receipt: receipts/uid1234567_aldi003.JPEG
Processing receipt: receipts/uid1234567_aldi004.JPEG
Processing receipt: receipts/uid1234567_aldi005.JPEG
Processing receipt: receipts/uid1234567_aldi006.JPEG
Processing receipt: receipts/uid1234567_aldi007.JPEG
Processing receipt: receipts/uid1234567_aldi008.JPEG
Processing receipt: receipts/uid1234567_aldi009.JPEG
Processing receipt: receipts/uid1234567_aldi010.JPEG
Processing receipt: receipts/uid1234567_aldi011.JPEG
Processing receipt: receipts/uid1234567_lidl001.JPEG
Processing receipt: receipts/uid1234567_lidl002.JPEG
Processing receipt: receipts/uid1234567_lidl003.JPEG
Processing receipt: receipts/uid1234567_lidl004.JPEG
Processing receipt: receipts/uid1234567_rewe001.JPEG
Processing receipt: receipts/uid1234567_rewe002.JPEG
Processing receipt: receipts/uid1234567_rewe003.JPEG
Processing receipt: receipts/uid1234567_rewe004.JPEG
Pro

In [8]:
# Extract the text lines from the Textract response.
bon_engine.o_extract_text_from_response()

# Optional check -- uncomment to print the extracted text.
# NOTE(review): the saved output below still shows "Extracted Text: ...",
# i.e. it comes from a run where this print was enabled; re-run to refresh it.
# print("Extracted Text:", bon_engine.receipt_texts)

Extracted Text: [['r e w e', 'severinstrasse 93-95', '****', '****', '50678 köln', '****', '****', '0221-9320092', '****', 'uid nr.: de812706034', 'eur', 'pastrami', '1,99 b', 'oliven-tomatenk.', '1,29 b', 'ja! butterkaese', '2,99 b', 'blutorangensaft', '2,59 a', 'pfand 0,25 euro', '0,25 a *', 'hp pud.', '1,49 b', 'kind. bueno white', '2,10 b', 'twister mallow', '1,60 b', 'kinder bueno eis', '2.10 b', 'magnum doublehaz', '2,70 b', 'spaghetti', '1,39 b', 'knus. muesli hon', '2,59 b', 'salz mit selen', '1,29 b', '.nat. olivenoel', '8,99 b', 'pesto genovese', '1,99 b', 'summe', 'eur', '35,35', 'geg. mastercard', 'eur', '35,35', '** kundenbeleg *', '*', 'datum:', '15.03.2025', 'uhrzeit:', '19:45:23 uhr', 'beleg-nr.', '0183', 'trace-nr.', '311092', 'bezahlung', 'contactless', 'debit mastercard', 'nr.', '5748 0002', 'vu-nr.', '4556783885', 'terminal id', '56039995', 'pos-info', '00 075 00', 'as-zeit 15.03.', '19:45 uhr', 'as-proc-code = 00 075 00', 'capt. -ref. = 0000', 'approved', 'betrag e

In [9]:
# Derive store brands from the receipt text.
bon_engine.o_get_store_brands()

# Print the store brands to verify (the output shows one brand per receipt,
# e.g. 'rewe', 'aldi süd', 'lidl').
print("Store Brands:", bon_engine.store_names)

Store Brands: ['rewe', 'aldi süd', 'aldi süd', 'aldi süd', 'aldi süd', 'aldi süd', 'aldi süd', 'aldi süd', 'aldi süd', 'aldi süd', 'aldi süd', 'lidl', 'lidl', 'lidl', 'lidl', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe', 'rewe']


In [10]:
# Extract features (items, prices, dates, etc.)
# NOTE(review): "exctract" is misspelled, but this is the method name being
# called on BonEngine, so it must stay as-is here -- rename it in tabkeeper
# first, if at all.
bon_engine.o_exctract_features()

# Optional check -- uncomment to inspect the intermediate dict:
# print("df_dict:", bon_engine.df_dict)

In [11]:
# Create a DataFrame from the df_dict
bon_engine.o_create_df()

# Print the DataFrame to verify. The label is put on its own line (sep="\n")
# so the frame's header row is not shifted out of alignment with its columns.
print("DataFrame:", bon_engine.df, sep="\n")

DataFrame:                          receipt_code store_brand purchase_date purchase_hour  \
0              receipts/IMG_6536.JPEG        rewe    2025-03-15      19:45:23   
1              receipts/IMG_6536.JPEG        rewe    2025-03-15      19:45:23   
2              receipts/IMG_6536.JPEG        rewe    2025-03-15      19:45:23   
3              receipts/IMG_6536.JPEG        rewe    2025-03-15      19:45:23   
4              receipts/IMG_6536.JPEG        rewe    2025-03-15      19:45:23   
..                                ...         ...           ...           ...   
243  receipts/uid1234567_rewe018.JPEG        rewe    2023-10-14      19:41:00   
244  receipts/uid1234567_rewe018.JPEG        rewe    2023-10-14      19:41:00   
245  receipts/uid1234567_rewe018.JPEG        rewe    2023-10-14      19:41:00   
246  receipts/uid1234567_rewe018.JPEG        rewe    2023-10-14      19:41:00   
248  receipts/uid1234567_rewe018.JPEG        rewe    2023-10-14      19:41:00   

        purchase

In [15]:
# Prepare for GenAI processing.
# NOTE(review): the saved output lists the same (receipt, range) pair more
# than once, which suggests row_ranges accumulates when this cell is re-run --
# confirm, and prefer Restart Kernel -> Run All before trusting later results.
bon_engine.o_pre_genai_prep()

# Print the row ranges to verify
print("Row Ranges:", bon_engine.row_ranges)

Row Ranges: [('receipts/IMG_6536.JPEG', [0, 14]), ('receipts/IMG_6536.JPEG', [0, 14])]


In [18]:
# Get explicit item names from the AI.
# The call logs one (receipt, row range) tuple per processed receipt (see output).
bon_engine.o_get_explicit_items()

# Print the DataFrame with explicit item names to verify. The label is put on
# its own line (sep="\n") so the frame's header row stays aligned with its columns.
print("DataFrame with Explicit Items:", bon_engine.df, sep="\n")

('receipts/IMG_6536.JPEG', [0, 14])
('receipts/IMG_6536.JPEG', [0, 14])
('receipts/IMG_6536.JPEG', [0, 14])
('receipts/uid1234567_aldi002.JPEG', [14, 17])
('receipts/uid1234567_aldi003.JPEG', [17, 26])
('receipts/uid1234567_aldi004.JPEG', [26, 34])
('receipts/uid1234567_aldi005.JPEG', [34, 38])
('receipts/uid1234567_aldi006.JPEG', [38, 52])
('receipts/uid1234567_aldi007.JPEG', [52, 61])
('receipts/uid1234567_aldi008.JPEG', [61, 65])
('receipts/uid1234567_aldi009.JPEG', [65, 77])
('receipts/uid1234567_aldi010.JPEG', [77, 89])
('receipts/uid1234567_aldi011.JPEG', [89, 96])
('receipts/uid1234567_lidl001.JPEG', [96, 103])
('receipts/uid1234567_lidl002.JPEG', [103, 113])
('receipts/uid1234567_lidl003.JPEG', [113, 132])
('receipts/uid1234567_lidl004.JPEG', [132, 154])
('receipts/uid1234567_rewe001.JPEG', [154, 155])
('receipts/uid1234567_rewe002.JPEG', [155, 159])
('receipts/uid1234567_rewe003.JPEG', [159, 163])
('receipts/uid1234567_rewe004.JPEG', [163, 167])
('receipts/uid1234567_rewe005.J

In [19]:
# Only when data from an earlier run was loaded (existing_df is None otherwise).
# NOTE(review): assumes o_stack_dfs combines existing_df with the newly built
# bon_engine.df -- confirm against tabkeeper.
if bon_engine.existing_df is not None:
    bon_engine.o_stack_dfs()

In [20]:
# Get item categories from the AI.
# The result is exposed as bon_engine.item_categories.
bon_engine.o_get_item_categories()

# Print the item categories to verify (the output shows a flat list of
# category names such as 'Dairy Products').
print("Item Categories:", bon_engine.item_categories)

Item Categories: ['Dairy Products', 'Meat and Poultry', 'Pasta and Grains', 'Beverages', 'Snacks and Sweets', 'Fruits and Vegetables', 'Baking and Cooking Ingredients', 'Condiments and Sauces', 'Breakfast Items', 'Household Items']


In [21]:
# Match items to their categories
bon_engine.o_match_items_to_categories()

# Print the DataFrame with item categories to verify. The label is put on its
# own line (sep="\n") so the frame's header row stays aligned with its columns.
print("DataFrame with Item Categories:", bon_engine.df, sep="\n")

DataFrame with Item Categories:                          receipt_code store_brand purchase_date purchase_hour  \
0              receipts/IMG_6536.JPEG        rewe    2025-03-15      19:45:23   
1              receipts/IMG_6536.JPEG        rewe    2025-03-15      19:45:23   
2              receipts/IMG_6536.JPEG        rewe    2025-03-15      19:45:23   
3              receipts/IMG_6536.JPEG        rewe    2025-03-15      19:45:23   
4              receipts/IMG_6536.JPEG        rewe    2025-03-15      19:45:23   
..                                ...         ...           ...           ...   
244  receipts/uid1234567_rewe018.JPEG        rewe    2023-10-14      19:41:00   
245  receipts/uid1234567_rewe018.JPEG        rewe    2023-10-14      19:41:00   
246  receipts/uid1234567_rewe018.JPEG        rewe    2023-10-14      19:41:00   
247  receipts/uid1234567_rewe018.JPEG        rewe    2023-10-14      19:41:00   
248  receipts/uid1234567_rewe018.JPEG        rewe    2023-10-14      19:41:00

In [22]:
# Save the results to S3.
# The call appears to print its own upload confirmation (object key and
# bucket), as the output shows.
bon_engine.o_save_results()

# Print confirmation (reached only if the save call did not raise).
print("Results saved to S3.")

DataFrame uploaded to S3 as receipts_decoded/shopping_history.csv in bucket bon-yy.
Results saved to S3.
