In [1]:
%pip install pandas openpyxl

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from ocr_main import main as ocr_main
import os
from openai import OpenAI
from dotenv import dotenv_values
from azure.storage.blob import BlobServiceClient
import pandas as pd

In [2]:
config = dotenv_values(".env")
open_ai_api_key=config["open_ai_api_key"]
client = OpenAI(api_key=open_ai_api_key)
connect_str=config["connection_str"]

connection_string = config["connection_string"]
container_name = config["container_name"]
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
container_client = blob_service_client.get_container_client(container_name)


In [3]:
def get_blob_sas_urls(container_client, blob_names):
    urls = []
    for blob_name in blob_names:
        if not (blob_name.endswith('.jpg') or blob_name.endswith('.png')):
            continue  # Skip files ending with .labels.json

        blob_client = container_client.get_blob_client(blob_name)

        from datetime import datetime, timedelta
        from azure.storage.blob import generate_blob_sas, BlobSasPermissions

        sas_token = generate_blob_sas(
            account_name=blob_service_client.account_name,
            container_name=container_name,
            blob_name=blob_name,
            account_key=blob_service_client.credential.account_key,
            permission=BlobSasPermissions(read=True),
            expiry=datetime.utcnow() + timedelta(hours=1)
        )

        sas_url = f"{blob_client.url}?{sas_token}"
        urls.append(sas_url)

    return urls


In [4]:
blob_names = [blob.name for blob in container_client.list_blobs()]

In [5]:
print(blob_names)

['17386590297053997295044438274399 - Victor Lee.jpg', '17386590297053997295044438274399 - Victor Lee.jpg.labels.json', '17386590297053997295044438274399 - Victor Lee.jpg.ocr.json', '17386590966462230082736051626241 - Victor Lee.jpg', '17386590966462230082736051626241 - Victor Lee.jpg.labels.json', '17386590966462230082736051626241 - Victor Lee.jpg.ocr.json', '17387186556135444632046881269223 - Victor Lee.jpg', '17387186556135444632046881269223 - Victor Lee.jpg.labels.json', '17387186556135444632046881269223 - Victor Lee.jpg.ocr.json', '17387375666977370569976893425821 - Victor Lee.jpg', '17387375666977370569976893425821 - Victor Lee.jpg.labels.json', '17387375666977370569976893425821 - Victor Lee.jpg.ocr.json', '1738828911937654016708644032092 - Victor Lee.jpg', '1738828911937654016708644032092 - Victor Lee.jpg.labels.json', '1738828911937654016708644032092 - Victor Lee.jpg.ocr.json', '17388290984957689282649041447964 - Victor Lee.jpg', '17388290984957689282649041447964 - Victor Lee.jp

In [6]:
urls = get_blob_sas_urls(container_client, blob_names)

  expiry=datetime.utcnow() + timedelta(hours=1)


In [7]:
len(urls)

329

In [8]:
urls[0]

'https://tcctocrpoc.blob.core.windows.net/tcct-ocr-poc/17386590297053997295044438274399%20-%20Victor%20Lee.jpg?se=2025-05-06T08%3A20%3A13Z&sp=r&sv=2025-05-05&sr=b&sig=QZVwWbPBCAR6OfwsWicj9Ph3soLNAa8oKUEJOvmdhks%3D'

In [15]:
all_results = []

for url in urls:
    result = ocr_main(url)
    print(url)
    print(result)
    all_results.append(result)

# Normalize nested items


https://tcctocrpoc.blob.core.windows.net/tcct-ocr-poc/17386590297053997295044438274399%20-%20Victor%20Lee.jpg?se=2025-04-25T03%3A47%3A00Z&sp=r&sv=2025-01-05&sr=b&sig=53L7q/VuabEP%2BH6xgoTpaBfEPfPHArpdd8nvRtAOiC4%3D
{'merchant_name': 'Brewing Happiness Co.,Ltd.', 'transaction_date': '2025-02-04', 'transaction_time': '14:04:32', 'items': [{'description': 'Iced Americano - Floral', 'quantity': 1, 'total_price': 55.0}], 'total': 55.0, 'tax_id': '0105560101035', 'receipt_no': '10344001022500156', 'address': None, 'unit_no': None, 'mall_name': 'One Bangkok', 'invalid_fields': [], 'hashed_receipt': '54241a863f9bdf7760556e4b68c49659bfacf4c7715513125451c45d6297b59d'}
https://tcctocrpoc.blob.core.windows.net/tcct-ocr-poc/17386590966462230082736051626241%20-%20Victor%20Lee.jpg?se=2025-04-25T03%3A47%3A00Z&sp=r&sv=2025-01-05&sr=b&sig=dO70H23sjzXXbkcyiMKgINVPh0xtDFeqkMZygjk1rjM%3D
{'merchant_name': 'Tonkatsu Wako One Bangkok', 'transaction_date': '2025-04-02', 'transaction_time': '12:16:00', 'items'

In [20]:
filtered_data = [
    (name)
    for name in blob_names
    if name.lower().endswith(('.jpg', '.png'))
]

filtered_blob_names = filtered_data

df = pd.DataFrame(all_results)
df.index = filtered_blob_names

output_file = 'receipt_results6.xlsx'
df.to_excel(output_file, index=True)
print(f"Saved to {output_file}")

Saved to receipt_results6.xlsx
