In [76]:
from dotenv import dotenv_values
import logging
from src.core.ocr_intel import create_doc_intel_client, extract_receipts

# Module-level logger for this notebook (no cell visible here logs through it yet).
logger = logging.getLogger(__name__)

# Load settings from the local .env file so the service credentials are not
# written into the notebook itself.
config = dotenv_values(".env")

# Endpoint and key handed to create_doc_intel_client below.
# NOTE(review): presumably a missing entry raises KeyError right here — confirm
# that .env defines both names before running the notebook.
endpoint = config["ENDPOINT_S0_DOC_INT"]
key = config["KEY_S0_DOC_INT"]

# Single shared client, reused by every extraction() call in later cells.
doc_intel_client = create_doc_intel_client(endpoint, key)

def extraction(source):
    """
    Runs receipt extraction on one document using the shared client.

    Parameters:
    - source: Value forwarded to extract_receipts as `bytes_source`
      (the cells in this notebook pass the raw bytes of an image file).

    Returns:
    - tuple: The two values produced by extract_receipts, in the same order
      (extracted receipts first, then the content).
    """
    receipts, raw_content = extract_receipts(
        document_intelligence_client=doc_intel_client,
        bytes_source=source,
    )
    return (receipts, raw_content)


In [77]:
import os

def images_in_directory_to_bytes(directory_path, extensions={'.jpg', '.jpeg', '.png', '.bmp', '.gif'}):
    """
    Reads all image files in the specified directory and returns a dictionary
    mapping filenames to their byte content.

    Only regular files directly inside the directory are considered
    (sub-directories are skipped, even if their name ends in an image
    extension). Files are visited in sorted filename order so the resulting
    dictionary, and anything built from it, has a reproducible order.

    Parameters:
    - directory_path (str): Path to the directory containing images.
    - extensions (iterable of str, or str): File extensions to include
      (default: common image formats). Matching is case-insensitive and the
      leading dot is optional, so '.PNG', 'png' and '.png' are equivalent.
      A single string is treated as one extension.

    Returns:
    - dict: {filename: byte content}. Empty if the directory does not exist
      (a message is printed) or contains no matching files. Files that cannot
      be read are reported with a printed message and left out.
    """
    if not os.path.isdir(directory_path):
        print(f"Invalid directory: {directory_path}")
        return {}

    # A bare string would otherwise be iterated character by character.
    if isinstance(extensions, str):
        extensions = (extensions,)

    # Normalise once, into a fresh set, so the caller's collection (and the
    # shared default argument) is never modified.
    wanted_extensions = set()
    for extension in extensions:
        extension = extension.lower()
        if extension and not extension.startswith('.'):
            extension = '.' + extension
        wanted_extensions.add(extension)

    image_bytes_dict = {}

    # os.listdir gives no ordering guarantee; sort for deterministic results.
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)

        if not os.path.isfile(file_path):
            continue
        if os.path.splitext(filename)[1].lower() not in wanted_extensions:
            continue

        try:
            with open(file_path, 'rb') as image_file:
                image_bytes_dict[filename] = image_file.read()
        except OSError as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error reading {filename}: {e}")

    return image_bytes_dict


In [78]:
import pandas as pd

def _flatten_receipt_items(receipt):
    """Returns a copy of `receipt` whose list-valued 'items' is one readable string."""
    if not isinstance(receipt, dict):
        return receipt
    items = receipt.get("items")
    if not isinstance(items, (list, tuple)):
        return receipt

    flattened = dict(receipt)  # shallow copy: the caller's dict stays untouched
    flattened["items"] = "; ".join(
        ", ".join(f"{field}: {value}" for field, value in item.items())
        if isinstance(item, dict)
        else str(item)
        for item in items
    )
    return flattened


def receipts_to_csv(receipts, output_file="receipts.xlsx"):
    """
    Writes a list of receipt dictionaries to a spreadsheet file.

    - Each top-level key becomes a column.
    - A list-valued 'items' entry is flattened into a readable string:
      "field: value, field: value" per item, with items separated by "; ".
    - The input dictionaries are not modified.

    Parameters:
    - receipts (list): List of dictionaries containing receipt data.
    - output_file (str): Path of the file to write. A path ending in ".csv"
      (case-insensitive) produces a UTF-8 encoded CSV; any other path is
      written as an Excel workbook. Defaults to "receipts.xlsx", the file
      this function wrote before the parameter existed.

    Returns:
    - None
    """
    # Only a plain list of records is rewritten; any other input pandas can
    # build a frame from is passed through unchanged.
    if isinstance(receipts, list):
        rows = [_flatten_receipt_items(receipt) for receipt in receipts]
    else:
        rows = receipts

    df = pd.DataFrame(rows)

    if str(output_file).lower().endswith(".csv"):
        df.to_csv(output_file, index=False, encoding="utf-8")
    else:
        df.to_excel(output_file, index=False)
    return


In [87]:
# Folder holding the receipt images to process, relative to the notebook.
dir_path = "Receipts-20250704T024310Z-1-001/Receipts/Forum receipt"

# {filename: raw bytes} for every image found in that folder.
images_bytes = images_in_directory_to_bytes(dir_path)

# Sanity check: list what was loaded and how large each file is.
for name in images_bytes:
    byte_data = images_bytes[name]
    print(f"{name}: {len(byte_data)} bytes")


Bigc Market .png: 198556 bytes
Canvas.png: 183779 bytes


In [88]:
# Run extraction on every loaded image and keep the first receipt found in
# each one, tagged with the file it came from.
results = []
for image, image_bytes in images_bytes.items():
    result = extraction(image_bytes)
    extracted_receipts = result[0]

    # An image with no recognisable receipt must not abort the whole batch.
    if not extracted_receipts:
        print(f"No receipt extracted from {image}; skipping")
        continue

    # Only the first receipt per image is kept, as before.
    first_receipt = extracted_receipts[0]
    first_receipt['file_name'] = image
    results.append(first_receipt)

In [85]:
# Inspect the first extracted receipt.
# NOTE(review): this cell's execution count (85) is lower than the cells that
# build images_bytes and results (87, 88), and the file_name shown in the saved
# output is not among the files listed for the current directory — the output
# looks stale. Re-run with Restart Kernel -> Run All before relying on it.
results[0]

{'merchant_name': 'PIB/KL Consulting (Thailand) Co.,Ltd.',
 'transaction_date': '2025-05-09',
 'transaction_time': '14:20',
 'items': [{'description': 'Red Wine Poached Pear',
   'quantity': 1,
   'total_price': 295.0}],
 'total': 295.0,
 'tax_id': '0105546032501',
 'receipt_no': 'APRIL/00003252',
 'address': 'อาคารพาเหรด โครงการ วันแบงค็อก ห้องเลข\nที่ OP1303 ชั้น3 เลขที่1877 ถนน พระรามที่ 4\nแขวงลุมพินี เขต ปทุมวัน กรุงเทพมหานคร\n10330',
 'unit_no': None,
 'mall_name': 'ONE BANGKOK',
 'hashed_receipt': '8babe024da8186a01d81cc8a75ea1e8e9c872018bf000b443e7c753fa47cd6fd',
 'file_name': 'A Keen.png'}

In [89]:
# Export the collected receipt dicts; despite its name, receipts_to_csv writes
# receipts.xlsx in the current working directory when called like this.
receipts_to_csv(results)