In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
from dotenv import dotenv_values

from helper.azure_ai import az_content_understanding_analyze
from helper.general_function import combine, validate_receipt
from helper.ocr_intel import extract_receipts, create_doc_intel_client

In [6]:
def main(source, file_path):
    """Extract receipt data from one receipt, falling back to Azure AI if needed.

    The receipt is first sent to the Document Intelligence client. If the
    first extracted receipt has ``None`` for any of ``total``,
    ``transaction_date`` or ``transaction_time``, the file at ``file_path`` is
    additionally analysed with ``az_content_understanding_analyze``; both
    results are merged with ``combine`` and passed through
    ``validate_receipt``.

    Args:
        source: A ``str`` (passed to ``extract_receipts`` as ``urls``) or any
            other object (passed as ``bytes_source``).
        file_path: Path handed to the Azure AI fallback. Only used when the
            fallback is triggered.

    Returns:
        A tuple ``(extracted_receipts, ai_trigger)`` where ``ai_trigger`` is
        ``1`` if the Azure AI fallback ran and ``0`` otherwise.

    Raises:
        KeyError: If ``.env`` lacks ``endpoint_s0_doc_int`` or
            ``key_s0_doc_int``, or a required field is absent from the first
            receipt.
        IndexError: If the extractor returns no receipts.
    """
    ai_trigger = 0
    config = dotenv_values(".env")

    endpoint = config["endpoint_s0_doc_int"]
    key = config["key_s0_doc_int"]

    doc_intel_client = create_doc_intel_client(endpoint, key)
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are still routed through the URL path.
    if isinstance(source, str):
        extracted_receipts = extract_receipts(
            urls=source, document_intelligence_client=doc_intel_client
        )
    else:
        extracted_receipts = extract_receipts(
            bytes_source=source, document_intelligence_client=doc_intel_client
        )

    # Fail with a descriptive message instead of a bare "list index out of
    # range" when nothing was extracted (same exception type as before).
    if len(extracted_receipts) == 0:
        raise IndexError(f"No receipt was extracted (file_path={file_path!r})")

    required_fields = ("total", "transaction_date", "transaction_time")
    first_receipt = extracted_receipts[0]
    if any(first_receipt[field] is None for field in required_fields):
        ai_trigger += 1
        ai_extracted_receipts = az_content_understanding_analyze(file_path)
        extracted_receipts = combine(extracted_receipts, ai_extracted_receipts)
        extracted_receipts = validate_receipt(extracted_receipts)

    print(ai_trigger)
    return extracted_receipts, ai_trigger


In [7]:
# Run the extraction pipeline over every receipt image in INPUT_PATH and
# collect per-file runtime, AI-fallback counts and invalid fields.
INPUT_PATH = 'data/inputs/receipt_images'

total_ai_trigger = 0
total_runtime = []
file_runtime_dct = {}
file_missing_dct = {}
# os.listdir returns entries in arbitrary order; sorting keeps the run (and
# the row order of the CSVs built from these dicts) reproducible.
for file_name in sorted(os.listdir(INPUT_PATH)):
    file_path = os.path.join(INPUT_PATH, file_name)
    # Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
    if not os.path.isfile(file_path):
        continue

    # Read the bytes and release the file handle before the slow remote call.
    with open(file_path, "rb") as file:
        file_content = file.read()

    # perf_counter is monotonic, so the measurement is immune to wall-clock
    # adjustments that can skew datetime.now() differences.
    start_time = time.perf_counter()
    result, ai_trigger = main(file_content, file_path)
    end_time = time.perf_counter()

    runtime = end_time - start_time  # seconds
    total_runtime.append(runtime)
    total_ai_trigger += ai_trigger
    file_runtime_dct[file_name] = runtime
    file_missing_dct[file_name] = result[0]['invalid_fields']

0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


KeyboardInterrupt: 

In [5]:
# Report how many images needed the Azure AI fallback, followed by the
# mean / median / max per-image runtime (rounded to 4 decimals).
n_images = len(total_runtime)
print(f"Send to AzAi: {total_ai_trigger}/{n_images} images")

for label, reducer in (("AVG", np.mean), ("Median", np.median), ("Max", np.max)):
    print(f"{label} Runtime: {round(reducer(total_runtime),4)} secs")

Send to AzAi: 16/335 images
AVG Runtime: 6.086 secs
Median Runtime: 4.7925 secs
Max Runtime: 45.7092 secs


In [6]:
# Turn the per-file invalid-field lists into a one-hot table
# (one row per file, one 0/1 column per field) and save it as CSV.
# NOTE(review): main() checks 'transaction_date' / 'transaction_time', while
# this cell looks for 'date' / 'time'. Confirm which names 'invalid_fields'
# actually contains; on a mismatch these two columns would always be 0.
all_fields = ['tax_id', 'total', 'date', 'time']
invalid_columns = [f'invalid_{field}' for field in all_fields]

rows = []
for filename, invalid_fields in file_missing_dct.items():
    # Treat a missing (None) list as "no invalid fields" instead of crashing
    # on the membership test below.
    flagged_fields = invalid_fields if invalid_fields is not None else ()

    row = {'Filename': filename}
    for field, column_name in zip(all_fields, invalid_columns):
        # 1 if the field was reported invalid for this file, 0 otherwise.
        row[column_name] = 1 if field in flagged_fields else 0
    rows.append(row)

# Explicit columns keep the schema stable even when no file was processed.
file_missing_fields_df = pd.DataFrame(rows, columns=['Filename', *invalid_columns])
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
csv_filename = f"{timestamp}_invalid_fields.csv"
file_missing_fields_df.to_csv(csv_filename, index=False)

In [7]:
# Percentage of processed files in which each field was flagged invalid.
n_files = len(file_missing_fields_df)
invalid_counts = file_missing_fields_df[
    ['invalid_tax_id', 'invalid_total', 'invalid_date', 'invalid_time']
].sum()

tax_id_error_rate = invalid_counts['invalid_tax_id'] / n_files * 100
total_error_rate = invalid_counts['invalid_total'] / n_files * 100
date_error_rate = invalid_counts['invalid_date'] / n_files * 100
time_error_rate = invalid_counts['invalid_time'] / n_files * 100

In [8]:
# Print "<invalid count>/<file count> = <rate>%" for every validated field.
n_files = len(file_missing_fields_df)
field_rates = (
    ("tax_id", tax_id_error_rate),
    ("total", total_error_rate),
    ("date", date_error_rate),
    ("time", time_error_rate),
)
for field, rate in field_rates:
    n_invalid = file_missing_fields_df[f"invalid_{field}"].sum()
    print(f"{field} error rate: {n_invalid}/{n_files} = {round(rate,4)}%")

tax_id error rate: 10/335 = 2.9851%
total error rate: 3/335 = 0.8955%
date error rate: 0/335 = 0.0%
time error rate: 0/335 = 0.0%


In [9]:
# Build a two-column table (file name, runtime in seconds) from the timing
# dict and write it to a timestamped CSV in the working directory.
runtime_records = list(file_runtime_dct.items())
file_runtime_df = pd.DataFrame(runtime_records, columns=['Filename', 'Runtime'])

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
csv_filename = f"{timestamp}_runtime.csv"
file_runtime_df.to_csv(csv_filename, index=False)

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-file
# runtime, in seconds.
file_runtime_df.describe()

Unnamed: 0,Runtime
count,335.0
mean,6.085968
std,4.803941
min,2.568122
25%,3.840629
50%,4.792484
75%,5.381986
max,45.709153
