In [16]:
import os
import re
from datetime import datetime

import easyocr
import pandas as pd


In [17]:
# OCR engine restricted to English; built once and reused for every image
reader = easyocr.Reader(['en'])

# Folder that holds the receipt images to scan
receipt_directory = r'C:\Users\Martin\Desktop\Tax 2024\Receipts'

# Empty table that collects one row per processed receipt
columns = [
    'Store Name',
    'Transaction Date',
    'Total Amount',
]
receipt_data = pd.DataFrame(columns=columns)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [19]:

def preprocess_image(image_path):
    """Load an image and binarise it to improve OCR results.

    The image is read with OpenCV, converted to grayscale and passed through
    Gaussian adaptive thresholding (block size 11, constant 2).

    Args:
        image_path: Path of the image file to load.

    Returns:
        The thresholded single-channel image produced by
        ``cv2.adaptiveThreshold``.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    # Imported here because no visible notebook cell imports cv2; without it
    # a fresh kernel fails with NameError on the first image.
    import cv2

    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque cv2.error inside cvtColor.
    if image is None:
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path}")
        raise ValueError(f"Could not decode image: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Apply adaptive thresholding
    processed_image = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    return processed_image

def extract_text_from_image(image_path):
    """Run OCR on a receipt image and return the recognised text.

    The image is first passed through ``preprocess_image``; the result is
    handed to the module-level EasyOCR ``reader`` with ``detail=0``. The
    returned fragments are joined with newlines, echoed to stdout as a
    debugging aid, and returned as a single string.
    """
    fragments = reader.readtext(preprocess_image(image_path), detail=0)
    text = "\n".join(fragments)
    # Debug aid: show exactly what the OCR engine produced for this file
    print(f"Extracted text from {image_path}:\n{text}\n{'-'*40}\n")
    return text

# Each entry pairs a regex that locates a candidate date inside a line with
# the strptime format used to validate it. Order sets precedence.
_DATE_PATTERNS = (
    (re.compile(r'(?<!\d)\d{4}/\d{1,2}/\d{1,2}(?!\d)'), '%Y/%m/%d'),
    (re.compile(r'(?<!\d)\d{1,2}-[A-Za-z]{3}-\d{4}(?!\d)'), '%d-%b-%Y'),
    (re.compile(r'(?<!\d)\d{1,2}-\d{1,2}-\d{4}(?!\d)'), '%d-%m-%Y'),
)

# A numeric token: optional integer part (commas allowed) with a decimal
# fraction, or a plain integer part.
_AMOUNT_PATTERN = re.compile(r'(?:\d[\d,]*)?\.\d+|\d[\d,]*')


def _find_date(line):
    """Return the first valid date in ``line`` as 'YYYY-MM-DD', or None."""
    for pattern, date_format in _DATE_PATTERNS:
        for candidate in pattern.findall(line):
            try:
                return datetime.strptime(candidate, date_format).strftime('%Y-%m-%d')
            except ValueError:
                # Looks like a date but is not one (e.g. month 13); keep looking.
                continue
    return None


def _find_amount(line):
    """Return the amount in ``line`` as a float, or None if it has no number."""
    tokens = _AMOUNT_PATTERN.findall(line)
    if not tokens:
        return None
    # Prefer a token with a decimal point so counts such as "(2 items)" are
    # not mistaken for, or merged into, the amount.
    with_decimals = [token for token in tokens if '.' in token]
    chosen = with_decimals[-1] if with_decimals else tokens[-1]
    # Commas are treated as thousands separators and dropped.
    return float(chosen.replace(',', ''))


def parse_receipt_text(text):
    """Extract the store name, transaction date, and total amount from receipt text.

    The text is scanned line by line:

    * Store name: the first line longer than three characters.
    * Transaction date: a date found anywhere in a line, in one of the
      formats ``YYYY/MM/DD``, ``DD-Mon-YYYY`` or ``DD-MM-YYYY``. If several
      lines contain a date, the last one wins.
    * Total amount: the number on a line containing "total"
      (case-insensitive). If several such lines carry a number, the last
      one wins; a "total" line without a number leaves the value untouched.

    Progress is printed to stdout for debugging.

    Args:
        text: Newline-separated OCR text of one receipt.

    Returns:
        A tuple ``(store_name, transaction_date, total_amount)`` where
        ``store_name`` is a str, ``transaction_date`` a 'YYYY-MM-DD' str and
        ``total_amount`` a float; each element is None when not found.
    """
    store_name = None
    transaction_date = None
    total_amount = None

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        print(f"Processing line: {line}")  # Debugging: Print each line being processed

        # Extract store name (assuming it's on the first few lines)
        if not store_name and len(line) > 3:
            store_name = line
            print(f"Store name identified as: {store_name}")

        # Extract transaction date; the date may be embedded in other text
        found_date = _find_date(line)
        if found_date is not None:
            transaction_date = found_date
            print(f"Transaction date identified as: {transaction_date}")

        # Extract total amount (assuming the line mentions 'Total' or similar)
        if 'total' in line.lower():
            found_amount = _find_amount(line)
            # Only overwrite on success so a later "total" line without a
            # number cannot erase an amount that was already parsed.
            if found_amount is not None:
                total_amount = found_amount
                print(f"Total amount identified as: {total_amount}")

    return store_name, transaction_date, total_amount

def process_receipts(directory):
    """Process all receipt images in a directory and record the extracted data.

    Every file in ``directory`` whose name ends in ``.jpg``, ``.jpeg`` or
    ``.png`` (compared case-insensitively, so ``.JPG`` is included) is run
    through ``extract_text_from_image`` and ``parse_receipt_text``. One row
    per image is appended to the module-level ``receipt_data`` DataFrame.
    Files are visited in sorted name order so the row order is reproducible.

    Args:
        directory: Path of the folder containing the receipt images.

    Returns:
        None. Results are appended to ``receipt_data`` as a side effect.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    # os.listdir returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(directory)):
        # Lower-case the name first: cameras and phones commonly write
        # upper-case extensions, which a case-sensitive check would skip.
        if not filename.lower().endswith(image_extensions):
            continue
        file_path = os.path.join(directory, filename)
        text = extract_text_from_image(file_path)
        store_name, transaction_date, total_amount = parse_receipt_text(text)
        receipt_data.loc[len(receipt_data)] = [store_name, transaction_date, total_amount]


In [20]:
# Run OCR over every image in the receipts folder; rows accumulate in receipt_data
process_receipts(receipt_directory)

# Persist the collected rows as CSV, leaving out the index column
receipt_data.to_csv(
    'extracted_receipt_data.csv',
    index=False,
)

# Show the collected rows
print(receipt_data)

Empty DataFrame
Columns: [Store Name, Transaction Date, Total Amount]
Index: []
