In [1]:
import os
from collections import Counter
from difflib import SequenceMatcher

import pandas as pd

# Function to calculate character-level accuracy
def calculate_character_accuracy(ocr_text, ground_truth_text):
    """Return the character-level similarity of two texts as a percentage.

    The score is ``difflib.SequenceMatcher.ratio() * 100``, i.e.
    ``2 * M / T * 100`` where ``M`` is the number of matched characters and
    ``T`` is the combined length of both strings.

    Args:
        ocr_text: Text produced by the OCR engine.
        ground_truth_text: Reference transcription to compare against.

    Returns:
        float: Similarity in the range 0-100. Identical strings (including
        two empty strings) score 100.0.
    """
    # autojunk must be off: with the default heuristic, once the second
    # sequence has 200+ items every character that makes up more than ~1% of
    # it is treated as "popular" and dropped from matching. For page-length
    # text that is almost every letter and the space, which can deflate the
    # ratio dramatically. Disabling it is slower on very long inputs but
    # gives the true ratio.
    matcher = SequenceMatcher(None, ocr_text, ground_truth_text, autojunk=False)
    return matcher.ratio() * 100

# Function to calculate word-level accuracy
def calculate_word_accuracy(ocr_text, ground_truth_text):
    """Return the percentage of ground-truth words that the OCR text reproduces.

    Both texts are split on whitespace and compared as bags of words
    (word order is ignored, but repetitions are not): each ground-truth
    occurrence of a word is counted as correct only if the OCR text
    contains a matching occurrence of the same word.

    Args:
        ocr_text: Text produced by the OCR engine.
        ground_truth_text: Reference transcription to compare against.

    Returns:
        float: ``matched_words / total_ground_truth_words * 100``, or 0.0
        when the ground truth contains no words.
    """
    ground_truth_counts = Counter(ground_truth_text.split())
    total_words = sum(ground_truth_counts.values())
    if not total_words:
        return 0.0

    ocr_counts = Counter(ocr_text.split())
    # Counter & Counter keeps the minimum count per word, so a word repeated
    # three times in the ground truth needs three OCR occurrences to be fully
    # credited (a plain set intersection would credit it after one).
    matched_words = sum((ocr_counts & ground_truth_counts).values())
    return matched_words / total_words * 100

# Helper: read a text file as UTF-8 and trim surrounding whitespace
def _read_stripped_text(path):
    """Return the contents of ``path`` decoded as UTF-8 with outer whitespace removed."""
    # Explicit encoding: without it open() falls back to the platform locale,
    # so the same files could decode differently (or fail) on another machine.
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read().strip()


# Function to calculate accuracies for all files
def calculate_accuracies(ocr_folder, ground_truth_folder, output_file):
    """Score every OCR ``.txt`` file against its ground truth and save a report.

    For each ``*.txt`` file in ``ocr_folder`` the file with the same name is
    looked up in ``ground_truth_folder``. Pairs where the ground truth is
    missing, either file is zero bytes, or either file contains only
    whitespace are skipped with a printed message. The remaining pairs are
    scored with ``calculate_character_accuracy`` and
    ``calculate_word_accuracy``.

    Args:
        ocr_folder: Directory containing the OCR output ``.txt`` files.
        ground_truth_folder: Directory containing reference ``.txt`` files
            with matching file names.
        output_file: Destination of the report. A path ending in ``.csv``
            is written with ``DataFrame.to_csv``; any other path is written
            with ``DataFrame.to_excel``.

    Returns:
        None. The report is written to ``output_file`` and progress messages
        are printed.
    """
    results = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # report rows and the skip log reproducible between runs.
    for filename in sorted(os.listdir(ocr_folder)):
        if not filename.endswith(".txt"):
            continue

        ocr_file_path = os.path.join(ocr_folder, filename)
        ground_truth_file_path = os.path.join(ground_truth_folder, filename)

        # Check if the corresponding ground truth file exists
        if not os.path.exists(ground_truth_file_path):
            print(f"Ground truth for {filename} not found. Skipping...")
            continue

        # Skip empty files (cheap size check before reading anything)
        if os.path.getsize(ocr_file_path) == 0 or os.path.getsize(ground_truth_file_path) == 0:
            print(f"Skipping {filename}: One or both files are empty.")
            continue

        # Read OCR text and ground truth text
        ocr_text = _read_stripped_text(ocr_file_path)
        ground_truth_text = _read_stripped_text(ground_truth_file_path)

        # Skip if either file is empty after stripping whitespace
        if not ocr_text or not ground_truth_text:
            print(f"Skipping {filename}: One or both files contain only whitespace.")
            continue

        # Calculate character and word-level accuracy
        results.append({
            "file": filename,
            "character_accuracy (%)": calculate_character_accuracy(ocr_text, ground_truth_text),
            "word_accuracy (%)": calculate_word_accuracy(ocr_text, ground_truth_text),
        })

    # Fixing the columns keeps the header row present even when every file
    # was skipped and `results` is empty.
    results_df = pd.DataFrame(
        results,
        columns=["file", "character_accuracy (%)", "word_accuracy (%)"],
    )

    # Save results to a CSV or Excel file, chosen by the output extension
    extension = os.path.splitext(os.fspath(output_file))[1].lower()
    if extension == ".csv":
        results_df.to_csv(output_file, index=False)
    else:
        results_df.to_excel(output_file, index=False)
    print(f"Accuracy results saved to {output_file}")


In [2]:
# Every input and output of this run lives under one dataset directory;
# point `data_dir` elsewhere to evaluate a different data split.
data_dir = '/Users/darwinye/myfile/NorthwesternU/499 Capstone/Data_subset_Final/test'

ocr_folder = os.path.join(data_dir, 'OCR_text')
ground_truth_folder = os.path.join(data_dir, 'groundtruth')
output_file = os.path.join(data_dir, 'accuracy_results.xlsx')

# Scores each OCR file against its ground truth and writes the report;
# skipped files are listed in the cell output.
calculate_accuracies(ocr_folder, ground_truth_folder, output_file)


Skipping 512.txt: One or both files are empty.
Skipping 1243.txt: One or both files are empty.
Skipping 201.txt: One or both files contain only whitespace.
Skipping 1186.txt: One or both files are empty.
Skipping 599.txt: One or both files are empty.
Skipping 603.txt: One or both files are empty.
Skipping 358.txt: One or both files are empty.
Skipping 5.txt: One or both files are empty.
Skipping 341.txt: One or both files are empty.
Skipping 41.txt: One or both files are empty.
Skipping 232.txt: One or both files contain only whitespace.
Skipping 1217.txt: One or both files are empty.
Skipping 877.txt: One or both files are empty.
Skipping 692.txt: One or both files contain only whitespace.
Skipping 319.txt: One or both files are empty.
Skipping 1077.txt: One or both files are empty.
Skipping 1102.txt: One or both files contain only whitespace.
Skipping 1274.txt: One or both files contain only whitespace.
Accuracy results saved to /Users/darwinye/myfile/NorthwesternU/499 Capstone/Data_