In [10]:
"""
Post-processes the OCR data by cleaning and correcting the extracted text.

This function performs the following steps for each entry in the OCR data:
1. **Clean the text**: Removes unnecessary characters, normalizes spaces, and trims the text.
2. **Correct spelling errors**: Identifies and corrects misspelled words in the OCR text.
3. **Filter by confidence**: Only processes entries with a confidence score above a specified threshold.

The function outputs a list of processed OCR entries, each containing:
- The original bounding box (`bbox`) and class (`class`).
- The original OCR text (`text`).
- The cleaned OCR text (`cleaned_text`).
- The corrected OCR text (`corrected_text`).

Arguments:
    ocr_data (list): List of OCR data entries, where each entry is a dictionary containing:
        - "bbox": The bounding box coordinates for the detected text.
        - "class": The class of the detected object.
        - "confidence": The confidence score for the OCR detection.
        - "text": The raw OCR text extracted from the image.
    confidence_threshold (float): Minimum confidence score to consider the text for post-processing. Default is 0.5.

Returns:
    list: List of processed OCR entries with cleaned and corrected text.
"""


'\nPost-processes the OCR data by cleaning and correcting the extracted text.\n\nThis function performs the following steps for each entry in the OCR data:\n1. **Clean the text**: Removes unnecessary characters, normalizes spaces, and trims the text.\n2. **Correct spelling errors**: Identifies and corrects misspelled words in the OCR text.\n3. **Filter by confidence**: Only processes entries with a confidence score above a specified threshold.\n\nThe function outputs a list of processed OCR entries, each containing:\n- The original bounding box (`bbox`) and class (`class`).\n- The original OCR text (`text`).\n- The cleaned OCR text (`cleaned_text`).\n- The corrected OCR text (`corrected_text`).\n\nArguments:\n    ocr_data (list): List of OCR data entries, where each entry is a dictionary containing:\n        - "bbox": The bounding box coordinates for the detected text.\n        - "class": The class of the detected object.\n        - "confidence": The confidence score for the OCR detect

In [11]:
!pip install pyspellchecker

Defaulting to user installation because normal site-packages is not writeable


# 0. Clean up the processed OCR directory files from the last run
This cell will allow you to clean up the directory containing the processed OCR files.
Only do this when running tests of tje pipeline.

In [12]:
import shutil
import os

# Function to clean up files in the specified directory
def clean_up_processed_ocr_dir(directory):
    """
    Removes all files in the given directory (including subdirectories).

    Arguments:
        directory (str): Directory path to clean.
    """
    if os.path.exists(directory):
        # Remove all files and subdirectories
        shutil.rmtree(directory)
        print(f"Cleaned up the directory: {directory}")
    else:
        print(f"Directory not found: {directory}")

# Path to the processed OCR directory
processed_ocr_dir = 'ocr_output_json_post_processed'

# Clean up the processed OCR directory
clean_up_processed_ocr_dir(processed_ocr_dir)


Directory not found: ocr_output_json_post_processed


# 1. Load the OCR JSON file

In [13]:
import json

# Function to load the OCR data from the JSON file
def load_ocr_json(file_path):
    """
    Loads the OCR data from a JSON file.
    
    Arguments:
        file_path (str): Path to the OCR JSON file.
        
    Returns:
        dict: Loaded OCR data.
    """
    with open(file_path, 'r') as f:
        ocr_data = json.load(f)
    return ocr_data

# Path to the OCR results JSON file
ocr_json_file_path = 'ocr_output_json/ocr_all_images.json'  # Update the path as per your directory
ocr_data = load_ocr_json(ocr_json_file_path)

# Display the first few entries to ensure it's loaded correctly
ocr_data[:5]  # Display first 5 entries for a quick check


[{'image': '256_png.rf.2d34617941a0c32424cbe06878dafd4a.jpg',
  'bbox': [162, 567, 310, 635],
  'class': 'activity',
  'confidence': 0.926584005355835,
  'text': ''},
 {'image': '256_png.rf.2d34617941a0c32424cbe06878dafd4a.jpg',
  'bbox': [343, 94, 490, 165],
  'class': 'activity',
  'confidence': 0.9263918399810791,
  'text': ''},
 {'image': '256_png.rf.2d34617941a0c32424cbe06878dafd4a.jpg',
  'bbox': [488, 455, 635, 525],
  'class': 'activity',
  'confidence': 0.9126627445220947,
  'text': ''},
 {'image': '256_png.rf.2d34617941a0c32424cbe06878dafd4a.jpg',
  'bbox': [326, 567, 473, 636],
  'class': 'activity',
  'confidence': 0.9073367118835449,
  'text': ''},
 {'image': '256_png.rf.2d34617941a0c32424cbe06878dafd4a.jpg',
  'bbox': [4, 456, 149, 524],
  'class': 'activity',
  'confidence': 0.9048864841461182,
  'text': ''}]

# 2. Post Process the OCR Text

In [14]:
import re
from spellchecker import SpellChecker

# Function to clean OCR text
def clean_text(text):
    """
    Cleans the OCR text by removing unnecessary characters and normalizing spaces.
    
    Arguments:
        text (str): The raw OCR text.
        
    Returns:
        str: Cleaned OCR text.
    """
    # Remove unwanted characters (e.g., special characters, unwanted spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Keep only alphanumeric characters and spaces
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.strip()  # Remove leading and trailing spaces
    return text

from spellchecker import SpellChecker

# Function to correct OCR spelling mistakes
def correct_ocr_text(ocr_text):
    """
    Corrects OCR text by identifying and fixing misspelled words.
    
    Arguments:
        ocr_text (str): The OCR text.
        
    Returns:
        str: The corrected OCR text.
    """
    spell = SpellChecker()
    words = ocr_text.split()
    
    # Correct each word and ensure no None values are included
    corrected_words = [spell.correction(word) if spell.correction(word) is not None else word for word in words]
    corrected_text = ' '.join(corrected_words)
    return corrected_text


# Post-process OCR data
def post_process_ocr(ocr_data, confidence_threshold=0.1):
    """
    Processes the OCR data by cleaning and correcting the extracted text.

    Arguments:
        ocr_data (list): OCR data with bounding boxes and extracted text.
        confidence_threshold (float): Minimum confidence score to consider the text.

    Returns:
        list: Processed OCR data with cleaned and corrected text.
    """
    processed_ocr = []

    for entry in ocr_data:
        confidence = entry.get("confidence", 0)
        text = entry.get("text", "")

        # Process only entries with a confidence score above the threshold
        if confidence >= confidence_threshold:
            # Clean the OCR text
            cleaned_text = clean_text(text)
            # Correct the OCR text
            corrected_text = correct_ocr_text(cleaned_text)

            # Add the processed entry with corrected text
            processed_ocr.append({
                "bbox": entry["bbox"],
                "class": entry["class"],
                "confidence": confidence,
                "text": text,  # Original OCR text
                "cleaned_text": cleaned_text,
                "corrected_text": corrected_text
            })

    return processed_ocr

# Process OCR data
processed_ocr = post_process_ocr(ocr_data)

# Display the first 5 processed entries for validation
processed_ocr[:5]


[{'bbox': [162, 567, 310, 635],
  'class': 'activity',
  'confidence': 0.926584005355835,
  'text': '',
  'cleaned_text': '',
  'corrected_text': ''},
 {'bbox': [343, 94, 490, 165],
  'class': 'activity',
  'confidence': 0.9263918399810791,
  'text': '',
  'cleaned_text': '',
  'corrected_text': ''},
 {'bbox': [488, 455, 635, 525],
  'class': 'activity',
  'confidence': 0.9126627445220947,
  'text': '',
  'cleaned_text': '',
  'corrected_text': ''},
 {'bbox': [326, 567, 473, 636],
  'class': 'activity',
  'confidence': 0.9073367118835449,
  'text': '',
  'cleaned_text': '',
  'corrected_text': ''},
 {'bbox': [4, 456, 149, 524],
  'class': 'activity',
  'confidence': 0.9048864841461182,
  'text': '',
  'cleaned_text': '',
  'corrected_text': ''}]

# 3. Save the processed OCR data to a new JSON file

In [15]:
from pathlib import Path

# Function to save processed OCR data to a JSON file

# Directory to save processed OCR data
post_processed_dir = 'ocr_output_json_post_processed'

# Ensure the directory exists
Path(post_processed_dir).mkdir(parents=True, exist_ok=True)


def save_processed_ocr(ocr_data, output_file_path):
    """
    Saves the processed OCR data to a JSON file.

    Arguments:
        ocr_data (list): The processed OCR data.
        output_file_path (str): Path where the processed OCR data will be saved.
    """
    with open(output_file_path, 'w') as f:
        json.dump(ocr_data, f, indent=2)
    print(f"Processed OCR data saved to: {output_file_path}")

# Path to save the processed OCR data
processed_ocr_file_path = 'ocr_output_json_post_processed/ocr_processed.json'

# Save the processed OCR data
save_processed_ocr(processed_ocr, processed_ocr_file_path)


Processed OCR data saved to: ocr_output_json_post_processed/ocr_processed.json
