In [1]:
!pip install pytesseract pillow

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [2]:
!apt-get install tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 0s (25.6 MB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123632 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-e

In [15]:
!rm -rf /content/extracted_texts
!rm -rf /content/processed_images


In [11]:
import os
import glob

# Directory that holds the invoice images
folder_path = "/content/invoice/images"

# Glob patterns describing the derived files that must be deleted
patterns = ["*_thresholded.*", "*_resized.*"]

# Lazily expand the patterns one after another, in list order, so that each
# glob is evaluated only once the previous pattern's files have been handled
matching_files = (
    candidate
    for name_pattern in patterns
    for candidate in glob.glob(os.path.join(folder_path, name_pattern))
)

# Delete every match, reporting success or failure per file
for file_path in matching_files:
    try:
        os.remove(file_path)
        print(f"Removed: {file_path}")
    except Exception as e:
        print(f"Error removing {file_path}: {e}")


Removed: /content/invoice/images/Template1_Instance144_thresholded.jpg
Removed: /content/invoice/images/Template2_Instance3_thresholded.jpg
Removed: /content/invoice/images/Template10_Instance154_thresholded.jpg
Removed: /content/invoice/images/Template7_Instance154_thresholded.jpg
Removed: /content/invoice/images/Template9_Instance170_thresholded.jpg


In [16]:
import os
import cv2
import pytesseract

# Locations used by the pipeline: source images, adjusted images, OCR text
image_folder = "/content/invoice/images"
output_folder = "/content/processed_images/"
text_output_folder = "/content/ocr_text/"

# Tell pytesseract which Tesseract executable to run (change for your system)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"


def adjust_text_size(image, engine_version="4.x", target_height=None):
    """
    Rescale an image so its average character height matches a target height.

    Character boxes are obtained from ``pytesseract.image_to_boxes``; the mean
    height of the usable boxes is compared with the target and the whole image
    is scaled by that ratio. The function is best-effort: on any failure it
    prints a message and returns the input image unchanged instead of raising.

    Args:
        image (ndarray): Input image in grayscale.
        engine_version (str): Tesseract engine version ("3.x" for legacy, "4.x" for LSTM).
            Only consulted to choose a default when ``target_height`` is None.
        target_height (int, optional): Custom target character height in pixels.
            Must be positive.

    Returns:
        ndarray: The resized image, or the original image when no text is
        detected, the target height cannot be determined, or an error occurs.
    """
    # Default target heights (pixels) per engine: 16 approximates a 12 pt font
    # for the legacy engine, 33 is the value used for the LSTM engine.
    default_heights = {"3.x": 16, "4.x": 33}

    try:
        # Resolve and validate the target before running the (slow) OCR pass
        if target_height is None:
            target_height = default_heights.get(engine_version)
            if target_height is None:
                print(f"Unsupported engine version: {engine_version!r}. Image left unchanged.")
                return image
        if target_height <= 0:
            print(f"Invalid target height: {target_height!r}. Image left unchanged.")
            return image

        # Detect text bounding boxes using Tesseract's OCR data
        ocr_data = pytesseract.image_to_boxes(image)

        # Collect the height of every usable character box
        # NOTE(review): the mean covers every glyph box (lowercase and
        # punctuation included) while the targets are described as
        # capital-letter heights - confirm whether only capitals should count.
        heights = []
        for line in ocr_data.splitlines():
            parts = line.split(' ')
            if len(parts) < 5:
                continue
            try:
                char_height = int(parts[4]) - int(parts[2])  # (top - bottom)
            except ValueError:
                # A malformed line must not abort the whole measurement
                continue
            if char_height > 0:
                # Degenerate boxes would drag the mean to zero or below
                heights.append(char_height)

        if not heights:
            print("No text detected to measure height.")
            return image  # Return original image if no text is detected

        avg_char_height = sum(heights) / len(heights)
        print(f"Average character height: {avg_char_height}px")

        scale_factor = target_height / avg_char_height
        # Never ask for a zero-sized image, which the resize would reject
        new_width = max(1, int(image.shape[1] * scale_factor))
        new_height = max(1, int(image.shape[0] * scale_factor))
        # Cubic interpolation for enlarging, area interpolation for shrinking
        interpolation = cv2.INTER_CUBIC if scale_factor >= 1 else cv2.INTER_AREA
        resized_image = cv2.resize(image, (new_width, new_height), interpolation=interpolation)

        print(f"Text resized for Tesseract {engine_version}: Target height = {target_height}px")
        return resized_image
    except Exception as e:
        # Deliberate best-effort: OCR or resize failures leave the image as is
        print(f"Error in adjusting text size: {e}")
        return image  # Return original image if there's an error


def process_images(image_folder, output_folder, text_output_folder, engine_version="4.x", limit=5):
    """
    Process images: adjust text size for Tesseract OCR, extract text, and save results.

    Image files in ``image_folder`` are visited in sorted filename order so the
    subset selected by ``limit`` is the same on every run. Each image is loaded
    in grayscale, passed through ``adjust_text_size``, written to
    ``output_folder`` under its original filename, and OCR'd; the text is
    printed and saved as ``<name>.txt`` in ``text_output_folder``.

    Args:
        image_folder (str): Path to folder containing input images.
        output_folder (str): Path to save processed images (created if missing).
        text_output_folder (str): Path to save extracted text files (created if missing).
        engine_version (str): Tesseract engine version ("3.x" or "4.x"),
            forwarded to ``adjust_text_size``.
        limit (int, optional): Maximum number of images to process. ``None``
            processes every image; zero or a negative value processes none.
    """
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(text_output_folder, exist_ok=True)

    # A non-positive limit means there is nothing to do
    if limit is not None and limit <= 0:
        print("Processed limit of images reached.")
        return

    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')
    processed_count = 0

    # os.listdir order is arbitrary; sorting makes the selection reproducible
    for filename in sorted(os.listdir(image_folder)):
        if not filename.lower().endswith(image_extensions):
            continue

        image_path = os.path.join(image_folder, filename)

        # Load the image in grayscale
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            print(f"Error: Unable to load image {image_path}")
            continue

        # Adjust text size
        adjusted_img = adjust_text_size(img, engine_version=engine_version)

        # Save the adjusted image; report a failed write instead of claiming success
        output_path = os.path.join(output_folder, filename)
        if cv2.imwrite(output_path, adjusted_img):
            print(f"Processed and saved: {output_path}")
        else:
            print(f"Error: Unable to save image {output_path}")

        # Extract text using Tesseract OCR
        extracted_text = pytesseract.image_to_string(adjusted_img)
        print(extracted_text)
        text_file_path = os.path.join(text_output_folder, f"{os.path.splitext(filename)[0]}.txt")

        # Save extracted text to a file
        with open(text_file_path, "w", encoding="utf-8") as text_file:
            text_file.write(extracted_text)
        print(f"Extracted text saved to: {text_file_path}")

        processed_count += 1
        if limit is not None and processed_count >= limit:
            print("Processed limit of images reached.")
            break


# Launch the pipeline on the configured folders: LSTM engine, at most five images
process_images(
    image_folder=image_folder,
    output_folder=output_folder,
    text_output_folder=text_output_folder,
    engine_version="4.x",
    limit=5,
)


Average character height: 8.352180936995154px
Text resized for Tesseract 4.x: Target height = 33px
Processed and saved: /content/processed_images/Template10_Instance154.jpg
Address:4894 Melanie Pass Apt. 742
Tammyland, SD 42587 US

Invoice Date: 17-Nov-2017

Invoice number 7077-628
Mclean-Cochran

Due Date : 16-Feb-1995

 

Bill to:Christopher Mayo

87430 Sarah Tunnel Suite 340

East John, MA 84969 US
Tel:+(878)679-0474
Email:murphyanthony@example.org
Site:http://diaz.net/

ITEMS QUANTITY PRICE TAX AMOUNT
Be large. 4.00 $22.31 $0.00 $89.24
Anything exactly hand. 6.00 $40.95 $0.00 $245.70

DISCOUNT(2.39%): (-) 8.01

TOTAL : 352.03 EUR

Terms & Conditions

1. Please send payment within 8 days of receiving this invoice.
There will be a 2.5% interest charge per month on late invoices.
2. Please include the Invoice number in your payment notes.

Note:
This order is shipped through blue dart courier

Email:cjames@example.com

Extracted text saved to: /content/ocr_text/Template10_Instance154