In [1]:
!pip install pytesseract pillow

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [2]:
!apt-get install tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 0s (25.6 MB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123632 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-e

In [4]:
import os
import cv2
import pytesseract
from PIL import Image

# Tell pytesseract which Tesseract binary to invoke.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Input/output locations handed to process_dataset() at the end of this cell.
image_folder = "/content/invoice/images"  # Folder scanned for input images
output_folder = "/content/extracted_texts/"  # Folder to save .txt files

# Function to extract text using Tesseract
def extract_text(image_path):
    """
    Convert an image to text using Tesseract OCR.

    The image is opened with PIL inside a ``with`` block and handed to
    ``pytesseract.image_to_string``. Using the context manager releases the
    underlying file handle as soon as OCR finishes, so looping over a large
    folder does not accumulate open files.

    Args:
        image_path (str): Path to the image file.

    Returns:
        str: Extracted text from the image, or an empty string if opening the
        image or running OCR raised (the error is printed, not re-raised).
    """
    try:
        # The context manager closes the image file even if OCR raises.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # continue with the next image.
        print(f"Error processing image {image_path}: {e}")
        return ""

# Function to preprocess the image with thresholding (no resizing)
def preprocess_image(image_path, threshold=127):
    """
    Threshold a grayscale copy of the image and save it beside the original.

    The image is loaded in grayscale (no resizing) and passed through
    ``cv2.threshold`` with ``cv2.THRESH_TOZERO``. Note that this is a
    "to zero" threshold, not a binary one: the result is not forced to pure
    black/white. The result is written to ``<original path without
    extension>_thresholded.jpg``.

    Args:
        image_path (str): Path to the image file.
        threshold (int): Threshold value passed to ``cv2.threshold``.
            Defaults to 127.

    Returns:
        str | None: Path to the preprocessed image file, or ``None`` if the
        image could not be loaded, the result could not be written, or any
        step raised (the error is printed, not re-raised).
    """
    try:
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

        # A failed load shows up as None rather than an exception.
        if img is None:
            print(f"Error: Unable to load image {image_path}")
            return None

        # First return value (the threshold actually used) is not needed.
        _, thresholded_img = cv2.threshold(img, threshold, 255, cv2.THRESH_TOZERO)

        preprocessed_image_path = os.path.splitext(image_path)[0] + "_thresholded.jpg"

        # cv2.imwrite reports failure through its boolean return value; without
        # this check a path to a file that was never written would be returned.
        if not cv2.imwrite(preprocessed_image_path, thresholded_img):
            print(f"Error: Unable to save preprocessed image {preprocessed_image_path}")
            return None

        return preprocessed_image_path
    except Exception as e:
        print(f"Error preprocessing image {image_path}: {e}")
        return None

# Function to process the dataset (folder of images)
def process_dataset(image_folder, output_folder):
    """
    OCR every image in a folder, before and after thresholding.

    For each image file in ``image_folder`` (matched by file extension,
    case-insensitively) the text extracted from the original image is printed.
    The image is then run through ``preprocess_image``; if that succeeds, the
    text extracted from the preprocessed image is printed and saved to
    ``<image name without extension>_preprocessed.txt`` in ``output_folder``.

    Files whose name (without extension) ends in ``_thresholded`` are skipped,
    because ``preprocess_image`` writes its output into the input folder under
    that name; without the skip, re-running this function would treat its own
    earlier output as new input.

    Args:
        image_folder (str): Folder containing the images to process.
        output_folder (str): Folder to save the .txt files in; created if it
            does not exist.

    Returns:
        None
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')
    generated_suffix = "_thresholded"  # must match the name preprocess_image writes

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order (and printed log) is the same on every run.
    for filename in sorted(os.listdir(image_folder)):
        if not filename.lower().endswith(image_extensions):
            continue

        stem = os.path.splitext(filename)[0]
        if stem.endswith(generated_suffix):
            # Artifact of an earlier run, not a source image.
            continue

        image_path = os.path.join(image_folder, filename)

        # Extract text from the original image
        print(f"Processing original image: {filename}")
        original_text = extract_text(image_path)
        print("Text from original image:")
        print(original_text)

        # Preprocess the image (threshold only)
        preprocessed_image_path = preprocess_image(image_path)
        if not preprocessed_image_path:
            # preprocess_image already printed the reason; nothing to save.
            continue

        # Extract text from the preprocessed image
        print(f"Processing preprocessed image: {os.path.basename(preprocessed_image_path)}")
        preprocessed_text = extract_text(preprocessed_image_path)
        print("Text from preprocessed image:")
        print(preprocessed_text)

        txt_filename = stem + "_preprocessed.txt"
        output_txt_path = os.path.join(output_folder, txt_filename)

        # Save extracted text from the preprocessed image. Explicit UTF-8 so
        # non-ASCII OCR output does not depend on the platform default encoding.
        try:
            with open(output_txt_path, 'w', encoding='utf-8') as txt_file:
                txt_file.write(preprocessed_text)
            print(f"Processed {filename}, saved text to {txt_filename}")
        except Exception as e:
            print(f"Error saving text for {filename}: {e}")


# Run the OCR pass over image_folder; extracted text files are written to output_folder.
process_dataset(image_folder, output_folder)


Processing original image: Template9_Instance170_resized.jpg
Text from original image:
Navarro, Ford and Bryan INVOICE # INV/73-81/333

Invoice Date: 06-Jun-2012
GSTIN/UIN: O9AABCS142961ZS

Email:rclark@example.org

Buyer :Alan Poole

515 Suzanne Dam Suite 983
Brittanyside, OH 05404 US
Tel:+(141)371-7288
Email:martinezanna@example.com
Site:https://reilly-howell.biz/

| Oty | Description | Unit Price _| Amount
5.00 _Two drug data. | 46.98 | 234.90
2.00 _Mission federal my. | 1.67 | 15.34
_ 3.00 _ Defense participant until. | 56.41 | 169.23
5.00 _ Evidence produce. I 12.39 | 61.95

Total in words: four hundred and ninet-
y-eight point zero two

TOTAL : 498.02 EUR

Note:Total payment due in 14 days.

Processing preprocessed image: Template9_Instance170_resized_thresholded.jpg
Text from preprocessed image:
Navarro, Ford and Bryan INVOICE # INV/73-81/333

GSTIN/UIN: 09AABCS142961ZS

Email:rclark@example.org

Buyer :Alan Poole

515 Suzanne Dam Suite 983
Brittanyside, OH 05404 US
Tel:+(141)3