In [1]:
# Python-side dependencies: pytesseract (wrapper around the tesseract CLI) and Pillow for image loading.
# %pip installs into the running kernel's environment; the pytesseract pin matches the version
# this notebook was last run with.
%pip install -q pytesseract==0.3.13 pillow

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [2]:
# System package that provides the tesseract binary used by pytesseract.
# -y pre-answers apt's confirmation prompt so the install does not depend on interactive input.
!apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 0s (25.6 MB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123632 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-e

In [3]:
import os
import cv2
import pytesseract
from PIL import Image

# Point pytesseract at the Tesseract CLI binary explicitly instead of relying on a PATH lookup.
# NOTE(review): /usr/bin/tesseract is presumably where the apt-installed binary lives — confirm
# before running this notebook in a different environment.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Input/output locations (absolute paths; adjust when running elsewhere)
image_folder = "/content/invoice/images"  # Folder that is scanned for input images
output_folder = "/content/extracted_texts/"  # Folder to save .txt files

# Function to extract text using Tesseract
def extract_text(image_path):
    """
    Convert an image to text using Tesseract OCR.

    Args:
        image_path (str): Path to the image file.

    Returns:
        str: Text recognised in the image, or an empty string if the image
            could not be opened or OCR failed (the error is printed).
    """
    try:
        # Use the image as a context manager so its underlying file handle is
        # released as soon as OCR finishes, instead of lingering until garbage
        # collection when many images are processed in a loop.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort a batch run.
        print(f"Error processing image {image_path}: {e}")
        return ""

# Function to resize the image using OpenCV
def resize_image(image_path):
    """
    Upscale an image by a factor of 2 using OpenCV and save the result.

    The resized copy is written next to the input file as
    ``<original name without extension>_resized.jpg``.

    Args:
        image_path (str): Path to the image file.

    Returns:
        str | None: Path to the resized image file, or None if the image
            could not be loaded, resized or written (the error is printed).
    """
    try:
        # Load the image using OpenCV; an unreadable file yields None here,
        # which is why the result is checked explicitly.
        img = cv2.imread(image_path)
        if img is None:
            print(f"Error: Unable to load image {image_path}")
            return None

        # Double both dimensions; the scale comes from fx/fy because dsize is None.
        resized_img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

        resized_image_path = os.path.splitext(image_path)[0] + "_resized.jpg"

        # cv2.imwrite reports failure through its return value, so an unchecked
        # call would hand callers a path to a file that was never written.
        if not cv2.imwrite(resized_image_path, resized_img):
            print(f"Error: Unable to write resized image {resized_image_path}")
            return None

        return resized_image_path
    except Exception as e:
        print(f"Error resizing image {image_path}: {e}")
        return None

# Function to process the dataset (folder of images)
def process_dataset(image_folder, output_folder, limit=5):
    """
    Run OCR on a folder of images, on both the original and a 2x-resized copy.

    For every image file found in ``image_folder`` the text extracted from the
    original image and from its resized copy is printed, and the text from the
    resized copy is saved to ``<image name>_resized.txt`` in ``output_folder``.

    Args:
        image_folder (str): Folder containing the input images.
        output_folder (str): Folder for the generated .txt files; created if
            it does not exist.
        limit (int | None): Maximum number of images to process. ``None``
            processes every image; zero or a negative value processes none.

    Returns:
        None

    Raises:
        OSError: If ``image_folder`` cannot be listed or ``output_folder``
            cannot be created.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Honour a non-positive limit up front; checking only after an image has
    # been handled would still process one image for limit=0.
    if limit is not None and limit <= 0:
        print("Processed limit of images reached.")
        return

    # Counter to limit the number of processed images
    processed_count = 0

    # Sorted so that the subset selected by `limit` is the same on every run
    # (os.listdir returns entries in arbitrary order).
    for filename in sorted(os.listdir(image_folder)):
        if not filename.lower().endswith(image_extensions):
            continue

        # resize_image writes "<name>_resized.jpg" into this same folder, so a
        # second run would otherwise treat earlier outputs as new inputs.
        if os.path.splitext(filename)[0].endswith("_resized"):
            continue

        # Construct the full image path
        image_path = os.path.join(image_folder, filename)

        # Extract text from the original image
        print(f"Processing original image: {filename}")
        original_text = extract_text(image_path)
        print("Text from original image:")
        print(original_text)

        # Resize the image
        resized_image_path = resize_image(image_path)

        if resized_image_path:
            # Extract text from the resized image
            print(f"Processing resized image: {os.path.basename(resized_image_path)}")
            resized_text = extract_text(resized_image_path)
            print("Text from resized image:")
            print(resized_text)

            # Create output .txt file path
            txt_filename = os.path.splitext(filename)[0] + "_resized.txt"
            output_txt_path = os.path.join(output_folder, txt_filename)

            # OCR output routinely contains non-ASCII characters (curly quotes,
            # symbols); an explicit UTF-8 encoding keeps the write from failing
            # on platforms whose default encoding cannot represent them.
            try:
                with open(output_txt_path, 'w', encoding='utf-8') as txt_file:
                    txt_file.write(resized_text)
                print(f"Processed {filename}, saved text to {txt_filename}")
            except Exception as e:
                print(f"Error saving text for {filename}: {e}")

        # An image counts towards the limit even if resizing failed.
        processed_count += 1
        if limit is not None and processed_count >= limit:
            print("Processed limit of images reached.")
            break

# Run the OCR comparison on at most 5 images from image_folder: extracted text is printed
# in the cell output and the text from each resized image is written to output_folder.
process_dataset(image_folder, output_folder, limit=5)


Processing original image: Template10_Instance154.jpg
Text from original image:
Address:4894 Melanie Pass Apt. 742
‘Tammyland, SD 42587 US

Invoice Date: 17-Nov-2017
Invoice number 7077-628

Mclean-Cochran
Due Date : 16-Feb-1995

Bill to:Christopher Mayo.
87430 Sarah Tunnel Suite 340

East John, MA 84969 US
Tel:+(878)679-0474
Email:murphyanthony@example.org
Site:http:/idiaz.net/

 

 

 

iTems ‘quantry PRICE TAK AMOUNT
Be large. 400 sat $0.00 $89.24
‘Anything exactly hand, 600 © $4095 $0.00 $245.70

DISCOUNT(2.39%): (-) 8.04

TOTAL : 352.03 EUR

Terms & Conditions
4. Please sond payment within 8 days of receiving this invoice.
‘There will be a 2.5% interest charge per month on late invoices.
2. Please include the Invoice number in your payment notes,

Note:
This order is shipped through blue dart courier

Email:cjames@example.com

Processing resized image: Template10_Instance154_resized.jpg
Text from resized image:
Address:4894 Melanie Pass Apt. 742
Tammyland, SD 42587 US

Invoice Da