Extracts individual handwritten words from an image of a page (the words must be clearly written, with enough spacing between them) and saves each word as a separate image in a folder for further processing.

In [None]:
# Mount Google Drive into the Colab filesystem; the image and output paths
# used later in this notebook live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import cv2
import os

def extract_words(image_path, output_folder, kernel_size=(50, 10),
                  dilate_iterations=2, min_width=50, min_height=15):
    """Crop word-sized regions out of a page image and save each one as a PNG.

    The page is converted to grayscale, binarised with an inverted Otsu
    threshold, and dilated with a rectangular kernel so that nearby strokes
    fuse into a single blob. The bounding box of every external contour that
    passes the size filter is cropped from the original (colour) image and
    written to ``output_folder`` as ``word_<n>.png``, with ``n`` starting at 1.

    Note: boxes are numbered by their left edge only. On a page with several
    lines of text, words from different lines are therefore interleaved in
    the numbering rather than following reading order.

    Args:
        image_path: Path of the page image to read.
        output_folder: Directory that receives the crops; created if missing.
        kernel_size: ``(width, height)`` of the rectangular dilation kernel.
            A wider kernel merges letters that are further apart.
        dilate_iterations: Number of times the dilation is applied.
        min_width: Boxes must be strictly wider than this (pixels) to be kept.
        min_height: Boxes must be strictly taller than this (pixels) to be kept.

    Returns:
        None. A one-line summary with the number of saved crops is printed.

    Raises:
        FileNotFoundError: ``image_path`` is not an existing file.
        ValueError: The file exists but OpenCV could not decode it.
        OSError: A crop could not be written to ``output_folder``.
    """
    # Fail early with a clear message: cv2.imread does not raise on a bad
    # path, it returns None, which would only blow up later inside cvtColor.
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image not found: '{image_path}'")

    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not decode image: '{image_path}'")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Inverted binary threshold with Otsu's automatic level: pixels darker
    # than the level become white foreground, everything else black.
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Dilate with a rectangular kernel so the letters of one word merge into
    # a single connected blob.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, tuple(kernel_size))
    dilated = cv2.dilate(thresh, kernel, iterations=dilate_iterations)

    # Outer contours only; anything nested inside a blob is ignored.
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Compute each bounding box once and order the boxes by their left edge.
    # sorted() is stable, so ties keep the order findContours produced.
    boxes = sorted((cv2.boundingRect(c) for c in contours), key=lambda box: box[0])

    os.makedirs(output_folder, exist_ok=True)

    word_count = 0
    for x, y, w, h in boxes:
        # Skip blobs too small to be a word (specks, dots, stray strokes).
        if w <= min_width or h <= min_height:
            continue

        output_path = os.path.join(output_folder, f"word_{word_count + 1}.png")

        # cv2.imwrite signals an I/O failure by returning False rather than
        # raising, so check it instead of reporting a crop that was not saved.
        if not cv2.imwrite(output_path, img[y:y+h, x:x+w]):
            raise OSError(f"Failed to write word image: '{output_path}'")
        word_count += 1

    print(f"Extracted {word_count} words and saved in '{output_folder}'.")

# Example: split one page image from Drive into word crops under "words13"
extract_words(
    image_path="/content/gdrive/MyDrive/Neural/pages/Already Loaded Pages/page 27.jpeg",
    output_folder="/content/gdrive/MyDrive/Neural/pages/words13",
)


Extracted 38 words and saved in '/content/gdrive/MyDrive/Neural/pages/words13'.
