In [1]:
import os
import cv2
from PIL import Image
import numpy as np
import shutil

In [30]:
# Default pixel values for background and foreground in the binarized grids.
# The helper functions read these module-level names when they are called.
bg_color = 1
fg_color = 0

In [31]:
# Delete output folders if they already exist
def delete_folders():
    """Remove every output folder (with its contents) that is currently present.

    Paths are relative to the current working directory; folders that do not
    exist are skipped.
    """
    output_dirs = ("lines-1", "words-1", "cc-1", "lines-2", "words-2", "cc-2")
    # filter() is lazy, so each existence check happens right before its removal.
    for stale_dir in filter(os.path.exists, output_dirs):
        shutil.rmtree(stale_dir)

In [32]:
# Create folders to store output images
def create_folders():
    """Create the six output folders in the current working directory.

    Folders that already exist are left untouched (``exist_ok=True``).
    """
    output_dirs = ("lines-1", "words-1", "cc-1", "lines-2", "words-2", "cc-2")
    for out_dir in output_dirs:
        os.makedirs(out_dir, exist_ok=True)

In [33]:
# Read the image, convert to grayscale, and binarize it
def read_and_binarize_image(image_path, threshold=127):
    """Load an image from disk and return it as a nested list of 0/1 pixels.

    The image is read with OpenCV, converted from BGR to grayscale, and
    thresholded with ``cv2.THRESH_BINARY`` using a maximum value of 1.

    Args:
        image_path: Path of the image file to load.
        threshold: Grayscale cut-off passed to ``cv2.threshold``.

    Returns:
        The binarized image as a list of rows (``ndarray.tolist()``).

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        ValueError: If the file exists but OpenCV cannot decode it as an image.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    image = cv2.imread(image_path)
    # cv2.imread reports an unreadable or undecodable file by returning None
    # instead of raising; fail here with a clear message rather than letting
    # cvtColor blow up on the missing array.
    if image is None:
        raise ValueError(f"Could not decode image: {image_path}")

    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary_image = cv2.threshold(gray_image, threshold, 1, cv2.THRESH_BINARY)
    return binary_image.tolist()

In [34]:
def save_grid_as_image(grid, filename):
    """Save a grid of 0/1 values to ``filename`` as an image.

    The grid is converted to a ``uint8`` array and scaled by 255, so a cell
    holding 0 is written as 0 and a cell holding 1 is written as 255.
    """
    pixels = 255 * np.array(grid, dtype=np.uint8)
    Image.fromarray(pixels).save(filename)

In [35]:
# Flood-fill algorithm to find connected components
def get_neighbouring_points(image, r, c):
    """Collect the 8-connected foreground region that contains pixel (r, c).

    Iterative, stack-based flood fill. Every pixel added to the region is
    overwritten with 2 in ``image`` so it is never visited twice, which means
    the grid passed in is modified in place.

    Args:
        image: Grid (list of rows) whose foreground cells equal ``fg_color``.
        r: Row index of the seed pixel.
        c: Column index of the seed pixel.

    Returns:
        List of ``(row, col)`` tuples in visiting order; empty when the seed
        pixel is not foreground.
    """
    def is_open(row, col):
        # Inside the grid and still unvisited foreground.
        inside = 0 <= row < len(image) and 0 <= col < len(image[0])
        return inside and image[row][col] == fg_color

    # All nine offsets of the 3x3 window, in row-major order.
    offsets = [(dr, dc) for dr in (-1, 0, 1) for dc in (-1, 0, 1)]
    region = []
    pending = [(r, c)]

    while pending:
        row, col = pending.pop()

        # A pixel can be pushed several times before it is processed; only the
        # first pop still sees it as foreground.
        if image[row][col] == fg_color:
            image[row][col] = 2  # Mark the pixel as visited
            region.append((row, col))

            for dr, dc in offsets:
                if is_open(row + dr, col + dc):
                    pending.append((row + dr, col + dc))

    return region

In [36]:
# Convert a set of points into a cropped binary image
def convert_to_sub_image(points):
    """Render a collection of (row, col) points into its bounding-box grid.

    The result is exactly as large as the bounding box of ``points``. Cells
    listed in ``points`` hold ``fg_color``; every other cell holds ``bg_color``.

    Args:
        points: Sequence of ``(row, col)`` pairs.

    Returns:
        A list of rows, or ``[]`` when ``points`` is empty.
    """
    if not points:  # Nothing to draw
        return []

    row_coords, col_coords = zip(*points)
    top, left = min(row_coords), min(col_coords)
    height = max(row_coords) - top + 1
    width = max(col_coords) - left + 1

    # Start from an all-background canvas, one fresh list per row.
    canvas = []
    for _ in range(height):
        canvas.append([bg_color] * width)

    # Shift every point so the bounding box starts at (0, 0).
    for row, col in points:
        canvas[row - top][col - left] = fg_color

    return canvas

In [37]:
# Extract and save all connected components
def export_connected_components(image_grid, folderName):
    """Find every connected foreground component and save each one as a PNG.

    The grid is scanned in row-major order. Each foreground pixel that has not
    yet been consumed seeds a flood fill (``get_neighbouring_points``), and the
    resulting pixel set is cropped to its bounding box
    (``convert_to_sub_image``). Components are written to
    ``{folderName}/cc{N}.png`` with N counting from 1 in discovery order.

    Args:
        image_grid: Binary image as a list of rows. It is not modified.
        folderName: Directory that receives the component images.

    Returns:
        None. An empty grid (no rows or no columns) produces no output.
    """
    if len(image_grid) == 0 or len(image_grid[0]) == 0:
        return  # Nothing to scan

    # The flood fill overwrites the pixels it visits. Run it on a private copy
    # so the caller's grid stays intact and this function can be re-run on the
    # same grid with the same result.
    scratch = [list(row) for row in image_grid]
    rows, cols = len(scratch), len(scratch[0])
    components = []

    for r in range(rows):
        for c in range(cols):
            if scratch[r][c] == fg_color:
                points = get_neighbouring_points(scratch, r, c)
                components.append(convert_to_sub_image(points))

    # Save each connected component as an image
    for idx, component in enumerate(components):
        save_grid_as_image(component, f"{folderName}/cc{idx + 1}.png")

In [38]:
# Extract lines based on row histograms
def export_connected_lines(image_grid, folderName):
    """Split the page into horizontal text bands and save each as an image.

    A row counts as "inked" when at least one of its cells differs from
    ``bg_color``. Each maximal run of consecutive inked rows forms one line,
    saved to ``{folderName}/line{N}.png`` with N counting from 1.

    Args:
        image_grid: Binary image as a list of rows.
        folderName: Directory that receives the line images.

    Returns:
        List of line grids; each is a slice of ``image_grid`` and therefore
        shares its row lists with the input.
    """
    inked_rows = [any(cell != bg_color for cell in row) for row in image_grid]

    bands = []
    top = None  # First row of the band being collected, if any
    for row_idx, inked in enumerate(inked_rows):
        if top is None:
            if inked:  # Start of a new line
                top = row_idx
        elif not inked:  # End of the line
            bands.append(image_grid[top:row_idx])
            top = None

    if top is not None:  # Text runs to the bottom edge
        bands.append(image_grid[top:])

    for number, band in enumerate(bands, start=1):
        if band:  # Ensure the grid is non-empty
            save_grid_as_image(band, f"{folderName}/line{number}.png")

    return bands

In [39]:
# Count non-background pixels per column for word processing
def get_column_histogram(image_grid):
    """Return, for each column, how many cells differ from ``bg_color``.

    The column count is taken from the first row. An empty grid yields ``[]``.
    """
    if len(image_grid) == 0:
        return []

    width = len(image_grid[0])
    histogram = []
    for col in range(width):
        filled = 0
        for line in image_grid:
            if line[col] != bg_color:
                filled += 1
        histogram.append(filled)
    return histogram

In [40]:
def export_words_from_lines(line_grids, folderName, space_threshold=5):
    """Split each text line into words at wide background gaps and save them.

    For every line grid the per-column ink histogram is scanned left to right.
    A word starts at the first inked column and ends once ``space_threshold``
    consecutive background columns have been seen. Each word is cropped so it
    ends on its last inked column and is written to
    ``{folderName}/line{L}_word{W}.png`` (both indices start at 1).

    Args:
        line_grids: List of line grids (each a list of rows).
        folderName: Directory that receives the word images.
        space_threshold: Minimum number of consecutive background columns that
            separates two words.
    """
    for line_idx, line_grid in enumerate(line_grids):
        word_histogram = get_column_histogram(line_grid)
        word_spans = []  # Half-open (start_col, end_col) column ranges
        start_col = None
        consecutive_bg = 0  # Background columns seen since the last inked one

        for col, count in enumerate(word_histogram):
            if count > 0:  # Foreground column
                if start_col is None:  # Start of a new word
                    start_col = col
                consecutive_bg = 0
            elif start_col is not None:  # Background column inside a word
                consecutive_bg += 1
                if consecutive_bg >= space_threshold:  # Gap is wide enough
                    # Back up over the gap so the word ends on its last inked column.
                    word_spans.append((start_col, col - consecutive_bg + 1))
                    start_col = None

        # The line ended while a word was still open. Trim the short trailing
        # gap as well, so the last word is cropped like every other word
        # instead of carrying extra background columns.
        if start_col is not None:
            word_spans.append((start_col, len(word_histogram) - consecutive_bg))

        # Save each word as an image
        for word_idx, (first_col, end_col) in enumerate(word_spans):
            grid = [row[first_col:end_col] for row in line_grid]
            if len(grid) > 0 and len(grid[0]) > 0:  # Ensure non-empty grid
                save_grid_as_image(grid, f"{folderName}/line{line_idx + 1}_word{word_idx + 1}.png")

**Main Execution**

**Run 1:** background is black (`bg_color = 0`), foreground is white (`fg_color = 1`).

In [41]:
# Start from a clean slate: remove output folders left over from a previous
# run, then recreate them so the export functions have somewhere to write.
delete_folders()
create_folders()

In [42]:
# Main Execution - 1
# Background is black (0) and foreground is white (1) in the binarized grid.
# The helper functions read these module-level values, so they must be set
# before any of the calls below.
bg_color, fg_color =  0,1
image_path = "text-img-2.jpg"
binary_grid = read_and_binarize_image(image_path)


# Split the page into text lines, split each line into words, then extract
# the connected components of the whole page.
line_grids = export_connected_lines(binary_grid, folderName="lines-1")
export_words_from_lines(line_grids,folderName="words-1")
export_connected_components(binary_grid,folderName="cc-1")

**Run 2:** background is white (`bg_color = 1`), foreground is black (`fg_color = 0`).

In [43]:
# Main Execution - 2
# Background is white (1) and foreground is black (0) in the binarized grid.
# The helper functions read these module-level values, so they must be set
# before any of the calls below.
bg_color, fg_color =  1,0  # colors switched
image_path_2 = "text-img.jpg"
binary_grid_2 = read_and_binarize_image(image_path_2)

# Same pipeline as the first run, writing into the "-2" output folders.
line_grids = export_connected_lines(binary_grid_2,folderName="lines-2")
export_words_from_lines(line_grids,folderName="words-2")
export_connected_components(binary_grid_2,folderName="cc-2")