# Tesseract on Patches on a Single Image

### Median Blur and Kernel Filter on every patch of the image

In [5]:
import pytesseract
from pytesseract import Output
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import pytesseract
from pytesseract import Output
import cv2
import numpy as np
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Load the image that will be scanned patch by patch
source_image_path = '/mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/11lshEUmCrL.jpg'
img = cv2.imread(source_image_path)

# cv2.imread signals a missing/unreadable file by returning None rather than
# raising, so fail here with a clear message instead of on `img.shape` below.
if img is None:
    raise FileNotFoundError(f"Could not read image: {source_image_path}")

# Parameters
patch_size = (50, 50)  # (height, width) of each patch, in pixels
step_size = 10       # Stride between patch origins; smaller than patch_size, so patches overlap
resize_to = (250, 250)   # Size each patch is resized to before OCR

# Top-left (x, y) corner of every patch that fits entirely inside the image,
# enumerated row by row
img_height, img_width = img.shape[:2]
patch_coordinates = [(x, y) for y in range(0, img_height - patch_size[0] + 1, step_size)
                     for x in range(0, img_width - patch_size[1] + 1, step_size)]

# Function to process each patch
def process_patch(x, y):
    """Run Tesseract on a single patch of the module-level image.

    The patch is cut out of the global ``img`` using the global ``patch_size``,
    resized to the global ``resize_to``, median-blurred, sharpened and then
    passed to Tesseract.

    Args:
        x: Column of the patch's top-left corner in ``img``.
        y: Row of the patch's top-left corner in ``img``.

    Returns:
        ``(patch_rgb, title)`` where ``patch_rgb`` is the processed patch in
        RGB channel order with a green rectangle drawn around every entry that
        carries text, and ``title`` is a string naming the patch origin and
        listing the detected text. ``(None, None)`` when no non-blank text was
        detected.
    """
    # Extract the patch
    patch = img[y:y + patch_size[0], x:x + patch_size[1]]

    # Resize the patch
    patch = cv2.resize(patch, resize_to)

    # Apply median blur to the patch for denoising
    patch = cv2.medianBlur(patch, 5)

    # Apply filter for sharpening using kernel
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    patch = cv2.filter2D(patch, -1, kernel)

    # Use Tesseract to detect text in the patch
    d = pytesseract.image_to_data(patch, output_type=Output.DICT)

    # Keep the indices (not just the strings) of entries with real text so the
    # matching box coordinates can be looked up below. Whitespace-only strings
    # are treated as empty, the same way process_image treats them.
    text_indices = [i for i, text in enumerate(d['text']) if text.strip()]

    # Skip this patch if no text is detected
    if not text_indices:
        return None, None

    # Draw rectangles only around entries that actually carry text; the result
    # also contains rows with empty text (layout-level rows, per the Tesseract
    # TSV format) whose boxes would otherwise clutter the patch.
    for i in text_indices:
        left, top = d['left'][i], d['top'][i]
        right, bottom = left + d['width'][i], top + d['height'][i]
        cv2.rectangle(patch, (left, top), (right, bottom), (0, 255, 0), 2)

    # Convert BGR patch to RGB for displaying
    patch_rgb = cv2.cvtColor(patch, cv2.COLOR_BGR2RGB)

    detected_text = [d['text'][i] for i in text_indices]
    return patch_rgb, f"Patch at ({x}, {y}) : {detected_text}"

# Fan the patches out over a pool of worker threads and display every patch
# in which text was found, in completion order
with ThreadPoolExecutor(max_workers=10) as executor:
    # One task per patch origin
    futures = [executor.submit(process_patch, *origin) for origin in patch_coordinates]

    # tqdm wraps the completion iterator to show a progress bar
    completed = tqdm(as_completed(futures), total=len(futures), desc="Processing patches")
    for future in completed:
        patch_rgb, title = future.result()

        # Patches without any detected text come back as (None, None)
        if patch_rgb is None:
            continue

        # Show the patch; its title carries the detected text
        plt.figure(figsize=(4, 4))
        plt.imshow(patch_rgb)
        plt.title(title)
        plt.axis('off')
        plt.show()


---

# Tesseract on Whole Images

In [None]:
img = cv2.imread('/mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/11lshEUmCrL.jpg')

# Show the img. OpenCV loads images in BGR channel order while matplotlib
# expects RGB, so convert for display (img itself is left in BGR).
plt.figure(figsize=(4, 4))
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.title("title")
plt.axis('off')
plt.show()

# Building a result file with the Tesseract output for every whole image in the training set

In [12]:

# Folder whose .jpg/.png/.jpeg files are OCR'd by the batch run below
image_folder = '/mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/train_images'  # Change this to your image folder path
# Folder that receives the per-thread output<N>.csv files and the merged final_output.csv
output_folder = '/mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/'  # Path to save thread-wise CSV files

# Function to process an image and extract text
def process_image(image_path):
    """OCR a whole image file and return its non-blank text with boxes.

    The image is median-blurred and sharpened before being passed to
    Tesseract.

    Args:
        image_path: Path of the image file to read.

    Returns:
        ``(image_name, extracted_data)`` where ``image_name`` is the base name
        of ``image_path`` and ``extracted_data`` is a list of
        ``(text, (left, top, width, height))`` tuples, one per non-blank text
        entry. ``extracted_data`` is empty when no text was found or when the
        file could not be read as an image.
    """
    # Get the image name from the path
    image_name = os.path.basename(image_path)

    # Load the image
    img = cv2.imread(image_path)

    # cv2.imread returns None for missing/undecodable files instead of raising.
    # Report it and return an empty result so one bad file does not abort the
    # remaining images handled by the same worker.
    if img is None:
        print(f"Could not read image, skipping OCR: {image_path}")
        return image_name, []

    # Apply median blur to the image for denoising
    img = cv2.medianBlur(img, 5)

    # Apply filter for sharpening using kernel
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    img = cv2.filter2D(img, -1, kernel)

    # Use Tesseract to detect text in the whole image
    d = pytesseract.image_to_data(img, output_type=Output.DICT)

    # Pair every non-blank text entry with its bounding box
    extracted_data = [
        (text, (d['left'][i], d['top'][i], d['width'][i], d['height'][i]))
        for i, text in enumerate(d['text'])
        if text.strip()
    ]

    return image_name, extracted_data

# Append to thread-specific CSV file
def append_to_csv(thread_id, image_name, extracted_data):
    """Append one result row to the CSV file belonging to ``thread_id``.

    The row has two columns, ``image_name`` and ``extracted_data``; the latter
    holds ``str(extracted_data)`` so empty and non-empty lists are stored the
    same way. The file is ``output<thread_id>.csv`` inside the module-level
    ``output_folder``. It is created with a header row on first use and
    appended to without a header afterwards.
    """
    # Per-thread output file
    csv_file_path = os.path.join(output_folder, f'output{thread_id}.csv')

    # Single-row frame for this image
    row = pd.DataFrame({
        'image_name': [image_name],
        'extracted_data': [str(extracted_data)],
    })

    # A new file is written from scratch with headers; an existing one is
    # extended without repeating them
    already_there = os.path.exists(csv_file_path)
    row.to_csv(
        csv_file_path,
        mode='a' if already_there else 'w',
        header=not already_there,
        index=False,
    )

# Function to process images in a specific chunk
def process_chunk(thread_id, image_paths_chunk):
    """OCR every image in ``image_paths_chunk`` and log each result.

    Each path is passed to ``process_image`` and the returned
    ``(image_name, extracted_data)`` pair is written with ``append_to_csv``
    under ``thread_id``, one image at a time and in the order given.
    """
    for path in image_paths_chunk:
        result = process_image(path)
        append_to_csv(thread_id, *result)

# Function to divide image paths into chunks
def divide_chunks(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each.

    ``n`` is the chunk *size*, not the number of chunks: every slice has
    exactly ``n`` items except possibly the last, which holds the remainder.
    An empty ``lst`` yields nothing.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

# Process all images in the folder. The extension check is case-insensitive so
# files such as "photo.JPG" are not silently left out.
image_paths = [
    os.path.join(image_folder, file_name)
    for file_name in os.listdir(image_folder)
    if file_name.lower().endswith(('.jpg', '.png', '.jpeg'))
]

# Divide image paths into at most 16 chunks (one per thread).
# divide_chunks takes a chunk *size*, so use ceiling division: floor division
# would give size 0 for fewer than 16 images (a ValueError) and would produce a
# 17th chunk whenever the count is not a multiple of 16.
num_threads = 16
chunk_size = max(1, -(-len(image_paths) // num_threads))
chunks = list(divide_chunks(image_paths, chunk_size))

# NOTE: append_to_csv appends to any output<N>.csv left over from an earlier
# run, so clear those files first if a clean result is wanted.

# Process chunks in parallel using ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    chunk_futures = [
        executor.submit(process_chunk, thread_id, chunk)
        for thread_id, chunk in enumerate(chunks)
    ]

# Leaving the with-block waits for every worker. Report failures explicitly:
# an exception raised inside a worker is otherwise stored on its future and
# never shown.
for thread_id, chunk_future in enumerate(chunk_futures):
    error = chunk_future.exception()
    if error is not None:
        print(f"Chunk {thread_id} failed: {error!r}")

# Combine all thread-specific CSV files into one final CSV
final_csv_path = os.path.join(output_folder, 'final_output.csv')

# Read one CSV per chunk that was actually submitted; a chunk that failed
# before writing anything has no file, so skip it rather than crash.
thread_frames = []
for thread_id in range(len(chunks)):
    thread_csv_path = os.path.join(output_folder, f'output{thread_id}.csv')
    if os.path.exists(thread_csv_path):
        thread_frames.append(pd.read_csv(thread_csv_path))

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns append_to_csv writes.
if thread_frames:
    combined_df = pd.concat(thread_frames, ignore_index=True)
else:
    combined_df = pd.DataFrame(columns=['image_name', 'extracted_data'])

# Save the combined data to the final CSV
combined_df.to_csv(final_csv_path, index=False)

print(f"Final CSV created at: {final_csv_path}")


Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file


Final CSV created at: /mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/final_output.csv
