# Tesseract on Patches on a Single Image

### Median Blur and Kernel Filter on every patch of the image

In [3]:
import pytesseract
from pytesseract import Output
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import pytesseract
from pytesseract import Output
import cv2
import numpy as np
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Load the image (OpenCV returns a BGR array, or None when the file cannot be read)
img = cv2.imread('/mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/11lshEUmCrL.jpg')
if img is None:
    # Fail here with a clear message instead of an AttributeError on img.shape below
    raise FileNotFoundError("Could not read the image; check that the path exists and is a valid image file.")

# Parameters
patch_size = (50, 50)  # Size of each patch; index 0 is used as the height, index 1 as the width
step_size = 10       # Stride between patch origins; smaller than the patch size, so patches overlap
resize_to = (250, 250)   # Size each patch is resized to before OCR

# Build the top-left (x, y) coordinate of every patch that fits entirely inside the image
img_height, img_width = img.shape[:2]
patch_coordinates = [(x, y) for y in range(0, img_height - patch_size[0] + 1, step_size)
                     for x in range(0, img_width - patch_size[1] + 1, step_size)]

# Function to process each patch
def process_patch(x, y):
    """Run Tesseract on one patch of the global ``img`` and outline the words found.

    Reads the module-level ``img``, ``patch_size`` and ``resize_to``.

    Parameters:
        x, y: top-left corner of the patch inside ``img`` (column, row).

    Returns:
        ``(patch_rgb, title)`` where ``patch_rgb`` is the resized, filtered patch
        in RGB order with a green rectangle around each recognized word and
        ``title`` lists those words; ``(None, None)`` when the patch contains no
        non-blank text.
    """
    # Extract the patch
    patch = img[y:y + patch_size[0], x:x + patch_size[1]]

    # Resize the patch
    patch = cv2.resize(patch, resize_to)

    # Apply median blur to the patch for denoising
    patch = cv2.medianBlur(patch, 5)

    # Apply filter for sharpening using kernel
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    patch = cv2.filter2D(patch, -1, kernel)

    # Use Tesseract to detect text in the patch
    d = pytesseract.image_to_data(patch, output_type=Output.DICT)

    # Keep only entries whose text is non-blank. Tesseract also reports layout
    # entries (page, block, line) with empty text and may return whitespace-only
    # strings; neither counts as detected text.
    word_indices = [i for i, text in enumerate(d['text']) if text.strip()]

    # Skip this patch if no text is detected
    if not word_indices:
        return None, None

    words = [d['text'][i] for i in word_indices]

    # Draw rectangles around the detected words only (not around layout boxes)
    for i in word_indices:
        (x_patch, y_patch, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
        cv2.rectangle(patch, (x_patch, y_patch), (x_patch + w, y_patch + h), (0, 255, 0), 2)

    # Convert BGR patch to RGB for displaying
    patch_rgb = cv2.cvtColor(patch, cv2.COLOR_BGR2RGB)

    return patch_rgb, f"Patch at ({x}, {y}) : {words}"

# Fan the patches out over a pool of 10 worker threads
with ThreadPoolExecutor(max_workers=10) as executor:
    # One task per patch coordinate
    futures = [executor.submit(process_patch, *coordinate) for coordinate in patch_coordinates]

    # Consume results in completion order, with a tqdm progress bar
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing patches"):
        patch_rgb, title = future.result()

        # Patches without detected text come back as (None, None); nothing to show
        if patch_rgb is None:
            continue

        # Show the annotated patch, using the detected text as the figure title
        plt.figure(figsize=(4, 4))
        plt.imshow(patch_rgb)
        plt.title(title)
        plt.axis('off')
        plt.show()


---

# Tesseract on Whole Images

In [None]:
img = cv2.imread('/mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/11lshEUmCrL.jpg')
if img is None:
    # cv2.imread signals an unreadable file by returning None rather than raising
    raise FileNotFoundError("Could not read the image; check that the path exists and is a valid image file.")

# Show the img
# OpenCV loads images in BGR order while matplotlib expects RGB, so convert a
# copy for display only; `img` itself stays BGR for the OpenCV-based cells.
plt.figure(figsize=(4, 4))
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.title("title")
plt.axis('off')
plt.show()

# Making a result file from the train.csv file to hold the Tesseract results on the whole images

In [None]:
import pytesseract
from pytesseract import Output
import cv2
import numpy as np
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import pytesseract
from pytesseract import Output
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import pytesseract
from pytesseract import Output
import cv2
import numpy as np
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor

# Dataset root; the thread-wise CSV files are saved here
output_folder = '/mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/'
# Folder containing images, inside the dataset root (change this to your image folder path)
image_folder = os.path.join(output_folder, 'train_images')

# Function to process an image and extract text
def process_image(image_path):
    """OCR a whole image with Tesseract after denoising and sharpening it.

    Parameters:
        image_path: path of the image file to read.

    Returns:
        ``(image_name, extracted_data, False)`` on success, where ``image_name``
        is the base name of ``image_path`` and ``extracted_data`` is a list of
        ``(text, (left, top, width, height))`` tuples for every non-blank text
        entry; ``(None, None, True)`` when anything fails (the error is printed).
    """
    try:
        frame = cv2.imread(image_path)

        # imread reports failure by returning None, so turn that into an error
        if frame is None:
            raise ValueError(f"Image {image_path} could not be read (possibly corrupted).")

        # Median blur for denoising, followed by a 3x3 sharpening kernel
        sharpen_kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
        frame = cv2.filter2D(cv2.medianBlur(frame, 5), -1, sharpen_kernel)

        # Tesseract over the whole image, results as a dict of parallel lists
        ocr = pytesseract.image_to_data(frame, output_type=Output.DICT)

        # Pair each non-blank text entry with its bounding box
        boxes = zip(ocr['left'], ocr['top'], ocr['width'], ocr['height'])
        extracted_data = [
            (word, box)
            for word, box in zip(ocr['text'], boxes)
            if word.strip()
        ]

        # False indicates no error
        return os.path.basename(image_path), extracted_data, False

    except Exception as e:
        # Log the exception with the image path and mark the image as skipped
        print(f"Error processing image {image_path}: {str(e)}")
        return None, None, True

# Append to thread-specific CSV file
def append_to_csv(thread_id, image_name, extracted_data):
    """Append one OCR result row to the CSV file owned by ``thread_id``.

    The row is written to ``output{thread_id}.csv`` inside the module-level
    ``output_folder``. The file is created with a header row on first use and
    appended to (without a header) afterwards. Nothing is written when either
    ``image_name`` or ``extracted_data`` is ``None``.

    Parameters:
        thread_id: identifier used to build the per-thread file name.
        image_name: value for the ``image_name`` column.
        extracted_data: OCR result; stored as its ``str()`` representation so
            empty and non-empty lists are serialized the same way.
    """
    if image_name is None or extracted_data is None:
        return

    csv_file_path = os.path.join(output_folder, f'output{thread_id}.csv')
    row = pd.DataFrame({'image_name': [image_name], 'extracted_data': [str(extracted_data)]})

    # First write creates the file with headers; later writes append bare rows
    already_there = os.path.exists(csv_file_path)
    row.to_csv(
        csv_file_path,
        mode='a' if already_there else 'w',
        header=not already_there,
        index=False,
    )

# Function to process images in a specific chunk and count skipped images
def process_chunk(thread_id, image_paths_chunk):
    """Process every image of one chunk and report how many were skipped.

    Each path is passed to ``process_image``; successful results are written
    with ``append_to_csv`` under ``thread_id``, failures are only counted. A
    summary line is printed once the chunk is done.

    Parameters:
        thread_id: identifier of the worker, used for the CSV name and the log line.
        image_paths_chunk: iterable of image paths assigned to this worker.
    """
    skipped_count = 0
    for path in image_paths_chunk:
        name, data, failed = process_image(path)
        if not failed:
            append_to_csv(thread_id, name, data)
            continue
        skipped_count += 1
    print(f"Thread {thread_id} finished with {skipped_count} skipped images.")

# Function to check if all chunks are roughly equal in size
def check_chunk_sizes(chunks, total_images, num_threads):
    """Print the size of each chunk next to the expected per-thread share.

    Parameters:
        chunks: sequence of chunks; only ``len()`` of each chunk is used.
        total_images: total number of images across all chunks.
        num_threads: number of workers; the expected share is
            ``total_images // num_threads``.
    """
    sizes = list(map(len, chunks))
    per_thread = total_images // num_threads
    for position, count in enumerate(sizes):
        print(f"Chunk {position} size: {count} (Expected: ~{per_thread})")

# Process all images in the folder.
# The extension check is case-insensitive so files such as "photo.JPG" are not
# silently left out, and the listing is sorted so chunk assignment is reproducible.
image_paths = sorted(
    os.path.join(image_folder, file_name)
    for file_name in os.listdir(image_folder)
    if file_name.lower().endswith(('.jpg', '.png', '.jpeg'))
)


# # Uncomment to smoke-test the pipeline on only the first 1000 images of the folder
# image_paths = image_paths[:1000]

total_images = len(image_paths)

# Divide image paths into `num_threads` roughly equal parts
num_threads = 20
chunks = np.array_split(image_paths, num_threads)  # Use numpy to split into roughly equal parts

# Check chunk sizes to verify even distribution
check_chunk_sizes(chunks, total_images, num_threads)

# NOTE(review): the per-thread CSVs are opened in append mode, so files left over
# from an earlier run will contribute duplicate rows; clear them before re-running.

# Process chunks in parallel using ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = [executor.submit(process_chunk, thread_id, chunk) for thread_id, chunk in enumerate(chunks)]

    # Retrieve every result so an exception raised inside a worker is reported
    # instead of being silently dropped with the discarded future.
    for thread_id, future in enumerate(futures):
        try:
            future.result()
        except Exception as exc:
            print(f"Thread {thread_id} failed: {exc}")

# Combine all thread-specific CSV files into one final CSV
final_csv_path = os.path.join(output_folder, 'final_output.csv')

# A thread only creates its CSV once it has a successful image, so a thread with
# an empty chunk or only skipped images leaves no file behind; merge what exists.
thread_csv_paths = [os.path.join(output_folder, f'output{thread_id}.csv') for thread_id in range(num_threads)]
thread_frames = [pd.read_csv(csv_path) for csv_path in thread_csv_paths if os.path.exists(csv_path)]

if thread_frames:
    combined_df = pd.concat(thread_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to a header-only result
    combined_df = pd.DataFrame(columns=['image_name', 'extracted_data'])

# Save the combined data to the final CSV
combined_df.to_csv(final_csv_path, index=False)

print(f"Final CSV created at: {final_csv_path}")


In [5]:
import pandas as pd
import os
# Folder containing images
image_folder = '/mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/train_images'  # Change this to your image folder path
output_folder = '/mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/'  # Path to save thread-wise CSV files
# Combine all thread-specific CSV files into one final CSV
final_csv_path = os.path.join(output_folder, 'final_output.csv')
# Gather the per-thread CSVs (output0.csv ... output19.csv), skipping any that
# were never created so a missing file does not abort the merge.
thread_csv_paths = [os.path.join(output_folder, f'output{thread_id}.csv') for thread_id in range(20)]
thread_frames = [pd.read_csv(csv_path) for csv_path in thread_csv_paths if os.path.exists(csv_path)]

if thread_frames:
    combined_df = pd.concat(thread_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to a header-only result
    combined_df = pd.DataFrame(columns=['image_name', 'extracted_data'])

# Save the combined data to the final CSV
combined_df.to_csv(final_csv_path, index=False)

print(f"Final CSV created at: {final_csv_path}")

Final CSV created at: /mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/final_output.csv


## Trying PaddleOCR

In [None]:
# import cv2
# import numpy as np
# import pandas as pd
# import os
# from concurrent.futures import ThreadPoolExecutor
# from paddleocr import PaddleOCR, draw_ocr
# from PIL import Image
# import gc


# os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

# # Initialize PaddleOCR with a specific language model (switch the language as needed)
# # Enable angle classifier for detecting and correcting text rotation
# ocr = PaddleOCR(use_angle_cls=True, use_gpu=True, lang='en',rec_batch_num=1)  # lang='korean', 'ch', etc., depending on the language you're using

# # Folder containing images
# image_folder = '/mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/train_images'
# output_folder = '/mnt/c/Users/parth/Desktop/CODING/AmazonML/archive/student_resource 3/'

# # Function to process an image and extract text using PaddleOCR
# def process_image(image_path):
#     try:
#         # Load the image
#         img = cv2.imread(image_path)

#         # Check if the image was loaded successfully
#         if img is None:
#             raise ValueError(f"Image {image_path} could not be read (possibly corrupted).")

#         # Perform OCR using PaddleOCR
#         result = ocr.ocr(image_path)  # You can specify det=False or rec=False as needed

#         # Prepare the list of tuples with text and coordinates
#         extracted_data = []
#         for line in result:
#             text = line[1][0]  # Extract recognized text
#             coordinates = line[0]  # Extract the bounding box coordinates
#             extracted_data.append((text, coordinates))

#         # Visualize and save the OCR results
#         image = Image.open(image_path).convert('RGB')
#         boxes = [line[0] for line in result]
#         txts = [line[1][0] for line in result]
#         scores = [line[1][1] for line in result]

#         # You may need to provide the path to a font file for the language
#         font_path = 'doc/fonts/korean.ttf'  # Replace this with the correct path for your language
#         im_show = draw_ocr(image, boxes, txts, scores, font_path=font_path)
#         im_show = Image.fromarray(im_show)
#         output_img_path = os.path.join(output_folder, f"ocr_result_{os.path.basename(image_path)}")
#         im_show.save(output_img_path)

#         # Get the image name from the path
#         image_name = os.path.basename(image_path)

#         return image_name, extracted_data, False  # False indicates no error

#     except Exception as e:
#         # Log the exception with the image name
#         print(f"Error processing image {image_path}: {str(e)}")
#         return None, None, True  # True indicates an error (skipped image)

#     finally:
#         del img
#         gc.collect()

# # Append to thread-specific CSV file
# def append_to_csv(thread_id, image_name, extracted_data):
#     if image_name is not None and extracted_data is not None:
#         # Fix empty and non-empty lists to be stored consistently as strings
#         extracted_data_str = str(extracted_data)
        
#         # Create a new row as a DataFrame
#         new_row = pd.DataFrame({'image_name': [image_name], 'extracted_data': [extracted_data_str]})
        
#         # Output file for the thread
#         csv_file_path = os.path.join(output_folder, f'output{thread_id}.csv')

#         # Check if CSV exists, if not create a new one with headers
#         if not os.path.exists(csv_file_path):
#             new_row.to_csv(csv_file_path, index=False)
#         else:
#             # Append to the existing CSV
#             new_row.to_csv(csv_file_path, mode='a', header=False, index=False)

# # Function to process images in a specific chunk and count skipped images
# def process_chunk(thread_id, image_paths_chunk):
#     skipped_count = 0
#     for image_path in image_paths_chunk:
#         image_name, extracted_data, is_skipped = process_image(image_path)
#         if is_skipped:
#             skipped_count += 1
#         else:
#             append_to_csv(thread_id, image_name, extracted_data)
#     print(f"Thread {thread_id} finished with {skipped_count} skipped images.")

# # Process all images in the folder
# image_paths = [os.path.join(image_folder, img) for img in os.listdir(image_folder) if img.endswith(('.jpg', '.png', '.jpeg'))]

# # Use a smaller batch for testing, for example, the first 100 images
# image_paths = image_paths[:100]  # You can adjust this as needed for testing

# total_images = len(image_paths)
# num_threads = 10
# chunks = np.array_split(image_paths, num_threads)

# # Process images using ThreadPoolExecutor
# with ThreadPoolExecutor(max_workers=num_threads) as executor:
#     for thread_id, chunk in enumerate(chunks):
#         executor.submit(process_chunk, thread_id, chunk)

# # Combine all CSV files into one final CSV
# final_csv_path = os.path.join(output_folder, 'final_output.csv')
# combined_df = pd.concat([pd.read_csv(os.path.join(output_folder, f'output{thread_id}.csv')) for thread_id in range(num_threads)], ignore_index=True)
# combined_df.to_csv(final_csv_path, index=False)

# print(f"Final CSV created at: {final_csv_path}")


### Utils

In [None]:
import psutil
import GPUtil

# Function to check RAM usage
def check_ram():
    """Print total, available and used system memory in GB (two decimals).

    Values come from ``psutil.virtual_memory()`` and are converted from bytes
    by dividing by 1024 ** 3.
    """
    bytes_per_gb = 1024 ** 3
    stats = psutil.virtual_memory()
    readings = (
        ("Total", stats.total),
        ("Available", stats.available),
        ("Used", stats.used),
    )

    for label, raw_bytes in readings:
        print(f"{label} Memory: {raw_bytes / bytes_per_gb:.2f} GB")

# Function to check GPU usage
def check_gpu():
    """Print name, memory figures and load for every GPU reported by GPUtil.

    Prints "No GPU detected." when ``GPUtil.getGPUs()`` returns nothing.
    """
    detected = GPUtil.getGPUs()
    if not detected:
        print("No GPU detected.")
        return

    for device in detected:
        print(f"GPU: {device.name}")
        print(f"Total Memory: {device.memoryTotal} MB")
        print(f"Used Memory: {device.memoryUsed} MB")
        print(f"Free Memory: {device.memoryFree} MB")
        # `load` is multiplied by 100 to display it as a percentage
        print(f"GPU Load: {device.load * 100}%")

# Report RAM first, then GPU state
for report in (check_ram, check_gpu):
    report()
