COPY FILES

In [3]:
import shutil
import os

In [None]:
# Define source and destination directories
source_folder = "/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/combined_captchas"
destination_folder = "/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/test_img"

# Only a small sample is copied into the test folder, not the whole dataset
max_files = 10

# Ensure the destination folder exists
os.makedirs(destination_folder, exist_ok=True)

# os.listdir() returns entries in arbitrary order, so sort them to make the sample
# reproducible. The limit is applied to regular files only, so sub-directories do
# not use up sample slots.
copied_count = 0
for filename in sorted(os.listdir(source_folder)):
    if copied_count >= max_files:
        break

    source_path = os.path.join(source_folder, filename)
    destination_path = os.path.join(destination_folder, filename)

    # Check if it's a file before copying
    if os.path.isfile(source_path):
        shutil.copy2(source_path, destination_path)  # copy2 preserves metadata (timestamps)
        print(f"Copied: {source_path} -> {destination_path}")
        copied_count += 1

print(f"Copied {copied_count} file(s) to {destination_folder}.")

SAMPLE FROM MULTIPLE DATASETS AND COMBINE

In [None]:
import os
import random
import shutil

def sample_dataset_with_unique(datasets, output_dir, num_samples_list):
    """
    Samples images from each dataset without replacement and copies them into one output directory.

    Images keep their original filenames. Because every image lands in the same folder,
    a filename that was already selected from an earlier dataset is skipped instead of
    silently overwriting the earlier copy. If that leaves a dataset short of its
    requested count, a warning states how many images were actually taken from it.

    Args:
        datasets (list): List of paths to the datasets.
        output_dir (str): Path to save the sampled dataset (created if missing).
        num_samples_list (list): List of numbers indicating how many images to sample from each dataset.
                                 Must match the number of datasets.

    Raises:
        ValueError: If the two lists differ in length, or if a dataset holds fewer
                    images (.png/.jpg/.jpeg) than requested for it.
    """
    if len(datasets) != len(num_samples_list):
        raise ValueError("Number of datasets must match number of sample counts.")

    os.makedirs(output_dir, exist_ok=True)
    all_samples = []     # (source path, filename) pairs selected so far
    used_names = set()   # filenames already selected, across all datasets

    for dataset, num_samples in zip(datasets, num_samples_list):
        print(f"Processing dataset: {dataset} with {num_samples} images to sample.")
        # Sorted so that seeding `random` gives the same selection regardless of listing order
        images = sorted(file for file in os.listdir(dataset) if file.lower().endswith(('.png', '.jpg', '.jpeg')))

        # Ensure that we do not sample more images than are available
        if num_samples > len(images):
            raise ValueError(f"Requested more samples ({num_samples}) than available images in dataset ({len(images)}).")

        # A full-length random.sample is a random permutation: walking it gives sampling
        # without replacement and always terminates, unlike retrying random.choice.
        selected = []
        for image in random.sample(images, len(images)):
            if len(selected) >= num_samples:
                break
            if image in used_names:
                # Same filename already taken from an earlier dataset; copying it would overwrite that file
                continue
            used_names.add(image)
            selected.append((os.path.join(dataset, image), image))

        # If we couldn't find enough unique images, notify user
        if len(selected) < num_samples:
            print(f"Warning: Could not find enough unique images in {dataset}. Only {len(selected)} images were saved.")

        all_samples.extend(selected)
        print(f"Saved {len(selected)} images from {dataset}.")

    # Shuffle all samples
    random.shuffle(all_samples)

    # Copy sampled and shuffled images to output directory with original filenames
    for src_path, original_name in all_samples:
        shutil.copy(src_path, os.path.join(output_dir, original_name))

    print(f"Sampled and shuffled dataset saved to: {output_dir}")
    print(f"Total images saved across all datasets: {len(all_samples)}")

In [None]:
# Each row pairs a dataset folder with the number of images to draw from it.
# zip() over the rows transposes them into the two parallel lists the sampler expects:
# the first column becomes `datasets`, the second `num_samples_list`.
datasets, num_samples_list = (
    list(column)
    for column in zip(
        ("/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/hugging_face/captchas/nischayS/test", 8000),
        ("/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/kaggle/fanbyprinciple/data", 8000),
        ("/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/kaggle/aadhavvignesh/data", 10000),
        # NOTE(review): the original inline note on this path said 10000 while the count used is 9000 - confirm which is intended
        ("/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/kaggle/huthay", 9000),
        ("/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/kaggle/sandeep1507/data", 15000),
        ("/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/hugging_face/captchas/nischay", 15000),
        ("/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/kaggle/akashguna/data", 15000),
        ("/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/kaggle/khushipitroda/data", 15000),
        ("/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/kaggle/kiran", 15000),
        ("/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/hugging_face/captchas/hammer888", 25000),
        ("/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/kaggle/parasam/data", 25000),
        ("/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/kaggle/jassoncarvalho/data", 25000),
    )
)

# Folder that receives the combined sample
output_dir = "/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/combined_captchas"

sample_dataset_with_unique(datasets, output_dir, num_samples_list)

CONVERT ALL IMAGES TO SAME EXTENSION (PNG)

In [1]:
from PIL import Image, UnidentifiedImageError  # Import the error
import os

def convert_images_in_folder(directory, target_extension='png'):
    """
    Converts all images in a directory to the target extension and replaces the original files.

    Every regular file directly inside `directory` is opened with PIL. Files whose
    extension already matches the target are left untouched. The others are re-saved
    next to the original under the new extension, and the original is deleted once
    its image handle has been closed. A file is never converted onto an already
    existing file of the same name; that case is reported as a failure so no image is
    lost. At the end, lists files that couldn't be converted.

    Args:
        directory (str): Folder whose files are converted in place (sub-folders are ignored).
        target_extension (str): Extension to convert to, with or without a leading dot.
            PIL chooses the output format from this extension.

    Raises:
        ValueError: If `directory` does not exist.
    """
    if not os.path.exists(directory):
        raise ValueError(f"The directory {directory} does not exist.")

    # Accept both 'png' and '.png' so the new path never ends up as 'name..png'
    target_ext = target_extension.lower().lstrip('.')
    failed_conversions = []  # List to store files that couldn't be converted

    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)

        if not os.path.isfile(file_path):
            continue

        # Same location and stem as the original, with the target extension
        new_file_path = f"{os.path.splitext(file_path)[0]}.{target_ext}"

        try:
            # Try to open the image file
            with Image.open(file_path) as img:
                # Get the file extension (lowercase)
                file_extension = filename.split('.')[-1].lower()

                # Skip if already in the target format
                if file_extension == target_ext:
                    continue

                # Saving would replace a different, already existing image and the
                # source would then be deleted - report instead of losing one of them.
                if os.path.exists(new_file_path):
                    failed_conversions.append(filename)
                    print(f"Skipping {filename}: {os.path.basename(new_file_path)} already exists")
                    continue

                # Save the image with the new extension
                img.save(new_file_path)

            # Remove the original only after the handle is closed; deleting a file
            # that is still open fails on some platforms.
            os.remove(file_path)
            print(f"Converted and replaced {filename} with {os.path.basename(new_file_path)}")
        except (UnidentifiedImageError, OSError) as e:
            # Add the file to the failed conversions list
            failed_conversions.append(filename)
            print(f"Skipping {filename}: {e}")

    print("\nImage conversion complete.")

    # Print any files that couldn't be converted
    if failed_conversions:
        print("The following files could not be converted:")
        for failed_file in failed_conversions:
            print(f" - {failed_file}")
    else:
        print("All files were successfully converted.")


In [5]:
# Folder whose images are converted in place (default target extension: png)
input_dir = "/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/combined_captchas_aug/combined_captchas"
convert_images_in_folder(directory=input_dir)


Image conversion complete.
All files were successfully converted.


SPLIT DATASET

In [6]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Imported here so this cell also runs on a fresh kernel
import random

# Define the directory where the images are located
image_dir = '/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/combined_captchas_aug/combined_captchas'
train_dir = '/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/combined_captchas_aug/trainH'
test_dir = '/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/combined_captchas_aug/testH'
val_dir = '/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-text-recognition-benchmark/datasets/combined_captchas_aug/validH'

# Create directories for train, test, and validation sets if they don't exist
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

# List all image files in the directory.
# The extension check is case-insensitive and includes the dot, matching the other
# cells of this notebook. os.listdir() order is arbitrary, so the names are sorted
# to give the sampling below a stable starting point.
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff')
image_files = sorted(f for f in os.listdir(image_dir) if f.lower().endswith(image_extensions))

#slice of dataset CHANGE LATER
max_images = 40000
# Draw the subset with a fixed seed instead of taking the first N names: reproducible
# on every run and not biased toward alphabetically-early filenames.
image_files = random.Random(42).sample(image_files, min(max_images, len(image_files)))

# Split the images into train (80%) and temp (test + validation 20%)
train_images, temp_images = train_test_split(image_files, test_size=0.2, random_state=42)

# Split the temp set into test (15%) and validation (5%) sets
test_images, val_images = train_test_split(temp_images, test_size=0.25, random_state=42)  # 0.25 of 0.2 = 5%

# Helper that relocates a batch of images out of the source folder
def move_images(image_list, destination_dir):
    """
    Move each file named in `image_list` from the global `image_dir` into `destination_dir`.

    Args:
        image_list (list): Filenames (not full paths) located in `image_dir`.
        destination_dir (str): Folder the files are moved into.
    """
    for file_name in image_list:
        shutil.move(
            os.path.join(image_dir, file_name),
            os.path.join(destination_dir, file_name),
        )

# Move each subset into its own folder (train first, then test, then validation)
for subset, target_dir in (
    (train_images, train_dir),
    (test_images, test_dir),
    (val_images, val_dir),
):
    move_images(subset, target_dir)

# Report how many images ended up in each subset
print(f"Total images: {len(image_files)}")
print(f"Train images: {len(train_images)}")
print(f"Test images: {len(test_images)}")
print(f"Validation images: {len(val_images)}")

Total images: 40000
Train images: 32000
Test images: 6000
Validation images: 2000


APPLY AUGMENTATIONS

Rename images and create gt.txt file

In [8]:
import os
import shutil

def rename_images_and_create_gt(input_folder, output_folder, gt_file_path, separator=" "):
    """
    Renames augmented CAPTCHA images and generates a ground truth file.

    Every .png/.jpg/.jpeg file (extension matched case-insensitively) in `input_folder`
    is moved to `output_folder` as `image_<n><original extension>`, numbered from 1 in
    sorted filename order. For each image one line
    `<new filename><separator><annotation>` is written to the ground truth file, where
    the annotation is the part of the original filename before the first underscore
    (or before the extension when there is no underscore).

    Note that the images are moved, not copied, and the ground truth file is
    overwritten on every call.

    Args:
        input_folder (str): Path to the folder containing augmented images.
        output_folder (str): Path to save the renamed images (created if missing).
        gt_file_path (str): Path to save the ground truth file (gt.txt).
        separator (str): Text written between the filename and its annotation.
            Defaults to a single space. NOTE(review): some dataset builders expect
            a tab here - confirm against the tool that consumes the file.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Snapshot and sort the listing before anything is moved: os.listdir() order is
    # arbitrary, and sorting keeps the numbering reproducible between runs.
    image_names = sorted(
        name for name in os.listdir(input_folder)
        if os.path.isfile(os.path.join(input_folder, name))
        and name.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    # Explicit encoding so the file does not depend on the platform default
    with open(gt_file_path, 'w', encoding='utf-8') as gt_file:
        for image_counter, img_name in enumerate(image_names, start=1):
            # Extract the annotation part from the filename (without extension)
            annotation = os.path.splitext(img_name.split('_')[0])[0]

            # Generate the new filename (image_1.extension, image_2.extension, etc.)
            new_img_name = f"image_{image_counter}{os.path.splitext(img_name)[1]}"

            # Rename the image by moving it to the new folder
            shutil.move(os.path.join(input_folder, img_name), os.path.join(output_folder, new_img_name))

            # Write the new filename and its annotation to the gt.txt file
            gt_file.write(f"{new_img_name}{separator}{annotation}\n")

    print(f"Images have been renamed and saved to {output_folder}. Ground truth file saved to {gt_file_path}.")

In [9]:
# Example usage: move the cleaned images into `data/` under sequential names and write gt.txt next to it
input_folder = "/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-aug/deep-text-recognition-benchmark/captcha_batch1/fixed"  # folder with augmented images
output_folder = "/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-aug/deep-text-recognition-benchmark/captcha_batch1/data"  # folder for the renamed images
gt_file_path = "/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-aug/deep-text-recognition-benchmark/captcha_batch1/gt.txt"  # ground truth file

os.makedirs(output_folder, exist_ok=True)

rename_images_and_create_gt(
    input_folder=input_folder,
    output_folder=output_folder,
    gt_file_path=gt_file_path,
)

Images have been renamed and saved to /home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-aug/deep-text-recognition-benchmark/captcha_batch1/data. Ground truth file saved to /home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-aug/deep-text-recognition-benchmark/captcha_batch1/gt.txt.


FIX FILENAMES / EXTRACT ANNOTATIONS

From text file

In [None]:
def annotations_from_txt_to_filename(dataset_path, txt_file, output_dir):
    """
    Reads annotations from a TXT file and copies each image under a filename made of its annotation.

    Each line of `txt_file` is expected to hold `<image name> <annotation>` separated by
    whitespace; lines with fewer than two fields are ignored. Every listed image found
    in `dataset_path` is copied to `output_dir` as `<annotation>.png`. The file content
    is copied unchanged; only the name changes.

    Two images that share an annotation map to the same output name, so the later
    line replaces the earlier copy. Such replacements, and lines whose image is
    missing, are counted and reported once at the end.

    Args:
        dataset_path (str): Path to the dataset containing images.
        txt_file (str): Path to the TXT file with annotations (read as UTF-8).
        output_dir (str): Directory to save the renamed copies (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)

    written_names = set()  # output filenames produced during this call
    missing_images = 0     # annotation lines whose image file was not found
    overwritten = 0        # copies that replaced an earlier copy from this call

    with open(txt_file, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(maxsplit=1)
            if len(parts) < 2:
                continue
            image_name, annotation = parts
            image_path = os.path.join(dataset_path, image_name)

            # isfile rather than exists: a directory with that name cannot be copied
            if not os.path.isfile(image_path):
                missing_images += 1
                continue

            new_filename = f"{annotation}.png"
            if new_filename in written_names:
                overwritten += 1
            written_names.add(new_filename)
            shutil.copy(image_path, os.path.join(output_dir, new_filename))

    if missing_images:
        print(f"Skipped {missing_images} annotation line(s) whose image was not found in {dataset_path}.")
    if overwritten:
        print(f"Warning: {overwritten} image(s) replaced an earlier copy with the same annotation.")

{irrelevant}_{annotation}

In [None]:
def fix_filenames_irrelevant_prefix(dataset_path, output_dir):
    """
    Fixes filenames in the format `{irrelevant}_{annotation}` by extracting the annotation.

    Every .png/.jpg/.jpeg file (extension matched case-insensitively) in `dataset_path`
    whose name contains an underscore is copied to `output_dir` as `<annotation>.png`,
    where the annotation is the text after the last underscore. The file content is
    copied unchanged; only the name changes. Images without an underscore, or with
    nothing after it, are skipped.

    Files are processed in sorted order. When several images share an annotation they
    map to the same output name and the last one processed wins; the number of such
    replacements and the number of skipped images are reported once at the end.

    Args:
        dataset_path (str): Path to the dataset containing images.
        output_dir (str): Directory to save updated images with fixed filenames.
    """
    os.makedirs(output_dir, exist_ok=True)

    written_names = set()  # output filenames produced during this call
    skipped = 0            # images whose name yields no annotation
    overwritten = 0        # copies that replaced an earlier copy from this call

    # Sorted so that the copy which survives a name collision is the same on every run
    for file in sorted(os.listdir(dataset_path)):
        if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        filename = os.path.splitext(file)[0]
        # rpartition splits at the LAST underscore; `underscore` is '' when there is none
        _, underscore, annotation = filename.rpartition('_')
        if not underscore or not annotation:
            # An empty annotation would create a hidden file named '.png'
            skipped += 1
            continue

        new_filename = f"{annotation}.png"
        if new_filename in written_names:
            overwritten += 1
        written_names.add(new_filename)
        shutil.copy(os.path.join(dataset_path, file), os.path.join(output_dir, new_filename))

    if skipped:
        print(f"Skipped {skipped} image(s) with no annotation in the filename.")
    if overwritten:
        print(f"Warning: {overwritten} image(s) replaced an earlier copy with the same annotation.")


{annotation}_{irrelevant}

In [4]:
def fix_filenames_irrelevant_suffix(dataset_path, output_dir):
    """
    Fixes filenames in the format `{annotation}_{irrelevant}` by extracting the annotation.

    Every .png/.jpg/.jpeg file (extension matched case-insensitively) in `dataset_path`
    whose name contains an underscore is copied to `output_dir` as `<annotation>.png`,
    where the annotation is the text before the first underscore. The file content is
    copied unchanged; only the name changes. Images without an underscore, or with
    nothing before it, are skipped.

    Files are processed in sorted order. When several images share an annotation they
    map to the same output name and the last one processed wins; the number of such
    replacements and the number of skipped images are reported once at the end.

    Args:
        dataset_path (str): Path to the dataset containing images.
        output_dir (str): Directory to save updated images with fixed filenames.
    """
    os.makedirs(output_dir, exist_ok=True)

    written_names = set()  # output filenames produced during this call
    skipped = 0            # images whose name yields no annotation
    overwritten = 0        # copies that replaced an earlier copy from this call

    # Sorted so that the copy which survives a name collision is the same on every run
    for file in sorted(os.listdir(dataset_path)):
        if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        filename = os.path.splitext(file)[0]
        # partition splits at the FIRST underscore; `underscore` is '' when there is none
        annotation, underscore, _ = filename.partition('_')
        if not underscore or not annotation:
            # An empty annotation would create a hidden file named '.png'
            skipped += 1
            continue

        new_filename = f"{annotation}.png"
        if new_filename in written_names:
            overwritten += 1
        written_names.add(new_filename)
        shutil.copy(os.path.join(dataset_path, file), os.path.join(output_dir, new_filename))

    if skipped:
        print(f"Skipped {skipped} image(s) with no annotation in the filename.")
    if overwritten:
        print(f"Warning: {overwritten} image(s) replaced an earlier copy with the same annotation.")


In [7]:
import os
import shutil

# Folder holding images named `{annotation}_{irrelevant}` and the sub-folder that receives the cleaned copies
data_path = "/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-aug/deep-text-recognition-benchmark/captcha_batch1"
out_path = "/home/af-ml-dev/JFreaks/OCR/EasyOCR/deep-aug/deep-text-recognition-benchmark/captcha_batch1/fixed"

os.makedirs(out_path, exist_ok=True)
fix_filenames_irrelevant_suffix(data_path, out_path)