# In [None]:
import os
import shutil
import cv2
import numpy as np

# Root of the source dataset and root of the oversampled copy to be written
# (raw strings keep the Windows backslashes readable)
original_dataset_path = r"D:\Splitted Curated X-Ray Dataset"
new_dataset_path = r"D:\Splitted AHE Oversampled X-Ray Dataset"

# Create a function to apply CLAHE to an image
def apply_clahe(image_path):
    """Load an image as grayscale and return its CLAHE-equalized version.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The array produced by applying a default-configured CLAHE object
        (``cv2.createCLAHE()``) to the grayscale image.

    Raises:
        OSError: If OpenCV could not read ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None instead of raising when the file is missing or
    # cannot be decoded; fail here with the path rather than inside apply().
    if image is None:
        raise OSError(f"Could not read image: {image_path}")
    clahe = cv2.createCLAHE()
    return clahe.apply(image)

# Helpers and entry point to copy and oversample the dataset
def _list_files(directory):
    """Return the names of the regular files directly inside ``directory``."""
    return [
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]


def _write_clahe_variant(src_dir, dst_dir, image_name):
    """Save the CLAHE version of ``src_dir/image_name`` as ``dst_dir/CLAHE_<image_name>``."""
    clahe_image = apply_clahe(os.path.join(src_dir, image_name))
    clahe_image_path = os.path.join(dst_dir, f"CLAHE_{image_name}")
    # cv2.imwrite returns False when the image could not be saved; a silently
    # missing file would leave the class short of its target count.
    if not cv2.imwrite(clahe_image_path, clahe_image):
        raise OSError(f"Could not write image: {clahe_image_path}")


def oversample_dataset(dataset_path, new_path, classes):
    """Copy a split dataset to ``new_path`` and oversample it with CLAHE images.

    The source is expected to be laid out as
    ``dataset_path/<split>/<class_name>/<image files>`` for the splits
    ``train``, ``test`` and ``val``. The same layout is created under
    ``new_path``. Every original image is copied unchanged, and additional
    ``CLAHE_<name>`` images are written according to two rules:

    * Rule 1 - a class directory holding at most 50% of the maximum count
      gets one CLAHE image for every original (its size doubles).
    * Rule 2 - any other class directory gets CLAHE images for a random
      sample of its originals, just enough to reach the maximum count.

    The maximum count is taken over all classes in all splits.
    NOTE(review): because the maximum is global, smaller splits are measured
    against the largest split's biggest class - confirm this is intended
    rather than a per-split maximum.

    Args:
        dataset_path: Root directory of the source dataset.
        new_path: Root directory of the dataset to create.
        classes: Iterable of class sub-directory names present in each split.

    Raises:
        FileNotFoundError: If a source ``<split>/<class_name>`` directory
            does not exist (raised by ``os.listdir``).
        OSError: If a CLAHE image could not be written.
    """
    splits = ('train', 'test', 'val')

    # Create the new dataset directory structure
    for folder in splits:
        for class_name in classes:
            os.makedirs(os.path.join(new_path, folder, class_name), exist_ok=True)

    # List every source class directory once; sub-directories are ignored so
    # they are neither counted as images nor passed to the copy step.
    listings = {
        (folder, class_name): _list_files(
            os.path.join(dataset_path, folder, class_name)
        )
        for folder in splits
        for class_name in classes
    }

    # Determine the maximum image count across all classes and sets
    max_count = max((len(images) for images in listings.values()), default=0)

    # Oversample the dataset
    for (folder, class_name), images in listings.items():
        class_dir = os.path.join(dataset_path, folder, class_name)
        new_class_dir = os.path.join(new_path, folder, class_name)
        image_count = len(images)

        # Originals are always kept, whichever rule applies; without this the
        # larger classes would contain only their CLAHE extras (or nothing).
        for image_name in images:
            shutil.copy2(
                os.path.join(class_dir, image_name),
                os.path.join(new_class_dir, image_name),
            )

        if image_count <= max_count * 0.5:
            # Rule 1: at most 50% of the max count -> duplicate all
            images_to_duplicate = images
        else:
            # Rule 2: more than 50% of the max count -> duplicate a random
            # sample until the max count is reached. Here difference is
            # always smaller than image_count, so sampling without
            # replacement is valid.
            difference = max_count - image_count
            if difference > 0:
                images_to_duplicate = np.random.choice(
                    images, difference, replace=False
                )
            else:
                images_to_duplicate = []

        for image_name in images_to_duplicate:
            _write_clahe_variant(class_dir, new_class_dir, image_name)

# Class sub-folder names passed to the oversampling routine
class_names = [
    'COVID-19',
    'Normal',
    'Pneumonia-Bacterial',
    'Pneumonia-Viral',
]

# Build the oversampled copy of the dataset
oversample_dataset(
    dataset_path=original_dataset_path,
    new_path=new_dataset_path,
    classes=class_names,
)
