# In [1]:
import os
import random
import shutil
from concurrent.futures import ProcessPoolExecutor

def create_dir_structure(base_dir, categories, subfolders):
    """Create the ``images`` and ``labels`` folders for every category/subfolder.

    For each combination, the directories
    ``<base_dir>/<category>/<subfolder>/images`` and
    ``<base_dir>/<category>/<subfolder>/labels`` are created together with
    any missing parents. Directories that already exist are left untouched.

    Args:
        base_dir: Root directory under which the tree is built.
        categories: Iterable of category directory names.
        subfolders: Iterable of subfolder names created inside each category.
    """
    for category in categories:
        for subfolder in subfolders:
            split_root = os.path.join(base_dir, category, subfolder)
            # 'images' is created before 'labels' for each split.
            for leaf in ('images', 'labels'):
                os.makedirs(os.path.join(split_root, leaf), exist_ok=True)

def copy_files(src_image_path, dest_image_path, src_label_path, dest_label_path):
    """Copy an image file and then its label file to their destinations.

    Args:
        src_image_path: Path of the image to copy.
        dest_image_path: Destination file or directory for the image.
        src_label_path: Path of the label file to copy.
        dest_label_path: Destination file or directory for the label.
    """
    transfers = (
        (src_image_path, dest_image_path),
        (src_label_path, dest_label_path),
    )
    # The image is copied first; if that raises, the label is not attempted.
    for source, target in transfers:
        shutil.copy(source, target)

def split_data(src_dir, dest_dir, categories, split_ratio):
    """Randomly split each category's image/label pairs into train/test/val.

    For every category, the regular files in ``<src_dir>/<category>/images``
    and ``<src_dir>/<category>/labels`` are listed, sorted by name and paired
    by position. The pairs are shuffled using the module-level ``random``
    state, cut into three consecutive slices, and copied in worker processes
    to ``<dest_dir>/<category>/<split>/images`` and ``.../labels``.

    Args:
        src_dir: Root directory holding one folder per category, each with
            ``images`` and ``labels`` subfolders.
        dest_dir: Root of the output tree. This function does not create the
            destination folders; they must already exist.
        categories: Iterable of category folder names.
        split_ratio: Sequence whose first two entries are the train and test
            fractions. The validation split receives whatever remains, so a
            third entry is not read.

    Raises:
        RuntimeError: If one or more copy tasks failed. It is raised only
            after every submitted copy has finished, and is chained to the
            first failure.
    """
    copy_tasks = []
    with ProcessPoolExecutor(max_workers=50) as executor:  # Adjust max_workers as per your CPU cores
        for category in categories:
            src_images_dir = os.path.join(src_dir, category, 'images')
            src_labels_dir = os.path.join(src_dir, category, 'labels')

            images = sorted(
                name for name in os.listdir(src_images_dir)
                if os.path.isfile(os.path.join(src_images_dir, name))
            )
            labels = sorted(
                name for name in os.listdir(src_labels_dir)
                if os.path.isfile(os.path.join(src_labels_dir, name))
            )

            # NOTE(review): pairing is positional, so it assumes both folders
            # sort into matching order; zip() drops the surplus when the two
            # counts differ. Confirm the source data satisfies this.
            pairs = list(zip(images, labels))
            if not pairs:
                # Nothing to split; unpacking zip(*[]) used to raise ValueError.
                continue
            random.shuffle(pairs)

            num_pairs = len(pairs)
            train_end = int(split_ratio[0] * num_pairs)
            test_end = int(split_ratio[1] * num_pairs) + train_end

            datasets = {
                'train': pairs[:train_end],
                'test': pairs[train_end:test_end],
                'val': pairs[test_end:],
            }

            for dataset, dataset_pairs in datasets.items():
                dest_images_dir = os.path.join(dest_dir, category, dataset, 'images')
                dest_labels_dir = os.path.join(dest_dir, category, dataset, 'labels')
                for image, label in dataset_pairs:
                    copy_tasks.append(executor.submit(
                        copy_files,
                        os.path.join(src_images_dir, image),
                        os.path.join(dest_images_dir, image),
                        os.path.join(src_labels_dir, label),
                        os.path.join(dest_labels_dir, label),
                    ))

        # A future keeps any exception raised in its worker; without asking
        # for it, failed copies would go unnoticed. exception() blocks until
        # the task is done, so every copy is attempted before reporting.
        failures = []
        for task in copy_tasks:
            error = task.exception()
            if error is not None:
                failures.append(error)

    if failures:
        raise RuntimeError(
            "{} of {} copy tasks failed; first error: {!r}".format(
                len(failures), len(copy_tasks), failures[0]
            )
        ) from failures[0]

def main():
    """Build the destination tree and split the dataset with fixed settings.

    The source and destination paths, the four category names, the three
    split folders and the 80/10/10 ratio are all hard-coded here.
    """
    source_root = '/mnt/storage/backup/label_wise_jainil/'
    output_root = '/mnt/storage/kilsar_mohammad/Kilsar_public_jainil'

    # The four category folders expected under the source root.
    class_names = ['0', '1', '2', 'n']
    split_names = ['train', 'test', 'val']
    # Fractions for train, test and validation, in that order.
    ratios = (0.8, 0.1, 0.1)

    # Destination folders must exist before any file is copied into them.
    create_dir_structure(
        base_dir=output_root,
        categories=class_names,
        subfolders=split_names,
    )
    split_data(
        src_dir=source_root,
        dest_dir=output_root,
        categories=class_names,
        split_ratio=ratios,
    )

# Run the split only when this file is executed directly, not when imported.
# The guard also matters because split_data uses ProcessPoolExecutor: under
# the "spawn" start method worker processes import the main module, and
# without the guard each of them would call main() again.
if __name__ == "__main__":
    main()
