In [None]:
import os
import shutil
import random

def split_dataset(source_dir, output_dir, train_size=0.7, val_size=0.15, seed=None):
    """Copy a class-per-folder dataset into train/val/test sub-trees.

    Every immediate sub-directory of ``source_dir`` is treated as one class.
    The regular files inside it are shuffled and copied (the originals are
    left untouched) to ``output_dir/<split>/<class_name>/``, where ``<split>``
    is ``train``, ``val`` or ``test``. The split is done per class, so each
    class is divided using the same fractions.

    Args:
        source_dir: Directory whose sub-directories each hold one class.
        output_dir: Destination root; created if it does not exist.
        train_size: Fraction of each class copied to ``train``.
        val_size: Fraction of each class copied to ``val``. Whatever is left
            after ``train`` and ``val`` goes to ``test``.
        seed: Value passed to ``random.seed``. With a fixed seed the split is
            reproducible; ``None`` seeds from system entropy.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
        OSError: Propagated from ``os.listdir`` / ``shutil.copy``, e.g. when
            ``source_dir`` does not exist.

    Note:
        Existing files under ``output_dir`` are never removed. Re-running
        with a different seed or different fractions into the same
        ``output_dir`` can therefore leave one file in more than one split.
    """
    # Small tolerance so that fractions such as 0.7 + 0.3 are not rejected
    # because of floating-point rounding.
    if train_size < 0 or val_size < 0 or train_size + val_size > 1 + 1e-9:
        raise ValueError(
            "train_size and val_size must be non-negative and sum to at most 1, "
            "got train_size={!r}, val_size={!r}".format(train_size, val_size)
        )

    # Seeds the module-level generator, exactly as callers of this function
    # have always observed.
    random.seed(seed)

    # Read the source before creating anything so that a bad source path does
    # not leave an empty output tree behind. os.listdir returns entries in
    # arbitrary order, so sort to make the result depend only on the seed.
    class_names = sorted(
        name for name in os.listdir(source_dir)
        if os.path.isdir(os.path.join(source_dir, name))
    )

    # makedirs also creates output_dir itself when it is missing.
    split_dirs = {
        split: os.path.join(output_dir, split)
        for split in ('train', 'val', 'test')
    }
    for split_dir in split_dirs.values():
        os.makedirs(split_dir, exist_ok=True)

    for class_name in class_names:
        class_dir = os.path.join(source_dir, class_name)

        # Only regular files are split; nested directories are ignored.
        # Sorted first so the shuffle starts from a well-defined order.
        image_files = sorted(
            f for f in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, f))
        )
        random.shuffle(image_files)

        # int() truncates, so any rounding remainder ends up in the test split.
        train_count = int(train_size * len(image_files))
        val_count = int(val_size * len(image_files))
        val_end = train_count + val_count

        assignments = (
            ('train', image_files[:train_count]),
            ('val', image_files[train_count:val_end]),
            ('test', image_files[val_end:]),
        )
        for split, files in assignments:
            # Created even when empty so every split has every class folder.
            dest_dir = os.path.join(split_dirs[split], class_name)
            os.makedirs(dest_dir, exist_ok=True)
            for image_file in files:
                shutil.copy(
                    os.path.join(class_dir, image_file),
                    os.path.join(dest_dir, image_file),
                )




In [4]:
# Input folder (one sub-folder per class) and destination root for the split.
source_dir, output_dir = 'cell_images', 'Dataset'

# 80 % train / 10 % validation / remaining 10 % test, with a fixed seed.
split_dataset(
    source_dir=source_dir,
    output_dir=output_dir,
    train_size=0.8,
    val_size=0.1,
    seed=42,
)