In [3]:
import os
import shutil
import random
from typing import List, Tuple

In [4]:
class SplitConfig:
    """Ratios and seed that control a train/test/eval dataset split.

    Args:
        train_ratio: Fraction of each class assigned to the train split.
        test_ratio: Fraction of each class assigned to the test split.
        eval_ratio: Fraction of each class assigned to the eval split.
        seed: Seed used for shuffling.
        tol: Allowed absolute deviation of the ratio sum from 1.0.

    Raises:
        AssertionError: If the three ratios do not sum to 1 within ``tol``.
        ValueError: If any individual ratio lies outside ``[0, 1]``.
    """

    def __init__(
        self,
        train_ratio: float = 0.7,
        test_ratio: float = 0.2,
        eval_ratio: float = 0.1,
        seed: int = 42,
        tol: float = 1e-6
    ):
        total = train_ratio + test_ratio + eval_ratio
        # Raised explicitly (not via ``assert``) so the check still runs under
        # ``python -O``; the exception type and message are unchanged.
        # ``not ... < tol`` also rejects NaN sums.
        if not abs(total - 1.0) < tol:
            raise AssertionError(f"Ratios must sum to 1 (got {total})")

        # A negative ratio can be hidden by another ratio above 1 while the
        # sum still equals 1; that would later yield negative slice bounds.
        named_ratios = (
            ("train_ratio", train_ratio),
            ("test_ratio", test_ratio),
            ("eval_ratio", eval_ratio),
        )
        for name, value in named_ratios:
            if not 0.0 <= value <= 1.0:
                raise ValueError(f"{name} must be between 0 and 1 (got {value})")

        self.train_ratio = train_ratio
        self.test_ratio = test_ratio
        self.eval_ratio = eval_ratio
        self.seed = seed
        self.tol = tol

In [5]:
class DatasetSplitter:
    """Copy a class-per-folder dataset into train/test/eval folders.

    Every sub-directory of ``source_dir`` is treated as one class and every
    regular file inside it as one sample. Files are copied (not moved) to
    ``target_dir/<split>/<class>/<file>``.

    The target directory is never cleaned: files left over from an earlier
    run with different ratios or a different seed stay where they are.
    """

    def __init__(self, source_dir: str, target_dir: str, config: "SplitConfig"):
        """Store the paths and configuration.

        Args:
            source_dir: Directory containing one sub-directory per class.
            target_dir: Directory under which the split folders are created.
            config: Object exposing ``train_ratio``, ``test_ratio``,
                ``eval_ratio`` and ``seed``.
        """
        self.source_dir = source_dir
        self.target_dir = target_dir
        self.config = config
        # Kept for backward compatibility: constructing a splitter has always
        # seeded the global RNG. split() itself no longer depends on it.
        random.seed(config.seed)

    def _create_dirs(self, split_name: str, class_name: str):
        """Create ``target_dir/split_name/class_name`` if needed and return it."""
        path = os.path.join(self.target_dir, split_name, class_name)
        os.makedirs(path, exist_ok=True)
        return path

    def _split_indices(self, total: int) -> Tuple[int, int]:
        """Return the slice boundaries ``(train_end, test_end)`` for ``total`` items.

        Both sizes are truncated towards zero, so any remainder falls into
        the final (eval) slice.
        """
        train_end = int(total * self.config.train_ratio)
        test_end = train_end + int(total * self.config.test_ratio)
        return train_end, test_end

    def split(self):
        """Shuffle each class and copy its files into the three split folders.

        The assignment depends only on the file names and ``config.seed``:
        directory listings are sorted before shuffling (``os.listdir`` returns
        entries in arbitrary order) and a private generator is created on
        every call, so repeated calls produce the same assignment.
        """
        rng = random.Random(self.config.seed)

        classes = sorted(
            entry for entry in os.listdir(self.source_dir)
            if os.path.isdir(os.path.join(self.source_dir, entry))
        )

        for cls in classes:
            class_path = os.path.join(self.source_dir, cls)

            # Sort first so the shuffle starts from the same order everywhere.
            images = sorted(
                f for f in os.listdir(class_path)
                if os.path.isfile(os.path.join(class_path, f))
            )
            rng.shuffle(images)

            train_end, test_end = self._split_indices(len(images))

            splits = {
                "train": images[:train_end],
                "test": images[train_end:test_end],
                "eval": images[test_end:]
            }

            for split_name, split_images in splits.items():
                # The folder is created even when the split receives no files.
                split_class_dir = self._create_dirs(split_name, cls)

                for img in split_images:
                    src = os.path.join(class_path, img)
                    dst = os.path.join(split_class_dir, img)
                    shutil.copy2(src, dst)


**Usage Example**

In [6]:
# Input root, passed to DatasetSplitter as source_dir; each of its
# sub-directories is treated as one class.
SOURCE_DATASET = "/kaggle/input/spacenet-an-optimally-distributed-astronomy-data/SpaceNet.FLARE.imam_alam"
# Output root, passed to DatasetSplitter as target_dir; split() writes
# <split>/<class>/<file> underneath it.
TARGET_DATASET = "/kaggle/working/dataset"

In [8]:
# 70 % train / 20 % test / 10 % eval, shuffled with a fixed seed.
config = SplitConfig(train_ratio=0.7, test_ratio=0.2, eval_ratio=0.1, seed=42)

# Copy every class folder of SOURCE_DATASET into TARGET_DATASET/<split>/<class>/.
splitter = DatasetSplitter(source_dir=SOURCE_DATASET, target_dir=TARGET_DATASET, config=config)
splitter.split()

**Notes**

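- The dataset is split class-wise, so every split keeps roughly the same class proportions. Within a run each image is copied into exactly one split, which avoids leakage between splits as long as the target directory is empty beforehand.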
- A fixed random seed makes the shuffle repeatable, provided the files are listed in the same order on every run.
- Split ratios can be easily reconfigured.