In [3]:
import os
import shutil
import random

# Source folder (normalized samples) and destination folder for the split
data_dir = "processed_data_normalized"
output_dir = "dataset_split"

# Fraction of each category assigned to train / validation / test
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

# Class sub-folders expected inside data_dir
categories = ["nodules", "non_nodules"]

# Create the destination root up front; a no-op when it already exists
os.makedirs(output_dir, exist_ok=True)


# Split every category into train/val/test folders (stratified per category)
def split_data(seed=42, clean=True):
    """Copy each category's files into train/val/test folders under ``output_dir``.

    The split is performed separately for every entry of ``categories``, so
    each subset keeps roughly the class proportions of the source data; the
    classes themselves are not balanced. ``train_ratio`` and ``val_ratio``
    give the sizes of the first two subsets (rounded down); the test subset
    receives every remaining file.

    Args:
        seed: Seed for the per-category shuffle. With the same seed and the
            same set of file names the split is identical on every run.
            Pass ``None`` for a different split on each call.
        clean: When true, each destination ``<subset>/<category>`` folder is
            deleted before copying, so files left by an earlier run cannot
            end up in two subsets at once.
    """
    for category in categories:
        category_path = os.path.join(data_dir, category)

        # os.listdir returns names in arbitrary order, so sort before the
        # seeded shuffle. Keep regular files only: shutil.copy cannot copy
        # a directory.
        files = sorted(
            name
            for name in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, name))
        )
        # A fresh generator per category keeps one category's split
        # independent of how many files the other categories contain.
        random.Random(seed).shuffle(files)

        train_split = int(len(files) * train_ratio)
        val_split = train_split + int(len(files) * val_ratio)

        subsets = {
            "train": files[:train_split],
            "val": files[train_split:val_split],
            "test": files[val_split:],
        }

        for subset, subset_files in subsets.items():
            subset_dir = os.path.join(output_dir, subset, category)
            if clean:
                # Remove the previous contents of this destination folder
                # so an older split cannot leak into the new one.
                shutil.rmtree(subset_dir, ignore_errors=True)
            os.makedirs(subset_dir, exist_ok=True)

            for file in subset_files:
                src = os.path.join(category_path, file)
                dst = os.path.join(subset_dir, file)
                shutil.copy(src, dst)

    print("✅ Dataset splitting completed!")


# Run the split: copies files from data_dir into output_dir/<subset>/<category>.
split_data()

✅ Dataset splitting completed!


In [4]:
from collections import Counter
import os


def count_images_in_folders(
    base_dir,
    splits=("train", "val", "test"),
    categories=("nodules", "non_nodules"),
):
    """Print the number of files in every ``<base_dir>/<split>/<category>`` folder.

    Args:
        base_dir: Root folder that contains one sub-folder per split.
        splits: Split folder names to report, in print order.
        categories: Category folder names to report within each split.

    Returns:
        None. One line per split/category pair is written to stdout.

    Raises:
        FileNotFoundError: If one of the expected folders does not exist.
    """
    for split in splits:
        for category in categories:
            folder = os.path.join(base_dir, split, category)
            # Count regular files only, so a stray sub-directory is not
            # reported as an image.
            with os.scandir(folder) as entries:
                count = sum(1 for entry in entries if entry.is_file())
            print(f"{split.capitalize()} - {category}: {count} images")


# Print the per-split, per-category counts for the "dataset_split" folder tree.
count_images_in_folders("dataset_split")

Train - nodules: 308 images
Train - non_nodules: 16 images
Val - nodules: 66 images
Val - non_nodules: 3 images
Test - nodules: 66 images
Test - non_nodules: 4 images
