## Lung Nodule Detection - Model Training Notebook

### Step 1: Dataset Splitting
#### - We will divide the dataset into Train, Validation, and Test sets.
#### - Split each class (nodules / non-nodules) separately, so every subset keeps the same class proportions.

In [None]:
import os
import shutil
import random

# Class sub-folders that are split independently (nodules / non-nodules)
categories = ["nodules", "non_nodules"]

# Fractions assigned to the train / validation / test subsets
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

# Normalized input data and the root directory that receives the split
data_dir = "processed_data_normalized"
output_dir = "dataset_split"
os.makedirs(output_dir, exist_ok=True)


# Function to split data
def split_data(
    source_dir=None,
    dest_dir=None,
    class_names=None,
    train_frac=None,
    val_frac=None,
    seed=42,
):
    """Copy each category's files into train/val/test sub-directories.

    For every category the regular files in ``<source_dir>/<category>`` are
    shuffled and copied to ``<dest_dir>/<subset>/<category>`` where subset is
    ``train``, ``val`` or ``test``. The test subset receives whatever is left
    after the train and validation slices, so no separate test fraction is
    read.

    Args:
        source_dir: Directory holding one sub-directory per category.
            Defaults to the module-level ``data_dir``.
        dest_dir: Root directory for the split. Defaults to the module-level
            ``output_dir``.
        class_names: Iterable of category sub-directory names. Defaults to
            the module-level ``categories``.
        train_frac: Fraction of each category used for training. Defaults to
            the module-level ``train_ratio``.
        val_frac: Fraction of each category used for validation. Defaults to
            the module-level ``val_ratio``.
        seed: Seed for the shuffle. With a fixed seed, re-running over the
            same files reproduces the same split. Pass ``None`` for a
            different random split on every call.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
        FileNotFoundError: If a category directory does not exist.
    """
    # Fall back to the notebook-level configuration for anything not given.
    source_dir = data_dir if source_dir is None else source_dir
    dest_dir = output_dir if dest_dir is None else dest_dir
    class_names = categories if class_names is None else class_names
    train_frac = train_ratio if train_frac is None else train_frac
    val_frac = val_ratio if val_frac is None else val_frac

    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            "train/val fractions must be non-negative and sum to at most 1, "
            f"got train={train_frac!r}, val={val_frac!r}"
        )

    for category in class_names:
        category_path = os.path.join(source_dir, category)

        # os.listdir returns entries in arbitrary order, so sort first to
        # make the seeded shuffle reproducible. Sub-directories (for example
        # ".ipynb_checkpoints") are skipped because shutil.copy cannot copy
        # them.
        files = sorted(
            name
            for name in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, name))
        )
        # A fresh generator per category keeps each category's split
        # independent of how many files the other categories contain.
        random.Random(seed).shuffle(files)

        train_split = int(len(files) * train_frac)
        val_split = train_split + int(len(files) * val_frac)

        subsets = {
            "train": files[:train_split],
            "val": files[train_split:val_split],
            "test": files[val_split:],
        }

        for subset, subset_files in subsets.items():
            subset_dir = os.path.join(dest_dir, subset, category)
            os.makedirs(subset_dir, exist_ok=True)

            sibling_dirs = [
                os.path.join(dest_dir, other, category)
                for other in subsets
                if other != subset
            ]

            for file in subset_files:
                # A previous run may have placed this file in another subset;
                # drop that copy so the subsets stay disjoint (no leakage
                # between train, validation and test).
                for sibling_dir in sibling_dirs:
                    stale = os.path.join(sibling_dir, file)
                    if os.path.isfile(stale):
                        os.remove(stale)

                src = os.path.join(category_path, file)
                dst = os.path.join(subset_dir, file)
                shutil.copy(src, dst)

    print("✅ Dataset splitting completed!")


# Run the split immediately, using the configuration defined above.
split_data()

# Next steps: Model building and training setup

✅ Dataset splitting completed!
