In [1]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Root folder of the dataset; every input path below is derived from it.
BASE_DIR = "archive"

# Metadata table and the two image folders that live under BASE_DIR.
META_PATH = os.path.join(BASE_DIR, "HAM10000_metadata.csv")
IMG_DIR_1 = os.path.join(BASE_DIR, "HAM10000_images_part_1")
IMG_DIR_2 = os.path.join(BASE_DIR, "HAM10000_images_part_2")

In [3]:
# Destination root for the split dataset, with one sub-folder per split.
OUTPUT_DIR = "ham10000_dataset_split"
TRAIN_OUTPUT, VAL_OUTPUT, TEST_OUTPUT = (
    os.path.join(OUTPUT_DIR, subfolder) for subfolder in ("train", "val", "test")
)

# Create the split folders up front; exist_ok lets the cell be re-run safely.
for split_dir in (TRAIN_OUTPUT, VAL_OUTPUT, TEST_OUTPUT):
    os.makedirs(split_dir, exist_ok=True)

In [5]:
# Load metadata
# Later steps read two columns from this table: "image_id" (used to locate the
# image file on disk) and "dx" (used as the class label and stratification key).
metadata = pd.read_csv(META_PATH)

# Add full image paths
def get_img_path(img_id, img_dirs=None):
    """Return the path of ``<img_id>.jpg`` in the first directory that has it.

    Parameters
    ----------
    img_id : str
        Image identifier; the file looked up is ``f"{img_id}.jpg"``.
    img_dirs : iterable of str, optional
        Directories to search, in priority order. When omitted, the search
        covers ``IMG_DIR_1`` then ``IMG_DIR_2`` (resolved at call time), which
        matches the original single-argument behaviour.

    Returns
    -------
    str or None
        The joined path of the first existing match, or ``None`` when no
        directory contains the file.
    """
    if img_dirs is None:
        # Looked up lazily so the module-level constants are read at call time.
        img_dirs = (IMG_DIR_1, IMG_DIR_2)

    fname = f"{img_id}.jpg"
    for img_dir in img_dirs:
        candidate = os.path.join(img_dir, fname)
        if os.path.exists(candidate):
            return candidate
    return None

# Resolve every image_id to a file path, then keep only the rows whose image
# was found (ids that could not be resolved map to None and are dropped).
metadata = metadata.assign(
    path=metadata["image_id"].map(get_img_path)
).dropna(subset=["path"])

# Class labels, taken from the "dx" column of the surviving rows.
labels = metadata["dx"].tolist()

In [6]:
# Split into train / val / test, stratifying on the "dx" class both times.
# NOTE(review): the split is done per row (per image). If several rows can
# describe the same lesion, related images may end up in different splits --
# confirm against the metadata whether a grouped split is needed.
train_df, temp_df = train_test_split(
    metadata,
    stratify=metadata["dx"],
    test_size=0.30,  # hold out 30% for val + test
    random_state=42,
)

# Cut the held-out 30% in half: 15% validation, 15% test.
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df["dx"],
    test_size=0.50,
    random_state=42,
)

# Report how many rows landed in each split.
for caption, split_frame in (
    ("Train size:", train_df),
    ("Val size:", val_df),
    ("Test size:", test_df),
):
    print(caption, len(split_frame))

# Copy function
def copy_files(df, dst_dir):
    """Copy each image listed in ``df`` into ``dst_dir/<class>/``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``"dx"`` column (class name, used as the sub-folder
        name) and a ``"path"`` column (source file path).
    dst_dir : str
        Root output folder; one sub-folder per class is created on demand.

    Returns
    -------
    None
        Files are copied as a side effect. Rows whose source file does not
        exist are skipped, and a one-line summary is printed if any were.
    """
    missing = []
    prepared_dirs = set()  # class folders already created during this call

    # Iterate the two needed columns directly instead of building a Series
    # per row with iterrows().
    for class_name, src_path in zip(df["dx"], df["path"]):
        if not os.path.exists(src_path):
            missing.append(src_path)
            continue

        class_dir = os.path.join(dst_dir, class_name)
        if class_dir not in prepared_dirs:
            os.makedirs(class_dir, exist_ok=True)
            prepared_dirs.add(class_dir)

        dst_path = os.path.join(class_dir, os.path.basename(src_path))
        shutil.copy(src_path, dst_path)

    if missing:
        print(f"Skipped {len(missing)} missing files")

# Copy each split's images into its own output folder.
for split_frame, split_folder in (
    (train_df, TRAIN_OUTPUT),
    (val_df, VAL_OUTPUT),
    (test_df, TEST_OUTPUT),
):
    copy_files(split_frame, split_folder)

# Final summary: number of rows assigned to each split.
print("HAM10000 dataset prepared:")
print(
    f"Train: {len(train_df)}",
    f"Val: {len(val_df)}",
    f"Test: {len(test_df)}",
    sep="\n",
)


Train size: 7010
Val size: 1502
Test size: 1503
HAM10000 dataset prepared:
Train: 7010
Val: 1502
Test: 1503


In [8]:
# Row counts of the (train, val, test) splits, shown as the cell's output.
tuple(len(split_frame) for split_frame in (train_df, val_df, test_df))


(7010, 1502, 1503)