In [None]:
import os
import shutil
from pathlib import Path
from tqdm.notebook import tqdm

# ==========================================
# 1. PATH CONFIGURATION
# ==========================================
# Source dataset root: one sub-folder per original class name.
INPUT_DIR = Path(
    "/kaggle/input/massive-skin-disease-balanced-dataset/",
    "balanced_dataset/balanced_dataset",
)
# Destination root for the regrouped copy of the dataset.
OUTPUT_DIR = Path("/kaggle/working/pediatric_skin_data")

# Start from an empty output tree: anything left over from a previous run
# is deleted first, then the directory is (re)created.
if OUTPUT_DIR.exists():
    shutil.rmtree(OUTPUT_DIR)

OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# ==========================================
# 2. CLASS MAPPING
# ==========================================
# Maps each source folder name to the merged target class it is copied into.
# Written as (target label, source folders) groups and flattened into a
# folder -> label dict; folders absent from this mapping are not processed.
CLASS_MAPPING = {
    source_folder: target_label
    for target_label, source_folders in (
        # Group 0: Dermatitis / Eczema
        (
            "0_Eczema_Dermatitis",
            (
                "Atopic Dermatitis Photos",
                "Eczema Photos",
                "Poison Ivy Photos And Other Contact Dermatitis",
                "Urticaria Hives",
                "Rashes",
            ),
        ),
        # Group 1: Bacterial infections
        (
            "1_Bacterial_Infections",
            (
                "Ba Impetigo",
                "Ba Cellulitis",
                "Cellulitis Impetigo And Other Bacterial Infections",
            ),
        ),
        # Group 2: Fungal infections
        (
            "2_Fungal_Infections",
            (
                "Fu Ringworm",
                "Fu Athlete Foot",
                "Fu Nail Fungus",
                "Tinea Ringworm Candidiasis And Other Fungal Infections",
            ),
        ),
        # Group 3: Viral infections
        (
            "3_Viral_Infections",
            (
                "Vi Chickenpox",
                "Vi Shingles",
                "Warts Molluscum And Other Viral Infections",
                "Exanthems And Drug Eruptions",
            ),
        ),
        # Group 4: Infestations
        (
            "4_Infestations",
            (
                "Scabies Lyme Disease And Other Infestations And Bites",
                "Pa Cutaneous Larva Migrans",
            ),
        ),
        # Group 5: Acneiform
        (
            "5_Acneiform",
            ("Acne And Rosacea Photos",),
        ),
        # Group 6: Benign / vascular
        (
            "6_Vascular_Benign",
            (
                "Vascular Tumors",
                "Benign",
                "Seborrheic Keratoses And Other Benign Tumors",
            ),
        ),
        # Group 7: Healthy skin
        # NOTE(review): "Heathy" is kept exactly as written; presumably it
        # mirrors the dataset's own folder spelling - confirm against INPUT_DIR.
        (
            "7_Healthy_Skin",
            ("Heathy",),
        ),
    )
    for source_folder in source_folders
}

# ==========================================
# 3. COPY FILES AND CALCULATE SIZE
# ==========================================
# File extensions (compared lower-cased) that are treated as images.
IMAGE_EXTENSIONS = frozenset({".jpg", ".jpeg", ".png", ".bmp"})


def _collision_free_path(target_dir, image_path, source_name):
    """Return a path inside *target_dir* that does not exist yet.

    Several source folders are merged into one target folder, so two images
    from different sources can share a file name. The original name is kept
    when it is free; otherwise the source folder name (spaces replaced by
    underscores) is appended to the stem, followed by a counter if that
    name is taken as well.
    """
    candidate = target_dir / image_path.name
    if not candidate.exists():
        return candidate

    tag = source_name.replace(" ", "_")
    stem, suffix = image_path.stem, image_path.suffix
    candidate = target_dir / f"{stem}__{tag}{suffix}"
    counter = 1
    while candidate.exists():
        candidate = target_dir / f"{stem}__{tag}_{counter}{suffix}"
        counter += 1
    return candidate


total_size_bytes = 0
total_files = 0

print(f"Start processing dataset into: {OUTPUT_DIR}")

# os.listdir() returns entries in arbitrary order; sorting makes the run
# (and any collision renames) reproducible.
source_folder_names = sorted(os.listdir(INPUT_DIR))

# A mapping key that matches no input folder would otherwise be dropped
# silently (e.g. because of a spelling difference), so report it.
missing_folders = sorted(set(CLASS_MAPPING) - set(source_folder_names))
if missing_folders:
    print(f"WARNING: mapped folders not found in input: {missing_folders}")

for source_folder_name in tqdm(source_folder_names, desc="Processing folders"):
    source_path = INPUT_DIR / source_folder_name

    # Process only folders defined in mapping
    if not source_path.is_dir() or source_folder_name not in CLASS_MAPPING:
        continue

    target_label = CLASS_MAPPING[source_folder_name]
    target_path = OUTPUT_DIR / target_label
    target_path.mkdir(parents=True, exist_ok=True)

    image_files = sorted(source_path.glob("*.*"))

    for img in image_files:
        # "*.*" also matches directories whose name contains a dot.
        if not img.is_file() or img.suffix.lower() not in IMAGE_EXTENSIONS:
            continue

        # Never overwrite an image already copied from another source folder.
        destination = _collision_free_path(target_path, img, source_folder_name)
        shutil.copy2(img, destination)
        total_size_bytes += img.stat().st_size
        total_files += 1

# ==========================================
# 4. SUMMARY
# ==========================================
# Convert the accumulated byte count to gibibytes for display.
total_size_gb = total_size_bytes / (1024 ** 3)
separator = "=" * 40

print("\n" + separator)
print("PROCESS COMPLETED")
print(f"Output directory : {OUTPUT_DIR}")
print(f"Total images     : {total_files}")
print(f"Total size       : {total_size_gb:.2f} GB")
print(separator)

# Per-class breakdown: count the entries found in each output sub-folder,
# listed in sorted order.
print("\nImage count per class:")
class_dirs = sorted(entry for entry in OUTPUT_DIR.iterdir() if entry.is_dir())
for class_dir in class_dirs:
    entry_count = sum(1 for _ in class_dir.glob("*"))
    print(f"  {class_dir.name}: {entry_count}")
