In [1]:
from pathlib import Path
dataDir = Path("data")

In [3]:
dermamnist_file = dataDir / "dermamnist.npz"
import numpy as np
data = np.load(dermamnist_file)
data

NpzFile 'data/dermamnist.npz' with keys: train_images, val_images, test_images, train_labels, val_labels...

In [6]:
import numpy as np
from pathlib import Path
from PIL import Image
import os

def save_images(sub_dir="test"):
    """Extract and save test images from dermamnist.npz to data/test directory."""
    
    # Paths
    data_dir = Path("data")
    dermamnist_file = data_dir / "dermamnist.npz"
    test_dir = data_dir / sub_dir
    
    # Create test directory if it doesn't exist
    test_dir.mkdir(exist_ok=True)
    
    # Load the dataset
    print(f"Loading dataset from {dermamnist_file}")
    data = np.load(dermamnist_file)
    
    # Print available keys in the dataset
    print(f"Available keys in dataset: {list(data.keys())}")
    
    # Extract test data (typically stored as 'test_images' and 'test_labels')
    # Let's check what keys are available first
    if 'test_images' in data:
        test_images = data['test_images']
        test_labels = data['test_labels'] if 'test_labels' in data else None
    elif 'x_test' in data:
        test_images = data['x_test'] 
        test_labels = data['y_test'] if 'y_test' in data else None
    else:
        # If we can't find standard keys, look for other possible test data
        available_keys = [k for k in data.keys() if isinstance(data[k], np.ndarray)]
        print(f"Available array keys: {available_keys}")
        
        # Try to find test data by looking for keys containing 'test' or 'val'
        test_keys = [k for k in available_keys if 'test' in k.lower() or 'val' in k.lower()]
        if test_keys:
            test_images = data[test_keys[0]]
            # Look for corresponding labels
            label_keys = [k for k in available_keys if 'label' in k.lower() and ('test' in k.lower() or 'val' in k.lower())]
            test_labels = data[label_keys[0]] if label_keys else None
        else:
            # If no test-specific keys found, we might need to split the data
            # Let's check if there's a way to identify test data
            print("No explicit test data found. Available data:")
            for key in available_keys:
                print(f"  {key}: shape {data[key].shape}")
            raise ValueError(f"Could not find test data in the dataset. Available keys: {list(data.keys())}")
    
    print(f"Test images shape: {test_images.shape}")
    if test_labels is not None:
        print(f"Test labels shape: {test_labels.shape}")
    
    # Create subdirectories for each class if labels are available
    if test_labels is not None:
        unique_labels = np.unique(test_labels)
        print(f"Found {len(unique_labels)} unique classes: {unique_labels}")
        
        # Create class subdirectories
        for label in unique_labels:
            class_dir = test_dir / f"class_{label}"
            class_dir.mkdir(exist_ok=True)
    
    # Save each image
    print(f"Saving {len(test_images)} test images...")
    
    for i, image in enumerate(test_images):
        # Convert to PIL Image
        # Handle different image formats (grayscale vs RGB)
        if len(image.shape) == 2:  # Grayscale
            pil_image = Image.fromarray(image, mode='L')
        elif len(image.shape) == 3 and image.shape[2] == 1:  # Grayscale with channel dimension
            pil_image = Image.fromarray(image.squeeze(), mode='L')
        elif len(image.shape) == 3 and image.shape[2] == 3:  # RGB
            pil_image = Image.fromarray(image, mode='RGB')
        else:
            print(f"Warning: Unexpected image shape {image.shape} for image {i}")
            continue
        
        # Determine save path
        if test_labels is not None:
            label = test_labels[i] if test_labels.ndim == 1 else test_labels[i][0]
            save_path = test_dir / f"class_{label}" / f"test_image_{i:05d}.png"
        else:
            save_path = test_dir / f"test_image_{i:05d}.png"
        
        # Save the image
        pil_image.save(save_path)
        
        # Print progress every 1000 images
        if (i + 1) % 1000 == 0:
            print(f"Saved {i + 1}/{len(test_images)} images...")
    
    print(f"Successfully saved all {len(test_images)} test images to {test_dir}")
    
    # Print summary
    if test_labels is not None:
        for label in np.unique(test_labels):
            count = np.sum(test_labels == label)
            class_dir = test_dir / f"class_{label}"
            print(f"Class {label}: {count} images saved to {class_dir}")
    
    data.close()



In [7]:
save_images("train")


Loading dataset from data/dermamnist.npz
Available keys in dataset: ['train_images', 'val_images', 'test_images', 'train_labels', 'val_labels', 'test_labels']
Test images shape: (2005, 28, 28, 3)
Test labels shape: (2005, 1)
Found 7 unique classes: [0 1 2 3 4 5 6]
Saving 2005 test images...
Saved 1000/2005 images...


  pil_image = Image.fromarray(image, mode='RGB')


Saved 2000/2005 images...
Successfully saved all 2005 test images to data/train
Class 0: 66 images saved to data/train/class_0
Class 1: 103 images saved to data/train/class_1
Class 2: 220 images saved to data/train/class_2
Class 3: 23 images saved to data/train/class_3
Class 4: 223 images saved to data/train/class_4
Class 5: 1341 images saved to data/train/class_5
Class 6: 29 images saved to data/train/class_6


In [8]:
!pip install medmnist

Collecting medmnist
  Downloading medmnist-3.0.2-py3-none-any.whl.metadata (14 kB)
Collecting pandas (from medmnist)
  Using cached pandas-2.3.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting scikit-learn (from medmnist)
  Downloading scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scikit-image (from medmnist)
  Downloading scikit_image-0.25.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (14 kB)
Collecting tqdm (from medmnist)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting fire (from medmnist)
  Downloading fire-0.7.1-py3-none-any.whl.metadata (5.8 kB)
Collecting termcolor (from fire->medmnist)
  Downloading termcolor-3.1.0-py3-none-any.whl.metadata (6.4 kB)
Collecting pytz>=2020.1 (from pandas->medmnist)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas->medmnist)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.11.4 (from 

In [9]:
from medmnist import DermaMNIST

In [10]:
dataset = DermaMNIST(split='train', download=True)

100%|██████████| 19.7M/19.7M [02:49<00:00, 116kB/s] 
