# HAM10000
“Human Against Machine with 10000 training images”

In [1]:
# Import machine learning libraries for image classification for a pytorch model
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
import torchvision.models as models

# Import additional libraries
from PIL import Image
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from sklearn.model_selection import train_test_split
import shutil

In [2]:
# Check for GPU availability and set the device accordingly
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Print device information
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Import metadata
"""
Each row in the metadata file contains the following information:
- lesion_id    : ID of the lesion
- image_id     : name of image file (.jpg)
- dx           : diagnosis (type of skin cancer)
- dx_type      : method of diagnosis
- age          : age of patient
- sex          : biological sex of patient
- localization : location on body
"""
ham10000_metadata = pd.read_csv(r'archive/HAM10000_metadata.csv')

# Print the first few rows of the DataFrame
ham10000_metadata.head()

FileNotFoundError: [Errno 2] No such file or directory: 'archive/HAM10000_metadata.csv'

In [None]:
# Combine images from two folders into one
image_folder_1        = 'archive/HAM10000_images_part_1'
image_folder_2        = 'archive/HAM10000_images_part_2'
combined_image_folder = 'archive/HAM10000_images_combined'

# Create the combined image folder if it doesn't exist
if not os.path.exists(combined_image_folder):
    os.makedirs(combined_image_folder)

# Copy images from the first folder
for image_name in os.listdir(image_folder_1):
    shutil.copy(os.path.join(image_folder_1, image_name), combined_image_folder)

# Copy images from the second folder
for image_name in os.listdir(image_folder_2):
    shutil.copy(os.path.join(image_folder_2, image_name), combined_image_folder)

# Create a dictionary that maps image_id to the metadata
image_id_to_metadata = {}
for i in range(ham10000_metadata.shape[0]):
    image_id = ham10000_metadata.iloc[i]['image_id']
    image_id_to_metadata[image_id] = ham10000_metadata.iloc[i]

In [None]:
# Custom Dataset Class for HAM10000
class HAM10000Dataset(Dataset):
    def __init__(self, image_folder, metadata, transform=None):
        self.image_folder = image_folder
        self.metadata     = metadata
        self.transform    = transform

    def __len__(self):
        return self.metadata.shape[0]

    def __getitem__(self, idx):
        image_id   = self.metadata.iloc[idx]['image_id']
        image_path = os.path.join(self.image_folder, image_id + '.jpg')
        try:
            image = Image.open(image_path).convert('RGB')
        except FileNotFoundError:
            print(f"Warning: {image_path} not found.")
            return None, None
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            return None, None

        if self.transform:
            image = self.transform(image)

        dx = self.metadata.iloc[idx]['dx']
        dx = torch.tensor(dx, dtype=torch.long)
        return image, dx


In [None]:
# Create dictionary to map classes to definitions

diagnosis = {
    "akiec": {
        "definition": "Actinic Keratoses and Intraepithelial Carcinoma / Bowen's Disease",
        "cancerous" : True
    },
    "bcc": {
        "definition": "Basal Cell Carcinoma",
        "cancerous" : True
    },
    "bkl": {
        "definition": "Benign Keratosis-like Lesions",
        "cancerous" : False
    },
    "df": {
        "definition": "Dermatofibroma",
        "cancerous" : False
    },
    "mel": {
        "definition": "Melanoma",
        "cancerous" : True
    },
    "nv": {
        "definition": "Melanocytic Nevi",
        "cancerous" : False
    },
    "vasc": {
        "definition": "Vascular Lesions",
        "cancerous" : False
    }
}

In [None]:
# Define the transformations for the images, including color normalization, rotation, resizing, and flipping
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Split the dataset into training and validation sets
"""
We will use 80% of the data for training and 20% of the data for validation.
"""
train_metadata, val_metadata = train_test_split(ham10000_metadata, test_size=0.2, random_state=42)

# Ensure that the image folder path matches the combined image folder
image_folder = 'archive/HAM10000_images_combined'

# Create the training and validation datasets
train_dataset = HAM10000Dataset(image_folder, train_metadata, transform=transform)
val_dataset   = HAM10000Dataset(image_folder, val_metadata  , transform=transform)

# Print the sizes of the training and validation sets
print(f"Training set size   : {len(train_dataset)} samples")
print(f"Validation set size : {len(val_dataset)  } samples")


Training set size   : 8012 samples
Validation set size : 2003 samples


In [None]:
# Load the pre-trained ResNet model which will be used for key features of the dataset to identify over the images are outline/shape of lesion, symmetry of lesion, color of lesion, and texture of lesion
model = models.resnet50(pretrained=True)

# Modify the last layer of the model to output 7 classes instead of 1000 classes
model.fc = nn.Linear(model.fc.in_features, 7)

# Move the model to the appropriate device
model = model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define the data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True )
val_loader   = DataLoader(val_dataset  , batch_size=32, shuffle=False)

# Print model summary to verify the structure
print(model)



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
# Training the model over 5 epochs and plotting the training and validation loss


# Encode the labels to numeric values
label_mapping           = {label: idx for idx, label in enumerate(diagnosis.keys())}
ham10000_metadata['dx'] = ham10000_metadata['dx'].map(label_mapping)

# Ensure labels in train and validation sets are also mapped to numeric values
train_metadata['dx'] = train_metadata['dx'].map(label_mapping)
val_metadata['dx']   = val_metadata['dx'].map(label_mapping)

# Recreate the datasets to include numeric labels
train_dataset = HAM10000Dataset(image_folder, train_metadata, transform=transform)
val_dataset   = HAM10000Dataset(image_folder, val_metadata  , transform=transform)

# Define the data loaders with an appropriate batch size
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True )
val_loader   = DataLoader(val_dataset  , batch_size=32, shuffle=False)

# Initialize lists to store losses
train_losses = []
val_losses   = []

# Number of epochs
num_epochs = 5

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_train_loss = 0.0

    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss    = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Accumulate loss
        running_train_loss += loss.item()

    # Calculate average training loss
    avg_train_loss = running_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validation phase
    model.eval()  # Set the model to evaluation mode
    running_val_loss = 0.0

    with torch.no_grad():
        for i, (inputs, labels) in enumerate(val_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)
            loss    = criterion(outputs, labels)

            # Accumulate loss
            running_val_loss += loss.item()

    # Calculate average validation loss
    avg_val_loss = running_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

# Plot the training and validation loss over epochs
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss'  )
plt.plot(range(1, num_epochs + 1), val_losses  , label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss over Epochs')
plt.legend()
plt.show()


Epoch [1/5], Training Loss: 0.8845, Validation Loss: 1.3561
Epoch [2/5], Training Loss: 0.7939, Validation Loss: 0.8150
Epoch [3/5], Training Loss: 0.7284, Validation Loss: 0.7272
Epoch [4/5], Training Loss: 0.7036, Validation Loss: 0.7200
