# SIIM-ISIC Melanoma Classification

## Identify melanoma in lesion images

In [21]:
# Import the required libraries

import os
import zipfile
import random
import tensorflow as tf
from shutil import copyfile, rmtree, move
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.image import imread
import matplotlib.image as mpimg
from tensorflow import keras
from keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, Conv2D, MaxPool2D
from functools import partial
from keras.layers.advanced_activations import LeakyReLU
from datetime import datetime

The original "jpeg" files were stored in train and validation sub-categories. In the original jpeg files, the "melignant" and "benign" labels were not indicated, instead, they were labelled in the "train.csv" file.

So, in order to use keras.preprocessing.ImageDataGenerator, all the jpeg files were identified as "melignant" and "bening" first, and then split into train and validation set with the split ratio of 0.8.

Below codes show the total numbers of instances. Note that, all the were mixed first and then splitted into train test set.

In [2]:
# Indicate the paths to the directories
source_path = '/sfs/lustre/bahamut/scratch/uk7ud/Kaggle Melanoma/source'
source_benign_path = os.path.join(source_path, 'benign')
source_melignant_path = os.path.join(source_path, 'melignant')
data_path = '/sfs/lustre/bahamut/scratch/uk7ud/Kaggle Melanoma/data'
train_path = os.path.join(data_path, 'train')
train_benign_path = os.path.join(train_path, 'benign')
train_melignant_path = os.path.join(train_path, 'melignant')
validation_path = os.path.join(data_path, 'validation')
validation_benign_path = os.path.join(validation_path, 'benign')
validation_melignant_path = os.path.join(validation_path, 'melignant')

In [3]:
len_source_benign = len(os.listdir(source_benign_path))
len_source_melignant = len(os.listdir(source_melignant_path))
len_source_total = len_source_benign + len_source_melignant
source_melignant_benign_ratio =  len_source_melignant/len_source_benign

len_train_benign = len(os.listdir(train_benign_path))
len_train_melignant = len(os.listdir(train_melignant_path))
len_train_total = len_train_benign + len_train_melignant
train_melignant_benign_ratio =  len_train_melignant/len_train_benign

len_validation_benign = len(os.listdir(validation_benign_path))
len_validation_melignant = len(os.listdir(validation_melignant_path))
len_validation_total = len_validation_benign + len_validation_melignant
validation_melignant_benign_ratio =  len_validation_melignant/len_validation_benign

So, looking at above numbers, we are in a good spot to start training. Keep in mind that the data is highly skewed and thats why we will use ROC AUC for performance of the model.

The melignant/benign ratio is also important for data separation. The original data has a split ratio of 0.018 and we kept that ratio while spliting our data into train and validation.

In [4]:
print('Total Source Benign:', len_source_benign)
print('Total Source Melignant:', len_source_melignant)
print('Source Total:', len_source_total)
print('Source Melignant/Benign:',source_melignant_benign_ratio)

print('\nTotal Train Benign:', len_train_benign)
print('Total Train Melignant:', len_train_melignant)
print('Train Total:', len_train_total)
print('Train Melignant/Benign:', train_melignant_benign_ratio)

print('\nTotal Validation Benign:', len_validation_benign)
print('Total Validation Melignant:', len_validation_melignant)
print('Validation Total:', len_validation_total)
print('Validation Melignant/Benign:',validation_melignant_benign_ratio)

Total Source Benign: 32542
Total Source Melignant: 584
Source Total: 33126
Source Melignant/Benign: 0.017946038965029807

Total Train Benign: 26033
Total Train Melignant: 467
Train Total: 26500
Train Melignant/Benign: 0.01793877002266354

Total Validation Benign: 6509
Total Validation Melignant: 117
Validation Total: 6626
Validation Melignant/Benign: 0.01797511138423721


The image shapes may be different in size, so we have to resize all of them in the same size (1500 x 1500 for example). Below code gives the average dimensions of the images. Let's chose "Source Benign" directory for this since we have 32542 images in this folder out of 33126.

In [5]:
input_shape = (200, 200, 3)

In [16]:
# Prepare the data using ImageDataGenerator API from keras and also include data augmentation

train_datagen = ImageDataGenerator(
    rescale = 1./255,
    rotation_range = 40,
    width_shift_range = 0.2,
    height_shift_range = 0.2,
    shear_range = 0.2,
    zoom_range = 0.2,
    horizontal_flip = True,
    fill_mode = 'nearest'
)

train_generator = train_datagen.flow_from_directory(
    train_path,
    batch_size = 32,
    target_size = input_shape[:2],
    class_mode = 'binary'
) 

validation_datagen = ImageDataGenerator(rescale = 1./255)

validation_generator = validation_datagen.flow_from_directory(
    validation_path,
    batch_size = 32,
    target_size = input_shape[:2],
    class_mode = 'binary'
)

Found 26500 images belonging to 2 classes.
Found 6626 images belonging to 2 classes.


### Now, create the model

In [19]:
DefaultConv2D = partial(keras.layers.Conv2D, kernel_size=3, padding="SAME")

model = keras.models.Sequential([
    DefaultConv2D(filters=64, kernel_size=3, input_shape=input_shape),
    keras.layers.LeakyReLU(alpha=0.2),
    keras.layers.MaxPooling2D(pool_size=2),
    DefaultConv2D(filters=128),
    keras.layers.LeakyReLU(alpha=0.2),
    DefaultConv2D(filters=128),
    keras.layers.LeakyReLU(alpha=0.2),
    keras.layers.MaxPooling2D(pool_size=2),
    DefaultConv2D(filters=256),
    keras.layers.LeakyReLU(alpha=0.2),
    DefaultConv2D(filters=256),
    keras.layers.LeakyReLU(alpha=0.2),
    keras.layers.MaxPooling2D(pool_size=2),
    keras.layers.Flatten(),
    keras.layers.Dense(units=128),
    keras.layers.LeakyReLU(alpha=0.2),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(units=64),
    keras.layers.LeakyReLU(alpha=0.2),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(units=1, activation='sigmoid'),
])

model.compile(
    optimizer=keras.optimizers.SGD(lr=0.001, momentum=0.9, nesterov=True),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC()])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 200, 200, 64)      1792      
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 200, 200, 64)      0         
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 100, 100, 64)      0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 100, 100, 128)     73856     
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 100, 100, 128)     0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 100, 100, 128)     147584    
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 100, 100, 128)     0

In [22]:
logs = os.path.join(os.curdir, "my_logs", "run_" + datetime.now().strftime("%Y%m%d_%H%M%S"))
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=logs, histogram_freq=1, profile_batch=10)
early_stopping_cb = keras.callbacks.EarlyStopping(patience=20)
checkpoint_cb = keras.callbacks.ModelCheckpoint("model_keras_datagen.h5", save_best_only=True)

In [None]:
weight_for_0 = (1 / 32542)*(33126)/2.0 
weight_for_1 = (1 / 584)*(33126)/2.0

class_weight = {0: weight_for_0, 1: weight_for_1}

history = model.fit(train_generator, 
                    epochs=200, 
                    validation_data=train_generator, 
                    class_weight = class_weight,
                    callbacks=[early_stopping_cb, checkpoint_cb, tensorboard_cb])

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 829 steps, validate for 829 steps
Epoch 1/200
Epoch 2/200