# Skin Lesion Classifier

## Import Training Data

In [None]:
import pandas as pd
import numpy as np
import os
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline

# Dataset locations. Raw strings keep the Windows backslashes literal instead
# of relying on '\I' being an unrecognised (and deprecated) escape sequence.
derm_image_folder = r'C:\ISIC_2019\ISIC_2019_Training_Input'
df_ground_truth = pd.read_csv(r'C:\ISIC_2019\ISIC_2019_Training_GroundTruth.csv')

# Known categories: columns 1-8 of the ground-truth table. The UNK column is
# intentionally left out (column 0 is presumably the image id -- it is read
# below as 'image').
categories = list(df_ground_truth.columns.values[1:9])
known_category_num = len(categories)
print("Number of known categories: {}".format(known_category_num))
idx_to_category = dict(enumerate(categories))
print(idx_to_category, '\n')

# Full path of every image file: <image folder>/<image id>.jpg
df_ground_truth['path'] = [
    os.path.join(derm_image_folder, image_id + '.jpg')
    for image_id in df_ground_truth['image']
]

# Collapse the per-category indicator columns into a single label column by
# taking the position of the largest value in each row.
label_matrix = df_ground_truth[categories].values
df_ground_truth['category'] = [
    idx_to_category[i] for i in np.argmax(label_matrix, axis=1)
]

count_per_category = Counter(df_ground_truth['category'])
total_sample_count = sum(count_per_category.values())
print("Training data has {} samples.".format(total_sample_count))

# Per-category sample count and share of the whole data set
for c in categories:
    print("'%s':\t%d\t(%.2f%%)" % (c, count_per_category[c], count_per_category[c]*100/total_sample_count))

fig = plt.bar(count_per_category.keys(), count_per_category.values())

# Last expression of the cell: shows the first rows in the notebook output
df_ground_truth.head()

### Shuffle and Split Data

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the stratified 80/20 split is reproducible
seed = 1
df_train, df_val = train_test_split(
    df_ground_truth,
    test_size=0.2,
    stratify=df_ground_truth['category'],
    random_state=seed,
)

# Sample totals and per-category counts of both subsets
sample_count_train = len(df_train)
count_per_category_train = Counter(df_train['category'])
sample_count_val = len(df_val)
count_per_category_val = Counter(df_val['category'])

# Print the same report for each subset: total first, then one line per category
for header, subset_total, subset_counts in (
        ("Training set has {} samples.", sample_count_train, count_per_category_train),
        ("\nValidation set has {} samples.", sample_count_val, count_per_category_val)):
    print(header.format(subset_total))
    for key in categories:
        print("'%s':\t%d\t(%.2f%%)" % (key, subset_counts[key], subset_counts[key]*100/subset_total))

### Samples of Each Category

In [None]:
from IPython.display import Image

category_groups = df_train.groupby('category')

# Number of sample images shown for each category
num_per_category = 3

# One row per category, one column per sample image; ticks are hidden
fig, axes = plt.subplots(nrows=known_category_num, ncols=num_per_category, figsize=(9, 24))
plt.setp(fig.get_axes(), xticks=[], yticks=[])
fig.patch.set_facecolor('white')

for idx, val in enumerate(categories):
    samples = category_groups.get_group(val).head(num_per_category)
    for i, (_, row) in enumerate(samples.iterrows()):
        ax = axes[idx, i]
        ax.imshow(plt.imread(row['path']))
        ax.set_xlabel(row['image'])
        # Label each row with its category on the first column only.
        # (Axes.is_first_col() was removed from recent Matplotlib releases,
        # so the column index is checked directly.)
        if i == 0:
            ax.set_ylabel(val, fontsize=20)
            ax.yaxis.label.set_color('blue')

fig.tight_layout()

### Pre-process the Data

In [None]:
# from keras.preprocessing import image                  
# from tqdm import tqdm

# def path_to_tensor(img_path, size=(224, 224)):
#     # loads RGB image as PIL.Image.Image type
#     img = image.load_img(img_path, target_size=size)
#     # convert PIL.Image.Image type to 3D tensor with shape (224, 224, 3)
#     x = image.img_to_array(img)
#     # convert 3D tensor to 4D tensor with shape (1, 224, 224, 3) and return 4D tensor
#     return np.expand_dims(x, axis=0)

# def paths_to_tensor(img_paths, size=(224, 224)):
#     list_of_tensors = [path_to_tensor(img_path, size) for img_path in tqdm(img_paths)]
#     return np.vstack(list_of_tensors)

In [None]:
# Convert the training and validation image paths to 4D tensors (pixel rescaling by 1/255 is done later by ImageDataGenerator).
# X_train_tensors = paths_to_tensor(X_train['path'])
# X_val_tensors = paths_to_tensor(X_val['path'])

### Evaluation Metrics

In [None]:
# Ref: https://stackoverflow.com/a/54620037/2437361
import keras.backend as K

def balanced_accuracy(y_true, y_pred):
    """
    Computes the average per-class recall metric for a multi-class classification problem.

    A sample counts as a true positive for its class when that class has the
    highest predicted score (argmax), regardless of how confident the
    prediction is. Thresholding the true-class probability at 0.5 instead
    would miss correct predictions made with lower confidence.

    # Arguments
        y_true: one-hot encoded ground-truth labels, one row per sample.
        y_pred: predicted class scores with the same layout as `y_true`;
            the size of its last axis must be statically known.

    # Returns
        Scalar tensor: mean over all classes of the recall computed on the
        given samples. A class with no samples in `y_true` contributes a
        recall of 0 to the mean.
    """
    num_classes = K.int_shape(y_pred)[-1]
    # One-hot encoding of the predicted class of every sample
    predicted_one_hot = K.one_hot(K.argmax(y_pred, axis=-1), num_classes)
    actual_one_hot = K.round(K.clip(y_true, 0, 1))

    true_positives = K.sum(actual_one_hot * predicted_one_hot, axis=0)
    possible_positives = K.sum(actual_one_hot, axis=0)
    # epsilon guards against division by zero for classes without samples
    recall = true_positives / (possible_positives + K.epsilon())
    balanced_recall = K.mean(recall)
    return balanced_recall

### Create a vanilla CNN as benchmark model

In [None]:
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Dropout, Flatten, Dense
from keras.models import Sequential
from keras.optimizers import Adam

# Hyper-parameters
input_size = (224, 224)  # (height, width) the images are resized to
batch_size = 40
lr_start = 1e-3  # Starting learning rate

# Both generators only rescale pixel values by 1/255.
# Augmented alternative for the training data:
# datagen_train = ImageDataGenerator(rescale=1./255, shear_range=0.2, zoom_range=0.2, horizontal_flip=True)
datagen_train = ImageDataGenerator(rescale=1./255)
datagen_val = ImageDataGenerator(rescale=1./255)

# Settings shared by the training and validation flows
flow_kwargs = dict(
    x_col='path',
    y_col='category',
    class_mode='categorical',
    target_size=input_size,
    batch_size=batch_size,
    seed=seed,
)
generator_train = datagen_train.flow_from_dataframe(dataframe=df_train, **flow_kwargs)
generator_val = datagen_val.flow_from_dataframe(dataframe=df_val, **flow_kwargs)

# Vanilla CNN: three conv/max-pool stages (32, 64, 128 filters), dropout,
# global average pooling and a softmax layer with one unit per known category.
model = Sequential([
    Conv2D(filters=32, kernel_size=3, padding='same', activation='relu',
           input_shape=input_size + (3,)),
    MaxPooling2D(pool_size=2),
    Conv2D(filters=64, kernel_size=3, padding='same', activation='relu'),
    MaxPooling2D(pool_size=2),
    Conv2D(filters=128, kernel_size=3, padding='same', activation='relu'),
    MaxPooling2D(pool_size=2),
    Dropout(rate=0.3),
    GlobalAveragePooling2D(),
    Dense(known_category_num, activation='softmax'),
])

model.summary()

# Compile with Adam and categorical cross-entropy; report balanced accuracy and plain accuracy
model.compile(
    optimizer=Adam(lr=lr_start),
    loss='categorical_crossentropy',
    metrics=[balanced_accuracy, 'accuracy'],
)

#### Class Weights

In [None]:
from sklearn.utils import class_weight

# Compute 'balanced' class weights to compensate for the imbalanced data.
# Keyword arguments are used because recent scikit-learn releases no longer
# accept `classes` and `y` positionally.
class_labels = np.unique(df_train['category'])
class_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=class_labels, y=df_train['category'])

# Mapping from class names to the class indices used by the training generator
categories_to_indices = generator_train.class_indices

# Key every weight by the generator's index for that class, so the weights
# stay aligned with the one-hot targets whatever order the generator chose.
class_weight_dict = {
    categories_to_indices[label]: weight
    for label, weight in zip(class_labels, class_weights)
}
print('Class Weights:')
print(class_weight_dict)

print('\nCategories to Indices:')
print(categories_to_indices)
# print(generator_train.classes)

#### Train the vanilla CNN

In [None]:
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, CSVLogger
from keras_tqdm import TQDMNotebookCallback

# Keep only the best model seen so far. "Best" is judged on the validation
# balanced accuracy (higher is better), consistent with the other callbacks,
# which also monitor validation quantities rather than training ones.
os.makedirs('saved_models', exist_ok=True)
checkpointer = ModelCheckpoint(
    filepath='saved_models/vanilla.weights.best.hdf5',
    monitor='val_balanced_accuracy',
    mode='max',
    verbose=1,
    save_best_only=True)

# Reduce learning rate when the validation loss has stopped improving.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, min_lr=1e-5, verbose=1)

# Stop training when the validation loss has stopped improving.
# NOTE: with epoch_num = 10 below, a patience of 22 epochs can never be reached.
early_stop = EarlyStopping(monitor='val_loss', patience=22, verbose=1, restore_best_weights=True)

# Callback that streams epoch results to a csv file.
os.makedirs('logs', exist_ok=True)
csv_logger = CSVLogger('logs/vanilla.training.log')

epoch_num = 10

# len(generator) is the number of batches per pass including the last partial
# one, so every training and validation sample is used in each epoch.
history = model.fit_generator(
    generator_train,
    class_weight=class_weight_dict,
    steps_per_epoch=len(generator_train),
    epochs=epoch_num,
    verbose=0,  # progress is reported by the TQDM callback instead
    callbacks=[checkpointer, reduce_lr, early_stop, csv_logger, TQDMNotebookCallback(leave_inner=True, leave_outer=True)],
    validation_data=generator_val,
    validation_steps=len(generator_val))