# Skin Lesion Classifier - Approach 2

## Google Colab

In [None]:
# Colab only: mount Google Drive, then work from the project folder stored on it
import os

from google.colab import drive

drive.mount('/content/drive/')
# !ls '/content/drive/My Drive/Colab Notebooks'
os.chdir('/content/drive/My Drive/Colab Notebooks/isic-2019')

In [None]:
# !cp '/content/drive/My Drive/Colab Notebooks/ISIC_2019_Training_Input.zip' '/home/ISIC_2019_Training_Input.zip'
# !cp '/content/drive/My Drive/Colab Notebooks/ISIC_2019_Training_GroundTruth.csv' '/home/ISIC_2019_Training_GroundTruth.csv'
# !cp '/content/drive/My Drive/Colab Notebooks/Out_Distribution.zip' '/home/Out_Distribution.zip'
# !unzip -qq '/home/ISIC_2019_Training_Input.zip' -d '/home'
# !unzip -qq '/home/Out_Distribution.zip' -d '/home'
# !cp '/content/drive/My Drive/Colab Notebooks/ISIC_2019_Test_Input.zip' '/home/ISIC_2019_Test_Input.zip'
# !unzip -qq '/home/ISIC_2019_Test_Input.zip' -d '/home'

In [None]:
# # Ref https://docs.fast.ai/performance.html
# !pip3 uninstall -y pillow pil jpeg libtiff libjpeg-turbo
# !CFLAGS="${CFLAGS} -mavx2" pip3 install --upgrade --no-cache-dir --force-reinstall --no-binary :all: --compile pillow-simd

## Common Parameters

In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Root folder containing the ISIC 2019 inputs (image folders and ground-truth CSV are
# resolved relative to it). Enable the alternative that matches the current machine.
# Written as a raw string: '\I' is not a valid escape sequence, so a plain literal
# triggers an invalid-escape warning on current Python versions.
data_folder = r'C:\ISIC_2019'
# data_folder = '/home'
# data_folder = '/home/jupyter'

# Folder names used by this approach ("_2" suffix = Approach 2)
model_folder = 'models_2'
history_folder = 'history_2'
pred_result_folder = 'predict_results_2'

# How to handle SVG fonts: 'none' leaves text as SVG <text> elements instead of
# converting glyphs to paths (see matplotlib's `svg.fonttype` rcParam).
plt.rcParams['svg.fonttype'] = 'none'

## Import Training Data and Out-of-Distribution Dataset

* All images of the out-of-distribution dataset are regarded as the unknown category.
* Two duplicate images (ISIC_0069013, ISIC_0067980) of the training data are removed.

In [None]:
from collections import Counter
from data import load_isic_training_and_out_dist_data
from visuals import autolabel

# Input locations, all relative to data_folder
training_image_folder = os.path.join(data_folder, 'ISIC_2019_Training_Input')
ground_truth_file = os.path.join(data_folder, 'ISIC_2019_Training_GroundTruth_DuplicateRemoved.csv')
out_dist_image_folder = os.path.join(data_folder, 'Out_Distribution')

df_ground_truth, category_names = load_isic_training_and_out_dist_data(
    training_image_folder,
    ground_truth_file,
    out_dist_image_folder,
)

category_num = len(category_names)
print("Number of categories: {}".format(category_num))
print(category_names, '\n')

# Category name -> position in category_names
print('Category to Index:')
category_to_index = {name: idx for idx, name in enumerate(category_names)}
print(category_to_index, '\n')

# The 'category' column is looked up by integer index below, not by category name
count_per_category = Counter(df_ground_truth['category'])
total_sample_count = sum(count_per_category.values())
print("Original training data has {} samples.".format(total_sample_count))
for idx, name in enumerate(category_names):
    n_samples = count_per_category[idx]
    print("'%s':\t%d\t(%.2f%%)" % (name, n_samples, n_samples*100/total_sample_count))

# Bar chart of the number of images per category
fig, ax = plt.subplots(figsize=(8, 5))
fig.patch.set_facecolor('white')
ax.set(xlabel='Category', ylabel='Number of Images')
rects = ax.bar(category_names, [count_per_category[idx] for idx in range(category_num)])
autolabel(ax, rects)
fig.tight_layout()

# Keep this expression last so the notebook renders the preview table
df_ground_truth.head()

### Shuffle and Split Original Training Data into Training and Validation Sets

In [None]:
from data import train_validation_split
from visuals import plot_grouped_2bars

df_train, df_val = train_validation_split(df_ground_truth)

# Size and per-category counts of each split
sample_count_train = len(df_train)
sample_count_val = len(df_val)
count_per_category_train = Counter(df_train['category'])
count_per_category_val = Counter(df_val['category'])

# Print the same summary for both splits: training first, then validation
split_summaries = (
    ("Training set has {} samples.", sample_count_train, count_per_category_train),
    ("\nValidation set has {} samples.", sample_count_val, count_per_category_val),
)
for heading, split_size, split_counts in split_summaries:
    print(heading.format(split_size))
    for idx, name in enumerate(category_names):
        n_samples = split_counts[idx]
        print("'%s':\t%d\t(%.2f%%)" % (name, n_samples, n_samples*100/split_size))

# Side-by-side bars comparing the category distribution of the two splits
plot_grouped_2bars(
    scalars=[
        [count_per_category_train[idx] for idx in range(category_num)],
        [count_per_category_val[idx] for idx in range(category_num)],
    ],
    scalarlabels=['Training', 'Validation'],
    xticklabels=category_names,
    xlabel='Category',
    ylabel='Number of Images',
    title='Distribution of Training and Validation Sets'
)

### Class Weights based on the Training Set

In [None]:
from data import compute_class_weight_dict

# class_weight_dict is indexed by category index below; class_weights is reused
# later for the weighted cross-entropy
class_weight_dict, class_weights = compute_class_weight_dict(df_train)
print('Class Weights Dictionary:')
print(class_weight_dict)

# Bar chart of the weight assigned to each category
fig, ax = plt.subplots(figsize=(8, 5))
fig.patch.set_facecolor('white')
ax.set(title='Class Weights', xlabel='Category', ylabel='Weight')
ax.bar(category_names, [class_weight_dict[idx] for idx in range(category_num)]);

### Per-channel Mean and Standard Deviation over the Training Set

In [None]:
from utils import calculate_mean_std

# Calculate the per-channel mean and standard deviation over the training set.
# NOTE(review): presumably this reads every training image and may be slow; comment
# out the two lines below to skip it and rely on the recorded output instead.
rgb_mean, rgb_std = calculate_mean_std(df_train['path'])
print("Mean:{}\nSTD:{}".format(rgb_mean, rgb_std))

# Output was:
# Mean:[0.6296238064420809, 0.5202302775509949, 0.5032952297664738]
# STD:[0.24130893564897463, 0.22150225707876617, 0.2297057828857888]

## Transfer Learning

### Train Models by Transfer Learning

In [None]:
# NOTE(review): the data folder (/home) and model folder (models_2) are hard-coded here
# instead of using data_folder / model_folder from the "Common Parameters" cell, and
# data_folder currently points elsewhere — keep them in sync manually before running.
!python3 main.py /home --approach 2 --modelfolder models_2 --training --epoch 100 --batchsize 32 --maxqueuesize 10 --model DenseNet201 Xception ResNeXt50

### Complexity Graph of Transfer Learning Models

In [None]:
# Import only the name this cell uses; a wildcard import hides where names come from
# and can silently shadow notebook variables.
from visuals import plot_complexity_graph

model_names = ['DenseNet201', 'Xception', 'ResNeXt50']
# Forwarded to plot_complexity_graph.
# NOTE(review): presumably the number of initial feature-extraction epochs — confirm in visuals.
feature_extract_epochs = 3

for model_name in model_names:
    file_path = os.path.join(history_folder, "{}.training.csv".format(model_name))
    # Models without a training history file are skipped silently
    if not os.path.exists(file_path):
        continue

    fig = plot_complexity_graph(csv_file=file_path,
                                title="Complexity Graph of {}".format(model_name),
                                figsize=(14, 10),
                                feature_extract_epochs=feature_extract_epochs)
    # Save the figure as SVG next to the history CSV
    fig.savefig(os.path.join(history_folder, "{}.training.svg".format(model_name)), format='svg',
                bbox_inches='tight', pad_inches=0)

## Predict Validation Set

### Predict Validation Set by Different Models

In [None]:
# !python3 main.py /home --approach 2 --modelfolder models_2 --predval --predresultfolder predict_results_2 --model Xception DenseNet201 ResNeXt50
!python main.py C:\ISIC_2019 --approach 2 --modelfolder models_2 --predval --predresultfolder predict_results_2 --model Xception DenseNet201 ResNeXt50

### Ensemble Models' Predictions on Validation Set

In [None]:
from utils import ensemble_predictions

# Ensemble the models' validation predictions stored in pred_result_folder.
# NOTE(review): presumably this writes the "Ensemble_*.csv" files that the next cell
# looks for — confirm in utils.ensemble_predictions.
ensemble_predictions(pred_result_folder, category_names)

### Load Prediction Results on Validation Set

In [None]:
import pandas as pd
from sklearn.metrics import balanced_accuracy_score, recall_score
from visuals import plot_confusion_matrix
# `keras.utils.np_utils` cannot be imported on recent Keras releases; `to_categorical`
# is exported directly from `keras.utils` on both old and new versions.
from keras.utils import to_categorical
from keras_numpy_backend import categorical_crossentropy

model_names = ['DenseNet201', 'Xception', 'ResNeXt50', 'Ensemble']
# Suffix of the prediction files to evaluate: "<model>_<postfix>.csv"
postfix = 'best_balanced_acc'
print('Model selection criteria: ', postfix)

for model_name in model_names:
    # Load predicted results; models without a prediction file are skipped silently
    file_path = os.path.join(pred_result_folder, "{}_{}.csv".format(model_name, postfix))
    # file_path = os.path.join(pred_result_folder, "{}_best_loss.csv".format(model_name))
    if not os.path.exists(file_path):
        continue

    print("========== {} ==========".format(model_name))
    df = pd.read_csv(file_path)
    y_true = df['category']
    y_pred = df['pred_category']

    # Compute Balanced Accuracy
    print('balanced_accuracy_score: ', balanced_accuracy_score(y_true, y_pred))
    print('macro recall_score: ', recall_score(y_true, y_pred, average='macro'))

    # Compute categorical_crossentropy
    y_true_onehot = to_categorical(df['category'], num_classes=category_num)
    # NOTE(review): assumes columns 1..category_num hold the per-category predicted
    # probabilities in category_names order — confirm against the CSV writer.
    y_pred_onehot = np.array(df.iloc[:, 1:1 + category_num])
    print('categorical_crossentropy: ',
          np.average(categorical_crossentropy(y_true_onehot, y_pred_onehot)))

    # Compute weighted categorical_crossentropy (class_weights come from the training set)
    print('weighted categorical_crossentropy: ',
          np.average(categorical_crossentropy(y_true_onehot, y_pred_onehot, class_weights=class_weights)))

    # Confusion Matrix
    fig = plot_confusion_matrix(y_true, y_pred, category_names, normalize=True,
                                title="Confusion Matrix of {}".format(model_name),
                                figsize=(8, 6))
    print('')

In [None]:
from collections import Counter
from visuals import plot_grouped_2bars

# Load the available prediction files here instead of relying on `y_true` left over
# from the loop in the previous cell, which is undefined when no file exists.
predictions = {}
for model_name in model_names:
    file_path = os.path.join(pred_result_folder, "{}_{}.csv".format(model_name, postfix))
    if os.path.exists(file_path):
        predictions[model_name] = pd.read_csv(file_path)

if not predictions:
    print("No prediction results found in '{}'.".format(pred_result_folder))
else:
    # Ground truth is read from the last available file.
    # NOTE(review): assumes every prediction file covers the same validation samples — confirm.
    y_true = list(predictions.values())[-1]['category']
    sample_count_val = y_true.shape[0]
    print("Validation set has {} samples.\n".format(sample_count_val))

    print('========== Ground Truth ==========')
    count_true = Counter(y_true)
    for i, c in enumerate(category_names):
        print("'%s':\t%d\t(%.2f%%)" % (c, count_true[i], count_true[i]*100/sample_count_val))

    for model_name, df in predictions.items():
        print("\n========== {} Prediction ==========".format(model_name))
        y_pred = df['pred_category']

        count_pred = Counter(y_pred)
        for i, c in enumerate(category_names):
            print("'%s':\t%d\t(%.2f%%)" % (c, count_pred[i], count_pred[i]*100/sample_count_val))

        # Plot Prediction Distribution against the ground truth
        plot_grouped_2bars(
            scalars=[[count_true[i] for i in range(category_num)],
                     [count_pred[i] for i in range(category_num)]],
            scalarlabels=['Ground Truth', 'Prediction'],
            xticklabels=category_names,
            xlabel='Category',
            ylabel='Number of Images',
            title="Prediction Distribution of {}".format(model_name)
        )

## Test Data

### Predict Test Data by Different Models

In [None]:
# NOTE(review): the data folder (/home) and model folder (models_2) are hard-coded here
# instead of using data_folder / model_folder from the "Common Parameters" cell, and
# data_folder currently points elsewhere — keep them in sync manually before running.
!python3 main.py /home --approach 2 --modelfolder models_2 --predtest --predresultfolder test_predict_results_2 --model DenseNet201 Xception ResNeXt50