### Import Library

In [1]:
%matplotlib inline
# python libraties
import os, cv2,itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
from PIL import Image

# pytorch libraries
import torch
from torch import optim,nn
from torch.autograd import Variable
from torch.utils.data import DataLoader,Dataset
from torchvision import models,transforms

# sklearn libraries
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# to make the results are reproducible
np.random.seed(10)
torch.manual_seed(10)
torch.cuda.manual_seed(10)

### EDA

In [2]:
# Image Path
train_image_path = glob('./Dataset/Train/*.jpg')
train_path_dict = {os.path.splitext(os.path.basename(x))[0]: x for x in train_image_path}

val_image_path = glob('./Dataset/Val/*.jpg')
val_path_dict = {os.path.splitext(os.path.basename(x))[0]: x for x in val_image_path}

test_image_path = glob('./Dataset/Test/*.jpg')
test_path_dict = {os.path.splitext(os.path.basename(x))[0]: x for x in test_image_path}

all_image_path = dict(train_path_dict, **val_path_dict, **test_path_dict)

# For Label
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'dermatofibroma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

def define_split(x):
    if 'Train' in x:
        return 'Train'
    elif 'Val' in x:
        return 'Val'
    else:
        return 'Test'

# MetaData
df_original = pd.read_csv(os.path.join('./Dataset', 'Metadata.csv'))
df_original['path'] = df_original['image_id'].map(all_image_path.get)
df_original['cell_type'] = df_original['dx'].map(lesion_type_dict.get)
df_original['cell_type_idx'] = pd.Categorical(df_original['cell_type']).codes
df_original['split'] = df_original['path'].apply(lambda x:define_split(x))
df_original.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx,split
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,./Dataset/Train/ISIC_0027419.jpg,Benign keratosis-like lesions,2,Train
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,./Dataset/Val/ISIC_0025030.jpg,Benign keratosis-like lesions,2,Val
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,./Dataset/Train/ISIC_0026769.jpg,Benign keratosis-like lesions,2,Train
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,./Dataset/Train/ISIC_0025661.jpg,Benign keratosis-like lesions,2,Train
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,./Dataset/Train/ISIC_0031633.jpg,Benign keratosis-like lesions,2,Train


**Split Train, Val, Test**

In [3]:
# Split df -> Train, Val, Test
df_train = df_original[df_original['split'] == 'Train'].copy()
df_val = df_original[df_original['split'] == 'Val'].copy()
df_test = df_original[df_original['split'] == 'Test'].copy()

**Check Imbalanced Dataset**

In [4]:
df_train['cell_type_idx'].value_counts()

4    4018
2     686
6     660
1     302
0     202
5      84
3      57
Name: cell_type_idx, dtype: int64

In [5]:
df_train['cell_type'].value_counts()

Melanocytic nevi                  4018
Benign keratosis-like lesions      686
dermatofibroma                     660
Basal cell carcinoma               302
Actinic keratoses                  202
Vascular lesions                    84
Dermatofibroma                      57
Name: cell_type, dtype: int64

**Oversampling**

In [6]:
data_aug_rate = [int(4018/202), int(4018/302),int(4018/686), int(4018/57), 0, int(4018/84), int(4018/660)]
for i in range(7):
    if data_aug_rate[i]:
        df_train=df_train.append([df_train.loc[df_train['cell_type_idx'] == i,:]]*(data_aug_rate[i]-1), ignore_index=True)
df_train['cell_type'].value_counts()

Melanocytic nevi                  4018
Dermatofibroma                    3990
dermatofibroma                    3960
Vascular lesions                  3948
Basal cell carcinoma              3926
Actinic keratoses                 3838
Benign keratosis-like lesions     3430
Name: cell_type, dtype: int64

**Reset Index**

In [7]:
df_train = df_train.reset_index()
df_val = df_val.reset_index()
df_test = df_test.reset_index()

### Make DataLoader

In [8]:
# Parameter => Equal to Segmentation_Model
input_size = 224
norm_mean = [0.76304215, 0.5456439, 0.5700431]
norm_std = [0.14092751, 0.15261441, 0.16997588]

# Transform
# Train Transform
train_transform = transforms.Compose([transforms.Resize((input_size,input_size)),transforms.RandomHorizontalFlip(),
                                      transforms.RandomVerticalFlip(),transforms.RandomRotation(20),
                                      transforms.ColorJitter(brightness=0.1, contrast=0.1, hue=0.1),
                                      transforms.ToTensor(), transforms.Normalize(norm_mean, norm_std)])

# Validation & Test Transform
val_transform = transforms.Compose([transforms.Resize((input_size,input_size)),
                                    transforms.ToTensor(), transforms.Normalize(norm_mean, norm_std)])

# Dataset
class HAM10000(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Load data and get label
        X = Image.open(self.df['path'][index])
        y = torch.tensor(int(self.df['cell_type_idx'][index]))

        if self.transform:
            X = self.transform(X)

        return X, y
    
# DataLoader
training_set = HAM10000(df_train, transform=train_transform)
train_loader = DataLoader(training_set, batch_size=32, shuffle=True, num_workers=4)

validation_set = HAM10000(df_val, transform=val_transform)
val_loader = DataLoader(validation_set, batch_size=32, shuffle=False, num_workers=4)

test_set = HAM10000(df_test, transform=val_transform)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False, num_workers=4)

### Define Classification Model

In [9]:
device = torch.device('cuda:7' if torch.cuda.is_available() else 'cpu')

# DenseNet -> Output:7
model_ft = models.densenet121(pretrained=True)
# set_parameter_requires_grad(model_ft, feature_extract)
num_ftrs = model_ft.classifier.in_features
model_ft.classifier = nn.Linear(num_ftrs, 7)
model = model_ft.to(device)

In [10]:
model

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

### Train Model

**Metric**

In [11]:
class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

**Train**

In [12]:
def train(train_loader, model, criterion, optimizer, epoch):
    model.train()
    train_loss = AverageMeter()
    train_acc = AverageMeter()
    curr_iter = (epoch - 1) * len(train_loader)
    for i, data in enumerate(train_loader):
        images, labels = data
        N = images.size(0)
        # print('image shape:',images.size(0), 'label shape',labels.size(0))
        images = Variable(images).to(device)
        labels = Variable(labels).to(device)

        optimizer.zero_grad()
        outputs = model(images)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        prediction = outputs.max(1, keepdim=True)[1]
        train_acc.update(prediction.eq(labels.view_as(prediction)).sum().item()/N)
        train_loss.update(loss.item())
        curr_iter += 1
        if (i + 1) % 100 == 0:
            print('[epoch %d], [iter %d / %d], [train loss %.5f], [train acc %.5f]' % (
                epoch, i + 1, len(train_loader), train_loss.avg, train_acc.avg))
            total_loss_train.append(train_loss.avg)
            total_acc_train.append(train_acc.avg)
    return train_loss.avg, train_acc.avg

**Validation**

In [13]:
def validate(val_loader, model, criterion, optimizer, epoch):
    model.eval()
    val_loss = AverageMeter()
    val_acc = AverageMeter()
    with torch.no_grad():
        for i, data in enumerate(val_loader):
            images, labels = data
            N = images.size(0)
            images = Variable(images).to(device)
            labels = Variable(labels).to(device)

            outputs = model(images)
            prediction = outputs.max(1, keepdim=True)[1]

            val_acc.update(prediction.eq(labels.view_as(prediction)).sum().item()/N)

            val_loss.update(criterion(outputs, labels).item())

    print('------------------------------------------------------------')
    print('[epoch %d], [val loss %.5f], [val acc %.5f]' % (epoch, val_loss.avg, val_acc.avg))
    print('------------------------------------------------------------')
    return val_loss.avg, val_acc.avg

**Model Train & Save Best Model**

In [14]:
# Optimizer & LossFunction
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss().to(device)

epoch_num = 10
best_val_acc = 0

total_loss_train, total_acc_train = [],[]
total_loss_val, total_acc_val = [],[]

for epoch in range(1, epoch_num+1):
    loss_train, acc_train = train(train_loader, model, criterion, optimizer, epoch)
    loss_val, acc_val = validate(val_loader, model, criterion, optimizer, epoch)
    total_loss_val.append(loss_val)
    total_acc_val.append(acc_val)
    if acc_val > best_val_acc:
        best_val_acc = acc_val
        print('*****************************************************')
        print('best record: [epoch %d], [val loss %.5f], [val acc %.5f]' % (epoch, loss_val, acc_val))
        print('*****************************************************')
        torch.save(model.state_dict(), './PreTrainModel/base_clf_model.pt')

[epoch 1], [iter 100 / 848], [train loss 1.26763], [train acc 0.53375]
[epoch 1], [iter 200 / 848], [train loss 1.12787], [train acc 0.58484]
[epoch 1], [iter 300 / 848], [train loss 1.04659], [train acc 0.61667]
[epoch 1], [iter 400 / 848], [train loss 0.98276], [train acc 0.63648]
[epoch 1], [iter 500 / 848], [train loss 0.94214], [train acc 0.65000]
[epoch 1], [iter 600 / 848], [train loss 0.90498], [train acc 0.66370]
[epoch 1], [iter 700 / 848], [train loss 0.86608], [train acc 0.67746]
[epoch 1], [iter 800 / 848], [train loss 0.83814], [train acc 0.68750]
------------------------------------------------------------
[epoch 1], [val loss 0.65010], [val acc 0.76068]
------------------------------------------------------------
*****************************************************
best record: [epoch 1], [val loss 0.65010], [val acc 0.76068]
*****************************************************
[epoch 2], [iter 100 / 848], [train loss 0.60030], [train acc 0.77938]
[epoch 2], [iter 200

### Model Test

**Load PreTrain Model**

In [16]:
# DenseNet -> Output:7
model_ft = models.densenet121(pretrained=True)
# set_parameter_requires_grad(model_ft, feature_extract)
num_ftrs = model_ft.classifier.in_features
model_ft.classifier = nn.Linear(num_ftrs, 7)
model = model_ft.to(device)
# Load PreTrain Model
model.load_state_dict(torch.load('./PreTrainModel/base_clf_model.pt'))

<All keys matched successfully>

**Test**

In [17]:
model.eval()
y_label = []
y_predict = []
with torch.no_grad():
    for i, data in enumerate(test_loader):
        images, labels = data
        N = images.size(0)
        images = Variable(images).to(device)
        outputs = model(images)
        prediction = outputs.max(1, keepdim=True)[1]
        y_label.extend(labels.cpu().numpy())
        y_predict.extend(np.squeeze(prediction.cpu().numpy().T))

**Report of Test Performance**

In [19]:
# Generate a classification report
plot_labels = ['akiec', 'bcc', 'bkl', 'df', 'nv', 'vasc','mel']

report = classification_report(y_label, y_predict, target_names=plot_labels)
print(report)

              precision    recall  f1-score   support

       akiec       0.53      0.69      0.60        61
         bcc       0.87      0.58      0.69       113
         bkl       0.64      0.60      0.62       213
          df       0.94      0.65      0.77        26
          nv       0.92      0.85      0.88      1340
        vasc       0.91      0.78      0.84        27
         mel       0.41      0.69      0.52       223

    accuracy                           0.78      2003
   macro avg       0.75      0.69      0.70      2003
weighted avg       0.82      0.78      0.79      2003

