# Imports



In [1]:
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
import seaborn as sns
import cv2
from keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Dropout, Input, BatchNormalization, Activation, concatenate
from tensorflow.keras.models import Model
from keras import Sequential
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import tensorflow

# Global functions

In [130]:
def output_metrics(model, test_data, test_labels):
  predictions = model.predict(test_data)

  y_pred  = np.argmax(predictions, axis=-1)

  print(f'Accuracy score: {accuracy_score(test_labels, y_pred)}')
  print(f'F1 score: {f1_score(test_labels, y_pred, average="weighted")}')
  print(f'Precision score: {precision_score(test_labels, y_pred, average="weighted")}')
  print(f'Recall score: {recall_score(test_labels, y_pred, average="weighted")}')

In [3]:
def read_and_resize_image(image_path, size):
    img = cv2.imread(image_path)
    img = cv2.resize(img, size)
    return img

# Loading dataset

In [4]:
!pip install -q kaggle
from google.colab import files 

In [5]:
files.upload()

Saving kaggle.json to kaggle (4).json


{'kaggle.json': b'{"username":"johnk27","key":"00b3f1fe3fa0259eb805e8fe2dd1eb02"}'}

In [6]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [7]:
!cp kaggle.json ~/.kaggle/

In [8]:
!chmod 600 ~/.kaggle/kaggle.json

In [9]:
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000

Downloading skin-cancer-mnist-ham10000.zip to /content
100% 5.18G/5.20G [00:34<00:00, 308MB/s]
100% 5.20G/5.20G [00:34<00:00, 162MB/s]


In [27]:
!unzip -q skin-cancer-mnist-ham10000.zip -d content

unzip:  cannot find or open skin-cancer-mnist-ham10000.zip, skin-cancer-mnist-ham10000.zip.zip or skin-cancer-mnist-ham10000.zip.ZIP.


In [28]:
# Removing the zip to save space
!rm skin-cancer-mnist-ham10000.zip

rm: cannot remove 'skin-cancer-mnist-ham10000.zip': No such file or directory


In [29]:
df = pd.read_csv('content/HAM10000_metadata.csv')

In [30]:
print(df)

         lesion_id      image_id     dx dx_type   age     sex localization
0      HAM_0000118  ISIC_0027419    bkl   histo  80.0    male        scalp
1      HAM_0000118  ISIC_0025030    bkl   histo  80.0    male        scalp
2      HAM_0002730  ISIC_0026769    bkl   histo  80.0    male        scalp
3      HAM_0002730  ISIC_0025661    bkl   histo  80.0    male        scalp
4      HAM_0001466  ISIC_0031633    bkl   histo  75.0    male          ear
...            ...           ...    ...     ...   ...     ...          ...
10010  HAM_0002867  ISIC_0033084  akiec   histo  40.0    male      abdomen
10011  HAM_0002867  ISIC_0033550  akiec   histo  40.0    male      abdomen
10012  HAM_0002867  ISIC_0033536  akiec   histo  40.0    male      abdomen
10013  HAM_0000239  ISIC_0032854  akiec   histo  80.0    male         face
10014  HAM_0003521  ISIC_0032258    mel   histo  70.0  female         back

[10015 rows x 7 columns]


In [31]:
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [32]:
df.describe()

Unnamed: 0,age
count,9958.0
mean,51.863828
std,16.968614
min,0.0
25%,40.0
50%,50.0
75%,65.0
max,85.0


In [33]:
df.localization.unique()

array(['scalp', 'ear', 'face', 'back', 'trunk', 'chest',
       'upper extremity', 'abdomen', 'unknown', 'lower extremity',
       'genital', 'neck', 'hand', 'foot', 'acral'], dtype=object)

# Pre-processing

In [34]:
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Bening keratosis-like lesions',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}
raw_images = 'content'

In [35]:
from glob import glob

imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                     for x in glob(os.path.join(raw_images, '*', '*.jpg'))}

In [36]:
print(os.path.join(raw_images, '*', '*.jpg'))

content/*/*.jpg


In [37]:
df['path'] = df['image_id'].map(imageid_path_dict.get)
df['cell_type'] = df['dx'].map(lesion_type_dict.get)
df['cell_type_idx'] = pd.Categorical(df['cell_type']).codes

In [38]:
df.isna().sum()

lesion_id         0
image_id          0
dx                0
dx_type           0
age              57
sex               0
localization      0
path              0
cell_type         0
cell_type_idx     0
dtype: int64

In [39]:
df = df.dropna()

In [42]:
image_size = (64, 64)

In [43]:
# Apply the function to each image path in the 'path' column of the dataframe
df['image'] = df['path'].apply(lambda x: read_and_resize_image(x, image_size))

In [45]:
df['image'].map(lambda x: x.shape).value_counts()

(64, 64, 3)    9958
Name: image, dtype: int64

In [46]:
from sklearn.model_selection import train_test_split
# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['cell_type_idx'], random_state=42)

In [47]:
train_df = train_df.reset_index()  # make sure indexes pair with number of rows

In [48]:
print(train_df)

      index    lesion_id      image_id     dx    dx_type   age     sex  \
0      5649  HAM_0003963  ISIC_0027448     nv  follow_up  45.0  female   
1        29  HAM_0001480  ISIC_0031753    bkl      histo  70.0    male   
2      8367  HAM_0000597  ISIC_0030654     nv      histo  35.0  female   
3      9841  HAM_0002615  ISIC_0033413  akiec      histo  70.0    male   
4      1429  HAM_0001729  ISIC_0024537    mel      histo  85.0    male   
...     ...          ...           ...    ...        ...   ...     ...   
7961   5022  HAM_0005841  ISIC_0027804     nv  follow_up  50.0  female   
7962   3768  HAM_0007408  ISIC_0028377     nv  follow_up   5.0    male   
7963   9509  HAM_0004705  ISIC_0034260     nv  consensus  65.0    male   
7964   3341  HAM_0001810  ISIC_0025794     nv  follow_up  45.0  female   
7965   3697  HAM_0005218  ISIC_0028939     nv  follow_up  55.0  female   

         localization                                             path  \
0     lower extremity  content/HAM100

In [49]:
train_df['image'].map(lambda x: x.shape).value_counts()

(64, 64, 3)    7966
Name: image, dtype: int64

# Individual

In [104]:
# Extract the image data and target labels for train and test sets
X_train_img = np.stack(train_df['image'].values)
y_train = train_df['cell_type_idx'].values
X_test_img = np.stack(test_df['image'].values)
y_test = test_df['cell_type_idx'].values

In [105]:
def cnn():
  model = Sequential()
  model.add(Conv2D(64, (3, 3), activation='relu', input_shape=(image_size[0], image_size[1], 3)))
  model.add(BatchNormalization())
  model.add(MaxPool2D((2, 2)))
  model.add(BatchNormalization())
  model.add(Conv2D(128, (3, 3), activation='relu'))
  model.add(Dropout(0.6))
  model.add(BatchNormalization())
  model.add(MaxPool2D((2, 2)))
  model.add(BatchNormalization())
  model.add(Flatten())
  model.add(Dense(256, activation='relu'))
  model.add(BatchNormalization())
  model.add(Dropout(0.6))
  model.add(Dense(len(df['cell_type_idx'].unique()), activation='softmax'))
  return model

In [106]:
cnn = cnn()
cnn.compile(optimizer = 'adam', loss= 'sparse_categorical_crossentropy', metrics=['accuracy'])

In [107]:
cnn.fit(X_train_img, y_train, batch_size=256, epochs=30, validation_split=0.3)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f3e629d5d50>

In [108]:
# Use the trained model to make predictions on the test data
output_metrics(cnn, X_test_img, y_test)

Accuracy score: 0.6686746987951807
F1 score: 0.6478484779786376
Precision score: 0.7097094034157636
Recall score: 0.6686746987951807


In [109]:
def mlp():
  model = Sequential()
  model.add(Input(shape=(3,)))
  model.add(Flatten())
  model.add(Dense(128, activation='relu'))
  model.add(BatchNormalization())
  model.add(Dropout(0.5))
  model.add(Dense(64, activation='relu'))
  model.add(BatchNormalization())
  model.add(Dense(len(df['cell_type_idx'].unique()), activation='softmax'))
  return model

In [110]:
mlp = mlp()
mlp.compile(optimizer = 'adam', loss= 'sparse_categorical_crossentropy', metrics=['accuracy'])

In [111]:
sex_dict = {
    'male': 0.0,
    'female': 1.0,
    'unknown': 1.5
}

loc_dict = {
    'back': 1.0,
    'lower extremity': 2.0,
    'trunk': 3.0,
    'upper extremity': 4.0,
    'abdomen': 5.0,
    'face': 6.0,
    'chest': 7.0,
    'foot': 8.0,
    'unknown': 9.0,
    'neck': 10.0,
    'scalp': 11.0,
    'hand': 12.0,
    'ear': 13.0,
    'genital': 14.0,
    'acral': 15.0
}

In [112]:
train_df = train_df.replace({"sex": sex_dict})
train_df = train_df.replace({"localization": loc_dict})
test_df = test_df.replace({"sex": sex_dict})
test_df = test_df.replace({"localization": loc_dict})

In [113]:
#Extract and recombine the demographic data for training set and test set
X_train_demo = np.stack((np.asarray(train_df['age'].values), np.asarray(train_df['sex'].values), np.asarray(train_df['localization'].values)))
y_train = train_df['cell_type_idx'].values
X_test_demo = np.stack((np.asarray(test_df['age'].values), np.asarray(test_df['sex'].values), np.asarray(test_df['localization'].values)))
y_test = test_df['cell_type_idx'].values

X_train_demo=X_train_demo.T.astype(int)
X_test_demo=X_test_demo.T.astype(int)

In [114]:
mlp.fit(X_train_demo, y_train, batch_size=256, epochs=30, validation_split=0.3)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f3e5e9f7940>

In [115]:
# Use the trained model to make predictions on the test data
output_metrics(mlp, X_test_demo, y_test)

Accuracy score: 0.6772088353413654
F1 score: 0.5603128659310151
Precision score: 0.5030501602974999
Recall score: 0.6772088353413654


  _warn_prf(average, modifier, msg_start, len(result))


# Combined 1 - Training the combined model with the previously trained CNN and MLP

In [116]:
combinedInput = concatenate(inputs=[mlp.output, cnn.output])
# our final FC layer head will have two dense layers, the final one
# being our regression head
x = Dense(128, activation="relu")(combinedInput)
x = Dense(64, activation="relu")(x)
x = Dense(len(df['cell_type_idx'].unique()), activation='softmax')(x)
model = Model(inputs=[mlp.input, cnn.input], outputs=x)

model.compile(loss="sparse_categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

model.fit([X_train_demo, X_train_img], y_train, batch_size=256, epochs=30, validation_split=0.3)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f3e5e6b97e0>

In [117]:
output_metrics(model, [X_test_demo, X_test_img], y_test)

Accuracy score: 0.7449799196787149
F1 score: 0.7198458169609947
Precision score: 0.7239105234344776
Recall score: 0.7449799196787149


# Combined 2 - Training the combined model from scratch

In [118]:
def cnn():
  model = Sequential()
  model.add(Conv2D(64, (3, 3), activation='relu', input_shape=(image_size[0], image_size[1], 3)))
  model.add(BatchNormalization())
  model.add(MaxPool2D((2, 2)))
  model.add(BatchNormalization())
  model.add(Conv2D(128, (3, 3), activation='relu'))
  model.add(Dropout(0.6))
  model.add(BatchNormalization())
  model.add(MaxPool2D((2, 2)))
  model.add(BatchNormalization())
  model.add(Flatten())
  model.add(Dense(256, activation='relu'))
  model.add(BatchNormalization())
  model.add(Dropout(0.6))
  model.add(Dense(len(df['cell_type_idx'].unique()), activation='softmax'))
  return model

def mlp():
  model = Sequential()
  model.add(Input(shape=(3,)))
  model.add(Flatten())
  model.add(Dense(128, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(64, activation='relu'))
  model.add(Dense(len(df['cell_type_idx'].unique()), activation='softmax'))
  return model

In [119]:
mlp = mlp()
cnn = cnn()
# create the input to our final set of layers as the *output* of both
# the MLP and CNN
combinedInput = concatenate(inputs=[mlp.output, cnn.output])
# our final FC layer head will have two dense layers, the final one
# being our regression head
x = Dense(128, activation="relu")(combinedInput)
x = Dense(64, activation="relu")(x)
x = Dense(len(df['cell_type_idx'].unique()), activation='softmax')(x)
model = Model(inputs=[mlp.input, cnn.input], outputs=x)

In [121]:
model.compile(loss="sparse_categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

model.fit([X_train_demo, X_train_img], y_train, batch_size=256, epochs=30, validation_split=0.3)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f3e62af7f10>

In [138]:
output_metrics(model, [X_test_demo, X_test_img], y_test)

Accuracy score: 0.6907630522088354
F1 score: 0.6808304521842057
Precision score: 0.7134948433986945
Recall score: 0.6907630522088354


  _warn_prf(average, modifier, msg_start, len(result))
