In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import pathlib
from keras_preprocessing.image import ImageDataGenerator
import cv2
from tensorflow.keras.preprocessing import image
from PIL import Image 
# tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D,BatchNormalization
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.keras import layers
from keras.applications.xception import Xception
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

**Get the path of the image data**

In [2]:
# Directory that is searched for the training images.
dir = pathlib.Path(
    '../input/isic-2019/ISIC_2019_Training_Input'
)

In [3]:
# Collect every .jpg below the image directory (recursively) as plain strings.
# Path.glob() yields entries in a filesystem-dependent order, so the paths are
# sorted to make the row order reproducible between runs and machines.
# NOTE(review): these rows are later paired with the ground-truth CSV purely by
# position -- confirm that the sorted file order matches the CSV row order.
FilePath = pd.Series(sorted(str(p) for p in dir.glob(r'**/*.jpg')), name = 'FilePaths').astype(str)
image_data = pd.DataFrame(FilePath)
image_data

In [4]:
# The path object is no longer needed; removing the name also stops it from
# shadowing the built-in dir().
del dir

**Read target data containing skin cancer type**

In [5]:
# Load the ground-truth table and display it for inspection.
data = pd.read_csv('../input/isic-2019/ISIC_2019_Training_GroundTruth.csv')
data

In [6]:
# Remove the 'image' column from the ground-truth table.
data = data.drop(columns=['image'])

**Create a `label` column to explore the data and the target**


In [7]:
def get_col_name(row):
    """Return the name of the first column whose value in `row` equals 1.

    Intended for use with ``DataFrame.apply(..., axis=1)``: `row` is the
    Series for one record, indexed by column name. The row itself is
    inspected, so the function does not depend on any module-level frame.

    Raises:
        IndexError: if no entry of `row` equals 1.
    """
    flagged = row.index[row == 1]
    if len(flagged) == 0:
        # Same exception type the bare `c[0]` lookup produced, but with context.
        raise IndexError(f"row {row.name!r} has no column equal to 1")
    return flagged[0]

# Add a 'label' column holding, for each row, the name returned by get_col_name.
data = data.assign(label=data.apply(get_col_name, axis=1))

**Concatenate the data (image paths + target)**

In [8]:
# Pair each image path with a ground-truth row. The two frames are combined on
# their index only, so a length mismatch would silently produce NaN-padded
# rows -- fail loudly instead of training on a corrupted table.
if len(image_data) != len(data):
    raise ValueError(
        f"{len(image_data)} image files but {len(data)} ground-truth rows; "
        "cannot pair them by position"
    )
df = pd.concat([image_data, data], axis = 1)
df

In [9]:
# The ground-truth columns now live in `df`; drop the standalone reference.
del data

In [10]:
# The file paths now live in `df`; drop the standalone reference.
del image_data

**Patient metadata containing the age and sex of each patient**

In [11]:
# Load the per-image metadata table and display it for inspection.
num_data = pd.read_csv('../input/isic-2019/ISIC_2019_Training_Metadata.csv')
num_data

In [12]:
# Keep only the metadata columns that are used as model inputs later on.
num_data1 = num_data.drop(columns=['image', 'anatom_site_general', 'lesion_id'])
num_data1

**Impute the missing values with the most frequent value of each column**

In [13]:
# Imputer that replaces missing entries with each column's most frequent value.
SI = SimpleImputer(strategy='most_frequent')

In [14]:
# Learn the fill value for every column of the reduced metadata table.
SI.fit(num_data1)

In [15]:
# Apply the imputation and wrap the result in a DataFrame; its index and
# column labels are restored from num_data1 in the following cells.
num_data2 = pd.DataFrame(SI.transform(num_data1))

In [16]:
# Give the imputed frame the same row index as its source.
num_data2.index = num_data1.index

In [17]:
# Give the imputed frame the same column names as its source.
num_data2.columns = num_data1.columns

In [18]:
# Sanity check: number of missing values left in each column after imputation.
num_data2.isnull().sum()

**encode the sex**

In [19]:
# Encoder used to turn the 'sex' strings into integer codes.
LE = LabelEncoder()

In [20]:
# Encode the 'sex' column; the result is a one-column frame whose column is
# labelled 0 (it is renamed to 'sex' further down).
sex = pd.DataFrame(LE.fit_transform(num_data2['sex']))

In [21]:
# Align the encoded column with the imputed metadata so they can be concatenated.
sex.index = num_data2.index

In [22]:
# Append the encoded column (labelled 0) next to the original metadata columns.
num_data3 = pd.concat([num_data2, sex], axis='columns')
num_data3

In [23]:
# Remove the original string-valued 'sex' column; the encoded one (labelled 0) stays.
num_data4 = num_data3.drop(columns=['sex'])

In [24]:
# Give the encoded column (labelled 0) the name 'sex'.
num_data4= num_data4.rename(columns = {0: 'sex'})
num_data4.head()

In [25]:
# Cast every metadata column to 64-bit floats so it can be fed to the model.
num_data4 = num_data4.astype('float64')
num_data4

In [26]:
# Attach the numeric metadata columns to the image/target table (aligned on index).
df2 = pd.concat([df, num_data4], axis='columns')

**We only want to use a sample of the data. To be fair to every skin cancer type and keep all of them in the model, we group the rows by label and then sample the same fraction from each group.**

In [27]:
# Keep 45% of the rows of every label so the class proportions are preserved.
# random_state pins the draw: without it every run trained on a different
# subset even though the later train/val/test splits are seeded.
df2 = df2.groupby('label', group_keys=False).apply(lambda x: x.sample(frac=0.45, random_state=1))
df2

In [28]:
def _load_image_array(path):
    """Read the image at `path`, resize it to 299x299 and return it as an array."""
    return image.img_to_array(image.load_img(path, target_size=(299, 299)))


# Load and convert in a single pass. Keeping a separate frame of decoded PIL
# images would hold a second full copy of the dataset in memory for the rest
# of the session.
df2['img'] = df2['FilePaths'].map(_load_image_array)
df2.head(2)

**We drop the `UNK` column because there is no patient whose skin cancer type is unknown.**

In [29]:
# The file paths have been replaced by the loaded arrays, and 'UNK' is not used as a target.
df2 = df2.drop(columns=['FilePaths', 'UNK'])

In [30]:
# Quick look at the remaining columns after the drop.
df2.head(2)

In [31]:
del df

In [32]:
# Share of each label in the sampled table, in percent.
df2['label'].value_counts().div(len(df2)).mul(100)

**Split the data, using `stratify` so that every split keeps the same class proportions despite the imbalance**

In [33]:
# Stratified 70/30 split on the label column with a fixed seed.
train_DS, val_DS = train_test_split(
    df2,
    test_size=0.3,
    stratify=df2['label'],
    random_state=1,
)

for caption, part in (('Train Data: ', train_DS), ('Val Data: ', val_DS)):
    print(caption, part.shape)

In [34]:
# Split the held-out part again (70/30, stratified, seeded) into validation and
# test sets. Note that this rebinds val_DS to the smaller validation subset.
val_DS, test_DS = train_test_split(
    val_DS,
    test_size=0.3,
    stratify=val_DS['label'],
    random_state=1,
)
for caption, part in (('Val Data: ', val_DS), ('test Data: ', test_DS)):
    print(caption, part.shape)

**To address the class imbalance, compute class weights from the `label` column**

In [35]:
from sklearn.utils import class_weight


# 'balanced' weights for the training labels; the returned array follows the
# order of `label_order` (order of first appearance in the training split).
label_values = train_DS.label.tolist()
label_order = train_DS.label.unique().tolist()
class_weights = class_weight.compute_class_weight('balanced', classes=label_order, y=label_values)
class_weights

In [36]:
# Keras' `class_weight` argument expects {output index: weight}. The mapping is
# derived from the training split instead of being pasted from an earlier run,
# so it always matches the current sample and -- crucially -- index i refers to
# the i-th one-hot target column, i.e. the i-th output unit of the model.
_non_target_columns = ('label', 'img', 'age_approx', 'sex')
_target_columns = [col for col in train_DS.columns if col not in _non_target_columns]
_balanced_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.array(_target_columns),  # weights are returned in this order
    y=train_DS['label'].to_numpy(),
)
Class_Weight = {index: float(weight) for index, weight in enumerate(_balanced_weights)}
Class_Weight

In [37]:
# Labels present in the training split.
train_DS.label.unique()

In [38]:
# Labels present in the validation split.
val_DS.label.unique()

In [39]:
# Labels present in the test split.
test_DS.label.unique()

In [40]:
# The string label was only needed for stratifying and weighting; the one-hot
# columns remain as the training target.
train_DS, val_DS, test_DS = (
    split.drop(columns=['label']) for split in (train_DS, val_DS, test_DS)
)

In [41]:
def _features_and_targets(frame):
    """Return (input columns, remaining columns) of `frame`.

    The inputs are the image array plus the two metadata columns; everything
    else in the frame is treated as the target.
    """
    input_columns = ['img', 'age_approx', 'sex']
    return frame[input_columns], frame.drop(input_columns, axis=1)


X_train, y_train = _features_and_targets(train_DS)
X_val, y_val = _features_and_targets(val_DS)
X_test, y_test = _features_and_targets(test_DS)

In [42]:
# Inspect the training target table.
y_train

In [43]:
# The combined frames have been split into X_*/y_*; release the old names.
del df2, train_DS, val_DS, test_DS

In [44]:
def _stack_images(img_column):
    """Stack the per-row image arrays into one float32 batch scaled to [-1, 1].

    The ImageNet-pretrained Xception network expects inputs scaled this way
    (same arithmetic as keras.applications.xception.preprocess_input); feeding
    raw pixel values makes the pretrained weights far less useful.
    Assumes the column holds raw 0-255 pixel arrays as produced by
    image.img_to_array.
    """
    batch = np.asarray(img_column.tolist(), dtype=np.float32)
    # In-place arithmetic avoids allocating a second copy of the image batch.
    batch /= 127.5
    batch -= 1.0
    return batch


X_train_img = _stack_images(X_train['img'])
X_val_img = _stack_images(X_val['img'])
X_test_img = _stack_images(X_test['img'])

In [45]:
# One age value per sample for each split, as NumPy arrays.
X_train_age, X_val_age, X_test_age = (
    np.asarray(split['age_approx'].tolist()) for split in (X_train, X_val, X_test)
)

In [46]:
# One encoded sex value per sample for each split, as NumPy arrays.
X_train_sex, X_val_sex, X_test_sex = (
    np.asarray(split['sex'].tolist()) for split in (X_train, X_val, X_test)
)

**adjust the parameters**

In [47]:
#from tensorflow.keras.optimizers import Adam 
#optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

# Halve the learning rate after 5 epochs without improvement. The monitored
# name must match the metric the model logs: it is compiled with
# metrics=["accuracy"], so the validation metric is 'val_accuracy' (the same
# name early stopping uses). With 'val_acc' the callback never fired.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy', patience=5, verbose=1, factor=0.5, min_lr=0.00001)
# Stop after 20 epochs without improvement and roll back to the best weights.
early_stopping_monitor = EarlyStopping(patience=20, monitor='val_accuracy', restore_best_weights=True)

# Augmentation settings: random rotations, zooms, shifts, flips and shear.
# NOTE(review): this generator is not passed to fit() in the cells shown here.
datagen = ImageDataGenerator(
        featurewise_center=False,
        samplewise_center=False,
        featurewise_std_normalization=False,
        samplewise_std_normalization=False,
        zca_whitening=False,
        rotation_range=90,
        zoom_range = 0.1,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        shear_range = 10)

**Here we use a built-in model, Xception, which is inspired by Google's Inception image-classification architecture**

In [48]:
# Local file with the pretrained Xception weights (passed as `weights=` when the base model is built).
Xception_weights_path = '../input/xception/xception_weights_tf_dim_ordering_tf_kernels.h5'

In [53]:
from keras.layers import Input

# Image branch input.
input_tens = Input(shape=(299,299,3))
# Metadata inputs: one scalar per sample for sex and for age. The shape is
# spelled out as a tuple instead of relying on a bare int being accepted.
sex = tf.keras.Input(shape=(1,), dtype = tf.float32)
age = tf.keras.Input(shape=(1,), dtype = tf.float32)

basemodel = Xception(
    include_top=True,
    weights=Xception_weights_path,
    input_tensor=input_tens,
    input_shape=(299,299,3)
)
# NOTE(review): with include_top=True the backbone output is its final
# classification layer rather than pooled convolutional features -- confirm
# this is the intended feature vector.
Concatmodel = basemodel.output
# Flatten the backbone output to (batch, features).
flat_features = Flatten()(Concatmodel)
# Join the image features with sex and age using a Keras layer, so the merge is
# a regular, serialisable part of the functional graph (instead of a raw tf op).
flat_features = layers.Concatenate(axis=-1)([flat_features, sex, age])

Concatmodel = BatchNormalization()(flat_features)
Concatmodel = Dense(128, activation='relu')(Concatmodel)
Concatmodel = Dropout(0.2)(Concatmodel)

Concatmodel = BatchNormalization()(Concatmodel)
# One output unit per target column (8 classes).
ConcatOutput = Dense(8, activation = 'softmax')(Concatmodel)

# Inputs are given as a flat list in the same order in which fit() receives the
# arrays (image, sex, age); the previous extra level of nesting did not match
# that structure. The public tf.keras.Model is used rather than the private
# tensorflow.python.keras namespace, which is not guaranteed to be compatible
# with layers created through keras / tensorflow.keras.
Concatmodel = tf.keras.Model(inputs = [input_tens, sex, age], outputs=ConcatOutput)

model1 = Concatmodel
model1.summary()

In [54]:
# Multi-class setup: Adam optimiser, categorical cross-entropy on the one-hot
# targets, accuracy as the reported metric.
model1.compile(
    optimizer='adam',
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
batch_size = 32
# Train on (image, sex, age) -> one-hot target, validating after every epoch.
# batch_size is now actually passed to fit(); before, it was only used to
# derive steps_per_epoch and the run relied on Keras' default batch size
# happening to be the same value. steps_per_epoch is left to Keras so that the
# final partial batch of each epoch is no longer dropped by the floor division.
history = model1.fit(
    [X_train_img, X_train_sex, X_train_age], y_train,
    batch_size = batch_size,
    epochs = 20,
    validation_data = ([X_val_img, X_val_sex, X_val_age], y_val),
    verbose = 1,
    callbacks=[learning_rate_reduction, early_stopping_monitor],
    class_weight= Class_Weight)