In [1]:
from numpy.random import seed
seed(101)
# from tensorflow import set_random_seed
# set_random_seed(101)

import pandas as pd
import numpy as np
import keras
from keras import backend as K
from keras.layers.core import Dense, Dropout
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Model
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

import os

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
import shutil
import matplotlib.pyplot as plt
%matplotlib inline


Using TensorFlow backend.


In [2]:
dataset_dir = '../../Documents/skin-cancer-mnist-ham10000/'

os.listdir(dataset_dir)

# # Create a new directory
base_dir = 'base_dir_v2'
if base_dir not in os.listdir():
    os.mkdir(base_dir)


# #[CREATE FOLDERS INSIDE THE BASE DIRECTORY]

# # now we create 7 folders inside 'base_dir':

# # train
#     # nv
#     # mel
#     # bkl
#     # bcc
#     # akiec
#     # vasc
#     # df
# # val
#     # nv
#     # mel
#     # bkl
#     # bcc
#     # akiec
#     # vasc
#     # df

# # create a path to 'base_dir' to which we will join the names of the new folders
# train_dir
train_dir = os.path.join(base_dir, 'train_dir')
os.mkdir(train_dir)

# val_dir
val_dir = os.path.join(base_dir, 'val_dir')
os.mkdir(val_dir)


# # [CREATE FOLDERS INSIDE THE TRAIN, VALIDATION AND TEST FOLDERS]
# # Inside each folder we create seperate folders for each class

# create new folders inside train_dir
nv = os.path.join(train_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(train_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(train_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(train_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(train_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(train_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(train_dir, 'df')
os.mkdir(df)



# create new folders inside val_dir
nv = os.path.join(val_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(val_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(val_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(val_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(val_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(val_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(val_dir, 'df')
os.mkdir(df)

In [3]:
df_data = pd.read_csv(dataset_dir + '/HAM10000_metadata.csv')
df_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [4]:
#X = df_data.drop('dx', axis=1)
y = df_data['dx']

df_train, df_val = train_test_split(df_data, test_size=0.1, random_state=101, stratify=y)

print(df_train.shape)
print(df_val.shape)

(9013, 7)
(1002, 7)


In [5]:
df_train['dx'].value_counts()

nv       6034
mel      1002
bkl       989
bcc       463
akiec     294
vasc      128
df        103
Name: dx, dtype: int64

In [6]:
df_val['dx'].value_counts()

nv       671
mel      111
bkl      110
bcc       51
akiec     33
vasc      14
df        12
Name: dx, dtype: int64

In [7]:
# Set the image_id as the index in df_data
df_data.set_index('image_id', inplace=True)

# Get a list of images in each of the two folders
folder_1 = os.listdir(dataset_dir + '/ham10000_images_part_1')
folder_2 = os.listdir(dataset_dir + '/ham10000_images_part_2')

# Get a list of train and val images
train_list = list(df_train['image_id'])
val_list = list(df_val['image_id'])


# Transfer the train images
for image in train_list:
    
    fname = image + '.jpg'
    label = df_data.loc[image,'dx']
    
    if fname in folder_1:
        # source path to image
        src = os.path.join(dataset_dir + '/ham10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(train_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # source path to image
        src = os.path.join(dataset_dir + '/ham10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(train_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)


# Transfer the val images

for image in val_list:
    
    fname = image + '.jpg'
    label = df_data.loc[image,'dx']
    
    if fname in folder_1:
        # source path to image
        src = os.path.join(dataset_dir + '/ham10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(val_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # source path to image
        src = os.path.join(dataset_dir + '/ham10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(val_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

In [10]:
# check how many train images we have in each folder

print(len(os.listdir('base_dir_v2/train_dir/nv')))
print(len(os.listdir('base_dir_v2/train_dir/mel')))
print(len(os.listdir('base_dir_v2/train_dir/bkl')))
print(len(os.listdir('base_dir_v2/train_dir/bcc')))
print(len(os.listdir('base_dir_v2/train_dir/akiec')))
print(len(os.listdir('base_dir_v2/train_dir/vasc')))
print(len(os.listdir('base_dir_v2/train_dir/df')))

6034
1002
989
463
294
128
103


In [11]:
# check how many val images we have in each folder

print(len(os.listdir('base_dir_v2/val_dir/nv')))
print(len(os.listdir('base_dir_v2/val_dir/mel')))
print(len(os.listdir('base_dir_v2/val_dir/bkl')))
print(len(os.listdir('base_dir_v2/val_dir/bcc')))
print(len(os.listdir('base_dir_v2/val_dir/akiec')))
print(len(os.listdir('base_dir_v2/val_dir/vasc')))
print(len(os.listdir('base_dir_v2/val_dir/df')))

671
111
110
51
33
14
12


In [12]:
# note that we are not augmenting class 'nv'
class_list = ['mel','bkl','bcc','akiec','vasc','df']

for item in class_list:
    
    # We are creating temporary directories here because we delete these directories later
    # create a base dir
    aug_dir = 'aug_dir'
    if aug_dir not in os.listdir():
        os.mkdir(aug_dir)
        
    # create a dir within the base dir to store images of the same class
    img_dir = os.path.join(aug_dir, 'img_dir')
    if img_dir not in os.listdir():
        os.mkdir(img_dir)

    # Choose a class
    img_class = item

    # list all images in that directory
    img_list = os.listdir('base_dir_v2/train_dir/' + img_class)

    # Copy images from the class train dir to the img_dir e.g. class 'mel'
    for fname in img_list:
            # source path to image
            src = os.path.join('base_dir_v2/train_dir/' + img_class, fname)
            # destination path to image
            dst = os.path.join(img_dir, fname)
            # copy the image from the source to the destination
            shutil.copyfile(src, dst)


    # point to a dir containing the images and not to the images themselves
    path = aug_dir
    save_path = 'base_dir_v2/train_dir/' + img_class

    # Create a data generator
    datagen = ImageDataGenerator(
        rotation_range=180,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        #brightness_range=(0.9,1.1),
        fill_mode='nearest')

    batch_size = 50

    aug_datagen = datagen.flow_from_directory(path,
                                           save_to_dir=save_path,
                                           save_format='jpg',
                                                    target_size=(224,224),
                                                    batch_size=batch_size)



    # Generate the augmented images and add them to the training folders
    
    ###########
    
    num_aug_images_wanted = 6000 # total number of images we want to have in each class
    
    ###########
    
    num_files = len(os.listdir(img_dir))
    num_batches = int(np.ceil((num_aug_images_wanted-num_files)/batch_size))

    # run the generator and create about 6000 augmented images
    for i in range(0,num_batches):

        imgs, labels = next(aug_datagen)
        
    # delete temporary directory with the raw image files
    shutil.rmtree('aug_dir')

Found 1002 images belonging to 1 classes.
Found 989 images belonging to 1 classes.
Found 463 images belonging to 1 classes.
Found 294 images belonging to 1 classes.
Found 128 images belonging to 1 classes.
Found 103 images belonging to 1 classes.


In [13]:
# Check how many train images we now have in each folder.
# This is the original images plus the augmented images.

print(len(os.listdir('base_dir_v2/train_dir/nv')))
print(len(os.listdir('base_dir_v2/train_dir/mel')))
print(len(os.listdir('base_dir_v2/train_dir/bkl')))
print(len(os.listdir('base_dir_v2/train_dir/bcc')))
print(len(os.listdir('base_dir_v2/train_dir/akiec')))
print(len(os.listdir('base_dir_v2/train_dir/vasc')))
print(len(os.listdir('base_dir_v2/train_dir/df')))

6034
5810
5984
5606
5930
5170
4170


In [14]:
# Check how many val images we have in each folder.

print(len(os.listdir('base_dir_v2/val_dir/nv')))
print(len(os.listdir('base_dir_v2/val_dir/mel')))
print(len(os.listdir('base_dir_v2/val_dir/bkl')))
print(len(os.listdir('base_dir_v2/val_dir/bcc')))
print(len(os.listdir('base_dir_v2/val_dir/akiec')))
print(len(os.listdir('base_dir_v2/val_dir/vasc')))
print(len(os.listdir('base_dir_v2/val_dir/df')))

671
111
110
51
33
14
12
