[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)

https://www.kaggle.com/vbookshelf/skin-lesion-analyzer-tensorflow-js-web-app

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
import shutil
from sklearn.model_selection import train_test_split

In [2]:
# Show what the dataset folder contains (image sub-folders and CSV files).
os.listdir('../data/skin-cancer-mnist-ham10000')

['HAM10000_images_part_1',
 'HAM10000_images_part_2',
 'HAM10000_metadata.csv',
 'hmnist_28_28_L.csv',
 'hmnist_28_28_RGB.csv',
 'hmnist_8_8_L.csv',
 'hmnist_8_8_RGB.csv']

# Create the directory structure
In these folders we will store the images that will later be fed to the Keras generators.

In [3]:
# Build the folder tree that will hold the images, one sub-folder per class:
#
#   base_dir/
#       train_dir/{nv,mel,bkl,bcc,akiec,vasc,df}/
#       test_dir/{nv,mel,bkl,bcc,akiec,vasc,df}/
#
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir so this cell can
# be re-run (e.g. Restart Kernel -> Run All with the folders already on disk)
# without raising FileExistsError.
base_dir = 'base_dir'
train_dir = os.path.join(base_dir, 'train_dir')
test_dir = os.path.join(base_dir, 'test_dir')

# One folder per diagnosis code. The per-class path variables (nv, mel, ...,
# df) are deliberately not bound any more: in particular `df` is reused later
# in the notebook as a DataFrame name.
lesion_classes = ['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

for split_dir in (train_dir, test_dir):
    for lesion_class in lesion_classes:
        # makedirs also creates base_dir and the split folder when missing
        os.makedirs(os.path.join(split_dir, lesion_class), exist_ok=True)

Create Train and Test Sets

In [4]:
# Read the metadata table that ships with the dataset and preview the first rows.
metadata = pd.read_csv(
    '../data/skin-cancer-mnist-ham10000/HAM10000_metadata.csv'
)
metadata.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


Create a stratified test set

In [5]:
# Count the rows recorded for every lesion_id ...
images_per_lesion = metadata.groupby('lesion_id').count()

# ... and keep only the lesions that are represented by exactly one image.
# reset_index() turns lesion_id back into an ordinary column.
has_single_image = images_per_lesion['image_id'] == 1
df = images_per_lesion[has_single_image].reset_index()

df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000001,1,1,1,1,1,1
1,HAM_0000003,1,1,1,1,1,1
2,HAM_0000004,1,1,1,1,1,1
3,HAM_0000007,1,1,1,1,1,1
4,HAM_0000008,1,1,1,1,1,1


In [6]:
# Flag every metadata row according to whether its lesion has one image or several.

# lesion_ids that have exactly one image. Captured once, as a set, so that
# (a) each lookup is O(1) instead of rebuilding and scanning a list per row, and
# (b) the function no longer depends on whatever `df` is bound to at call time
#     (`df` is reassigned further down in the notebook).
single_image_lesion_ids = set(df['lesion_id'])


def identify_duplicates(x):
    """Classify a lesion_id by how many images it has.

    Parameters
    ----------
    x : a lesion_id value from the metadata table.

    Returns
    -------
    str
        'no_duplicates' if the lesion has exactly one image,
        'has_duplicates' otherwise.
    """
    if x in single_image_lesion_ids:
        return 'no_duplicates'
    return 'has_duplicates'


# derive the new column directly from lesion_id
metadata['duplicates'] = metadata['lesion_id'].apply(identify_duplicates)

metadata.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,duplicates
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,has_duplicates
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,has_duplicates
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,has_duplicates
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,has_duplicates
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,has_duplicates


In [7]:
# Number of image rows in each 'duplicates' category.
metadata['duplicates'].value_counts()

no_duplicates     5514
has_duplicates    4501
Name: duplicates, dtype: int64

In [8]:
# Keep only the rows flagged 'no_duplicates', i.e. images whose lesion has
# no other image in the dataset.
is_single_image = metadata['duplicates'] == 'no_duplicates'
df = metadata[is_single_image]
df.shape

(5514, 8)

In [9]:
# The test set is drawn from df only: none of these lesions has a second
# image that could end up in the train set.
# 30% of the rows are held out, stratified on the diagnosis column so the
# class proportions are preserved; the seed is fixed for reproducibility.
y = df['dx']
_, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=101,
    stratify=y,
)
df_test.shape

(1655, 8)

In [10]:
# Number of test images per diagnosis class.
df_test['dx'].value_counts()

nv       1325
bkl       132
mel        69
bcc        53
akiec      45
vasc       19
df         12
Name: dx, dtype: int64

Create a train set that excludes images that are in the test set

In [11]:
# The train set is every metadata row whose image is NOT in the test set.

# image_ids held out for testing. Built once, as a set: the lookup below runs
# for every metadata row, and rebuilding/scanning a list on each call is
# needlessly quadratic.
test_image_ids = set(df_test['image_id'])


def identify_test_rows(x):
    """Tell whether an image belongs to the train or the test set.

    Parameters
    ----------
    x : an image_id value from the metadata table (compared as a string).

    Returns
    -------
    str
        'test' if the image_id is in the test set, 'train' otherwise.
    """
    if str(x) in test_image_ids:
        return 'test'
    return 'train'


# label every row as train or test, based on its image_id
metadata['train_or_test'] = metadata['image_id'].apply(identify_test_rows)

# keep only the train rows
df_train = metadata[metadata['train_or_test'] == 'train']

print(len(df_train))
print(len(df_test))

8360
1655


In [12]:
# Number of train images per diagnosis class.
df_train['dx'].value_counts()

nv       5380
mel      1044
bkl       967
bcc       461
akiec     282
vasc      123
df        103
Name: dx, dtype: int64

In [13]:
# Test-set class counts again, for comparison with the train counts.
df_test['dx'].value_counts()

nv       1325
bkl       132
mel        69
bcc        53
akiec      45
vasc       19
df         12
Name: dx, dtype: int64

Transfer the Images into the Folders.

In [14]:
# Set the image_id as the index in metadata so rows can be looked up by image.
# Guarded so the cell is safe to re-run: once image_id has been moved into the
# index the column no longer exists and a second set_index would raise KeyError.
if metadata.index.name != 'image_id':
    metadata.set_index('image_id', inplace=True)

In [15]:
# Copy every image listed in df_train / df_test from the two source image
# folders into <split folder>/<dx label>/<image_id>.jpg.

DATA_ROOT = '../data/skin-cancer-mnist-ham10000'


def _find_entry(parent, name):
    """Return the path of the entry in `parent` named `name`, ignoring case.

    The dataset listing printed near the top of the notebook shows the image
    folders as 'HAM10000_images_part_*', while this cell refers to them in
    lower case. Matching case-insensitively keeps the cell working on
    case-sensitive filesystems as well.

    Raises
    ------
    FileNotFoundError
        If `parent` has no entry matching `name`.
    """
    wanted = name.lower()
    for entry in os.listdir(parent):
        if entry.lower() == wanted:
            return os.path.join(parent, entry)
    raise FileNotFoundError(
        'no entry named {!r} (case-insensitive) in {!r}'.format(name, parent)
    )


part_1_dir = _find_entry(DATA_ROOT, 'ham10000_images_part_1')
part_2_dir = _find_entry(DATA_ROOT, 'ham10000_images_part_2')

# Get a list of images in each of the two folders
folder_1 = os.listdir(part_1_dir)
folder_2 = os.listdir(part_2_dir)

# file name -> full source path, for O(1) lookups. part_2 is added last so it
# wins if a name exists in both folders, matching the original order in which
# the part_2 copy overwrote the part_1 copy.
source_path = {fname: os.path.join(part_1_dir, fname) for fname in folder_1}
source_path.update({fname: os.path.join(part_2_dir, fname) for fname in folder_2})

# Get a list of train and test images
train_list = list(df_train['image_id'])
test_list = list(df_test['image_id'])


def _transfer_images(frame, dest_dir):
    """Copy the images named in `frame` into `dest_dir/<dx>/`.

    Parameters
    ----------
    frame : DataFrame with an `image_id` and a `dx` column; the label is read
        from the same row as the image id.
    dest_dir : folder that contains one sub-folder per `dx` label.

    Returns
    -------
    list of str
        File names that could not be copied (not found in either source
        folder, or the copy itself failed). Failures are reported instead of
        being silently swallowed, but do not abort the remaining copies.
    """
    not_copied = []
    for image, label in zip(frame['image_id'], frame['dx']):
        fname = image + '.jpg'
        src = source_path.get(fname)
        if src is None:
            not_copied.append(fname)
            continue
        try:
            shutil.copyfile(src, os.path.join(dest_dir, label, fname))
        except OSError as err:
            print('could not copy {}: {}'.format(fname, err))
            not_copied.append(fname)
    return not_copied


# Transfer the train images, then the test images
train_not_copied = _transfer_images(df_train, train_dir)
test_not_copied = _transfer_images(df_test, test_dir)

# Stay silent on success; surface any images that did not make it across.
for split_name, failed in (('train', train_not_copied), ('test', test_not_copied)):
    if failed:
        print('{}: {} image(s) were not copied'.format(split_name, len(failed)))

In [16]:
# Print the number of files in each class folder of the train split,
# one count per line, in the order nv, mel, bkl, bcc, akiec, vasc, df.
for class_name in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df'):
    class_folder = 'base_dir/train_dir/' + class_name
    print(len(os.listdir(class_folder)))

5380
1044
967
461
282
123
103


In [17]:
# Print the number of files in each class folder of the test split,
# one count per line, in the order nv, mel, bkl, bcc, akiec, vasc, df.
for class_name in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df'):
    class_folder = 'base_dir/test_dir/' + class_name
    print(len(os.listdir(class_folder)))

1325
69
132
53
45
19
12
