In [None]:
import os
import shutil
from pathlib import Path
from dotenv import load_dotenv
# Load settings from the dotenv file that sits one directory above the notebook.
env_path = Path('..').joinpath('environmentsettings.env')
load_dotenv(dotenv_path=env_path)

# Bucket name / key prefix used for the S3 download, the names of the
# extracted folder and of the downloaded archive, and the local base directory.
# Any variable that is not set comes back as None.
skin_cancer_bucket = os.getenv('SKIN_CANCER_BUCKET')
skin_cancer_bucket_path = os.getenv('SKIN_CANCER_BUCKET_PATH')
skin_cancer_files = os.getenv('SKIN_CANCER_FILES')
skin_cancer_files_ext = os.getenv('SKIN_CANCER_FILES_EXT')
base_dir = os.getenv('BASE_DIR')

In [None]:
import boto3

# --- Remove artifacts left behind by a previous run -------------------------
# The rest of the notebook builds the extracted-folder and archive paths by
# plain concatenation (base_dir + name), so the cleanup must test and delete
# exactly those paths. Previously the existence check used os.path.join while
# the removal used concatenation, which disagree when base_dir has no trailing
# separator.
extracted_dir = base_dir + skin_cancer_files
if os.path.exists(extracted_dir):
    shutil.rmtree(extracted_dir)

archive_path = base_dir + skin_cancer_files_ext
if os.path.exists(archive_path):
    os.remove(archive_path)

# Root of the transformed dataset; later cells create the train/val tree here.
folder_data = os.path.join(base_dir, 'HAM10000')

packed_output = os.path.join(base_dir, 'HAM10000.tar.gz')
if os.path.exists(packed_output):
    os.remove(packed_output)

if os.path.exists(folder_data):
    shutil.rmtree(folder_data)

# --- Fetch the raw archive from S3 ------------------------------------------
# Announce the download before it starts so the message matches what happens.
print('we are downloading from s3 bucket '+skin_cancer_bucket)

buket = boto3.client('s3')
buket.download_file(
    skin_cancer_bucket,
    skin_cancer_bucket_path + '/' + skin_cancer_files_ext,
    archive_path,
)

In [None]:
import torchtext
from numpy.random import seed
seed(101)
import pandas as pd
import numpy as np
import os

# Create the extraction root plus one sub-folder per image archive.
dataset_root = base_dir + skin_cancer_files
image_parts = ('1', '2')

os.mkdir(dataset_root)
for part in image_parts:
    os.mkdir(dataset_root + '/HAM_images_part_' + part)

print('extract dataset for train and transform')

# Unpack the downloaded archive first, then each nested image zip into its
# own sub-folder.
torchtext.utils.extract_archive(base_dir + skin_cancer_files_ext, dataset_root)
for part in image_parts:
    torchtext.utils.extract_archive(
        dataset_root + '/HAM10000_images_part_' + part + '.zip',
        dataset_root + '/HAM_images_part_' + part,
    )

In [None]:
# Build the output tree:
#   <folder_data>/folder_training/<class>/
#   <folder_data>/folder_val/<class>/
os.mkdir(folder_data)

folder_training = os.path.join(folder_data, 'folder_training')
folder_val = os.path.join(folder_data, 'folder_val')
for split_dir in (folder_training, folder_val):
    os.mkdir(split_dir)

print('Make a validation and training directory under HAM10000.')

# One sub-folder per diagnosis label, first under training, then under val.
lesion_classes = ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df')
for split_dir in (folder_training, folder_val):
    for lesion_class in lesion_classes:
        os.mkdir(os.path.join(split_dir, lesion_class))

# Leave the per-class names bound to the validation paths, as before.
nv, mel, bkl, bcc, akiec, vasc, df = [
    os.path.join(folder_val, lesion_class) for lesion_class in lesion_classes
]

In [None]:
# Read the metadata table that was extracted alongside the image archives.
metadata_path = base_dir + skin_cancer_files + '/HAM10000_metadata'
metadata = pd.read_csv(metadata_path)

# Preview the first rows.
metadata.head()

In [None]:

# Count rows per lesion, then keep only the lesions whose image_id count is
# exactly one. lesion_id is moved back from the index into a regular column.
rows_per_lesion = metadata.groupby('lesion_id').count()
df = rows_per_lesion[rows_per_lesion['image_id'] == 1].reset_index()

df.head()

In [None]:


def identify_duplicates(x, unique_ids=None):
    """Label a lesion id as 'no_duplicates' or 'has_duplicates'.

    Parameters
    ----------
    x : value taken from the ``lesion_id`` column.
    unique_ids : optional collection of lesion ids to test membership against.
        When omitted it is built from the module-level ``df['lesion_id']``,
        which matches the original behaviour.

    Returns
    -------
    str
        'no_duplicates' if ``x`` is in ``unique_ids``, otherwise
        'has_duplicates'.
    """
    if unique_ids is None:
        unique_ids = set(df['lesion_id'])
    return 'no_duplicates' if x in unique_ids else 'has_duplicates'


# Build the lookup once instead of once per row: a set gives constant-time
# membership tests, whereas rebuilding and scanning a list per row is quadratic.
single_image_lesion_ids = set(df['lesion_id'])

metadata['duplicates'] = metadata['lesion_id'].apply(
    lambda lesion_id: identify_duplicates(lesion_id, single_image_lesion_ids)
)

print('we have removed duplicates here')


In [None]:
# How many rows were flagged with each duplicates label.
duplicate_flag_counts = metadata['duplicates'].value_counts()
duplicate_flag_counts

In [None]:

# Keep only the rows flagged 'no_duplicates'.
is_single_image = metadata['duplicates'] == 'no_duplicates'
df = metadata.loc[is_single_image]

df.shape

In [None]:

from sklearn.model_selection import train_test_split

# Stratify on the diagnosis label so the validation set keeps the class mix.
y = df['dx']

# Only the held-out 17% is kept; the other part of the split is discarded.
_, df_val = train_test_split(
    df,
    stratify=y,
    test_size=0.17,
    random_state=101,
)

df_val.shape

In [None]:
# Rows per diagnosis label in the validation set.
val_dx_counts = df_val['dx'].value_counts()
val_dx_counts

In [None]:

def identify_val_rows(x, val_ids=None):
    """Label an image id as belonging to the 'val' or 'train' split.

    Parameters
    ----------
    x : value taken from the ``image_id`` column; compared as ``str(x)``.
    val_ids : optional collection of validation image ids. When omitted it is
        built from the module-level ``df_val['image_id']``, which matches the
        original behaviour.

    Returns
    -------
    str
        'val' if ``str(x)`` is in ``val_ids``, otherwise 'train'.
    """
    if val_ids is None:
        val_ids = set(df_val['image_id'])
    return 'val' if str(x) in val_ids else 'train'


# Build the lookup once; rebuilding and scanning a list for every row is
# quadratic in the number of images.
val_image_ids = set(df_val['image_id'])

metadata['train_or_val'] = metadata['image_id'].apply(
    lambda image_id: identify_val_rows(image_id, val_image_ids)
)

# Every row that is not in the validation set becomes training data.
df_train = metadata[metadata['train_or_val'] == 'train']

In [None]:
# Rows per diagnosis label in the training set.
train_dx_counts = df_train['dx'].value_counts()
train_dx_counts

In [None]:
# Rows per diagnosis label in the validation set, shown next to the training
# counts for comparison.
val_dx_counts = df_val['dx'].value_counts()
val_dx_counts

In [None]:

# Index metadata by image_id so a row can be looked up with metadata.loc[id].
# The guard keeps the cell re-runnable: once image_id has been moved into the
# index, calling set_index('image_id') again would raise KeyError.
if metadata.index.name != 'image_id':
    metadata.set_index('image_id', inplace=True)

In [None]:

import shutil

# Directories holding the two extracted image archives.
ham_part_dirs = [
    base_dir + skin_cancer_files + '/HAM_images_part_1',
    base_dir + skin_cancer_files + '/HAM_images_part_2',
]
HAM1 = os.listdir(ham_part_dirs[0])
HAM2 = os.listdir(ham_part_dirs[1])

listoftrain = list(df_train['image_id'])
listOfval = list(df_val['image_id'])

# Set views of the directory listings: membership tests are constant-time
# instead of a linear scan of the listing for every image.
ham_part_files = [
    (ham_part_dirs[0], set(HAM1)),
    (ham_part_dirs[1], set(HAM2)),
]


def _copy_images_by_class(image_ids, split_dir):
    """Copy each image into ``split_dir/<dx label>/``.

    The label is read from ``metadata`` (indexed by image_id). An image is
    copied from whichever part directory contains ``<image_id>.jpg``; ids that
    are found in neither directory are skipped without error.
    """
    for image in image_ids:
        fname = image + '.jpg'
        label = metadata.loc[image, 'dx']
        dst = os.path.join(split_dir, label, fname)
        for part_dir, part_names in ham_part_files:
            if fname in part_names:
                shutil.copyfile(os.path.join(part_dir, fname), dst)


print('Sort Pictures by Type of Skin Cancer')

_copy_images_by_class(listoftrain, folder_training)
_copy_images_by_class(listOfval, folder_val)

# Report how many training images landed in each class folder.
print('Classes based on images')
for lesion_class in ['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']:
    print(lesion_class + ': ' + str(len(os.listdir(folder_training + '/' + lesion_class))))

In [None]:

# Classes to oversample; 'nv' is not in this list and is left untouched.
class_list = ['mel','bkl','bcc','akiec','vasc','df']

# Approximate number of images wanted per listed class after oversampling.
num_aug_images_wanted = 5000

print('Expand Pictures Via Class')

for item in class_list:

    img_class = item
    class_dir = folder_training + '/' + img_class

    # Snapshot the source images before any copies are written. Files carrying
    # the AUG_ prefix come from an earlier run of this cell and are ignored, so
    # re-running rewrites the same copies instead of compounding them.
    img_list = [
        fname for fname in os.listdir(class_dir)
        if not fname.startswith('AUG_')
    ]

    num_files = len(img_list)
    if num_files == 0:
        # Nothing to copy; also avoids dividing by zero below.
        print('no images found for class ' + img_class + ', skipping')
        continue

    num_batches = int(np.ceil(num_aug_images_wanted / num_files))

    # The originals count as the first batch, so only num_batches - 1 rounds
    # of copies are written. Each round is tagged with its index j.
    for j in range(num_batches - 1):
        for fname in img_list:

            src = os.path.join(class_dir, fname)

            dst = os.path.join(class_dir, 'AUG_' + str(j) + '_' + fname)

            shutil.copyfile(src, dst)

In [None]:

# Report the number of files in each training class folder.
print('Class-Based Images Following Augmentation')
for lesion_class in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df'):
    image_count = len(os.listdir(folder_training + '/' + lesion_class))
    print(lesion_class + ': ' + str(image_count))

In [None]:
from PIL import Image

# Class names are the sub-directories of the training folder, sorted so that
# the integer label assigned to each class is stable.
class_names = sorted(
    entry for entry in os.listdir(folder_training)
    if os.path.isdir(os.path.join(folder_training, entry))
)
num_class = len(class_names)

# image_files[i] holds the full paths of every file in class_names[i].
image_files = [
    [os.path.join(folder_training, class_name, x)
     for x in os.listdir(os.path.join(folder_training, class_name))]
    for class_name in class_names
]

# Flatten into parallel lists: a file path and the index of its class.
image_file_list = []
image_label_list = []
for i, files_for_class in enumerate(image_files):
    image_file_list.extend(files_for_class)
    image_label_list.extend([i] * len(files_for_class))
num_total = len(image_label_list)

# Read the dimensions of the first image. The context manager closes the
# underlying file, which a bare Image.open(...) would leave open.
with Image.open(image_file_list[0]) as first_image:
    image_width, image_height = first_image.size

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt


print('A Sample of Image Training')

# Draw nine random training images on a 3x3 grid using explicit Axes objects.
fig, axes = plt.subplots(3, 3, figsize=(8, 8))
for ax, k in zip(axes.flat, np.random.randint(num_total, size=9)):
    # Load the pixels inside a context manager so each file is closed again.
    with Image.open(image_file_list[k]) as im:
        arr = np.array(im)
    ax.set_xlabel(class_names[image_label_list[k]])
    ax.imshow(arr, vmin=0, vmax=255)
fig.tight_layout()
plt.show()

# Summary of the training set that was assembled above.
print('')
print('Total image count:', num_total)
print('Image dimensions:', image_width, "x", image_height)
print('Label names:', class_names)
print('Label counts:', [len(image_files[i]) for i in range(num_class)])
print('')

In [None]:
# Package the transformed dataset into a gzip-compressed tarball.
print('compressed HAM10000 data set after transformation.')

# Shell magic: archive ../HAM10000 into ../HAM10000.tar.gz.
# NOTE(review): both paths are hard-coded relative to the working directory
# rather than derived from base_dir / folder_data — confirm BASE_DIR resolves
# to '..' so this matches the folder built by the earlier cells.
!tar -czf ../HAM10000.tar.gz ../HAM10000

print('Transformation of the training dataset is finished.')