In [None]:
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

import os
import shutil
from pathlib import Path
from dotenv import load_dotenv
# Load the shared notebook settings from the .env file one directory up.
env_path = Path('..') / 'environmentsettings.env'
load_dotenv(dotenv_path=env_path)

# Later cells build S3 keys and local paths by concatenating these values, so
# a missing one would otherwise only surface further down as an opaque
# TypeError. Stop here instead and name the settings that are absent.
_required_settings = (
    'SKIN_CANCER_BUCKET',
    'SKIN_CANCER_BUCKET_PATH',
    'SKIN_CANCER_FILES',
    'SKIN_CANCER_FILES_EXT',
    'BASE_DIR',
)
_missing_settings = [name for name in _required_settings if os.environ.get(name) is None]
if _missing_settings:
    raise RuntimeError(
        'Missing required settings (expected in the environment or in '
        + str(env_path) + '): ' + ', '.join(_missing_settings)
    )

skin_cancer_bucket = os.environ['SKIN_CANCER_BUCKET']            # S3 bucket the archive is downloaded from
skin_cancer_bucket_path = os.environ['SKIN_CANCER_BUCKET_PATH']  # key prefix inside that bucket
skin_cancer_files = os.environ['SKIN_CANCER_FILES']              # name of the local extraction directory
skin_cancer_files_ext = os.environ['SKIN_CANCER_FILES_EXT']      # file name of the downloaded archive
# Local working directory. Other cells concatenate it directly with file
# names, so it is expected to end with a path separator.
base_dir = os.environ['BASE_DIR']

In [None]:
import boto3

# Remove leftovers from a previous run, then fetch a fresh copy of the archive.
#
# The raw-data paths are built by plain concatenation so that the locations
# checked and removed here are exactly the ones the download below and the
# extraction cell write to (both use base_dir + name).
_raw_dir = base_dir + skin_cancer_files
_raw_archive = base_dir + skin_cancer_files_ext

if os.path.exists(_raw_dir):
    shutil.rmtree(_raw_dir)

if os.path.exists(_raw_archive):
    os.remove(_raw_archive)

# Output locations of this notebook: the re-organised dataset and its tarball.
datafolder = os.path.join(base_dir,'HAM10000')
_output_tarball = os.path.join(base_dir,'HAM10000.tar.gz')

if os.path.exists(_output_tarball):
    os.remove(_output_tarball)

if os.path.exists(datafolder):
    shutil.rmtree(datafolder)

# Announce the download before starting it, so the message is visible while
# the transfer is running.
print('we are doing download dataset from  ours3 bucket '+skin_cancer_bucket)

s3 = boto3.client('s3')
s3.download_file(skin_cancer_bucket, skin_cancer_bucket_path+'/'+skin_cancer_files_ext, _raw_archive)

In [None]:
import torchtext
from numpy.random import seed
seed(101)
import pandas as pd
import numpy as np
import os

# Extraction workspace: one root directory plus a sub-folder per image zip.
raw_root = base_dir + skin_cancer_files
part_1_dir = raw_root + '/HAM_images_part_1'
part_2_dir = raw_root + '/HAM_images_part_2'

for new_dir in (raw_root, part_1_dir, part_2_dir):
    os.mkdir(new_dir)

print('extract dataset for modle train and transform.')

# Unpack the downloaded archive into the root first; the two image zips are
# then read from that root and unpacked into their own sub-folders.
extraction_plan = (
    (base_dir + skin_cancer_files_ext, raw_root),
    (raw_root + '/HAM10000_images_part_1.zip', part_1_dir),
    (raw_root + '/HAM10000_images_part_2.zip', part_2_dir),
)
for archive_path, target_dir in extraction_plan:
    torchtext.utils.extract_archive(archive_path, target_dir)

In [None]:
# Build the output tree: <datafolder>/{folderfortrain,validfolder}/<class>/.
# makedirs(exist_ok=True) keeps this cell re-runnable.
os.makedirs(datafolder, exist_ok=True)

folderfortrain = os.path.join(datafolder, 'folderfortrain')
os.makedirs(folderfortrain, exist_ok=True)

validfolder = os.path.join(datafolder, 'validfolder')
os.makedirs(validfolder, exist_ok=True)

print('Create training and validation folder under directory HAM10000')

# One folder per diagnosis code. The folder names must equal the values of
# the metadata 'dx' column, because the image-sorting cell uses that value as
# the destination folder name. HAM10000 uses 'df' (dermatofibroma) as its
# seventh code, so the folder has to be called 'df'.
lesion_classes = ['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

# `datafolder` is intentionally not rebound here: it keeps pointing at the
# dataset root until a later cell reuses the name.
for split_dir in (folderfortrain, validfolder):
    for lesion_class in lesion_classes:
        os.makedirs(os.path.join(split_dir, lesion_class), exist_ok=True)

In [None]:
# Read the image metadata table from the extraction directory and preview it.
metadata_path = base_dir + skin_cancer_files + '/HAM10000_metadata'
datafeed = pd.read_csv(metadata_path)
datafeed.head()

In [None]:
# Count rows per lesion_id, then keep only the lesions whose image_id count
# is exactly one. lesion_id is moved back from the index into a column.
rows_per_lesion = datafeed.groupby('lesion_id').count()
datafolder = rows_per_lesion[rows_per_lesion['image_id'] == 1].reset_index()
datafolder.head()

In [None]:
def identify_duplicates(x):
    """Label a lesion id as 'unique_data' or 'duplicates'.

    `x` is looked up in the 'lesion_id' column of the notebook-level
    `datafolder` object as it is bound at call time.

    Returns 'unique_data' when `x` is found there, otherwise 'duplicates'.
    """
    known_lesion_ids = list(datafolder['lesion_id'])
    return 'unique_data' if x in known_lesion_ids else 'duplicates'
    
# Flag every metadata row by whether its lesion_id appears in `datafolder`
# (the lesions with a single image). A vectorised membership test replaces
# the per-row apply(identify_duplicates), which rebuilt and scanned the whole
# lesion list once for every row; the labels produced are the same.
_has_single_image = datafeed['lesion_id'].isin(datafolder['lesion_id'])
datafeed['duplicates'] = _has_single_image.map({True: 'unique_data', False: 'duplicates'})

# NOTE: rows are only labelled at this point; nothing is dropped from datafeed.
print('we have removed duplicates')

# Kept as the last expression so the preview is actually rendered by the cell.
datafeed.head()


In [None]:
# Show how many metadata rows carry each value of the 'duplicates' flag.
datafeed['duplicates'].value_counts()

In [None]:
# Keep only the rows flagged 'unique_data' and report the resulting shape.
is_unique_row = datafeed['duplicates'] == 'unique_data'
datafolder = datafeed.loc[is_unique_row]
datafolder.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 17% of the rows in `datafolder`, stratified on the 'dx' column
# with a fixed random_state. Only the held-out part is kept.
y = datafolder['dx']
split_parts = train_test_split(datafolder, test_size=0.17, random_state=101, stratify=y)
df_val = split_parts[1]
df_val.shape

In [None]:
# Per-'dx' row counts of the validation split.
df_val['dx'].value_counts()

In [None]:
def identify_val_rows(x):
    """Tell whether an image id belongs to the validation split.

    `x` is converted with str() and looked up in the 'image_id' column of the
    notebook-level `df_val` object as it is bound at call time.

    Returns 'val' when it is found there, otherwise 'train'.
    """
    validation_ids = list(df_val['image_id'])
    if str(x) not in validation_ids:
        return 'train'
    return 'val'

# Mark each metadata row as 'val' when its image_id is in df_val, otherwise
# 'train'. A vectorised membership test replaces the per-row
# apply(identify_val_rows), which rebuilt and scanned the validation id list
# once for every row. astype(str) mirrors the str(x) conversion done there.
_in_validation = datafeed['image_id'].astype(str).isin(df_val['image_id'])
datafeed['train_or_val'] = _in_validation.map({True: 'val', False: 'train'})

# Every row that is not held out for validation goes to training.
df_train = datafeed[datafeed['train_or_val'] == 'train']

In [None]:
# Per-'dx' row counts of the training split.
df_train['dx'].value_counts()

In [None]:
# Per-'dx' row counts of the validation split, shown again next to the
# training counts for comparison.
df_val['dx'].value_counts()

In [None]:
# Index the metadata by image_id so that a later cell can look labels up with
# datafeed.loc[image_id, 'dx'].
# NOTE: this mutates datafeed in place and image_id stops being a column, so
# this cell cannot be run twice, and cells that read datafeed['image_id']
# cannot be re-run after it without reloading the metadata.
datafeed.set_index('image_id', inplace=True)

In [None]:
import shutil
# Directory listings of the two extracted image folders, held as sets so that
# the per-image membership checks below are constant-time.
_part_1_dir = base_dir+skin_cancer_files+'/HAM_images_part_1'
_part_2_dir = base_dir+skin_cancer_files+'/HAM_images_part_2'
HAM1 = set(os.listdir(_part_1_dir))
HAM2 = set(os.listdir(_part_2_dir))
train_list = list(df_train['image_id'])
valiList = list(df_val['image_id'])


def _copy_images_by_class(image_ids, dest_root):
    """Copy each listed image into dest_root/<dx>/<image_id>.jpg.

    The class folder name is the image's 'dx' value, read from the
    notebook-level `datafeed` frame (expected to be indexed by image_id).
    Both extracted image folders are checked, in order, for '<image_id>.jpg'.
    The class folder is created on demand, so a label without a pre-created
    folder no longer aborts the copy.
    """
    for image_id in image_ids:
        fname = image_id + '.jpg'
        label = datafeed.loc[image_id, 'dx']
        class_dir = os.path.join(dest_root, label)

        for part_dir, part_files in ((_part_1_dir, HAM1), (_part_2_dir, HAM2)):
            if fname in part_files:
                os.makedirs(class_dir, exist_ok=True)
                shutil.copyfile(os.path.join(part_dir, fname), os.path.join(class_dir, fname))


# Sort the images into per-class folders, training split first.
print('Sort Pictures by Type of Skin Cancer')

_copy_images_by_class(train_list, folderfortrain)
_copy_images_by_class(valiList, validfolder)

# Report the number of files per class folder as found on disk, rather than
# from a hard-coded list of class names.
print('Images sorted by their Classes')
for _class_name in sorted(os.listdir(folderfortrain)):
    _class_dir = os.path.join(folderfortrain, _class_name)
    if os.path.isdir(_class_dir):
        print(_class_name + ': ' + str(len(os.listdir(_class_dir))))

In [None]:

# Classes to oversample. The names are the class folder names under
# folderfortrain; 'nv' is not in the list and is left untouched.
classList = ['mel','bkl','bcc','akiec','vasc','df']
print('Expand Pictures Via Class')

# Target number of files per class folder (originals plus copies).
num_aug_images_wanted = 5000

for item in classList:
    img_class = item
    class_dir = os.path.join(folderfortrain, img_class)

    # Snapshot the file list before any copy is written, so that copies made
    # in this pass are never themselves copied. This replaces the scratch
    # 'aug_dir' staging folder, whose path was derived from `datafolder`,
    # which by this cell is bound to a DataFrame rather than a path.
    img_list = os.listdir(class_dir) if os.path.isdir(class_dir) else []
    num_files = len(img_list)

    if num_files == 0:
        # Nothing to duplicate; this also avoids dividing by zero below.
        print('skipping ' + img_class + ': no images found')
        continue

    num_batches = int(np.ceil((num_aug_images_wanted/num_files)))

    # The originals count as the first batch, so num_batches - 1 further
    # rounds of copies are written. The copies are byte-identical duplicates
    # named AUG_<round>_<original name>; no image transformation is applied.
    for j in range(num_batches - 1):
        for fname in img_list:
            src = os.path.join(class_dir, fname)
            dst = os.path.join(class_dir, 'AUG_' + str(j) + '_'+ fname)
            shutil.copyfile(src, dst)

In [None]:
# Report the number of files in every class folder that exists under
# folderfortrain, instead of a hard-coded list of folder names, so the report
# always matches what is actually on disk.
print('Class-Based Images Following Augmentation')
for _class_name in sorted(os.listdir(folderfortrain)):
    _class_dir = os.path.join(folderfortrain, _class_name)
    if os.path.isdir(_class_dir):
        print(_class_name + ': ' + str(len(os.listdir(_class_dir))))

In [None]:
from PIL import Image

# Class names are the sub-directories of the training folder, in sorted order;
# a class's position in this list is its integer label.
class_names = sorted(
    entry for entry in os.listdir(folderfortrain)
    if os.path.isdir(os.path.join(folderfortrain, entry))
)
num_class = len(class_names)

image_files = []        # one list of file paths per class, parallel to class_names
image_file_list = []    # all file paths, flattened in class order
image_label_list = []   # integer label for each entry of image_file_list

for i, class_name in enumerate(class_names):
    class_dir = os.path.join(folderfortrain, class_name)
    class_paths = [os.path.join(class_dir, x) for x in os.listdir(class_dir)]
    image_files.append(class_paths)
    image_file_list += class_paths
    image_label_list += [i] * len(class_paths)

num_total = len(image_label_list)
# The size of the first listed image is taken as the reported image size.
image_width, image_height = Image.open(image_file_list[0]).size

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

print('A Sample of Image Training')

# Draw nine random training images on a 3x3 grid, each captioned with its
# class name on the x-axis label.
fig, axes = plt.subplots(3, 3, figsize=(8, 8))
sample_indices = np.random.randint(num_total, size=9)
for ax, k in zip(axes.flat, sample_indices):
    im = Image.open(image_file_list[k])
    arr = np.array(im)
    ax.set_xlabel(class_names[image_label_list[k]])
    ax.imshow(arr, vmin=0, vmax=255)
plt.tight_layout()
plt.show()

# Summary of the assembled training set.
print('Total image count:', num_total)
print('Image dimensions:', image_width, "x", image_height)
print('Label names:', class_names)
print('Label counts:', [len(class_paths) for class_paths in image_files])


In [None]:
# Pack the ../HAM10000 directory into ../HAM10000.tar.gz using the shell's tar.
# NOTE(review): both paths are hard-coded relative to '..', while the other
# cells build the HAM10000 paths from base_dir. They only refer to the same
# location if BASE_DIR points at '../'; confirm.
print('HAM10000 data set transformation and compression.')
!tar -czf ../HAM10000.tar.gz ../HAM10000
print('Transformation of the training dataset is finished.')