In [None]:
import os
import re
import cv2
import h5py
import keras
import random
import numpy as np # linear algebra
import splitfolders  # or import split_folders
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import tensorflow as tf
from tqdm.notebook import tqdm

In [None]:
# --- Preprocessing configuration ---------------------------------------------
# Square target sizes (in pixels) to export; the processing loop iterates over these.
LIST_SIZE   = [224] #[128, 160, 192, 224, 256]
# Current target size; read by get_data / gen_data and rebound by the processing loop.
IMG_SIZE    = 224
# Root of the raw dataset (one sub-directory per class).
# A raw string literal cannot end with a single backslash (it would escape the
# closing quote and the cell would not parse), so no trailing separator is used.
ROOT_PATH   = r'E:\HK7\Tieu_Luan_Chuyen_Nganh\Interior Design'
NAME_FOLDER = r'SkinCancer_Data' #r'Interior Design'
# Floating-point dtype used for the exported arrays; 'default' is an alias for float64.
FLOATX_R    = r'float32' #float16, float32, float64(default)
FLOATX_R    = r'float64' if (FLOATX_R == r'default') else FLOATX_R
# Class labels are the sub-directory names of ROOT_PATH. Stray files are ignored
# because every label is later opened as a directory, and the list is sorted so
# that a label's index (its class number) is stable between runs.
LABELS      = sorted(entry for entry in os.listdir(ROOT_PATH)
                     if os.path.isdir(os.path.join(ROOT_PATH, entry)))
# Folder (inside the working directory) that receives the train/val split.
DATA_PATH = os.path.join(os.getcwd(), NAME_FOLDER)
# Folder (inside the working directory) that receives the generated files, e.g. "224x224".
OUTPUT_PATH = os.path.join(os.getcwd(), str(IMG_SIZE) + r'x' + str(IMG_SIZE))
# File-name prefix of every generated file, e.g. "[224x224][float32]_".
PREFIX_OUT  = "[{size}][{floatx}]_".format(size = str(IMG_SIZE) + r'x' + str(IMG_SIZE), 
                                          floatx = FLOATX_R)
# Width used to right-justify the '='-padded section banners.
SIZE_PADDING = 60

In [None]:
# Show the sorted class labels; a label's position in this list is its class index.
print(LABELS)

In [None]:
# Section banner.
print(r' SPLIT FOLDER ROOT'.rjust(SIZE_PADDING, '='))

# The split is only produced once: an existing DATA_PATH is taken as "already split".
if os.path.exists(DATA_PATH):
    print(r'Folder already exists...!')
else:
    # A two-element ratio splits into a training and a validation set only.
    splitfolders.ratio(
        ROOT_PATH,
        output=NAME_FOLDER,
        seed=1337,
        ratio=(0.8, 0.2),
        group_prefix=None,
    )
    print(r'Folder already create...!')
print(r'Done')

In [None]:
# Section banner: the title right-justified to SIZE_PADDING columns with '=' as fill.
print(r' DEFINE FUNCTION CHECK THE DIRECTORY CONTAINING THE OUTPUT'.rjust(SIZE_PADDING, '='))
def check_output_folder(OUTPUT_PATH):
    """Make sure the output directory exists, creating it when necessary.

    Args:
        OUTPUT_PATH: Path of the directory that will receive the generated
            files. Missing parent directories are created as well.

    Raises:
        FileExistsError: If ``OUTPUT_PATH`` exists but is not a directory.
    """
    if os.path.isdir(OUTPUT_PATH):
        print(r'The directory containing the output already exists : ' + OUTPUT_PATH)
        return

    # makedirs also creates missing parents, and exist_ok tolerates the directory
    # appearing between the check above and this call. A regular file at this
    # path still raises instead of being reported as an existing directory.
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    print(r'The directory containing the output already create : ' + OUTPUT_PATH)
# These prints run when the cell is executed (i.e. when the function above is
# defined), not when check_output_folder is called.
print(r'Done')
# Shows the current module-level OUTPUT_PATH value.
print(OUTPUT_PATH)

In [None]:
# Section banner: the title right-justified to SIZE_PADDING columns with '=' as fill.
print(r' DEFINE FUNCTION GET DATA'.rjust(SIZE_PADDING, '='))
def get_data(data_dir):
    """Load every image below ``data_dir`` together with its class index.

    ``data_dir`` must contain one sub-directory per entry of the module-level
    ``LABELS`` list; an image's class index is the position of its directory
    name in ``LABELS``. Each image is read with OpenCV as a 3-channel colour
    image and resized to ``IMG_SIZE`` x ``IMG_SIZE`` (module-level setting).
    Files that cannot be read or resized are reported and skipped.

    Args:
        data_dir: Directory holding the per-label sub-directories.

    Returns:
        numpy.ndarray: Object array of shape ``(n_images, 2)``. Column 0 holds
        the resized image arrays, column 1 the integer class index.
    """
    print(r'Start get the file...')
    samples = []
    for class_num, label in enumerate(tqdm(LABELS, desc='Labels')):
        path = os.path.join(data_dir, label)
        desc = label.rjust(15, '_') + ' : '
        for img in tqdm(os.listdir(path), desc=desc, leave=False):
            img_path = os.path.join(path, img)
            try:
                # NOTE(review): OpenCV is expected to return BGR channel order here;
                # confirm that the consumer of the exported data expects that.
                img_arr = cv2.imread(img_path, cv2.IMREAD_COLOR)
                if img_arr is None:
                    # imread reports an unreadable file by returning None rather
                    # than raising, so handle it explicitly.
                    print('Unable to read image file')
                    print(img_path)
                    continue
                resized_arr = cv2.resize(img_arr, (IMG_SIZE, IMG_SIZE))
                samples.append((resized_arr, class_num))
            except Exception as e:
                # Best effort: report the problem file and keep going.
                print(e)
                print(img_path)
    print('Get data successfully!')

    # Build the (n, 2) object array explicitly. np.array() on (image, int) pairs
    # is a ragged construction that recent NumPy versions reject with ValueError.
    result = np.empty((len(samples), 2), dtype=object)
    for row, (image, class_index) in enumerate(samples):
        result[row, 0] = image
        result[row, 1] = class_index
    return result
# Printed when the cell is executed (function defined), not when get_data is called.
print(r'Done')

In [None]:
# Section banner: the title right-justified to SIZE_PADDING columns with '=' as fill.
print(r' DEFINE FUNCTION GENERGATE DATA'.rjust(SIZE_PADDING, '='))
def gen_data(data, filename):
    """Normalise image samples and store them with their labels in an HDF5 file.

    Two datasets are written to ``filename``:

    * ``'x'`` - the images scaled by 1/255, shaped
      ``(n, IMG_SIZE, IMG_SIZE, 3)`` with dtype ``FLOATX_R``;
    * ``'y'`` - the class indices, cast to ``FLOATX_R``.

    ``IMG_SIZE`` and ``FLOATX_R`` are module-level settings.

    Args:
        data: Iterable of ``(image, class_index)`` pairs, e.g. the array
            returned by ``get_data``.
        filename: Destination path of the HDF5 file. An existing file at that
            path is removed first.
    """
    print(r'Start creating the file...')
    x = []
    y = []

    for feature, label in tqdm(data, desc='Labels'):
        x.append(feature)
        y.append(label)

    # Convert once to the target dtype and scale in place, which avoids a second
    # full-size temporary array for the division.
    x = np.array(x, dtype=FLOATX_R)
    x /= 255

    # Enforce the (n_samples, height, width, channels) layout.
    x = x.reshape(-1, IMG_SIZE, IMG_SIZE, 3)
    y = np.array(y, dtype=FLOATX_R)

    if os.path.exists(filename):
        os.remove(filename)
        print('Removed file old : ', filename)

    # The context manager closes the file even if writing a dataset fails.
    with h5py.File(filename, 'w') as hf:
        hf.create_dataset('x', data=x)
        hf.create_dataset('y', data=y)

    print(r'Successful file creation!')
# Printed when the cell is executed (function defined), not when gen_data is called.
print(r'Done')

In [None]:
# Section banner: the title right-justified to SIZE_PADDING columns with '=' as fill.
print(r' DEFINE FUNCTION GET & GENERATE LABELS'.rjust(SIZE_PADDING, '='))
def save_file_labels(LABELS):
    """Store the class labels as ``<PREFIX_OUT>Labels.npy`` inside ``OUTPUT_PATH``.

    ``OUTPUT_PATH`` and ``PREFIX_OUT`` are module-level settings. An existing
    file is kept only when it already holds exactly these labels; a stale or
    unreadable file is rewritten so that it stays in sync with the datasets.

    Args:
        LABELS: Sequence of class names; the position of a name is its class index.
    """
    print(r'Start creating the file...')
    labels_arr = np.array(LABELS)
    labels_path = os.path.join(OUTPUT_PATH, PREFIX_OUT + r'Labels.npy')

    if os.path.exists(labels_path):
        try:
            stored = np.load(labels_path).tolist()
        except (OSError, ValueError, EOFError):
            # Unreadable or corrupt file: treat it as stale and rewrite it below.
            stored = None
        if stored == labels_arr.tolist():
            print(r'File labels already exist : ' + labels_path)
            return

    np.save(labels_path, labels_arr)
    print(r'File labels already create : ' + labels_path)
# Printed when the cell is executed (function defined), not when save_file_labels is called.
print(r'Done')

In [None]:
print(r' PROCESSING LOAD DATA'.rjust(SIZE_PADDING, '='))

# The location of the split does not depend on the image size: resolve it once.
DATA_PATH = os.path.join(os.getcwd(), NAME_FOLDER)
# Only directories are dataset splits; a stray file would make get_data fail.
# Sorting keeps the processing order deterministic.
split_names = sorted(entry for entry in os.listdir(DATA_PATH)
                     if os.path.isdir(os.path.join(DATA_PATH, entry)))

for size in LIST_SIZE:
    # get_data / gen_data / save_file_labels read these module-level names, so
    # they have to be rebound for every target size.
    IMG_SIZE = size
    size_tag = str(IMG_SIZE) + r'x' + str(IMG_SIZE)
    OUTPUT_PATH = os.path.join(os.getcwd(), size_tag)
    PREFIX_OUT  = "[{size}][{floatx}]_".format(size = size_tag, floatx = FLOATX_R)
    print('\n')
    check_output_folder(OUTPUT_PATH)
    print(" [{size}][{floatx}]".format(size = size_tag, floatx = FLOATX_R).rjust(SIZE_PADDING, '='))

    # The labels are the same for every split, so write them once per size.
    save_file_labels(LABELS)

    for item in split_names:
        path = os.path.join(DATA_PATH, item)
        filename = os.path.join(OUTPUT_PATH, PREFIX_OUT + item.title() + r'.h5')
        data = get_data(path)
        gen_data(data, filename)
        # Drop this split before loading the next one so that two splits are
        # never held in memory at the same time.
        del data
        print(r''.rjust(SIZE_PADDING, '='))

print(r'Done')