# 1. Setup

In [None]:
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import shutil
import os

In [None]:
# Shared roots from which every location below is derived.
_WORKSPACE_ROOT = 'workspace'
_IMAGES_ROOT = os.path.join(_WORKSPACE_ROOT, 'images')
_KFOLD_ROOT = os.path.join(_IMAGES_ROOT, 'kfold')

# Filesystem locations used by the rest of the notebook.
paths = dict(
    TRAIN_PATH=os.path.join(_KFOLD_ROOT, 'train'),
    TEST_PATH=os.path.join(_KFOLD_ROOT, 'test'),
    IMAGES_PATH=os.path.join(_IMAGES_ROOT, 'all'),
    ANNOTATION_PATH=os.path.join(_WORKSPACE_ROOT, 'annotations'),
    RESULTS_PATH=os.path.join(_WORKSPACE_ROOT, 'results'),
)

In [None]:
# Image size (in pixels) handed to the networks and the data generators.
input_height, input_width = 69, 69
# Number of images per mini-batch.
batch_size = 32

# Augmentation flag; nothing in the visible code reads it.
augmentation = False

# 2. Dynamic dataset rebalancing/folders creation

In [None]:
# Function used to reset the dataset folders used in each different iteration
def emptyFolders():
    """Create the per-class train/test folders and delete the files in them.

    For both ``paths["TRAIN_PATH"]`` and ``paths["TEST_PATH"]`` the class
    sub-folders ``0`` .. ``9`` are created when missing, then every entry
    inside them is removed. ``.DS_Store`` entries are left in place.
    """
    for root in (paths["TRAIN_PATH"], paths["TEST_PATH"]):
        for label in range(10):
            class_dir = os.path.join(root, str(label))
            # exist_ok=True replaces the check-then-create pattern (no race
            # between the test and the creation) and also creates the
            # missing parent folders in the same call.
            os.makedirs(class_dir, exist_ok=True)
            for file in os.listdir(class_dir):
                if file != ".DS_Store":
                    os.remove(os.path.join(class_dir, file))

# Load the annotation table and discard every row with a missing value.
annotations_csv = os.path.join(paths['ANNOTATION_PATH'], "annotations.csv")
df = pd.read_csv(annotations_csv).dropna()

In [None]:
import random
import keras.utils as image
import matplotlib.pyplot as plt

# Function used to dynamically delete all the elements of a class (Class 5)
def remove_images(label):
    """Delete every entry found in the training folder of class ``label``."""
    class_dir = os.path.join(paths['TRAIN_PATH'], str(label))
    for entry in os.listdir(class_dir):
        os.remove(os.path.join(class_dir, entry))

# Function used to dynamically undersample the classes in the training set
def undersample(label, n):
    """Delete ``n`` randomly chosen entries from the training folder of ``label``.

    Args:
        label: class whose folder under ``paths['TRAIN_PATH']`` is reduced.
        n: number of entries to delete. When the folder holds fewer than
            ``n`` entries, all of them are deleted.
    """
    class_dir = os.path.join(paths['TRAIN_PATH'], str(label))
    img_names = os.listdir(class_dir)
    # random.sample raises ValueError when asked for more items than exist;
    # fold sizes vary, so clamp the request to what is actually there.
    n_to_remove = min(n, len(img_names))
    for name in random.sample(img_names, n_to_remove):
        os.remove(os.path.join(class_dir, name))

# Function used to dynamically oversample the classes in the training set
def augment_images(label, number_images, datagen, target_size=(69, 69)):
    """Write ``number_images`` augmented images into the folder of ``label``.

    The images already present in the class folder are used as sources and
    cycled through as often as needed. Each new image is saved in the same
    folder as ``aug_<index>.jpg``.

    Args:
        label: class whose folder under ``paths['TRAIN_PATH']`` is enlarged.
        number_images: number of images to generate (converted with ``int``).
        datagen: object whose ``flow(x, batch_size=1)`` yields transformed
            batches of the single-image array ``x``.
        target_size: size passed to ``load_img`` for the source images.

    Nothing is generated when ``number_images`` is not positive or when the
    class folder holds no source files.
    """
    number_images = int(number_images)
    path = os.path.join(paths['TRAIN_PATH'], str(label))
    # List the sources once, so the images generated below are never picked
    # up as sources themselves on a later pass over the folder.
    sources = [f for f in os.listdir(path)
               if f != ".DS_Store" and os.path.isfile(os.path.join(path, f))]
    # With no sources the loop below would never advance and spin forever.
    if number_images <= 0 or not sources:
        return
    i = 0
    while i < number_images:
        for f in sources:
            img = image.load_img(os.path.join(path, f), target_size=target_size)
            x = image.img_to_array(img)
            # add a leading batch axis of size one
            x = x.reshape((1,) + x.shape)
            for batch in datagen.flow(x, batch_size=1):
                new_image = image.array_to_img(batch[0])
                new_image.save(os.path.join(path, "aug_" + str(i) + ".jpg"))
                i += 1
                # only the first generated batch of each source is used
                break
            # ">=" (not ">") so exactly number_images files are written
            if i >= number_images:
                return

# Function used to dynamically rename the folders after deleting Class 5
def renameDirectories():
    """Shift the class folders ``6`` .. ``9`` down to ``5`` .. ``8``.

    The renaming is applied to both ``paths['TRAIN_PATH']`` and
    ``paths['TEST_PATH']``. A folder that already sits at the destination
    name is deleted with its content before the rename.

    Raises:
        OSError: if one of the source folders does not exist.
    """
    for root in (paths['TRAIN_PATH'], paths['TEST_PATH']):
        for i in range(6, 10):
            src = os.path.join(root, str(i))
            dst = os.path.join(root, str(i - 1))
            # os.rename cannot replace a non-empty directory (and no existing
            # directory at all on Windows). The class-5 folder of the test set
            # is never emptied beforehand, so clear the destination first.
            if os.path.isdir(dst):
                shutil.rmtree(dst)
            os.rename(src, dst)

# Function used to coordinate all the previous ones
def rebalanceTrainingSet():
    """Rebalance the training folders in four fixed steps.

    1. Delete every image of class 5.
    2. Undersample classes 1 and 2 by 2017 and 1551 images respectively.
    3. Oversample classes 3, 4, 6, 7, 8 and 9 with augmented images.
    4. Rename the class folders that follow the deleted one.
    """
    remove_images(5)

    # (class, number of images to delete)
    for label, n_to_delete in zip((1, 2), (2017, 1551)):
        undersample(label, n_to_delete)

    # Random transformations applied to produce the additional images.
    augmenter = ImageDataGenerator(
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.1,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
    )

    # (class, number of images to generate)
    targets = zip((3, 4, 6, 7, 8, 9), (600, 400, 600, 500, 500, 600))
    for label, n_to_generate in targets:
        augment_images(label, n_to_generate, augmenter)

    renameDirectories()

# Function used to plot the training distribution in order to check the balancing
def plotTrainingDistribution():
    """Count, plot and print the number of files per training class.

    Returns:
        dict mapping each class index ``0`` .. ``8`` to the number of regular
        files found in its folder under ``paths['TRAIN_PATH']``.
    """
    counts = {}
    for label in range(9):
        class_dir = os.path.join(paths['TRAIN_PATH'], str(label))
        counts[label] = sum(
            1
            for entry in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, entry))
        )
    plt.bar(list(counts.keys()), counts.values(), color='g')
    plt.show()
    print(counts)
    return counts

# 3. K-Fold CV

In [None]:
# Number of folds (passed as n_splits to StratifiedKFold below)
K = 5

In [None]:
import keras
# Training with K-fold cross validation
kf = StratifiedKFold(n_splits=K, random_state=None, shuffle=False)

# Keep only the annotations that point to a .jpg file, restricted to the two
# columns used afterwards. A vectorised mask replaces the row-by-row
# pd.concat, which copied the growing frame on every iteration.
is_jpg = df['Path'].str.endswith(".jpg", na=False)
df_new = df.loc[is_jpg, ['Path', 'Class']].reset_index(drop=True)

# Return value (the number of splits) is not used.
kf.get_n_splits(df_new)
df_results = pd.DataFrame()

In [None]:
# Creation of the scratch model to be evaluated during K-Fold CV
import matplotlib.pyplot as plt
import keras.optimizers as optimizers
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, BatchNormalization
from keras import models
from keras import regularizers
import math
import tensorflow as tf
import keras.backend as KB

# Loss used when compiling both models: categorical cross-entropy, the usual
# choice for multi-class classification. It measures the distance between two
# probability distributions: the one output by the network and the true
# distribution of the labels.
loss_function='categorical_crossentropy'

def precision(y_true, y_pred):
    """Batch-wise precision.

    Both tensors are clipped to [0, 1] and rounded; the result is the number
    of positions that are positive in ``y_true`` and ``y_pred`` divided by the
    number of positive positions in ``y_pred`` (plus the backend epsilon, to
    avoid a division by zero). The value is computed on the batch it is
    called with, not on the whole dataset.
    """
    hits = KB.round(KB.clip(y_true * y_pred, 0, 1))
    selected = KB.round(KB.clip(y_pred, 0, 1))
    return KB.sum(hits) / (KB.sum(selected) + KB.epsilon())


def recall(y_true, y_pred):
    """Batch-wise recall.

    Both tensors are clipped to [0, 1] and rounded; the result is the number
    of positions that are positive in ``y_true`` and ``y_pred`` divided by the
    number of positive positions in ``y_true`` (plus the backend epsilon, to
    avoid a division by zero). The value is computed on the batch it is
    called with, not on the whole dataset.
    """
    hits = KB.round(KB.clip(y_true * y_pred, 0, 1))
    relevant = KB.round(KB.clip(y_true, 0, 1))
    return KB.sum(hits) / (KB.sum(relevant) + KB.epsilon())

# Metrics passed to compile(): the two custom batch-wise metrics defined
# above plus categorical accuracy, reported under the name 'acc'.
metrics = [precision, recall, tf.keras.metrics.CategoricalAccuracy(name='acc')]

# Training hyper-parameters.
optimizer = 'rmsprop'
optimizer_learning_rate = 1e-4
epochs = 100
batch_size = 32
# Simultaneous L1 and L2 penalty, both with a factor of 0.001.
regularizer = regularizers.l1_l2(l1=0.001, l2=0.001)

# Replace the optimizer name with an instance using the chosen learning rate.
if optimizer == 'rmsprop':
    optimizer = optimizers.RMSprop(learning_rate=optimizer_learning_rate)

In [None]:
def create_scratch_CNN(width, height, depth, num_classes,filters=(16, 32, 64)):
    """Build the CNN trained from scratch (returned uncompiled).

    Args:
        width, height, depth: size of the input images; the network input
            shape is ``(height, width, depth)``.
        num_classes: number of units of the final softmax layer.
        filters: number of filters of each convolutional stage, in order.

    Returns:
        A ``Sequential`` model made of one CONV => BN => POOL stage per entry
        of ``filters``, followed by
        FLATTEN => DROPOUT => FC(64, relu) => BN => FC(num_classes, softmax).
    """
    channel_axis = -1
    net = models.Sequential()

    for position, n_filters in enumerate(filters):
        conv_kwargs = {"activation": "relu", "padding": "same"}
        # only the very first layer declares the input shape
        if position == 0:
            conv_kwargs["input_shape"] = (height, width, depth)
        net.add(Conv2D(n_filters, (3, 3), **conv_kwargs))
        net.add(BatchNormalization(axis=channel_axis))
        net.add(MaxPooling2D(pool_size=(2, 2)))

    # classifier head
    head = (
        Flatten(),
        Dropout(0.5),
        Dense(64, activation="relu"),
        BatchNormalization(axis=channel_axis),
        Dense(num_classes, activation="softmax"),
    )
    for layer in head:
        net.add(layer)
    return net

# Build one scratch network with four convolutional stages and print its layers.
model = create_scratch_CNN(
    width=input_width,
    height=input_height,
    depth=3,
    num_classes=9,
    filters=(16, 32, 64, 128),
)
model.summary()

In [None]:
from keras.applications import VGG16

def create_finetuned_VGG16():
    """Build the fine-tuned VGG16 classifier (returned uncompiled).

    The ImageNet-pretrained VGG16 convolutional base is loaded without its
    fully connected top. Every layer before the first one whose name starts
    with ``block5`` is frozen; that layer and all the following ones stay
    trainable. A Flatten => Dense(64, relu) => Dense(9, softmax) head is
    stacked on top of the base.

    Returns:
        The resulting ``Sequential`` model.
    """
    conv_base = VGG16(weights='imagenet',
                      include_top=False,  # exclude fully connected layer
                      # channels-last shape is (height, width, channels),
                      # the same order create_scratch_CNN uses
                      input_shape=(input_height, input_width, 3))
    conv_base.trainable = True
    set_trainable = False
    for layer in conv_base.layers:
        if layer.name.startswith("block5"):  # fine tune layers from block5_*
            set_trainable = True
        layer.trainable = set_trainable
    built_model = models.Sequential()
    built_model.add(conv_base)
    # add fully connected layer
    built_model.add(Flatten())
    built_model.add(Dense(64, activation='relu'))
    built_model.add(Dense(9, activation='softmax'))
    return built_model

In [None]:
# Builds one result row from the metrics logged at the last epoch of a fit.
def _last_epoch_row(model_name, fold, history):
    logs = history.history
    return {
        'model': model_name,
        'k': fold,
        'epochs': epochs,
        'acc': logs['acc'][-1],
        'test_acc': logs['val_acc'][-1],
        'test_recall': logs['val_recall'][-1],
        'test_precision': logs['val_precision'][-1],
    }


# Returns an optimizer that no other model has used: a string identifier is
# passed through unchanged, an instance is re-created from its configuration.
def _fresh_optimizer():
    if isinstance(optimizer, str):
        return optimizer
    return type(optimizer).from_config(optimizer.get_config())


# Copies every image listed in `frame` from IMAGES_PATH into
# <destination_root>/<Class>/ and returns one message per failed copy.
def _copy_fold_images(frame, destination_root):
    failures = []
    for img_path, img_class in zip(frame['Path'], frame['Class']):
        try:
            shutil.copy(os.path.join(paths["IMAGES_PATH"], img_path),
                        os.path.join(destination_root, str(img_class), img_path))
        except OSError as err:
            # best effort: a missing image must not abort the whole fold,
            # but it must not go unnoticed either
            failures.append("%s: %s" % (img_path, err))
    return failures


df_results = pd.DataFrame()
print(df_results)
i = 0
for train_index, test_index in kf.split(df_new, df_new['Class']):
    # i is zero-based; the banner shows the 1-based fold number
    print("====== K-Fold CV Iteration: %d/%d =======" % (i + 1, K))
    emptyFolders()
    print("Folders empty..")
    print("Train: ", len(train_index))
    print("Test: ", len(test_index))
    df_train = df_new.iloc[train_index]
    df_test = df_new.iloc[test_index]
    # dynamically fill the folders with the fold's images
    copy_failures = (_copy_fold_images(df_train, paths["TRAIN_PATH"])
                     + _copy_fold_images(df_test, paths["TEST_PATH"]))
    if copy_failures:
        print("Warning: %d images could not be copied (first: %s)"
              % (len(copy_failures), copy_failures[0]))
    print("Images moved...")
    # rebalancing
    rebalanceTrainingSet()
    files_per_label = plotTrainingDistribution()

    # target_size is expressed as (height, width)
    image_size = (input_height, input_width)

    # training set image data generator
    train_datagen = ImageDataGenerator(rescale=1./255)
    train_dir = paths['TRAIN_PATH']
    train_generator = train_datagen.flow_from_directory(
        train_dir, target_size=image_size, batch_size=batch_size,
        class_mode='categorical')

    # validation set image data generator
    val_datagen = ImageDataGenerator(rescale=1./255)
    validation_dir = paths['TEST_PATH']
    validation_generator = val_datagen.flow_from_directory(
        validation_dir, target_size=image_size, batch_size=batch_size,
        class_mode='categorical')

    # Each model gets its own optimizer: a shared instance would carry its
    # internal state and any learning-rate reduction from one model (and one
    # fold) over to the next.
    VGG16model = create_finetuned_VGG16()
    VGG16model.compile(optimizer=_fresh_optimizer(),
                       loss=loss_function,
                       metrics=metrics)

    scratchModel = create_scratch_CNN(input_width, input_height, 3, 9, (16,32,64,128))
    scratchModel.compile(optimizer=_fresh_optimizer(),
                         loss=loss_function,
                         metrics=metrics)

    num_classes = 9
    tot_images = sum(files_per_label.values())
    # NOTE(review): these class weights are computed but never passed to the
    # training calls below - confirm whether class_weight=weights was intended.
    weights = {class_label: tot_images / (num_classes * n_images)
               for class_label, n_images in files_per_label.items()}

    # number of files available for the evaluation of this fold
    n_images_eval = 0
    for j in range(9):
        path = os.path.join(paths['TEST_PATH'], str(j))
        n_images_eval += sum(1 for f in os.listdir(path)
                             if os.path.isfile(os.path.join(path, f)))

    number_training = tot_images
    number_eval = n_images_eval
    train_steps = int(math.ceil((1. * number_training) / batch_size))
    eval_steps = int(math.ceil((1. * number_eval) / batch_size))

    # NOTE(review): early stopping has a shorter patience (2) than the
    # learning-rate reduction (3), so training will usually stop before the
    # learning rate is ever reduced - confirm this is intended.
    callbacks_list = [
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=2
        ),
        keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2, # divides LR by 5 when triggered
            patience=3 # called when stopped improving for 3 epochs
        )
    ]

    # fit() accepts generators directly; fit_generator() is deprecated
    history = scratchModel.fit(
        train_generator,
        steps_per_epoch=train_steps,
        epochs=epochs,
        validation_data=validation_generator,
        callbacks=callbacks_list,
        validation_steps=eval_steps)
    new_row = _last_epoch_row('scratch', i + 1, history)
    df_results = pd.concat([df_results, pd.DataFrame([new_row])], ignore_index=True)

    history = VGG16model.fit(
        train_generator,
        steps_per_epoch=train_steps,
        epochs=epochs,
        validation_data=validation_generator,
        callbacks=callbacks_list,
        validation_steps=eval_steps)
    new_row = _last_epoch_row('VGG16', i + 1, history)
    df_results = pd.concat([df_results, pd.DataFrame([new_row])], ignore_index=True)

    i += 1
    print(df_results)

In [None]:
# Create the results folder when it is missing (to_csv does not create
# parent folders), then write the per-fold metrics without the index column.
os.makedirs(paths['RESULTS_PATH'], exist_ok=True)
df_results.to_csv(os.path.join(paths['RESULTS_PATH'], "resultsKFold.csv"), index=False)

In [None]:
# Bare expression: shows the results table when run as the last line of a notebook cell
df_results