# Data Augmentation Study

This notebook studies the impact of data augmentation on our training 
process. In this study, we will use a VGG16-based architecture for
comparison.

Reference:
 - https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html
 - https://towardsdatascience.com/cnn-architectures-a-deep-dive-a99441d18049

In [1]:
from pandas import read_csv
from typing import Tuple
from gc import collect
import matplotlib.pyplot as plt
import numpy as np
from h5py import File
from sklearn.metrics import accuracy_score, confusion_matrix, \
    classification_report
from tensorflow import ConfigProto, Session
from tensorflow.keras.backend import set_session, clear_session
from tensorflow.keras.callbacks import CSVLogger, LearningRateScheduler, \
    ModelCheckpoint
from tensorflow.keras.layers import Conv2D, Dense, Flatten, Input, MaxPool2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.random import set_random_seed


In [2]:
# Improve GPU memory utilisation: let TensorFlow grow its GPU allocation on
# demand rather than reserving the whole device up front.
config = ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): device-placement logging is verbose; kept as in the original.
config.log_device_placement = True
sess = Session(config=config)
set_session(sess)

# Fix random seeds. The TensorFlow seed alone is not enough: Keras shuffling
# and ImageDataGenerator's random transforms use NumPy's global RNG, so that
# is seeded as well to keep runs comparable.
np.random.seed(324)
set_random_seed(324)



### Model Creation Function

In [3]:
# VGG-16
def _vgg_conv_block(layer, filters: int, num_convs: int):
    """Stack ``num_convs`` 3x3 same-padded ReLU convolutions, then 2x2 max-pool."""
    for _ in range(num_convs):
        # NOTE(review): "he_normal" is an unusual choice for biases (zeros is
        # the common default); kept unchanged to preserve training behaviour.
        layer = Conv2D(filters=filters, kernel_size=(3, 3), strides=1,
                       padding="same", activation="relu",
                       kernel_initializer="he_normal",
                       bias_initializer="he_normal")(layer)
    return MaxPool2D(pool_size=(2, 2), strides=2)(layer)


def create_vgg16(input_shape: Tuple[int, int, int], num_classes: int) -> Model:
    """Build and compile a VGG16-style image classifier.

    The network has five convolutional blocks with (2, 2, 3, 3, 3)
    convolutions of (64, 128, 256, 512, 512) filters, each block ending in
    2x2 max-pooling. These are followed by two 4096-unit ReLU dense layers
    and a softmax output layer.

    Args:
        input_shape: Shape of a single input sample, passed to
            ``Input(shape=...)``, e.g. ``(256, 256, 3)``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The model, compiled with Adam (learning rate 1e-4), categorical
        cross-entropy loss and the accuracy metric.
    """
    # (filters, number of convolutions) for each block, in network order.
    block_specs = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    inputs = Input(shape=input_shape)

    layer = inputs
    for filters, num_convs in block_specs:
        layer = _vgg_conv_block(layer, filters, num_convs)

    layer = Flatten()(layer)
    for _ in range(2):
        layer = Dense(units=4096, activation="relu",
                      kernel_initializer="he_normal",
                      bias_initializer="he_normal")(layer)
    layer = Dense(num_classes, activation="softmax")(layer)

    model = Model(inputs=inputs, outputs=layer)
    model.compile(optimizer=Adam(learning_rate=0.0001),
                  loss="categorical_crossentropy", metrics=["accuracy"])

    return model


### Other Functions

In [4]:
def import_dataset(filepath: str = "./dataset.hdf5") \
        -> Tuple[np.ndarray, np.ndarray, np.ndarray, 
                 np.ndarray, np.ndarray, np.ndarray]:
    """Load the train/validation/test splits from an HDF5 file.

    Each dataset is read fully into memory (``[()]``), so the returned
    arrays stay valid after the file has been closed.

    Args:
        filepath: Path to an HDF5 file holding the datasets ``tr_data``,
            ``val_data``, ``ts_data``, ``tr_labels``, ``val_labels`` and
            ``ts_labels``.

    Returns:
        ``(train_data, val_data, test_data,
        train_labels, val_labels, test_labels)``.
    """
    # The context manager closes the file handle even if a read fails; the
    # original left the handle open for the lifetime of the kernel.
    with File(filepath, "r") as file:
        train_data = file.get("tr_data")[()]
        val_data = file.get("val_data")[()]
        test_data = file.get("ts_data")[()]
        train_labels = file.get("tr_labels")[()]
        val_labels = file.get("val_labels")[()]
        test_labels = file.get("ts_labels")[()]

    return train_data, val_data, test_data, \
           train_labels, val_labels, test_labels


def get_test_results(test_model: Model, test_data: np.ndarray, 
                     test_labels: np.ndarray) -> Tuple:
    """Score a model's predictions against reference labels.

    Predictions and labels are both reduced to class indices with an argmax
    over axis 1 (labels are presumably one-hot encoded; confirm against the
    dataset).

    Args:
        test_model: Model whose ``predict`` output has one score per class.
        test_data: Input samples passed to ``test_model.predict``.
        test_labels: Reference labels, one row per sample.

    Returns:
        ``(accuracy, confusion_matrix, classification_report)``, where the
        report uses the class names car / heavy vehicles / motorcycle.
    """
    class_names = ["car", "heavy vehicles", "motorcycle"]

    predicted_classes = np.argmax(test_model.predict(test_data), axis=1)
    true_classes = np.argmax(test_labels, axis=1)

    accuracy = accuracy_score(true_classes, predicted_classes)
    matrix = confusion_matrix(true_classes, predicted_classes)
    report = classification_report(true_classes, predicted_classes,
                                   target_names=class_names)

    return accuracy, matrix, report


def get_learn_rate(epoch: int) -> float:
    """Step-decay learning-rate schedule for ``LearningRateScheduler``.

    The rate starts at 1e-4 and drops by a factor of ten once the epoch
    index exceeds each of 10, 20, 30, 40 and 50.

    Args:
        epoch: Epoch index supplied by the callback.

    Returns:
        The learning rate to use for that epoch (also printed).
    """
    # Thresholds are checked from highest to lowest. The original tested
    # ``epoch > 10`` first, which shadowed every later branch, so the rate
    # never dropped below 1e-5.
    schedule = ((50, 1e-9), (40, 1e-8), (30, 1e-7), (20, 1e-6), (10, 1e-5))

    lr = 1e-4
    for threshold, rate in schedule:
        if epoch > threshold:
            lr = rate
            break

    print(f"Learning rate: {lr}")

    return lr



### Data Initialisation


In [5]:
# Load every split from the default HDF5 dataset file
tr_dat, val_dat, ts_dat, tr_lbls, val_lbls, ts_lbls = import_dataset()

# Model dimensions are derived from the training arrays: the number of
# classes is the label width, the input shape is axes 1-3 of the data
num_cls = tr_lbls.shape[1]
in_shape = (tr_dat.shape[1], tr_dat.shape[2], tr_dat.shape[3])

# Shared by both training runs below
lr_scheduler = LearningRateScheduler(get_learn_rate)
model = None

# The test split is not used in this notebook, so release it straight away
del ts_dat, ts_lbls
collect()

### Model Creation & Training

In [6]:
# No data augmentation
model = create_vgg16(in_shape, num_cls)
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# Keep the weights of the epoch with the lowest validation loss
checkpoint = ModelCheckpoint("./trained_models/no_augmentation_best.hdf5", 
                             monitor="val_loss", verbose=0, 
                             save_best_only=True, mode="min")
# Per-epoch metrics, read back later for the plots
logger = CSVLogger("./training_logs/no_augmentation_log.csv")

model.fit(tr_dat, tr_lbls, batch_size=32, validation_data=(val_dat, val_lbls), 
          epochs=20, verbose=2, shuffle=True, 
          callbacks=[checkpoint, logger, lr_scheduler])

# Also keep the final-epoch model alongside the best checkpoint
model.save("./trained_models/no_augmentation_20epoch.hdf5")


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 256, 256, 3)]     0         
_________________________________________________________________
conv2d (Conv2D)              (None, 256, 256, 64)      1792      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 256, 256, 64)      36928     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 128, 128, 64)      0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 128, 128, 128)     73856     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 128, 128, 128)     147584    
_________

In [6]:
# Use data augmentation: random rotations, shifts, zooms and horizontal
# flips are applied to the training images on the fly
data_gen = ImageDataGenerator(
    rotation_range=45, width_shift_range=0.2, height_shift_range=0.2,
    zoom_range=0.2, horizontal_flip=True)

model = create_vgg16(in_shape, num_cls)
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# Keep the weights of the epoch with the lowest validation loss
checkpoint = ModelCheckpoint("./trained_models/augmented_best.hdf5", 
                             monitor="val_loss", verbose=0, 
                             save_best_only=True, mode="min")
# Per-epoch metrics, read back later for the plots
logger = CSVLogger("./training_logs/augmented_log.csv")

# steps_per_epoch must be a whole number of batches. Rounding up makes the
# final partial batch count, so each epoch covers the whole training set.
model.fit_generator(
    data_gen.flow(tr_dat, tr_lbls, batch_size=32, shuffle=True), 
    steps_per_epoch=int(np.ceil(len(tr_dat) / 32)), epochs=20, verbose=2, 
    callbacks=[checkpoint, logger, lr_scheduler], 
    validation_data=(val_dat, val_lbls))

# Also keep the final-epoch model alongside the best checkpoint
model.save("./trained_models/augmented_20epoch.hdf5")


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 256, 256, 3)]     0         
_________________________________________________________________
conv2d (Conv2D)              (None, 256, 256, 64)      1792      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 256, 256, 64)      36928     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 128, 128, 64)      0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 128, 128, 128)     73856     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 128, 128, 128)     147584    
_________

In [None]:
# Training is finished, so the training arrays are no longer needed:
# drop both references, then run the garbage collector
del tr_dat, tr_lbls
collect()


### Data Visualisation

In [None]:
# Per-model evaluation results, keyed by model name
acc_scores = {}
conf_matrices = {}
class_reports = {}

#### Before Data Augmentation

In [None]:
# Rebuild the architecture and restore the checkpoint with the lowest
# validation loss from the un-augmented training run
model_unaugmented = create_vgg16(in_shape, num_cls)
model_unaugmented.load_weights("./trained_models/no_augmentation_best.hdf5")
model_unaugmented.compile(optimizer=Adam(learning_rate=0.0001),
                          loss="categorical_crossentropy", 
                          metrics=["accuracy"])
# Scored on the validation split
acc_scores["unaugmented"], conf_matrices["unaugmented"], \
class_reports["unaugmented"] \
    = get_test_results(model_unaugmented, val_dat, val_lbls)

print(f"Validation accuracy before augmentation: {acc_scores['unaugmented']}")
print("Confusion Matrix:")
print(conf_matrices['unaugmented'])
print(class_reports['unaugmented'])

# Per-epoch training history written by the CSVLogger callback
unaugmented_log = read_csv("./training_logs/no_augmentation_log.csv")

plt.style.use("ggplot")
plt.figure(figsize=[10, 7.5])
plt.subplot(211)
plt.xticks([0, 5, 10, 15, 20])
plt.plot(unaugmented_log["loss"], label="train loss")
plt.plot(unaugmented_log["val_loss"], label="validation loss")
plt.title("Loss")
plt.legend(loc="upper left", bbox_to_anchor=(1.0, 1.0))

plt.subplot(212)
plt.xticks([0, 5, 10, 15, 20])
plt.plot(unaugmented_log["acc"], label="train accuracy")
plt.plot(unaugmented_log["val_acc"], label="validation accuracy")
plt.title("Accuracy")
plt.xlabel("Epoch")
plt.legend(loc="upper left", bbox_to_anchor=(1.0, 1.0))

# Free memory: drop the model reference first, so that the session reset
# and the garbage collector can actually reclaim it
del model_unaugmented
clear_session()
collect()


#### After Data Augmentation

In [None]:
# Rebuild the architecture and restore the checkpoint with the lowest
# validation loss from the augmented training run
model_augmented = create_vgg16(in_shape, num_cls)
model_augmented.load_weights("./trained_models/augmented_best.hdf5")
model_augmented.compile(optimizer=Adam(learning_rate=0.0001),
                        loss="categorical_crossentropy", metrics=["accuracy"])
# Scored on the validation split
acc_scores["augmented"], conf_matrices["augmented"], \
class_reports["augmented"] \
    = get_test_results(model_augmented, val_dat, val_lbls)

print(f"Validation accuracy after augmentation: {acc_scores['augmented']}")
print("Confusion Matrix:")
print(conf_matrices['augmented'])
print(class_reports['augmented'])

# Per-epoch training history written by the CSVLogger callback
augmented_log = read_csv("./training_logs/augmented_log.csv")

plt.style.use("ggplot")
plt.figure(figsize=[10, 7.5])
plt.subplot(211)
plt.xticks([0, 5, 10, 15, 20])
plt.plot(augmented_log["loss"], label="train loss")
plt.plot(augmented_log["val_loss"], label="validation loss")
plt.title("Loss")
plt.legend(loc="upper left", bbox_to_anchor=(1.0, 1.0))

plt.subplot(212)
plt.xticks([0, 5, 10, 15, 20])
plt.plot(augmented_log["acc"], label="train accuracy")
plt.plot(augmented_log["val_acc"], label="validation accuracy")
plt.title("Accuracy")
plt.xlabel("Epoch")
plt.legend(loc="upper left", bbox_to_anchor=(1.0, 1.0))

# Free memory: drop the model reference first, so that the session reset
# and the garbage collector can actually reclaim it
del model_augmented
clear_session()
collect()



#### Cross-Model Comparison

In [None]:
# Overlay the validation curves of both training runs
plt.style.use("ggplot")
plt.figure(figsize=[10, 7.5])
plt.subplot(211)
plt.xticks([0, 5, 10, 15, 20])
plt.plot(unaugmented_log["val_loss"], label="Before Augmentation")
plt.plot(augmented_log["val_loss"], label="After Augmentation")
plt.title("Validation Loss")
plt.legend(loc="upper left", bbox_to_anchor=(1.0, 1.0))

plt.subplot(212)
plt.xticks([0, 5, 10, 15, 20])
# The legend labels now match the loss subplot. The original "VGG-11" and
# "VGG-13" labels did not describe the two runs compared here.
plt.plot(unaugmented_log["val_acc"], label="Before Augmentation")
plt.plot(augmented_log["val_acc"], label="After Augmentation")
plt.title("Validation Accuracy")
plt.xlabel("Epoch")
plt.legend(loc="upper left", bbox_to_anchor=(1.0, 1.0))


### Conclusion

**TODO — to be updated:** the conclusions below have not yet been revised for this data augmentation study.

From the above, we can observe that overfitting occurs in all models 
between the 5th and 10th epochs, and that the maximum validation accuracy 
is about 65%. However, this is before any regularisation measures are taken.

Based on the cross-model comparison, it can be seen that 
VGG-16 and VGG-16(Conv1) achieved the lowest validation loss, and that at 
those points VGG-16(Conv1) achieved the higher validation accuracy.

Therefore, we shall proceed to optimise the VGG-16(Conv1) model with
hyperparameter tuning, followed by regularisation.