# Train CNN-Model

This notebook trains our model on the pictures in the `cropped_sorted` folder and its sub-folders (`train`, `val` and `test`).

In [None]:
#load the modules
import keras
from keras import models, layers
from keras.activations import relu, softmax
from tensorflow.keras.applications import VGG19
from keras.models import Sequential, load_model, Model
from tensorflow.keras.optimizers import Adam, SGD
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Activation, Dropout, Dense, Flatten, concatenate
from matplotlib import pyplot as plt
import tensorflow as tf
import matplotlib.pyplot as plt
import sys
import pandas as pd
import numpy as np
from PIL import Image
from tensorflow.keras.metrics import TopKCategoricalAccuracy

from sklearn.preprocessing import OneHotEncoder

# Register PIL's Image module in sys.modules under the bare name 'Image'.
# NOTE(review): presumably a workaround for code that does a legacy `import Image` -- confirm it is still needed.
sys.modules['Image'] = Image

In [None]:
# Training images are streamed from disk through Keras' ImageDataGenerator, which rescales
# the pixel values and applies the rotation / zoom / shift / shear transformations configured below.
# See https://keras.io/preprocessing/image/ for more transformations.

augmentation_options = dict(
    rescale=1./255,
    rotation_range=40,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    fill_mode='nearest',
)
train_data = ImageDataGenerator(**augmentation_options)

# Read the training pictures from the class sub-folders, resized to 224x224, in shuffled batches of 32.
train_flow_options = dict(target_size=(224, 224), batch_size=32, shuffle=True)
train_generator = train_data.flow_from_directory(
    directory=r"../cropped_sorted/train",
    **train_flow_options
)

In [None]:
# Validation data generator: the images are only rescaled, no augmentation is configured.
val_data = ImageDataGenerator(rescale=1./255)

# Same image size and batch size as the training generator.
val_flow_options = dict(target_size=(224, 224), batch_size=32, shuffle=True)
val_generator = val_data.flow_from_directory(
    directory=r"../cropped_sorted/val",
    **val_flow_options
)

## Convolutional neural network

In [None]:
# Load the VGG19 convolutional base with ImageNet weights, without its original classifier head.
vgg19 = VGG19(input_shape=(224,224,3), weights='imagenet', include_top=False)

# Classifier head stacked on top of the base: dropout, flatten, two fully connected
# ReLU layers (with dropout in between) and a final softmax layer with one unit per class.
head_stack = [
    Dropout(0.5),
    Flatten(),
    Dense(255, activation='relu'),
    Dropout(0.2),
    Dense(255, activation='relu'),
    Dense(100, activation='softmax'),  # 100 = number of classes
]

# Thread the base model's output through every head layer except the final softmax.
x = vgg19.layers[-1].output
for head_layer in head_stack[:-1]:
    x = head_layer(x)
predictors = head_stack[-1](x)

# Create the model mapping the VGG19 input to the class probabilities.
vgg19model = Model(vgg19.input, predictors)

vgg19model

In [None]:
# Print a layer-by-layer summary of the assembled model to check its architecture.
vgg19model.summary()

In [None]:
# File that receives the model checkpoint.
filepath = "../models/VGG19_model.h5"

# Save the model only when the validation loss is lower than the best value seen so far.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Stop training after 3 consecutive epochs without improvement
# (EarlyStopping is left on its default monitored quantity).
early_stopping_monitor = EarlyStopping(patience=3)

callbacks_list = [checkpoint, early_stopping_monitor]

In [None]:
# Compile the model with the Adam optimizer and a top-5 categorical accuracy metric.
# `learning_rate` replaces the deprecated `lr` keyword (newer Keras versions reject `lr`),
# and the metric is passed inside a list, the form Keras documents for `metrics`.
vgg19model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-5),
    metrics=[tf.keras.metrics.TopKCategoricalAccuracy(k=5)],
)

In [None]:
# Train the model.
# The step counts are derived from the generators instead of hard-coded image counts,
# so they stay correct when the contents of the train/val folders change.
batch_size = train_generator.batch_size
model_history = vgg19model.fit(
    train_generator,
    # number of full batches in the training set (at least one step)
    steps_per_epoch=max(1, train_generator.samples // batch_size),
    epochs=30,
    validation_data=val_generator,
    # number of full batches in the validation set (at least one step)
    validation_steps=max(1, val_generator.samples // val_generator.batch_size),
    callbacks=callbacks_list,
)

### A second iteration with a smaller learning rate

In [None]:
# The second training run writes its checkpoints to a separate file.
filepath = "../models/VGG19_model_lre-6.h5"

# Keep only checkpoints whose validation loss beats the best one seen so far.
checkpoint_options = dict(monitor='val_loss', verbose=1, save_best_only=True, mode='min')
checkpoint = ModelCheckpoint(filepath, **checkpoint_options)

# Abort the run after 3 consecutive epochs without improvement
# (EarlyStopping is left on its default monitored quantity).
early_stopping_monitor = EarlyStopping(patience=3)

callbacks_list = [checkpoint, early_stopping_monitor]

In [None]:
# Reload the checkpoint written by the first training run.
model = load_model("../models/VGG19_model.h5")

# Recompile with SGD and a smaller learning rate.
# `learning_rate` replaces the deprecated `lr` keyword; the metric is passed as a list.
model.compile(
    loss='categorical_crossentropy',
    optimizer=SGD(learning_rate=1e-6),
    metrics=[tf.keras.metrics.TopKCategoricalAccuracy(k=5)],
)

# Continue training. `fit` accepts generators directly and replaces the deprecated
# `fit_generator`, which also keeps this cell consistent with the first training run.
# The step counts are derived from the generators, so they no longer need to be
# adapted by hand when the number of images changes.
batch_size = train_generator.batch_size
model_history_2 = model.fit(
    train_generator,
    steps_per_epoch=max(1, train_generator.samples // batch_size),
    epochs=20,
    validation_data=val_generator,
    validation_steps=max(1, val_generator.samples // val_generator.batch_size),
    callbacks=callbacks_list,
)

In [None]:
# Plot the training curves of both training runs back to back.
#
# Both models are compiled with TopKCategoricalAccuracy(k=5) as their only metric, so the
# History objects hold 'top_k_categorical_accuracy' (the Keras default name of that metric)
# rather than 'acc'; looking up 'acc' / 'val_acc' raised a KeyError.
metric_name = 'top_k_categorical_accuracy'


def _both_runs(key):
    """Return the per-epoch values of `key` from the first run followed by the second run."""
    return model_history.history[key] + model_history_2.history[key]


fig, (loss_ax, metric_ax) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: loss
loss_ax.plot(_both_runs('loss'))
loss_ax.plot(_both_runs('val_loss'))
loss_ax.set_title('loss')
loss_ax.set_ylabel('loss')
loss_ax.set_xlabel('epoch')
loss_ax.legend(['train', 'val'], loc='upper left')

# Right panel: top-5 accuracy
metric_ax.plot(_both_runs(metric_name))
metric_ax.plot(_both_runs('val_' + metric_name))
metric_ax.set_title('top-5 accuracy')
metric_ax.set_ylabel('top-5 accuracy')
metric_ax.set_xlabel('epoch')
metric_ax.legend(['train', 'val'], loc='upper left')

plt.show()



### Testing the model + submission

Prepare the data for submission

In [None]:
# Load the saved model used for the test predictions.
# NOTE(review): this is the checkpoint file of the first training run; the second run
# saves to "../models/VGG19_model_lre-6.h5" -- confirm which checkpoint is intended here.
model=load_model("../models/VGG19_model.h5")

In [None]:
# Generator for the test pictures: rescaling only, no augmentation configured.
val_sony_datagen1 = ImageDataGenerator(rescale=1./255)

# The test folder should contain pictures of each bird in a different subfolder
# (similar to the training data set).
test_flow_options = dict(
    target_size=(224, 224),
    class_mode="categorical",
    batch_size=490,  # number of images in the testing dataset
    shuffle=False,
)
val_sony_datagen = val_sony_datagen1.flow_from_directory(
    directory="../cropped_sorted/test",
    **test_flow_options
)

In [None]:
# Pull the first batch from the test generator: x_batch holds the pictures and y_batch the
# matching labels (the generator is created with class_mode="categorical").
# The generator's batch_size is set to the number of test images, so this one batch is
# meant to contain the whole test set.
x_batch, y_batch=next(val_sony_datagen)

In [None]:
# Display the shape of the loaded image batch as a sanity check.
x_batch.shape

In [None]:
# Predict all test images with one batched `predict` call (instead of one call per image)
# and keep, for every image, the five highest-scoring classes ordered from most to least likely.
TOP_K = 5

# One row of model outputs per image in x_batch.
probabilities = model.predict(x_batch)

# argsort is ascending: take the last TOP_K columns and reverse them to get a descending ranking.
top_k_indices = np.argsort(probabilities, axis=1)[:, -TOP_K:][:, ::-1]
top_k_probabilities = np.take_along_axis(probabilities, top_k_indices, axis=1)

# Same variables as before: `results` holds one (1, n_outputs) array per image,
# the two prediction lists hold one 5-element list per image.
results = [row[np.newaxis, :] for row in probabilities]
predictions_index = top_k_indices.tolist()
predictions_probabilities = top_k_probabilities.tolist()

In [None]:
# Display the top-5 probabilities of every test image (highest first).
predictions_probabilities

In [None]:
# DISABLED EXPERIMENT -- kept for reference only.
# The code was originally disabled with a triple-quoted string that was never closed, which made
# this cell a SyntaxError. It is now commented out line by line so the notebook runs top to bottom.
# The accumulator is called `reranked` here (it used to be `list`) so that re-enabling the code
# does not overwrite the builtin `list`.
#
# This code creates a new predictions_indices list with taking:
# If first and second probabilities have a difference more than 80%, take only first prediction for top5
# If two probabilities differ more than 50%, take the higher probability thrice
# If two probabilities differ more than 20%, take the higher probability twice
#
# new_predictions_indices = []
# for i in range(0,len(predictions_probabilities)):
#     reranked = []
#     j = 0
#     numbers_taken = 0
#     while j < 4 and numbers_taken < 4:
#         if predictions_probabilities[i][j] < 0.03:
#             reranked.append(100)
#             j = j + 1
#             numbers_taken = numbers_taken + 1
#
#         elif predictions_probabilities[i][j] - predictions_probabilities[i][j+1] > 0.8:
#             for times in range(5):
#                 reranked.append(predictions_index[i][j])
#             j = j + 5
#             numbers_taken = numbers_taken + 5
#
#         elif predictions_probabilities[i][j] - predictions_probabilities[i][j+1] > 0.5 and numbers_taken < 3:
#             for times in range(3):
#                 reranked.append(predictions_index[i][j])
#             j = j + 1
#             numbers_taken = numbers_taken + 3
#
#         elif predictions_probabilities[i][j] - predictions_probabilities[i][j+1] > 0.2 and numbers_taken < 4:
#             for times in range(2):
#                 reranked.append(predictions_index[i][j])
#             j = j + 1
#             numbers_taken = numbers_taken + 2
#
#         elif numbers_taken < 4:
#             reranked.append(predictions_index[i][j])
#             j = j + 1
#             numbers_taken = numbers_taken + 1
#         else:
#             j = j + 1
#     if numbers_taken < 5:
#         if predictions_probabilities[i][j] < 0.03:
#             reranked.append(100)
#         else:
#             reranked.append(predictions_index[i][j])
#     new_predictions_indices.append(reranked)

In [None]:
# Class names (turtle_ids, i.e. the sub-folder names in train) in the order of the training
# generator's class_indices mapping; used to translate predicted indices into turtle_ids.
labels = [*train_generator.class_indices]

In [None]:
#Check that labels does not contain anything incorrect, e.g. .ipynb_checkpoints
#The correct order is the order of the sub-folders in "cropped_sorted/train"
#labels

In [None]:
# Build a one-column dataframe of image_ids, taken from the class (sub-folder) names of the
# test generator in the order of its class_indices mapping.
images_ids = [*val_sony_datagen.class_indices]
titles = ['image_id']
test_data = pd.DataFrame({titles[0]: images_ids})

In [None]:
#train_data = pd.read_csv('../data/train_corrected.csv')
#unique_turtle_ids = list(train_data['turtle_id'].unique())

# Translate every predicted class index into its label, one row of five labels per image.
# Built with a comprehension so that the builtins `list` and `id` are no longer rebound;
# rebinding them broke every later `list(...)` call when earlier cells were re-run.
array = [[labels[class_index] for class_index in top5] for top5 in predictions_index]

titles = ['prediction1', 'prediction2','prediction3','prediction4','prediction5']
submission = pd.DataFrame(array, columns=titles)

# Insert the image_ids from test_data as the first column (aligned on the row index).
submission.insert(loc=0, column='image_id', value=test_data['image_id'])
submission

In [None]:
# Save the submission dataframe as CSV, without writing the row index.
submission.to_csv('../data/submission_VGG19.csv', index = False)

## Random list for submission

If you want to compare the results with randomly chosen turtles, use the following code:

In [None]:
import random

In [None]:
# Random baseline: five random class indices for every test image.
# NOTE: this overwrites the model's `predictions_index` from the cells above.
# Built with a comprehension so that the builtin `list` is no longer rebound.
N_TEST_IMAGES = 490  # same number as the batch_size of the test generator
N_CLASSES = 100      # same number as the units of the model's softmax layer
predictions_index = [
    [random.randint(0, N_CLASSES - 1) for _ in range(5)]
    for _ in range(N_TEST_IMAGES)
]

In [None]:
# Translate every random class index into its label, one row of five labels per image.
# Built with a comprehension so that the builtins `list` and `id` are no longer rebound.
array = [[labels[class_index] for class_index in top5] for top5 in predictions_index]

titles = ['prediction1', 'prediction2','prediction3','prediction4','prediction5']
submission = pd.DataFrame(array, columns=titles)

# Insert the image_ids as the first column.
# `images_ids` is an in-memory list of ids, not a file path, so it cannot be passed to
# pd.read_csv; build the dataframe directly from the list instead.
test_data = pd.DataFrame({'image_id': images_ids})
submission.insert(loc=0, column='image_id', value=test_data['image_id'])
submission

In [None]:
# Save the random-baseline submission dataframe as CSV, without writing the row index.
submission.to_csv('../data/submission_random.csv', index = False)