# Baseline Model

The baseline model is a transfer-learning CNN model using InceptionV3. 

Input Images: Original images from the training set and the test set. The train/validation split is hard-coded by row position: the first 70% of the rows of the training dataframe are used for training and the remaining rows for validation.

Preprocessing: Pixel values are rescaled by 1/255. Augmentation can optionally be applied to the training images.

## Imports

In [None]:
# Import packages for data preparation
import numpy as np
import tensorflow as tf
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator

# Import packages for modeling
import tensorflow as tf
import tensorflow_hub as hub
import datetime
import csv
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras import datasets, layers, models, optimizers
# Load the TensorBoard notebook extension
%load_ext tensorboard

## Loading Data

In [None]:
def _read_image_table(csv_path):
    '''
    Function to load a csv-file and turn its image ids into image file names
    input: csv_path as path of the csv-file
    output: dataframe whose image_id values are stripped and end in ".JPG"
    '''
    table = pd.read_csv(csv_path)
    table.image_id = table.image_id.apply(lambda name: name.strip()+".JPG")
    return table


#Directory the image files are read from
image_dir = "../images/"

#Train and test tables, image_id holds the file name of each image
train_data = _read_image_table('../data/train.csv')
test_data = _read_image_table('../data/test.csv')

#All turtle ids seen in train.csv plus one extra category for unknown turtles
unique_turtle_ids = [*train_data['turtle_id'].unique(), "new_turtle"]

#Row index at which the training dataframe is cut into train and validation part
split = 0.7
lines = round(len(train_data)*split)

#Image dimensions expected by the model
HEIGHT = 224 #image height
WIDTH = 224 #image width
CHANNELS = 3 #image RGB channels

#Class labels, one output node per entry
CLASS_NAMES = unique_turtle_ids
NCLASSES = len(CLASS_NAMES)

#Batching and training-length settings
BATCH_SIZE = 32
SHUFFLE_BUFFER = 10 * BATCH_SIZE
TRAINING_SIZE = lines
TRAINING_STEPS = (TRAINING_SIZE // BATCH_SIZE) #full batches per epoch
AUTOTUNE = tf.data.experimental.AUTOTUNE

## Data Preparation

In [None]:
def preprocess(augment = True):
    '''
    Function to create ImageDataGenerator-Objects to augment and scale images
    input: augment=True
    output: train_datagen, test_datagen
    If augment is truthy, random augmentation plus rescaling is configured for
    train_datagen and only rescaling for test_datagen.
    If augment is falsy, only rescaling is configured for both generators.
    '''
    if augment:
        train_datagen = ImageDataGenerator(
                rotation_range     = 40,
                width_shift_range  = 0.2,
                height_shift_range = 0.2,
                # multiply every pixel value by 1/255
                # (maps 8-bit pixel values into the range [0,1])
                rescale            = 1./255,
                shear_range        = 0.2,
                zoom_range         = 0.2,
                horizontal_flip    = False,
                fill_mode          = 'nearest')
    else:
        train_datagen = ImageDataGenerator(rescale=1./255)

    # Validation/test images are never augmented, only rescaled
    test_datagen = ImageDataGenerator(rescale=1./255)

    return train_datagen, test_datagen


def use_image_generator(train_datagen, test_datagen, training=True):
    '''
    Function to apply ImageDataGenerator-Objects to the images listed in the dataframes
    input:
        train_datagen as ImageDataGenerator-Object
        test_datagen as ImageDataGenerator-Object
        training=True
    output: train_generator and validation_generator or test_generator
    If training is truthy, train_generator (built with train_datagen from the first
    `lines` rows of train_data) and validation_generator (built with test_datagen
    from the remaining rows) are returned; both yield images and one-hot labels.
    If training is falsy, test_generator is returned; it is built with test_datagen
    from test_data, is not shuffled and yields images only (no labels).
    Reads the notebook globals image_dir, train_data, test_data, lines, HEIGHT,
    WIDTH, BATCH_SIZE and CLASS_NAMES.
    '''
    # Arguments every generator shares: where the files are, which column
    # holds the file names and how the batches are shaped
    common_args = dict(
            directory   = image_dir,
            x_col       = "image_id", #name of column(in dataframe) having file names
            target_size = (HEIGHT, WIDTH),
            batch_size  = BATCH_SIZE)

    if not training:
        # Scale images for testing, no target provided and returned.
        # shuffle=False keeps the predictions in the row order of test_data
        test_generator = test_datagen.flow_from_dataframe(
                dataframe  = test_data,
                class_mode = None,
                shuffle    = False,
                **common_args)

        return test_generator

    # Additional arguments for the labelled (train/validation) generators
    labelled_args = dict(
            y_col      = "turtle_id", #name of column(in dataframe) having class/label
            classes    = CLASS_NAMES,
            class_mode = 'categorical',
            shuffle    = True,
            **common_args)

    # flow_from_dataframe: takes the dataframe and the image directory and
    # generates batches of (augmented and) scaled image data for training
    train_generator = train_datagen.flow_from_dataframe(
            dataframe = train_data[0:lines],
            **labelled_args)

    # Scale images for validation
    validation_generator = test_datagen.flow_from_dataframe(
            dataframe = train_data[lines:],
            **labelled_args)

    return train_generator, validation_generator

## Baseline-Model

In [None]:
# Clear any logs from previous runs
# (IPython shell escape: recursively deletes the ./logs/ directory,
# which is where the TensorBoard callback of this notebook writes to)
!rm -rf ./logs/

In [None]:
#Loading the transfer-learning model and freezing the layers.
# include_top=False leaves out InceptionV3's own classification head.
# The input shape is taken from the shared constants so that it always
# matches the target_size the image generators resize to.
base_model = InceptionV3(input_shape = (HEIGHT, WIDTH, CHANNELS), include_top = False, weights = 'imagenet')

#Freeze every layer of the pretrained model, only layers added later are trained
for layer in base_model.layers:
    layer.trainable = False

#Epochs for fitting the model
EPOCHS = 10

In [None]:
#Adding some layers to be trained for this task
x = layers.Flatten()(base_model.output)
x = layers.Dense(1024, activation='relu')(x)
x = layers.Dropout(0.2)(x)

#Add a final softmax layer with one node per class (NCLASSES) for classification output
x = layers.Dense(NCLASSES, activation='softmax')(x)

model = tf.keras.models.Model(base_model.input, x)

# The generators deliver one-hot labels (class_mode='categorical') and the
# output layer is a softmax over mutually exclusive classes, so the matching
# loss is categorical_crossentropy (binary_crossentropy would score every
# output node as an independent yes/no decision).
# metrics is passed as a list, as documented for Model.compile.
model.compile(
        optimizer = 'adam',
        loss = 'categorical_crossentropy',
        metrics = [tf.keras.metrics.TopKCategoricalAccuracy(k=5)])

# Every run logs into its own time-stamped TensorBoard directory
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

#Loading and preprocessing the data 
train_datagen, test_datagen = preprocess()
train_generator, validation_generator = use_image_generator(train_datagen, test_datagen, training=True)

#Fitting the model
inception =  model.fit(
        train_generator, 
        validation_data=validation_generator,
        steps_per_epoch=TRAINING_STEPS, 
        epochs=EPOCHS,
        callbacks=[tensorboard_callback])

In [None]:
#Save model
#model.save('InceptionV3')

## Prepare data for submission

In [None]:
# Build the generator for the test images. With training=False only
# test_datagen is used: the images listed in test_data are scaled and
# yielded unshuffled and without labels.
test_generator = use_image_generator(train_datagen, test_datagen, training=False)

In [None]:
#Predict the class probabilities of every test image (one row per image)
class_probabilities = model.predict(test_generator)
print(class_probabilities[0])

#Sort the class indices of each row by ascending probability ...
ascending_indices = np.argsort(class_probabilities, axis=1)
# ... and walk backwards from the last column: [:,:-6:-1] yields the five
# most probable class indices, best first ([:,-5:] would return them worst first)
y_preds = ascending_indices[:,:-6:-1]

#Keep the top 5 indices as dataframe as well
df = pd.DataFrame(y_preds)

In [None]:
#Create a DataFrame with top 5 predictions in submission form
# Translate every row of predicted class indices into the matching turtle ids.
# A comprehension with descriptive names is used so that the builtins `list`
# and `id` are not overwritten for the rest of the notebook session.
top5_turtle_ids = [[CLASS_NAMES[class_index] for class_index in row] for row in y_preds]

titles = ['prediction1', 'prediction2','prediction3','prediction4','prediction5']

submission = pd.DataFrame(top5_turtle_ids, columns= titles)

#Insert image_ids from the test csv
# The csv is read again to get the image ids without the ".JPG" suffix. They go
# into their own variable so that test_data (which holds the file names the
# test generator needs) is not overwritten.
submission_image_ids = pd.read_csv('../data/test.csv')['image_id']
submission.insert(loc=0, column='image_id', value=submission_image_ids)
submission

In [None]:
#Save submission data as CSV
submission.to_csv('../data/submission.csv', index = False)