# Model for Prediction of Image Location

The extra images have no labels for the image location. These labels have to be created so that the extra images can be used in the same way as the training images.

The image locations of training and test images were manually checked and wrong labels were corrected. The corrected dataframes were stored as *\*_corrected.csv*.

A model is fit on these labels to automatically predict the image location of the extra images.

In [None]:
# Import required packages 
import tensorflow as tf
import tensorflow_hub as hub
import datetime
import csv
import numpy as np
import pandas as pd
import seaborn as sns
from keras import optimizers
from sklearn.metrics import classification_report, confusion_matrix
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score

# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
# Clear any logs from previous runs.
# IPython shell escape: recursively removes the ./logs/ directory (relative to the
# current working directory), which is where the TensorBoard callback writes.
!rm -rf ./logs/

# Load and look at the data

In [None]:
# Read the corrected label tables and turn every image id into a file name
image_dir = "../images/"

train_data = pd.read_csv('../data/train_corrected.csv')
train_data.image_id = [name.strip() + ".JPG" for name in train_data.image_id]

test_data = pd.read_csv('../data/test_corrected.csv')
test_data.image_id = [name.strip() + ".JPG" for name in test_data.image_id]

In [None]:
# Display the training dataframe (file names in image_id already carry the ".JPG" suffix)
train_data

In [None]:
# Bar chart with the number of training images per image location
sns.countplot(data=train_data, x='image_location').set_title("Data Distribution")

In [None]:
# Display the test dataframe (file names in image_id already carry the ".JPG" suffix)
test_data

In [None]:
# Bar chart with the number of test images per image location
sns.countplot(data=test_data, x='image_location').set_title("Data Distribution")

In [None]:
# Distinct image locations in the training labels, in order of first appearance
train_data.image_location.unique()

## Define parameters

In [None]:
# Get unique_turtle_ids from train.csv
unique_turtle_ids = list(train_data['turtle_id'].unique())
# Add category for new turtle for test set
unique_turtle_ids.append("new_turtle")

# Number of rows used for training; the remaining rows form the validation split
split = 0.9
lines = round(len(train_data)*split)
length_data = len(train_data)

# Parameters for the model
HEIGHT = 224 #image height
WIDTH = 224 #image width
CHANNELS = 3 #image RGB channels
# Class order = order of first appearance in the training labels.
# The generators receive this list, so model output index i means CLASS_NAMES[i].
CLASS_NAMES = list(train_data['image_location'].unique())
# Derived from the labels so the output layer always matches the one-hot targets
NCLASSES = len(CLASS_NAMES)
BATCH_SIZE = 32
SHUFFLE_BUFFER = 10 * BATCH_SIZE
TRAINING_SIZE = lines
# Whole batches per epoch (a trailing partial batch is not counted)
TRAINING_STEPS = TRAINING_SIZE // BATCH_SIZE
AUTOTUNE = tf.data.experimental.AUTOTUNE

## Pre-processing

In [None]:
def preprocess(augment = True):
    '''
    Build the two ImageDataGenerator objects used for training and evaluation.
    input: augment=True
    output: train_datagen, test_datagen
    Both generators rescale pixel values by 1/255 (original values in [0,255] end up in [0,1]).
    If augment is true, train_datagen additionally applies random rotation, shifts,
    shear and zoom; test_datagen never augments.
    '''
    # Scaling shared by every generator
    scaling = dict(rescale=1./255)

    # Random transformations, only used for the training generator when requested
    if augment == True:
        augmentation = dict(
            rotation_range     = 40,
            width_shift_range  = 0.2,
            height_shift_range = 0.2,
            shear_range        = 0.2,
            zoom_range         = 0.2,
            horizontal_flip    = False,
            fill_mode          = 'nearest')
    else:
        augmentation = dict()

    train_datagen = ImageDataGenerator(**scaling, **augmentation)
    test_datagen  = ImageDataGenerator(**scaling)

    return train_datagen, test_datagen

In [None]:
def use_image_generator(df, train_datagen, test_datagen, training=True): 
    '''
    Function to apply ImageDataGenerator-Objects to the images listed in a dataframe
    input: 
        df: dataframe with file names in column "image_id" (and labels in
            column "image_location" when training is true)
        train_datagen as ImageDataGenerator-Object
        test_datagen as ImageDataGenerator-Object
        training=True
    output: train_generator and validation_generator or test_generator
    If training is true, the first `lines` rows of df feed train_generator (via train_datagen,
    shuffled) and the remaining rows feed validation_generator (via test_datagen, in order);
    both yield images with one-hot labels ordered like CLASS_NAMES.
    If training is false, test_generator is returned; it yields images only, in dataframe order.
    '''
    # Arguments shared by every generator created here
    common = dict(
        directory   = image_dir,
        x_col       = "image_id", #name of column(in dataframe) having file names
        target_size = (HEIGHT, WIDTH),
        batch_size  = BATCH_SIZE)

    if not training:
        # No target provided and returned; order is kept so predictions line up with df rows
        return test_datagen.flow_from_dataframe(dataframe = df,
                class_mode = None,
                shuffle    = False,
                **common)

    # Label handling shared by the training and the validation generator
    labelled = dict(common,
        y_col      = "image_location", #name of column(in dataframe) having class/label
        classes    = CLASS_NAMES,
        class_mode = 'categorical')

    # Training batches are shuffled so that each batch does not depend on the row order of df
    train_generator = train_datagen.flow_from_dataframe(dataframe = df[0:lines],
            shuffle = True,
            **labelled)

    # Validation images keep their order
    validation_generator = test_datagen.flow_from_dataframe(dataframe = df[lines:],
            shuffle = False,
            **labelled)

    return train_generator, validation_generator

## Create model and train

In [None]:
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import datasets, layers, models

# Pretrained InceptionV3 without its classification head.
# The input shape is taken from the shared constants so it always matches the generators' target_size.
base_model = InceptionV3(input_shape = (HEIGHT, WIDTH, CHANNELS), include_top = False, weights = 'imagenet')

# Number of passes over the training data
EPOCHS = 2

In [None]:
# Freeze the pretrained base so only the new classification head is trained
for layer in base_model.layers:
    layer.trainable = False

# New head on top of the base model's output
x = layers.Flatten()(base_model.output)
x = layers.Dense(1024, activation='relu')(x)
x = layers.Dropout(0.2)(x)

# Add a final softmax layer with one node per class for classification output
x = layers.Dense(NCLASSES, activation='softmax')(x)

model = tf.keras.models.Model(base_model.input, x)

# The labels are one-hot encoded (class_mode='categorical') and the output is a softmax,
# so the matching loss is categorical cross-entropy. With it, 'accuracy' is the
# categorical accuracy rather than a per-output binary accuracy.
model.compile(optimizer = tf.keras.optimizers.Adam(1e-5), loss = 'categorical_crossentropy', metrics = ['accuracy'])

# One TensorBoard log directory per run, named by start time
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

train_datagen, test_datagen = preprocess()
train_generator, validation_generator = use_image_generator(train_data, train_datagen, test_datagen, training=True)
    
inception =  model.fit(
        train_generator, 
        validation_data=validation_generator,
        steps_per_epoch=TRAINING_STEPS, 
        epochs=EPOCHS,
        callbacks=[tensorboard_callback])

## Check confusion matrix & auc_score on test data with corrected image location

In [None]:
# Generators without augmentation; only the scaling generator is used for the test images
train_datagen, test_datagen = preprocess(augment=False)
test_generator = use_image_generator(
    df=test_data,
    train_datagen=train_datagen,
    test_datagen=test_datagen,
    training=False)

In [None]:
# Predict the class probabilities for every test image and keep
# only the index of the most probable image location per image
y_preds = model.predict(test_generator).argmax(axis=1)

In [None]:
# Get locations for test_data in numerical form.
# The generators are created with classes=CLASS_NAMES, so model output index i stands
# for CLASS_NAMES[i]. The true labels are encoded with the same mapping; an alphabetically
# sorted encoding only agrees with it by coincidence. Unknown labels raise a ValueError.
test_loc = test_data['image_location'].map(CLASS_NAMES.index).to_numpy()

In [None]:
# Accuracy on the first line, confusion matrix (rows: true class, columns: predicted class) below
print(accuracy_score(test_loc, y_preds), confusion_matrix(test_loc, y_preds), sep="\n")

In [None]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro", class_names=None, ax=None):
    '''
    Plot one-vs-rest ROC curves per class and return the averaged ROC AUC score.
    input:
        y_test: true class indices
        y_pred: predicted class indices (hard labels, so each curve has a single operating point)
        average: averaging mode passed to roc_auc_score
        class_names: list mapping class index -> display name, defaults to CLASS_NAMES
        ax: matplotlib axes to draw on, defaults to the current axes
    output: ROC AUC score over all classes found in y_test
    '''
    if class_names is None:
        class_names = CLASS_NAMES
    if ax is None:
        ax = plt.gca()

    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)

    # With exactly two classes LabelBinarizer returns a single column describing the
    # second class, otherwise one column per class in the order of lb.classes_
    plotted_classes = lb.classes_ if y_test.shape[1] > 1 else lb.classes_[-1:]

    for (idx, class_index) in enumerate(plotted_classes):
        fpr, tpr, thresholds = roc_curve(y_test[:,idx].astype(int), y_pred[:,idx])
        # Legend entry is looked up by class index so it always names the plotted class
        ax.plot(fpr, tpr, label = '%s (AUC:%0.2f)'  % (class_names[class_index], auc(fpr, tpr)))
    ax.plot([0, 1], [0, 1], 'b-', label = 'Random Guessing')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    # Without a legend the labels given above are never shown
    ax.legend()
    return roc_auc_score(y_test, y_pred, average=average)

In [None]:
# Create the figure for the ROC curves, then draw them and show the averaged AUC score
fig, c_ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
multiclass_roc_auc_score(y_test=test_loc, y_pred=y_preds)

## Predict the image location of the extra images with the trained model

In [None]:
# Read the table of extra images and turn every image id into a file name
extra = pd.read_csv('../data/extra_images.csv')
extra.image_id = [name.strip() + ".JPG" for name in extra.image_id]

In [None]:
# Display the extra images dataframe (file names in image_id already carry the ".JPG" suffix)
extra

In [None]:
# Generators without augmentation; only the scaling generator is used for the extra images
train_datagen, test_datagen = preprocess(augment=False)
test_generator_extra = use_image_generator(
    df=extra,
    train_datagen=train_datagen,
    test_datagen=test_datagen,
    training=False)

In [None]:
# Predict the class probabilities for every extra image and keep
# only the index of the most probable image location per image
y_preds = model.predict(test_generator_extra).argmax(axis=1)
y_preds

In [None]:
# Create a DataFrame with the top prediction for every extra image.
# Predicted class indices are translated back to location names; the names `list` and `id`
# are deliberately not used as variables because rebinding them would shadow the builtins
# for every cell executed afterwards.
predicted_locations = [CLASS_NAMES[class_index] for class_index in y_preds]
image_location = pd.DataFrame(predicted_locations, columns=['image_location'])

# Reload the table so that image_id holds the original ids (without ".JPG")
extra = pd.read_csv('../data/extra_images.csv')

# Predictions are matched to rows by position, so a differing length would
# attach locations to the wrong images
if len(image_location) != len(extra):
    raise ValueError(
        "Number of predictions (%d) does not match number of extra images (%d)"
        % (len(image_location), len(extra)))

extra.insert(loc=1, column='image_location', value=image_location['image_location'])
extra

In [None]:
# Write the extra images table including the predicted image_location column to CSV (without the index)
extra.to_csv(path_or_buf='../data/extra_images_loc.csv', index=False)

In [None]:
# Preview the first 50 rows of the labelled extra images
extra.head(n=50)