In [8]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline 

import cv2

import os
from keras import backend as k

In [9]:
k.clear_session()

In [10]:
# Fixed for our Cats & Dogs classes
NUM_CLASSES = 2

# Fixed for Cats & Dogs color images
CHANNELS = 3

IMAGE_RESIZE = 224
RESNET50_POOLING_AVERAGE = 'avg'
DENSE_LAYER_ACTIVATION = 'softmax'
OBJECTIVE_FUNCTION = 'binary_crossentropy'

# Common accuracy metric for all outputs, but can use different metrics for different output
LOSS_METRICS = ['accuracy']

# EARLY_STOP_PATIENCE must be < NUM_EPOCHS
NUM_EPOCHS = 20
#EARLY_STOP_PATIENCE = 3

# These steps value should be proper FACTOR of no.-of-images in train & valid folders respectively
# Training images processed in each step would be no.-of-train-images / STEPS_PER_EPOCH_TRAINING
STEPS_PER_EPOCH_TRAINING = 24
STEPS_PER_EPOCH_VALIDATION = 24

# These steps value should be proper FACTOR of no.-of-images in train & valid folders respectively
# NOTE that these BATCH* are for Keras ImageDataGenerator batching to fill epoch step input
BATCH_SIZE_TRAINING = 32
BATCH_SIZE_VALIDATION = 32

# Using 1 to easily manage mapping between test_generator & prediction for submission preparation
BATCH_SIZE_TESTING = 1

In [11]:
from tensorflow.python.keras.applications import ResNet50
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense

In [12]:
resnet_weights_path = 'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'

In [13]:
#Still not talking about our train/test data or any pre-processing.

model = Sequential()

# 1st layer as the lumpsum weights from resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
# NOTE that this layer will be set below as NOT TRAINABLE, i.e., use it as is
model.add(ResNet50(include_top = False, pooling = RESNET50_POOLING_AVERAGE, weights = resnet_weights_path))

# 2nd layer as Dense for 2-class classification, i.e., dog or cat using SoftMax activation
model.add(Dense(NUM_CLASSES, activation = DENSE_LAYER_ACTIVATION))

# Say not to train first layer (ResNet) model as it is already trained
model.layers[0].trainable = False


In [14]:
from tensorflow.python.keras import optimizers

sgd = optimizers.SGD(lr = 0.01, decay = 1e-6, momentum = 0.9, nesterov = True)
model.compile(optimizer = sgd, loss = OBJECTIVE_FUNCTION, metrics = LOSS_METRICS)

In [15]:
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing.image import ImageDataGenerator

image_size = IMAGE_RESIZE

# preprocessing_function is applied on each image but only after re-sizing & augmentation (resize => augment => pre-process)
# Each of the keras.application.resnet* preprocess_input MOSTLY mean BATCH NORMALIZATION (applied on each batch) stabilize the inputs to nonlinear activation functions
# Batch Normalization helps in faster convergence
data_generator = ImageDataGenerator(preprocessing_function=preprocess_input, validation_split=0.2)


In [16]:
train = pd.read_csv('../clean_data/mass/train/resnet_train_data.csv')

In [17]:
train.head()

Unnamed: 0.1,Unnamed: 0,path,class
0,0,0.tif,1
1,1,1.tif,1
2,2,2.tif,0
3,3,3.tif,0
4,4,4.tif,0


In [19]:
# flow_From_directory generates batches of augmented data (where augmentation can be color conversion, etc)
# Both train & valid folders must have NUM_CLASSES sub-folders

train_generator = data_generator.flow_from_dataframe(train,
                                                     directory = '../clean_data/mass/train/crops/',
                                                     x_col = 'path',
                                                     y_col = 'class',
                                                     target_size= (image_size, image_size),
                                                     batch_size = BATCH_SIZE_TRAINING,
                                                     class_mode = 'categorical',
                                                     subset = 'training')

validation_generator = data_generator.flow_from_dataframe(train,
                                                     directory = '../clean_data/mass/train/crops/',
                                                     x_col = 'path',
                                                     y_col = 'class',
                                                     target_size= (image_size, image_size),
                                                     batch_size = BATCH_SIZE_TRAINING,
                                                     class_mode = 'categorical',
                                                     subset = 'validation')

Found 621 images belonging to 2 classes.
Found 155 images belonging to 2 classes.


In [20]:
# Max number of steps that these generator will have opportunity to process their source content
# len(train_generator) should be 'no. of available train images / BATCH_SIZE_TRAINING'
# len(valid_generator) should be 'no. of available train images / BATCH_SIZE_VALIDATION'
#(BATCH_SIZE_TRAINING, len(train_generator), BATCH_SIZE_VALIDATION, len(validation_generator))

In [21]:
# Early stopping & checkpointing the best model in ../working dir & restoring that as our model for prediction
# from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint

# cb_early_stopper = EarlyStopping(monitor = 'val_loss', patience = EARLY_STOP_PATIENCE)
# cb_checkpointer = ModelCheckpoint(filepath = 'best.hdf5', monitor = 'val_loss', save_best_only = True, mode = 'auto')

In [22]:
# Grid Search is an ideal candidate for distributed machine learning
# Pseudo code for hyperparameters Grid Search

'''
from sklearn.grid_search import ParameterGrid
param_grid = {'epochs': [5, 10, 15], 'steps_per_epoch' : [10, 20, 50]}

grid = ParameterGrid(param_grid)

# Accumulate history of all permutations (may be for viewing trend) and keep watching for lowest val_loss as final model
for params in grid:
    print(params)
'''


"\nfrom sklearn.grid_search import ParameterGrid\nparam_grid = {'epochs': [5, 10, 15], 'steps_per_epoch' : [10, 20, 50]}\n\ngrid = ParameterGrid(param_grid)\n\n# Accumulate history of all permutations (may be for viewing trend) and keep watching for lowest val_loss as final model\nfor params in grid:\n    print(params)\n"

In [None]:
fit_history = model.fit_generator(
        train_generator,
        steps_per_epoch=STEPS_PER_EPOCH_TRAINING,
        epochs = NUM_EPOCHS,
        validation_data=validation_generator,
        validation_steps=STEPS_PER_EPOCH_VALIDATION,
        #callbacks=[cb_checkpointer, cb_early_stopper]
)
model.load_weights("best.hdf5")

Epoch 1/20

In [None]:
data_generator_test = ImageDataGenerator(preprocessing_function=preprocess_input)

In [None]:
test = pd.read_csv('clean_data/mass/test/resnet_test_data.csv')

In [None]:
# NOTE that flow_from_directory treats each sub-folder as a class which works fine for training data
# Actually class_mode=None is a kind of workaround for test data which too must be kept in a subfolder

# batch_size can be 1 or any factor of test dataset size to ensure that test dataset is samples just once, i.e., no data is left out
test_generator = data_generator.flow_from_dataframe(test,
                                                     directory = '../clean_data/mass/test/crops/',
                                                     x_col = 'path',
                                                     y_col = 'class',
                                                     target_size= (image_size, image_size),
                                                     batch_size = BATCH_SIZE_TRAINING,
                                                     class_mode = 'categorical')

# Try batch size of 1+ in test_generator & check batch_index & filenames in resulting batches
'''
for i in test_generator:
    #print(test_generator.batch_index, test_generator.batch_size)
    idx = (test_generator.batch_index - 1) * test_generator.batch_size
    print(test_generator.filenames[idx : idx + test_generator.batch_size])
'''

In [22]:
# Reset before each call to predict
test_generator.reset()

pred = model.predict_generator(test_generator, steps = len(test_generator), verbose = 1)

predicted_class_indices = np.argmax(pred, axis = 1)



In [23]:
import sklearn.metrics as sklearn
np.array(test['class']) == predicted_class_indices

array([False, False, False, False,  True,  True, False, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False,  True, False,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False, False, False,
       False,  True,  True,  True,  True, False, False,  True,  True,
       False, False, False,  True,  True,  True, False,  True,  True,
       False, False, False, False, False,  True,  True,  True,  True,
        True, False, False, False,  True,  True,  True,  True,  True,
       False, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False,  True,  True, False, False, False, False,
       False,  True,

In [24]:
true_indices = np.array(test['class'])

In [25]:
sklearn.confusion_matrix(true_indices, predicted_class_indices)

array([[134,  13],
       [ 87,   4]])

In [None]:
predicted_class_indices

In [None]:
results_df = pd.DataFrame(
    {
        'id': pd.Series(test_generator.filenames), 
        'label': pd.Series(predicted_class_indices)
    })
results_df['id'] = results_df.id.str.extract('(\d+)')
results_df['id'] = pd.to_numeric(results_df['id'], errors = 'coerce')
results_df.sort_values(by='id', inplace = True)

results_df.to_csv('submission.csv', index=False)
results_df.head()