In [None]:
# Rather than importing everything manually, we'll make things easy
#   and load them all in utils.py, and just import them from there.
%matplotlib inline
import utils; reload(utils)
from utils import *

## Introduction

We need to find a way to convert the imagenet predictions to a probability of being a cat or a dog, since that is what the Kaggle competition requires us to submit. We could use the imagenet hierarchy to download a list of all the imagenet categories in each of the dog and cat groups, and could then solve our problem in various ways, such as:

- Finding the largest probability that's either a cat or a dog, and using that label
- Averaging the probability of all the cat categories and comparing it to the average of all the dog categories.

But these approaches have some downsides:

- They require manual coding for something that we should be able to learn from the data
- They ignore information available in the predictions; for instance, if the model predicts that there is a bone in the image, it's more likely to be a dog than a cat.

A very simple solution to both of these problems is to learn a linear model that is trained using the 1,000 predictions from the imagenet model for each image as input, and the dog/cat label as target.

In [None]:
%matplotlib inline
from __future__ import division,print_function
import os, json
from glob import glob
import numpy as np
import scipy
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt
import utils; reload(utils)
from utils import plots, get_batches, plot_confusion_matrix, get_data

In [None]:
from numpy.random import random, permutation
from scipy import misc, ndimage
from scipy.ndimage.interpolation import zoom

import keras
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.models import Sequential
from keras.layers import Input
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, RMSprop
from keras.preprocessing import image

## Train linear model on predictions

Using a Dense() layer in this way, we can easily convert the 1,000 predictions given by our model into a probability of dog vs cat--simply train a linear model to take the 1,000 predictions as input, and return dog or cat as output, learning from the Kaggle data. This should be easier and more accurate than manually creating a map from imagenet categories to one dog/cat category

### Training the model

We start with some basic config steps. We copy a small amount of our data into a 'sample' directory, with the exact same structure as our 'train' directory--this is *always* a good idea in *all* machine learning, since we should do all of our initial testing using a dataset small enough that we never have to wait for it.

In [None]:
# Root of the dataset. Uncomment the 'sample' line to run against the small sample copy instead.
#path = "data/dogscats/sample/"
path = "data/dogscats/"
# Directory used below for cached arrays and saved weights; created if it does not exist yet.
model_path = path + 'models/'
if not os.path.exists(model_path): os.mkdir(model_path)

We will process as many images at a time as our graphics card allows. This is a case of trial and error to find the max batch size - the largest size that doesn't give an out of memory error.

In [None]:
# Images per batch for prediction/training; lower this if the GPU runs out of memory.
batch_size = 64

We need to start with our VGG 16 model, since we'll be using its predictions and features.

In [None]:
# Load the pre-trained VGG16 wrapper and keep a handle on its underlying Keras model.
# The wrapper class is instantiated as `Vgg16` further down in this notebook, so it is
# imported here under that exact capitalisation.
from vgg16 import Vgg16
vgg = Vgg16()
model = vgg.model

Our overall approach here will be
1. Get the true label for every image
2. Get the 1,000 imagenet category predictions for every image
3. Feed these predictions as input to a linear classifier

Let's start by grabbing training and validation batches.

In [None]:
# Generators over the validation and training directories. shuffle=False keeps a fixed
# order, which the later cells rely on when pairing `.classes` / `.filenames` with arrays.
val_batches = get_batches(path+'valid/', shuffle = False, batch_size = 1)
batches = get_batches(path+'train/', shuffle = False, batch_size =1)

Loading and resizing the images every time we want to use them isn't necessary - instead we should save the processed arrays. By far the fastest way to save and load numpy arrays is using bcolz. This also compresses the arrays, so we save disk space. Here are the functions we'll use to save and load using bcolz.

In [None]:
import bcolz


def save_array(fname, arr):
    """Store `arr` as a bcolz carray rooted at `fname` (opened with mode 'w') and flush it to disk."""
    carr = bcolz.carray(arr, rootdir=fname, mode = 'w')
    carr.flush()


def load_array(fname):
    """Open the bcolz carray stored at `fname` and return its full contents via a [:] slice."""
    stored = bcolz.open(fname)
    return stored[:]

We have provided a simple function that joins the arrays from all the batches - let's use this to grab the training and validation data:

In [None]:
# get_data() expects the image *directory*: as the reference definition below shows, it
# builds its own unshuffled, batch_size=1 generator from `path` and concatenates the batches.
# Passing an existing batch iterator (as this cell used to) does not match that signature.
val_data = get_data(path+'valid/')

# Reference (from utils.py; target_size presumably has a default there - confirm):
#def get_data(path, target_size):
#    batches = get_batches(path, shuffle=False, batch_size=1, class_mode=None, target_size=target_size)
#    return np.concatenate([batches.next() for i in range(batches.nb_sample)])

In [None]:
#Try this also
# NOTE(review): everywhere else in this notebook get_batches() receives a directory path as
# its first argument; passing the `val_batches` iterator here looks like leftover scratch
# code and probably fails - confirm against utils.get_batches before relying on it.
val_data_dummy = get_batches(val_batches)

In [None]:
# Load every training image into one array. get_data() takes the image directory
# (it creates its own unshuffled generator), not an existing batch iterator.
trn_data = get_data(path+'train/')

In [None]:
# Display the dimensions of the stacked training image array.
trn_data.shape

In [None]:
# Cache the image arrays on disk so they need not be rebuilt from the image files next time.
# The validation file is named 'valid_data.bc' so that it matches the name read back by the
# load_array() cell that follows (and the 'valid_*' naming of the other cached arrays).
save_array(model_path+ 'train_data.bc', trn_data)
save_array(model_path+ 'valid_data.bc', val_data)

We can load our training and validation data later without recalculating them:

In [None]:
# Reload the cached image arrays from model_path instead of re-reading the images.
trn_data = load_array(model_path+'train_data.bc')
val_data = load_array(model_path+'valid_data.bc')

In [None]:
# Display the dimensions of the validation image array.
val_data.shape

Keras returns *classes* as a single column, so we convert to one hot encoding

In [None]:
def onehot(x):
    """Convert a 1-D array of class ids into a dense one-hot 2-D ndarray.

    `x` is reshaped to a single column (the -1 lets numpy infer the row count from the
    array length), then encoded with scikit-learn's OneHotEncoder; fit_transform fits the
    encoder on the data and transforms that same data in one step.
    """
    # toarray() returns an ndarray directly, whereas todense() returns an np.matrix that
    # would have to be wrapped in np.array() again.
    return OneHotEncoder().fit_transform(x.reshape(-1,1)).toarray()

In [None]:
# Integer class ids, in generator order, for each split.
val_classes = val_batches.classes
trn_classes = batches.classes   # training ids must come from the training generator
# One-hot targets for Keras. The plural names are what the fit/evaluate cells use; the
# singular aliases are kept because some later cells refer to them.
val_labels = onehot(val_classes)
trn_labels = onehot(trn_classes)
val_label = val_labels
trn_label = trn_labels

In [None]:
# Display the dimensions of the one-hot training labels.
trn_labels.shape

In [None]:
# Peek at the first four integer class ids.
trn_classes[:4]

...and their 1,000 imagenet probabilities from VGG16--these will be the *features* for our linear model:

In [None]:
# Run the VGG16 model over the image arrays; its outputs become the inputs ("features")
# of the linear model trained below.
trn_features = model.predict(trn_data, batch_size = batch_size)
val_features = model.predict(val_data, batch_size = batch_size)

In [None]:
# Display the dimensions of the training feature array.
trn_features.shape

In [None]:
# Cache the model outputs on disk so the slow predict() step can be skipped next time.
save_array(model_path+ 'train_lastlayer_features.bc', trn_features)
save_array(model_path + 'valid_lastlayer_features.bc', val_features)

In [None]:
# We can load our training and validation features later without recalculating them:

In [None]:
# Reload the cached last-layer outputs written by the save_array() cell above.
trn_features = load_array(model_path+'train_lastlayer_features.bc')
val_features = load_array(model_path+'valid_lastlayer_features.bc')

Now we can define our linear model, just like we did earlier:

In [None]:
# Linear model: a single Dense layer with softmax, mapping the 1,000 imagenet
# probabilities to 2 outputs (cat / dog).
lm = Sequential([Dense(2, activation = 'softmax', input_shape = (1000,))])
# 'categorical_crossentropy' is the Keras loss identifier to use with one-hot targets.
lm.compile(optimizer = RMSprop(lr=0.1), loss = 'categorical_crossentropy', metrics = ['accuracy'])

We're ready to fit the model!

In [None]:
# Batch size used for fitting the linear model (same value as set earlier in the notebook).
batch_size = 64

In [None]:
# Train the linear model on the in-memory feature arrays, reporting validation metrics
# after each of the 3 epochs.
lm.fit(trn_features, trn_labels, nb_epoch=3, batch_size=batch_size, 
       validation_data=(val_features, val_labels))
# Note: this uses fit() on arrays already held in memory, not fit_generator().

In [None]:
# Print the layer-by-layer summary of the linear model.
lm.summary()

### Viewing model prediction examples

Keras' *fit()* function conveniently shows us the value of the loss function, and the accuracy, after every epoch ("*epoch*" refers to one full run through all training examples). The most important metrics for us to look at are for the validation set, since we want to check for over-fitting. 

- **Tip**: with our first model we should try to overfit before we start worrying about how to handle that - there's no point even thinking about regularization, data augmentation, etc if you're still under-fitting! (We'll be looking at these techniques shortly).

As well as looking at the overall metrics, it's also a good idea to look at examples of each of:
1. A few correct labels at random
2. A few incorrect labels at random
3. The most correct labels of each class (ie those with highest probability that are correct)
4. The most incorrect labels of each class (ie those with highest probability that are incorrect)
5. The most uncertain labels (ie those with probability closest to 0.5).

Let's see what, if anything, we can learn from these (in general, these are particularly useful for debugging problems in the model; since this model is so simple, there may not be too much to learn at this stage.)

Calculate predictions on validation set, so we can find correct and incorrect examples:

In [None]:
# We want both the classes...
preds = lm.predict_classes(val_features, batch_size = batch_size)
# ...and the probabilities of being a cat. Column 0 is taken so that `probs` is a 1-D
# array (one value per image), which is what the argsort / `1-probs` cells below expect.
probs = lm.predict_proba(val_features, batch_size = batch_size)[:,0]

In [None]:
# Peek at the first eight predicted probabilities.
probs[:8]

Get the filenames for the validation set, so we can view images:

In [None]:
# File names of the validation images, taken from the validation generator.
filenames = val_batches.filenames

In [None]:
# Number of images to view for each visualization task
n_view = 4

Helper function to plot images by index in the validation set:

In [None]:
def plots_idx(idx, titles=None):
    """Plot the validation images at positions `idx`, passing `titles` through to plots()."""
    imgs = []
    for i in idx:
        # filenames[i] is joined onto the 'valid/' directory under `path`
        imgs.append(image.load_img(path + 'valid/' + filenames[i]))
    plots(imgs, titles=titles)

In [None]:
#1. A few correct labels at random
# Column 1 of the one-hot labels is 1 for class id 1 and 0 for class id 0, so it can be
# compared directly with the predicted class ids.
correct = np.where(preds==val_labels[:,1])[0]
idx = permutation(correct)[:n_view]
plots_idx(idx, probs[idx])

In [None]:
#2. A few incorrect labels at random
# Same comparison as above, but keeping the positions where prediction and label differ.
incorrect = np.where(preds!=val_labels[:,1])[0]
idx = permutation(incorrect)[:n_view]
plots_idx(idx, probs[idx])

In [None]:
#3. The images we were most confident were cats, and are actually cats
correct_cats = np.where((preds==0) & (preds==val_labels[:,1]))[0]
# argsort is ascending, so reverse it to put the highest probabilities first
most_correct_cats = np.argsort(probs[correct_cats])[::-1][:n_view]
plots_idx(correct_cats[most_correct_cats], probs[correct_cats][most_correct_cats])

In [None]:
# as above, but dogs
correct_dogs = np.where((preds==1) & (preds==val_labels[:,1]))[0]
# `probs` is treated as the cat probability, so the most confident dogs are the smallest
# values (ascending argsort) and the titles show 1-probs.
most_correct_dogs = np.argsort(probs[correct_dogs])[:n_view]
plots_idx(correct_dogs[most_correct_dogs], 1-probs[correct_dogs][most_correct_dogs])

In [None]:
#4. The images we were most confident were cats, but are actually dogs
incorrect_cats = np.where((preds==0) & (preds!=val_labels[:,1]))[0]
# reverse the ascending argsort to put the highest probabilities first
most_incorrect_cats = np.argsort(probs[incorrect_cats])[::-1][:n_view]
plots_idx(incorrect_cats[most_incorrect_cats], probs[incorrect_cats][most_incorrect_cats])

In [None]:
#4. The images we were most confident were dogs, but are actually cats
incorrect_dogs = np.where((preds==1) & (preds!=val_labels[:,1]))[0]
# smallest values first (ascending argsort); titles show 1-probs
most_incorrect_dogs = np.argsort(probs[incorrect_dogs])[:n_view]
plots_idx(incorrect_dogs[most_incorrect_dogs], 1-probs[incorrect_dogs][most_incorrect_dogs])

In [None]:
#5. The most uncertain labels (ie those with probability closest to 0.5).
# Sort image positions by distance of the probability from 0.5, nearest first.
most_uncertain = np.argsort(np.abs(probs-0.5))
# Only the first n_view positions are plotted; the titles argument is the probabilities
# in that same sorted order.
plots_idx(most_uncertain[:n_view], probs[most_uncertain])

Perhaps the most common way to analyze the result of a classification model is to use a [confusion matrix](http://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/). Scikit-learn has a convenient function we can use for this purpose:

In [None]:
# Confusion matrix of true validation class ids against predicted class ids.
cm = confusion_matrix(val_classes, preds)

We can just print out the confusion matrix, or we can show a graphical view (which is mainly useful for dependents with a larger number of categories).

In [None]:
# Graphical view of the confusion matrix, labelled with the generator's class_indices mapping.
plot_confusion_matrix(cm, val_batches.class_indices)

# Modifying the model 

## Retrain last layer's linear model and use non-linear activation function

Since the original VGG16 network's last layer is Dense (i.e. a linear model) it seems a little odd that we are adding an additional linear model on top of it. This is especially true since the last layer had a softmax activation, which is an odd choice for an intermediate layer--and by adding an extra layer on top of it, we have made it an intermediate layer. What if we just removed the original final layer and replaced it with one that we train for the purpose of distinguishing cats and dogs? It turns out that this is a good idea - as we'll see!

We start by removing the last layer, and telling Keras that we want to fix the weights in all the other layers (since we aren't looking to learn new parameters for those other layers).

In [None]:
# Print the layer-by-layer summary of the VGG model before modifying it.
vgg.model.summary()

In [None]:
# Remove the final layer, then mark every remaining layer as not trainable so that only
# the replacement layer added next will be fitted.
model.pop()
for layer in model.layers: layer.trainable = False

In [None]:
# New 2-output softmax layer (cat vs dog) in place of the layer removed above.
model.add(Dense(2, activation = 'softmax'))

...and compile our updated model, and set up our batches to use the preprocessed images (note that now we will also *shuffle* the training batches, to add more randomness when using multiple epochs):

In [None]:
# Generator with no augmentation arguments; flow() feeds the in-memory arrays in batches.
gen = image.ImageDataGenerator()
# Training batches are shuffled; validation batches keep their original order.
batches = gen.flow(trn_data, trn_label, batch_size = batch_size, shuffle = True)
val_batches = gen.flow(val_data, val_label, batch_size = batch_size, shuffle = False)

We'll define a simple function for fitting models, just to save a little typing...

In [None]:
def fit_model(model, batches, val_batches, nb_epoch =1):
    """Fit `model` from the `batches` generator for `nb_epoch` epochs, validating on `val_batches`.

    Each epoch draws `batches.N` training samples and `val_batches.N` validation samples,
    so both iterators must expose an `N` attribute.
    """
    # The Keras 1.x keyword is `samples_per_epoch` (plural), as used by the other
    # fit_generator() calls in this notebook.
    model.fit_generator(batches, samples_per_epoch = batches.N, nb_epoch = nb_epoch,
                       validation_data=val_batches, nb_val_samples=val_batches.N)

...and now we can use it to train the last layer of our model!
(It runs quite slowly, since it still has to calculate all the previous layers in order to know what input to pass to the new final layer. We could precalculate the output of the penultimate layer, like we did for the final layer earlier - but since we're only likely to want one or two iterations, it's easier to follow this alternative approach.)

In [None]:
# Keep a handle on the optimizer so its learning rate can be changed later via K.set_value.
opt = RMSprop(lr = 0.1)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the new final layer for two epochs.
fit_model(model, batches, val_batches, nb_epoch=2)

Before moving on, go back and look at how little code we had to write in this section to finetune the model. Because this is such an important and common operation, keras is set up to make it as easy as possible. We didn't even have to use any external helper functions in this section.

It's a good idea to save weights of all your models, so you can re-use them later. Be sure to note the git log number of your model when keeping a research journal of your results.

In [None]:
# Save the weights after fine-tuning the final layer.
model.save_weights(model_path+'finetune1.h5')

In [None]:
# Restore the weights saved in the previous cell.
model.load_weights(model_path+'finetune1.h5')

In [None]:
# Evaluate the fine-tuned model on the validation images and their one-hot labels.
model.evaluate(val_data, val_labels)

We can look at the earlier prediction examples visualizations by redefining *probs* and *preds* and re-using our earlier code.

In [None]:
# Redefine preds/probs from the fine-tuned model so the earlier visualisation cells can be
# re-used; [:,0] keeps only the first output column.
preds = model.predict_classes(val_data, batch_size=batch_size)
probs = model.predict_proba(val_data, batch_size=batch_size)[:,0]

In [None]:
# Peek at the first eight probabilities from the fine-tuned model.
probs[:8]

In [None]:
# Confusion matrix for the fine-tuned model, plotted with an explicit class-name mapping.
cm = confusion_matrix(val_classes, preds)
plot_confusion_matrix(cm, {'cat':0, 'dog':1})

## Retraining more layers

Now that we've fine-tuned the new final layer, can we, and should we, fine-tune *all* the dense layers? The answer to both questions, it turns out, is: yes! Let's start with the "can we" question...

### Training multiple layers in Keras

The code below will work on any model that contains dense layers; it's not just for this VGG model.

NB: Don't skip the step of fine-tuning just the final layer first, since otherwise you'll have one layer with random weights, which will cause the other layers to quickly move a long way from their optimized imagenet weights.

In [None]:
layers = model.layers
# Get the index of the first dense layer; the test must look at each individual `layer`,
# not at the `layers` list itself (which is never a Dense instance)...
first_dense_idx = [index for index, layer in enumerate(layers) if type(layer) is Dense][0]
# ...and set this and all subsequent layers as trainable (on each layer object, not on the list)
for layer in layers[first_dense_idx:]: layer.trainable = True

Since we haven't changed our architecture, there's no need to re-compile the model - instead, we just set the learning rate. Since we're training more layers, and since we've already optimized the last layer, we should use a lower learning rate than previously.

In [None]:
# Lower the learning rate in place on the already-compiled optimizer. The Keras backend
# is imported in this notebook as capital `K`.
K.set_value(opt.lr, 0.01)
fit_model(model, batches, val_batches, 3)

This is an extraordinarily powerful 5 lines of code. We have fine-tuned all of our dense layers to be optimized for our specific data set. This kind of technique has only become accessible in the last year or two - and we can already do it in just 5 lines of python!

In [None]:
# Save the weights after fine-tuning the dense layers.
model.save_weights(model_path+'finetune2.h5')

There's generally little room for improvement in training the convolutional layers, if you're using the model on natural images (as we are). However, there's no harm trying a few of the later conv layers, since it may give a slight improvement, and can't hurt (and we can always load the previous weights if the accuracy decreases).

In [None]:
# Make every layer from index 12 onwards trainable, so that some of the later conv layers
# are fitted too. A separate loop variable is used so the `layers` list is not overwritten.
for layer in layers[12:]: layer.trainable = True
K.set_value(opt.lr, 0.01)

In [None]:
# Train for four more epochs with the additional layers unfrozen.
fit_model(model, batches, val_batches, 4)

In [None]:
# Save the weights for this stage (the Keras method is save_weights, plural).
model.save_weights(model_path+'finetune3.h5')

You can always load the weights later and use the model to do whatever you need:

In [None]:
model.load_weights(model_path+'finetune3.h5')
# Keras 1.x evaluate_generator() needs the number of samples to draw as its second
# argument; use the size of the validation array so every image is evaluated once.
model.evaluate_generator(gen.flow(val_data, val_labels, batch_size=batch_size, shuffle=False),
                         val_samples=len(val_data))

## Are we underfitting?

Our validation accuracy so far has generally been higher than our training accuracy. That leads to two obvious questions:

1. How is this possible?
2. Is this desirable?

The answer to (1) is that this is happening because of *dropout*. Dropout refers to a layer that randomly deletes (i.e. sets to zero) each activation in the previous layer with probability *p* (generally 0.5). This only happens during training, not when calculating the accuracy on the validation set, which is why the validation set can show higher accuracy than the training set.

The purpose of dropout is to avoid overfitting. By deleting parts of the neural network at random during training, it ensures that no one part of the network can overfit to one part of the training set. The creation of dropout was one of the key developments in deep learning, and has allowed us to create rich models without overfitting. However, it can also result in underfitting if overused, and this is something we should be careful of with our model.

So the answer to (2) is: this is probably not desirable. It is likely that we can get better validation set results with less (or no) dropout, if we're seeing that validation accuracy is higher than training accuracy - a strong sign of underfitting. So let's try removing dropout entirely, and see what happens!

(We had dropout in this model already because the VGG authors found it necessary for the imagenet competition. But that doesn't mean it's necessary for dogs v cats, so we will do our own analysis of regularization approaches from scratch.)

## Removing dropout

Our high level approach here will be to start with our fine-tuned cats vs dogs model (with dropout), then fine-tune all the dense layers, after removing dropout from them. The steps we will take are:
- Re-create and load our modified VGG model with binary dependent (i.e. dogs v cats)
- Split the model between the convolutional (*conv*) layers and the dense layers
- Pre-calculate the output of the conv layers, so that we don't have to redundantly re-calculate them on every epoch
- Create a new model with just the dense layers, and dropout p set to zero
- Train this new model using the output of the conv layers as training data.

As before we need to start with a working model, so let's bring in our working VGG 16 model and change it to predict our binary dependent...

In [None]:
def vgg_ft(out_dim):
    """Create a new Vgg16, call its ft(out_dim) method, and return the underlying Keras model.

    ft() is defined in vgg16.py; presumably it replaces the final layer with an
    `out_dim`-way output - confirm there.
    """
    vgg = Vgg16()
    vgg.ft(out_dim)
    model = vgg.model
    return model

# Keep the returned model; without the assignment the freshly built model is discarded
# and the following cells keep operating on whatever `model` was bound to before.
model = vgg_ft(2)

...and load our fine-tuned weights.

In [None]:
# Restore the fine-tuned weights (the Keras method is load_weights, plural).
model.load_weights(model_path+'finetune3.h5')

We're going to be training a number of iterations without dropout, so it would be best for us to pre-calculate the input to the fully connected layers - i.e. the *Flatten()* layer. We'll start by finding this layer in our model, and creating a new model that contains just the layers up to and including this layer:

In [None]:
# Shorthand for the model's layer list, used by the cells below.
layers = model.layers

In [None]:
# Index of the last Convolution2D layer ([-1] takes the final match).
last_conv_idx = [index for index, layer in enumerate(layers) if type(layer) is Convolution2D][-1]

In [None]:
# Display the index found above.
last_conv_idx

In [None]:
# Display the layer object at that index.
layers[last_conv_idx]

In [None]:
# Everything up to and including the last conv layer...
conv_layers = layers[:last_conv_idx+1]
conv_model = Sequential(conv_layers)
# ...and everything after it. Dense layers - also known as fully connected or 'FC' layers
fc_layers = layers[last_conv_idx+1:]
# Later cells refer to these lists under both the plural and the singular spelling,
# so bind both names to the same lists.
conv_layer = conv_layers
fc_layer = fc_layers

Now we can use the exact same approach to creating features as we used when we created the linear model from the imagenet predictions in the last lesson - it's only the model that has changed. As you're seeing, there's a fairly small number of "recipes" that can get us a long way!

In [None]:
# Unshuffled directory generators, so that feature rows computed from them stay in the
# same order as `.classes`.
batches = get_batches(path+'train', shuffle=False, batch_size=batch_size)
val_batches = get_batches(path+'valid', shuffle=False, batch_size=batch_size)

# Integer class ids and their one-hot encodings for both splits.
val_classes = val_batches.classes
trn_classes = batches.classes
val_labels = onehot(val_classes)
trn_labels = onehot(trn_classes)

In [None]:
# Outputs of the conv-only model for every validation image (nb_sample images in total).
val_features = conv_model.predict_generator(val_batches, val_batches.nb_sample)

In [None]:
# Outputs of the conv-only model for every training image. The result must be bound to
# `trn_features` (plural): that is the name saved and trained on in the following cells.
trn_features = conv_model.predict_generator(batches, batches.nb_sample)

In [None]:
# Cache the conv-layer outputs on disk.
save_array(model_path + 'train_convlayer_features.bc', trn_features)
save_array(model_path + 'valid_convlayer_features.bc', val_features)

In [None]:
# Reload the cached conv-layer outputs.
trn_features = load_array(model_path+'train_convlayer_features.bc')
val_features = load_array(model_path+'valid_convlayer_features.bc')

In [None]:
# Display the dimensions of the conv-layer training features.
trn_features.shape

For our new fully connected model, we'll create it using the exact same architecture as the last layers of VGG 16, so that we can conveniently copy pre-trained weights over from that model. However, we'll set the dropout layer's p values to zero, so as to effectively remove dropout.

In [None]:
# Copy the weights from the pre-trained model.
# NB: Since we're removing dropout, we want to halve the weights
def proc_wgts(layer):
    """Return a list holding each of `layer`'s weight arrays divided by 2.

    `layer` is any object exposing Keras' get_weights(); the layer itself is not modified.
    """
    return [o/2 for o in layer.get_weights()]

In [None]:
# As we have removed dropout and for such a finely tuned model needs to be updated very slowly!
opt = RMSprop(lr = 0.0001, rho = 0.7 )

In [None]:
def get_fc_model():
    """Build, initialise and compile the dense part of the network with dropout disabled.

    The layer stack has the same length as the `fc_layer` list split off the fine-tuned
    model, so weights can be copied across pairwise (after passing through proc_wgts()).
    Dropout layers are kept, with p=0, purely to keep that one-to-one pairing.

    Returns the compiled Sequential model.
    """
    model = Sequential([
        MaxPooling2D(input_shape = conv_layer[-1].output_shape[1:]),
        Flatten(),
        Dense(4096, activation = 'relu'),
        Dropout(0.),
        Dense(4096, activation = 'relu'),
        Dropout(0.),
        Dense(2, activation = 'softmax')
        ])

    # Copy the (rescaled) weights from each original layer into its counterpart.
    for l1, l2 in zip(model.layers, fc_layer) : l1.set_weights(proc_wgts(l2))

    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    # Callers assign the result (fc_model = get_fc_model()), so the model must be returned.
    return model

In [None]:
# Call the builder; without the parentheses fc_model would be the function object itself.
fc_model = get_fc_model()

And fit the model in the usual way:

In [None]:
# Fit the dense layers on the pre-computed conv features. The targets are the
# `trn_labels` array built together with these features (same unshuffled order).
fc_model.fit(trn_features, trn_labels, nb_epoch=8,
             batch_size=batch_size, validation_data=(val_features, val_labels))

In [None]:
# Save the weights of the no-dropout dense model.
fc_model.save_weights(model_path+'no_dropout.h5')

In [None]:
# Restore the weights saved in the previous cell.
fc_model.load_weights(model_path+'no_dropout.h5')

# Reducing overfitting

Now that we've gotten the model to overfit, we can take a number of steps to reduce this.

## Approaches to reducing overfitting

We do not necessarily need to rely on dropout or other regularization approaches to reduce overfitting. There are other techniques we should try first, since regularization, by definition, biases our model towards simplicity - which we only want to do if we know that's necessary. This is the order that we recommend using for reducing overfitting (more details about each in a moment):

1. Add more data
2. Use data augmentation
3. Use architectures that generalize well
4. Add regularization
5. Reduce architecture complexity.

We'll assume that you've already collected as much data as you can, so step (1) isn't relevant (this is true for most Kaggle competitions, for instance). So the next step (2) is data augmentation. This refers to creating additional synthetic data, based on reasonable modifications of your input data. For images, this is likely to involve one or more of: flipping, rotation, zooming, cropping, panning, minor color changes.

Which types of augmentation are appropriate depends on your data. For regular photos, for instance, you'll want to use horizontal flipping, but not vertical flipping (since an upside down car is much less common than a car the right way up, for instance!)

We recommend *always* using at least some light data augmentation, unless you have so much data that your model will never see the same input twice.


## About data augmentation

Keras comes with very convenient features for automating data augmentation. You simply define what types and maximum amounts of augmentation you want, and keras ensures that every item of every batch randomly is changed according to these settings. Here's how to define a generator that includes data augmentation:

In [None]:
# dim_ordering='tf' uses tensorflow dimension ordering,
#   which is the same order as matplotlib uses for display.
# Therefore when just using for display purposes, this is more convenient
# The shift keyword is `width_shift_range`; zoom is controlled by the single `zoom_range`
# argument (ImageDataGenerator has no separate width-zoom option).
gen = image.ImageDataGenerator(rotation_range = 10, width_shift_range = 0.1, height_shift_range=0.1,
       shear_range=0.15, zoom_range=0.1,
       channel_shift_range=10., horizontal_flip=True, dim_ordering='tf')

In [None]:
# Create a 'batch' of a single image: expand_dims adds a leading axis of length 1
img = np.expand_dims(ndimage.imread('cat.jpg'),0)
# Request the generator to create (augmented) batches from this image
aug_iter = gen.flow(img)

In [None]:
# Get eight examples of these augmented images; [0] takes the single image out of each
# batch and astype(np.uint8) converts it to 8-bit integers for display
aug_imgs = [next(aug_iter)[0].astype(np.uint8) for i in range(8)]

In [None]:
# The original, un-augmented image
plt.imshow(img[0])

As you can see below, there's no magic to data augmentation - it's a very intuitive approach to generating richer input data. Generally speaking, your intuition should be a good guide to appropriate data augmentation, although it's a good idea to test your intuition by checking the results of different augmentation approaches.

In [None]:
# Augmented data: plot the eight generated images with the plots() helper
plots(aug_imgs, (20,7), 2)

In [None]:
# Ensure that we return to theano dimension ordering for the rest of the notebook
K.set_image_dim_ordering('th')

## Adding data augmentation

Let's try adding a small amount of data augmentation, and see if we reduce overfitting as a result. The approach will be identical to the method we used to finetune the dense layers, except that we will use a generator with augmentation configured. Here's how we set up the generator, and create batches from it:

In [None]:
# Augmenting generator used for training: small random rotations, shifts and zooms plus
# horizontal flips. The class name is ImageDataGenerator (capital D).
gen = image.ImageDataGenerator(rotation_range=15, width_shift_range=0.1,
                               height_shift_range=0.1, zoom_range=0.1, horizontal_flip=True)

In [None]:
# Training batches come from the augmenting generator `gen`
batches = get_batches(path+'train', gen, batch_size = batch_size)
# NB: We don't want to augment or shuffle the validation set
val_batches = get_batches(path+'valid', shuffle=False, batch_size=batch_size)

When using data augmentation, we can't pre-compute our convolutional layer features, since randomized changes are being made to every input image. That is, even if the training process sees the same image multiple times, each time it will have undergone different data augmentation, so the results of the convolutional layers will be different.

Therefore, in order to allow data to flow through all the conv layers and our new dense layers, we attach our fully connected model to the convolutional model--after ensuring that the convolutional layers are not trainable:

In [None]:
# Build a fresh dense model to attach to the conv layers.
fc_model = get_fc_model()

In [None]:
# Freeze the convolutional layers before attaching the dense model
for layer in conv_model.layers: layer.trainable = False
# Look how easy it is to connect two models together!
conv_model.add(fc_model)

Now we can compile, train, and save our model as usual - note that we use *fit_generator()* since we want to pull random images from the directories on every batch.

In [None]:
# Compile the combined model with the optimizer `opt` defined earlier.
conv_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 8 epochs from the augmented generator, validating on the un-augmented set.
conv_model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=8, 
                        validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
# Continue training for 3 more epochs.
conv_model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3, 
                        validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
# Save the weights of the model trained with data augmentation.
conv_model.save_weights(model_path + 'aug1.h5')

In [None]:
# Restore the weights saved in the previous cell.
conv_model.load_weights(model_path + 'aug1.h5')

## Batch normalization

### About batch normalization

Batch normalization (*batchnorm*) is a way to ensure that activations don't become too high or too low at any point in the model. Adjusting activations so they are of similar scales is called *normalization*. Normalization is very helpful for fast training - if some activations are very high, they will saturate the model and create very large gradients, causing training to fail; if very low, they will cause training to proceed very slowly. Furthermore, large or small activations in one layer will tend to result in even larger or smaller activations in later layers, since the activations get multiplied repeatedly across the layers.

Prior to the development of batchnorm in 2015, only the inputs to a model could be effectively normalized - by simply subtracting their mean and dividing by their standard deviation. However, weights in intermediate layers could easily become poorly scaled, due to problems in weight initialization, or a high learning rate combined with random fluctuations in weights.

Batchnorm resolves this problem by normalizing each intermediate layer as well. The details of how it works are not terribly important (although I will outline them in a moment) - the important takeaway is that **all modern networks should use batchnorm, or something equivalent**. There are two reasons for this:
1. Adding batchnorm to a model can result in **10x or more improvements in training speed**
2. Because normalization greatly reduces the ability of a small number of outlying inputs to over-influence the training, it also tends to **reduce overfitting**.

As promised, here's a brief outline of how batchnorm works. As a first step, it normalizes intermediate layers in the same way as input layers can be normalized. But this on its own would not be enough, since the model would then just push the weights up or down indefinitely to try to undo this normalization. Therefore, batchnorm takes two additional steps:
1. Add two more trainable parameters to each layer - one to multiply all activations to set an arbitrary standard deviation, and one to add to all activations to set an arbitrary mean
2. Incorporate both the normalization, and the learnt multiply/add parameters, into the gradient calculations during backprop.

This ensures that the weights don't tend to push very high or very low (since the normalization is included in the gradient calculations, so the updates are aware of the normalization). But it also ensures that if a layer does need to change the overall mean or standard deviation in order to match the output scale, it can do so.

### Adding batchnorm to the model

We can use nearly the same approach as before - but this time we'll add batchnorm layers (and dropout layers):

In [None]:
# Output shape of the last conv layer, without the leading batch dimension. The list of
# conv layers is bound to the name `conv_layer` where the model is split.
conv_layer[-1].output_shape[1:]

In [None]:
def get_bn_layers(p):
    """Return a new list of dense-block layers with dropout probability `p` and batchnorm.

    The first layer takes its input shape from the output of the last layer in the
    `conv_layer` list; the final layer is a 1000-way softmax. The layers are returned
    unattached, so the caller decides which model to add them to.
    """
    return [
        MaxPooling2D(input_shape = conv_layer[-1].output_shape[1:]),
        Flatten(),
        Dense(4096, activation = 'relu'),
        Dropout(p),
        BatchNormalization(),
        Dense(4096, activation = 'relu'),
        Dropout(p),
        BatchNormalization(),
        Dense(1000, activation ='softmax')
    ]


In [None]:
# Dropout probability for the batchnorm model; passed through `p` so the value is set in
# one place.
p= 0.6
bn_model = Sequential(get_bn_layers(p))

In [None]:
##bn_model.load_weights(model_path+'finetune3.h5')

In [None]:
def proc_wgts(layers, prev_p, next_p):
    """Return the weight arrays of one layer, rescaled for a change in dropout probability.

    Despite its name, `layers` is a single layer object exposing get_weights().
    Every array is multiplied by (1-prev_p)/(1-next_p); the layer is not modified.
    Note this redefines the one-argument proc_wgts() from earlier in the notebook.
    """
    scale = (1-prev_p)/(1-next_p)
    return [o*scale for o in layers.get_weights()]

In [None]:
# Rescale the weights of every Dense layer for the dropout change 0.5 -> 0.6.
# NOTE(review): no pre-trained weights are loaded into bn_model before this cell (the
# load_weights cell above is commented out), so this appears to rescale the layers'
# initial weights - confirm that is intended.
for l in bn_model.layers:
    if type(l) == Dense: l.set_weights(proc_wgts(l, 0.5, 0.6))

In [None]:
# Remove the final (1000-way) layer; the freezing loop below is left disabled.
bn_model.pop()
#for layer in bn_model.layers: layer.trainable=False

In [None]:
# New 2-way softmax output layer for cat vs dog.
bn_model.add(Dense(2, activation = "softmax"))

In [None]:
# Compile with the Adam optimizer at its default settings.
bn_model.compile(Adam(), 'categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the batchnorm dense model on the pre-computed conv features for 8 epochs.
bn_model.fit(trn_features, trn_labels, nb_epoch=8, validation_data=(val_features, val_labels))

In [None]:
# Save the batchnorm model's weights.
bn_model.save_weights(model_path+'bn.h5')

In [None]:
# Restore the weights saved in the previous cell.
bn_model.load_weights(model_path+'bn.h5')

### Final Model
## Dropout + BatchNorm + Data Augmentation

In [None]:
# Fresh batchnorm layer list, with its last (1000-way) layer swapped for a 2-way softmax.
bn_layers = get_bn_layers(0.6)
bn_layers.pop()
bn_layers.append(Dense(2,activation='softmax'))

In [None]:
# Start from the conv layers (the list bound to `conv_layer` where the model is split) and
# freeze them...
final_model = Sequential(conv_layer)
for layer in final_model.layers : layer.trainable = False
# ...then append the batchnorm dense layers from `bn_layers`, which stay trainable.
for layer in bn_layers : final_model.add(layer)

In [None]:
# Copy the trained weights from each bn_model layer into the matching layer in `bn_layers`
# (the layers that were just added to final_model).
for l1, l2 in zip(bn_model.layers, bn_layers) : l2.set_weights(l1.get_weights())

In [None]:
# Compile the final model with Adam at its default settings.
final_model.compile(optimizer=Adam(), 
                    loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# One epoch of training from the current `batches` generator.
final_model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=1, 
                        validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
# Checkpoint after the first epoch.
final_model.save_weights(model_path + 'final1.h5')

In [None]:
# Four more epochs at the same learning rate.
final_model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=4, 
                        validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
# Checkpoint after the second training run.
final_model.save_weights(model_path + 'final2.h5')

In [None]:
# Change the learning rate by writing into the optimizer's existing backend variable, the
# same way the earlier fine-tuning cells do. Rebinding `optimizer.lr` to a plain float
# would only replace the Python attribute, not the variable the compiled model uses.
K.set_value(final_model.optimizer.lr, 0.001)

In [None]:
# Four more epochs after the learning-rate change.
final_model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=4, 
                        validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
# Checkpoint the model that was just trained: final_model, matching the 'final1' and
# 'final2' checkpoints (bn_model has not changed since it was saved as 'bn.h5').
final_model.save_weights(model_path + 'final3.h5')