# MNIST CNN (without BN vs with BN)

In [1]:
from theano.sandbox import cuda
cuda.use('gpu0')

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 950 (CNMeM is enabled with initial size: 90.0% of memory, cuDNN 5110)
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29



In [2]:
#%matplotlib inline
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

Using Theano backend.


## Setup

In [3]:
batch_size=64

In [4]:
from keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()
(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

((60000, 28, 28), (60000,), (10000, 28, 28), (10000,))

In [5]:
X_test = np.expand_dims(X_test,1)
X_train = np.expand_dims(X_train,1)

In [6]:
X_train.shape

(60000, 1, 28, 28)

In [7]:
y_train[:5]

array([5, 0, 4, 1, 9], dtype=uint8)

In [8]:
y_train = onehot(y_train)
y_test = onehot(y_test)

In [9]:
y_train[:5]

array([[ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

In [10]:
mean_px = X_train.mean().astype(np.float32)
std_px = X_train.std().astype(np.float32)

In [11]:
def norm_input(x): return (x-mean_px)/std_px

## Conv model

In [45]:
def get_conv_model(lr=0.001):
    model = Sequential([
        Lambda(norm_input, input_shape=(1,28,28)),
        ZeroPadding2D((1, 1)),
        Convolution2D(32, 3, 3, activation='relu'),
        MaxPooling2D(),
        ZeroPadding2D((1, 1)),
        Convolution2D(64, 3, 3, activation='relu'),  
        MaxPooling2D(),
        Flatten(),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(10, activation='softmax')
        ])
    model.compile(Adam(lr=lr), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

### learning rate 0.001 -> 0.0001

In [88]:
conv_model = get_conv_model(0.001)

  .format(self.name, input_shape))


In [89]:
conv_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lambda_12 (Lambda)               (None, 1, 28, 28)     0           lambda_input_12[0][0]            
____________________________________________________________________________________________________
zeropadding2d_23 (ZeroPadding2D) (None, 1, 30, 30)     0           lambda_12[0][0]                  
____________________________________________________________________________________________________
convolution2d_23 (Convolution2D) (None, 32, 28, 28)    320         zeropadding2d_23[0][0]           
____________________________________________________________________________________________________
maxpooling2d_23 (MaxPooling2D)   (None, 32, 14, 14)    0           convolution2d_23[0][0]           
___________________________________________________________________________________________

In [90]:
gen = image.ImageDataGenerator()
batches = gen.flow(X_train, y_train, batch_size=64) # keras.preprocessing.image.NumpyArrayIterator
test_batches = gen.flow(X_test, y_test, batch_size=64)

In [91]:
conv_model.fit_generator(batches, batches.n, nb_epoch=1, 
                    validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f4f2ae6afd0>

#### Why is loss much higher than val_loss ?

https://faroit.github.io/keras-docs/1.2.2/getting-started/faq/#why-is-the-training-loss-much-higher-than-the-testing-loss

> Besides, the training loss is the average of the losses over each batch of training data. Because your model is changing over time, the loss over the first batches of an epoch is generally higher than over the last batches. On the other hand, the testing loss for an epoch is computed using the model as it is at the end of the epoch, resulting in a lower loss.

Also, in evaluation, we don't use dropout.

In [92]:
conv_model.optimizer.lr = 0.0001

In [93]:
conv_model.fit_generator(batches, batches.n, nb_epoch=8, 
                    validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f4f2a58f990>

### learning rate 0.01 -> 0.001

In [94]:
conv_model = get_conv_model(0.01)

  .format(self.name, input_shape))


In [95]:
gen = image.ImageDataGenerator()
batches = gen.flow(X_train, y_train, batch_size=64) # keras.preprocessing.image.NumpyArrayIterator
test_batches = gen.flow(X_test, y_test, batch_size=64)

In [96]:
conv_model.fit_generator(batches, batches.n, nb_epoch=1, 
                    validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f4f294d0290>

In [97]:
conv_model.optimizer.lr = 0.001

In [98]:
conv_model.fit_generator(batches, batches.n, nb_epoch=8, 
                    validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f4f28bf5350>

## Batch normalization

With batch normalization, we can use higher learning rate.

https://deepage.net/deep_learning/2016/10/26/batch_normalization.html

In [99]:
def get_bn_model(lr=0.001):
    model = Sequential([
        Lambda(norm_input, input_shape=(1,28,28)),
        ZeroPadding2D((1, 1)),
        Convolution2D(32, 3, 3, activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D(),
        ZeroPadding2D((1, 1)),
        Convolution2D(64, 3, 3, activation='relu'),  
        BatchNormalization(axis=1),
        MaxPooling2D(),
        Flatten(),
        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(10, activation='softmax')
        ])
    model.compile(Adam(lr=lr), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

### learning rate 0.001 -> 0.0001

In [100]:
bn_model = get_bn_model(0.001)

  .format(self.name, input_shape))


In [101]:
bn_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lambda_14 (Lambda)               (None, 1, 28, 28)     0           lambda_input_14[0][0]            
____________________________________________________________________________________________________
zeropadding2d_27 (ZeroPadding2D) (None, 1, 30, 30)     0           lambda_14[0][0]                  
____________________________________________________________________________________________________
convolution2d_27 (Convolution2D) (None, 32, 28, 28)    320         zeropadding2d_27[0][0]           
____________________________________________________________________________________________________
batchnormalization_20 (BatchNorm (None, 32, 28, 28)    128         convolution2d_27[0][0]           
___________________________________________________________________________________________

In [102]:
gen = image.ImageDataGenerator()
batches = gen.flow(X_train, y_train, batch_size=64) # keras.preprocessing.image.NumpyArrayIterator
test_batches = gen.flow(X_test, y_test, batch_size=64)

In [103]:
bn_model.fit_generator(batches, batches.n, nb_epoch=1, 
                    validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f4f273dd490>

In [104]:
bn_model.optimizer.lr = 0.0001

In [105]:
bn_model.fit_generator(batches, batches.n, nb_epoch=8, 
                    validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f4f267ddc90>

### learning rate 0.01 -> 0.001

In [106]:
bn_model = get_bn_model(0.01)

  .format(self.name, input_shape))


In [107]:
gen = image.ImageDataGenerator()
batches = gen.flow(X_train, y_train, batch_size=64) # keras.preprocessing.image.NumpyArrayIterator
test_batches = gen.flow(X_test, y_test, batch_size=64)

In [108]:
bn_model.fit_generator(batches, batches.n, nb_epoch=1, 
                    validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f4f24a47890>

In [109]:
bn_model.optimizer.lr = 0.001

In [110]:
bn_model.fit_generator(batches, batches.n, nb_epoch=8, 
                    validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f4f23e44bd0>

| | loss | acc | val_loss | val_acc |
|---|---|---|---|---|
| lr=0.001 -> 0.0001  | 0.0133 | 0.9957 | 0.0205 | 0.9936 | 
| BN, lr=0.001 -> 0.0001 | 0.0158 | 0.9948 | 0.0369 | 0.9889 |
| lr=0.01 -> 0.001 | 0.2219 | 0.9352 | 0.1436 | 0.9582 |
| BN, lr=0.01 -> 0.001 | 0.0507 | 0.9886 | 0.0567 | 0.9877 |
