In [1]:
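'''Trains small CNNs on the MNIST dataset, one model per activation function
and optimizer combination.

NOTE(review): the figures below look stale -- the cells that follow train a
convolutional network for 40 epochs, not a plain dense NN for 20 epochs;
confirm or update them.
Gets to 98.40% test accuracy after 20 epochs
(there is *a lot* of margin for parameter tuning).
2 seconds per epoch on a K520 GPU.
'''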
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.callbacks import CSVLogger, TensorBoard, ReduceLROnPlateau, EarlyStopping
import os
from keras.layers.noise import AlphaDropout
import keras.activations
import time

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# --- Experiment configuration ------------------------------------------------
batch_size = 64          # samples per gradient update, passed to model.fit
num_classes = 10         # number of one-hot label columns / softmax outputs
epochs = 40              # training epochs per run
units = 64               # only used as a suffix in the CSV log file names
experiments = 1          # repetitions of every (activation, optimizer) pair
start = 2                # offset added to the experiment index in model names
add_final_dense = False  # True inserts a Dense(128) block before the softmax layer

activations = ['sigmoid', 'tanh', 'relu', 'linear', 'elu', 'softplus', 'softsign', 'hard_sigmoid', 'LeakyReLU', 'ThresholdedReLU']
# activations = ['ThresholdedReLU']
# PReLU is not used, since it does not currently support variable inputs
optimizers = ['rmsp', 'adam', 'sgd', 'Adagrad', 'Adadelta', 'Adamax', 'Nadam']
# optimizers = ['Nadam']

# Directory intended for saved models (created by the training cells).
save_dir = os.path.join(os.getcwd(), 'saved_models')

# --- Progress bookkeeping for the remaining-time estimate ---------------------
start_time = time.time()  # reference point for elapsed time
counter = 0               # finished training runs so far

# Number of training runs in the whole sweep: every listed activation plus the
# separately handled SELU (+1), for every optimizer, repeated `experiments`
# times. `counter` is incremented once per finished run, so the repetition
# factor has to be part of the total or the estimate is off by that factor.
total_items = (len(activations) + 1) * len(optimizers) * experiments

In [3]:
# Height and width of a single input image.
img_rows, img_cols = 28, 28

# Load the train/test split shipped with the dataset loader.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Per-sample shape with the single channel axis placed where the active
# backend expects it.
if K.image_data_format() == 'channels_first':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)

# Add the channel axis to both splits, keeping the sample count untouched.
x_train = x_train.reshape((x_train.shape[0],) + input_shape)
x_test = x_test.reshape((x_test.shape[0],) + input_shape)

# Cast to float32 and scale the pixel values by 1/255.
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# One-hot encode the integer class labels.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples


## Train one model per activation function and optimizer

In [4]:

# Sweep over every (activation, optimizer) pair and train `experiments` CNNs
# for each of them.
#
# Reads from earlier cells: activations, optimizers, experiments, start,
# epochs, units, batch_size, num_classes, add_final_dense, input_shape,
# save_dir, x_train/y_train, x_test/y_test, start_time, total_items, counter.
# Side effects: writes one CSV training log per run under ./logs, creates
# save_dir, increments `counter`, prints test metrics and a time estimate.

# Zero-argument factories instead of layer instances: each call returns a new
# layer object, so no layer is shared between positions inside one model or
# between the models of different runs.
activation_builders = {
    'sigmoid': lambda: Activation(keras.activations.sigmoid),
    'tanh': lambda: Activation(keras.activations.tanh),
    'relu': lambda: Activation(keras.activations.relu),
    'linear': lambda: Activation(keras.activations.linear),
    'elu': lambda: Activation(keras.activations.elu),
    'softplus': lambda: Activation(keras.activations.softplus),
    'softsign': lambda: Activation(keras.activations.softsign),
    'hard_sigmoid': lambda: Activation(keras.activations.hard_sigmoid),
    'LeakyReLU': lambda: keras.layers.advanced_activations.LeakyReLU(),
    'PReLU': lambda: keras.layers.advanced_activations.PReLU(),
    'ThresholdedReLU': lambda: keras.layers.advanced_activations.ThresholdedReLU(theta=0.7),
}

# Optimizer classes, instantiated with their defaults once per training run.
optimizer_builders = {
    'rmsp': keras.optimizers.RMSprop,
    'adam': keras.optimizers.Adam,
    'sgd': keras.optimizers.SGD,
    'Adagrad': keras.optimizers.Adagrad,
    'Adadelta': keras.optimizers.Adadelta,
    'Adamax': keras.optimizers.Adamax,
    'Nadam': keras.optimizers.Nadam,
}

# Fail before any training starts instead of silently reusing the layer or
# optimizer picked for a previous combination when a name is misspelled.
unknown_activations = [name for name in activations if name not in activation_builders]
unknown_optimizers = [name for name in optimizers if name not in optimizer_builders]
if unknown_activations or unknown_optimizers:
    raise ValueError('Unknown activations %r / optimizers %r'
                     % (unknown_activations, unknown_optimizers))

# The CSV logs are written into ./logs; make sure that directory exists
# before the first run tries to open a file in it.
if not os.path.isdir('./logs'):
    os.makedirs('./logs')

for act in activations:
    new_activation = activation_builders[act]
    for opt in optimizers:
        print("\nTraining for activation " + act + " with optimizer " + opt)

        for i in range(experiments):
            if add_final_dense:
                model_name = 'mnist_cnn_' + act + "_" + opt + '_' + str(epochs) + '_' +str(i + start) + '_fd'
            else:
                model_name = 'mnist_cnn_' + act + "_" + opt + '_' + str(epochs) + '_' + str(i + start)

            # Conv -> act -> Conv -> act -> pool -> dropout -> (dense block) -> softmax
            model = Sequential()
            model.add(Conv2D(32, kernel_size=(3, 3),
                             input_shape=input_shape))
            model.add(new_activation())
            model.add(Conv2D(64, kernel_size=(3, 3)))
            model.add(new_activation())
            model.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2), padding='same'))
            model.add(Dropout(0.2))
            model.add(Flatten())
            if add_final_dense:
                model.add(Dense(128))
                model.add(new_activation())
                model.add(Dropout(0.2))
            model.add(Dense(num_classes, activation='softmax'))

            # A fresh optimizer per run: ReduceLROnPlateau lowers the learning
            # rate on the optimizer object, so sharing one optimizer between
            # repetitions would start later runs at an already reduced rate.
            optimizer = optimizer_builders[opt]()

            model.compile(loss='categorical_crossentropy',
                              optimizer=optimizer,
                              metrics=['accuracy'])

            print('-'*30)
            print('Experiment', i)

            csv_logger = CSVLogger('./logs/%s_%d.csv' % (model_name, units), append=False, separator=';')
            reduce_lr = ReduceLROnPlateau(monitor='val_acc', factor=0.2, patience=3, verbose=1, mode='auto', epsilon=0.0001, cooldown=2, min_lr=0)
#             tb = TensorBoard(log_dir='./tb_logs/' + model_name + '_' + str(units), histogram_freq=0, batch_size=32, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)
            history = model.fit(x_train, y_train,
                                batch_size=batch_size,
                                epochs=epochs,
                                verbose=1,
                                validation_data=(x_test, y_test), callbacks=[csv_logger, reduce_lr])

            if not os.path.isdir(save_dir):
                os.makedirs(save_dir)
            model_path = os.path.join(save_dir, model_name + ".h5")
#             model.save(model_path)

            score = model.evaluate(x_test, y_test, verbose=0)
            print('Test loss:', score[0])
            print('Test accuracy:', score[1])

            # Remaining-time estimate: extrapolate the average time per
            # finished run to all planned runs and subtract the elapsed time.
            t = time.time()
            time_diff = t - start_time
            counter +=1
            rem_items = total_items - counter  # runs still to go (not used in the estimate)
            total_time = round((total_items / counter) * time_diff)
            rem_time = round(total_time - time_diff)
            m, s = divmod(rem_time, 60)
            h, m = divmod(m, 60)
            d, h = divmod(h, 24)
            print('Remaining time: %d days %d hours %02d minutes %02d seconds' % (d, h, m, s))
            


Training for activation sigmoid with optimizer rmsp
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40

Epoch 00021: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40

Epoch 00026: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40

Epoch 00031: ReduceLROnPlateau reducing learning rate to 8.000000525498762e-06.
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40

Epoch 00036: ReduceLROnPlateau reducing learning rate to 1.6000001778593287e-06.
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.04410293909445754
Test accuracy: 0.9858
Re

Epoch 15/40
Epoch 16/40
Epoch 17/40

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40

Epoch 00022: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40

Epoch 00027: ReduceLROnPlateau reducing learning rate to 8.000000525498762e-06.
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40

Epoch 00035: ReduceLROnPlateau reducing learning rate to 1.6000001778593287e-06.
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40

Epoch 00040: ReduceLROnPlateau reducing learning rate to 3.200000264769187e-07.
Test loss: 0.04483505396270193
Test accuracy: 0.986
Remaining time: 6 days 16 hours 26 minutes 49 seconds

Training for activation sigmoid with optimizer sgd
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 

Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.20203989427685737
Test accuracy: 0.9434
Remaining time: 6 days 11 hours 20 minutes 09 seconds

Training for activation sigmoid with optimizer Adagrad
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0019999999552965165.
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0003999999724328518.
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40

Epoch 00015: ReduceLROnPlateau reducing learning rate to 7.999999215826393e-05.
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40

Epoch 00020: ReduceLROnPlateau reducing learning rate to 1.599999814061448e-05.
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25

Test loss: 14.28869146270752
Test accuracy: 0.1135
Remaining time: 6 days 7 hours 55 minutes 09 seconds

Training for activation sigmoid with optimizer Adadelta
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.2.
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40

Epoch 00026: ReduceLROnPlateau reducing learning rate to 0.04000000059604645.
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40

Epoch 00031: ReduceLROnPlateau reducing learning rate to 0.007999999821186066.
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40

Epoch 00036: ReduceLROnPlateau reducing learning rate to 0.0015999998897314072.
Epoch 37/40
Epoch 3

Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40

Epoch 00027: ReduceLROnPlateau reducing learning rate to 0.0004000000189989805.
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40

Epoch 00033: ReduceLROnPlateau reducing learning rate to 8.000000379979611e-05.
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40

Epoch 00039: ReduceLROnPlateau reducing learning rate to 1.6000001050997525e-05.
Epoch 40/40
Test loss: 0.042371149355568925
Test accuracy: 0.9874
Remaining time: 6 days 4 hours 21 minutes 30 seconds

Training for activation sigmoid with optimizer Nadam
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 

Epoch 29/40
Epoch 30/40
Epoch 31/40

Epoch 00031: ReduceLROnPlateau reducing learning rate to 1.6000001050997525e-05.
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.05057651520556683
Test accuracy: 0.9864
Remaining time: 6 days 2 hours 39 minutes 48 seconds

Training for activation tanh with optimizer rmsp
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40

Epoch 00026: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/

Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40

Epoch 00012: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40

Epoch 00017: ReduceLROnPlateau reducing learning rate to 8.000000525498762e-06.
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40

Epoch 00022: ReduceLROnPlateau reducing learning rate to 1.6000001778593287e-06.
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40

Epoch 00027: ReduceLROnPlateau reducing learning rate to 3.200000264769187e-07.
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40

Epoch 00032: ReduceLROnPlateau reducing learning rate to 6.400000529538374e-08.
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40

Epoch 00037: ReduceLROnPl

Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40

Epoch 00034: ReduceLROnPlateau reducing learning rate to 0.0019999999552965165.
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.04701956300488673
Test accuracy: 0.9843
Remaining time: 6 days 19 hours 04 minutes 58 seconds

Training for activation tanh with optimizer Adagrad
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.0019999999552965165.
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch

Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40

Epoch 00033: ReduceLROnPlateau reducing learning rate to 7.999999215826393e-05.
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40

Epoch 00038: ReduceLROnPlateau reducing learning rate to 1.599999814061448e-05.
Epoch 39/40
Epoch 40/40
Test loss: 0.045940684078325286
Test accuracy: 0.986
Remaining time: 6 days 14 hours 17 minutes 15 seconds

Training for activation tanh with optimizer Adadelta
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.2.
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.04000000059604645.
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40

Epoch 00021: ReduceLROnPlateau reducing learning rate to 0.007999

Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0004000000189989805.
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40

Epoch 00016: ReduceLROnPlateau reducing learning rate to 8.000000379979611e-05.
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40

Epoch 00021: ReduceLROnPlateau reducing learning rate to 1.6000001050997525e-05.
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40

Epoch 00026: ReduceLROnPlateau reducing learning rate to 3.2000003557186575e-06.
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40

Epoch 00031: ReduceLROnPlateau reducing learning rate to 6.400000529538374e-07.
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40

Epoch 00036: ReduceLROnPlateau reducing learning rate to 1.280000105907675e-07.
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epo


Epoch 00013: ReduceLROnPlateau reducing learning rate to 8.000000379979611e-05.
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40

Epoch 00018: ReduceLROnPlateau reducing learning rate to 1.6000001050997525e-05.
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40

Epoch 00023: ReduceLROnPlateau reducing learning rate to 3.2000003557186575e-06.
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40

Epoch 00028: ReduceLROnPlateau reducing learning rate to 6.400000529538374e-07.
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40

Epoch 00033: ReduceLROnPlateau reducing learning rate to 1.280000105907675e-07.
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40

Epoch 00038: ReduceLROnPlateau reducing learning rate to 2.5600002118153498e-08.
Epoch 39/40
Epoch 40/40
Test loss: 0.07741822412975694
Test accuracy: 0.9837
Remaining time: 6 days 3 hours 07 minutes 32 seconds

Training for activation relu with optimizer rmsp
------------------------------


Epoch 26/40
Epoch 27/40

Epoch 00027: ReduceLROnPlateau reducing learning rate to 1.6000001778593287e-06.
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40

Epoch 00032: ReduceLROnPlateau reducing learning rate to 3.200000264769187e-07.
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40

Epoch 00037: ReduceLROnPlateau reducing learning rate to 6.400000529538374e-08.
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.02929145397132161
Test accuracy: 0.9915
Remaining time: 5 days 23 hours 24 minutes 39 seconds

Training for activation relu with optimizer adam
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40

Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40


Epoch 39/40

Epoch 00039: ReduceLROnPlateau reducing learning rate to 3.200000264769187e-07.
Epoch 40/40
Test loss: 0.050381223035747505
Test accuracy: 0.9902
Remaining time: 5 days 20 hours 06 minutes 01 seconds

Training for activation relu with optimizer sgd
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40

Epoch 00025: ReduceLROnPlateau reducing learning rate to 0.0019999999552965165.
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40

Epoch 00035: ReduceLROnPlateau reducing learning rate to 0.0003999999724328518.
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40

Epoch 0004

Epoch 11/40
Epoch 12/40
Epoch 13/40

Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.0003999999724328518.
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40

Epoch 00018: ReduceLROnPlateau reducing learning rate to 7.999999215826393e-05.
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40

Epoch 00023: ReduceLROnPlateau reducing learning rate to 1.599999814061448e-05.
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40

Epoch 00028: ReduceLROnPlateau reducing learning rate to 3.199999628122896e-06.
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40

Epoch 00033: ReduceLROnPlateau reducing learning rate to 6.399999165296323e-07.
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40

Epoch 00038: ReduceLROnPlateau reducing learning rate to 1.2799998785339995e-07.
Epoch 39/40
Epoch 40/40
Test loss: 0.033434533236257266
Test accuracy: 0.9887
Remaining time: 5 days 13 hours 36 minutes 10 seconds

Training for activation relu with optimizer 

Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40

Epoch 00027: ReduceLROnPlateau reducing learning rate to 0.007999999821186066.
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40

Epoch 00032: ReduceLROnPlateau reducing learning rate to 0.0015999998897314072.
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40

Epoch 00037: ReduceLROnPlateau reducing learning rate to 0.0003199999686330557.
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.036246989504407065
Test accuracy: 0.991
Remaining time: 5 days 10 hours 51 minutes 35 seconds

Training for activation relu with optimizer Adamax
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0004000000189989805.
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40

Epoch 00018:

Epoch 36/40
Epoch 37/40
Epoch 38/40

Epoch 00038: ReduceLROnPlateau reducing learning rate to 1.280000105907675e-07.
Epoch 39/40
Epoch 40/40
Test loss: 0.036387057847474716
Test accuracy: 0.9893
Remaining time: 5 days 8 hours 08 minutes 53 seconds

Training for activation relu with optimizer Nadam
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0004000000189989805.
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40

Epoch 00019: ReduceLROnPlateau reducing learning rate to 8.000000379979611e-05.
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40

Epoch 00024: ReduceLROnPlateau reducing learning rate to 1.6000001050997525e-05.
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40

Epoch 00029: ReduceLROn

Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 13/40
Epoch 14/40
Epoch 15/40

KeyboardInterrupt: 

# SeLU
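This one has special requirements: the layers use the `lecun_normal` kernel initializer and `AlphaDropout` instead of regular `Dropout`, so the model is built in its own cell.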

In [5]:
# Train the SELU variant of the CNN once per optimizer and repetition.
#
# Reads from earlier cells: optimizers, experiments, start, epochs, units,
# batch_size, num_classes, add_final_dense, input_shape, save_dir,
# x_train/y_train, x_test/y_test, start_time, total_items, counter.
# Side effects: writes one CSV training log per run under ./logs, creates
# save_dir, increments `counter`, prints test metrics and a time estimate.

# start = 2

# Optimizer classes, instantiated with their defaults once per training run.
selu_optimizer_builders = {
    'rmsp': keras.optimizers.RMSprop,
    'adam': keras.optimizers.Adam,
    'sgd': keras.optimizers.SGD,
    'Adagrad': keras.optimizers.Adagrad,
    'Adadelta': keras.optimizers.Adadelta,
    'Adamax': keras.optimizers.Adamax,
    'Nadam': keras.optimizers.Nadam,
}

# Fail before any training starts: with an if/elif chain and no else branch a
# misspelled name would silently reuse whatever `optimizer` was bound last.
unknown_optimizers = [name for name in optimizers if name not in selu_optimizer_builders]
if unknown_optimizers:
    raise ValueError('Unknown optimizers %r' % (unknown_optimizers,))

# The CSV logs are written into ./logs; make sure that directory exists
# before the first run tries to open a file in it.
if not os.path.isdir('./logs'):
    os.makedirs('./logs')

for opt in optimizers:
    print("Training for activation SeLU with optimizer " + opt)
    for i in range(experiments):
        if add_final_dense:
            model_name = 'mnist_cnn_selu_' + opt + '_' + str(epochs) + '_' +str(i + start) + '_fd'
        else:
            model_name = 'mnist_cnn_selu_' + opt + '_' + str(epochs) + '_' + str(i + start)

        # Same layout as the generic sweep, but every trainable hidden layer
        # uses selu + lecun_normal and dropout is AlphaDropout.
        model = Sequential()
        model.add(Conv2D(32, kernel_size=(3, 3),
                         activation='selu', kernel_initializer='lecun_normal',
                         input_shape=input_shape))
        model.add(Conv2D(64, (3, 3), activation='selu', kernel_initializer='lecun_normal'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(AlphaDropout(0.2))
        model.add(Flatten())
        if add_final_dense:
            model.add(Dense(128, activation='selu', kernel_initializer='lecun_normal'))
            model.add(AlphaDropout(0.2))
        model.add(Dense(num_classes, activation='softmax'))

        # Fresh optimizer for every run so no state leaks between runs.
        optimizer = selu_optimizer_builders[opt]()

        model.compile(loss='categorical_crossentropy',
                          optimizer=optimizer,
                          metrics=['accuracy'])
        print('-'*30)
        print('Experiment', i+1)

        csv_logger = CSVLogger('./logs/%s_%d.csv' % (model_name, units), append=False, separator=';')
        reduce_lr = ReduceLROnPlateau(monitor='val_acc', factor=0.2, patience=3, verbose=0, mode='auto', epsilon=0.0001, cooldown=2, min_lr=0)
#         tb = TensorBoard(log_dir='./tb_logs/' + model_name + '_' + str(units), histogram_freq=0, batch_size=32, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)
        history = model.fit(x_train, y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            verbose=1,
                            validation_data=(x_test, y_test), callbacks=[csv_logger, reduce_lr])

        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
        model_path = os.path.join(save_dir, model_name + ".h5")
#         model.save(model_path)
        score = model.evaluate(x_test, y_test, verbose=0)
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])

        # Remaining-time estimate: extrapolate the average time per finished
        # run to all planned runs and subtract the elapsed time.
        t = time.time()
        time_diff = t - start_time
        counter +=1
        rem_items = total_items - counter  # runs still to go (not used in the estimate)
        total_time = round((total_items / counter) * time_diff)
        rem_time = round(total_time - time_diff)
        m, s = divmod(rem_time, 60)
        h, m = divmod(m, 60)
        d, h = divmod(h, 24)
        print('Remaining time: %d days %d hours %02d minutes %02d seconds' % (d, h, m, s))
        

Training for activation SeLU with optimizer rmsp
------------------------------
Experiment 1
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.10260849180980613
Test accuracy: 0.9885
Remaining time: 6 days 18 hours 40 minutes 30 seconds
------------------------------
Experiment 2
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Ep

Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.16756786854205735
Test accuracy: 0.9878
Remaining time: 6 days 11 hours 59 minutes 25 seconds
------------------------------
Experiment 2
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.10624317196074234
Test accuracy: 0.9882
Remaining time: 6 days 15 hours 30 minutes 00 seconds
Training for activation SeLU with optimizer sgd
------------------------------
Experiment 1
Train on 60000 samples, validate on 10000 samples
Epoch 1

Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.04614412600668729
Test accuracy: 0.9877
Remaining time: 6 days 11 hours 24 minutes 47 seconds
------------------------------
Experiment 2
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoc

Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.04924459680216432
Test accuracy: 0.9877
Remaining time: 6 days 21 hours 18 minutes 43 seconds
Training for activation SeLU with optimizer Adadelta
------------------------------
Experiment 1
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 

Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.07825659418930768
Test accuracy: 0.9891
Remaining time: 6 days 11 hours 21 minutes 15 seconds
Training for activation SeLU with optimizer Adamax
------------------------------
Experiment 1
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.07948067439237748
Test accuracy: 0.9877
Remaining time: 6 days 6 hours 52 minutes 46 seconds
------------------------------
Experiment 2
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/

Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.07019284479929001
Test accuracy: 0.9895
Remaining time: 6 days 2 hours 46 minutes 38 seconds
Training for activation SeLU with optimizer Nadam
------------------------------
Experiment 1
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epo