In [5]:
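'''Trains a simple deep NN on the MNIST dataset.

This notebook repeats that training over several ReduceLROnPlateau
configurations, activations and optimizers, logging every run to CSV.

NOTE(review): the figures below look inherited from a single-run example
and may not describe the sweep recorded in this notebook -- TODO confirm:
Gets to 98.40% test accuracy after 20 epochs
(there is *a lot* of margin for parameter tuning).
2 seconds per epoch on a K520 GPU.
'''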
import keras
import keras.backend as K
from keras.datasets import mnist
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout, Activation
from keras.optimizers import RMSprop
from keras.callbacks import CSVLogger, ReduceLROnPlateau
import os
from keras.layers.noise import AlphaDropout
import keras.activations

import numpy as np
from scipy import stats
import pandas as pd
import time

In [6]:
# --- Sweep settings -------------------------------------------------------
batch_size = 64     # samples per gradient update
num_classes = 10    # width of the softmax output / one-hot label vectors
epochs = 200        # epochs per individual run
units = 64          # width of each hidden Dense layer
experiments = 5     # repeated runs per (config, activation, optimizer) combination
start = 0           # offset added to the run index when naming models

activations = ['selu', 'sigmoid']
optimizers = ['Adamax', 'sgd']

# ReduceLROnPlateau settings: only the reduction factor varies between
# configurations; patience and cooldown are held fixed.
configs = [
    {'factor': lr_factor, 'patience': 10, 'cooldown': 5}
    for lr_factor in (0.2, 0.35, 0.5)
]

In [7]:
# Load MNIST, which arrives already split into train and test sets.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Flatten every image into a 784-element row, cast to float32 and divide by 255.
x_train = x_train.reshape(60000, 784).astype('float32') / 255
x_test = x_test.reshape(10000, 784).astype('float32') / 255

# Report how many samples each split holds.
for split, label in ((x_train, 'train samples'), (x_test, 'test samples')):
    print(split.shape[0], label)

# Turn the integer class labels into one-hot (binary class matrix) form.
y_train, y_test = (
    keras.utils.to_categorical(class_ids, num_classes)
    for class_ids in (y_train, y_test)
)

60000 train samples
10000 test samples


In [8]:
def _format_remaining(seconds):
    """Render a whole number of seconds as 'D days H hours MM minutes SS seconds'."""
    minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    days, hours = divmod(hours, 24)
    return '%d days %d hours %02d minutes %02d seconds' % (days, hours, minutes, secs)


# Factories rather than ready-made instances: only the activation/optimizer
# actually selected for a run is constructed, and every use gets a fresh
# object (so a parametrised activation such as PReLU is not shared between
# the two hidden layers).
activation_factories = {
    'sigmoid': lambda: Activation(keras.activations.sigmoid),
    'tanh': lambda: Activation(keras.activations.tanh),
    'relu': lambda: Activation(keras.activations.relu),
    'linear': lambda: Activation(keras.activations.linear),
    'elu': lambda: Activation(keras.activations.elu),
    'softplus': lambda: Activation(keras.activations.softplus),
    'softsign': lambda: Activation(keras.activations.softsign),
    'hard_sigmoid': lambda: Activation(keras.activations.hard_sigmoid),
    'LeakyReLU': lambda: keras.layers.advanced_activations.LeakyReLU(),
    'PReLU': lambda: keras.layers.advanced_activations.PReLU(),
    'selu': lambda: Activation(keras.activations.selu),
    # theta=0.7 as proposed in the original paper
    'ThresholdedReLU': lambda: keras.layers.advanced_activations.ThresholdedReLU(theta=0.7),
}

optimizer_factories = {
    'rmsp': lambda: keras.optimizers.rmsprop(lr=0.001),
    'adam': lambda: keras.optimizers.Adam(),
    'sgd': lambda: keras.optimizers.SGD(),
    'Adagrad': lambda: keras.optimizers.Adagrad(),
    'Adadelta': lambda: keras.optimizers.Adadelta(),
    'Adamax': lambda: keras.optimizers.Adamax(),
    'Nadam': lambda: keras.optimizers.Nadam(),
}

# Fail fast on a misspelled name instead of hours into the sweep.
unknown_activations = [name for name in activations if name not in activation_factories]
unknown_optimizers = [name for name in optimizers if name not in optimizer_factories]
if unknown_activations or unknown_optimizers:
    raise KeyError('Unknown activations %s / optimizers %s'
                   % (unknown_activations, unknown_optimizers))

# CSVLogger writes into this directory; make sure it exists before training.
log_dir = './training_logs'
os.makedirs(log_dir, exist_ok=True)

# NOTE(review): no random seed is set in this cell, so repeated experiments
# are not reproducible run-to-run.
start_time = time.time()
counter = 0
total_items = len(activations) * len(optimizers) * experiments * len(configs)

for cfg in configs:
    for act in activations:
        for opt in optimizers:
            print("Training for activation " + act + " with optimizer " + opt + ' with config ' + str(cfg))
            for i in range(experiments):
                # Drop graph state left behind by the previous model so that
                # memory use does not grow across the whole sweep.
                if K.backend() == 'tensorflow':
                    K.clear_session()

                model_name = 'normd_lr_' + str(cfg['factor']) + '_' + act + "_" + opt + '_' + str(i + start)

                # 784 -> units -> units -> num_classes MLP with dropout after
                # each hidden activation.
                inputs = Input(shape=(784,))
                x = Dense(units, name='dense_1')(inputs)
                x = activation_factories[act]()(x)
                x = Dropout(0.2)(x)
                x = Dense(units, name='dense_2')(x)
                x = activation_factories[act]()(x)
                x = Dropout(0.2)(x)
                predictions = Dense(num_classes, activation='softmax', name='dense_output')(x)
                model = Model(inputs=inputs, outputs=predictions)

                model.compile(loss='categorical_crossentropy',
                              optimizer=optimizer_factories[opt](),
                              metrics=['accuracy'])
                print('-'*30)
                print('Experiment', i)

                csv_logger = CSVLogger('%s/%s_%d.csv' % (log_dir, model_name, units), append=False)
                # NOTE(review): newer Keras releases appear to rename `epsilon`
                # to `min_delta` and the metric key 'val_acc' to 'val_accuracy'
                # -- confirm against the installed version before upgrading.
                reduce_lr = ReduceLROnPlateau(monitor='val_acc', factor=cfg['factor'],
                                              patience=cfg['patience'], verbose=1, mode='auto',
                                              epsilon=0.0001, cooldown=cfg['cooldown'], min_lr=0)
                # NOTE(review): the test split doubles as validation data, so
                # the learning-rate schedule is driven by the same data that
                # the reported test accuracy is computed on.
                history = model.fit(x_train, y_train,
                                    batch_size=batch_size,
                                    epochs=epochs,
                                    verbose=0,
                                    validation_data=(x_test, y_test),
                                    callbacks=[csv_logger, reduce_lr])

                score = model.evaluate(x_test, y_test, verbose=0)
                print('Test accuracy:', score[1])

                # Estimate remaining wall-clock time from the mean duration of
                # the runs completed so far.
                counter += 1
                time_diff = time.time() - start_time
                total_time = round((total_items / counter) * time_diff)
                rem_time = round(total_time - time_diff)
                print('Remaining time: ' + _format_remaining(rem_time))

Training for activation selu with optimizer Adamax with config {'factor': 0.2, 'patience': 10, 'cooldown': 5}
------------------------------
Experiment 0

Epoch 00075: ReduceLROnPlateau reducing learning rate to 0.0004000000189989805.

Epoch 00095: ReduceLROnPlateau reducing learning rate to 8.000000379979611e-05.

Epoch 00110: ReduceLROnPlateau reducing learning rate to 1.6000001050997525e-05.

Epoch 00125: ReduceLROnPlateau reducing learning rate to 3.2000003557186575e-06.

Epoch 00140: ReduceLROnPlateau reducing learning rate to 6.400000529538374e-07.

Epoch 00155: ReduceLROnPlateau reducing learning rate to 1.280000105907675e-07.

Epoch 00170: ReduceLROnPlateau reducing learning rate to 2.5600002118153498e-08.

Epoch 00185: ReduceLROnPlateau reducing learning rate to 5.1200004236306995e-09.

Epoch 00200: ReduceLROnPlateau reducing learning rate to 1.02400008472614e-09.
Test accuracy: 0.9807
Remaining time: 0 days 8 hours 24 minutes 37 seconds
------------------------------
Experime


Epoch 00113: ReduceLROnPlateau reducing learning rate to 1.6000001050997525e-05.

Epoch 00128: ReduceLROnPlateau reducing learning rate to 3.2000003557186575e-06.

Epoch 00143: ReduceLROnPlateau reducing learning rate to 6.400000529538374e-07.

Epoch 00158: ReduceLROnPlateau reducing learning rate to 1.280000105907675e-07.

Epoch 00173: ReduceLROnPlateau reducing learning rate to 2.5600002118153498e-08.

Epoch 00188: ReduceLROnPlateau reducing learning rate to 5.1200004236306995e-09.
Test accuracy: 0.9766
Remaining time: 0 days 6 hours 55 minutes 05 seconds
------------------------------
Experiment 3

Epoch 00086: ReduceLROnPlateau reducing learning rate to 0.0004000000189989805.

Epoch 00112: ReduceLROnPlateau reducing learning rate to 8.000000379979611e-05.

Epoch 00127: ReduceLROnPlateau reducing learning rate to 1.6000001050997525e-05.

Epoch 00142: ReduceLROnPlateau reducing learning rate to 3.2000003557186575e-06.

Epoch 00157: ReduceLROnPlateau reducing learning rate to 6.40000

------------------------------
Experiment 4
Test accuracy: 0.9742
Remaining time: 0 days 5 hours 41 minutes 48 seconds
Training for activation sigmoid with optimizer Adamax with config {'factor': 0.35, 'patience': 10, 'cooldown': 5}
------------------------------
Experiment 0

Epoch 00080: ReduceLROnPlateau reducing learning rate to 0.0007000000332482159.

Epoch 00096: ReduceLROnPlateau reducing learning rate to 0.00024500001163687554.

Epoch 00111: ReduceLROnPlateau reducing learning rate to 8.575000101700424e-05.

Epoch 00126: ReduceLROnPlateau reducing learning rate to 3.001249933731742e-05.

Epoch 00141: ReduceLROnPlateau reducing learning rate to 1.0504374768061097e-05.

Epoch 00156: ReduceLROnPlateau reducing learning rate to 3.676531105156755e-06.

Epoch 00171: ReduceLROnPlateau reducing learning rate to 1.2867859027210216e-06.

Epoch 00186: ReduceLROnPlateau reducing learning rate to 4.503750460571609e-07.
Test accuracy: 0.9774
Remaining time: 0 days 5 hours 36 minutes 54 secon

------------------------------
Experiment 4

Epoch 00076: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.

Epoch 00102: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00131: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 00146: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 00161: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 00176: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.

Epoch 00191: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
Test accuracy: 0.9811
Remaining time: 0 days 3 hours 12 minutes 57 seconds
Training for activation selu with optimizer sgd with config {'factor': 0.5, 'patience': 10, 'cooldown': 5}
------------------------------
Experiment 0

Epoch 00161: ReduceLROnPlateau reducing learning rate to 0.004999999888241291.

Epoch 00176: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.

E