In [9]:
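'''Trains small CNNs on the MNIST dataset, one per (activation, optimizer) pair.
NOTE: the figures below appear to predate the experiments in this notebook
(which run for 40 epochs) and should be re-measured:
gets to 98.40% test accuracy after 20 epochs
(there is *a lot* of margin for parameter tuning),
2 seconds per epoch on a K520 GPU.
'''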
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.callbacks import CSVLogger, TensorBoard, ReduceLROnPlateau, EarlyStopping
import os
from keras.layers.noise import AlphaDropout
import keras.activations
import time

In [10]:
# --- Training hyper-parameters -------------------------------------------
batch_size = 64
num_classes = 10
epochs = 40
units = 64               # in the cells shown here, only used as a suffix of the CSV log file names
experiments = 1          # number of repeated runs per (activation, optimizer) pair
start = 0                # offset added to the experiment index in model / log names
add_final_dense = False  # True inserts an extra Dense(128) block before the softmax layer

# --- Search grid -----------------------------------------------------------
# Full list, kept for reference ('sigmoid' and 'elu' are currently left out):
# activations = ['sigmoid', 'tanh', 'relu', 'linear', 'elu', 'softplus', 'softsign', 'hard_sigmoid', 'LeakyReLU', 'ThresholdedReLU']
activations = ['tanh', 'relu', 'linear', 'softplus', 'softsign', 'hard_sigmoid', 'LeakyReLU', 'ThresholdedReLU']
# PReLU is not used, since it does not currently support variable inputs
optimizers = ['rmsp', 'adam', 'sgd', 'Adagrad', 'Adadelta', 'Adamax', 'Nadam']
# optimizers = ['Nadam']

# --- Bookkeeping -----------------------------------------------------------
save_dir = os.path.join(os.getcwd(), 'saved_models')
start_time = time.time()  # reference point for the remaining-time estimate
counter = 0               # number of finished training runs (incremented by the training cells)
# Total number of training runs: the "+ 1" accounts for SeLU, which is trained
# in its own cell. Every (activation, optimizer) pair is repeated `experiments`
# times and `counter` advances once per repetition, so the repetitions must be
# part of the total or the remaining-time estimate is too low.
total_items = (len(activations) + 1) * len(optimizers) * experiments

In [11]:
# Spatial size of one input image.
img_rows, img_cols = 28, 28

# Load the data, already shuffled and split into train and test sets.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Add the single channel axis where the active backend expects it.
channels_first = K.image_data_format() == 'channels_first'
input_shape = (1, img_rows, img_cols) if channels_first else (img_rows, img_cols, 1)
x_train = x_train.reshape((x_train.shape[0],) + input_shape)
x_test = x_test.reshape((x_test.shape[0],) + input_shape)

# Convert to float32 and divide by 255.
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# One-hot encode the integer class labels.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples


## Train one model for each activation function and optimizer

In [None]:

# Train one CNN per (activation, optimizer, experiment) combination, log the
# per-epoch metrics to ./logs and print an estimate of the remaining time.

# Builders (callables) instead of ready-made instances: every position in every
# model gets its own layer object, and every run gets its own optimizer, so no
# state (weights, optimizer slots, a learning rate already lowered by
# ReduceLROnPlateau) is shared between models or carried over between runs.
activation_builders = {
    'sigmoid': lambda: Activation(keras.activations.sigmoid),
    'tanh': lambda: Activation(keras.activations.tanh),
    'relu': lambda: Activation(keras.activations.relu),
    'linear': lambda: Activation(keras.activations.linear),
    'elu': lambda: Activation(keras.activations.elu),
    'softplus': lambda: Activation(keras.activations.softplus),
    'softsign': lambda: Activation(keras.activations.softsign),
    'hard_sigmoid': lambda: Activation(keras.activations.hard_sigmoid),
    'LeakyReLU': lambda: keras.layers.advanced_activations.LeakyReLU(),
    'PReLU': lambda: keras.layers.advanced_activations.PReLU(),
    'ThresholdedReLU': lambda: keras.layers.advanced_activations.ThresholdedReLU(theta=0.7),
}
optimizer_builders = {
    'rmsp': lambda: keras.optimizers.RMSprop(),
    'adam': lambda: keras.optimizers.Adam(),
    'sgd': lambda: keras.optimizers.SGD(),
    'Adagrad': lambda: keras.optimizers.Adagrad(),
    'Adadelta': lambda: keras.optimizers.Adadelta(),
    'Adamax': lambda: keras.optimizers.Adamax(),
    'Nadam': lambda: keras.optimizers.Nadam(),
}

# Fail fast on a misspelled name instead of silently training with whatever
# activation / optimizer the previous iteration happened to leave behind.
unknown_activations = [name for name in activations if name not in activation_builders]
unknown_optimizers = [name for name in optimizers if name not in optimizer_builders]
if unknown_activations or unknown_optimizers:
    raise ValueError('Unknown activation(s) %s / optimizer(s) %s'
                     % (unknown_activations, unknown_optimizers))

# The CSV logs are written below ./logs, so make sure that directory (and the
# model directory) exists before the first run starts.
for directory in ('./logs', save_dir):
    if not os.path.isdir(directory):
        os.makedirs(directory)

for act in activations:
    make_activation = activation_builders[act]
    for opt in optimizers:
        make_optimizer = optimizer_builders[opt]
        print("\nTraining for activation " + act + " with optimizer " + opt)

        for i in range(experiments):
            # e.g. mnist_cnn_relu_adam_40_0, with an '_fd' suffix for the final-dense variant
            model_name = 'mnist_cnn_' + act + "_" + opt + '_' + str(epochs) + '_' + str(i + start)
            if add_final_dense:
                model_name += '_fd'

            model = Sequential()
            model.add(Conv2D(32, kernel_size=(3, 3),
                             input_shape=input_shape))
            model.add(make_activation())
            model.add(Conv2D(64, kernel_size=(3, 3)))
            model.add(make_activation())
            model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))
            model.add(Dropout(0.2))
            model.add(Flatten())
            if add_final_dense:
                model.add(Dense(128))
                model.add(make_activation())
                model.add(Dropout(0.2))
            model.add(Dense(num_classes, activation='softmax'))

            # Fresh optimizer per run (see the note on the builders above).
            optimizer = make_optimizer()
            model.compile(loss='categorical_crossentropy',
                          optimizer=optimizer,
                          metrics=['accuracy'])

            print('-'*30)
            print('Experiment', i)

            csv_logger = CSVLogger('./logs/%s_%d.csv' % (model_name, units), append=False, separator=';')
            # Multiply the learning rate by 0.2 after 3 epochs without improvement of val_acc.
            reduce_lr = ReduceLROnPlateau(monitor='val_acc', factor=0.2, patience=3, verbose=1, mode='auto', epsilon=0.0001, cooldown=2, min_lr=0)
            # NOTE(review): the test split doubles as validation data here, so the
            # learning-rate schedule is tuned on the data used for the final score.
            history = model.fit(x_train, y_train,
                                batch_size=batch_size,
                                epochs=epochs,
                                verbose=1,
                                validation_data=(x_test, y_test), callbacks=[csv_logger, reduce_lr])

            # Saving is currently disabled; uncomment the second line to keep the trained model.
            model_path = os.path.join(save_dir, model_name + ".h5")
#             model.save(model_path)

            score = model.evaluate(x_test, y_test, verbose=0)
            print('Test loss:', score[0])
            print('Test accuracy:', score[1])

            # Remaining-time estimate: extrapolate the average time per finished
            # run (measured since `start_time`) to all `total_items` runs.
            counter += 1
            time_diff = time.time() - start_time
            rem_items = total_items - counter
            total_time = round((total_items / counter) * time_diff)
            # Clamp at zero: re-running this cell without resetting `counter`
            # would otherwise produce a negative remaining time.
            rem_time = max(0, round(total_time - time_diff))
            m, s = divmod(rem_time, 60)
            h, m = divmod(m, 60)
            d, h = divmod(h, 24)
            print('Remaining time: %d days %d hours %02d minutes %02d seconds' % (d, h, m, s))
            


Training for activation tanh with optimizer rmsp
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 00009: reducing learning rate to 0.00020000000949949026.
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 00015: reducing learning rate to 4.0000001899898055e-05.
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 00020: reducing learning rate to 8.000000525498762e-06.
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 00025: reducing learning rate to 1.6000001778593287e-06.
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 00030: reducing learning rate to 3.200000264769187e-07.
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 00035: reducing learning rate to 6.400000529538374e-08.
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Epoch 0004

Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 00015: reducing learning rate to 0.00020000000949949026.
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 00021: reducing learning rate to 4.0000001899898055e-05.
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 00026: reducing learning rate to 8.000000525498762e-06.
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 00031: reducing learning rate to 1.6000001778593287e-06.
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 00036: reducing learning rate to 3.200000264769187e-07.
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.0902440554044
Test accuracy: 0.9837
Remaining time: 6 days 20 hours 41 minutes 01 seconds

Training for activation tanh with optimizer sgd
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epo

Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 00025: reducing learning rate to 0.0019999999552965165.
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 00031: reducing learning rate to 0.0003999999724328518.
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 00036: reducing learning rate to 7.999999215826393e-05.
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.050480847574
Test accuracy: 0.9836
Remaining time: 6 days 9 hours 37 minutes 28 seconds

Training for activation tanh with optimizer Adagrad
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 00012: reducing learning rate to 0.0019999999552965165.
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 00017: reducing learning rate to 0.0003999999724328518.
Epoch 18/40
Epoch 1

Epoch 32/40
Epoch 00032: reducing learning rate to 3.199999628122896e-06.
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 00037: reducing learning rate to 6.399999165296323e-07.
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.0439745311859
Test accuracy: 0.9862
Remaining time: 6 days 3 hours 09 minutes 59 seconds

Training for activation tanh with optimizer Adadelta
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 00014: reducing learning rate to 0.2.
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 00023: reducing learning rate to 0.04000000059604645.
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 00028: reducing learning rate to 0.007999999821186066.
Epoch 29/40
Epoch 30/40
Ep

Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 00011: reducing learning rate to 0.0004000000189989805.
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 00017: reducing learning rate to 8.000000379979611e-05.
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 00024: reducing learning rate to 1.6000001050997525e-05.
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 00029: reducing learning rate to 3.2000003557186575e-06.
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 00034: reducing learning rate to 6.400000529538374e-07.
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 00039: reducing learning rate to 1.280000105907675e-07.
Epoch 40/40
Test loss: 0.0486788884263
Test accuracy: 0.9871
Remaining time: 5 days 18 hours 35 minutes 08 seconds

T

Epoch 12/40
Epoch 00012: reducing learning rate to 0.0004000000189989805.
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 00022: reducing learning rate to 8.000000379979611e-05.
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 00027: reducing learning rate to 1.6000001050997525e-05.
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 00032: reducing learning rate to 3.2000003557186575e-06.
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 00037: reducing learning rate to 6.400000529538374e-07.
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.103151865698
Test accuracy: 0.9831
Remaining time: 5 days 15 hours 06 minutes 28 seconds

Training for activation relu with optimizer rmsp
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 

Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 00025: reducing learning rate to 8.000000525498762e-06.
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 00030: reducing learning rate to 1.6000001778593287e-06.
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 00035: reducing learning rate to 3.200000264769187e-07.
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Epoch 00040: reducing learning rate to 6.400000529538374e-08.
Test loss: 0.0342426698056
Test accuracy: 0.991
Remaining time: 5 days 11 hours 46 minutes 53 seconds

Training for activation relu with optimizer adam
------------------------------
Experiment 0
Train on 60000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 00010: reducing learning rate to 0.00020000000949949026.
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 1

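# SeLU
SELU has special requirements, so it is trained in its own loop: the SELU layers below use `lecun_normal` kernel initialization, and `AlphaDropout` replaces the regular `Dropout`.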

In [None]:
# start = 2
# Train the SELU variant of the CNN once per (optimizer, experiment)
# combination. SELU layers use `lecun_normal` initialization and AlphaDropout
# instead of Dropout, which is why this loop is separate from the generic one.

# Builders (callables), so every run gets a brand-new optimizer object.
selu_optimizer_builders = {
    'rmsp': lambda: keras.optimizers.RMSprop(),
    'adam': lambda: keras.optimizers.Adam(),
    'sgd': lambda: keras.optimizers.SGD(),
    'Adagrad': lambda: keras.optimizers.Adagrad(),
    'Adadelta': lambda: keras.optimizers.Adadelta(),
    'Adamax': lambda: keras.optimizers.Adamax(),
    'Nadam': lambda: keras.optimizers.Nadam(),
}

# Fail fast on a misspelled optimizer name instead of silently reusing the
# optimizer object left over from a previous iteration.
unknown_selu_optimizers = [name for name in optimizers if name not in selu_optimizer_builders]
if unknown_selu_optimizers:
    raise ValueError('Unknown optimizer(s) %s' % (unknown_selu_optimizers,))

# The CSV logs are written below ./logs, so make sure that directory (and the
# model directory) exists before the first run starts.
for directory in ('./logs', save_dir):
    if not os.path.isdir(directory):
        os.makedirs(directory)

for opt in optimizers:
    print("Training for activation SeLU with optimizer " + opt)
    for i in range(experiments):
        # e.g. mnist_cnn_selu_adam_40_0, with an '_fd' suffix for the final-dense variant
        model_name = 'mnist_cnn_selu_' + opt + '_' + str(epochs) + '_' + str(i + start)
        if add_final_dense:
            model_name += '_fd'

        model = Sequential()
        model.add(Conv2D(32, kernel_size=(3, 3),
                         activation='selu', kernel_initializer='lecun_normal',
                         input_shape=input_shape))
        model.add(Conv2D(64, (3, 3), activation='selu', kernel_initializer='lecun_normal'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(AlphaDropout(0.2))
        model.add(Flatten())
        if add_final_dense:
            model.add(Dense(128, activation='selu', kernel_initializer='lecun_normal'))
            model.add(AlphaDropout(0.2))
        model.add(Dense(num_classes, activation='softmax'))

        optimizer = selu_optimizer_builders[opt]()
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['accuracy'])
        print('-'*30)
        print('Experiment', i+1)

        csv_logger = CSVLogger('./logs/%s_%d.csv' % (model_name, units), append=False, separator=';')
        # Multiply the learning rate by 0.2 after 3 epochs without improvement of val_acc.
        reduce_lr = ReduceLROnPlateau(monitor='val_acc', factor=0.2, patience=3, verbose=0, mode='auto', epsilon=0.0001, cooldown=2, min_lr=0)
        # NOTE(review): the test split doubles as validation data here, so the
        # learning-rate schedule is tuned on the data used for the final score.
        history = model.fit(x_train, y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            verbose=1,
                            validation_data=(x_test, y_test), callbacks=[csv_logger, reduce_lr])

        # Saving is currently disabled; uncomment the second line to keep the trained model.
        model_path = os.path.join(save_dir, model_name + ".h5")
#         model.save(model_path)

        score = model.evaluate(x_test, y_test, verbose=0)
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])

        # Remaining-time estimate: extrapolate the average time per finished
        # run (measured since `start_time`) to all `total_items` runs.
        counter += 1
        time_diff = time.time() - start_time
        rem_items = total_items - counter
        # The original line had an unbalanced parenthesis (a SyntaxError); the
        # ratio is scaled by the elapsed time first and the product is rounded.
        total_time = round((total_items / counter) * time_diff)
        # Clamp at zero: re-running this cell without resetting `counter`
        # would otherwise produce a negative remaining time.
        rem_time = max(0, round(total_time - time_diff))
        m, s = divmod(rem_time, 60)
        h, m = divmod(m, 60)
        d, h = divmod(h, 24)
        print('Remaining time: %d days %d hours %02d minutes %02d seconds' % (d, h, m, s))
        