In [1]:
from utils import *

In [2]:
# load data

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("tmp/")

Extracting tmp/train-images-idx3-ubyte.gz
Extracting tmp/train-labels-idx1-ubyte.gz
Extracting tmp/t10k-images-idx3-ubyte.gz
Extracting tmp/t10k-labels-idx1-ubyte.gz


The SELU (scaled exponential linear unit) activation function's shape and parameters were determined experimentally in the [the 2017 paper Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515). Merely using this activation has the amazing effect of ensuring the outputs of neurons in a particular layer have 0 mean and a standard deviation of 1. This helps immensely with learning and the vanishing/exploding gradient problem.

To illustrate this, let's imagine a deep network with random weights and a random input. We'll forward propogate the inputs through it and watch what the cumulative effect of matrix multiplications is on the neuron's outputs when using the vanilla sigmoid function.

In [4]:
def sigmoid(x):
    return x / (1+np.exp(-x))


Z = np.random.normal(size=(500, 100))
for layer in range(15):
    W = np.random.normal(size=(100, 100), scale=np.sqrt(1/100))
    Z = sigmoid(np.dot(Z, W))
    means = np.mean(Z, axis=1)
    stds = np.std(Z, axis=1)
    print("Layer {}: {:.2f} < mean < {:.2f}, {:.2f} < std < {:.2f}".format(layer, 
                                                                               means.min(), 
                                                                               means.max(), 
                                                                               stds.min(), 
                                                                               stds.max()))

Layer 0: 0.06 < mean < 0.40, 0.35 < std < 0.81
Layer 1: -0.02 < mean < 0.22, 0.17 < std < 0.52
Layer 2: -0.02 < mean < 0.08, 0.08 < std < 0.28
Layer 3: -0.01 < mean < 0.04, 0.04 < std < 0.15
Layer 4: -0.01 < mean < 0.02, 0.02 < std < 0.08
Layer 5: -0.00 < mean < 0.01, 0.01 < std < 0.04
Layer 6: -0.01 < mean < 0.00, 0.01 < std < 0.02
Layer 7: -0.00 < mean < 0.00, 0.00 < std < 0.01
Layer 8: -0.00 < mean < 0.00, 0.00 < std < 0.01
Layer 9: -0.00 < mean < 0.00, 0.00 < std < 0.00
Layer 10: -0.00 < mean < 0.00, 0.00 < std < 0.00
Layer 11: -0.00 < mean < 0.00, 0.00 < std < 0.00
Layer 12: -0.00 < mean < 0.00, 0.00 < std < 0.00
Layer 13: -0.00 < mean < 0.00, 0.00 < std < 0.00
Layer 14: -0.00 < mean < 0.00, 0.00 < std < 0.00


By the 15th layer, most of the neurons are dead - they're not getting enough of a signal. By contrast, compare SELU. Note that in this SELU example we're not going to use a 15-layer neural network, we'll use a 100 layer one, and still the signals are preserved.

In [5]:
def elu(z, alpha=1, name=None):
    return np.where(z < 0, alpha * (np.exp(z) - 1), z)

def selu(z,
         scale=1.0507009873554804934193349852946,
         alpha=1.6732632423543772848170429916717, name=None):
    return scale * elu(z, alpha)

Z = np.random.normal(size=(500, 100))
for layer in range(100):
    W = np.random.normal(size=(100, 100), scale=np.sqrt(1/100))
    Z = selu(np.dot(Z, W))
    means = np.mean(Z, axis=1)
    stds = np.std(Z, axis=1)
    if layer % 10 == 0:
        print("Layer {}: {:.2f} < mean < {:.2f}, {:.2f} < std < {:.2f}".format(layer, 
                                                                               means.min(), 
                                                                               means.max(), 
                                                                               stds.min(), 
                                                                               stds.max()))

Layer 0: -0.26 < mean < 0.37, 0.75 < std < 1.28
Layer 10: -0.26 < mean < 0.20, 0.74 < std < 1.31
Layer 20: -0.28 < mean < 0.26, 0.77 < std < 1.40
Layer 30: -0.30 < mean < 0.33, 0.76 < std < 1.25
Layer 40: -0.24 < mean < 0.24, 0.73 < std < 1.25
Layer 50: -0.23 < mean < 0.22, 0.72 < std < 1.30
Layer 60: -0.31 < mean < 0.37, 0.75 < std < 1.28
Layer 70: -0.23 < mean < 0.19, 0.72 < std < 1.26
Layer 80: -0.25 < mean < 0.27, 0.78 < std < 1.27
Layer 90: -0.22 < mean < 0.27, 0.70 < std < 1.36


Pretty impressive. Here's a shallow mnist neural network that uses it - just so we know how implementation works. Likely, in the future TF will have its own native implementation of SELU.

In [6]:
# redefining activations with tf ops

def elu(z, alpha=1, name=None):
    return tf.where(z < 0, alpha * (tf.exp(z) - 1), z)

def selu(z,
         scale=1.0507009873554804934193349852946,
         alpha=1.6732632423543772848170429916717, name=None):
    return scale * elu(z, alpha)

In [7]:
# construction

reset_graph()

n_inputs = 28*28
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

learning_rate= 0.01

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=selu , name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=selu , name="hidden2")
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")
    
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
    loss = tf.reduce_mean(xentropy, name="loss")   
    
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
    
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, k=1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [8]:
# execution

n_epochs = 40
batch_size = 50

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X:X_batch, y:y_batch})
        if epoch % 5 == 0 or epoch == n_epochs - 1:
            acc_train = accuracy.eval(feed_dict={X:mnist.train.images, y:mnist.train.labels})
            acc_val = accuracy.eval(feed_dict={X:mnist.validation.images, y:mnist.validation.labels})
            print(epoch, "train acc:", acc_train, "val acc:", acc_val)
    acc_test = accuracy.eval(feed_dict={X:mnist.test.images, y:mnist.test.labels})
    print("Test acc:", acc_test)

0 train acc: 0.903691 val acc: 0.9108
5 train acc: 0.937545 val acc: 0.938
10 train acc: 0.955418 val acc: 0.9542
15 train acc: 0.966818 val acc: 0.9618
20 train acc: 0.973382 val acc: 0.9664
25 train acc: 0.978509 val acc: 0.968
30 train acc: 0.982818 val acc: 0.97
35 train acc: 0.986309 val acc: 0.9724
39 train acc: 0.987873 val acc: 0.974
Test acc: 0.9742
