### Highway Networks

http://people.idsia.ch/~rupesh/very_deep_learning/

In [24]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import numpy as np
import tensorflow as tf

# wget https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/examples/tutorials/mnist/input_data.py
import input_data

$$ \textbf{y} = H(\textbf{x}, \textbf{W}_\textbf{H}) $$

In [2]:
def dense_layer(x, input_size, output_size, activation):
    """Build a plain fully-connected layer: ``activation(x W + b)``.

    Args:
        x: Input tensor that is matrix-multiplied with the
            ``[input_size, output_size]`` weight matrix.
        input_size: Number of input features (rows of the weight matrix).
        output_size: Number of output units (columns of the weight matrix
            and length of the bias vector).
        activation: Callable applied to the affine output, e.g. ``tf.nn.relu``.

    Returns:
        The result of ``activation`` applied to ``x W + b``.
    """
    # Weights start from a truncated normal (stddev 0.1), biases from 0.1.
    weight_init = tf.truncated_normal([input_size, output_size], stddev=0.1)
    weights = tf.Variable(weight_init, name='weight')

    bias_init = tf.constant(0.1, shape=[output_size])
    biases = tf.Variable(bias_init, name='bias')

    pre_activation = tf.matmul(x, weights) + biases
    return activation(pre_activation)

\begin{align}
  \textbf{y} &= H(\textbf{x}, \textbf{W}_\textbf{H}) \cdot T(\textbf{x}, \textbf{W}_\textbf{T}) + \textbf{x} \cdot C(\textbf{x}, \textbf{W}_\textbf{C}) \\
  \textbf{y} &= H(\textbf{x}, \textbf{W}_\textbf{H}) \cdot T(\textbf{x}, \textbf{W}_\textbf{T}) + \textbf{x} \cdot (1 - T(\textbf{x}, \textbf{W}_\textbf{T}))
\end{align}

In [3]:
def highway_layer(x, size, activation, carry_bias=-1.0):
    """Build one highway layer: ``y = H(x) * T(x) + x * (1 - T(x))``.

    ``H`` is an ordinary affine + ``activation`` transform, ``T`` is a sigmoid
    "transform gate" and ``C = 1 - T`` is the "carry gate". Input and output
    share the same width because ``x`` is mixed element-wise with ``H(x)``.

    Args:
        x: Input tensor that is matrix-multiplied with ``[size, size]`` weights.
        size: Width of the layer (both input and output).
        activation: Callable used for ``H``; it is invoked with a ``name``
            keyword argument, e.g. ``tf.nn.relu``.
        carry_bias: Initial value for every element of the transform-gate
            bias. A negative value makes the sigmoid start below 0.5, so the
            layer initially carries more of ``x`` than it transforms.

    Returns:
        The gated combination ``H * T + x * C``.
    """
    square = [size, size]

    # Parameters of the plain transform H.
    h_weights = tf.Variable(tf.truncated_normal(square, stddev=0.1), name='weight')
    h_bias = tf.Variable(tf.constant(0.1, shape=[size]), name='bias')

    # Parameters of the transform gate T.
    t_weights = tf.Variable(tf.truncated_normal(square, stddev=0.1), name='weight_transform')
    t_bias = tf.Variable(tf.constant(carry_bias, shape=[size]), name='bias_transform')

    h_out = activation(tf.matmul(x, h_weights) + h_bias, name='activation')
    transform_gate = tf.sigmoid(tf.matmul(x, t_weights) + t_bias, name='transform_gate')
    carry_gate = tf.sub(1.0, transform_gate, name="carry_gate")

    transformed = tf.mul(h_out, transform_gate)  # H * T
    carried = tf.mul(x, carry_gate)              # x * C
    return tf.add(transformed, carried, name='y')

In [4]:
# One session for the whole notebook. Later cells call `.run()` / `.eval()`
# on ops and tensors without passing a session, which relies on this
# InteractiveSession having installed itself as the default session.
sess = tf.InteractiveSession()

In [5]:
# Read the MNIST data sets from ~/data/mnist/ with one-hot encoded labels.
home_dir = os.path.expanduser('~')
mnist_dir = home_dir + "/data/mnist/"
mnist = input_data.read_data_sets(mnist_dir, one_hot=True)

Extracting /Users/andy/data/mnist/train-images-idx3-ubyte.gz
Extracting /Users/andy/data/mnist/train-labels-idx1-ubyte.gz
Extracting /Users/andy/data/mnist/t10k-images-idx3-ubyte.gz
Extracting /Users/andy/data/mnist/t10k-labels-idx1-ubyte.gz


In [6]:
# Layer widths of the network.
input_layer_size = 784
# 50 units per highway layer; use ~71 instead when the hidden layers are
# swapped for fully-connected (plain) layers.
hidden_layer_size = 50
output_layer_size = 10

# Graph inputs: `x` is fed a batch of images, `y_` the matching one-hot labels.
# The leading `None` dimension leaves the batch size unspecified.
x = tf.placeholder(tf.float32, shape=[None, input_layer_size])
y_ = tf.placeholder(tf.float32, shape=[None, output_layer_size])

In [7]:
layer_count = 50
carry_bias_init = -2.0

# Stack the network: layer 0 projects the input down to the hidden width,
# the last layer is the softmax output, and everything in between is a
# highway layer. Each layer lives in its own "layerN" name scope.
last_layer = layer_count - 1
prev_y = None
y = None
for layer_idx in range(layer_count):
    with tf.name_scope("layer{0}".format(layer_idx)) as scope:
        if 0 < layer_idx < last_layer:
            # Hidden layer. For a plain-network baseline use instead:
            #   prev_y = dense_layer(prev_y, hidden_layer_size, hidden_layer_size, tf.nn.relu)
            prev_y = highway_layer(prev_y, hidden_layer_size, tf.nn.relu,
                                   carry_bias=carry_bias_init)
        elif layer_idx == 0:
            # Input layer.
            prev_y = dense_layer(x, input_layer_size, hidden_layer_size, tf.nn.relu)
        else:
            # Output layer.
            y = dense_layer(prev_y, hidden_layer_size, output_layer_size, tf.nn.softmax)

In [8]:
with tf.name_scope("loss") as scope:
    # `y` is a softmax output, so individual probabilities can underflow to
    # exactly 0. log(0) is -inf and 0 * -inf is NaN, which would poison the
    # loss and every gradient. Clip away from zero before taking the log.
    clipped_y = tf.clip_by_value(y, 1e-10, 1.0)
    # Summed (not averaged) cross-entropy over the whole batch.
    cross_entropy = -tf.reduce_sum(y_ * tf.log(clipped_y))
    cross_entropy_summary = tf.scalar_summary("loss", cross_entropy)

with tf.name_scope("train") as scope:
    # Plain SGD with a fixed learning rate of 0.01.
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

with tf.name_scope("test") as scope:
    # A sample is correct when the arg-max of the prediction matches the
    # arg-max of the one-hot label; accuracy is the mean over the batch.
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    accuracy_summary = tf.scalar_summary("accuracy", accuracy)

In [9]:
# Single op that evaluates every summary registered in the graph.
merged = tf.merge_all_summaries()

# Name the log directory after the current Unix timestamp, so runs started
# at different seconds write to separate directories.
run_stamp = int(time.time())
log_path = "mnist_logs_{0}/".format(run_stamp)
print(log_path)
writer = tf.train.SummaryWriter(log_path, sess.graph_def)

mnist_logs_1451456027/


In [12]:
# Initialize every variable in the graph using the default session.
tf.initialize_all_variables().run()

# Training schedule.
mini_batch_size = 50
num_epochs = 20
# Whole mini-batches per pass over the training set (remainder dropped).
iter_per_epoch = mnist.train.num_examples // mini_batch_size
num_iter = iter_per_epoch * num_epochs

print("Running %d epochs with mini batch size of %d" % (num_epochs, mini_batch_size))
print("With %d iterations per epoch for a total of %d iterations" % (iter_per_epoch, num_iter))

Running 20 epochs with mini batch size of 50
With 1100 iterations per epoch for a total of 22000 iterations


In [11]:
# Mini-batch SGD. Every 100 iterations the current batch is evaluated
# *before* the update step to log summaries and training accuracy, and the
# full test set is evaluated for test accuracy.
for i in range(num_iter):
    # Use the configured batch size so it stays consistent with
    # `iter_per_epoch` (which is derived from `mini_batch_size`).
    batch_xs, batch_ys = mnist.train.next_batch(mini_batch_size)
    batch_feed = {x: batch_xs, y_: batch_ys}

    if i % 100 == 0:
        summary_str, train_accuracy = sess.run([merged, accuracy], feed_dict=batch_feed)
        writer.add_summary(summary_str, i)
        # Floor division: the epoch index is an integer, not a float that
        # `%d` happens to truncate.
        print("iter %d, epoch %d, training accuracy %g" % (i, i // iter_per_epoch, train_accuracy))
        test_feed = {x: mnist.test.images, y_: mnist.test.labels}
        print("test accuracy %g" % accuracy.eval(feed_dict=test_feed))

    train_step.run(feed_dict=batch_feed)



iter 0, epoch 0, training accuracy 0.1
test accuracy 0.101
iter 100, epoch 0, training accuracy 0.12
test accuracy 0.098
iter 200, epoch 0, training accuracy 0.16
test accuracy 0.1135
iter 300, epoch 0, training accuracy 0.06
test accuracy 0.098
iter 400, epoch 0, training accuracy 0.12
test accuracy 0.1135
iter 500, epoch 0, training accuracy 0.08
test accuracy 0.1009
iter 600, epoch 0, training accuracy 0.02
test accuracy 0.1028
iter 700, epoch 0, training accuracy 0.12
test accuracy 0.1135
iter 800, epoch 0, training accuracy 0.1
test accuracy 0.1028
iter 900, epoch 0, training accuracy 0.14
test accuracy 0.101
iter 1000, epoch 0, training accuracy 0.1
test accuracy 0.101
iter 1100, epoch 1, training accuracy 0.2
test accuracy 0.1028
iter 1200, epoch 1, training accuracy 0.06
test accuracy 0.098
iter 1300, epoch 1, training accuracy 0.06
test accuracy 0.1135
iter 1400, epoch 1, training accuracy 0.32
test accuracy 0.2545
iter 1500, epoch 1, training accuracy 0.36
test accuracy 0.422