### GradNets

Paper: [GradNets: Dynamic Interpolation Between Neural Architectures](http://arxiv.org/pdf/1511.06827v1.pdf)

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import numpy as np
import tensorflow as tf

# wget https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/examples/tutorials/mnist/input_data.py
import input_data

$$
g = min(t/\tau, 1)
$$

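where $g \in [0, 1]$, $t$ is the current epoch, and $\tau$ is the number of epochs over which $g$ is annealed. In code, `g` is the weight that blends each hidden layer's ReLU output with its input, `g * relu(xW + b) + (1 - g) * x`, so the hidden layers start as identity maps ($g = 0$) and become plain ReLU layers once $g = 1$.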

In [2]:
def dense_layer(x, input_size, output_size, activation):
    """Build a fully connected layer computing ``activation(x W + b)``.

    Args:
        x: input tensor, right-multiplied by the weight matrix.
        input_size: number of rows of the weight matrix.
        output_size: number of columns of the weight matrix and length of the bias.
        activation: callable applied to the affine output (e.g. ``tf.nn.relu``).

    Returns:
        The tensor produced by ``activation``.
    """
    # Weights start from a truncated normal (stddev 0.1), biases from a constant 0.1.
    weight_init = tf.truncated_normal([input_size, output_size], stddev=0.1)
    weights = tf.Variable(weight_init, name='weight')
    bias_init = tf.constant(0.1, shape=[output_size])
    biases = tf.Variable(bias_init, name='bias')

    pre_activation = tf.matmul(x, weights) + biases
    return activation(pre_activation)

In [3]:
def grelu_layer(x, input_size, output_size, g):
    """Build a gradually-activated ReLU layer: ``g * relu(x W + b) + (1 - g) * x``.

    With ``g == 0`` the layer passes ``x`` through unchanged; with ``g == 1``
    it is an ordinary ReLU layer. Values in between blend the two.

    Args:
        x: input tensor, right-multiplied by the weight matrix.
        input_size: number of rows of the weight matrix.
        output_size: number of columns of the weight matrix and length of the bias.
        g: blending weight (a scalar or scalar tensor/placeholder).

    Returns:
        The blended output tensor.

    Raises:
        ValueError: if ``input_size != output_size``. The ``(1 - g) * x`` term
            is added element-wise to the ReLU branch, so both must have the
            same width.
    """
    # Fail early with a clear message instead of a shape error deep inside the
    # graph (or a silent broadcast when one of the sizes happens to be 1).
    if input_size != output_size:
        raise ValueError(
            "grelu_layer requires input_size == output_size for the identity "
            "path, got input_size=%r and output_size=%r" % (input_size, output_size))

    W = tf.Variable(tf.truncated_normal([input_size, output_size], stddev=0.1), name='weight')
    b = tf.Variable(tf.constant(0.1, shape=[output_size]), name='bias')
    y = g * tf.nn.relu(tf.matmul(x, W) + b) + (1 - g) * x
    return y

In [4]:
# Session shared by the whole notebook. Later cells call `.run()` / `.eval()`
# on ops and tensors without passing a session, so they depend on a default
# session being available.
sess = tf.InteractiveSession()

In [5]:
# MNIST files live under ~/data/mnist/; labels are loaded one-hot encoded.
mnist_dir = os.path.expanduser('~') + "/data/mnist/"
mnist = input_data.read_data_sets(mnist_dir, one_hot=True)

Extracting /Users/andy/data/mnist/train-images-idx3-ubyte.gz
Extracting /Users/andy/data/mnist/train-labels-idx1-ubyte.gz
Extracting /Users/andy/data/mnist/t10k-images-idx3-ubyte.gz
Extracting /Users/andy/data/mnist/t10k-labels-idx1-ubyte.gz


In [6]:
# --- Network dimensions ---
input_layer_size = 784
# use ~71 for fully-connected (plain) layers, 50 for highway layers
hidden_layer_size = 50
output_layer_size = 10

# --- Training schedule ---
mini_batch_size = 50
num_epochs = 20
# num epochs to anneal g over
tau = 10

# --- Graph inputs ---
# x: input batch, y_: one-hot target batch, g: scalar blending weight fed each step.
x = tf.placeholder("float", [None, input_layer_size])
y_ = tf.placeholder("float", [None, output_layer_size])
g = tf.placeholder("float")
# NOTE(review): no visible cell feeds or reads this placeholder — confirm whether it is still needed.
learning_rate = tf.placeholder("float")

In [7]:
layer_count = 50

# The stack needs at least an input layer and an output layer. With fewer,
# the output layer would never be built and `y` would not exist for the loss.
if layer_count < 2:
    raise ValueError(
        "layer_count must be >= 2 (input layer + output layer), got %d" % layer_count)

# First layer: plain dense ReLU projecting the input down to the hidden width.
with tf.name_scope("layer0"):
    prev_y = dense_layer(x, input_layer_size, hidden_layer_size, tf.nn.relu)

# Hidden layers: gated ReLU layers of constant width, all sharing the same `g`.
for i in range(1, layer_count - 1):
    with tf.name_scope("layer{0}".format(i)):
        prev_y = grelu_layer(prev_y, hidden_layer_size, hidden_layer_size, g)

# Last layer: dense softmax producing the class probabilities `y`.
with tf.name_scope("layer{0}".format(layer_count - 1)):
    y = dense_layer(prev_y, hidden_layer_size, output_layer_size, tf.nn.softmax)

In [8]:
with tf.name_scope("loss"):
    # Cross-entropy summed (not averaged) over the batch. `y` is already a
    # softmax output, so a probability can underflow to exactly 0; log(0) is
    # -inf and 0 * -inf is NaN, which would poison the loss and every
    # gradient. Clipping keeps the log finite.
    clipped_y = tf.clip_by_value(y, 1e-10, 1.0)
    cross_entropy = -tf.reduce_sum(y_ * tf.log(clipped_y))
    cross_entropy_summary = tf.scalar_summary("loss", cross_entropy)

with tf.name_scope("train"):
    # Adam with its default hyper-parameters.
    train_step = tf.train.AdamOptimizer().minimize(cross_entropy)

with tf.name_scope("test"):
    # A prediction is correct when the arg-max class matches the one-hot label.
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    accuracy_summary = tf.scalar_summary("accuracy", accuracy)

In [9]:
# Single op that evaluates every summary registered so far.
merged = tf.merge_all_summaries()

# Each run logs to its own directory, named after the current Unix time in seconds.
run_stamp = int(time.time())
log_path = "mnist_logs_{0}/".format(run_stamp)
print(log_path)

writer = tf.train.SummaryWriter(log_path, sess.graph_def)

mnist_logs_1451475190/


In [10]:
# Initialise every variable in the graph using the default session.
init_op = tf.initialize_all_variables()
init_op.run()

# Whole mini-batches per epoch (any remainder is dropped) and total step count.
iter_per_epoch = mnist.train.num_examples // mini_batch_size
num_iter = iter_per_epoch * num_epochs

print("Running %d epochs with mini batch size of %d" % (num_epochs, mini_batch_size))
print("With %d iterations per epoch for a total of %d iterations" % (iter_per_epoch, num_iter))

Running 20 epochs with mini batch size of 50
With 1100 iterations per epoch for a total of 22000 iterations


In [None]:
for i in range(num_iter):
    batch_xs, batch_ys = mnist.train.next_batch(mini_batch_size)

    # Fractional epoch (true division comes from `from __future__ import division`),
    # so g rises a little every step until it saturates at 1 after `tau` epochs.
    epoch = i / iter_per_epoch
    gs = min(epoch / tau, 1.0)
    train_feed = {x: batch_xs, y_: batch_ys, g: gs}

    # Every 100 steps: log summaries and report accuracy on the current batch
    # and on the full test set, before this batch is trained on.
    if i % 100 == 0:
        summary_str, train_accuracy = sess.run([merged, accuracy], feed_dict=train_feed)
        writer.add_summary(summary_str, i)
        print("iter %d, epoch %d, training accuracy %g, g: %g" % (i, epoch, train_accuracy, gs))

        test_feed = {x: mnist.test.images, y_: mnist.test.labels, g: gs}
        print("test accuracy %g" % accuracy.eval(feed_dict=test_feed))

    train_step.run(feed_dict=train_feed)



iter 0, epoch 0, training accuracy 0.14, g: 0
test accuracy 0.0645
iter 100, epoch 0, training accuracy 0.82, g: 0.00909091
test accuracy 0.8077
iter 200, epoch 0, training accuracy 0.88, g: 0.0181818
test accuracy 0.8606
iter 300, epoch 0, training accuracy 0.8, g: 0.0272727
test accuracy 0.8879
iter 400, epoch 0, training accuracy 0.94, g: 0.0363636
test accuracy 0.8931
iter 500, epoch 0, training accuracy 0.84, g: 0.0454545
test accuracy 0.8792
iter 600, epoch 0, training accuracy 0.98, g: 0.0545455
test accuracy 0.8983
iter 700, epoch 0, training accuracy 0.88, g: 0.0636364
test accuracy 0.9058
iter 800, epoch 0, training accuracy 0.82, g: 0.0727273
test accuracy 0.9072
iter 900, epoch 0, training accuracy 0.84, g: 0.0818182
test accuracy 0.9144
iter 1000, epoch 0, training accuracy 0.9, g: 0.0909091
test accuracy 0.9243
iter 1100, epoch 1, training accuracy 0.84, g: 0.1
test accuracy 0.909
iter 1200, epoch 1, training accuracy 0.92, g: 0.109091
test accuracy 0.9325
iter 1300, epoc