### GradNets

http://arxiv.org/pdf/1511.06827v1.pdf

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import numpy as np
import tensorflow as tf

# wget https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/examples/tutorials/mnist/input_data.py
import input_data

In [2]:
def dense_layer(x, input_size, output_size, activation):
    """Build a fully-connected layer: ``activation(x . W + b)``.

    Args:
        x: input tensor; must be matmul-compatible with a
            ``[input_size, output_size]`` weight matrix.
        input_size: number of input units (rows of the weight matrix).
        output_size: number of output units (columns of the weight matrix).
        activation: callable applied to the affine output
            (e.g. ``tf.nn.relu`` or ``tf.nn.softmax``).

    Returns:
        The activated output tensor.
    """
    # Weights start as small truncated-normal noise; biases start at zero.
    weights = tf.Variable(
        tf.truncated_normal([input_size, output_size], stddev=0.1),
        name='weight',
    )
    biases = tf.Variable(
        tf.constant(0., shape=[output_size]),
        name='bias',
    )
    pre_activation = tf.matmul(x, weights) + biases
    return activation(pre_activation)

$$
y = g \cdot H(x) + (1 - g) \cdot I(x)
$$

In [3]:
def grelu_layer(x, input_size, output_size, g):
    """Build a GradNet ReLU layer: ``g * relu(x . W + b) + (1 - g) * x``.

    The output interpolates between the identity (``g == 0``) and a plain
    ReLU layer (``g == 1``).

    Args:
        x: input tensor; must be matmul-compatible with a
            ``[input_size, output_size]`` weight matrix.
        input_size: number of input units.
        output_size: number of output units; must equal ``input_size``
            because the identity path adds ``x`` to the layer output.
        g: blending weight (a scalar or tensor) applied to the ReLU path;
            ``1 - g`` is applied to the identity path.

    Returns:
        The blended output tensor.

    Raises:
        ValueError: if ``input_size != output_size``.
    """
    # Fail fast with a clear message: the identity branch cannot change width,
    # and a size of 1 on either side would otherwise broadcast silently.
    if input_size != output_size:
        raise ValueError(
            "grelu_layer requires input_size == output_size for the identity "
            "path, got %r and %r" % (input_size, output_size))

    W = tf.Variable(tf.truncated_normal([input_size, output_size], stddev=0.1), name='weight')
    b = tf.Variable(tf.constant(0., shape=[output_size]), name='bias')
    transformed = tf.nn.relu(tf.matmul(x, W) + b)
    y = g * transformed + (1 - g) * x
    return y

In [4]:
# Interactive session: installs itself as the default session, which the
# later `.run()` / `.eval()` calls without an explicit session rely on.
sess = tf.InteractiveSession()

In [5]:
# MNIST archives are read from ~/data/mnist/; labels are returned one-hot encoded.
mnist_dir = os.path.expanduser('~') + "/data/mnist/"
mnist = input_data.read_data_sets(mnist_dir, one_hot=True)

Extracting /Users/andy/data/mnist/train-images-idx3-ubyte.gz
Extracting /Users/andy/data/mnist/train-labels-idx1-ubyte.gz
Extracting /Users/andy/data/mnist/t10k-images-idx3-ubyte.gz
Extracting /Users/andy/data/mnist/t10k-labels-idx1-ubyte.gz


In [6]:
# --- Network dimensions ---
input_layer_size = 784   # length of each flat input vector fed to `x`
hidden_layer_size = 50   # use ~71 for fully-connected (plain) layers, 50 for highway layers
output_layer_size = 10   # width of the one-hot label vectors fed to `y_`

# --- Training schedule ---
mini_batch_size = 50
num_epochs = 20
tau = 10  # num epochs to anneal g over

# --- Graph inputs (fed on every run) ---
x = tf.placeholder("float", [None, input_layer_size])     # input batch
y_ = tf.placeholder("float", [None, output_layer_size])   # target labels
g = tf.placeholder("float")          # blend weight used by the grelu layers
keep_prob = tf.placeholder("float")  # keep probability passed to tf.nn.dropout

In [7]:
# Creates a variable to hold the global_step.
# It is incremented by `optimizer.minimize(..., global_step=global_step)` and
# drives the `exponential_decay` learning-rate schedule, so it must start at 0
# for the step count and the decay schedule to line up with the first iteration.
global_step = tf.Variable(0, trainable=False, name='global_step')

In [8]:
layer_count = 50
last_layer = layer_count - 1

# Stack: dense ReLU input layer -> grelu hidden layers -> dense softmax output.
# Dropout is applied after every layer except the output layer.
prev_y = None
y = None
for i in range(layer_count):
    with tf.name_scope("layer{0}".format(i)) as scope:
        if i == 0:
            # First layer projects the input down to the hidden width.
            layer_out = dense_layer(x, input_layer_size, hidden_layer_size, tf.nn.relu)
        elif i != last_layer:
            # Hidden layers blend a ReLU transform with the identity via `g`.
            layer_out = grelu_layer(prev_y, hidden_layer_size, hidden_layer_size, g)
        else:
            # Output layer produces class probabilities; no dropout here.
            y = dense_layer(prev_y, hidden_layer_size, output_layer_size, tf.nn.softmax)
            continue
        prev_y = tf.nn.dropout(layer_out, keep_prob)

In [9]:
with tf.name_scope("loss") as scope:
    # Summed (not averaged) cross-entropy over the batch; the 1e-9 offset
    # keeps log() finite when a predicted probability is exactly zero.
    log_probs = tf.log(y + 1e-9)
    cross_entropy = -tf.reduce_sum(y_ * log_probs)
    cross_entropy_summary = tf.scalar_summary("loss", cross_entropy)

with tf.name_scope("train") as scope:
    # Learning rate starts at 1e-3 and decays by a factor of 0.8 per 1000 global steps.
    learning_rate = tf.train.exponential_decay(1e-3, global_step, 1000, 0.8)
    optimizer = tf.train.AdamOptimizer(learning_rate)

    # Collect (gradient, variable) pairs of the plain cross-entropy loss for
    # every variable whose name contains "weight".
    weight_gradients = []
    for grad, var in optimizer.compute_gradients(cross_entropy):
        if "weight" in var.name:
            weight_gradients.append((grad, var))
    weight_gradients = weight_gradients[1:-1]  # drop layer0 and layer49

    weights = []
    grads = []
    for grad, var in weight_gradients:
        weights.append(var)
        grads.append(grad)

    # "Norm" here is the mean absolute value over all retained weight matrices.
    weight_norm = tf.reduce_mean(tf.abs(tf.concat(0, weights)))
    weight_summary = tf.scalar_summary("weight_norm", weight_norm)

    # Same statistic over the corresponding gradients.
    gradient_norm = tf.reduce_mean(tf.abs(tf.concat(0, grads)))
    gradient_summary = tf.scalar_summary("gradient_norm", gradient_norm)

    # The optimized objective adds a penalty of 50 * weight_norm to the loss;
    # each training step also increments global_step.
    regularized_loss = cross_entropy + 50 * weight_norm
    train_step = optimizer.minimize(regularized_loss, global_step=global_step)

with tf.name_scope("test") as scope:
    # A prediction is correct when the arg-max of the network output matches
    # the arg-max of the one-hot label.
    predicted_class = tf.argmax(y, 1)
    true_class = tf.argmax(y_, 1)
    correct_prediction = tf.equal(predicted_class, true_class)
    # Accuracy is the fraction of correct predictions in the fed batch.
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    accuracy_summary = tf.scalar_summary("accuracy", accuracy)

In [10]:
# Single op that evaluates every summary defined in the graph so far.
merged = tf.merge_all_summaries()

# Each run logs to its own directory, named after the current Unix timestamp.
run_stamp = int(time.time())
log_path = "mnist_logs_{0}/".format(run_stamp)
print(log_path)
writer = tf.train.SummaryWriter(log_path, sess.graph_def)

mnist_logs_1451550074/


In [11]:
# Initialize every graph variable in the default session.
init_op = tf.initialize_all_variables()
init_op.run()

# Whole mini-batches per epoch (any remainder is dropped), and total iterations.
iter_per_epoch = mnist.train.num_examples // mini_batch_size
num_iter = num_epochs * iter_per_epoch

print("Running %d epochs with mini batch size of %d" % (num_epochs, mini_batch_size))
print("With %d iterations per epoch for a total of %d iterations" % (iter_per_epoch, num_iter))

Running 20 epochs with mini batch size of 50
With 1100 iterations per epoch for a total of 22000 iterations


$$
g = \min(t/\tau, 1)
$$

Here $g \in [0, 1]$ and $t$ is the current epoch (computed as a fractional value in the code). In code, `g` is a weight that anneals linearly from 0 to 1 over the first $\tau$ epochs and then stays at 1.

In [12]:
for i in range(num_iter):
    batch_xs, batch_ys = mnist.train.next_batch(mini_batch_size)

    # Fractional epoch (true division), so g ramps up smoothly within an epoch.
    epoch = i / iter_per_epoch
    gs = min(epoch / tau, 1.0)

    # Every 100 iterations: log summaries and report metrics with dropout disabled.
    if i % 100 == 0:
        monitor_feed = {
            x: batch_xs,
            y_: batch_ys,
            g: gs,
            keep_prob: 1.0,
        }
        summary_str, train_accuracy, w_norm, grad_norm = sess.run(
            [merged, accuracy, weight_norm, gradient_norm],
            feed_dict=monitor_feed,
        )
        writer.add_summary(summary_str, i)
        print("iter %d, epoch %d, training accuracy %g, g: %g" % (i, epoch, train_accuracy, gs))
        print("weight norm: %g, grad norm: %g" % (w_norm, grad_norm))

        # Accuracy on the full test set at the current value of g.
        test_feed = {x: mnist.test.images, y_: mnist.test.labels, g: gs, keep_prob: 1.0}
        print("test accuracy %g" % accuracy.eval(feed_dict=test_feed))

    # One optimizer step on the current mini-batch, with light dropout (keep 95%).
    train_feed = {x: batch_xs, y_: batch_ys, g: gs, keep_prob: 0.95}
    train_step.run(feed_dict=train_feed)

iter 0, epoch 0, training accuracy 0.08, g: 0
weight norm: 0.0724656, grad norm: 0
test accuracy 0.0931
iter 100, epoch 0, training accuracy 0.52, g: 0.00909091
weight norm: 0.0269349, grad norm: 0.000760927
test accuracy 0.587
iter 200, epoch 0, training accuracy 0.72, g: 0.0181818
weight norm: 0.0280573, grad norm: 0.00303799
test accuracy 0.6761
iter 300, epoch 0, training accuracy 0.72, g: 0.0272727
weight norm: 0.0468031, grad norm: 0.00763341
test accuracy 0.7225
iter 400, epoch 0, training accuracy 0.82, g: 0.0363636
weight norm: 0.0602556, grad norm: 0.00669258
test accuracy 0.7521
iter 500, epoch 0, training accuracy 0.66, g: 0.0454545
weight norm: 0.0675551, grad norm: 0.00844835
test accuracy 0.799
iter 600, epoch 0, training accuracy 0.94, g: 0.0545455
weight norm: 0.0721122, grad norm: 0.0108451
test accuracy 0.8313
iter 700, epoch 0, training accuracy 0.9, g: 0.0636364
weight norm: 0.0738767, grad norm: 0.00968969
test accuracy 0.8506
iter 800, epoch 0, training accuracy 