### GradNets

http://arxiv.org/pdf/1511.06827v1.pdf

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import numpy as np
import tensorflow as tf

# wget https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/examples/tutorials/mnist/input_data.py
import input_data

$$
g = min(t/\tau, 1)
$$

Where $g \in (0, 1)$ and $t$ is the current epoch. In code, `g` is a weight that anneals over $\tau$ epochs.

In [2]:
def dense_layer(x, input_size, output_size, activation):
    W = tf.Variable(tf.truncated_normal([input_size, output_size], stddev=0.1), name='weight')
    b = tf.Variable(tf.constant(0.1, shape=[output_size]), name='bias')
    y = activation(tf.matmul(x, W) + b)
    return y

In [3]:
def grelu_layer(x, input_size, output_size, g):
    W = tf.Variable(tf.truncated_normal([input_size, output_size], stddev=0.1), name='weight')
    b = tf.Variable(tf.constant(0.1, shape=[output_size]), name='bias')
    y = g * tf.nn.relu(tf.matmul(x, W) + b) + (1 - g) * x
    return y

In [4]:
sess = tf.InteractiveSession()

In [5]:
mnist = input_data.read_data_sets(os.path.expanduser('~') + "/data/mnist/", one_hot=True)

Extracting /Users/andy/data/mnist/train-images-idx3-ubyte.gz
Extracting /Users/andy/data/mnist/train-labels-idx1-ubyte.gz
Extracting /Users/andy/data/mnist/t10k-images-idx3-ubyte.gz
Extracting /Users/andy/data/mnist/t10k-labels-idx1-ubyte.gz


In [6]:
input_layer_size = 784
hidden_layer_size = 50 # use ~71 for fully-connected (plain) layers, 50 for highway layers
output_layer_size = 10

x = tf.placeholder("float", [None, input_layer_size])
y_ = tf.placeholder("float", [None, output_layer_size])
g = tf.placeholder("float")
learning_rate = tf.placeholder("float")
mini_batch_size = 50
num_epochs = 20
tau = 10 # num epochs to anneal g over

In [7]:
layer_count = 50

prev_y = None
y = None
for i in range(layer_count):
    with tf.name_scope("layer{0}".format(i)) as scope:
        if i == 0: # first, input layer
            prev_y = dense_layer(x, input_layer_size, hidden_layer_size, tf.nn.relu)
        elif i == layer_count - 1: # last, output layer
            y = dense_layer(prev_y, hidden_layer_size, output_layer_size, tf.nn.softmax)
        else: # hidden layers
            prev_y = grelu_layer(prev_y, hidden_layer_size, hidden_layer_size, g)

In [8]:
with tf.name_scope("loss") as scope:
    cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
    cross_entropy_summary = tf.scalar_summary("loss", cross_entropy)

with tf.name_scope("train") as scope:
    optimizer = tf.train.GradientDescentOptimizer(1e-2)
    weight_gradients = [
        grad 
        for grad, var 
        in optimizer.compute_gradients(cross_entropy) 
        if "weight" in var.name
    ][1:-1] # drop layer0 and layer49
    gradient_norm = tf.reduce_mean(tf.concat(0, weight_gradients))
    gradient_summary = tf.scalar_summary("gradient_norm", gradient_norm)
    train_step = optimizer.minimize(cross_entropy)

with tf.name_scope("test") as scope:
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    accuracy_summary = tf.scalar_summary("accuracy", accuracy)

In [9]:
merged = tf.merge_all_summaries()

log_path = "mnist_logs_{0}/".format(int(time.time()))
print(log_path)
writer = tf.train.SummaryWriter(log_path, sess.graph_def)

mnist_logs_1451520911/


In [10]:
tf.initialize_all_variables().run()

iter_per_epoch = int(mnist.train.num_examples / mini_batch_size)
num_iter = num_epochs * iter_per_epoch
print("Running %d epochs with mini batch size of %d" % (num_epochs, mini_batch_size))
print("With %d iterations per epoch for a total of %d iterations" % (iter_per_epoch, num_iter))

Running 20 epochs with mini batch size of 50
With 1100 iterations per epoch for a total of 22000 iterations


In [11]:
for i in range(num_iter):
  batch_xs, batch_ys = mnist.train.next_batch(mini_batch_size)

  epoch = i / iter_per_epoch
  gs = min(epoch / tau, 1.0)

  if i % 100 == 0:
    summary_str, train_accuracy, grad_norm = sess.run([merged, accuracy, gradient_norm], feed_dict={
        x: batch_xs,
        y_: batch_ys,
        g: gs,
    })
    writer.add_summary(summary_str, i)
    print("iter %d, epoch %d, training accuracy %g, g: %g, grad_norm: %g" % (i, epoch, train_accuracy, gs, grad_norm))
    print("test accuracy %g" % accuracy.eval(feed_dict={ x: mnist.test.images, y_: mnist.test.labels, g: gs }))

  train_step.run(feed_dict={ x: batch_xs, y_: batch_ys, g: gs})

iter 0, epoch 0, training accuracy 0.12, g: 0, grad_norm: 0
test accuracy 0.0945
iter 100, epoch 0, training accuracy 0.86, g: 0.00909091, grad_norm: -8.33214e-05
test accuracy 0.8853
iter 200, epoch 0, training accuracy 0.94, g: 0.0181818, grad_norm: -1.93185e-05
test accuracy 0.9058
iter 300, epoch 0, training accuracy 0.88, g: 0.0272727, grad_norm: -0.000155139
test accuracy 0.9093
iter 400, epoch 0, training accuracy 0.94, g: 0.0363636, grad_norm: 0.00018907
test accuracy 0.9158
iter 500, epoch 0, training accuracy 0.82, g: 0.0454545, grad_norm: -0.000708206
test accuracy 0.9077
iter 600, epoch 0, training accuracy 0.98, g: 0.0545455, grad_norm: 0.000203154
test accuracy 0.9189
iter 700, epoch 0, training accuracy 0.98, g: 0.0636364, grad_norm: -0.000361135
test accuracy 0.9099
iter 800, epoch 0, training accuracy 0.86, g: 0.0727273, grad_norm: 0.000576911
test accuracy 0.9043
iter 900, epoch 0, training accuracy 0.94, g: 0.0818182, grad_norm: 0.00152448
test accuracy 0.9035
iter 1

InvalidArgumentError: ReluGrad input is not finite. : Tensor had NaN values
	 [[Node: train/gradients_1/layer0/Relu_grad/layer0/Relu/CheckNumerics = CheckNumerics[T=DT_FLOAT, message="ReluGrad input is not finite.", _device="/job:localhost/replica:0/task:0/cpu:0"](layer0/add)]]
Caused by op u'train/gradients_1/layer0/Relu_grad/layer0/Relu/CheckNumerics', defined at:
  File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python2.7/site-packages/traitlets/config/application.py", line 592, in launch_instance
    app.start()
  File "/usr/local/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 403, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python2.7/site-packages/tornado/ioloop.py", line 883, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 260, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 212, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 370, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 175, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2902, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 3006, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 3066, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-f6c4dd7d0680>", line 15, in <module>
    train_step = optimizer.minimize(cross_entropy)
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/training/optimizer.py", line 186, in minimize
    aggregation_method=aggregation_method)
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/training/optimizer.py", line 232, in compute_gradients
    aggregation_method=aggregation_method)
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/ops/gradients.py", line 445, in gradients
    in_grads = _AsList(grad_fn(op_wrapper, *out_grads))
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/ops/nn_grad.py", line 126, in _ReluGrad
    t = _VerifyTensor(op.inputs[0], op.name, "ReluGrad input is not finite.")
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/ops/nn_grad.py", line 119, in _VerifyTensor
    verify_input = array_ops.check_numerics(t, message=msg)
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 48, in check_numerics
    name=name)
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/ops/op_def_library.py", line 664, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1834, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1043, in __init__
    self._traceback = _extract_stack()

...which was originally created as op u'layer0/Relu', defined at:
  File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
[elided 17 identical lines from previous traceback]
  File "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 3066, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-2a8f182725aa>", line 8, in <module>
    prev_y = dense_layer(x, input_layer_size, hidden_layer_size, tf.nn.relu)
  File "<ipython-input-2-2b8f3abaff4e>", line 4, in dense_layer
    y = activation(tf.matmul(x, W) + b)
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 547, in relu
    return _op_def_lib.apply_op("Relu", features=features, name=name)
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/ops/op_def_library.py", line 664, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1834, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1043, in __init__
    self._traceback = _extract_stack()


Notes:

* Check whether test set eval is causing training on test set
* Add gradient checking (take reduce_mean of norm of gradients) to see if gradients are exploding