In [1]:
import numpy as np
import tensorflow as tf
import tensorpack as tp

In [2]:
from functools import reduce
from tensorflow.python.ops import gradients_impl
from tensorflow.python.ops import array_ops, tensor_array_ops, control_flow_ops

def hessians_highrank(ys, xs, gradients=None, name="hessians", colocate_gradients_with_ops=False,
            gate_gradients=False, aggregation_method=None):
  """Constructs the Hessian (one or more rank matrix) of sum of `ys` with respect to `x` in `xs`.
  `hessians_highrank()` adds ops to the graph to output the Hessian matrix of `ys`
  with respect to `xs`.  It returns a list of `Tensor` of length `len(xs)`
  where each tensor is the Hessian of `sum(ys)`. This function currently
  only supports evaluating the Hessian with respect to (a list of) one-
  dimensional tensors.
  The Hessian is a matrix of second-order partial derivatives of a scalar
  tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details).
  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'hessians'.
    colocate_gradients_with_ops: See `gradients()` documentation for details.
    gate_gradients: See `gradients()` documentation for details.
    aggregation_method: See `gradients()` documentation for details.
  Returns:
    A list of Hessian matrices of `sum(ys)` for each `x` in `xs`.
  Raises:
    LookupError: if one of the operations between `xs` and `ys` does not
      have a registered gradient function.
  """
  xs = gradients_impl._AsList(xs)
  kwargs = {
    'colocate_gradients_with_ops': colocate_gradients_with_ops,
    'gate_gradients': gate_gradients,
    'aggregation_method': aggregation_method
  }
  # Compute first-order derivatives and iterate for each x in xs.
  hessians = []
  _gradients = tf.gradients(ys, xs, **kwargs) if gradients is None else gradients
  for i, _gradient, x in zip(range(len(xs)), _gradients, xs):
    shape = x.shape
    _gradient = tf.reshape(_gradient, [-1])
    
    n = tf.size(x)
    loop_vars = [
      array_ops.constant(0, tf.int32),
      tensor_array_ops.TensorArray(x.dtype, n)
    ]
    _, hessian = control_flow_ops.while_loop(
      lambda j, _: j < n,
      lambda j, result: (j + 1, result.write(j, tf.gradients(_gradient[j], x)[0])),
      loop_vars
    )
    hessians.append(hessian.stack())
  return hessians

In [3]:
import cv2
import numpy as np
import tensorflow as tf
import tensorpack as tp

from tensorpack import dataset
from tensorpack.dataflow import imgaug, AugmentImageComponent, BatchData, PrefetchData
import tensorpack.tfutils.symbolic_functions as symbf

import tensorflow.contrib.slim as slim
from tensorflow.python.framework import ops
from tensorflow.python.training import optimizer

class ModelMNIST10x10_simple(object):
    def __init__(self, learning_rate=1.0, batch_size=128):
        self.batch_size = batch_size
        self.inputs = [
            tf.placeholder(tf.float32, shape=(None, 10, 10, 1)),
            tf.placeholder(tf.int32, shape=(None,)),
            tf.placeholder(tf.float32, shape=(None, 10))
        ]
        
        self.probability, self.cost, self.accuracy = self._build_graph(self.inputs)
        self.op = self._get_optimize_operator(self.cost, learning_rate)
        self.dataflow = {
            'train':self._get_data('train'),
            'valid':self._get_data('test'),
        }
        
    def _build_graph(self, inputs):
        image, label, vector = inputs
        
        with slim.arg_scope([slim.layers.fully_connected], weights_regularizer=slim.l2_regularizer(1e-5)):
            l = slim.layers.conv2d(image, 8, [3, 3], padding='SAME', scope='conv0' ) # 10x10
            l = slim.layers.max_pool2d(l, [2, 2], scope='pool0') # 5x5
            l = slim.layers.conv2d(l, 8, [3, 3], scope='conv1') # 3x3
            l = slim.layers.conv2d(l, 8, [3, 3], scope='conv2') # 1x1
            l = slim.layers.flatten(l, scope='flatten')
            logits = slim.layers.fully_connected(l, 10, activation_fn=None, scope='fc0')

        # Currently there is no way to take the second derivative of sparse_softmax_cross_entropy_with_logits due to the fused implementation
        #cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=vector)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        prob = tf.nn.softmax(logits, name='prob')
        accuracy = symbf.accuracy(logits, label, topk=1)
        return prob, cost, accuracy
    
    def _get_optimize_operator(self, cost, learning_rate=1.0):
        var_list = (tf.trainable_variables() + tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
        var_list += tf.get_collection(tf.GraphKeys._STREAMING_MODEL_PORTS)

        processors = [optimizer._get_processor(v) for v in var_list]
        var_refs = [p.target() for p in processors]

        # compute_gradients
        grads = tf.gradients(
                cost, var_refs,
                grad_ys=None, aggregation_method=None, colocate_gradients_with_ops=True)
        hessis = hessians_highrank(
                 cost, var_refs, gradients=grads,
                 aggregation_method=None, colocate_gradients_with_ops=True)
        
        second_order_grads = []
        for g, h in zip(grads, hessis):
            shape = g.shape
            d = int(reduce(lambda a,b: a*b, shape))

            g = tf.reshape(g, [d, 1])
            h = tf.reshape(h, [d, d]) + (tf.eye(d) * 1e-1)
            h_inv = tf.matrix_inverse(h)
            grad = tf.matmul(h_inv, g)
            grad = tf.reshape(grad, shape)
            second_order_grads.append(grad)
        grads_and_vars = list(zip(second_order_grads, var_list))
        
        self.grads = grads
        
        self.global_step = tf.contrib.framework.get_or_create_global_step()
        lr_schedule = {
            'step':     [                   50],
            'rate':     [0.1*learning_rate, learning_rate],
        }
        lr_schedule['step'] = ops.convert_n_to_tensor(lr_schedule['step'], tf.int64)
        learning_rate = tf.train.piecewise_constant(self.global_step, lr_schedule['step'], lr_schedule['rate'])
        #opt = tf.train.AdamOptimizer(learning_rate)
        opt = tf.train.MomentumOptimizer(learning_rate, 0.9, use_nesterov=True)
        #opt = tf.train.GradientDescentOptimizer(learning_rate)
        return opt.apply_gradients(grads_and_vars)
    
    def _get_data(self, train_or_test):
        BATCH_SIZE = self.batch_size
        isTrain = train_or_test == 'train'
        ds = dataset.Mnist(train_or_test)
        if isTrain:
            augmentors = [
                #imgaug.RandomApplyAug(imgaug.RandomResize((0.8, 1.2), (0.8, 1.2)), 0.3),
                #imgaug.RandomApplyAug(imgaug.RotationAndCropValid(15), 0.5),
                #imgaug.RandomApplyAug(imgaug.SaltPepperNoise(white_prob=0.01, black_prob=0.01), 0.25),
                imgaug.Resize((10, 10)),
                imgaug.CenterPaste((12, 12)),
                imgaug.RandomCrop((10, 10)),
                imgaug.MapImage(lambda x: x.reshape(10, 10, 1))
            ]
        else:
            augmentors = [
                imgaug.Resize((10, 10)),
                imgaug.MapImage(lambda x: x.reshape(10, 10, 1))
            ]
        ds = AugmentImageComponent(ds, augmentors)
        ds = BatchData(ds, BATCH_SIZE, remainder=not isTrain)
        if isTrain:
            ds = PrefetchData(ds, 3, 2)
        return ds

In [4]:
model = ModelMNIST10x10_simple(learning_rate=0.9)

[32m[1016 16:47:36 @fs.py:89][0m [5m[31mWRN[0m Env var $TENSORPACK_DATASET not set, using /root/tensorpack_data for datasets.


In [None]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
print('session initialized')

session initialized


In [None]:
import sklearn.preprocessing

history = []
for epoch in range(30):
    result = {}
    
    model.dataflow['train'].reset_state()
    step, costs, accuracies = 0, [], []
    for datapoint in model.dataflow['train'].get_data():
        datapoint.append( sklearn.preprocessing.label_binarize( datapoint[1], range(10) ).astype(np.float32) )
        _, cost, accuracy, grads = sess.run([model.op, model.cost, model.accuracy, model.grads],
                                     feed_dict=dict(zip(model.inputs, datapoint)))
        grads = [np.sum(np.abs(g)) for g in grads]
        costs.append(cost)
        accuracies.append(accuracy)
        step += 1
        print('[train] epoch:%04d step:%04d cost:%.3f accuracy:%0.3f'%(epoch, step, cost, accuracy))
    print('[%04d] [train] cost:%.3f accuracy:%0.3f'%(epoch+1, np.mean(costs), np.mean(accuracies)), end=' ')
    result['train'] = {'epoch':epoch, 'cost':np.mean(costs), 'accuracy':np.mean(accuracies), 'grads_abs':grads}

    model.dataflow['valid'].reset_state()
    costs, accuracies = [], []
    for datapoint in model.dataflow['valid'].get_data():
        datapoint.append( sklearn.preprocessing.label_binarize( datapoint[1], range(10) ).astype(np.float32) )
        cost, accuracy = sess.run([model.cost, model.accuracy],
                                     feed_dict=dict(zip(model.inputs, datapoint)))
        costs.append(cost)
        accuracies.append(accuracy)
        
    print('[valid] cost:%.3f accuracy:%0.3f'%(np.mean(costs), np.mean(accuracies)))
    result['valid'] = {'epoch':epoch, 'cost':np.mean(costs), 'accuracy':np.mean(accuracies)}
    
    history.append( result )

[train] epoch:0000 step:0001 cost:0.695 accuracy:0.094
[train] epoch:0000 step:0002 cost:0.624 accuracy:0.117
[train] epoch:0000 step:0003 cost:0.456 accuracy:0.125
[train] epoch:0000 step:0004 cost:0.341 accuracy:0.117
[train] epoch:0000 step:0005 cost:0.423 accuracy:0.055
[train] epoch:0000 step:0006 cost:0.333 accuracy:0.148
[train] epoch:0000 step:0007 cost:0.335 accuracy:0.125
[train] epoch:0000 step:0008 cost:0.340 accuracy:0.109
[train] epoch:0000 step:0009 cost:0.338 accuracy:0.109
[train] epoch:0000 step:0010 cost:0.329 accuracy:0.156
[train] epoch:0000 step:0011 cost:0.323 accuracy:0.148
[train] epoch:0000 step:0012 cost:0.324 accuracy:0.109
[train] epoch:0000 step:0013 cost:0.319 accuracy:0.141
[train] epoch:0000 step:0014 cost:0.318 accuracy:0.203
[train] epoch:0000 step:0015 cost:0.315 accuracy:0.195
[train] epoch:0000 step:0016 cost:0.309 accuracy:0.344
[train] epoch:0000 step:0017 cost:0.311 accuracy:0.281
[train] epoch:0000 step:0018 cost:0.310 accuracy:0.312
[train] ep

[train] epoch:0000 step:0150 cost:0.066 accuracy:0.945
[train] epoch:0000 step:0151 cost:0.067 accuracy:0.906
[train] epoch:0000 step:0152 cost:0.071 accuracy:0.891
[train] epoch:0000 step:0153 cost:0.078 accuracy:0.875
[train] epoch:0000 step:0154 cost:0.084 accuracy:0.867
[train] epoch:0000 step:0155 cost:0.093 accuracy:0.859
[train] epoch:0000 step:0156 cost:0.062 accuracy:0.914
[train] epoch:0000 step:0157 cost:0.068 accuracy:0.883
[train] epoch:0000 step:0158 cost:0.082 accuracy:0.844
[train] epoch:0000 step:0159 cost:0.066 accuracy:0.922
[train] epoch:0000 step:0160 cost:0.066 accuracy:0.930
[train] epoch:0000 step:0161 cost:0.057 accuracy:0.922
[train] epoch:0000 step:0162 cost:0.083 accuracy:0.867
[train] epoch:0000 step:0163 cost:0.074 accuracy:0.875
[train] epoch:0000 step:0164 cost:0.079 accuracy:0.906
[train] epoch:0000 step:0165 cost:0.065 accuracy:0.891
[train] epoch:0000 step:0166 cost:0.057 accuracy:0.930
[train] epoch:0000 step:0167 cost:0.082 accuracy:0.844
[train] ep

[train] epoch:0000 step:0299 cost:0.083 accuracy:0.891
[train] epoch:0000 step:0300 cost:0.066 accuracy:0.891
[train] epoch:0000 step:0301 cost:0.062 accuracy:0.898
[train] epoch:0000 step:0302 cost:0.039 accuracy:0.953
[train] epoch:0000 step:0303 cost:0.066 accuracy:0.883
[train] epoch:0000 step:0304 cost:0.065 accuracy:0.906
[train] epoch:0000 step:0305 cost:0.074 accuracy:0.875
[train] epoch:0000 step:0306 cost:0.043 accuracy:0.961
[train] epoch:0000 step:0307 cost:0.062 accuracy:0.906
[train] epoch:0000 step:0308 cost:0.061 accuracy:0.906
[train] epoch:0000 step:0309 cost:0.054 accuracy:0.922
[train] epoch:0000 step:0310 cost:0.039 accuracy:0.969
[train] epoch:0000 step:0311 cost:0.035 accuracy:0.953
[train] epoch:0000 step:0312 cost:0.089 accuracy:0.859
[train] epoch:0000 step:0313 cost:0.071 accuracy:0.906
[train] epoch:0000 step:0314 cost:0.097 accuracy:0.844
[train] epoch:0000 step:0315 cost:0.045 accuracy:0.945
[train] epoch:0000 step:0316 cost:0.057 accuracy:0.914
[train] ep

[train] epoch:0000 step:0448 cost:0.069 accuracy:0.867
[train] epoch:0000 step:0449 cost:0.055 accuracy:0.906
[train] epoch:0000 step:0450 cost:0.046 accuracy:0.930
[train] epoch:0000 step:0451 cost:0.057 accuracy:0.938
[train] epoch:0000 step:0452 cost:0.026 accuracy:0.977
[train] epoch:0000 step:0453 cost:0.060 accuracy:0.914
[train] epoch:0000 step:0454 cost:0.059 accuracy:0.906
[train] epoch:0000 step:0455 cost:0.058 accuracy:0.906
[train] epoch:0000 step:0456 cost:0.052 accuracy:0.891
[train] epoch:0000 step:0457 cost:0.039 accuracy:0.953
[train] epoch:0000 step:0458 cost:0.045 accuracy:0.938
[train] epoch:0000 step:0459 cost:0.067 accuracy:0.906
[train] epoch:0000 step:0460 cost:0.057 accuracy:0.891
[train] epoch:0000 step:0461 cost:0.056 accuracy:0.898
[train] epoch:0000 step:0462 cost:0.066 accuracy:0.875
[train] epoch:0000 step:0463 cost:0.049 accuracy:0.922
[train] epoch:0000 step:0464 cost:0.043 accuracy:0.961
[train] epoch:0000 step:0465 cost:0.038 accuracy:0.953
[train] ep