In [1]:
import numpy as np
import tensorflow as tf

initial = [np.random.random_sample(), np.random.random_sample()]

## First-order Minimization

In [2]:
sess = tf.InteractiveSession()

vs = tf.Variable(initial, dtype='float32')
f = vs[0]**2 + 2.0*vs[0]*vs[1] + 3.0*vs[1]**2 + 4.0*vs[0] + 5.0*vs[1] + 6.0
sess.run(tf.global_variables_initializer())

learning_rate = 0.2
for e in range(10):
    print('EPOCH', e, 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
    
    grads = tf.gradients(f, vs)[0]
    delta = - grads*learning_rate
    print('\t \tgradients=', ['%.3f'%g for g in grads.eval()],
          'delta=', ['%.3f'%d for d in delta.eval()])
    op = vs.assign( vs + delta )
    op.eval()

print('FINAL  ', 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
sess.close()

EPOCH 0 f=14.509 variables= ['0.219', '0.924']
	 	gradients= ['6.286', '10.981'] delta= ['-1.257', '-2.196']
EPOCH 1 f=4.062 variables= ['-1.038', '-1.272']
	 	gradients= ['-0.621', '-4.711'] delta= ['0.124', '0.942']
EPOCH 2 f=2.459 variables= ['-0.914', '-0.330']
	 	gradients= ['1.512', '1.190'] delta= ['-0.302', '-0.238']
EPOCH 3 f=2.124 variables= ['-1.216', '-0.568']
	 	gradients= ['0.431', '-0.843'] delta= ['-0.086', '0.169']
EPOCH 4 f=2.009 variables= ['-1.302', '-0.400']
	 	gradients= ['0.596', '-0.004'] delta= ['-0.119', '0.001']
EPOCH 5 f=1.952 variables= ['-1.421', '-0.399']
	 	gradients= ['0.359', '-0.237'] delta= ['-0.072', '0.047']
EPOCH 6 f=1.920 variables= ['-1.493', '-0.352']
	 	gradients= ['0.310', '-0.096'] delta= ['-0.062', '0.019']
EPOCH 7 f=1.901 variables= ['-1.555', '-0.332']
	 	gradients= ['0.225', '-0.105'] delta= ['-0.045', '0.021']
EPOCH 8 f=1.890 variables= ['-1.600', '-0.311']
	 	gradients= ['0.177', '-0.069'] delta= ['-0.035', '0.014']
EPOCH 9 f=1.884 var

## Second-order Minimization

In [3]:
sess = tf.InteractiveSession()

vs = tf.Variable(initial)
f = vs[0]**2 + 2.0*vs[0]*vs[1] + 3.0*vs[1]**2 + 4.0*vs[0] + 5.0*vs[1] + 6.0
sess.run(tf.global_variables_initializer())

for e in range(3):
    print('EPOCH', e, 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
    
    grads = tf.gradients(f, vs)[0]
    hessi = tf.hessians(f, vs)[0]
    hessi_inv = tf.matrix_inverse(hessi)
    delta = - tf.reshape( tf.matmul(hessi_inv, tf.reshape(grads, [-1, 1])), [2] )
    print('\t \tgradients=', ['%.3f'%g for g in grads.eval()],
          'hessi_inv=', ('%s'%hessi_inv.eval()).replace('\n', ','),
          'delta=', ['%.3f'%d for d in delta.eval()])

    op = vs.assign( vs + delta )
    op.eval()

print('FINAL  ', 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
sess.close()

EPOCH 0 f=14.509 variables= ['0.219', '0.924']
	 	gradients= ['6.286', '10.981'] hessi_inv= [[ 0.75 -0.25], [-0.25  0.25]] delta= ['-1.969', '-1.174']
EPOCH 1 f=1.875 variables= ['-1.750', '-0.250']
	 	gradients= ['0.000', '0.000'] hessi_inv= [[ 0.75 -0.25], [-0.25  0.25]] delta= ['0.000', '-0.000']
EPOCH 2 f=1.875 variables= ['-1.750', '-0.250']
	 	gradients= ['0.000', '0.000'] hessi_inv= [[ 0.75 -0.25], [-0.25  0.25]] delta= ['-0.000', '0.000']
FINAL   f=1.875 variables= ['-1.750', '-0.250']


## Second-order Minimization self implemented Hessian

In [4]:
from functools import reduce
from tensorflow.python.ops import gradients_impl
from tensorflow.python.ops import array_ops, tensor_array_ops, control_flow_ops

def hessians_highrank(ys, xs, gradients=None, name="hessians", colocate_gradients_with_ops=False,
            gate_gradients=False, aggregation_method=None):
  """Constructs the Hessian (one or more rank matrix) of sum of `ys` with respect to `x` in `xs`.
  `hessians_highrank()` adds ops to the graph to output the Hessian matrix of `ys`
  with respect to `xs`.  It returns a list of `Tensor` of length `len(xs)`
  where each tensor is the Hessian of `sum(ys)`. This function currently
  only supports evaluating the Hessian with respect to (a list of) one-
  dimensional tensors.
  The Hessian is a matrix of second-order partial derivatives of a scalar
  tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details).
  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'hessians'.
    colocate_gradients_with_ops: See `gradients()` documentation for details.
    gate_gradients: See `gradients()` documentation for details.
    aggregation_method: See `gradients()` documentation for details.
  Returns:
    A list of Hessian matrices of `sum(ys)` for each `x` in `xs`.
  Raises:
    LookupError: if one of the operations between `xs` and `ys` does not
      have a registered gradient function.
  """
  xs = gradients_impl._AsList(xs)
  kwargs = {
    'colocate_gradients_with_ops': colocate_gradients_with_ops,
    'gate_gradients': gate_gradients,
    'aggregation_method': aggregation_method
  }
  # Compute first-order derivatives and iterate for each x in xs.
  hessians = []
  _gradients = tf.gradients(ys, xs, **kwargs) if gradients is None else gradients
  for i, _gradient, x in zip(range(len(xs)), _gradients, xs):
    shape = x.shape
    _gradient = tf.reshape(_gradient, [-1])
    
    n = tf.size(x)
    loop_vars = [
      array_ops.constant(0, tf.int32),
      tensor_array_ops.TensorArray(x.dtype, n)
    ]
    _, hessian = control_flow_ops.while_loop(
      lambda j, _: j < n,
      lambda j, result: (j + 1, result.write(j, tf.gradients(_gradient[j], x, **kwargs)[0])),
      loop_vars
    )
    hessians.append(hessian.stack())
  return hessians

In [5]:
sess = tf.InteractiveSession()

vs = tf.Variable(initial)
f = vs[0]**2 + 2.0*vs[0]*vs[1] + 3.0*vs[1]**2 + 4.0*vs[0] + 5.0*vs[1] + 6.0
sess.run(tf.global_variables_initializer())

for e in range(3):
    print('EPOCH', e, 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
    
    grads = tf.gradients(f, vs)[0]
    hessi = hessians_highrank(f, vs)[0]
    hessi_inv = tf.matrix_inverse(hessi)
    delta = - tf.reshape( tf.matmul(hessi_inv, tf.reshape(grads, [-1, 1])), [2] )
    print('\t \tgradients=', ['%.3f'%g for g in grads.eval()],
          'hessi_inv=', ('%s'%hessi_inv.eval()).replace('\n', ','),
          'delta=', ['%.3f'%d for d in delta.eval()])

    op = vs.assign( vs + delta )
    op.eval()

print('FINAL  ', 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
sess.close()

EPOCH 0 f=14.509 variables= ['0.219', '0.924']
	 	gradients= ['6.286', '10.981'] hessi_inv= [[ 0.75 -0.25], [-0.25  0.25]] delta= ['-1.969', '-1.174']
EPOCH 1 f=1.875 variables= ['-1.750', '-0.250']
	 	gradients= ['0.000', '0.000'] hessi_inv= [[ 0.75 -0.25], [-0.25  0.25]] delta= ['0.000', '-0.000']
EPOCH 2 f=1.875 variables= ['-1.750', '-0.250']
	 	gradients= ['0.000', '0.000'] hessi_inv= [[ 0.75 -0.25], [-0.25  0.25]] delta= ['-0.000', '0.000']
FINAL   f=1.875 variables= ['-1.750', '-0.250']


## Second-order Minimization self implemented Diagonal Hessian

### brute force

In [18]:
sess = tf.InteractiveSession()

vs = tf.Variable(initial)
f = vs[0]**2 + 2.0*vs[0]*vs[1] + 3.0*vs[1]**2 + 4.0*vs[0] + 5.0*vs[1] + 6.0
sess.run(tf.global_variables_initializer())

for e in range(10):
    print('EPOCH', e, 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
    
    grads = tf.gradients(f, vs)[0]
    hessi = hessians_highrank(f, vs)[0] * tf.eye(2)
    
    hessi_inv = tf.matrix_inverse(hessi)
    delta = - tf.reshape( tf.matmul(hessi_inv, tf.reshape(grads, [-1, 1])), [2] )
    print('\t \tgradients=', ['%.3f'%g for g in grads.eval()],
          'hessi_inv=', ('%s'%hessi_inv.eval()).replace('   ', '').replace('\n', ','),
          'delta=', ['%.3f'%d for d in delta.eval()])

    op = vs.assign( vs + delta )
    op.eval()

print('FINAL  ', 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
sess.close()

EPOCH 0 f=14.509 variables= ['0.219', '0.924']
	 	gradients= ['6.286', '10.981'] hessi_inv= [[ 0.50.  ], [ 0. 0.16666667]] delta= ['-3.143', '-1.830']
EPOCH 1 f=6.086 variables= ['-2.924', '-0.906']
	 	gradients= ['-3.660', '-6.286'] hessi_inv= [[ 0.50.  ], [ 0. 0.16666667]] delta= ['1.830', '1.048']
EPOCH 2 f=3.279 variables= ['-1.094', '0.141']
	 	gradients= ['2.095', '3.660'] hessi_inv= [[ 0.50.  ], [ 0. 0.16666667]] delta= ['-1.048', '-0.610']
EPOCH 3 f=2.343 variables= ['-2.141', '-0.469']
	 	gradients= ['-1.220', '-2.095'] hessi_inv= [[ 0.50.  ], [ 0. 0.16666667]] delta= ['0.610', '0.349']
EPOCH 4 f=2.031 variables= ['-1.531', '-0.120']
	 	gradients= ['0.698', '1.220'] hessi_inv= [[ 0.50.  ], [ 0. 0.16666667]] delta= ['-0.349', '-0.203']
EPOCH 5 f=1.927 variables= ['-1.880', '-0.323']
	 	gradients= ['-0.407', '-0.698'] hessi_inv= [[ 0.50.  ], [ 0. 0.16666667]] delta= ['0.203', '0.116']
EPOCH 6 f=1.892 variables= ['-1.677', '-0.207']
	 	gradients= ['0.233', '0.407'] hessi_inv= [[ 

### optimized computational impl.

In [19]:
from functools import reduce
from tensorflow.python.ops import gradients_impl
from tensorflow.python.ops import array_ops, tensor_array_ops, control_flow_ops

def invers_diagonal_hessians_highrank(ys, xs, gradients=None, name="hessians", colocate_gradients_with_ops=False,
            gate_gradients=False, aggregation_method=None):
  """Constructs the Hessian (one or more rank matrix) of sum of `ys` with respect to `x` in `xs`.
  `hessians_highrank()` adds ops to the graph to output the Hessian matrix of `ys`
  with respect to `xs`.  It returns a list of `Tensor` of length `len(xs)`
  where each tensor is the Hessian of `sum(ys)`. This function currently
  only supports evaluating the Hessian with respect to (a list of) one-
  dimensional tensors.
  The Hessian is a matrix of second-order partial derivatives of a scalar
  tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details).
  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'hessians'.
    colocate_gradients_with_ops: See `gradients()` documentation for details.
    gate_gradients: See `gradients()` documentation for details.
    aggregation_method: See `gradients()` documentation for details.
  Returns:
    A list of Hessian matrices of `sum(ys)` for each `x` in `xs`.
  Raises:
    LookupError: if one of the operations between `xs` and `ys` does not
      have a registered gradient function.
  """
  xs = gradients_impl._AsList(xs)
  kwargs = {
    'colocate_gradients_with_ops': colocate_gradients_with_ops,
    'gate_gradients': gate_gradients,
    'aggregation_method': aggregation_method
  }
  # Compute first-order derivatives and iterate for each x in xs.
  hessians = []
  _gradients = tf.gradients(ys, xs, **kwargs) if gradients is None else gradients
  for i, _gradient, x in zip(range(len(xs)), _gradients, xs):
    shape = x.shape
    _gradient = tf.reshape(_gradient, [-1])
    
    n = tf.size(x)
    g = tf.gradients(_gradient, x)[0]
    hessian = tf.diag( 1.0 / (tf.reshape(g, [-1]) + 1e-2) )
    hessians.append(hessian)
  return hessians

In [20]:
sess = tf.InteractiveSession()

vs = tf.Variable(initial)
f = vs[0]**2 + 2.0*vs[0]*vs[1] + 3.0*vs[1]**2 + 4.0*vs[0] + 5.0*vs[1] + 6.0
sess.run(tf.global_variables_initializer())

for e in range(10):
    print('EPOCH', e, 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
    
    grads = tf.gradients(f, vs)[0]
    hessi_inv = invers_diagonal_hessians_highrank(f, vs)[0]
    
    delta = - tf.reshape( tf.matmul(hessi_inv, tf.reshape(grads, [-1, 1])), [2] )
    print('\t \tgradients=', ['%.3f'%g for g in grads.eval()],
          'hessi_inv=', ('%s'%hessi_inv.eval()).replace('   ', '').replace('\n', ','),
          'delta=', ['%.3f'%d for d in delta.eval()])

    op = vs.assign( vs + delta )
    op.eval()

print('FINAL  ', 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
sess.close()

EPOCH 0 f=14.509 variables= ['0.219', '0.924']
	 	gradients= ['6.286', '10.981'] hessi_inv= [[ 0.24937655  0.  ], [ 0. 0.12484394]] delta= ['-1.568', '-1.371']
EPOCH 1 f=1.995 variables= ['-1.348', '-0.447']
	 	gradients= ['0.409', '-0.380'] hessi_inv= [[ 0.24937655  0.  ], [ 0. 0.12484394]] delta= ['-0.102', '0.047']
EPOCH 2 f=1.942 variables= ['-1.450', '-0.400']
	 	gradients= ['0.300', '-0.299'] hessi_inv= [[ 0.24937655  0.  ], [ 0. 0.12484394]] delta= ['-0.075', '0.037']
EPOCH 3 f=1.913 variables= ['-1.525', '-0.362']
	 	gradients= ['0.225', '-0.225'] hessi_inv= [[ 0.24937655  0.  ], [ 0. 0.12484394]] delta= ['-0.056', '0.028']
EPOCH 4 f=1.896 variables= ['-1.581', '-0.334']
	 	gradients= ['0.169', '-0.169'] hessi_inv= [[ 0.24937655  0.  ], [ 0. 0.12484394]] delta= ['-0.042', '0.021']
EPOCH 5 f=1.887 variables= ['-1.623', '-0.313']
	 	gradients= ['0.127', '-0.127'] hessi_inv= [[ 0.24937655  0.  ], [ 0. 0.12484394]] delta= ['-0.032', '0.016']
EPOCH 6 f=1.882 variables= ['-1.655', '-