In [1]:
import numpy as np
import tensorflow as tf

initial = [np.random.random_sample(), np.random.random_sample()]

## First-order Minimization

In [15]:
sess = tf.InteractiveSession()

vs = tf.Variable(initial, dtype='float32')
f = vs[0]**2 + 2.0*vs[0]*vs[1] + 3.0*vs[1]**2 + 4.0*vs[0] + 5.0*vs[1] + 6.0
sess.run(tf.global_variables_initializer())

learning_rate = 0.25
for e in range(10):
    print('EPOCH', e, 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
    
    grads = tf.gradients(f, vs)[0]
    delta = - grads*learning_rate
    print('\t \tgradients=', ['%.3f'%g for g in grads.eval()],
          'delta=', ['%.3f'%d for d in delta.eval()])
    op = vs.assign( vs + delta )
    op.eval()

print('FINAL  ', 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
sess.close()

EPOCH 0 f=11.267 variables= ['0.926', '0.099']
	 	gradients= ['6.049', '7.446'] delta= ['-1.512', '-1.861']
EPOCH 1 f=6.571 variables= ['-0.587', '-1.762']
	 	gradients= ['-0.698', '-6.747'] delta= ['0.175', '1.687']
EPOCH 2 f=4.223 variables= ['-0.412', '-0.075']
	 	gradients= ['3.025', '3.723'] delta= ['-0.756', '-0.931']
EPOCH 3 f=3.049 variables= ['-1.168', '-1.006']
	 	gradients= ['-0.349', '-3.374'] delta= ['0.087', '0.843']
EPOCH 4 f=2.462 variables= ['-1.081', '-0.163']
	 	gradients= ['1.512', '1.861'] delta= ['-0.378', '-0.465']
EPOCH 5 f=2.169 variables= ['-1.459', '-0.628']
	 	gradients= ['-0.175', '-1.687'] delta= ['0.044', '0.422']
EPOCH 6 f=2.022 variables= ['-1.416', '-0.206']
	 	gradients= ['0.756', '0.931'] delta= ['-0.189', '-0.233']
EPOCH 7 f=1.948 variables= ['-1.605', '-0.439']
	 	gradients= ['-0.087', '-0.843'] delta= ['0.022', '0.211']
EPOCH 8 f=1.912 variables= ['-1.583', '-0.228']
	 	gradients= ['0.378', '0.465'] delta= ['-0.095', '-0.116']
EPOCH 9 f=1.893 vari

## Second-order Minimization

In [3]:
sess = tf.InteractiveSession()

vs = tf.Variable(initial)
f = vs[0]**2 + 2.0*vs[0]*vs[1] + 3.0*vs[1]**2 + 4.0*vs[0] + 5.0*vs[1] + 6.0
sess.run(tf.global_variables_initializer())

for e in range(3):
    print('EPOCH', e, 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
    
    grads = tf.gradients(f, vs)[0]
    hessi = tf.hessians(f, vs)[0]
    hessi_inv = tf.matrix_inverse(hessi)
    delta = - tf.reshape( tf.matmul(hessi_inv, tf.reshape(grads, [-1, 1])), [2] )
    print('\t \tgradients=', ['%.3f'%g for g in grads.eval()],
          'hessi_inv=', ('%s'%hessi_inv.eval()).replace('\n', ','),
          'delta=', ['%.3f'%d for d in delta.eval()])

    op = vs.assign( vs + delta )
    op.eval()

print('FINAL  ', 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
sess.close()

EPOCH 0 f=11.267 variables= ['0.926', '0.099']
	 	gradients= ['6.049', '7.446'] hessi_inv= [[ 0.75 -0.25], [-0.25  0.25]] delta= ['-2.676', '-0.349']
EPOCH 1 f=1.875 variables= ['-1.750', '-0.250']
	 	gradients= ['0.000', '0.000'] hessi_inv= [[ 0.75 -0.25], [-0.25  0.25]] delta= ['-0.000', '0.000']
EPOCH 2 f=1.875 variables= ['-1.750', '-0.250']
	 	gradients= ['-0.000', '0.000'] hessi_inv= [[ 0.75 -0.25], [-0.25  0.25]] delta= ['0.000', '-0.000']
FINAL   f=1.875 variables= ['-1.750', '-0.250']


## Second-order Minimization self implemented Hessian

In [4]:
from functools import reduce
from tensorflow.python.ops import gradients_impl
from tensorflow.python.ops import array_ops, tensor_array_ops, control_flow_ops

def hessians_highrank(ys, xs, gradients=None, name="hessians", colocate_gradients_with_ops=False,
            gate_gradients=False, aggregation_method=None):
  """Constructs the Hessian (one or more rank matrix) of sum of `ys` with respect to `x` in `xs`.
  `hessians_highrank()` adds ops to the graph to output the Hessian matrix of `ys`
  with respect to `xs`.  It returns a list of `Tensor` of length `len(xs)`
  where each tensor is the Hessian of `sum(ys)`. This function currently
  only supports evaluating the Hessian with respect to (a list of) one-
  dimensional tensors.
  The Hessian is a matrix of second-order partial derivatives of a scalar
  tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details).
  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'hessians'.
    colocate_gradients_with_ops: See `gradients()` documentation for details.
    gate_gradients: See `gradients()` documentation for details.
    aggregation_method: See `gradients()` documentation for details.
  Returns:
    A list of Hessian matrices of `sum(ys)` for each `x` in `xs`.
  Raises:
    LookupError: if one of the operations between `xs` and `ys` does not
      have a registered gradient function.
  """
  xs = gradients_impl._AsList(xs)
  kwargs = {
    'colocate_gradients_with_ops': colocate_gradients_with_ops,
    'gate_gradients': gate_gradients,
    'aggregation_method': aggregation_method
  }
  # Compute first-order derivatives and iterate for each x in xs.
  hessians = []
  _gradients = tf.gradients(ys, xs, **kwargs) if gradients is None else gradients
  for i, _gradient, x in zip(range(len(xs)), _gradients, xs):
    shape = x.shape
    _gradient = tf.reshape(_gradient, [-1])
    
    n = tf.size(x)
    loop_vars = [
      array_ops.constant(0, tf.int32),
      tensor_array_ops.TensorArray(x.dtype, n)
    ]
    _, hessian = control_flow_ops.while_loop(
      lambda j, _: j < n,
      lambda j, result: (j + 1, result.write(j, tf.gradients(_gradient[j], x, **kwargs)[0])),
      loop_vars
    )
    hessians.append(hessian.stack())
  return hessians

In [5]:
sess = tf.InteractiveSession()

vs = tf.Variable(initial)
f = vs[0]**2 + 2.0*vs[0]*vs[1] + 3.0*vs[1]**2 + 4.0*vs[0] + 5.0*vs[1] + 6.0
sess.run(tf.global_variables_initializer())

for e in range(3):
    print('EPOCH', e, 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
    
    grads = tf.gradients(f, vs)[0]
    hessi = hessians_highrank(f, vs)[0]
    hessi_inv = tf.matrix_inverse(hessi)
    delta = - tf.reshape( tf.matmul(hessi_inv, tf.reshape(grads, [-1, 1])), [2] )
    print('\t \tgradients=', ['%.3f'%g for g in grads.eval()],
          'hessi_inv=', ('%s'%hessi_inv.eval()).replace('\n', ','),
          'delta=', ['%.3f'%d for d in delta.eval()])

    op = vs.assign( vs + delta )
    op.eval()

print('FINAL  ', 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
sess.close()

EPOCH 0 f=11.267 variables= ['0.926', '0.099']
	 	gradients= ['6.049', '7.446'] hessi_inv= [[ 0.75 -0.25], [-0.25  0.25]] delta= ['-2.676', '-0.349']
EPOCH 1 f=1.875 variables= ['-1.750', '-0.250']
	 	gradients= ['0.000', '0.000'] hessi_inv= [[ 0.75 -0.25], [-0.25  0.25]] delta= ['-0.000', '0.000']
EPOCH 2 f=1.875 variables= ['-1.750', '-0.250']
	 	gradients= ['-0.000', '0.000'] hessi_inv= [[ 0.75 -0.25], [-0.25  0.25]] delta= ['0.000', '-0.000']
FINAL   f=1.875 variables= ['-1.750', '-0.250']


## Second-order Minimization self implemented Diagonal Hessian

### brute force

In [12]:
sess = tf.InteractiveSession()

vs = tf.Variable(initial)
f = vs[0]**2 + 2.0*vs[0]*vs[1] + 3.0*vs[1]**2 + 4.0*vs[0] + 5.0*vs[1] + 6.0
sess.run(tf.global_variables_initializer())

for e in range(10):
    print('EPOCH', e, 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
    
    grads = tf.gradients(f, vs)[0]
    hessi = hessians_highrank(f, vs)[0] * tf.eye(2)
    
    hessi_inv = tf.matrix_inverse(hessi)
    delta = - tf.reshape( tf.matmul(hessi_inv, tf.reshape(grads, [-1, 1])), [2] )
    print('\t \tgradients=', ['%.3f'%g for g in grads.eval()],
          'hessi_inv=', ('%s'%tf.diag_part(hessi_inv).eval()).replace('\n', ','),
          'delta=', ['%.3f'%d for d in delta.eval()])

    op = vs.assign( vs + delta )
    op.eval()

print('FINAL  ', 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
sess.close()

EPOCH 0 f=11.267 variables= ['0.926', '0.099']
	 	gradients= ['6.049', '7.446'] hessi_inv= [ 0.5         0.16666667] delta= ['-3.025', '-1.241']
EPOCH 1 f=5.006 variables= ['-2.099', '-1.142']
	 	gradients= ['-2.482', '-6.049'] hessi_inv= [ 0.5         0.16666667] delta= ['1.241', '1.008']
EPOCH 2 f=2.919 variables= ['-0.858', '-0.134']
	 	gradients= ['2.016', '2.482'] hessi_inv= [ 0.5         0.16666667] delta= ['-1.008', '-0.414']
EPOCH 3 f=2.223 variables= ['-1.866', '-0.547']
	 	gradients= ['-0.827', '-2.016'] hessi_inv= [ 0.5         0.16666667] delta= ['0.414', '0.336']
EPOCH 4 f=1.991 variables= ['-1.453', '-0.211']
	 	gradients= ['0.672', '0.827'] hessi_inv= [ 0.5         0.16666667] delta= ['-0.336', '-0.138']
EPOCH 5 f=1.914 variables= ['-1.789', '-0.349']
	 	gradients= ['-0.276', '-0.672'] hessi_inv= [ 0.5         0.16666667] delta= ['0.138', '0.112']
EPOCH 6 f=1.888 variables= ['-1.651', '-0.237']
	 	gradients= ['0.224', '0.276'] hessi_inv= [ 0.5         0.16666667] delta= 

### optimized computational impl.

In [13]:
from functools import reduce
from tensorflow.python.ops import gradients_impl
from tensorflow.python.ops import array_ops, tensor_array_ops, control_flow_ops

def invers_diagonal_hessians_highrank(ys, xs, gradients=None, name="hessians", colocate_gradients_with_ops=False,
            gate_gradients=False, aggregation_method=None):
  """Constructs the Hessian (one or more rank matrix) of sum of `ys` with respect to `x` in `xs`.
  `hessians_highrank()` adds ops to the graph to output the Hessian matrix of `ys`
  with respect to `xs`.  It returns a list of `Tensor` of length `len(xs)`
  where each tensor is the Hessian of `sum(ys)`. This function currently
  only supports evaluating the Hessian with respect to (a list of) one-
  dimensional tensors.
  The Hessian is a matrix of second-order partial derivatives of a scalar
  tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details).
  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'hessians'.
    colocate_gradients_with_ops: See `gradients()` documentation for details.
    gate_gradients: See `gradients()` documentation for details.
    aggregation_method: See `gradients()` documentation for details.
  Returns:
    A list of Hessian matrices of `sum(ys)` for each `x` in `xs`.
  Raises:
    LookupError: if one of the operations between `xs` and `ys` does not
      have a registered gradient function.
  """
  xs = gradients_impl._AsList(xs)
  kwargs = {
    'colocate_gradients_with_ops': colocate_gradients_with_ops,
    'gate_gradients': gate_gradients,
    'aggregation_method': aggregation_method
  }
  # Compute first-order derivatives and iterate for each x in xs.
  hessians = []
  _gradients = tf.gradients(ys, xs, **kwargs) if gradients is None else gradients
  for i, _gradient, x in zip(range(len(xs)), _gradients, xs):
    shape = x.shape
    _gradient = tf.reshape(_gradient, [-1])
    
    n = tf.size(x)
    g = tf.gradients(_gradient, x)[0]
    hessian = tf.diag( 1.0 / (tf.reshape(g, [-1]) + 1e-8) )
    hessians.append(hessian)
  return hessians

In [14]:
sess = tf.InteractiveSession()

vs = tf.Variable(initial)
f = vs[0]**2 + 2.0*vs[0]*vs[1] + 3.0*vs[1]**2 + 4.0*vs[0] + 5.0*vs[1] + 6.0
sess.run(tf.global_variables_initializer())

for e in range(10):
    print('EPOCH', e, 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
    
    grads = tf.gradients(f, vs)[0]
    hessi_inv = invers_diagonal_hessians_highrank(f, vs)[0]
    
    delta = - tf.reshape( tf.matmul(hessi_inv, tf.reshape(grads, [-1, 1])), [2] )
    print('\t \tgradients=', ['%.3f'%g for g in grads.eval()],
          'hessi_inv=', ('%s'%tf.diag_part(hessi_inv).eval()).replace('\n', ','),
          'delta=', ['%.3f'%d for d in delta.eval()])

    op = vs.assign( vs + delta )
    op.eval()

print('FINAL  ', 'f=%.3f'%f.eval(), 'variables=', ['%.3f'%v for v in vs.eval()])
sess.close()

EPOCH 0 f=11.267 variables= ['0.926', '0.099']
	 	gradients= ['6.049', '7.446'] hessi_inv= [ 0.25   0.125] delta= ['-1.512', '-0.931']
EPOCH 1 f=2.890 variables= ['-0.587', '-0.832']
	 	gradients= ['1.163', '-1.163'] hessi_inv= [ 0.25   0.125] delta= ['-0.291', '0.145']
EPOCH 2 f=2.446 variables= ['-0.878', '-0.686']
	 	gradients= ['0.872', '-0.872'] hessi_inv= [ 0.25   0.125] delta= ['-0.218', '0.109']
EPOCH 3 f=2.196 variables= ['-1.096', '-0.577']
	 	gradients= ['0.654', '-0.654'] hessi_inv= [ 0.25   0.125] delta= ['-0.164', '0.082']
EPOCH 4 f=2.056 variables= ['-1.259', '-0.495']
	 	gradients= ['0.491', '-0.491'] hessi_inv= [ 0.25   0.125] delta= ['-0.123', '0.061']
EPOCH 5 f=1.977 variables= ['-1.382', '-0.434']
	 	gradients= ['0.368', '-0.368'] hessi_inv= [ 0.25   0.125] delta= ['-0.092', '0.046']
EPOCH 6 f=1.932 variables= ['-1.474', '-0.388']
	 	gradients= ['0.276', '-0.276'] hessi_inv= [ 0.25   0.125] delta= ['-0.069', '0.035']
EPOCH 7 f=1.907 variables= ['-1.543', '-0.354']
	