Fix the handling of IndexedSlices with repeated sparse indices in optimizers

This changes the behavior of AdagradOptimizer and AdamOptimizer, among others, when dealing with sparse gradients (e.g. those generated by embedding lookups), correcting it to be equivalent to the dense behavior. Previously, sparse gradients with repeated indices produced sums of squares rather than a squared sum (AdagradOptimizer) or repeated momentum updates (AdamOptimizer), with related bugs in other optimizers that either perform non-linear computations on gradients or apply momentum updates.
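
To make the Adagrad difference concrete, here is a minimal NumPy sketch (the 0.1 values and the single repeated index are made up for illustration): the old behavior accumulates a sum of squares, while the corrected behavior accumulates the square of the sum, as a dense gradient would.

import numpy as np

# Hypothetical sparse gradient: the same row index receives two slices of 0.1.
values = np.array([0.1, 0.1])

# Previous behavior: each duplicate slice updates the accumulator separately,
# contributing a sum of squares.
sum_of_squares = np.sum(values ** 2)   # 0.01 + 0.01 = 0.02

# Dense-equivalent behavior: duplicates are summed first, contributing a
# squared sum, which is what a dense gradient of 0.2 would produce.
squared_sum = np.sum(values) ** 2      # (0.1 + 0.1) ** 2 = 0.04

print(sum_of_squares, squared_sum)     # 0.02 vs. 0.04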

Adds _apply_sparse_duplicate_indices, which lets optimizers handle duplicate indices themselves; the default implementation first uniquifies the IndexedSlices (summing the values of repeated indices) and then calls _apply_sparse.
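
As a standalone sketch of that uniquifying step (TF 1.x public API assumed; the commit itself uses the internal array_ops and math_ops modules), repeated indices are collapsed with tf.unique and their values summed with tf.unsorted_segment_sum:

import tensorflow as tf

# A gradient slice for index 0 appearing twice with value 1 each time ...
indices = tf.constant([0, 0])
values = tf.constant([1.0, 1.0])

# ... becomes a single slice for index 0 with value 2.
unique_indices, new_index_positions = tf.unique(indices)
summed_values = tf.unsorted_segment_sum(
    values, new_index_positions, tf.shape(unique_indices)[0])

with tf.Session() as sess:
  print(sess.run(unique_indices))  # [0]
  print(sess.run(summed_values))   # [2.]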

Future changes will add checks to the C++ optimizer implementations to prevent repeated-index bugs when they are called from outside the Python interface.
Change: 146010483
tensorflower-gardener committed Jan 30, 2017
1 parent ab2f960 commit f9f56f9
Showing 4 changed files with 110 additions and 3 deletions.
30 changes: 30 additions & 0 deletions tensorflow/python/training/adagrad_test.py
@@ -123,6 +123,36 @@ def testSparseBasic(self):
        self.assertAllCloseAccordingToType(
            np.array([[3.0], [3.715679168701172]]), var1.eval())

  def testSparseRepeatedIndices(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.test_session():
        repeated_index_update_var = variables.Variable(
            [[1.0], [2.0]], dtype=dtype)
        aggregated_update_var = variables.Variable(
            [[1.0], [2.0]], dtype=dtype)
        grad_repeated_index = ops.IndexedSlices(
            constant_op.constant(
                [0.1, 0.1], shape=[2, 1], dtype=dtype),
            constant_op.constant([1, 1]),
            constant_op.constant([2, 1]))
        grad_aggregated = ops.IndexedSlices(
            constant_op.constant(
                [0.2], shape=[1, 1], dtype=dtype),
            constant_op.constant([1]),
            constant_op.constant([2, 1]))
        repeated_update = adagrad.AdagradOptimizer(3.0).apply_gradients(
            [(grad_repeated_index, repeated_index_update_var)])
        aggregated_update = adagrad.AdagradOptimizer(3.0).apply_gradients(
            [(grad_aggregated, aggregated_update_var)])
        variables.global_variables_initializer().run()
        self.assertAllClose(aggregated_update_var.eval(),
                            repeated_index_update_var.eval())
        for _ in range(3):
          repeated_update.run()
          aggregated_update.run()
          self.assertAllClose(aggregated_update_var.eval(),
                              repeated_index_update_var.eval())

  def testSparseStability(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.test_session():
30 changes: 30 additions & 0 deletions tensorflow/python/training/adam_test.py
@@ -93,6 +93,36 @@ def testSparse(self):
          self.assertAllCloseAccordingToType(var0_np, var0.eval())
          self.assertAllCloseAccordingToType(var1_np, var1.eval())

  def testSparseRepeatedIndices(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.test_session():
        repeated_index_update_var = variables.Variable(
            [[1.0], [2.0]], dtype=dtype)
        aggregated_update_var = variables.Variable(
            [[1.0], [2.0]], dtype=dtype)
        grad_repeated_index = ops.IndexedSlices(
            constant_op.constant(
                [0.1, 0.1], shape=[2, 1], dtype=dtype),
            constant_op.constant([1, 1]),
            constant_op.constant([2, 1]))
        grad_aggregated = ops.IndexedSlices(
            constant_op.constant(
                [0.2], shape=[1, 1], dtype=dtype),
            constant_op.constant([1]),
            constant_op.constant([2, 1]))
        repeated_update = adam.AdamOptimizer().apply_gradients(
            [(grad_repeated_index, repeated_index_update_var)])
        aggregated_update = adam.AdamOptimizer().apply_gradients(
            [(grad_aggregated, aggregated_update_var)])
        variables.global_variables_initializer().run()
        self.assertAllClose(aggregated_update_var.eval(),
                            repeated_index_update_var.eval())
        for _ in range(3):
          repeated_update.run()
          aggregated_update.run()
          self.assertAllClose(aggregated_update_var.eval(),
                              repeated_index_update_var.eval())

  def doTestBasic(self, use_resource=False):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.test_session():
2 changes: 1 addition & 1 deletion tensorflow/python/training/gradient_descent.py
@@ -61,7 +61,7 @@ def _resource_apply_sparse(self, grad, handle, indices):
    return resource_variable_ops.resource_scatter_add(
        handle, indices, -grad * self._learning_rate)

-  def _apply_sparse(self, grad, var):
+  def _apply_sparse_duplicate_indices(self, grad, var):
    delta = ops.IndexedSlices(
        grad.values *
        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
51 changes: 49 additions & 2 deletions tensorflow/python/training/optimizer.py
@@ -24,8 +24,10 @@

from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gradients
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variables
from tensorflow.python.training import slot_creator
@@ -64,7 +66,8 @@ def update_op(self, optimizer, g):
    else:
      assert isinstance(g, ops.IndexedSlices), ("Gradient ", g, " is neither a "
                                                "tensor nor IndexedSlices.")
-      return optimizer._apply_sparse(g, self._v)  # pylint: disable=protected-access
+      # pylint: disable=protected-access
+      return optimizer._apply_sparse_duplicate_indices(g, self._v)


class _DenseReadResourceVariableProcessor(_OptimizableVariable):
@@ -529,11 +532,55 @@ def _apply_dense(self, grad, var):
  def _resource_apply_dense(self, grad, handle):
    raise NotImplementedError()

  def _apply_sparse_duplicate_indices(self, grad, var):
    """Add ops to apply sparse gradients to `var`, with repeated sparse indices.

    Optimizers which override this method must deal with IndexedSlices objects
    such as the following:

      IndexedSlicesValue(values=[1, 1], indices=[0, 0], dense_shape=[1])

    The correct interpretation is:

      IndexedSlicesValue(values=[2], indices=[0], dense_shape=[1])

    Many optimizers deal incorrectly with repeated indices when updating based
    on sparse gradients (e.g. summing squares rather than squaring the sum, or
    applying momentum terms multiple times). Adding first is always the correct
    behavior, so this is enforced here by reconstructing the IndexedSlices to
    have only unique indices, then calling _apply_sparse.

    Optimizers which deal correctly with repeated indices may instead override
    this method to avoid the overhead of summing indices.

    Args:
      grad: `IndexedSlices`.
      var: A `Variable` object.

    Returns:
      An `Operation`.
    """
    unique_indices, new_index_positions = array_ops.unique(grad.indices)
    summed_values = math_ops.unsorted_segment_sum(
        grad.values, new_index_positions, array_ops.shape(unique_indices)[0])
    gradient_no_duplicate_indices = ops.IndexedSlices(
        indices=unique_indices,
        values=summed_values,
        dense_shape=grad.dense_shape)
    return self._apply_sparse(gradient_no_duplicate_indices, var)

  def _apply_sparse(self, grad, var):
    """Add ops to apply sparse gradients to `var`.

    The IndexedSlices object passed to `grad` in this function is by default
    pre-processed in `_apply_sparse_duplicate_indices` to remove duplicate
    indices (see its docstring for details). Optimizers which can tolerate or
    have correct special cases for duplicate sparse indices may override
    `_apply_sparse_duplicate_indices` instead of this function, avoiding that
    overhead.

    Args:
-      grad: `IndexedSlices`.
+      grad: `IndexedSlices`, with no repeated indices.
      var: A `Variable` object.

    Return:
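
For context, a small sketch (not part of the commit; TF 1.x public API assumed) of how gradients with repeated sparse indices arise in practice: gathering the same embedding row twice in one step produces an IndexedSlices gradient with duplicate indices, which the optimizers changed above now aggregate before applying their updates.

import tensorflow as tf

embeddings = tf.Variable([[1.0], [2.0]])
ids = tf.constant([1, 1])                    # the same row looked up twice
loss = tf.reduce_sum(tf.gather(embeddings, ids))

# The gradient w.r.t. the embedding variable is an IndexedSlices whose
# indices repeat, one entry per lookup.
grad = tf.gradients(loss, [embeddings])[0]

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  grad_value = sess.run(grad)
  print(grad_value.indices)  # [1 1]
  print(grad_value.values)   # [[1.] [1.]]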
