Fix the handling of IndexedSlices with repeated sparse indices in optimizers

This changes the behavior of AdagradOptimizer and AdamOptimizer, among others, when dealing with sparse gradients (e.g. those generated by embedding lookups), correcting it to be equivalent to the dense behavior. Previously, sparse gradients with repeated indices produced sums of squares rather than a squared sum (AdagradOptimizer) or repeated momentum updates (AdamOptimizer), with related bugs in other optimizers that either perform non-linear computations on gradients or apply momentum updates.
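
To make the Adagrad difference concrete, here is a minimal NumPy sketch (the 0.1 values and the single repeated index are made up for illustration): the old behavior accumulates a sum of squares, while the corrected behavior accumulates the square of the sum, as a dense gradient would.

import numpy as np

# Hypothetical sparse gradient: the same row index receives two slices of 0.1.
values = np.array([0.1, 0.1])

# Previous behavior: each duplicate slice updates the accumulator separately,
# contributing a sum of squares.
sum_of_squares = np.sum(values ** 2)   # 0.01 + 0.01 = 0.02

# Dense-equivalent behavior: duplicates are summed first, contributing a
# squared sum, which is what a dense gradient of 0.2 would produce.
squared_sum = np.sum(values) ** 2      # (0.1 + 0.1) ** 2 = 0.04

print(sum_of_squares, squared_sum)     # 0.02 vs. 0.04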

Adds _apply_sparse_duplicate_indices, which lets optimizers handle duplicate indices themselves; the default implementation first uniquifies the IndexedSlices (summing the values of repeated indices) and then calls _apply_sparse.
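
As a standalone sketch of that uniquifying step (TF 1.x public API assumed; the commit itself uses the internal array_ops and math_ops modules), repeated indices are collapsed with tf.unique and their values summed with tf.unsorted_segment_sum:

import tensorflow as tf

# A gradient slice for index 0 appearing twice with value 1 each time ...
indices = tf.constant([0, 0])
values = tf.constant([1.0, 1.0])

# ... becomes a single slice for index 0 with value 2.
unique_indices, new_index_positions = tf.unique(indices)
summed_values = tf.unsorted_segment_sum(
    values, new_index_positions, tf.shape(unique_indices)[0])

with tf.Session() as sess:
  print(sess.run(unique_indices))  # [0]
  print(sess.run(summed_values))   # [2.]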

Future changes will add checks to the C++ optimizer implementations to prevent repeated-index bugs when they are called from outside the Python interface.
Change: 146010483
tensorflower-gardener committed Jan 30, 2017
1 parent ab2f960 commit f9f56f9
Showing 4 changed files with 110 additions and 3 deletions.
30 changes: 30 additions & 0 deletions tensorflow/python/training/adagrad_test.py
@@ -123,6 +123,36 @@ def testSparseBasic(self):
        self.assertAllCloseAccordingToType(
            np.array([[3.0], [3.715679168701172]]), var1.eval())

  def testSparseRepeatedIndices(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.test_session():
        repeated_index_update_var = variables.Variable(
            [[1.0], [2.0]], dtype=dtype)
        aggregated_update_var = variables.Variable(
            [[1.0], [2.0]], dtype=dtype)
        grad_repeated_index = ops.IndexedSlices(
            constant_op.constant(
                [0.1, 0.1], shape=[2, 1], dtype=dtype),
            constant_op.constant([1, 1]),
            constant_op.constant([2, 1]))
        grad_aggregated = ops.IndexedSlices(
            constant_op.constant(
                [0.2], shape=[1, 1], dtype=dtype),
            constant_op.constant([1]),
            constant_op.constant([2, 1]))
        repeated_update = adagrad.AdagradOptimizer(3.0).apply_gradients(
            [(grad_repeated_index, repeated_index_update_var)])
        aggregated_update = adagrad.AdagradOptimizer(3.0).apply_gradients(
            [(grad_aggregated, aggregated_update_var)])
        variables.global_variables_initializer().run()
        self.assertAllClose(aggregated_update_var.eval(),
                            repeated_index_update_var.eval())
        for _ in range(3):
          repeated_update.run()
          aggregated_update.run()
          self.assertAllClose(aggregated_update_var.eval(),
                              repeated_index_update_var.eval())

  def testSparseStability(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.test_session():
30 changes: 30 additions & 0 deletions tensorflow/python/training/adam_test.py
@@ -93,6 +93,36 @@ def testSparse(self):
          self.assertAllCloseAccordingToType(var0_np, var0.eval())
          self.assertAllCloseAccordingToType(var1_np, var1.eval())

  def testSparseRepeatedIndices(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.test_session():
        repeated_index_update_var = variables.Variable(
            [[1.0], [2.0]], dtype=dtype)
        aggregated_update_var = variables.Variable(
            [[1.0], [2.0]], dtype=dtype)
        grad_repeated_index = ops.IndexedSlices(
            constant_op.constant(
                [0.1, 0.1], shape=[2, 1], dtype=dtype),
            constant_op.constant([1, 1]),
            constant_op.constant([2, 1]))
        grad_aggregated = ops.IndexedSlices(
            constant_op.constant(
                [0.2], shape=[1, 1], dtype=dtype),
            constant_op.constant([1]),
            constant_op.constant([2, 1]))
        repeated_update = adam.AdamOptimizer().apply_gradients(
            [(grad_repeated_index, repeated_index_update_var)])
        aggregated_update = adam.AdamOptimizer().apply_gradients(
            [(grad_aggregated, aggregated_update_var)])
        variables.global_variables_initializer().run()
        self.assertAllClose(aggregated_update_var.eval(),
                            repeated_index_update_var.eval())
        for _ in range(3):
          repeated_update.run()
          aggregated_update.run()
          self.assertAllClose(aggregated_update_var.eval(),
                              repeated_index_update_var.eval())

  def doTestBasic(self, use_resource=False):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.test_session():
2 changes: 1 addition & 1 deletion tensorflow/python/training/gradient_descent.py
@@ -61,7 +61,7 @@ def _resource_apply_sparse(self, grad, handle, indices):
    return resource_variable_ops.resource_scatter_add(
        handle, indices, -grad * self._learning_rate)

-  def _apply_sparse(self, grad, var):
+  def _apply_sparse_duplicate_indices(self, grad, var):
    delta = ops.IndexedSlices(
        grad.values *
        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
51 changes: 49 additions & 2 deletions tensorflow/python/training/optimizer.py
@@ -24,8 +24,10 @@

from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gradients
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variables
from tensorflow.python.training import slot_creator
@@ -64,7 +66,8 @@ def update_op(self, optimizer, g):
    else:
      assert isinstance(g, ops.IndexedSlices), ("Gradient ", g, " is neither a "
                                                "tensor nor IndexedSlices.")
-      return optimizer._apply_sparse(g, self._v)  # pylint: disable=protected-access
+      # pylint: disable=protected-access
+      return optimizer._apply_sparse_duplicate_indices(g, self._v)


class _DenseReadResourceVariableProcessor(_OptimizableVariable):
@@ -529,11 +532,55 @@ def _apply_dense(self, grad, var):
  def _resource_apply_dense(self, grad, handle):
    raise NotImplementedError()

  def _apply_sparse_duplicate_indices(self, grad, var):
    """Add ops to apply sparse gradients to `var`, with repeated sparse indices.

    Optimizers which override this method must deal with IndexedSlices objects
    such as the following:

      IndexedSlicesValue(values=[1, 1], indices=[0, 0], dense_shape=[1])

    The correct interpretation is:

      IndexedSlicesValue(values=[2], indices=[0], dense_shape=[1])

    Many optimizers deal incorrectly with repeated indices when updating based
    on sparse gradients (e.g. summing squares rather than squaring the sum, or
    applying momentum terms multiple times). Adding first is always the correct
    behavior, so this is enforced here by reconstructing the IndexedSlices to
    have only unique indices, then calling _apply_sparse.

    Optimizers which deal correctly with repeated indices may instead override
    this method to avoid the overhead of summing indices.

    Args:
      grad: `IndexedSlices`.
      var: A `Variable` object.

    Returns:
      An `Operation`.
    """
    unique_indices, new_index_positions = array_ops.unique(grad.indices)
    summed_values = math_ops.unsorted_segment_sum(
        grad.values, new_index_positions, array_ops.shape(unique_indices)[0])
    gradient_no_duplicate_indices = ops.IndexedSlices(
        indices=unique_indices,
        values=summed_values,
        dense_shape=grad.dense_shape)
    return self._apply_sparse(gradient_no_duplicate_indices, var)

  def _apply_sparse(self, grad, var):
    """Add ops to apply sparse gradients to `var`.

    The IndexedSlices object passed to `grad` in this function is by default
    pre-processed in `_apply_sparse_duplicate_indices` to remove duplicate
    indices (see its docstring for details). Optimizers which can tolerate or
    have correct special cases for duplicate sparse indices may override
    `_apply_sparse_duplicate_indices` instead of this function, avoiding that
    overhead.

    Args:
-      grad: `IndexedSlices`.
+      grad: `IndexedSlices`, with no repeated indices.
      var: A `Variable` object.

    Return:
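
For context, a small sketch (not part of the commit; TF 1.x public API assumed) of how gradients with repeated sparse indices arise in practice: gathering the same embedding row twice in one step produces an IndexedSlices gradient with duplicate indices, which the optimizers changed above now aggregate before applying their updates.

import tensorflow as tf

embeddings = tf.Variable([[1.0], [2.0]])
ids = tf.constant([1, 1])                    # the same row looked up twice
loss = tf.reduce_sum(tf.gather(embeddings, ids))

# The gradient w.r.t. the embedding variable is an IndexedSlices whose
# indices repeat, one entry per lookup.
grad = tf.gradients(loss, [embeddings])[0]

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  grad_value = sess.run(grad)
  print(grad_value.indices)  # [1 1]
  print(grad_value.values)   # [[1.] [1.]]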
