Support fused batchnorm with any ndims and axis #40338

Closed
102 changes: 75 additions & 27 deletions tensorflow/python/keras/layers/normalization.py
@@ -198,10 +198,8 @@ def __init__(self,
if self._USE_V2_BEHAVIOR:
if fused:
self._raise_if_fused_cannot_be_used()
# We leave fused as None if self._fused_can_be_used()==True, since we
# still may set it to False in self.build() if the input rank is not 4.
elif fused is None and not self._fused_can_be_used():
fused = False
elif fused is None:
fused = self._fused_can_be_used()
elif fused is None:
fused = True
self.supports_masking = True
@@ -221,26 +219,20 @@ def __init__(self,

def _raise_if_fused_cannot_be_used(self):
"""Raises a ValueError if fused implementation cannot be used.

In addition to the checks done in this function, the input tensors rank must
be 4. The input rank check can only be done once the input shape is known.
"""
# Note the ValueErrors in this function are caught and not reraised in
# _fused_can_be_used(). No other exception besides ValueError should be
# raised here.

# Currently fused batch norm doesn't support renorm. It also only supports a
# channel dimension on axis 1 or 3, when no virtual batch size or adjustment
# is used.
# single axis, when no virtual batch size or adjustment is used.
if self.renorm:
raise ValueError('Passing both fused=True and renorm=True is '
'unsupported')
axis = [self.axis] if isinstance(self.axis, int) else self.axis
# Axis -3 is equivalent to 1, and axis -1 is equivalent to 3, because the
# input rank is required to be 4 (which is checked later).
if len(axis) > 1 or axis[0] not in (-3, -1, 1, 3):
raise ValueError('Passing fused=True is only supported when axis is 1 '
'or 3')
if len(axis) > 1:
raise ValueError('Passing fused=True is only supported when operating '
'over a single axis.')
if self.virtual_batch_size is not None:
raise ValueError('Passing fused=True is unsupported when '
'virtual_batch_size is specified.')
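
With this change, fused=True is accepted at construction time for any single normalization axis; only multi-axis normalization, renorm, virtual_batch_size, and adjustment still raise here. A quick illustration of the relaxed check (a sketch mirroring the updated tests further down, assuming the v2 layer):

from tensorflow.python.keras.layers import normalization_v2

normalization_v2.BatchNormalization(fused=True, axis=2)       # no longer raises at construction
normalization_v2.BatchNormalization(fused=True, axis=[1, 3])  # still raises ValueError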
@@ -281,6 +273,51 @@ def _support_zero_size_input(self):
distribution_strategy_context.get_strategy().extended,
'experimental_enable_get_next_as_optional', False)

def _get_shape_and_axis_for_fused(self, nd_shape, nd_axis):
'''Returns a 4D shape and axis (1 or 3) to which nd_shape and nd_axis can
Member: Format the docstring like the others: use triple quotes ("""), have the first line be a single sentence, then a blank line, then a more detailed description, and have an Args and Returns section.

Contributor Author: Done.

be changed without changing the result of the batch normalization operation.
'''
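
For reference, a docstring in the format the reviewer requests above might look like the following sketch (illustrative only; the wording finally committed may differ):

  """Returns a 4D shape and axis to which nd_shape and nd_axis can be mapped.

  The returned shape and axis describe a tensor that is equivalent to the
  original for the purposes of batch normalization: reshaping the input to the
  returned 4D shape and normalizing over the returned axis (1 or 3) gives the
  same result as normalizing the original input over nd_axis.

  Args:
    nd_shape: The input shape, as a list of dimensions.
    nd_axis: The single integer axis being normalized over.

  Returns:
    A tuple (shape, axis) with a 4D shape and an axis of 1 or 3.
  """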
assert(isinstance(nd_axis, int))
Member: In Python 2, a long is also acceptable. Use six.integer_types or just remove the assert.

Contributor Author: isinstance(axis, int) is already used in several places in this class, including in an error check at the beginning of the __init__ function.

ndims = len(nd_shape)
shape = nd_shape[:]
axis = ndims + nd_axis if nd_axis < 0 else nd_axis
# First check if the axis needs to be moved.
if axis not in (1, ndims - 1):
# Move axis to dim 1.
if axis == 0:
# Transform [C, ...] to [1, C, ...].
shape.insert(0, 1)
ndims += 1
else:
# Merge excess pre-axis dims into first dim.
# Transform [N, ..., C, ...] to [product(N, ...), C, ...].
for dim in range(axis - 1, 0, -1):
Member: For this block, I think the following is simpler:

product = 1
for elem in shape[:axis]:
    product *= elem
shape[:axis] = [product]
ndims -= (axis - 1)

Contributor Author: Done.

shape[0] *= shape[dim]
del shape[dim]
ndims -= 1
axis = 1
# Now change shape to 4D.
is_channels_last = axis == ndims - 1
if ndims < 4:
# Insert new dims after existing spatial dim or before channel dim.
new_dims = [1] * (4 - ndims)
if is_channels_last:
# Transform [..., C] to [..., 1..., C] (ndims=4).
shape = shape[:-1] + new_dims + shape[-1:]
else:
# Transform [N, C, ...] to [N, C, ..., 1...] (ndims=4).
shape += new_dims
elif ndims > 4:
# Merge excess spatial dims into the second spatial dim.
# Transform [N, C, H, W, ...] to [N, C, H, product(W, ...)].
# Or [N, H, W, ..., C] to [N, H, product(W, ...), C].
merge_dim = 2 if is_channels_last else 3
Member: Similar to above, maybe do:

product = 1
for elem in shape[merge_dim:merge_dim + ndims - 4]:
  product *= elem
shape[merge_dim:merge_dim + ndims - 4] = [product]

Contributor Author: Done.

for dim in range(merge_dim + (ndims - 4), merge_dim, -1):
shape[merge_dim] *= shape[dim]
del shape[dim]
axis = 3 if is_channels_last else 1
return shape, axis
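
To make the transformation concrete, here are a few illustrative (nd_shape, nd_axis) to (4D shape, fused axis) pairs, traced through the code above rather than taken from the PR:

# ([N, C],            axis=1)   ->  ([N, 1, 1, C],    axis=3)   NHWC
# ([C, H, W],         axis=0)   ->  ([1, C, H, W],    axis=1)   NCHW
# ([N, X, C, H, W],   axis=2)   ->  ([N*X, C, H, W],  axis=1)   NCHW
# ([N, D, H, W, C],   axis=-1)  ->  ([N, D, H*W, C],  axis=3)   NHWC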

def build(self, input_shape):
input_shape = tensor_shape.TensorShape(input_shape)
if not input_shape.ndims:
@@ -315,18 +352,22 @@ def build(self, input_shape):
raise ValueError('When using virtual_batch_size, adjustment cannot '
'be specified')

fused_axis = self.axis
self._input_fused_shape = None
Member: This is unused.

Contributor Author: Fixed.
if self.fused in (None, True):
Member: self.fused can no longer be None, so just have this be if self.fused:. Then, this can be merged with the if self.fused: block below.

Contributor Author: Done.
# TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
# output back to its original shape accordingly.
if self._USE_V2_BEHAVIOR:
if self.fused is None:
self.fused = (ndims == 4)
elif self.fused and ndims != 4:
raise ValueError('Batch normalization layers with fused=True only '
'support 4D input tensors.')
else:
if len(self.axis) == 1 and (self.axis[0] not in (1, ndims - 1) or
ndims != 4):
# The fused implementation only supports NCHW or NHWC, so we will
# reshape the input/output tensor to/from an equivalent 4D shape.
fused_shape, fused_axis = self._get_shape_and_axis_for_fused(
input_shape.dims, self.axis[0])
fused_shape = tensor_shape.TensorShape(fused_shape)
Member: Why convert this to a TensorShape then immediately convert back to a list?

Contributor Author: Done. (Twas a remnant of a fight I had initially with TensorShape vs. list, None vs. -1, etc.)

self._input_fused_shape = [-1] + fused_shape.as_list()[1:]
Member: The issue with computing this in build() is that the input shape might change between calls to the layer. The channel dimension must stay the same since that affects the size of the weights, but other dimensions can differ. Instead, compute this in call.

Contributor Author: Done.

fused_axis = [fused_axis]

if not self._USE_V2_BEHAVIOR:
Member: You don't want to run the if len(self.axis) == 1 block above if self.fused ends up being False.

Contributor Author: Done.

assert self.fused is not None
self.fused = (ndims == 4 and self._fused_can_be_used())
self.fused = self._fused_can_be_used()
# TODO(chrisying): fused batch norm is currently not supported for
# multi-axis batch norm and by extension virtual batches. In some cases,
# it might be possible to use fused batch norm but would require reshaping
@@ -335,9 +376,9 @@ def build(self, input_shape):
# common use case (turning 5D w/ virtual batch to NCHW)

if self.fused:
if self.axis == [1]:
if fused_axis == [1]:
self._data_format = 'NCHW'
elif self.axis == [3]:
elif fused_axis == [3]:
self._data_format = 'NHWC'
else:
raise ValueError('Unsupported axis, fused batch norm only supports '
@@ -499,6 +540,10 @@ def _fused_batch_norm(self, inputs, training):
beta = self.beta if self.center else self._beta_const
gamma = self.gamma if self.scale else self._gamma_const

original_shape = [-1] + inputs.shape.as_list()[1:]
if self._input_fused_shape is not None:
inputs = array_ops.reshape(inputs, self._input_fused_shape)

# TODO(b/129279393): Support zero batch input in non DistributionStrategy
# code as well.
if self._support_zero_size_input():
@@ -575,8 +620,11 @@ def _fused_batch_norm_inference():

output, mean, variance = tf_utils.smart_cond(training, train_op,
_fused_batch_norm_inference)
variance = _maybe_add_or_remove_bessels_correction(variance, remove=True)

if self._input_fused_shape is not None:
output = array_ops.reshape(output, original_shape)

variance = _maybe_add_or_remove_bessels_correction(variance, remove=True)
training_value = tf_utils.constant_value(training)
if training_value or training_value is None:
if not use_fused_avg_updates:
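The call-time changes above follow a reshape round trip: reshape the ND input to an equivalent 4D tensor, run the fused kernel, then reshape the output back to its original shape. A minimal standalone sketch of that pattern using plain TF ops (a hypothetical helper, not the layer's actual code path):

import tensorflow as tf

def fused_batch_norm_nd(x, scale, offset, fused_shape, data_format='NHWC'):
  # Remember the original shape, with -1 standing in for the (possibly unknown) batch dim.
  original_shape = [-1] + x.shape.as_list()[1:]
  # Reshape to the equivalent 4D NCHW/NHWC shape computed for the fused kernel.
  x4 = tf.reshape(x, fused_shape)
  y4, batch_mean, batch_var = tf.compat.v1.nn.fused_batch_norm(
      x4, scale, offset, data_format=data_format, is_training=True)
  # Restore the original rank before handing the result back.
  return tf.reshape(y4, original_shape), batch_mean, batch_var
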
48 changes: 34 additions & 14 deletions tensorflow/python/keras/layers/normalization_test.py
@@ -138,6 +138,27 @@ def test_batchnorm_convnet_channel_last(self):
np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)

@keras_parameterized.run_all_keras_modes
def test_batchnorm_convnet_channel_last_3d_fused(self):
Member: I would also add a test with simple hand-written inputs. No need to test gradients. I'm worried this won't catch errors in _get_shape_and_axis_for_fused. Note gamma and beta default to ones and zeros, so you can effectively ignore them in tests that don't take gradients.

Contributor Author: Done.

model = keras.models.Sequential()
norm = keras.layers.BatchNormalization(
axis=-1, input_shape=(4, 4, 4, 3), momentum=0.8, fused=True)
model.add(norm)
model.compile(
loss='mse',
optimizer=gradient_descent.GradientDescentOptimizer(0.01),
run_eagerly=testing_utils.should_run_eagerly())

# centered on 5.0, variance 10.0
x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 4, 3))
model.fit(x, x, epochs=4, verbose=0)
out = model.predict(x)
out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3))
out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3))

np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
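
A hand-written-input test of the kind the reviewer suggests above might look like the following eager-mode sketch (illustrative values only, not the test actually added in the PR; it assumes the layer's default epsilon of 1e-3 and that the fused kernel normalizes with the biased batch variance):

  def test_batchnorm_simple_values_3d_fused(self):
    # A 3D input exercises the new reshape-to-4D path of the fused implementation.
    norm = normalization_v2.BatchNormalization(axis=-1, fused=True)
    x = np.array([[[1.0], [3.0]],
                  [[5.0], [7.0]]], dtype=np.float32)  # shape (2, 2, 1)
    out = norm(x, training=True)
    # With the default gamma=1 and beta=0, the output is (x - mean) / sqrt(var + eps),
    # where mean = 4 and the biased batch variance is 5 over the non-channel axes.
    expected = (x - 4.0) / np.sqrt(5.0 + 1e-3)
    self.assertAllClose(self.evaluate(out), expected, atol=1e-3)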

@keras_parameterized.run_all_keras_modes
def test_batchnorm_correctness(self):
_run_batchnorm_correctness_test(
@@ -213,7 +234,7 @@ def call(self, x, training):
model = MyModel()

for _ in range(10):
x = constant_op.constant(0.5, shape=[1, 1])
x = constant_op.constant(0.5, shape=[2, 1])
Member: Why this change?

Contributor Author: The reason is... complicated :)

First, Bessel's correction (N/(N-1)) is used for the fused implementation but not for the non-fused. There is a discussion of this here.

Then, for the fused case, the CPU implementation avoids returning NaN when N < 2, while the GPU implementation (CUDNN) explicitly returns NaN in these cases.

So when the test calls batchnorm with N = 1 and it (with this PR) calls the fused implementation, it returns NaN (on the GPU) and the test fails. Changing the test size to N = 2 avoids this problem.

This probably isn't ever observed in the real world because batchnorm doesn't work when N is small anyway.

Let me know if you'd like me to take another action here.

Member: If we know that N = 1, we should probably not allow calling the fused version on GPU. I'd prefer an error message over silently producing NaNs.

Contributor Author: Unfortunately this can't easily be done in the Python layer because it only applies in training mode. Adding a check to build() causes a bunch of tests to fail (ones that don't use training mode). It would have to be an in-graph check, which I don't think there's much precedent for(?).

Member (rmlarsen, Sep 1, 2020): You can add a tf.assert in-graph check here. This is not unprecedented and no different from adding a check in the kernel itself.

Contributor Author: I've added an in-graph check using Assert.

Contributor Author: It caused a bunch of other tests to fail. I'll have to look into them.
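
For reference, a small numeric illustration of the N/(N-1) correction discussed above (my own example): the fused kernel normalizes with the biased variance but reports the Bessel-corrected one, and the correction is undefined for a single-element batch.

import numpy as np

x = np.array([1.0, 3.0])                    # batch of N = 2
biased = x.var()                            # 1.0, the variance used to normalize
corrected = biased * len(x) / (len(x) - 1)  # 2.0, the variance the fused kernel reports
# With N = 1 the factor N / (N - 1) divides by zero, which is why cuDNN returns NaN.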

model(x, training=True)

# Make sure the moving mean and variance have been updated
@@ -255,20 +276,28 @@ def test_basic_batchnorm_v2(self):
normalization_v2.BatchNormalization,
kwargs={'fused': None},
input_shape=(3, 3, 3))
testing_utils.layer_test(
normalization_v2.BatchNormalization,
kwargs={'fused': True},
input_shape=(3, 3, 3, 3, 3))
testing_utils.layer_test(
normalization_v2.BatchNormalization,
kwargs={'fused': True},
input_shape=(3, 3))

@combinations.generate(combinations.combine(mode=['graph', 'eager']))
def test_v2_fused_attribute(self):
norm = normalization_v2.BatchNormalization()
self.assertEqual(norm.fused, None)
self.assertEqual(norm.fused, True)
inp = keras.layers.Input(shape=(4, 4, 4))
norm(inp)
self.assertEqual(norm.fused, True)

norm = normalization_v2.BatchNormalization()
self.assertEqual(norm.fused, None)
self.assertEqual(norm.fused, True)
inp = keras.layers.Input(shape=(4, 4))
norm(inp)
self.assertEqual(norm.fused, False)
self.assertEqual(norm.fused, True)

norm = normalization_v2.BatchNormalization(virtual_batch_size=2)
self.assertEqual(norm.fused, False)
@@ -291,10 +320,7 @@ def test_v2_fused_attribute(self):
with self.assertRaisesRegexp(ValueError, 'fused.*renorm'):
normalization_v2.BatchNormalization(fused=True, renorm=True)

with self.assertRaisesRegexp(ValueError, 'fused.*when axis is 1 or 3'):
normalization_v2.BatchNormalization(fused=True, axis=2)

with self.assertRaisesRegexp(ValueError, 'fused.*when axis is 1 or 3'):
with self.assertRaisesRegexp(ValueError, 'fused.*over a single axis'):
normalization_v2.BatchNormalization(fused=True, axis=[1, 3])

with self.assertRaisesRegexp(ValueError, 'fused.*virtual_batch_size'):
@@ -304,12 +330,6 @@
normalization_v2.BatchNormalization(fused=True,
adjustment=lambda _: (1, 0))

norm = normalization_v2.BatchNormalization(fused=True)
self.assertEqual(norm.fused, True)
inp = keras.layers.Input(shape=(4, 4))
with self.assertRaisesRegexp(ValueError, '4D input tensors'):
norm(inp)

def test_updates_in_wrap_function(self):
with context.eager_mode():
layer = keras.layers.BatchNormalization()