BatchNormalization: add evaluation mode, add doc for nn.Jacobian
Kaiyu Yang authored and soumith committed Apr 16, 2016
1 parent e07d84d commit 26a5a7e
Showing 5 changed files with 146 additions and 51 deletions.
7 changes: 5 additions & 2 deletions BatchNormalization.lua
@@ -125,7 +125,6 @@ end
local function backward(self, input, gradOutput, scale, gradInput, gradWeight, gradBias)
self:checkInputDim(input)
self:checkInputDim(gradOutput)
assert(self.train == true, 'should be in training mode when self.train is true')
assert(self.save_mean and self.save_std, 'must call :updateOutput() first')

input, gradOutput = makeContiguous(self, input, gradOutput)
@@ -142,9 +141,13 @@ local function backward(self, input, gradOutput, scale, gradInput, gradWeight, gradBias)
THNN.optionalTensor(gradWeight),
THNN.optionalTensor(gradBias),
THNN.optionalTensor(self.weight),
self.running_mean:cdata(),
self.running_var:cdata(),
self.save_mean:cdata(),
self.save_std:cdata(),
scale)
self.train,
scale,
self.eps)

return self.gradInput
end
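
With the assertion above removed, `backward` can now also be called after `:evaluate()`, using the running statistics instead of the per-batch ones. A minimal usage sketch (not part of this commit; module sizes are arbitrary):

```lua
require 'nn'

local bn = nn.BatchNormalization(16)
local input = torch.randn(8, 16)

-- a few training passes to populate running_mean / running_var
bn:training()
for i = 1, 10 do
   bn:forward(torch.randn(8, 16))
end

-- evaluation mode: forward and backward now use the running statistics
bn:evaluate()
local output = bn:forward(input)
local gradInput = bn:backward(input, torch.randn(8, 16))
```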
66 changes: 65 additions & 1 deletion doc/testing.md
@@ -1,5 +1,69 @@
## Testing ##
# Testing #
For those who want to implement their own modules, we suggest using
the `nn.Jacobian` class for testing the derivatives of their module,
together with the [torch.Tester](https://github.com/torch/torch7/blob/master/doc/tester.md) class. The sources
of the `nn` package contain many examples of such tests.
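
As a rough sketch of how the two fit together (the module, sizes and precision below are arbitrary, not prescribed by `nn`):

```lua
require 'nn'

local mytester = torch.Tester()
local jac = nn.Jacobian
local precision = 1e-5

local tests = {}

function tests.Linear()
   local module = nn.Linear(10, 5)
   local input = torch.Tensor(4, 10)   -- only the shape matters; testJacobian randomizes it
   local err = jac.testJacobian(module, input)
   mytester:assertlt(err, precision, 'analytical gradient does not match finite differences')
end

mytester:add(tests)
mytester:run()
```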


## nn.Jacobian ##


<a name="nn.Jacobian.testJacobian"></a>
### testJacobian(module, input, minval, maxval, perturbation) ###

Test the jacobian of a module w.r.t. its input.

`module` takes as its input a random tensor shaped the same as `input`.
`minval` and `maxval` specify the range of the random tensor ([-2, 2] by default).
`perturbation` is the size of the finite-difference step (1e-6 by default).

Returns the L-inf distance between the jacobian computed by backpropagation and the one computed by finite differences.
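
For example, assuming `module` and `input` are already defined, the optional arguments restrict the random inputs and shrink the finite-difference step (the values below are illustrative):

```lua
-- draw inputs from [0, 1] and use a 1e-7 finite-difference step
local err = nn.Jacobian.testJacobian(module, input, 0, 1, 1e-7)
assert(err < 1e-5, 'jacobian mismatch: ' .. err)
```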


<a name="nn.Jacobian.testJacobianParameters"></a>
### testJacobianParameters(module, input, param, dparam, minval, maxval, perturbation) ###

Test the jacobian of a module w.r.t. its parameters (instead of its input).

The input and parameters of `module` are random tensors shaped the same as `input` and `param`.
`minval` and `maxval` specify the range of the random tensors ([-2, 2] by default).
`dparam` points to the gradient w.r.t. parameters.
`perturbation` is the size of the finite-difference step (1e-6 by default).

Returns the L-inf distance between the jacobian computed by backpropagation and the one computed by finite differences.
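
A sketch of checking the weight and bias gradients of a hypothetical `nn.Linear` (sizes and threshold are arbitrary):

```lua
require 'nn'

local module = nn.Linear(10, 5)
local input = torch.Tensor(4, 10)

local errW = nn.Jacobian.testJacobianParameters(module, input, module.weight, module.gradWeight)
local errB = nn.Jacobian.testJacobianParameters(module, input, module.bias, module.gradBias)
assert(errW < 1e-5 and errB < 1e-5, 'parameter gradients do not match finite differences')
```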


<a name="nn.Jacobian.testJacobianUpdateParameters"></a>
### testJacobianUpdateParameters(module, input, param, minval, maxval, perturbation) ###

Test the update a module applies to its parameters.

The input and parameters of `module` are random tensors shaped the same as `input` and `param`.
`minval` and `maxval` specify the range of the random tensors ([-2, 2] by default).
`perturbation` is the size of the finite-difference step (1e-6 by default).

Returns the L-inf distance between the update computed by backpropagation and the one computed by finite differences.
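
A sketch, again with an arbitrary module and threshold:

```lua
require 'nn'

local module = nn.Linear(10, 5)
local input = torch.Tensor(4, 10)

-- compares the parameter update applied by the module against a finite-difference estimate
local err = nn.Jacobian.testJacobianUpdateParameters(module, input, module.weight)
assert(err < 1e-5, 'direct weight update does not match finite differences')
```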


<a name="nn.Jacobian.forward"></a>
### forward(module, input, param, perturbation) ###

Compute the jacobian by finite difference.

`module` has parameters `param` and input `input`.
If `param` is provided, it is treated as the independent variables; otherwise `input` is.
`perturbation` is the size of the finite-difference step (1e-6 by default).

Returns the jacobian computed by finite difference.


<a name="nn.Jacobian.backward"></a>
### backward(module, input, param, dparam) ###

Compute the jacobian by backpropagation.

`module` has parameters `param` and input `input`.
If `param` is provided, it is treated as the independent variables; otherwise `input` is.
`dparam` is the gradient w.r.t. the parameters; it must be present whenever `param` is.

Returns the jacobian computed by backpropagation.
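
`testJacobian` is essentially a convenience wrapper around these two; the comparison can also be done by hand (module and sizes below are arbitrary):

```lua
require 'nn'

local module = nn.Linear(10, 5)
local input = torch.Tensor(4, 10):uniform(-2, 2)

local jf = nn.Jacobian.forward(module, input)    -- finite differences
local jb = nn.Jacobian.backward(module, input)   -- backpropagation
print((jf - jb):abs():max())                     -- L-inf distance, should be tiny
```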
47 changes: 33 additions & 14 deletions lib/THNN/generic/BatchNormalization.c
@@ -66,22 +66,26 @@ void THNN_(BatchNormalization_updateOutput)(
void THNN_(BatchNormalization_backward)(
THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput,
THTensor *gradWeight, THTensor *gradBias, THTensor *weight,
THTensor *save_mean, THTensor *save_std, double scale)
THTensor *running_mean, THTensor *running_var,
THTensor *save_mean, THTensor *save_std,
bool train, double scale, double eps)
{
long nInput = THTensor_(size)(input, 1);
long n = THTensor_(nElement)(input) / nInput;

// Q(X) = X - E[x] ; i.e. input centered to zero mean
// Y = Q(X) / σ ; i.e. BN output before weight and bias
// dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y / n) / σ * w

#pragma omp parallel for
for (long f = 0; f < nInput; ++f) {
THTensor *in = THTensor_(newSelect)(input, 1, f);
THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f);
real mean = THTensor_(get1d)(save_mean, f);
real invstd = THTensor_(get1d)(save_std, f);
real w = weight ? THTensor_(get1d)(weight, f) : 1;
real mean, invstd;
if (train) {
mean = THTensor_(get1d)(save_mean, f);
invstd = THTensor_(get1d)(save_std, f);
} else {
mean = THTensor_(get1d)(running_mean, f);
invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
}

// sum over all gradOutput in feature plane
accreal sum = 0;
@@ -95,14 +99,29 @@ void THNN_(BatchNormalization_backward)(
if (gradInput) {
THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f);

// projection of gradOutput on to output scaled by std
real k = (real) dotp * invstd * invstd / n;
TH_TENSOR_APPLY2(real, gradIn, real, in,
*gradIn_data = (*in_data - mean) * k;);
if (train) {
// when in training mode
// Q(X) = X - E[x] ; i.e. input centered to zero mean
// Y = Q(X) / σ ; i.e. BN output before weight and bias
// dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y / n) / σ * w

// projection of gradOutput on to output scaled by std
real k = (real) dotp * invstd * invstd / n;
TH_TENSOR_APPLY2(real, gradIn, real, in,
*gradIn_data = (*in_data - mean) * k;);

accreal gradMean = sum / n;
TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
*gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;);
accreal gradMean = sum / n;
TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
*gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;);

} else {
// when in evaluation mode
// Q(X) = X - running_mean ; i.e. input centered to zero mean
// Y = Q(X) / running_std ; i.e. BN output before weight and bias
// dL/dX = dL/dY * w / running_std
TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
*gradIn_data = *gradOut_data * invstd * w;);
}

THTensor_(free)(gradIn);
}
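
In evaluation mode the backward pass is just an element-wise rescaling by `w / sqrt(running_var + eps)`, so the result can be checked directly against the closed form. A sketch in Lua (not part of this commit; sizes are arbitrary):

```lua
require 'nn'

local planes = 16
local bn = nn.BatchNormalization(planes)

-- populate the running statistics with a few training passes
bn:training()
for i = 1, 10 do bn:forward(torch.randn(8, planes)) end

bn:evaluate()
local input = torch.randn(8, planes)
local gradOutput = torch.randn(8, planes)
bn:forward(input)
local gradInput = bn:backward(input, gradOutput)

-- expected: dL/dX = dL/dY * w / sqrt(running_var + eps)
local scale = bn.weight:clone():cdiv(torch.sqrt(bn.running_var + bn.eps))
local expected = gradOutput:clone():cmul(scale:view(1, planes):expandAs(gradOutput))
print((expected - gradInput):abs():max())   -- should be close to 0
```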
6 changes: 5 additions & 1 deletion lib/THNN/generic/THNN.h
@@ -535,9 +535,13 @@ TH_API void THNN_(BatchNormalization_backward)(
THTensor *gradWeight,
THTensor *gradBias,
THTensor *weight,
THTensor *running_mean,
THTensor *running_var,
THTensor *save_mean,
THTensor *save_std,
double scale);
bool train,
double scale,
double eps);

TH_API void THNN_(SpatialConvolutionMap_updateOutput)(
THNNState *state, // library state
71 changes: 38 additions & 33 deletions test.lua
@@ -5541,49 +5541,54 @@ local function testBatchNormalization(moduleName, dim, k)
table.insert(size, torch.random(1,k))
end
local input = torch.zeros(table.unpack(size)):uniform()
local module = nn[moduleName](planes)

local err = jac.testJacobian(module,input)
mytester:assertlt(err,precision, 'error on state ')
local function jacTests(module, input, affine)
local err = jac.testJacobian(module,input)
mytester:assertlt(err,precision, 'error on state ')

local err = jac.testJacobianParameters(module, input,
module.weight, module.gradWeight)
mytester:assertlt(err,precision, 'error on weight ')
if affine then
local err = jac.testJacobianParameters(module, input,
module.weight, module.gradWeight)
mytester:assertlt(err,precision, 'error on weight ')

local err = jac.testJacobianParameters(module, input,
module.bias, module.gradBias)
mytester:assertlt(err,precision, 'error on weight ')
local err = jac.testJacobianParameters(module, input,
module.bias, module.gradBias)
mytester:assertlt(err,precision, 'error on weight ')

local err = jac.testJacobianUpdateParameters(module, input, module.weight)
mytester:assertlt(err,precision, 'error on weight [direct update] ')
local err = jac.testJacobianUpdateParameters(module, input, module.weight)
mytester:assertlt(err,precision, 'error on weight [direct update] ')

local err = jac.testJacobianUpdateParameters(module, input, module.bias)
mytester:assertlt(err,precision, 'error on bias [direct update] ')
local err = jac.testJacobianUpdateParameters(module, input, module.bias)
mytester:assertlt(err,precision, 'error on bias [direct update] ')

for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
mytester:assertlt(err, precision, string.format(
'error on weight [%s]', t))
end
for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
mytester:assertlt(err, precision, string.format(
'error on weight [%s]', t))
end

for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
mytester:assertlt(err, precision, string.format('error on bias [%s]', t))
for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
mytester:assertlt(err, precision, string.format('error on bias [%s]', t))
end
end

-- IO
local ferr,berr = jac.testIO(module,input)
mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
end

-- IO
local ferr,berr = jac.testIO(module,input)
mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')


local module = nn[moduleName](planes)
module:training()
jacTests(module, input, true)
module:evaluate()
jacTests(module, input, true)

-- batch norm without affine transform
module = nn[moduleName](planes, 1e-5, 0.1, false)

local err = jac.testJacobian(module, input)
mytester:assertlt(err,precision, 'error on state ')

-- IO
local ferr,berr = jac.testIO(module,input)
mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
module:training()
jacTests(module, input, false)
module:evaluate()
jacTests(module, input, false)
end

function nntest.BatchNormalization()
