vimalmanohar · GaofengCheng · Dec 16, 2016 · Dec 16, 2016 · Dec 16, 2016 · Dec 17, 2016
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i_dp.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i_dp.sh
@@ -29,6 +29,7 @@ ihm_gmm=tri3  # the gmm for the IHM system (if --use-ihm-ali true).
 num_threads_ubm=32
 nnet3_affix=_cleaned  # cleanup affix for nnet3 and chain dirs, e.g. _cleaned
 dropout_schedule='0,0@0.20,0.5@0.50,0@0.50,0'
+dropout_per_frame=false
 chunk_width=150
 chunk_left_context=40
 chunk_right_context=0
@@ -193,15 +194,15 @@ if [ $stage -le 15 ]; then
   relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024
 
   # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults
-  lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0
+  lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=false
   relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024
   relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024
   relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024
-  lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0
+  lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=false
   relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024
   relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024
   relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024
-  lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0
+  lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=false
 
   ## adding the layers for chain branch
   output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5
@@ -243,6 +244,7 @@ if [ $stage -le 16 ]; then
     --egs.chunk-left-context $chunk_left_context \
     --egs.chunk-right-context $chunk_right_context \
     --trainer.dropout-schedule $dropout_schedule \
+    --trainer.dropout-per-frame $dropout_per_frame \
     --trainer.num-chunk-per-minibatch 64 \
     --trainer.frames-per-iter 1500000 \
     --trainer.num-epochs 4 \

diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py
@@ -225,6 +225,7 @@ def train_one_iteration(dir, iter, srand, egs_dir,
                         frame_subsampling_factor, truncate_deriv_weights,
                         run_opts,
                         dropout_proportions=None,
+                        dropout_per_frame=None,
                         background_process_handler=None):
     """ Called from steps/nnet3/chain/train.py for one iteration for
     neural network training with LF-MMI objective
@@ -307,7 +308,7 @@ def train_one_iteration(dir, iter, srand, egs_dir,
     dropout_info_str = ''
     if dropout_proportions is not None:
         raw_model_string, dropout_info = common_train_lib.apply_dropout(
-            dropout_proportions, raw_model_string)
+            dropout_proportions, dropout_per_frame, raw_model_string)
         dropout_info_str = ', {0}'.format(", ".join(dropout_info))
 
     shrink_info_str = ''

diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py
@@ -511,7 +511,7 @@ def _get_component_dropout(dropout_schedule, num_archives_processed):
             + initial_dropout)
 
 
-def apply_dropout(dropout_proportions, raw_model_string):
+def apply_dropout(dropout_proportions, dropout_per_frame, raw_model_string):
     """Adds an nnet3-copy --edits line to modify raw_model_string to
     set dropout proportions according to dropout_proportions.
 
@@ -523,10 +523,10 @@ def apply_dropout(dropout_proportions, raw_model_string):
 
     for component_name, dropout_proportion in dropout_proportions:
         edit_config_lines.append(
-            "set-dropout-proportion name={0} proportion={1}".format(
-                component_name, dropout_proportion))
-        dropout_info.append("pattern/dropout-proportion={0}/{1}".format(
-            component_name, dropout_proportion))
+            "set-dropout-proportion name={0} proportion={1} dropout-per-frame={2}".format(
+                component_name, dropout_proportion, dropout_per_frame or 'false'))
+        dropout_info.append("pattern/dropout-proportion={0}/{1} dropout-per-frame={2}".format(
+            component_name, dropout_proportion, dropout_per_frame or 'false'))
 
     return ("""{raw_model_string} nnet3-copy --edits='{edits}' \
             - - |""".format(raw_model_string=raw_model_string,
@@ -771,6 +771,11 @@ def __init__(self):
                                  lstm*=0,0.2,0'.  More general should precede
                                  less general patterns, as they are applied
                                  sequentially.""")
+        self.parser.add_argument("--trainer.dropout-per-frame", type=str,
+                                 action=common_lib.NullstrToNoneAction,
+                                 dest='dropout_per_frame', default=None,
+                                 help="""this option is used to control whether
+                                 using dropout by frame level or by vector level""")
 
         # General options
         self.parser.add_argument("--stage", type=int, default=-4,

diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py
@@ -251,6 +251,7 @@ def set_default_configs(self):
                         'zeroing-interval' : 20,
                         'zeroing-threshold' : 15.0,
                         'dropout-proportion' : -1.0 # -1.0 stands for no dropout will be added
+                        'dropout-per-frame' : 'false'
                        }
 
     def set_derived_configs(self):
@@ -285,6 +286,10 @@ def check_configs(self):
              self.config['dropout-proportion'] < 0.0) and
              self.config['dropout-proportion'] != -1.0 ):
              raise xparser_error("dropout-proportion has invalid value {0}.".format(self.config['dropout-proportion']))
+
+        # only the two literal strings 'true' and 'false' are accepted
+        if self.config['dropout-per-frame'] not in ('false', 'true'):
+            raise xparser_error("dropout-per-frame has invalid value {0}.".format(self.config['dropout-per-frame']))
 
     def auxiliary_outputs(self):
         return ['c_t']
@@ -347,7 +352,8 @@ def generate_lstm_config(self):
         pes_str = self.config['ng-per-element-scale-options']
         lstm_dropout_value = self.config['dropout-proportion']
         lstm_dropout_str = 'dropout-proportion='+str(self.config['dropout-proportion'])
-
+        lstm_dropout_per_frame_value = self.config['dropout-per-frame']
+        lstm_dropout_per_frame_str = 'dropout-per-frame='+str(self.config['dropout-per-frame'])
         # Natural gradient per element scale parameters
         # TODO: decide if we want to keep exposing these options
         if re.search('param-mean', pes_str) is None and \

diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py
@@ -202,7 +202,10 @@ def process_args(args):
             "value={0}. We recommend using the option "
             "--trainer.deriv-truncate-margin.".format(
                 args.deriv_truncate_margin))
-
+    if (args.dropout_schedule is None
+            and args.dropout_per_frame is not None):
+        raise Exception("The dropout schedule is null, but dropout_per_frame "
+                        "option is not null")
     if (not os.path.exists(args.dir)
             or not os.path.exists(args.dir+"/configs")):
         raise Exception("This scripts expects {0} to exist and have a configs "
@@ -441,6 +444,9 @@ def learning_rate(iter, current_num_jobs, num_archives_processed):
                     None if args.dropout_schedule is None
                     else common_train_lib.get_dropout_proportions(
                         dropout_schedule, num_archives_processed)),
+                dropout_per_frame=(
+                    None if args.dropout_schedule is None
+                    else args.dropout_per_frame),
                 shrinkage_value=shrinkage_value,
                 num_chunk_per_minibatch=args.num_chunk_per_minibatch,
                 num_hidden_layers=num_hidden_layers,

diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
@@ -64,6 +64,7 @@ void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d);
 void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power,
                          bool include_sign, MatrixDim d);
 void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
+void cudaF_apply_heaviside_by_row(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
 void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val,
                        MatrixDim d);
 void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src,
@@ -330,6 +331,7 @@ void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d);
 void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power,
                          bool include_sign, MatrixDim d);
 void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
+void cudaD_apply_heaviside_by_row(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
 void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val,
                        MatrixDim d);
 void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src,

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
@@ -1628,6 +1628,23 @@ static void _apply_heaviside(Real* mat, MatrixDim d) {
     mat[index] = (mat[index] > 0.0 ? 1.0 : 0.0);
 }
 
+template<typename Real>
+__global__
+static void _apply_heaviside_by_row(Real* mat, MatrixDim d) {
+  // Row-wise step function: every element of row j is set to 1 if the first
+  // element of that row is > 0 and to 0 otherwise (whole row shares a value).
+  int i = blockIdx.x * blockDim.x + threadIdx.x;  // col index
+  int j = blockIdx.y * blockDim.y + threadIdx.y;  // row index
+  if (i < d.cols && j < d.rows) {
+    // The i == 0 thread overwrites the element the rest of the row reads;
+    // that is harmless because thresholding an already thresholded 0/1
+    // value yields the same 0/1 result.
+    Real row_head = mat[j * d.stride];
+    mat[i + j * d.stride] = (row_head > 0.0 ? 1.0 : 0.0);
+  }
+}
+
+
 template<typename Real>
 __global__
 static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) {
@@ -3233,6 +3250,10 @@ void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
   _apply_heaviside<<<Gr,Bl>>>(mat, d);
 }
 
+void cudaF_apply_heaviside_by_row(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
+  _apply_heaviside_by_row<<<Gr,Bl>>>(mat, d);
+}
+
 void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src,
                      const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
                      int src_stride) {
@@ -3880,6 +3901,10 @@ void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
   _apply_heaviside<<<Gr,Bl>>>(mat, d);
 }
 
+void cudaD_apply_heaviside_by_row(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
+  _apply_heaviside_by_row<<<Gr,Bl>>>(mat, d);
+}
+
 void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src,
                      const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
                      int src_stride) {

diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
@@ -201,6 +201,9 @@ inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power,
 inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) {
   cudaF_apply_heaviside(Gr, Bl, mat, dim);
 }
+inline void cuda_apply_heaviside_by_row(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) {
+  cudaF_apply_heaviside_by_row(Gr, Bl, mat, dim);
+}
 inline void cuda_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val,
                              MatrixDim dim) {
   cudaF_apply_floor(Gr, Bl, mat, floor_val, dim);
@@ -739,6 +742,9 @@ inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power,
 inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) {
   cudaD_apply_heaviside(Gr, Bl, mat, dim);
 }
+inline void cuda_apply_heaviside_by_row(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) {
+  cudaD_apply_heaviside_by_row(Gr, Bl, mat, dim);
+}
 inline void cuda_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val,
                              MatrixDim dim) {
   cudaD_apply_floor(Gr, Bl, mat, floor_val, dim);

diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
@@ -2207,6 +2207,23 @@ void CuMatrixBase<Real>::ApplyHeaviside() {
   }
 }
 
+template<typename Real>
+void CuMatrixBase<Real>::ApplyHeavisideByRow() {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    dim3 dimGrid, dimBlock;
+    GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
+                                          &dimGrid, &dimBlock);
+    cuda_apply_heaviside_by_row(dimGrid, dimBlock, data_, Dim());
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
+  } else
+#endif
+  {
+    KALDI_ERR << "no ApplyHeavisideByRow implemented without CUDA";
+  }
+}
 template<typename Real>
 void CuMatrixBase<Real>::Heaviside(const CuMatrixBase<Real> &src) {
   KALDI_ASSERT(SameDim(*this, src));

diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
@@ -369,6 +369,7 @@ class CuMatrixBase {
   /// For each element, sets x = (x > 0 ? 1.0 : 0.0).
   /// See also Heaviside().
   void ApplyHeaviside();
+  void ApplyHeavisideByRow();
   void ApplyFloor(Real floor_val);
   void ApplyCeiling(Real ceiling_val);
   void ApplyExp();

diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc
@@ -38,7 +38,7 @@ NnetChainCombiner::NnetChainCombiner(const NnetCombineConfig &combine_config,
     nnet_params_(std::min(num_nnets, combine_config_.max_effective_inputs),
                  NumParameters(first_nnet)),
     tot_input_weighting_(nnet_params_.NumRows()) {
-  SetDropoutProportion(0, &nnet_);
+  SetDropoutProportion(0, false, &nnet_);
   SubVector<BaseFloat> first_params(nnet_params_, 0);
   VectorizeNnet(nnet_, &first_params);
   tot_input_weighting_(0) += 1.0;

diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc
@@ -34,7 +34,7 @@ NnetCombiner::NnetCombiner(const NnetCombineConfig &config,
     nnet_params_(std::min(num_nnets, config_.max_effective_inputs),
                  NumParameters(first_nnet)),
     tot_input_weighting_(nnet_params_.NumRows()) {
-  SetDropoutProportion(0, &nnet_);
+  SetDropoutProportion(0, false, &nnet_);
   SubVector<BaseFloat> first_params(nnet_params_, 0);
   VectorizeNnet(nnet_, &first_params);
   tot_input_weighting_(0) += 1.0;

diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
@@ -87,27 +87,37 @@ void PnormComponent::Write(std::ostream &os, bool binary) const {
 }
 
 
-void DropoutComponent::Init(int32 dim, BaseFloat dropout_proportion) {
+void DropoutComponent::Init(int32 dim, BaseFloat dropout_proportion, bool dropout_per_frame) {
   dropout_proportion_ = dropout_proportion;
+  dropout_per_frame_ = dropout_per_frame;
   dim_ = dim;
 }
 
 void DropoutComponent::InitFromConfig(ConfigLine *cfl) {
   int32 dim = 0;
   BaseFloat dropout_proportion = 0.0;
+  bool dropout_per_frame = false;
   bool ok = cfl->GetValue("dim", &dim) &&
     cfl->GetValue("dropout-proportion", &dropout_proportion);
+  bool ok2 = cfl->GetValue("dropout-per-frame", &dropout_per_frame);
   if (!ok || cfl->HasUnusedValues() || dim <= 0 ||
       dropout_proportion < 0.0 || dropout_proportion > 1.0)
     KALDI_ERR << "Invalid initializer for layer of type "
               << Type() << ": \"" << cfl->WholeLine() << "\"";
-  Init(dim, dropout_proportion);
+  if( ! ok2 )
+  {
+      dropout_per_frame = false;
+      Init(dim, dropout_proportion, dropout_per_frame);
+  } else {
+      Init(dim, dropout_proportion, dropout_per_frame);
+  }
 }
 
 std::string DropoutComponent::Info() const {
   std::ostringstream stream;
   stream << Type() << ", dim=" << dim_
-         << ", dropout-proportion=" << dropout_proportion_;
+         << ", dropout-proportion=" << dropout_proportion_
+         << ", dropout-per-frame=" << (dropout_per_frame_ ? "true" : "false");
   return stream.str();
 }
 
@@ -119,16 +129,27 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
 
   BaseFloat dropout = dropout_proportion_;
   KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0);
+  // Per-frame mode thresholds by row; otherwise each element is independent.
+  if (dropout_per_frame_) {
+    // This const_cast is only safe assuming you don't attempt
+    // to use multi-threaded code with the GPU.
+    const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
 
-  // This const_cast is only safe assuming you don't attempt
-  // to use multi-threaded code with the GPU.
-  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
+    out->Add(-dropout); // now, a proportion "dropout" will be <0.0
+    out->ApplyHeavisideByRow(); // apply the function (x>0?1:0) by row, so the
+                                // 0/1 mask is intended to be shared per row.
 
-  out->Add(-dropout); // now, a proportion "dropout" will be <0.0
-  out->ApplyHeaviside(); // apply the function (x>0?1:0).  Now, a proportion "dropout" will
-                         // be zero and (1 - dropout) will be 1.0.
+    out->MulElements(in);
+  } else {
 
-  out->MulElements(in);
+    // This const_cast is only safe assuming you don't attempt
+    // to use multi-threaded code with the GPU.
+    const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
+    out->Add(-dropout); // now, a proportion "dropout" will be <0.0
+    out->ApplyHeaviside(); // apply the function (x>0?1:0).  Now, a proportion "dropout" will
+                           // be zero and (1 - dropout) will be 1.0.
+    out->MulElements(in);
+  }
 }
 
 
@@ -154,6 +175,13 @@ void DropoutComponent::Read(std::istream &is, bool binary) {
   ReadBasicType(is, binary, &dim_);
   ExpectToken(is, binary, "<DropoutProportion>");
   ReadBasicType(is, binary, &dropout_proportion_);
+  // <DropoutPerFrame> is optional so that models written before this field
+  // existed can still be read; the flag then keeps the same default (false)
+  // that InitFromConfig uses.
+  dropout_per_frame_ = false;
+  if (PeekToken(is, binary) == 'D') {  // next token is <DropoutPerFrame>
+    ExpectToken(is, binary, "<DropoutPerFrame>");
+    ReadBasicType(is, binary, &dropout_per_frame_);
+  }
   ExpectToken(is, binary, "</DropoutComponent>");
 }
 
@@ -163,6 +186,8 @@ void DropoutComponent::Write(std::ostream &os, bool binary) const {
   WriteBasicType(os, binary, dim_);
   WriteToken(os, binary, "<DropoutProportion>");
   WriteBasicType(os, binary, dropout_proportion_);
+  WriteToken(os, binary, "<DropoutPerFrame>");
+  WriteBasicType(os, binary, dropout_per_frame_);
   WriteToken(os, binary, "</DropoutComponent>");
 }