adding dropout-by-row #8

Open

wants to merge 12 commits into base: dropout_schedule
8 changes: 5 additions & 3 deletions egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i_dp.sh
@@ -29,6 +29,7 @@ ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true).
num_threads_ubm=32
nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned
dropout_schedule='0,0@0.20,0.5@0.50,0@0.50,0'
dropout_per_frame=false
chunk_width=150
chunk_left_context=40
chunk_right_context=0
@@ -193,15 +194,15 @@ if [ $stage -le 15 ]; then
relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024

# check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults
lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0
lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=false
relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024
relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024
relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024
lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0
lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=false
relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024
relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024
relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024
lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0
lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=false

## adding the layers for chain branch
output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5
@@ -243,6 +244,7 @@ if [ $stage -le 16 ]; then
--egs.chunk-left-context $chunk_left_context \
--egs.chunk-right-context $chunk_right_context \
--trainer.dropout-schedule $dropout_schedule \
--trainer.dropout-per-frame $dropout_per_frame \

as Vimal says, please remove this from the training code... does not need to be there.

--trainer.num-chunk-per-minibatch 64 \
--trainer.frames-per-iter 1500000 \
--trainer.num-epochs 4 \
@@ -225,6 +225,7 @@ def train_one_iteration(dir, iter, srand, egs_dir,
frame_subsampling_factor, truncate_deriv_weights,
run_opts,
dropout_proportions=None,
dropout_per_frame=None,
background_process_handler=None):
""" Called from steps/nnet3/chain/train.py for one iteration for
neural network training with LF-MMI objective
@@ -307,7 +308,7 @@ def train_one_iteration(dir, iter, srand, egs_dir,
dropout_info_str = ''
if dropout_proportions is not None:
raw_model_string, dropout_info = common_train_lib.apply_dropout(
dropout_proportions, raw_model_string)
dropout_proportions, dropout_per_frame, raw_model_string)
dropout_info_str = ', {0}'.format(", ".join(dropout_info))

shrink_info_str = ''
15 changes: 10 additions & 5 deletions egs/wsj/s5/steps/libs/nnet3/train/common.py
@@ -511,7 +511,7 @@ def _get_component_dropout(dropout_schedule, num_archives_processed):
+ initial_dropout)


def apply_dropout(dropout_proportions, raw_model_string):
def apply_dropout(dropout_proportions, dropout_per_frame, raw_model_string):
"""Adds an nnet3-copy --edits line to modify raw_model_string to
set dropout proportions according to dropout_proportions.

@@ -523,10 +523,10 @@ def apply_dropout(dropout_proportions, raw_model_string):

for component_name, dropout_proportion in dropout_proportions:
edit_config_lines.append(
"set-dropout-proportion name={0} proportion={1}".format(
component_name, dropout_proportion))
dropout_info.append("pattern/dropout-proportion={0}/{1}".format(
component_name, dropout_proportion))
"set-dropout-proportion name={0} proportion={1} dropout-per-frame={2}".format(
component_name, dropout_proportion, dropout_per_frame))
dropout_info.append("pattern/dropout-proportion={0}/{1} dropout-per-frame={2}".format(
component_name, dropout_proportion, dropout_per_frame))

return ("""{raw_model_string} nnet3-copy --edits='{edits}' \
- - |""".format(raw_model_string=raw_model_string,
@@ -771,6 +771,11 @@ def __init__(self):
lstm*=0,0.2,0'. More general should precede
less general patterns, as they are applied
sequentially.""")
self.parser.add_argument("--trainer.dropout-per-frame", type=str,
Owner

Is this option required? Do you expect to change whether dropout is per frame or not during the training iterations?
I think dropout-per-frame should only be at the config level.
Also I think you can remove dropout_per_frame from the function SetDropoutProportion, because that is something you would have already defined from the config. If you really need to change dropout-per-frame during training, I suggest adding a separate function like SetDropoutPerFrame to the DropoutComponent.
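
A minimal sketch of that suggestion, for illustration only (the standalone toy class below and the name SetDropoutPerFrame are assumptions, not code from this PR): the per-frame flag stays a config-level property with its own setter, so SetDropoutProportion keeps its original single-argument signature.

    // Toy stand-in for Kaldi's DropoutComponent, showing a dedicated
    // SetDropoutPerFrame() setter as suggested above; not the real component.
    #include <iostream>

    typedef float BaseFloat;  // stand-in for Kaldi's BaseFloat

    class DropoutComponent {
     public:
      DropoutComponent() : dropout_proportion_(0.0), dropout_per_frame_(false) {}
      // Unchanged interface: only the proportion varies during training.
      void SetDropoutProportion(BaseFloat dropout_proportion) {
        dropout_proportion_ = dropout_proportion;
      }
      // Config-level switch between per-frame and per-element dropout.
      void SetDropoutPerFrame(bool dropout_per_frame) {
        dropout_per_frame_ = dropout_per_frame;
      }
      BaseFloat DropoutProportion() const { return dropout_proportion_; }
      bool DropoutPerFrame() const { return dropout_per_frame_; }
     private:
      BaseFloat dropout_proportion_;
      bool dropout_per_frame_;
    };

    int main() {
      DropoutComponent c;
      c.SetDropoutPerFrame(true);   // set once, from the config
      c.SetDropoutProportion(0.3);  // varied per iteration by the schedule
      std::cout << c.DropoutProportion() << " " << c.DropoutPerFrame() << "\n";
      return 0;
    }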

action=common_lib.NullstrToNoneAction,
dest='dropout_per_frame', default=None,
help="""this option is used to control whether
using dropout by frame level or by vector level""")

# General options
self.parser.add_argument("--stage", type=int, default=-4,
8 changes: 7 additions & 1 deletion egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py
@@ -251,6 +251,7 @@ def set_default_configs(self):
'zeroing-interval' : 20,
'zeroing-threshold' : 15.0,
'dropout-proportion' : -1.0,  # -1.0 means that no dropout will be added
'dropout-per-frame' : 'false'
}

def set_derived_configs(self):
@@ -285,6 +286,10 @@ def check_configs(self):
self.config['dropout-proportion'] < 0.0) and
self.config['dropout-proportion'] != -1.0 ):
raise xparser_error("dropout-proportion has invalid value {0}.".format(self.config['dropout-proportion']))

if (self.config['dropout-per-frame'] != 'false' and
    self.config['dropout-per-frame'] != 'true'):
    raise xparser_error("dropout-per-frame has invalid value {0}.".format(self.config['dropout-per-frame']))

def auxiliary_outputs(self):
return ['c_t']
@@ -347,7 +352,8 @@ def generate_lstm_config(self):
pes_str = self.config['ng-per-element-scale-options']
lstm_dropout_value = self.config['dropout-proportion']
lstm_dropout_str = 'dropout-proportion='+str(self.config['dropout-proportion'])

lstm_dropout_per_frame_value = self.config['dropout-per-frame']
lstm_dropout_per_frame_str = 'dropout-per-frame='+str(self.config['dropout-per-frame'])
# Natural gradient per element scale parameters
# TODO: decide if we want to keep exposing these options
if re.search('param-mean', pes_str) is None and \
8 changes: 7 additions & 1 deletion egs/wsj/s5/steps/nnet3/chain/train.py
@@ -202,7 +202,10 @@ def process_args(args):
"value={0}. We recommend using the option "
"--trainer.deriv-truncate-margin.".format(
args.deriv_truncate_margin))

if (args.dropout_schedule is None
        and args.dropout_per_frame is not None):
    raise Exception("The dropout schedule is null, but the "
                    "dropout_per_frame option is not null")
if (not os.path.exists(args.dir)
or not os.path.exists(args.dir+"/configs")):
raise Exception("This scripts expects {0} to exist and have a configs "
@@ -441,6 +444,9 @@ def learning_rate(iter, current_num_jobs, num_archives_processed):
None if args.dropout_schedule is None
else common_train_lib.get_dropout_proportions(
dropout_schedule, num_archives_processed)),
dropout_per_frame=(
None if args.dropout_schedule is None
else args.dropout_per_frame),
shrinkage_value=shrinkage_value,
num_chunk_per_minibatch=args.num_chunk_per_minibatch,
num_hidden_layers=num_hidden_layers,
2 changes: 2 additions & 0 deletions src/cudamatrix/cu-kernels-ansi.h
@@ -64,6 +64,7 @@ void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d);
void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power,
bool include_sign, MatrixDim d);
void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
void cudaF_apply_heaviside_by_row(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);

there is no need for any of these changes in cudamatrix/... just use CopyColsFromVec.
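
For illustration, a hedged sketch of that alternative (it assumes Kaldi's existing CuMatrix/CuVector/CuRand API, and the helper name PerFrameDropoutMask is made up; this is not code from the PR): draw one uniform random value per row, threshold it with the existing ApplyHeaviside(), and broadcast it across the columns with CopyColsFromVec(), so every element of a row is kept or dropped together and no new kernel is needed.

    // Sketch only: build a per-frame (per-row) dropout mask with existing ops.
    #include "cudamatrix/cu-matrix.h"
    #include "cudamatrix/cu-vector.h"
    #include "cudamatrix/cu-rand.h"

    namespace kaldi {

    void PerFrameDropoutMask(BaseFloat dropout_proportion,
                             CuRand<BaseFloat> *random_generator,
                             CuMatrixBase<BaseFloat> *mask) {
      // One random value per row (i.e. per frame), drawn into a 1-column matrix.
      CuMatrix<BaseFloat> rand_col(mask->NumRows(), 1, kUndefined);
      random_generator->RandUniform(&rand_col);
      rand_col.Add(-dropout_proportion);  // a proportion "dropout" becomes < 0
      rand_col.ApplyHeaviside();          // (x > 0 ? 1 : 0), reusing existing kernel

      // Broadcast the per-row 0/1 values across all columns of the mask.
      CuVector<BaseFloat> row_mask(mask->NumRows(), kUndefined);
      row_mask.CopyColFromMat(rand_col, 0);
      mask->CopyColsFromVec(row_mask);  // each row of mask becomes constant
      // The caller would then do out->MulElements(mask) or similar.
    }

    }  // namespace kaldi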

void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val,
MatrixDim d);
void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src,
@@ -330,6 +331,7 @@ void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d);
void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power,
bool include_sign, MatrixDim d);
void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
void cudaD_apply_heaviside_by_row(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val,
MatrixDim d);
void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src,
25 changes: 25 additions & 0 deletions src/cudamatrix/cu-kernels.cu
@@ -1628,6 +1628,23 @@ static void _apply_heaviside(Real* mat, MatrixDim d) {
mat[index] = (mat[index] > 0.0 ? 1.0 : 0.0);
}

template<typename Real>
__global__
static void _apply_heaviside_by_row(Real* mat, MatrixDim d) {
int i = blockIdx.x * blockDim.x + threadIdx.x; // col index
int j = blockIdx.y * blockDim.y + threadIdx.y; // row index
int j_tempt = blockIdx.y * blockDim.y + threadIdx.y; // row index using to control setting heavyside() in the first rows
Owner

Did you want to get the 0th row or something? You have given the same expression as j.

int index = i + j * d.stride;
if (i < d.cols && j < d.rows)
if (j = j_tempt) {
Owner

==

mat[index] = (mat[index] > 0.0 ? 1.0 : 0.0);
}
else {
mat[index] = mat[index-d.stride-d.cols];
}
}

Author

@danpovey I think there may be a problem:

LOG (nnet3-chain-train:UpdateParamsWithMaxChange():nnet-chain-training.cc:225) Per-component max-change active on 19 / 35 Updatable Components.(smallest factor=0.223659 on tdnn2.affine with max-change=0.75). Global max-change factor was 0.495221 with max-change=2.
ERROR (nnet3-chain-train:MulElements():cu-matrix.cc:665) cudaError_t 77 : "an illegal memory access was encountered" returned from 'cudaGetLastError()'

[ Stack-Trace: ]
nnet3-chain-train() [0xb78566]
kaldi::MessageLogger::HandleMessage(kaldi::LogMessageEnvelope const&, char const*)
kaldi::MessageLogger::~MessageLogger()
kaldi::CuMatrixBase<float>::MulElements(kaldi::CuMatrixBase<float> const&)
kaldi::nnet3::DropoutComponent::Propagate(kaldi::nnet3::ComponentPrecomputedIndexes const*, kaldi::CuMatrixBase<float> const&, kaldi::CuMatrixBase<float>*) const
kaldi::nnet3::NnetComputer::ExecuteCommand(int)
kaldi::nnet3::NnetComputer::Forward()
kaldi::nnet3::NnetChainTrainer::Train(kaldi::nnet3::NnetChainExample const&)
main
__libc_start_main
nnet3-chain-train() [0x7d23c9]

Owner

This is probably because of not using ==.
I think your code will not work in any case. It may require some sync_threads after first setting the values when j == j_tempt
and then copy them for j != j_tempt.


template<typename Real>
__global__
static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) {
@@ -3233,6 +3250,10 @@ void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
_apply_heaviside<<<Gr,Bl>>>(mat, d);
}

void cudaF_apply_heaviside_by_row(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
_apply_heaviside_by_row<<<Gr,Bl>>>(mat, d);
}

void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src,
const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
int src_stride) {
@@ -3880,6 +3901,10 @@ void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
_apply_heaviside<<<Gr,Bl>>>(mat, d);
}

void cudaD_apply_heaviside_by_row(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
_apply_heaviside_by_row<<<Gr,Bl>>>(mat, d);
}

void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src,
const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
int src_stride) {
6 changes: 6 additions & 0 deletions src/cudamatrix/cu-kernels.h
@@ -201,6 +201,9 @@ inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power,
inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) {
cudaF_apply_heaviside(Gr, Bl, mat, dim);
}
inline void cuda_apply_heaviside_by_row(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) {
cudaF_apply_heaviside_by_row(Gr, Bl, mat, dim);
}
inline void cuda_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val,
MatrixDim dim) {
cudaF_apply_floor(Gr, Bl, mat, floor_val, dim);
@@ -739,6 +742,9 @@ inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power,
inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) {
cudaD_apply_heaviside(Gr, Bl, mat, dim);
}
inline void cuda_apply_heaviside_by_row(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) {
cudaD_apply_heaviside_by_row(Gr, Bl, mat, dim);
}
inline void cuda_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val,
MatrixDim dim) {
cudaD_apply_floor(Gr, Bl, mat, floor_val, dim);
17 changes: 17 additions & 0 deletions src/cudamatrix/cu-matrix.cc
@@ -2207,6 +2207,23 @@ void CuMatrixBase<Real>::ApplyHeaviside() {
}
}

template<typename Real>
void CuMatrixBase<Real>::ApplyHeavisideByRow() {
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
dim3 dimGrid, dimBlock;
GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
&dimGrid, &dimBlock);
cuda_apply_heaviside_by_row(dimGrid, dimBlock, data_, Dim());
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
{
KALDI_ERR << "no ApplyHeavisideByRow implemented without CUDA";
}
}
template<typename Real>
void CuMatrixBase<Real>::Heaviside(const CuMatrixBase<Real> &src) {
KALDI_ASSERT(SameDim(*this, src));
1 change: 1 addition & 0 deletions src/cudamatrix/cu-matrix.h
@@ -369,6 +369,7 @@ class CuMatrixBase {
/// For each element, sets x = (x > 0 ? 1.0 : 0.0).
/// See also Heaviside().
void ApplyHeaviside();
void ApplyHeavisideByRow();
void ApplyFloor(Real floor_val);
void ApplyCeiling(Real ceiling_val);
void ApplyExp();
21 changes: 5 additions & 16 deletions src/nnet3/nnet-simple-component.cc
@@ -108,9 +108,7 @@ void DropoutComponent::InitFromConfig(ConfigLine *cfl) {
{


you don't need a branch here because dropout_per_frame defaults to false if not set (that's how you
initialized the variable). Don't have the 'ok2' variable; you don't need to check the return status of
cfl->GetValue("dropout-per-frame", &dropout_per_frame);
because it is an optional parameter.
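
For illustration, a hedged sketch of the branch-free version described above (it assumes the existing ConfigLine interface and the PR's three-argument Init(); the exact error check is illustrative, not taken from the PR):

    // Sketch: dropout-per-frame is optional, so GetValue() is called
    // unconditionally and the variable keeps its default (false) when the
    // key is absent; no branch and no 'ok2' variable are needed.
    void DropoutComponent::InitFromConfig(ConfigLine *cfl) {
      int32 dim = 0;
      BaseFloat dropout_proportion = 0.0;
      bool dropout_per_frame = false;  // default when the option is not given
      bool ok = cfl->GetValue("dim", &dim) &&
                cfl->GetValue("dropout-proportion", &dropout_proportion);
      cfl->GetValue("dropout-per-frame", &dropout_per_frame);  // optional
      if (!ok || cfl->HasUnusedValues() || dim <= 0)
        KALDI_ERR << "Invalid initializer for layer of type "
                  << Type() << ": \"" << cfl->WholeLine() << "\"";
      Init(dim, dropout_proportion, dropout_per_frame);
    }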

dropout_per_frame = false;
Init(dim, dropout_proportion, dropout_per_frame);
}
else
{
} else {
Init(dim, dropout_proportion, dropout_per_frame);
}
}
@@ -131,7 +129,7 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,

BaseFloat dropout = dropout_proportion_;
KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0);
if(dropout_per_frame_ == true)
if(dropout_per_frame_)


Please use the correct code style. Should be

   if (x) {
     ...

and note the space after if. You can run misc/maintenance/cpplint.py on your code to check for style problems.

{
// This const_cast is only safe assuming you don't attempt
// to use multi-threaded code with the GPU.
@@ -142,23 +140,14 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
// be zero and (1 - dropout) will be 1.0.

out->MulElements(in);
}
else
{
} else {

// This const_cast is only safe assuming you don't attempt
// to use multi-threaded code with the GPU.
const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
out->Add(-dropout); // now, a proportion "dropout" will be <0.0
out->ApplyHeaviside(); // apply the function (x>0?1:0). Now, a proportion "dropout" will
// be zero and (1 - dropout) will be 1.0.
CuVector<BaseFloat> *random_drop_vector = new CuVector<BaseFloat>(in.NumRows(), kSetZero);
MatrixIndexT i = 0;
random_drop_vector->CopyColFromMat(*out, i);
for (MatrixIndexT i = 0; i < in.NumCols(); i++)
{
out->CopyColFromVec(*random_drop_vector, i);
}
out->ApplyHeavisideByRow(); // apply the function (x>0?1:0). Now, a proportion "dropout" will
// be zero and (1 - dropout) will be 1.0 by row.
out->MulElements(in);
}
}
10 changes: 5 additions & 5 deletions src/nnet3/nnet-utils.cc
@@ -524,7 +524,7 @@ std::string NnetInfo(const Nnet &nnet) {
}

void SetDropoutProportion(BaseFloat dropout_proportion,
bool dropout_per_frame ,
bool dropout_per_frame,
Nnet *nnet) {
dropout_per_frame = false;
Owner

Why is the input to the function ignored?

for (int32 c = 0; c < nnet->NumComponents(); c++) {
@@ -696,13 +696,13 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) {
// matches names of components, not nodes.
config_line.GetValue("name", &name_pattern);
BaseFloat proportion = -1;
bool perframe = false;
bool dropout_per_frame = false;
if (!config_line.GetValue("proportion", &proportion)) {
KALDI_ERR << "In edits-config, expected proportion to be set in line: "
<< config_line.WholeLine();
}
if (!config_line.GetValue("perframe", &perframe)) {
perframe = false;
if (!config_line.GetValue("dropout-per-frame", &dropout_per_frame)) {
dropout_per_frame = false;
}
DropoutComponent *component = NULL;
int32 num_dropout_proportions_set = 0;
Expand All @@ -711,7 +711,7 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) {
name_pattern.c_str()) &&
(component =
dynamic_cast<DropoutComponent*>(nnet->GetComponent(c)))) {
component->SetDropoutProportion(proportion, perframe);
component->SetDropoutProportion(proportion, dropout_per_frame);
num_dropout_proportions_set++;
}
}
2 changes: 1 addition & 1 deletion src/nnet3/nnet-utils.h
@@ -233,7 +233,7 @@ void FindOrphanNodes(const Nnet &nnet, std::vector<int32> *nodes);
remove internal nodes directly; instead you should use the command
'remove-orphans'.

set-dropout-proportion [name=<name-pattern>] proportion=<dropout-proportion> perframe=<perframe>
set-dropout-proportion [name=<name-pattern>] proportion=<dropout-proportion> dropout-per-frame=<dropout-per-frame>
Sets the dropout rates for any components of type DropoutComponent whose
names match the given <name-pattern> (e.g. lstm*). <name-pattern> defaults to "*".
\endverbatim