Training does not converge when adding an LSTM layer #62

Closed
dajiangxiaoyan opened this issue Sep 7, 2017 · 13 comments
@dajiangxiaoyan

I implemented the RPN from the paper, without the LSTM, based on the Faster R-CNN code. The results are very good for horizontal text. But when I add a bi-directional LSTM layer after the last conv layer, the model does not converge, and the scores are the same for every image. Has anyone met this problem?

`name: "VGG_ILSVRC_16_layers"
layer {
name: 'input-data'
type: 'Python'
top: 'data'
top: 'im_info'
top: 'gt_boxes'
python_param {
module: 'roi_data_layer.layer'
layer: 'RoIDataLayer'
param_str: "'num_classes': 2"
}
}
layer {
name: "conv1_1"
type: "Convolution"
bottom: "data"
top: "conv1_1"
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 1
kernel_size: 3
}
}
layer {
name: "relu1_1"
type: "ReLU"
bottom: "conv1_1"
top: "conv1_1"
}
layer {
name: "conv1_2"
type: "Convolution"
bottom: "conv1_1"
top: "conv1_2"
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 1
kernel_size: 3
}
}
layer {
name: "relu1_2"
type: "ReLU"
bottom: "conv1_2"
top: "conv1_2"
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1_2"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv2_1"
type: "Convolution"
bottom: "pool1"
top: "conv2_1"
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
}
}
layer {
name: "relu2_1"
type: "ReLU"
bottom: "conv2_1"
top: "conv2_1"
}
layer {
name: "conv2_2"
type: "Convolution"
bottom: "conv2_1"
top: "conv2_2"
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
}
}
layer {
name: "relu2_2"
type: "ReLU"
bottom: "conv2_2"
top: "conv2_2"
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2_2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv3_1"
type: "Convolution"
bottom: "pool2"
top: "conv3_1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
}
}
layer {
name: "relu3_1"
type: "ReLU"
bottom: "conv3_1"
top: "conv3_1"
}
layer {
name: "conv3_2"
type: "Convolution"
bottom: "conv3_1"
top: "conv3_2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
}
}
layer {
name: "relu3_2"
type: "ReLU"
bottom: "conv3_2"
top: "conv3_2"
}
layer {
name: "conv3_3"
type: "Convolution"
bottom: "conv3_2"
top: "conv3_3"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
}
}
layer {
name: "relu3_3"
type: "ReLU"
bottom: "conv3_3"
top: "conv3_3"
}
layer {
name: "pool3"
type: "Pooling"
bottom: "conv3_3"
top: "pool3"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv4_1"
type: "Convolution"
bottom: "pool3"
top: "conv4_1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layer {
name: "relu4_1"
type: "ReLU"
bottom: "conv4_1"
top: "conv4_1"
}
layer {
name: "conv4_2"
type: "Convolution"
bottom: "conv4_1"
top: "conv4_2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layer {
name: "relu4_2"
type: "ReLU"
bottom: "conv4_2"
top: "conv4_2"
}
layer {
name: "conv4_3"
type: "Convolution"
bottom: "conv4_2"
top: "conv4_3"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layer {
name: "relu4_3"
type: "ReLU"
bottom: "conv4_3"
top: "conv4_3"
}
layer {
name: "pool4"
type: "Pooling"
bottom: "conv4_3"
top: "pool4"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv5_1"
type: "Convolution"
bottom: "pool4"
top: "conv5_1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layer {
name: "relu5_1"
type: "ReLU"
bottom: "conv5_1"
top: "conv5_1"
}
layer {
name: "conv5_2"
type: "Convolution"
bottom: "conv5_1"
top: "conv5_2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layer {
name: "relu5_2"
type: "ReLU"
bottom: "conv5_2"
top: "conv5_2"
}
layer {
name: "conv5_3"
type: "Convolution"
bottom: "conv5_2"
top: "conv5_3"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layer {
name: "relu5_3"
type: "ReLU"
bottom: "conv5_3"
top: "conv5_3"
}

#========= RPN ============

# prepare lstm inputs

layer {
name: "im2col"
bottom: "conv5_3"
top: "im2col"
type: "Im2col"
convolution_param {
pad: 1
kernel_size: 3
}
}
layer {
name: "im2col_transpose"
top: "im2col_transpose"
bottom: "im2col"
type: "Transpose"
transpose_param {
dim: 3
dim: 2
dim: 0
dim: 1
}
}
layer {
name: "lstm_input"
type: "Reshape"
bottom: "im2col_transpose"
top: "lstm_input"
reshape_param {
shape { dim: -1 }
axis: 1
num_axes: 2
}
}

layer {
name: "lstm"
type: "Lstm"
bottom: "lstm_input"
top: "lstm"
lstm_param {
num_output: 128
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
clipping_threshold: 1
}
}

#===================== rlstm ===================

layer {
name: "lstm-reverse1"
type: "Reverse"
bottom: "lstm_input"
top: "rlstm_input"
reverse_param {
axis: 0
}
}
layer {
name: "rlstm"
type: "Lstm"
bottom: "rlstm_input"
top: "rlstm-output"
lstm_param {
num_output: 128
}
}
layer {
name: "lstm-reverse2"
type: "Reverse"
bottom: "rlstm-output"
top: "rlstm"
reverse_param {
axis: 0
}
}

# merge lstm and rlstm

layer {
name: "merge_lstm_rlstm"
type: "Concat"
bottom: "lstm"
bottom: "rlstm"
top: "merge_lstm_rlstm"
concat_param {
axis: 2
}
}
layer {
name: "lstm_output_reshape"
type: "Reshape"
bottom: "merge_lstm_rlstm"
top: "lstm_output_reshape"
reshape_param {
shape { dim: -1 dim: 1 }
axis: 1
num_axes: 1
}
}

# transpose size of output as (N, C, H, W)

layer {
name: "lstm_output"
type: "Transpose"
bottom: "lstm_output_reshape"
top: "lstm_output"
transpose_param {
dim: 2
dim: 3
dim: 1
dim: 0
}
}
layer {
name: "fc"
bottom: "lstm_output"
top: "fc"
type: "Convolution"
convolution_param {
num_output: 512
kernel_size: 1
}
}
layer {
name: "relu_fc"
type: "ReLU"
bottom: "fc"
top: "fc"
}

layer {
name: "rpn_cls_score"
type: "Convolution"
bottom: "fc"
top: "rpn_cls_score"
param { lr_mult: 1.0 }
param { lr_mult: 2.0 }
convolution_param {
num_output: 20 # 2(bg/fg) * 9(anchors)
kernel_size: 1 pad: 0 stride: 1
}
}

layer {
name: "rpn_bbox_pred"
type: "Convolution"
bottom: "fc"
top: "rpn_bbox_pred"
param { lr_mult: 1.0 }
param { lr_mult: 2.0 }
convolution_param {
num_output: 20 # 4 * 9(anchors)
kernel_size: 1 pad: 0 stride: 1
}
}

layer {
bottom: "rpn_cls_score"
top: "rpn_cls_score_reshape"
name: "rpn_cls_score_reshape"
type: "Reshape"
reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } }
}

layer {
name: 'rpn-data'
type: 'Python'
bottom: 'rpn_cls_score'
bottom: 'gt_boxes'
bottom: 'im_info'
bottom: 'data'
top: 'rpn_labels'
top: 'rpn_bbox_targets'
top: 'rpn_bbox_inside_weights'
top: 'rpn_bbox_outside_weights'
python_param {
module: 'rpn.anchor_target_layer'
layer: 'AnchorTargetLayer'
param_str: "'feat_stride': 16"
}
}

layer {
name: "rpn_loss_cls"
type: "SoftmaxWithLoss"
bottom: "rpn_cls_score_reshape"
bottom: "rpn_labels"
propagate_down: 1
propagate_down: 0
top: "rpn_cls_loss"
loss_weight: 1
loss_param {
ignore_label: -1
normalize: true
}
}

layer {
name: "rpn_loss_bbox"
type: "SmoothL1Loss"
bottom: "rpn_bbox_pred"
bottom: "rpn_bbox_targets"
bottom: 'rpn_bbox_inside_weights'
bottom: 'rpn_bbox_outside_weights'
top: "rpn_loss_bbox"
loss_weight: 1
smooth_l1_loss_param { sigma: 3.0 }
}
```
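For anyone debugging a definition like this, a quick way to see what each reshape/transpose actually produces is to load the prototxt in pycaffe and print the blob shapes along the LSTM branch. A minimal sketch, assuming the py-faster-rcnn Python layers (roi_data_layer, rpn.anchor_target_layer) are importable so the net can be constructed; the file name is a placeholder and the blob names are taken from the prototxt above:

```python
import caffe

# Construct the training net; this needs the same Python environment
# that training uses, since the prototxt contains Python layers.
net = caffe.Net('train.prototxt', caffe.TRAIN)

# Blob names as defined in the prototxt above.
for name in ('conv5_3', 'im2col', 'lstm_input', 'lstm', 'rlstm',
             'merge_lstm_rlstm', 'lstm_output', 'fc', 'rpn_cls_score'):
    print(name, net.blobs[name].data.shape)
```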

@dajiangxiaoyan
Author

Never mind. I fixed it.

@usmanxia

Can you share which dataset you used for training?

@TaoDream

@dajiangxiaoyan How did you fix it? I have the same problem. Thank you very much.

@catmonkeylee

catmonkeylee commented Nov 20, 2017

@dajiangxiaoyan I have the same problem, can you share the method? Thank you.

It was my fault: I used a wrong VGG16 model.

@lxx1884896

@dajiangxiaoyan Hi, I am trying to train CTPN myself too, but I can't reproduce the train.prototxt. I have tried many times and now I am desperate. Could you kindly share yours for training? Many thanks.

@Atlas-wuu

@catmonkeylee Hello, I want to fine-tune the model on my own dataset, but I don't know what the data format is. Could you please show me more detail about how you made the data input and what your training parameters are? Thanks a lot!

@cristina510

@dajiangxiaoyan Hi, I'm trying to train the CTPN and have hit exactly the same problem. Could you please give me some suggestions? Any reply is deeply appreciated.

@cristina510

@TaoDream @catmonkeylee Is there any solution to this problem? I've checked the model, yet the training never seems to converge when the LSTM is added. Could you please help me out here? Thanks a lot!

@tianzhi0549
Owner

@cristina510 Maybe you need to clip the gradients for LSTM layers.
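Before adding clipping, it can be worth checking whether the LSTM gradients actually blow up. A rough pycaffe probe, assuming the net loads in your training environment (the solver path is a hypothetical placeholder); Caffe's stock solver also accepts a clip_gradients field in solver.prototxt, which is tried later in this thread:

```python
import caffe

solver = caffe.SGDSolver('solver.prototxt')  # hypothetical path
solver.step(1)  # one forward/backward pass

# Inspect gradient magnitudes on the LSTM weight blobs; very large
# values here would point to exploding gradients that clipping tames.
for name in ('lstm', 'rlstm'):
    for i, blob in enumerate(solver.net.params[name]):
        print(name, i, float(abs(blob.diff).max()))
```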

@cristina510

cristina510 commented Dec 11, 2018

> @cristina510 Maybe you need to clip the gradients for LSTM layers.

Thanks a lot for your reply.
I tried to clip the gradients for the LSTM layers in these ways: as written in the provided deploy.prototxt, I set the LSTM param "clipping_threshold: 1" for both LSTMs, and I also tried setting clip_gradients in solver.prototxt.
The weird thing is: when the LSTM is added and I test the trained caffemodel, all anchors of the same size get the same classification score, no matter which test image is processed.
So from your perspective, should I still check the clipping threshold, or something else? Thanks for your time and reply.

@tianzhi0549
Owner

@cristina510 This should not be a problem with gradient clipping. Are the LSTM layers initialized properly?

@cristina510

cristina510 commented Dec 11, 2018

> @cristina510 This should not be a problem with gradient clipping. Are the LSTM layers initialized properly?

Thanks for your reply, I really appreciate it.
In my training process, the input shape of the LSTM layers is [W, H*N, 9*C]. After the concat layer we get data of shape [W, H*N, 256]. Then I use the reshape and transpose layers to make it [N, 256, H, W]. This output is sent to the fc layer to get data of shape [N, 512, H, W].
Is this understanding right?
Thanks a lot for your warm help.
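The shape bookkeeping above can be checked offline with a NumPy trace of the same transpose/reshape parameters from the prototxt. The spatial sizes here are made-up placeholders, and the final layout only matches [N, 256, H, W] when N = 1, the usual Faster-R-CNN-style single-image batch:

```python
import numpy as np

N, C, H, W = 1, 512, 38, 63          # hypothetical conv5_3 shape (N x C x H x W)
x = np.zeros((N, 9 * C, H, W))       # im2col: 3x3 neighborhoods -> 9C channels
x = x.transpose(3, 2, 0, 1)          # im2col_transpose: -> (W, H, N, 9C)
x = x.reshape(W, H * N, 9 * C)       # lstm_input: -> (W, H*N, 9C)

y = np.zeros((W, H * N, 256))        # after concat of lstm (128) + rlstm (128)
y = y.reshape(W, H * N, 1, 256)      # lstm_output_reshape (axis 1 -> (-1, 1))
y = y.transpose(2, 3, 1, 0)          # lstm_output: -> (1, 256, H*N, W)
print(y.shape)                       # (1, 256, 38, 63) == (N, 256, H, W) for N = 1
```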

@tianzhi0549
Owner

@cristina510 You are welcome. What I meant is whether the parameters of the LSTM layers are initialized properly. Since you said the output is unrelated to the input image, I guess all parameters of the LSTM layers were probably initialized to zeros.
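One concrete thing worth noting in the prototxt at the top of this thread: the "lstm" layer specifies a Gaussian weight_filler, but the "rlstm" layer specifies no filler at all. If this custom Lstm layer falls back to Caffe's default constant-zero filler when none is given (an assumption; check the layer's source), the reverse LSTM would start with all-zero weights, which matches the symptom. A quick pycaffe check on a freshly initialized net (the prototxt path is a placeholder):

```python
import caffe

net = caffe.Net('train.prototxt', caffe.TRAIN)  # hypothetical path

# A healthy random init should show a nonzero spread in every weight blob;
# min == max == 0 on any blob would confirm the all-zeros suspicion.
for name in ('lstm', 'rlstm'):
    for i, blob in enumerate(net.params[name]):
        w = blob.data
        print(name, i, w.shape, float(w.min()), float(w.max()))
```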
