Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Segmentation fault (core dumped) training on 4 gpus #52

Closed
robotzheng opened this issue Jun 28, 2019 · 5 comments
Closed

Segmentation fault (core dumped) training on 4 gpus #52

robotzheng opened this issue Jun 28, 2019 · 5 comments

Comments

@robotzheng
Copy link

[root@A01-R04-I221-51 codes]# python3 train.py -opt options/train/train_EDVR_woTSA_M_vimeo90k.yml
export CUDA_VISIBLE_DEVICES=0,1,2,3
Disabled distributed training.
Path already exists. Rename it to [/home/zzt/EDVR/experiments/001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S_archived_190628-144447]
19-06-28 14:44:47.113 - INFO: name: 001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S
use_tb_logger: True
model: VideoSR_base
distortion: sr
scale: 4
gpu_ids: [0, 1, 2, 3]
datasets:[
train:[
name: Vimeo90K
mode: Vimeo90K
interval_list: [1]
random_reverse: False
border_mode: False
dataroot_GT: /home/zzt/EDVR/datasets/vimeo90k_train_GT.lmdb
dataroot_LQ: /home/zzt/EDVR/datasets/vimeo90k_train_LR7frames.lmdb
cache_keys: Vimeo90K_train_keys.pkl
N_frames: 5
use_shuffle: True
n_workers: 3
batch_size: 32
GT_size: 256
LQ_size: 64
use_flip: True
use_rot: True
color: RGB
phase: train
scale: 4
data_type: lmdb
]
]
network_G:[
which_model_G: EDVR
nf: 64
nframes: 5
groups: 8
front_RBs: 5
back_RBs: 10
predeblur: False
HR_in: False
w_TSA: False
scale: 4
]
path:[
pretrain_model_G: None
strict_load: True
resume_state: None
root: /home/zzt/EDVR
experiments_root: /home/zzt/EDVR/experiments/001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S
models: /home/zzt/EDVR/experiments/001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S/models
training_state: /home/zzt/EDVR/experiments/001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S/training_state
log: /home/zzt/EDVR/experiments/001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S
val_images: /home/zzt/EDVR/experiments/001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S/val_images
]
train:[
lr_G: 0.0004
lr_scheme: CosineAnnealingLR_Restart
beta1: 0.9
beta2: 0.99
niter: 600000
warmup_iter: -1
T_period: [150000, 150000, 150000, 150000]
restarts: [150000, 300000, 450000]
restart_weights: [1, 1, 1]
eta_min: 1e-07
pixel_criterion: cb
pixel_weight: 1.0
val_freq: 2000.0
manual_seed: 0
]
logger:[
print_freq: 1
save_checkpoint_freq: 2000.0
]
is_train: True
dist: False

19-06-28 14:44:50.625 - INFO: Random seed: 0
19-06-28 14:44:50.628 - INFO: Temporal augmentation interval list: [1], with random reverse is False.
19-06-28 14:44:50.629 - INFO: Using cache keys: Vimeo90K_train_keys.pkl
19-06-28 14:44:50.629 - INFO: Using cache keys - Vimeo90K_train_keys.pkl.
19-06-28 14:44:50.638 - INFO: Dataset [Vimeo90KDataset - Vimeo90K] is created.
19-06-28 14:44:50.638 - INFO: Number of train images: 64,612, iters: 2,020
19-06-28 14:44:50.638 - INFO: Total epochs needed: 298 for iters 600,000
19-06-28 14:44:53.798 - INFO: Network G structure: DataParallel - EDVR, with parameters: 2,996,259
19-06-28 14:44:53.799 - INFO: EDVR(
(conv_first): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(feature_extraction): Sequential(
(0): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(3): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(4): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(fea_L2_conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(fea_L2_conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(fea_L3_conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(fea_L3_conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(pcd_align): PCD_Align(
(L3_offset_conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L3_offset_conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L3_dcnpack): DCN_sep(
(conv_offset_mask): Conv2d(64, 216, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(L2_offset_conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L2_offset_conv2): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L2_offset_conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L2_dcnpack): DCN_sep(
(conv_offset_mask): Conv2d(64, 216, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(L2_fea_conv): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L1_offset_conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L1_offset_conv2): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L1_offset_conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L1_dcnpack): DCN_sep(
(conv_offset_mask): Conv2d(64, 216, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(L1_fea_conv): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(cas_offset_conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(cas_offset_conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(cas_dcnpack): DCN_sep(
(conv_offset_mask): Conv2d(64, 216, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(lrelu): LeakyReLU(negative_slope=0.1, inplace)
)
(tsa_fusion): Conv2d(320, 64, kernel_size=(1, 1), stride=(1, 1))
(recon_trunk): Sequential(
(0): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(3): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(4): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(5): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(6): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(7): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(8): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(9): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(upconv1): Conv2d(64, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(upconv2): Conv2d(64, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(pixel_shuffle): PixelShuffle(upscale_factor=2)
(HRconv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv_last): Conv2d(64, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(lrelu): LeakyReLU(negative_slope=0.1, inplace)
)
19-06-28 14:44:53.801 - INFO: Model [VideoSRBaseModel] is created.
19-06-28 14:44:53.802 - INFO: Start training from epoch: 0, iter: 0
Segmentation fault (core dumped)

@XiZhang93
Copy link

I have met the same problem, how to solve it?

@xinntao
Copy link
Owner

xinntao commented Aug 5, 2019

I have not encountered this problem.
You may try to google it OR reinstall pytorch and reinstall the dcn package OR install the pytorch from source codes.

@llp1996
Copy link

llp1996 commented Sep 4, 2019

i meet the same problem ,,have you fixed it?

@xinntao
Copy link
Owner

xinntao commented Sep 6, 2019

@llp1996
Try to use pytorch1.1 version?

@llp1996
Copy link

llp1996 commented Sep 11, 2019

@llp1996
Try to use pytorch1.1 version?

i have already use pytorch 1.1

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

4 participants