New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Segmentation fault (core dumped) training on 4 gpus #52
Comments
I have met the same problem, how to solve it? |
I have not encountered this problem. |
i meet the same problem ,,have you fixed it? |
@llp1996 |
i have already use pytorch 1.1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
[root@A01-R04-I221-51 codes]# python3 train.py -opt options/train/train_EDVR_woTSA_M_vimeo90k.yml
export CUDA_VISIBLE_DEVICES=0,1,2,3
Disabled distributed training.
Path already exists. Rename it to [/home/zzt/EDVR/experiments/001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S_archived_190628-144447]
19-06-28 14:44:47.113 - INFO: name: 001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S
use_tb_logger: True
model: VideoSR_base
distortion: sr
scale: 4
gpu_ids: [0, 1, 2, 3]
datasets:[
train:[
name: Vimeo90K
mode: Vimeo90K
interval_list: [1]
random_reverse: False
border_mode: False
dataroot_GT: /home/zzt/EDVR/datasets/vimeo90k_train_GT.lmdb
dataroot_LQ: /home/zzt/EDVR/datasets/vimeo90k_train_LR7frames.lmdb
cache_keys: Vimeo90K_train_keys.pkl
N_frames: 5
use_shuffle: True
n_workers: 3
batch_size: 32
GT_size: 256
LQ_size: 64
use_flip: True
use_rot: True
color: RGB
phase: train
scale: 4
data_type: lmdb
]
]
network_G:[
which_model_G: EDVR
nf: 64
nframes: 5
groups: 8
front_RBs: 5
back_RBs: 10
predeblur: False
HR_in: False
w_TSA: False
scale: 4
]
path:[
pretrain_model_G: None
strict_load: True
resume_state: None
root: /home/zzt/EDVR
experiments_root: /home/zzt/EDVR/experiments/001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S
models: /home/zzt/EDVR/experiments/001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S/models
training_state: /home/zzt/EDVR/experiments/001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S/training_state
log: /home/zzt/EDVR/experiments/001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S
val_images: /home/zzt/EDVR/experiments/001_EDVRwoTSA_scratch_lr4e-4_600k_vimeo90k_LrCAR4S/val_images
]
train:[
lr_G: 0.0004
lr_scheme: CosineAnnealingLR_Restart
beta1: 0.9
beta2: 0.99
niter: 600000
warmup_iter: -1
T_period: [150000, 150000, 150000, 150000]
restarts: [150000, 300000, 450000]
restart_weights: [1, 1, 1]
eta_min: 1e-07
pixel_criterion: cb
pixel_weight: 1.0
val_freq: 2000.0
manual_seed: 0
]
logger:[
print_freq: 1
save_checkpoint_freq: 2000.0
]
is_train: True
dist: False
19-06-28 14:44:50.625 - INFO: Random seed: 0
19-06-28 14:44:50.628 - INFO: Temporal augmentation interval list: [1], with random reverse is False.
19-06-28 14:44:50.629 - INFO: Using cache keys: Vimeo90K_train_keys.pkl
19-06-28 14:44:50.629 - INFO: Using cache keys - Vimeo90K_train_keys.pkl.
19-06-28 14:44:50.638 - INFO: Dataset [Vimeo90KDataset - Vimeo90K] is created.
19-06-28 14:44:50.638 - INFO: Number of train images: 64,612, iters: 2,020
19-06-28 14:44:50.638 - INFO: Total epochs needed: 298 for iters 600,000
19-06-28 14:44:53.798 - INFO: Network G structure: DataParallel - EDVR, with parameters: 2,996,259
19-06-28 14:44:53.799 - INFO: EDVR(
(conv_first): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(feature_extraction): Sequential(
(0): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(3): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(4): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(fea_L2_conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(fea_L2_conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(fea_L3_conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(fea_L3_conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(pcd_align): PCD_Align(
(L3_offset_conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L3_offset_conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L3_dcnpack): DCN_sep(
(conv_offset_mask): Conv2d(64, 216, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(L2_offset_conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L2_offset_conv2): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L2_offset_conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L2_dcnpack): DCN_sep(
(conv_offset_mask): Conv2d(64, 216, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(L2_fea_conv): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L1_offset_conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L1_offset_conv2): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L1_offset_conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(L1_dcnpack): DCN_sep(
(conv_offset_mask): Conv2d(64, 216, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(L1_fea_conv): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(cas_offset_conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(cas_offset_conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(cas_dcnpack): DCN_sep(
(conv_offset_mask): Conv2d(64, 216, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(lrelu): LeakyReLU(negative_slope=0.1, inplace)
)
(tsa_fusion): Conv2d(320, 64, kernel_size=(1, 1), stride=(1, 1))
(recon_trunk): Sequential(
(0): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(3): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(4): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(5): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(6): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(7): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(8): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(9): ResidualBlock_noBN(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(upconv1): Conv2d(64, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(upconv2): Conv2d(64, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(pixel_shuffle): PixelShuffle(upscale_factor=2)
(HRconv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv_last): Conv2d(64, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(lrelu): LeakyReLU(negative_slope=0.1, inplace)
)
19-06-28 14:44:53.801 - INFO: Model [VideoSRBaseModel] is created.
19-06-28 14:44:53.802 - INFO: Start training from epoch: 0, iter: 0
Segmentation fault (core dumped)
The text was updated successfully, but these errors were encountered: