You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
RuntimeError: Timed out initializing process group in store based barrier on rank: 2, for key: store_based_barrier_key:1 (world_size=2, worker_count=4, timeout=0:30:00)
#18
Open
poetryben88 opened this issue
Jan 6, 2022
· 2 comments
root@pai-worker1:/home/Data/exports/pytorch-distributed# srun -N1 -n2 --gres gpu:2 python distributed_slurm_main.py --dist-file dist_file
Traceback (most recent call last):
File "distributed_slurm_main.py", line 420, in <module>
main()
File "distributed_slurm_main.py", line 131, in main
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 230, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
while not context.join():
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 150, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
fn(i, *args)
File "/home/Data/exports/pytorch-distributed/distributed_slurm_main.py", line 137, in main_worker
dist.init_process_group(backend='nccl',
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 608, in init_process_group
_store_based_barrier(rank, store, timeout)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 244, in _store_based_barrier
raise RuntimeError(
RuntimeError: Timed out initializing process group in store based barrier on rank: 1, for key: store_based_barrier_key:1 (world_size=2, worker_count=4, timeout=0:30:00)
Traceback (most recent call last):
File "distributed_slurm_main.py", line 420, in <module>
main()
File "distributed_slurm_main.py", line 131, in main
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 230, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
while not context.join():
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 150, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
fn(i, *args)
File "/home/Data/exports/pytorch-distributed/distributed_slurm_main.py", line 137, in main_worker
dist.init_process_group(backend='nccl',
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 608, in init_process_group
_store_based_barrier(rank, store, timeout)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 244, in _store_based_barrier
raise RuntimeError(
RuntimeError: Timed out initializing process group in store based barrier on rank: 2, for key: store_based_barrier_key:1 (world_size=2, worker_count=4, timeout=0:30:00)
srun: error: pai-worker1: tasks 0-1: Exited with exit code 1
root@pai-worker1:/home/Data/exports/pytorch-distributed#
The text was updated successfully, but these errors were encountered:
按照你的脚本跑，一直报错，找不到原因。
The text was updated successfully, but these errors were encountered: