You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I pulled the Docker image and downloaded the checkpoint. When running `generate_interactive.sh`, I encountered the following error (the tracebacks from the parallel worker processes are interleaved in the output):
Traceback (most recent call last):
File "megatron_lm/tools/generate_samples_gpt2.py", line 104, in <module>
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
File "megatron_lm/tools/generate_samples_gpt2.py", line 104, in <module>
main()
File "megatron_lm/tools/generate_samples_gpt2.py", line 89, in main
File "megatron_lm/tools/generate_samples_gpt2.py", line 104, in <module>
File "megatron_lm/tools/generate_samples_gpt2.py", line 104, in <module>
_ = load_checkpoint(model, None, None)
Traceback (most recent call last):
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 183, in load_checkpoint
File "megatron_lm/tools/generate_samples_gpt2.py", line 104, in <module>
main()
main()
load_checkpoint_new(model, optimizer, lr_scheduler)
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 373, in load_checkpoint_new
Traceback (most recent call last):
File "megatron_lm/tools/generate_samples_gpt2.py", line 89, in main
File "megatron_lm/tools/generate_samples_gpt2.py", line 89, in main
File "megatron_lm/tools/generate_samples_gpt2.py", line 104, in <module>
Traceback (most recent call last):
File "megatron_lm/tools/generate_samples_gpt2.py", line 104, in <module>
_ = load_checkpoint(model, None, None)
_ = load_checkpoint(model, None, None)
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 183, in load_checkpoint
torch.distributed.barrier()
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 183, in load_checkpoint
main()
main()
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 2709, in barrier
File "megatron_lm/tools/generate_samples_gpt2.py", line 89, in main
load_checkpoint_new(model, optimizer, lr_scheduler)
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 373, in load_checkpoint_new
load_checkpoint_new(model, optimizer, lr_scheduler)
File "megatron_lm/tools/generate_samples_gpt2.py", line 89, in main
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 373, in load_checkpoint_new
_ = load_checkpoint(model, None, None)
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 183, in load_checkpoint
torch.distributed.barrier()
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 2709, in barrier
_ = load_checkpoint(model, None, None)
main()
torch.distributed.barrier()
load_checkpoint_new(model, optimizer, lr_scheduler)
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 2709, in barrier
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 183, in load_checkpoint
main()
File "megatron_lm/tools/generate_samples_gpt2.py", line 89, in main
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 373, in load_checkpoint_new
File "megatron_lm/tools/generate_samples_gpt2.py", line 89, in main
_ = load_checkpoint(model, None, None)
_ = load_checkpoint(model, None, None)
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 183, in load_checkpoint
load_checkpoint_new(model, optimizer, lr_scheduler)
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 183, in load_checkpoint
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 373, in load_checkpoint_new
torch.distributed.barrier()
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 2709, in barrier
load_checkpoint_new(model, optimizer, lr_scheduler)
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 373, in load_checkpoint_new
load_checkpoint_new(model, optimizer, lr_scheduler)
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 373, in load_checkpoint_new
torch.distributed.barrier()
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 2709, in barrier
work = default_pg.barrier(opts=opts)
torch.distributed.barrier()
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, invalid usage, NCCL version 21.0.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
torch.distributed.barrier()
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 2709, in barrier
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 2709, in barrier
work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, invalid usage, NCCL version 21.0.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, invalid usage, NCCL version 21.0.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, invalid usage, NCCL version 21.0.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, invalid usage, NCCL version 21.0.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
work = default_pg.barrier(opts=opts)
work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, invalid usage, NCCL version 21.0.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, invalid usage, NCCL version 21.0.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
Traceback (most recent call last):
File "megatron_lm/tools/generate_samples_gpt2.py", line 104, in <module>
main()
File "megatron_lm/tools/generate_samples_gpt2.py", line 89, in main
_ = load_checkpoint(model, None, None)
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 183, in load_checkpoint
load_checkpoint_new(model, optimizer, lr_scheduler)
File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 373, in load_checkpoint_new
torch.distributed.barrier()
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 2709, in barrier
work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, invalid usage, NCCL version 21.0.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 554) of binary: /opt/conda/bin/python3
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.8.0a0+17f8c32', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/run.py", line 719, in main
run(args)
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/run.py", line 713, in run
)(*cmd_args)
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
megatron_lm/tools/generate_samples_gpt2.py FAILED
The text was updated successfully, but these errors were encountered:
I pulled the docker image and downloaded the checkpoint. When running generate_interactive.sh, I encountered the following error:
The text was updated successfully, but these errors were encountered: