From b310ba4822e57362cf78b7dfbb5fda0a3086d116 Mon Sep 17 00:00:00 2001
From: matthewdeng
Date: Sun, 4 Feb 2024 16:26:09 -0800
Subject: [PATCH] [train] remove DEFAULT_NCCL_SOCKET_IFNAME (#42808)

Signed-off-by: matthewdeng
Signed-off-by: tterrysun
---
 doc/source/train/user-guides/using-gpus.rst | 79 +++++++++++--------
 python/ray/air/util/torch_dist.py           |  3 -
 python/ray/train/constants.py               |  3 -
 python/ray/train/tests/test_gpu.py          | 25 +-----
 python/ray/train/torch/config.py            | 18 -----
 .../workloads/benchmark_util.py             |  6 +-
 .../workloads/torch_benchmark.py            | 18 -----
 .../workloads/tune_torch_benchmark.py       |  1 -
 release/release_tests.yaml                  |  2 -
 release/xgboost_tests/app_config_gpu.yaml   |  6 --
 10 files changed, 50 insertions(+), 111 deletions(-)

diff --git a/doc/source/train/user-guides/using-gpus.rst b/doc/source/train/user-guides/using-gpus.rst
index cea84be6a4996f..24526d552f6f8c 100644
--- a/doc/source/train/user-guides/using-gpus.rst
+++ b/doc/source/train/user-guides/using-gpus.rst
@@ -104,6 +104,54 @@ You can get a list of associated devices with :meth:`ray.train.torch.get_devices

     trainer.fit()

+(PyTorch) Setting the communication backend
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+PyTorch Distributed supports multiple `backends <https://pytorch.org/docs/stable/distributed.html#backends>`__
+for communicating tensors across workers. By default, Ray Train uses NCCL when ``use_gpu=True`` and Gloo otherwise.
+
+If you want to explicitly override this setting, you can configure a :class:`~ray.train.torch.TorchConfig`
+and pass it into the :class:`~ray.train.torch.TorchTrainer`.
+
+.. testcode::
+    :hide:
+
+    num_training_workers = 1
+
+.. testcode::
+
+    from ray.train.torch import TorchConfig, TorchTrainer
+
+    trainer = TorchTrainer(
+        train_func,
+        scaling_config=ScalingConfig(
+            num_workers=num_training_workers,
+            use_gpu=True,  # Defaults to NCCL
+        ),
+        torch_config=TorchConfig(backend="gloo"),
+    )
+
+(NCCL) Setting the communication network interface
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When using NCCL for distributed training, you can configure the network interface cards
+that are used for communication between GPUs by setting the
+`NCCL_SOCKET_IFNAME <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-socket-ifname>`__
+environment variable.
+
+To ensure that the environment variable is set for all training workers, you can pass it
+in a :ref:`Ray runtime environment <runtime-environments>`:
+
+.. testcode::
+    :skipif: True
+
+    import ray
+
+    runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens5"}}
+    ray.init(runtime_env=runtime_env)
+
+    trainer = TorchTrainer(...)
+
 Setting the resources per worker
 --------------------------------
 If you want to allocate more than one CPU or GPU per training worker, or if you
@@ -145,37 +193,6 @@ will be assigned the same CUDA device.

     )

-
-Setting the communication backend (PyTorch)
---------------------------------------------
-
-.. note::
-
-    This is an advanced setting. In most cases, you don't have to change this setting.
-
-You can set the PyTorch distributed communication backend (e.g. GLOO or NCCL) by passing a
-:class:`~ray.train.torch.TorchConfig` to the :class:`~ray.train.torch.TorchTrainer`.
-
-See the `PyTorch API reference <https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group>`__
-for valid options.
-
-.. testcode::
-    :hide:
-
-    num_training_workers = 1
-
-.. testcode::
-
-    from ray.train.torch import TorchConfig, TorchTrainer
-
-    trainer = TorchTrainer(
-        train_func,
-        scaling_config=ScalingConfig(
-            num_workers=num_training_workers,
-            use_gpu=True,
-        ),
-        torch_config=TorchConfig(backend="gloo"),
-    )
-
 .. _train_trainer_resources:
diff --git a/python/ray/air/util/torch_dist.py b/python/ray/air/util/torch_dist.py
index 9bb817008b1048..ba3889b72dabf4 100644
--- a/python/ray/air/util/torch_dist.py
+++ b/python/ray/air/util/torch_dist.py
@@ -16,7 +16,6 @@
 import ray
 from ray.actor import ActorHandle
 from ray.train._internal.utils import get_address_and_port
-from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
 from ray.air._internal.torch_utils import get_devices


@@ -69,8 +68,6 @@ def _init_torch_distributed(
         # All workers on a same node should share the same set of
         # visible GPUs. Otherwise they can't talk among themselves.
         os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gid) for gid in gpu_ids)
-        if "NCCL_SOCKET_IFNAME" not in os.environ:
-            os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME

     init_process_group_kwargs.update(
         dict(
diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py
index e89354f4904745..efb36bd57eab1e 100644
--- a/python/ray/train/constants.py
+++ b/python/ray/train/constants.py
@@ -97,9 +97,6 @@ def _get_defaults_results_dir() -> str:
     RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE,
 }

-# Blacklist virtualized networking.
-DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker,veth"
-
 # Key for AIR Checkpoint metadata in TrainingResult metadata
 CHECKPOINT_METADATA_KEY = "checkpoint_metadata"

diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py
index 314b1afa78de57..2e088f3bff7a86 100644
--- a/python/ray/train/tests/test_gpu.py
+++ b/python/ray/train/tests/test_gpu.py
@@ -16,10 +16,8 @@
 from ray import train
 from ray.exceptions import RayTaskError
 from ray.train import ScalingConfig
-from ray.train._internal.worker_group import WorkerGroup
-from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
 from ray.train.examples.pytorch.torch_linear_example import LinearDataset
-from ray.train.torch.config import TorchConfig, _TorchBackend
+from ray.train.torch.config import TorchConfig
 from ray.train.torch.torch_trainer import TorchTrainer
 from ray.train.trainer import TrainingFailedError

@@ -308,27 +306,6 @@ def train_func():

     assert result1.metrics["loss"] == result2.metrics["loss"]


-@pytest.mark.parametrize("nccl_socket_ifname", ["", "ens3"])
-def test_torch_backend_nccl_socket_ifname(ray_start_4_cpus_2_gpus, nccl_socket_ifname):
-    worker_group = WorkerGroup(num_workers=2, num_gpus_per_worker=1)
-
-    if nccl_socket_ifname:
-
-        def set_env_var():
-            os.environ["NCCL_SOCKET_IFNAME"] = nccl_socket_ifname
-
-        worker_group.execute(set_env_var)
-
-    def assert_env_var_set():
-        value = nccl_socket_ifname if nccl_socket_ifname else DEFAULT_NCCL_SOCKET_IFNAME
-        assert os.environ["NCCL_SOCKET_IFNAME"] == value
-
-    torch_backend = _TorchBackend()
-    torch_backend.on_start(worker_group, backend_config=TorchConfig(backend="nccl"))
-
-    worker_group.execute(assert_env_var_set)
-
-
 def test_torch_fail_on_nccl_timeout(ray_start_4_cpus_2_gpus):
     """Tests that TorchTrainer raises exception on NCCL timeouts."""

diff --git a/python/ray/train/torch/config.py b/python/ray/train/torch/config.py
index 3e534c2204dece..da05ce4736ab7c 100644
--- a/python/ray/train/torch/config.py
+++ b/python/ray/train/torch/config.py
@@ -11,7 +11,6 @@
 from ray.train._internal.utils import get_address_and_port
 from ray.train._internal.worker_group import WorkerGroup
 from ray.train.backend import Backend, BackendConfig
-from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
 from ray.util import PublicAPI

 logger = logging.getLogger(__name__)
@@ -45,20 +44,6 @@ def backend_cls(self):
         return _TorchBackend


-def _set_nccl_network_interface():
-    """Set the appropriate NCCL network interface to use."""
-
-    if "NCCL_SOCKET_IFNAME" not in os.environ:
-        logger.debug(
-            f"Setting NCCL_SOCKET_IFNAME to {DEFAULT_NCCL_SOCKET_IFNAME} "
-            f"to prioritize ethernet connection. To override this behavior, set the "
-            f"`NCCL_SOCKET_IFNAME` environment variable in your Ray runtime "
-            "environment: "
-            "`ray.init(runtime_env={{'env_vars': {'NCCL_SOCKET_IFNAME': 'ens5'}}}`"
-        )
-        os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME
-
-
 def _setup_torch_process_group(
     backend: str,
     world_rank: int,
@@ -155,9 +140,6 @@ def on_start(self, worker_group: WorkerGroup, backend_config: TorchConfig):
         else:
             backend = backend_config.backend

-        if backend == "nccl":
-            worker_group.execute(_set_nccl_network_interface)
-
         master_addr, master_port = worker_group.execute_single(
             0, get_address_and_port
         )
diff --git a/release/air_tests/air_benchmarks/workloads/benchmark_util.py b/release/air_tests/air_benchmarks/workloads/benchmark_util.py
index 7bee5d682aff18..5fbaaf8c285a32 100644
--- a/release/air_tests/air_benchmarks/workloads/benchmark_util.py
+++ b/release/air_tests/air_benchmarks/workloads/benchmark_util.py
@@ -7,7 +7,7 @@
 from ray.air.util.node import _force_on_node
 import ray

-from typing import Any, List, Dict, Union, Callable
+from typing import List, Dict, Union, Callable


 def schedule_remote_fn_on_all_nodes(
@@ -77,16 +77,12 @@ def run_fn(self, fn: Callable, *args, **kwargs):


 def create_actors_with_options(
     num_actors: int,
     resources: Dict[str, Union[float, int]],
-    runtime_env: Dict[str, Any] = None,
 ) -> List[ray.actor.ActorHandle]:
     num_cpus = resources.pop("CPU", 1)
     num_gpus = resources.pop("GPU", 0)

     options = {"num_cpus": num_cpus, "num_gpus": num_gpus, "resources": resources}

-    if runtime_env:
-        options["runtime_env"] = runtime_env
-
     return [CommandRunner.options(**options).remote() for _ in range(num_actors)]
diff --git a/release/air_tests/air_benchmarks/workloads/torch_benchmark.py b/release/air_tests/air_benchmarks/workloads/torch_benchmark.py
index 0b7d8c8a6ded32..0ce327cb6e5dd7 100644
--- a/release/air_tests/air_benchmarks/workloads/torch_benchmark.py
+++ b/release/air_tests/air_benchmarks/workloads/torch_benchmark.py
@@ -18,17 +18,6 @@
 VANILLA_RESULT_JSON = "/tmp/vanilla_out.json"


-def find_network_interface():
-    for iface in os.listdir("/sys/class/net"):
-        if iface.startswith("ens"):
-            network_interface = iface
-            break
-    else:
-        network_interface = "^lo,docker"
-
-    return network_interface
-
-
 # Define model
 class NeuralNetwork(nn.Module):
     def __init__(self):
@@ -311,19 +300,12 @@ def train_torch_vanilla(

     num_epochs = config["epochs"]

-    try:
-        nccl_network_interface = find_network_interface()
-        runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": nccl_network_interface}}
-    except Exception:
-        runtime_env = {}
-
     actors = create_actors_with_options(
         num_actors=num_workers,
         resources={
             "CPU": cpus_per_worker,
             "GPU": int(use_gpu),
         },
-        runtime_env=runtime_env,
     )

     run_fn_on_actors(actors=actors, fn=lambda: os.environ.pop("OMP_NUM_THREADS", None))
diff --git a/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py b/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py
index 4b60b38b137cd3..0d7b594d14976b 100644
--- a/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py
+++ b/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py
@@ -117,7 +117,6 @@ def main(
     ray.init(
         runtime_env={
             "working_dir": os.path.dirname(__file__),
-            "env_vars": {"NCCL_SOCKET_IFNAME": "ens"},
         }
     )
     prepare_mnist()
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index 134483567db6e2..93fc16abe16405 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -1128,8 +1128,6 @@
   cluster:
     byod:
       type: gpu
-      runtime_env:
-        - NCCL_SOCKET_IFNAME=ens
       post_build_script: byod_xgboost_test.sh
     cluster_compute: tpl_gpu_small_aws.yaml

diff --git a/release/xgboost_tests/app_config_gpu.yaml b/release/xgboost_tests/app_config_gpu.yaml
index 55db2bfa14f2d1..193a05e5c1df27 100755
--- a/release/xgboost_tests/app_config_gpu.yaml
+++ b/release/xgboost_tests/app_config_gpu.yaml
@@ -1,10 +1,4 @@
 base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] }}
-env_vars:
-  # Manually set NCCL_SOCKET_IFNAME to "ens" so NCCL training works on
-  # anyscale_default_cloud.
-  # See https://github.com/pytorch/pytorch/issues/68893 for more details.
-  NCCL_SOCKET_IFNAME: ens
-
 debian_packages:
   - curl