[train] remove DEFAULT_NCCL_SOCKET_IFNAME (ray-project#42808)
Signed-off-by: matthewdeng <matt@anyscale.com>
Signed-off-by: tterrysun <terry@anyscale.com>
matthewdeng authored and tterrysun committed Feb 14, 2024
1 parent 382e386 commit b310ba4
Showing 10 changed files with 50 additions and 111 deletions.
79 changes: 48 additions & 31 deletions doc/source/train/user-guides/using-gpus.rst
@@ -104,6 +104,54 @@ You can get a list of associated devices with :meth:`ray.train.torch.get_devices`
trainer.fit()


(PyTorch) Setting the communication backend
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

PyTorch Distributed supports multiple `backends <https://pytorch.org/docs/stable/distributed.html#backends>`__
for communicating tensors across workers. By default, Ray Train uses NCCL when ``use_gpu=True`` and Gloo otherwise.

If you want to override this setting explicitly, you can configure a :class:`~ray.train.torch.TorchConfig`
and pass it to the :class:`~ray.train.torch.TorchTrainer`.

.. testcode::
:hide:

num_training_workers = 1

.. testcode::

from ray.train.torch import TorchConfig, TorchTrainer

trainer = TorchTrainer(
train_func,
scaling_config=ScalingConfig(
num_workers=num_training_workers,
use_gpu=True, # Defaults to NCCL
),
torch_config=TorchConfig(backend="gloo"),
)

(NCCL) Setting the communication network interface
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

When using NCCL for distributed training, you can configure the network interface cards
that are used for communicating between GPUs by setting the
`NCCL_SOCKET_IFNAME <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-socket-ifname>`__
environment variable.

To ensure that the environment variable is set for all training workers, you can pass it
in a :ref:`Ray runtime environment <runtime-environments>`:

.. testcode::
:skipif: True

import ray

runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens5"}}
ray.init(runtime_env=runtime_env)

trainer = TorchTrainer(...)

Setting the resources per worker
--------------------------------
If you want to allocate more than one CPU or GPU per training worker, or if you
@@ -145,37 +193,6 @@ will be assigned the same CUDA device.
)


Setting the communication backend (PyTorch)
-------------------------------------------

.. note::

This is an advanced setting. In most cases, you don't have to change this setting.

You can set the PyTorch distributed communication backend (e.g. GLOO or NCCL) by passing a
:class:`~ray.train.torch.TorchConfig` to the :class:`~ray.train.torch.TorchTrainer`.

See the `PyTorch API reference <https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group>`__
for valid options.

.. testcode::
:hide:

num_training_workers = 1

.. testcode::

from ray.train.torch import TorchConfig, TorchTrainer

trainer = TorchTrainer(
train_func,
scaling_config=ScalingConfig(
num_workers=num_training_workers,
use_gpu=True,
),
torch_config=TorchConfig(backend="gloo"),
)


.. _train_trainer_resources:

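Because this commit drops the automatic interface filter, here is a minimal sketch (not part of the diff) of restoring the old behavior explicitly through a Ray runtime environment. The filter value "^lo,docker,veth" is the DEFAULT_NCCL_SOCKET_IFNAME constant deleted below; the worker count and the train_func body are placeholders.

import ray
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

# Re-apply the previous default filter, which excludes loopback, docker,
# and veth virtual interfaces, now that Ray Train no longer sets it.
ray.init(runtime_env={"env_vars": {"NCCL_SOCKET_IFNAME": "^lo,docker,veth"}})

def train_func():
    ...  # placeholder training loop

trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
)
trainer.fit()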
3 changes: 0 additions & 3 deletions python/ray/air/util/torch_dist.py
@@ -16,7 +16,6 @@
import ray
from ray.actor import ActorHandle
from ray.train._internal.utils import get_address_and_port
from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
from ray.air._internal.torch_utils import get_devices


@@ -69,8 +68,6 @@ def _init_torch_distributed(
# All workers on a same node should share the same set of
# visible GPUs. Otherwise they can't talk among themselves.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gid) for gid in gpu_ids)
if "NCCL_SOCKET_IFNAME" not in os.environ:
os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME

init_process_group_kwargs.update(
dict(
3 changes: 0 additions & 3 deletions python/ray/train/constants.py
@@ -97,9 +97,6 @@ def _get_defaults_results_dir() -> str:
RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE,
}

# Blacklist virtualized networking.
DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker,veth"

# Key for AIR Checkpoint metadata in TrainingResult metadata
CHECKPOINT_METADATA_KEY = "checkpoint_metadata"

25 changes: 1 addition & 24 deletions python/ray/train/tests/test_gpu.py
@@ -16,10 +16,8 @@
from ray import train
from ray.exceptions import RayTaskError
from ray.train import ScalingConfig
from ray.train._internal.worker_group import WorkerGroup
from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
from ray.train.examples.pytorch.torch_linear_example import LinearDataset
from ray.train.torch.config import TorchConfig, _TorchBackend
from ray.train.torch.config import TorchConfig
from ray.train.torch.torch_trainer import TorchTrainer
from ray.train.trainer import TrainingFailedError

@@ -308,27 +306,6 @@ def train_func():
assert result1.metrics["loss"] == result2.metrics["loss"]


@pytest.mark.parametrize("nccl_socket_ifname", ["", "ens3"])
def test_torch_backend_nccl_socket_ifname(ray_start_4_cpus_2_gpus, nccl_socket_ifname):
worker_group = WorkerGroup(num_workers=2, num_gpus_per_worker=1)

if nccl_socket_ifname:

def set_env_var():
os.environ["NCCL_SOCKET_IFNAME"] = nccl_socket_ifname

worker_group.execute(set_env_var)

def assert_env_var_set():
value = nccl_socket_ifname if nccl_socket_ifname else DEFAULT_NCCL_SOCKET_IFNAME
assert os.environ["NCCL_SOCKET_IFNAME"] == value

torch_backend = _TorchBackend()
torch_backend.on_start(worker_group, backend_config=TorchConfig(backend="nccl"))

worker_group.execute(assert_env_var_set)


def test_torch_fail_on_nccl_timeout(ray_start_4_cpus_2_gpus):
"""Tests that TorchTrainer raises exception on NCCL timeouts."""

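With the dedicated NCCL_SOCKET_IFNAME backend test deleted above, a minimal sketch of how one could still confirm that an interface name set through the runtime environment reaches worker processes; the task name read_ifname and the interface "ens5" are illustrative, not from the commit.

import os
import ray

ray.init(runtime_env={"env_vars": {"NCCL_SOCKET_IFNAME": "ens5"}})

@ray.remote
def read_ifname():
    # Worker processes inherit env_vars from the runtime environment.
    return os.environ.get("NCCL_SOCKET_IFNAME")

assert ray.get(read_ifname.remote()) == "ens5"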
18 changes: 0 additions & 18 deletions python/ray/train/torch/config.py
@@ -11,7 +11,6 @@
from ray.train._internal.utils import get_address_and_port
from ray.train._internal.worker_group import WorkerGroup
from ray.train.backend import Backend, BackendConfig
from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
from ray.util import PublicAPI

logger = logging.getLogger(__name__)
@@ -45,20 +44,6 @@ def backend_cls(self):
return _TorchBackend


def _set_nccl_network_interface():
"""Set the appropriate NCCL network interface to use."""

if "NCCL_SOCKET_IFNAME" not in os.environ:
logger.debug(
f"Setting NCCL_SOCKET_IFNAME to {DEFAULT_NCCL_SOCKET_IFNAME} "
f"to prioritize ethernet connection. To override this behavior, set the "
f"`NCCL_SOCKET_IFNAME` environment variable in your Ray runtime "
"environment: "
"`ray.init(runtime_env={{'env_vars': {'NCCL_SOCKET_IFNAME': 'ens5'}}}`"
)
os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME


def _setup_torch_process_group(
backend: str,
world_rank: int,
@@ -155,9 +140,6 @@ def on_start(self, worker_group: WorkerGroup, backend_config: TorchConfig):
else:
backend = backend_config.backend

if backend == "nccl":
worker_group.execute(_set_nccl_network_interface)

master_addr, master_port = worker_group.execute_single(
0, get_address_and_port
)
6 changes: 1 addition & 5 deletions release/air_tests/air_benchmarks/workloads/benchmark_util.py
@@ -7,7 +7,7 @@
from ray.air.util.node import _force_on_node

import ray
from typing import Any, List, Dict, Union, Callable
from typing import List, Dict, Union, Callable


def schedule_remote_fn_on_all_nodes(
@@ -77,16 +77,12 @@ def run_fn(self, fn: Callable, *args, **kwargs):
def create_actors_with_options(
num_actors: int,
resources: Dict[str, Union[float, int]],
runtime_env: Dict[str, Any] = None,
) -> List[ray.actor.ActorHandle]:
num_cpus = resources.pop("CPU", 1)
num_gpus = resources.pop("GPU", 0)

options = {"num_cpus": num_cpus, "num_gpus": num_gpus, "resources": resources}

if runtime_env:
options["runtime_env"] = runtime_env

return [CommandRunner.options(**options).remote() for _ in range(num_actors)]


18 changes: 0 additions & 18 deletions release/air_tests/air_benchmarks/workloads/torch_benchmark.py
@@ -18,17 +18,6 @@
VANILLA_RESULT_JSON = "/tmp/vanilla_out.json"


def find_network_interface():
for iface in os.listdir("/sys/class/net"):
if iface.startswith("ens"):
network_interface = iface
break
else:
network_interface = "^lo,docker"

return network_interface


# Define model
class NeuralNetwork(nn.Module):
def __init__(self):
@@ -311,19 +300,12 @@ def train_torch_vanilla(

num_epochs = config["epochs"]

try:
nccl_network_interface = find_network_interface()
runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": nccl_network_interface}}
except Exception:
runtime_env = {}

actors = create_actors_with_options(
num_actors=num_workers,
resources={
"CPU": cpus_per_worker,
"GPU": int(use_gpu),
},
runtime_env=runtime_env,
)

run_fn_on_actors(actors=actors, fn=lambda: os.environ.pop("OMP_NUM_THREADS", None))
@@ -117,7 +117,6 @@ def main(
ray.init(
runtime_env={
"working_dir": os.path.dirname(__file__),
"env_vars": {"NCCL_SOCKET_IFNAME": "ens"},
}
)
prepare_mnist()
2 changes: 0 additions & 2 deletions release/release_tests.yaml
@@ -1128,8 +1128,6 @@
cluster:
byod:
type: gpu
runtime_env:
- NCCL_SOCKET_IFNAME=ens
post_build_script: byod_xgboost_test.sh
cluster_compute: tpl_gpu_small_aws.yaml

6 changes: 0 additions & 6 deletions release/xgboost_tests/app_config_gpu.yaml
@@ -1,10 +1,4 @@
base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] }}
env_vars:
# Manually set NCCL_SOCKET_IFNAME to "ens" so NCCL training works on
# anyscale_default_cloud.
# See https://github.com/pytorch/pytorch/issues/68893 for more details.
NCCL_SOCKET_IFNAME: ens

debian_packages:
- curl

