From b310ba4822e57362cf78b7dfbb5fda0a3086d116 Mon Sep 17 00:00:00 2001
From: matthewdeng
Date: Sun, 4 Feb 2024 16:26:09 -0800
Subject: [PATCH] [train] remove DEFAULT_NCCL_SOCKET_IFNAME (#42808)

Signed-off-by: matthewdeng
Signed-off-by: tterrysun
---
 doc/source/train/user-guides/using-gpus.rst | 79 +++++++++++--------
 python/ray/air/util/torch_dist.py           |  3 -
 python/ray/train/constants.py               |  3 -
 python/ray/train/tests/test_gpu.py          | 25 +-----
 python/ray/train/torch/config.py            | 18 -----
 .../workloads/benchmark_util.py             |  6 +-
 .../workloads/torch_benchmark.py            | 18 -----
 .../workloads/tune_torch_benchmark.py       |  1 -
 release/release_tests.yaml                  |  2 -
 release/xgboost_tests/app_config_gpu.yaml   |  6 --
 10 files changed, 50 insertions(+), 111 deletions(-)

diff --git a/doc/source/train/user-guides/using-gpus.rst b/doc/source/train/user-guides/using-gpus.rst
index cea84be6a4996f..24526d552f6f8c 100644
--- a/doc/source/train/user-guides/using-gpus.rst
+++ b/doc/source/train/user-guides/using-gpus.rst
@@ -104,6 +104,54 @@ You can get a list of associated devices with :meth:`ray.train.torch.get_devices

     trainer.fit()

+(PyTorch) Setting the communication backend
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+PyTorch Distributed supports multiple `backends <https://pytorch.org/docs/stable/distributed.html#backends>`__
+for communicating tensors across workers. By default, Ray Train uses NCCL when ``use_gpu=True`` and Gloo otherwise.
+
+If you want to explicitly override this setting, you can configure a :class:`~ray.train.torch.TorchConfig`
+and pass it into the :class:`~ray.train.torch.TorchTrainer`.
+
+.. testcode::
+    :hide:
+
+    num_training_workers = 1
+
+.. testcode::
+
+    from ray.train.torch import TorchConfig, TorchTrainer
+
+    trainer = TorchTrainer(
+        train_func,
+        scaling_config=ScalingConfig(
+            num_workers=num_training_workers,
+            use_gpu=True,  # Defaults to NCCL
+        ),
+        torch_config=TorchConfig(backend="gloo"),
+    )
+
+(NCCL) Setting the communication network interface
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When using NCCL for distributed training, you can configure the network interface cards
+that are used for communication between GPUs by setting the
+`NCCL_SOCKET_IFNAME <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-socket-ifname>`__
+environment variable.
+
+To ensure that the environment variable is set for all training workers, you can pass it
+in a :ref:`Ray runtime environment <runtime-environments>`:
+
+.. testcode::
+    :skipif: True
+
+    import ray
+
+    runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens5"}}
+    ray.init(runtime_env=runtime_env)
+
+    trainer = TorchTrainer(...)
+
 Setting the resources per worker
 --------------------------------
 If you want to allocate more than one CPU or GPU per training worker, or if you
@@ -145,37 +193,6 @@ will be assigned the same CUDA device.

     )

-
-Setting the communication backend (PyTorch)
---------------------------------------------
-
-.. note::
-
-    This is an advanced setting. In most cases, you don't have to change this setting.
-
-You can set the PyTorch distributed communication backend (e.g. GLOO or NCCL) by passing a
-:class:`~ray.train.torch.TorchConfig` to the :class:`~ray.train.torch.TorchTrainer`.
-
-See the `PyTorch API reference <https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group>`__
-for valid options.
-
-.. testcode::
-    :hide:
-
-    num_training_workers = 1
-
-.. testcode::
-
-    from ray.train.torch import TorchConfig, TorchTrainer
-
-    trainer = TorchTrainer(
-        train_func,
-        scaling_config=ScalingConfig(
-            num_workers=num_training_workers,
-            use_gpu=True,
-        ),
-        torch_config=TorchConfig(backend="gloo"),
-    )
-
 .. _train_trainer_resources:
diff --git a/python/ray/air/util/torch_dist.py b/python/ray/air/util/torch_dist.py
index 9bb817008b1048..ba3889b72dabf4 100644
--- a/python/ray/air/util/torch_dist.py
+++ b/python/ray/air/util/torch_dist.py
@@ -16,7 +16,6 @@
 import ray
 from ray.actor import ActorHandle
 from ray.train._internal.utils import get_address_and_port
-from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
 from ray.air._internal.torch_utils import get_devices


@@ -69,8 +68,6 @@ def _init_torch_distributed(
         # All workers on a same node should share the same set of
         # visible GPUs. Otherwise they can't talk among themselves.
         os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gid) for gid in gpu_ids)
-        if "NCCL_SOCKET_IFNAME" not in os.environ:
-            os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME

     init_process_group_kwargs.update(
         dict(
diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py
index e89354f4904745..efb36bd57eab1e 100644
--- a/python/ray/train/constants.py
+++ b/python/ray/train/constants.py
@@ -97,9 +97,6 @@ def _get_defaults_results_dir() -> str:
     RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE,
 }

-# Blacklist virtualized networking.
-DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker,veth"
-
 # Key for AIR Checkpoint metadata in TrainingResult metadata
 CHECKPOINT_METADATA_KEY = "checkpoint_metadata"

diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py
index 314b1afa78de57..2e088f3bff7a86 100644
--- a/python/ray/train/tests/test_gpu.py
+++ b/python/ray/train/tests/test_gpu.py
@@ -16,10 +16,8 @@
 from ray import train
 from ray.exceptions import RayTaskError
 from ray.train import ScalingConfig
-from ray.train._internal.worker_group import WorkerGroup
-from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
 from ray.train.examples.pytorch.torch_linear_example import LinearDataset
-from ray.train.torch.config import TorchConfig, _TorchBackend
+from ray.train.torch.config import TorchConfig
 from ray.train.torch.torch_trainer import TorchTrainer
 from ray.train.trainer import TrainingFailedError

@@ -308,27 +306,6 @@ def train_func():

     assert result1.metrics["loss"] == result2.metrics["loss"]


-@pytest.mark.parametrize("nccl_socket_ifname", ["", "ens3"])
-def test_torch_backend_nccl_socket_ifname(ray_start_4_cpus_2_gpus, nccl_socket_ifname):
-    worker_group = WorkerGroup(num_workers=2, num_gpus_per_worker=1)
-
-    if nccl_socket_ifname:
-
-        def set_env_var():
-            os.environ["NCCL_SOCKET_IFNAME"] = nccl_socket_ifname
-
-        worker_group.execute(set_env_var)
-
-    def assert_env_var_set():
-        value = nccl_socket_ifname if nccl_socket_ifname else DEFAULT_NCCL_SOCKET_IFNAME
-        assert os.environ["NCCL_SOCKET_IFNAME"] == value
-
-    torch_backend = _TorchBackend()
-    torch_backend.on_start(worker_group, backend_config=TorchConfig(backend="nccl"))
-
-    worker_group.execute(assert_env_var_set)
-
-
 def test_torch_fail_on_nccl_timeout(ray_start_4_cpus_2_gpus):
     """Tests that TorchTrainer raises exception on NCCL timeouts."""

diff --git a/python/ray/train/torch/config.py b/python/ray/train/torch/config.py
index 3e534c2204dece..da05ce4736ab7c 100644
--- a/python/ray/train/torch/config.py
+++ b/python/ray/train/torch/config.py
@@ -11,7 +11,6 @@
 from ray.train._internal.utils import get_address_and_port
 from ray.train._internal.worker_group import WorkerGroup
 from ray.train.backend import Backend, BackendConfig
-from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
 from ray.util import PublicAPI

 logger = logging.getLogger(__name__)
@@ -45,20 +44,6 @@ def backend_cls(self):
         return _TorchBackend


-def _set_nccl_network_interface():
-    """Set the appropriate NCCL network interface to use."""
-
-    if "NCCL_SOCKET_IFNAME" not in os.environ:
-        logger.debug(
-            f"Setting NCCL_SOCKET_IFNAME to {DEFAULT_NCCL_SOCKET_IFNAME} "
-            f"to prioritize ethernet connection. To override this behavior, set the "
-            f"`NCCL_SOCKET_IFNAME` environment variable in your Ray runtime "
-            "environment: "
-            "`ray.init(runtime_env={{'env_vars': {'NCCL_SOCKET_IFNAME': 'ens5'}}}`"
-        )
-        os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME
-
-
 def _setup_torch_process_group(
     backend: str,
     world_rank: int,
@@ -155,9 +140,6 @@ def on_start(self, worker_group: WorkerGroup, backend_config: TorchConfig):
         else:
             backend = backend_config.backend

-        if backend == "nccl":
-            worker_group.execute(_set_nccl_network_interface)
-
         master_addr, master_port = worker_group.execute_single(
             0, get_address_and_port
         )
diff --git a/release/air_tests/air_benchmarks/workloads/benchmark_util.py b/release/air_tests/air_benchmarks/workloads/benchmark_util.py
index 7bee5d682aff18..5fbaaf8c285a32 100644
--- a/release/air_tests/air_benchmarks/workloads/benchmark_util.py
+++ b/release/air_tests/air_benchmarks/workloads/benchmark_util.py
@@ -7,7 +7,7 @@
 from ray.air.util.node import _force_on_node
 import ray

-from typing import Any, List, Dict, Union, Callable
+from typing import List, Dict, Union, Callable


 def schedule_remote_fn_on_all_nodes(
@@ -77,16 +77,12 @@ def run_fn(self, fn: Callable, *args, **kwargs):


 def create_actors_with_options(
     num_actors: int,
     resources: Dict[str, Union[float, int]],
-    runtime_env: Dict[str, Any] = None,
 ) -> List[ray.actor.ActorHandle]:
     num_cpus = resources.pop("CPU", 1)
     num_gpus = resources.pop("GPU", 0)

     options = {"num_cpus": num_cpus, "num_gpus": num_gpus, "resources": resources}

-    if runtime_env:
-        options["runtime_env"] = runtime_env
-
     return [CommandRunner.options(**options).remote() for _ in range(num_actors)]
diff --git a/release/air_tests/air_benchmarks/workloads/torch_benchmark.py b/release/air_tests/air_benchmarks/workloads/torch_benchmark.py
index 0b7d8c8a6ded32..0ce327cb6e5dd7 100644
--- a/release/air_tests/air_benchmarks/workloads/torch_benchmark.py
+++ b/release/air_tests/air_benchmarks/workloads/torch_benchmark.py
@@ -18,17 +18,6 @@
 VANILLA_RESULT_JSON = "/tmp/vanilla_out.json"


-def find_network_interface():
-    for iface in os.listdir("/sys/class/net"):
-        if iface.startswith("ens"):
-            network_interface = iface
-            break
-    else:
-        network_interface = "^lo,docker"
-
-    return network_interface
-
-
 # Define model
 class NeuralNetwork(nn.Module):
     def __init__(self):
@@ -311,19 +300,12 @@ def train_torch_vanilla(

     num_epochs = config["epochs"]

-    try:
-        nccl_network_interface = find_network_interface()
-        runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": nccl_network_interface}}
-    except Exception:
-        runtime_env = {}
-
     actors = create_actors_with_options(
         num_actors=num_workers,
         resources={
             "CPU": cpus_per_worker,
             "GPU": int(use_gpu),
         },
-        runtime_env=runtime_env,
     )

     run_fn_on_actors(actors=actors, fn=lambda: os.environ.pop("OMP_NUM_THREADS", None))
diff --git a/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py b/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py
index 4b60b38b137cd3..0d7b594d14976b 100644
--- a/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py
+++ b/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py
@@ -117,7 +117,6 @@ def main(
     ray.init(
         runtime_env={
             "working_dir": os.path.dirname(__file__),
-            "env_vars": {"NCCL_SOCKET_IFNAME": "ens"},
         }
     )
     prepare_mnist()
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index 134483567db6e2..93fc16abe16405 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -1128,8 +1128,6 @@
   cluster:
     byod:
       type: gpu
-      runtime_env:
-        - NCCL_SOCKET_IFNAME=ens
       post_build_script: byod_xgboost_test.sh
     cluster_compute: tpl_gpu_small_aws.yaml

diff --git a/release/xgboost_tests/app_config_gpu.yaml b/release/xgboost_tests/app_config_gpu.yaml
index 55db2bfa14f2d1..193a05e5c1df27 100755
--- a/release/xgboost_tests/app_config_gpu.yaml
+++ b/release/xgboost_tests/app_config_gpu.yaml
@@ -1,10 +1,4 @@
 base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] }}
-env_vars:
-  # Manually set NCCL_SOCKET_IFNAME to "ens" so NCCL training works on
-  # anyscale_default_cloud.
-  # See https://github.com/pytorch/pytorch/issues/68893 for more details.
-  NCCL_SOCKET_IFNAME: ens
-
 debian_packages:
   - curl