[train] remove DEFAULT_NCCL_SOCKET_IFNAME (ray-project#42808)
Signed-off-by: matthewdeng <matt@anyscale.com>
Signed-off-by: tterrysun <terry@anyscale.com>
matthewdeng authored and tterrysun committed Feb 14, 2024
1 parent 382e386 commit b310ba4
Showing 10 changed files with 50 additions and 111 deletions.
79 changes: 48 additions & 31 deletions doc/source/train/user-guides/using-gpus.rst
@@ -104,6 +104,54 @@ You can get a list of associated devices with :meth:`ray.train.torch.get_devices`
trainer.fit()


(PyTorch) Setting the communication backend
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

PyTorch Distributed supports multiple `backends <https://pytorch.org/docs/stable/distributed.html#backends>`__
for communicating tensors across workers. By default, Ray Train uses NCCL when ``use_gpu=True`` and Gloo otherwise.

If you want to override this setting explicitly, you can configure a :class:`~ray.train.torch.TorchConfig`
and pass it to the :class:`~ray.train.torch.TorchTrainer`.

.. testcode::
:hide:

num_training_workers = 1

.. testcode::

from ray.train.torch import TorchConfig, TorchTrainer

trainer = TorchTrainer(
train_func,
scaling_config=ScalingConfig(
num_workers=num_training_workers,
use_gpu=True, # Defaults to NCCL
),
torch_config=TorchConfig(backend="gloo"),
)

(NCCL) Setting the communication network interface
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

When using NCCL for distributed training, you can configure the network interface cards
that are used for communicating between GPUs by setting the
`NCCL_SOCKET_IFNAME <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-socket-ifname>`__
environment variable.

To ensure that the environment variable is set for all training workers, you can pass it
in a :ref:`Ray runtime environment <runtime-environments>`:

.. testcode::
:skipif: True

import ray

runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens5"}}
ray.init(runtime_env=runtime_env)

trainer = TorchTrainer(...)

Setting the resources per worker
--------------------------------
If you want to allocate more than one CPU or GPU per training worker, or if you
@@ -145,37 +193,6 @@ will be assigned the same CUDA device.
)


Setting the communication backend (PyTorch)
-------------------------------------------

.. note::

This is an advanced setting. In most cases, you don't have to change this setting.

You can set the PyTorch distributed communication backend (e.g. GLOO or NCCL) by passing a
:class:`~ray.train.torch.TorchConfig` to the :class:`~ray.train.torch.TorchTrainer`.

See the `PyTorch API reference <https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group>`__
for valid options.

.. testcode::
:hide:

num_training_workers = 1

.. testcode::

from ray.train.torch import TorchConfig, TorchTrainer

trainer = TorchTrainer(
train_func,
scaling_config=ScalingConfig(
num_workers=num_training_workers,
use_gpu=True,
),
torch_config=TorchConfig(backend="gloo"),
)


.. _train_trainer_resources:

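Because this commit drops the automatic interface filter, here is a minimal sketch (not part of the diff) of restoring the old behavior explicitly through a Ray runtime environment. The filter value "^lo,docker,veth" is the DEFAULT_NCCL_SOCKET_IFNAME constant deleted below; the worker count and the train_func body are placeholders.

import ray
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

# Re-apply the previous default filter, which excludes loopback, docker,
# and veth virtual interfaces, now that Ray Train no longer sets it.
ray.init(runtime_env={"env_vars": {"NCCL_SOCKET_IFNAME": "^lo,docker,veth"}})

def train_func():
    ...  # placeholder training loop

trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
)
trainer.fit()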
3 changes: 0 additions & 3 deletions python/ray/air/util/torch_dist.py
@@ -16,7 +16,6 @@
import ray
from ray.actor import ActorHandle
from ray.train._internal.utils import get_address_and_port
from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
from ray.air._internal.torch_utils import get_devices


@@ -69,8 +68,6 @@ def _init_torch_distributed(
# All workers on a same node should share the same set of
# visible GPUs. Otherwise they can't talk among themselves.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gid) for gid in gpu_ids)
if "NCCL_SOCKET_IFNAME" not in os.environ:
os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME

init_process_group_kwargs.update(
dict(
3 changes: 0 additions & 3 deletions python/ray/train/constants.py
@@ -97,9 +97,6 @@ def _get_defaults_results_dir() -> str:
RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE,
}

# Blacklist virtualized networking.
DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker,veth"

# Key for AIR Checkpoint metadata in TrainingResult metadata
CHECKPOINT_METADATA_KEY = "checkpoint_metadata"

25 changes: 1 addition & 24 deletions python/ray/train/tests/test_gpu.py
@@ -16,10 +16,8 @@
from ray import train
from ray.exceptions import RayTaskError
from ray.train import ScalingConfig
from ray.train._internal.worker_group import WorkerGroup
from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
from ray.train.examples.pytorch.torch_linear_example import LinearDataset
from ray.train.torch.config import TorchConfig, _TorchBackend
from ray.train.torch.config import TorchConfig
from ray.train.torch.torch_trainer import TorchTrainer
from ray.train.trainer import TrainingFailedError

@@ -308,27 +306,6 @@ def train_func():
assert result1.metrics["loss"] == result2.metrics["loss"]


@pytest.mark.parametrize("nccl_socket_ifname", ["", "ens3"])
def test_torch_backend_nccl_socket_ifname(ray_start_4_cpus_2_gpus, nccl_socket_ifname):
worker_group = WorkerGroup(num_workers=2, num_gpus_per_worker=1)

if nccl_socket_ifname:

def set_env_var():
os.environ["NCCL_SOCKET_IFNAME"] = nccl_socket_ifname

worker_group.execute(set_env_var)

def assert_env_var_set():
value = nccl_socket_ifname if nccl_socket_ifname else DEFAULT_NCCL_SOCKET_IFNAME
assert os.environ["NCCL_SOCKET_IFNAME"] == value

torch_backend = _TorchBackend()
torch_backend.on_start(worker_group, backend_config=TorchConfig(backend="nccl"))

worker_group.execute(assert_env_var_set)


def test_torch_fail_on_nccl_timeout(ray_start_4_cpus_2_gpus):
"""Tests that TorchTrainer raises exception on NCCL timeouts."""

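With the dedicated NCCL_SOCKET_IFNAME backend test deleted above, a minimal sketch of how one could still confirm that an interface name set through the runtime environment reaches worker processes; the task name read_ifname and the interface "ens5" are illustrative, not from the commit.

import os
import ray

ray.init(runtime_env={"env_vars": {"NCCL_SOCKET_IFNAME": "ens5"}})

@ray.remote
def read_ifname():
    # Worker processes inherit env_vars from the runtime environment.
    return os.environ.get("NCCL_SOCKET_IFNAME")

assert ray.get(read_ifname.remote()) == "ens5"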
18 changes: 0 additions & 18 deletions python/ray/train/torch/config.py
@@ -11,7 +11,6 @@
from ray.train._internal.utils import get_address_and_port
from ray.train._internal.worker_group import WorkerGroup
from ray.train.backend import Backend, BackendConfig
from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
from ray.util import PublicAPI

logger = logging.getLogger(__name__)
@@ -45,20 +44,6 @@ def backend_cls(self):
return _TorchBackend


def _set_nccl_network_interface():
"""Set the appropriate NCCL network interface to use."""

if "NCCL_SOCKET_IFNAME" not in os.environ:
logger.debug(
f"Setting NCCL_SOCKET_IFNAME to {DEFAULT_NCCL_SOCKET_IFNAME} "
f"to prioritize ethernet connection. To override this behavior, set the "
f"`NCCL_SOCKET_IFNAME` environment variable in your Ray runtime "
"environment: "
"`ray.init(runtime_env={{'env_vars': {'NCCL_SOCKET_IFNAME': 'ens5'}}}`"
)
os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME


def _setup_torch_process_group(
backend: str,
world_rank: int,
@@ -155,9 +140,6 @@ def on_start(self, worker_group: WorkerGroup, backend_config: TorchConfig):
else:
backend = backend_config.backend

if backend == "nccl":
worker_group.execute(_set_nccl_network_interface)

master_addr, master_port = worker_group.execute_single(
0, get_address_and_port
)
6 changes: 1 addition & 5 deletions release/air_tests/air_benchmarks/workloads/benchmark_util.py
@@ -7,7 +7,7 @@
from ray.air.util.node import _force_on_node

import ray
from typing import Any, List, Dict, Union, Callable
from typing import List, Dict, Union, Callable


def schedule_remote_fn_on_all_nodes(
@@ -77,16 +77,12 @@ def run_fn(self, fn: Callable, *args, **kwargs):
def create_actors_with_options(
num_actors: int,
resources: Dict[str, Union[float, int]],
runtime_env: Dict[str, Any] = None,
) -> List[ray.actor.ActorHandle]:
num_cpus = resources.pop("CPU", 1)
num_gpus = resources.pop("GPU", 0)

options = {"num_cpus": num_cpus, "num_gpus": num_gpus, "resources": resources}

if runtime_env:
options["runtime_env"] = runtime_env

return [CommandRunner.options(**options).remote() for _ in range(num_actors)]


18 changes: 0 additions & 18 deletions release/air_tests/air_benchmarks/workloads/torch_benchmark.py
@@ -18,17 +18,6 @@
VANILLA_RESULT_JSON = "/tmp/vanilla_out.json"


def find_network_interface():
for iface in os.listdir("/sys/class/net"):
if iface.startswith("ens"):
network_interface = iface
break
else:
network_interface = "^lo,docker"

return network_interface


# Define model
class NeuralNetwork(nn.Module):
def __init__(self):
@@ -311,19 +300,12 @@ def train_torch_vanilla(

num_epochs = config["epochs"]

try:
nccl_network_interface = find_network_interface()
runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": nccl_network_interface}}
except Exception:
runtime_env = {}

actors = create_actors_with_options(
num_actors=num_workers,
resources={
"CPU": cpus_per_worker,
"GPU": int(use_gpu),
},
runtime_env=runtime_env,
)

run_fn_on_actors(actors=actors, fn=lambda: os.environ.pop("OMP_NUM_THREADS", None))
@@ -117,7 +117,6 @@ def main(
ray.init(
runtime_env={
"working_dir": os.path.dirname(__file__),
"env_vars": {"NCCL_SOCKET_IFNAME": "ens"},
}
)
prepare_mnist()
2 changes: 0 additions & 2 deletions release/release_tests.yaml
@@ -1128,8 +1128,6 @@
cluster:
byod:
type: gpu
runtime_env:
- NCCL_SOCKET_IFNAME=ens
post_build_script: byod_xgboost_test.sh
cluster_compute: tpl_gpu_small_aws.yaml

6 changes: 0 additions & 6 deletions release/xgboost_tests/app_config_gpu.yaml
@@ -1,10 +1,4 @@
base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] }}
env_vars:
# Manually set NCCL_SOCKET_IFNAME to "ens" so NCCL training works on
# anyscale_default_cloud.
# See https://github.com/pytorch/pytorch/issues/68893 for more details.
NCCL_SOCKET_IFNAME: ens

debian_packages:
- curl

