From 203d651155c8d6a0d66b4e0a850d9d595cc5afa8 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Wed, 24 Sep 2025 15:42:11 -0500 Subject: [PATCH 1/4] Skip trying to access non-existent nccl symbols on ROCm Signed-off-by: Gregory Shtrasberg --- .../device_communicators/pynccl_wrapper.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index c3e99e177e2d..98cb33ff656a 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -31,6 +31,7 @@ from torch.distributed import ReduceOp from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils import find_nccl_library logger = init_logger(__name__) @@ -275,10 +276,18 @@ def __init__(self, so_file: Optional[str] = None): if so_file not in NCCLLibrary.path_to_dict_mapping: _funcs: dict[str, Any] = {} for func in NCCLLibrary.exported_functions: - f = getattr(self.lib, func.name) - f.restype = func.restype - f.argtypes = func.argtypes - _funcs[func.name] = f + try: + f = getattr(self.lib, func.name) + f.restype = func.restype + f.argtypes = func.argtypes + _funcs[func.name] = f + except: + if current_platform.is_rocm() and func.name in [ + "ncclCommWindowRegister", + "ncclCommWindowDeregister" + ]: + continue + raise NCCLLibrary.path_to_dict_mapping[so_file] = _funcs self._funcs = NCCLLibrary.path_to_dict_mapping[so_file] From 978f4ee99a83500271b19542100f60947cdef455 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Wed, 24 Sep 2025 15:49:11 -0500 Subject: [PATCH 2/4] Replace try-except with a check for None Signed-off-by: Gregory Shtrasberg --- .../device_communicators/pynccl_wrapper.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 98cb33ff656a..3085d03616e3 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -276,18 +276,17 @@ def __init__(self, so_file: Optional[str] = None): if so_file not in NCCLLibrary.path_to_dict_mapping: _funcs: dict[str, Any] = {} for func in NCCLLibrary.exported_functions: - try: - f = getattr(self.lib, func.name) - f.restype = func.restype - f.argtypes = func.argtypes - _funcs[func.name] = f - except: - if current_platform.is_rocm() and func.name in [ - "ncclCommWindowRegister", - "ncclCommWindowDeregister" - ]: - continue - raise + f = getattr(self.lib, func.name) + if f is None and func.name in [ + "ncclCommWindowRegister", "ncclCommWindowDeregister" + ] and current_platform.is_rocm(): + # These symbols require NCCL >= 2.27.03, and having + # an exception here on ROCm platform is not allowed + # during graph capturing + continue + f.restype = func.restype + f.argtypes = func.argtypes + _funcs[func.name] = f NCCLLibrary.path_to_dict_mapping[so_file] = _funcs self._funcs = NCCLLibrary.path_to_dict_mapping[so_file] From 44304ad549a1ddc7974e215441f8ff994261a635 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Wed, 24 Sep 2025 16:23:29 -0500 Subject: [PATCH 3/4] Return the try-except Signed-off-by: Gregory Shtrasberg --- .../device_communicators/pynccl_wrapper.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 3085d03616e3..89ed10e049f3 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -276,17 +276,21 @@ def __init__(self, so_file: Optional[str] = None): if so_file not in NCCLLibrary.path_to_dict_mapping: _funcs: dict[str, Any] = {} for func in NCCLLibrary.exported_functions: - f = getattr(self.lib, func.name) - if f is None and func.name in [ - "ncclCommWindowRegister", "ncclCommWindowDeregister" - ] and current_platform.is_rocm(): - # These symbols require NCCL >= 2.27.03, and having - # an exception here on ROCm platform is not allowed - # during graph capturing - continue - f.restype = func.restype - f.argtypes = func.argtypes - _funcs[func.name] = f + try: + f = getattr(self.lib, func.name) + f.restype = func.restype + f.argtypes = func.argtypes + _funcs[func.name] = f + except AttributeError: + if current_platform.is_rocm() and func.name in [ + "ncclCommWindowRegister", + "ncclCommWindowDeregister" + ]: + # These symbols require NCCL >= 2.27.03, and having + # an exception here on ROCm platform is not allowed + # during graph capturing + continue + raise NCCLLibrary.path_to_dict_mapping[so_file] = _funcs self._funcs = NCCLLibrary.path_to_dict_mapping[so_file] From 6fab3f1007d8444ebe4ecfed6412cccec6b95bf3 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 25 Sep 2025 16:50:37 +0000 Subject: [PATCH 4/4] Add warning about missing symbols in case the feature using them is enabled Signed-off-by: Gregory Shtrasberg --- .../device_communicators/pynccl_wrapper.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 89ed10e049f3..2e9a4e024de4 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -30,6 +30,7 @@ import torch from torch.distributed import ReduceOp +from vllm import envs from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import find_nccl_library @@ -282,14 +283,20 @@ def __init__(self, so_file: Optional[str] = None): f.argtypes = func.argtypes _funcs[func.name] = f except AttributeError: - if current_platform.is_rocm() and func.name in [ + if func.name in [ "ncclCommWindowRegister", "ncclCommWindowDeregister" ]: - # These symbols require NCCL >= 2.27.03, and having - # an exception here on ROCm platform is not allowed - # during graph capturing - continue + if envs.VLLM_USE_NCCL_SYMM_MEM: + logger.warning_once( + "The symbol %s is not found in the NCCL " + "library %s. To enable VLLM_USE_NCCL_SYMM_MEM " + " please update your NCCL version to >= " + "2.27.03.", func.name, so_file) + if current_platform.is_rocm(): + # Having an exception here on ROCm platform is + # not allowed during graph capturing + continue raise NCCLLibrary.path_to_dict_mapping[so_file] = _funcs self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]