From 8a673cf61f5877fe5d9e6794bb07943a07f25165 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Fri, 19 Sep 2025 22:54:29 -0700
Subject: [PATCH 1/4] gate cudagraph_unsafe tag for torch-2.9

Signed-off-by: Boyuan Feng
---
 tests/compile/silly_attention.py | 9 +++++++--
 vllm/attention/layer.py          | 8 +++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py
index baedafbae99f..2b422ae305ab 100644
--- a/tests/compile/silly_attention.py
+++ b/tests/compile/silly_attention.py
@@ -8,7 +8,12 @@
 import torch
 from torch.library import Library
 
-from vllm.utils import direct_register_custom_op
+from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer
+
+if is_torch_equal_or_newer("2.9.0.dev"):
+    tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, )
+else:
+    tag_cudagraph_unsafe = ()  # type: ignore[assignment]
 
 # Shared library for all compilation test operations
 # Using "silly" namespace to match existing test expectations
@@ -60,5 +65,5 @@ def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
     mutates_args=["out"],
     fake_impl=silly_attention_fake,
     target_lib=silly_lib,
-    tags=(torch._C.Tag.cudagraph_unsafe, ),
+    tags=tag_cudagraph_unsafe,
 )
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 544a72052442..eaaa5a4dbe04 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -25,13 +25,15 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.models.vision import get_vit_attn_backend
 from vllm.platforms import _Backend, current_platform
-from vllm.utils import GiB_bytes, direct_register_custom_op
+from vllm.utils import (GiB_bytes, direct_register_custom_op,
+                        is_torch_equal_or_newer)
 
 logger = init_logger(__name__)
 
 USE_XFORMERS_OPS = None
-try:
+
+if is_torch_equal_or_newer("2.9.0.dev"):
     tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, )
-except AttributeError:
+else:
     tag_cudagraph_unsafe = ()  # type: ignore[assignment]
 
 

From 68ed7cced9ccefc9ae0c29423e1b0e315a38645a Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Fri, 19 Sep 2025 22:58:46 -0700
Subject: [PATCH 2/4] nit

Signed-off-by: Boyuan Feng
---
 tests/compile/silly_attention.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py
index 2b422ae305ab..db4dd4061a63 100644
--- a/tests/compile/silly_attention.py
+++ b/tests/compile/silly_attention.py
@@ -8,12 +8,8 @@
 import torch
 from torch.library import Library
 
-from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer
-
-if is_torch_equal_or_newer("2.9.0.dev"):
-    tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, )
-else:
-    tag_cudagraph_unsafe = ()  # type: ignore[assignment]
+from vllm.attention.layer import tag_cudagraph_unsafe
+from vllm.utils import direct_register_custom_op
 
 # Shared library for all compilation test operations
 # Using "silly" namespace to match existing test expectations

From 56d5f48d586e373162ac27dc3084fe68070060e5 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Fri, 19 Sep 2025 23:04:44 -0700
Subject: [PATCH 3/4] move to vllm.utils.__init__

Signed-off-by: Boyuan Feng
---
 tests/compile/silly_attention.py | 3 +--
 vllm/attention/layer.py          | 7 +------
 vllm/utils/__init__.py           | 6 ++++++
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py
index db4dd4061a63..e69888b737e9 100644
--- a/tests/compile/silly_attention.py
+++ b/tests/compile/silly_attention.py
@@ -8,8 +8,7 @@
 import torch
 from torch.library import Library
 
-from vllm.attention.layer import tag_cudagraph_unsafe
-from vllm.utils import direct_register_custom_op
+from vllm.utils import direct_register_custom_op, tag_cudagraph_unsafe
 
 # Shared library for all compilation test operations
 # Using "silly" namespace to match existing test expectations
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index eaaa5a4dbe04..7ff064c0b968 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -26,16 +26,11 @@ from vllm.model_executor.models.vision import get_vit_attn_backend
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import (GiB_bytes, direct_register_custom_op,
-                        is_torch_equal_or_newer)
+                        tag_cudagraph_unsafe)
 
 logger = init_logger(__name__)
 
 USE_XFORMERS_OPS = None
 
-if is_torch_equal_or_newer("2.9.0.dev"):
-    tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, )
-else:
-    tag_cudagraph_unsafe = ()  # type: ignore[assignment]
-
 
 def check_xformers_availability():
     global USE_XFORMERS_OPS
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index fd1c0af31269..187667a59329 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -3472,3 +3472,9 @@ def length_from_prompt_token_ids_or_embeds(
                 f" prompt_token_ids={prompt_token_len}"
                 f" prompt_embeds={prompt_embeds_len}")
     return prompt_token_len
+
+
+if is_torch_equal_or_newer("2.9.0.dev"):
+    tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, )
+else:
+    tag_cudagraph_unsafe = ()  # type: ignore[assignment]

From 99aee9710507e6b243ee5715306b1291b328be75 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Fri, 19 Sep 2025 23:31:16 -0700
Subject: [PATCH 4/4] gate by is_cuda_alike

Signed-off-by: Boyuan Feng
---
 vllm/utils/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 187667a59329..f225b396b45a 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -3475,6 +3475,8 @@ def length_from_prompt_token_ids_or_embeds(
 
 
 if is_torch_equal_or_newer("2.9.0.dev"):
-    tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, )
+    from vllm.platforms import current_platform
+    tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe,
+                            ) if current_platform.is_cuda_alike() else ()
 else:
     tag_cudagraph_unsafe = ()  # type: ignore[assignment]
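
For reference, a minimal standalone sketch of the gating the series converges on: the
cudagraph_unsafe tag is attached only when torch >= 2.9.0.dev exposes it and the platform
is CUDA-alike. This is an illustration, not the exact module code from PATCH 4/4: it
flattens the nested check in vllm/utils/__init__.py into a single condition and imports
current_platform eagerly, whereas the patch imports it lazily inside the branch.

    import torch

    from vllm.platforms import current_platform
    from vllm.utils import is_torch_equal_or_newer

    # Equivalent to the PATCH 4/4 logic: use the cudagraph_unsafe tag only when
    # torch >= 2.9.0.dev provides it and the current platform is CUDA-alike;
    # otherwise fall back to an empty tag tuple.
    if is_torch_equal_or_newer("2.9.0.dev") and current_platform.is_cuda_alike():
        tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, )
    else:
        tag_cudagraph_unsafe = ()

    # Callers pass the tuple through unchanged, e.g.
    # direct_register_custom_op(..., tags=tag_cudagraph_unsafe), so on older
    # torch or non-CUDA platforms the op is simply registered without the tag.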