
enable misc test cases on XPU #38852


Merged (7 commits) on Jun 18, 2025
2 changes: 1 addition & 1 deletion src/transformers/training_args.py
@@ -391,7 +391,7 @@ class TrainingArguments:
installation](https://github.com/intel/intel-extension-for-pytorch).
bf16 (`bool`, *optional*, defaults to `False`):
Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher
NVIDIA architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change.
NVIDIA architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change.
fp16 (`bool`, *optional*, defaults to `False`):
Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.
fp16_opt_level (`str`, *optional*, defaults to 'O1'):
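For context, a minimal sketch (not part of this PR) of what the updated docstring describes: turning on bf16 mixed precision through TrainingArguments; the output directory name is arbitrary and a bf16-capable device (or use_cpu=True) is assumed at construction time.

    from transformers import TrainingArguments

    # bf16 mixed precision is documented for Ampere-or-newer NVIDIA GPUs, Intel XPU,
    # Ascend NPU, or CPU (use_cpu); this only builds the arguments object.
    args = TrainingArguments(output_dir="out", bf16=True)
    print(args.bf16)  # True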
2 changes: 1 addition & 1 deletion tests/generation/test_utils.py
@@ -4892,7 +4892,7 @@ def test_cache_device_map_with_vision_layer_device_map(self):
# If the generate doesn't infer the DECODER device map correctly, this will fail
_ = model.generate(**inputs, max_new_tokens=2, do_sample=False)

@require_torch_gpu
@require_torch_accelerator
def test_cpu_offload_doesnt_compile(self):
"""Test that CPU offload doesn't trigger compilation"""
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM")
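As a rough illustration of the decorator swap above, a hypothetical test gated by require_torch_accelerator runs on any supported torch accelerator (CUDA, XPU, and so on) instead of CUDA only; the test body here is invented.

    import torch
    from transformers.testing_utils import require_torch_accelerator, torch_device

    @require_torch_accelerator
    def test_runs_on_any_accelerator():
        # torch_device resolves to whichever accelerator the test suite detected
        x = torch.ones(2, 2, device=torch_device)
        assert x.sum().item() == 4.0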
4 changes: 2 additions & 2 deletions tests/models/bamba/test_modeling_bamba.py
@@ -623,7 +623,7 @@ def test_simple_generate(self):
(
"xpu",
3,
): "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all doing well. Today I",
): "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all doing well. I am",
Contributor Author:

tweak ground truth on stock PyTorch
Collaborator:

If you don't mind, could you change

                (
                    "cuda",
                    8,
                )

to

                ("cuda", 8)

etc.

no big deal though 🙏

Collaborator:

BTW, could you share what "stock PyTorch" is? You probably mentioned it once, but I can't remember.

Contributor Author:

Oh, sorry for the confusing vocabulary. "Stock PyTorch" in our vocabulary means PyTorch as released by Meta, without any extensions or modifications; it's what users get from pip install torch --index-url xxx.

Contributor Author (replying to the formatting request above):

done, thx.

}
)

@@ -671,7 +671,7 @@ def test_simple_batched_generate_with_padding(self):
"!!!<|begin_of_text|>I am late! I need to be at the airport in 20 minutes! I",
],
("xpu", 3): [
"<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all doing well. Today I",
"<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all doing well. I am",
"!!!<|begin_of_text|>I am late! I need to get to work! I have to get to the",
],
}
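The bamba expectations above are keyed by (device type, major version). A hypothetical helper sketching that lookup pattern follows; transformers has its own utility for this, so the function below is only illustrative.

    # Illustrative only: pick the expected string for the current device,
    # falling back to any entry registered for the same device type.
    def pick_expectation(expectations, device_type, major):
        if (device_type, major) in expectations:
            return expectations[(device_type, major)]
        for (dev, _), value in expectations.items():
            if dev == device_type:
                return value
        raise KeyError(f"no expectation for {device_type} (major={major})")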
12 changes: 9 additions & 3 deletions tests/models/d_fine/test_modeling_d_fine.py
@@ -27,7 +27,13 @@
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow, torch_device
from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_vision,
slow,
torch_device,
)
from transformers.utils import cached_property


@@ -680,7 +686,7 @@ def test_initialization(self):
self.assertTrue(not failed_cases, message)

@parameterized.expand(["float32", "float16", "bfloat16"])
@require_torch_gpu
@require_torch_accelerator
@slow
def test_inference_with_different_dtypes(self, torch_dtype_str):
torch_dtype = {
@@ -702,7 +708,7 @@ def test_inference_with_different_dtypes(self, torch_dtype_str):
_ = model(**self._prepare_for_class(inputs_dict, model_class))

@parameterized.expand(["float32", "float16", "bfloat16"])
@require_torch_gpu
@require_torch_accelerator
@slow
def test_inference_equivalence_for_static_and_dynamic_anchors(self, torch_dtype_str):
torch_dtype = {
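A self-contained sketch of the dtype-string parameterization pattern these D-FINE tests use (assumed structure, not the real test body):

    import unittest

    import torch
    from parameterized import parameterized

    class DtypeSmokeTest(unittest.TestCase):
        @parameterized.expand(["float32", "float16", "bfloat16"])
        def test_matmul_keeps_dtype(self, torch_dtype_str):
            # map the string parameter to the actual torch dtype, as in the tests above
            torch_dtype = {
                "float32": torch.float32,
                "float16": torch.float16,
                "bfloat16": torch.bfloat16,
            }[torch_dtype_str]
            x = torch.ones(2, 2, dtype=torch_dtype)
            self.assertEqual((x @ x).dtype, torch_dtype)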
4 changes: 2 additions & 2 deletions tests/models/glm4/test_modeling_glm4.py
@@ -119,7 +119,7 @@ def test_model_9b_bf16(self):
{
("xpu", 3): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
"Hi today I am going to tell you about the most common disease in the world. This disease is called diabetes",
"Hi today I am going to tell you about the most common mistakes that people make when they are learning English.",
],
("cuda", 7): [],
("cuda", 8): [
@@ -177,7 +177,7 @@ def test_model_9b_sdpa(self):
{
("xpu", 3): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
"Hi today I am going to tell you about the most common disease in the world. This disease is called diabetes",
"Hi today I am going to tell you about the most common mistakes that people make when they are learning English.",
],
("cuda", 7): [],
("cuda", 8): [
6 changes: 3 additions & 3 deletions tests/models/musicgen/test_modeling_musicgen.py
@@ -1035,7 +1035,7 @@ def test_flash_attn_2_conversion(self):
self.skipTest(reason="Musicgen doesn't use the MusicgenFlashAttention2 class method.")

@require_torch_sdpa
@require_torch_gpu
@require_torch_accelerator
@slow
def test_sdpa_can_dispatch_on_flash(self):
if not self.has_attentions:
@@ -1046,8 +1046,8 @@ def test_sdpa_can_dispatch_on_flash(self):
self.skipTest(reason="This test requires an NVIDIA GPU with compute capability >= 8.0")
elif device_type == "rocm" and major < 9:
self.skipTest(reason="This test requires an AMD GPU with compute capability >= 9.0")
else:
self.skipTest(reason="This test requires a Nvidia or AMD GPU")
elif device_type not in ["cuda", "rocm", "xpu"]:
self.skipTest(reason="This test requires a Nvidia or AMD GPU or an Intel XPU")

torch.compiler.reset()

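The gating logic above can be read as: run on NVIDIA compute capability >= 8.0, AMD >= 9.0, or Intel XPU, and skip everywhere else. A rough standalone sketch of that check, using plain PyTorch queries rather than the repository's testing utilities:

    import torch

    def sdpa_flash_supported() -> bool:
        # NVIDIA/AMD path: inspect the capability reported by the CUDA/ROCm backend
        if torch.cuda.is_available():
            major, _ = torch.cuda.get_device_capability()
            if torch.version.hip is not None:  # ROCm build -> AMD GPU
                return major >= 9
            return major >= 8  # Ampere or newer
        # Intel XPU path enabled by this PR
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            return True
        return False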
6 changes: 3 additions & 3 deletions tests/models/musicgen_melody/test_modeling_musicgen_melody.py
@@ -1035,7 +1035,7 @@ def test_flash_attn_2_conversion(self):
self.skipTest(reason="MusicgenMelody doesn't use the MusicgenMelodyFlashAttention2 class method.")

@require_torch_sdpa
@require_torch_gpu
@require_torch_accelerator
@slow
def test_sdpa_can_dispatch_on_flash(self):
if not self.has_attentions:
@@ -1046,8 +1046,8 @@ def test_sdpa_can_dispatch_on_flash(self):
self.skipTest(reason="This test requires an NVIDIA GPU with compute capability >= 8.0")
elif device_type == "rocm" and major < 9:
self.skipTest(reason="This test requires an AMD GPU with compute capability >= 9.0")
else:
self.skipTest(reason="This test requires a Nvidia or AMD GPU")
elif device_type not in ["cuda", "rocm", "xpu"]:
self.skipTest(reason="This test requires a Nvidia or AMD GPU or an Intel XPU")

torch.compiler.reset()

Expand Down
11 changes: 9 additions & 2 deletions tests/models/vitmatte/test_image_processing_vitmatte.py
@@ -21,7 +21,14 @@
import requests
from packaging import version

from transformers.testing_utils import is_flaky, require_torch, require_torch_gpu, require_vision, slow, torch_device
from transformers.testing_utils import (
is_flaky,
require_torch,
require_torch_accelerator,
require_vision,
slow,
torch_device,
)
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available

from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
@@ -337,7 +344,7 @@ def test_slow_fast_equivalence_batched(self):
)

@slow
@require_torch_gpu
@require_torch_accelerator
@require_vision
def test_can_compile_fast_image_processor(self):
# override as trimaps are needed for the image processor
5 changes: 3 additions & 2 deletions tests/trainer/test_trainer.py
@@ -1623,9 +1623,10 @@ def is_any_loss_nan_or_inf(log_history):
self.assertFalse(is_any_loss_nan_or_inf(log_history_filter))

def test_train_and_eval_dataloaders(self):
if torch_device in ["cuda", "xpu"]:
if torch_device in ["cuda"]:
Collaborator:

I'm OK with this, but since you added that before, was it a mistake back then?

Contributor Author:

Yes, it was a mistake on my part. At that time I only tested on a single card, so I missed it. In a recent test in a multi-card environment I saw it fail and debugged it down to the DP issue. That's why I put a comment in the else branch: to remind me and the other XPU folks, so we avoid repeating the mistake in case we forget the history. :)

n_gpu = max(1, backend_device_count(torch_device))
else:
# DP is deprecated by PyTorch, and accelerators like XPU don't support DP
n_gpu = 1

tmp_dir = self.get_auto_remove_tmp_dir()
@@ -3940,7 +3941,7 @@ def test_torchdynamo_memory(self):
from torch import _dynamo as torchdynamo

class CustomTrainer(Trainer):
def compute_loss(self, model, inputs, return_outputs=False):
def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
Contributor Author:

fix a similar issue to #36331

x = inputs["x"]
output = model(x)
if self.args.n_gpu == 1:
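For reference, a hedged sketch of a custom compute_loss that matches the newer Trainer signature (the extra num_items_in_batch argument must be accepted even if unused); the loss computation itself is a placeholder, not the test's actual objective.

    import torch
    from transformers import Trainer

    class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
            # Trainer now passes num_items_in_batch; accepting it keeps the override compatible.
            x = inputs["x"]
            output = model(x)
            loss = torch.nn.functional.mse_loss(output, x)  # placeholder objective
            return (loss, output) if return_outputs else loss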
2 changes: 1 addition & 1 deletion tests/utils/test_cache_utils.py
@@ -556,7 +556,7 @@ def test_static_cache_multi_accelerator(self):
_ = model(**inputs)
_ = model.generate(**inputs, max_new_tokens=2, cache_implementation="hybrid")

@require_torch_gpu
@require_torch_accelerator
@parameterized.expand(TEST_CACHE_IMPLEMENTATIONS)
def test_cache_gptj_model(self, cache_implementation):
"""Tests caches with GPT-J model. Regression test for https://github.com/huggingface/transformers/pull/34799"""
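Finally, a small, hypothetical end-to-end sketch of what the now accelerator-agnostic cache test exercises: generation with an explicit cache implementation on whatever device torch_device resolves to (the tiny checkpoint name is taken from the generation test above; "static" is just one example implementation).

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers.testing_utils import torch_device

    ckpt = "hf-internal-testing/tiny-random-MistralForCausalLM"
    tokenizer = AutoTokenizer.from_pretrained(ckpt)
    model = AutoModelForCausalLM.from_pretrained(ckpt).to(torch_device)

    inputs = tokenizer("Hello", return_tensors="pt").to(torch_device)
    # cache_implementation selects the KV-cache backend used during generation
    out = model.generate(**inputs, max_new_tokens=2, cache_implementation="static")
    print(tokenizer.decode(out[0]))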