
Commit 92c3668

y-sq authored and facebook-github-bot committed
Add the workaround to support rowwise scaled_gemm for fp32 outputs (pytorch#2431)
Summary:

Running rowwise scaling on fp32 tensors hit the following error (P1794222725):

```
RuntimeError: Only bf16 high precision output types are supported for row-wise scaling.
```

This PR adds an option to explicitly use bfloat16 as the output of the rowwise-scaled gemm and cast it back to the original precision. It can be enabled by setting:

```
config = dataclasses.replace(config, convert_dtypes_for_rowwise_scaled_mm=True)
```

Differential Revision: D73552660
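An illustrative usage sketch (not taken from this commit) of enabling the workaround when training a float32 model with a rowwise-scaling recipe. The `"rowwise"` recipe name and the import paths are assumptions based on the torchao float8 APIs exercised in the tests below, and the snippet assumes a GPU with fp8 support; adjust to your torchao version.

```python
# Sketch only: enable the fp32-output workaround for rowwise scaling.
import dataclasses

import torch
import torch.nn as nn

from torchao.float8.config import Float8LinearConfig
from torchao.float8.float8_linear_utils import convert_to_float8_training

# "rowwise" recipe name is an assumption; any rowwise-scaling recipe applies.
config = Float8LinearConfig.from_recipe_name("rowwise")
# Opt in to the bf16 round-trip so fp16/fp32 outputs work with rowwise scaled_mm.
config = dataclasses.replace(config, convert_dtypes_for_rowwise_scaled_mm=True)

m = nn.Sequential(nn.Linear(16, 32, device="cuda", dtype=torch.float32))
convert_to_float8_training(m, config=config)

x = torch.randn(4, 16, device="cuda", dtype=torch.float32)
y = m(x)
assert y.dtype == torch.float32  # output is cast back to the original precision
```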
1 parent 2025b75 commit 92c3668

File tree

2 files changed: +69 −59 lines changed


test/float8/test_base.py

Lines changed: 14 additions & 13 deletions
@@ -34,9 +34,7 @@
     e5m2_dtype,
 )
 from torchao.float8.float8_linear import Float8Linear
-from torchao.float8.float8_linear_utils import (
-    convert_to_float8_training,
-)
+from torchao.float8.float8_linear_utils import convert_to_float8_training
 from torchao.float8.float8_ops import addmm_float8_unwrapped
 from torchao.float8.float8_scaling_utils import (
     get_maybe_axiswise_dim,
@@ -379,12 +377,16 @@ def test_linear_from_config_params(
     )
     @pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)])
     @pytest.mark.parametrize("linear_bias", [True, False])
+    @pytest.mark.parametrize(
+        "linear_dtype", [torch.bfloat16, torch.float16, torch.float32]
+    )
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     @skip_if_rocm("ROCm enablement in progress")
     def test_linear_from_recipe(
         self,
         recipe_name,
         x_shape,
+        linear_dtype: torch.dtype,
         linear_bias: bool,
     ):
         if torch.cuda.get_device_capability() < (9, 0):
@@ -393,7 +395,6 @@ def test_linear_from_recipe(
             )
             pytest.skip()
 
-        linear_dtype = torch.bfloat16
         x = torch.randn(*x_shape, device="cuda", dtype=linear_dtype)
         m_ref = nn.Linear(16, 32, bias=linear_bias, device="cuda", dtype=linear_dtype)
         config = Float8LinearConfig.from_recipe_name(recipe_name)
@@ -436,9 +437,9 @@ def test_autocast_outputs(
 
         with torch.autocast("cuda", dtype=torch.bfloat16):
             y = m(x)
-        assert y.dtype == torch.bfloat16, (
-            f"y.dtype is {y.dtype}, expected {torch.bfloat16}"
-        )
+        assert (
+            y.dtype == torch.bfloat16
+        ), f"y.dtype is {y.dtype}, expected {torch.bfloat16}"
 
     @pytest.mark.parametrize(
         "linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
@@ -467,9 +468,9 @@ def test_type_cast(self, linear_dtype: torch.dtype, emulate: bool):
 
         with torch.autocast("cuda", dtype=torch.bfloat16):
             y = m(x)
-        assert y.dtype == torch.bfloat16, (
-            f"y.dtype is {y.dtype}, expected {torch.bfloat16}"
-        )
+        assert (
+            y.dtype == torch.bfloat16
+        ), f"y.dtype is {y.dtype}, expected {torch.bfloat16}"
 
     def test_repr(self):
         m = nn.Linear(32, 16)
@@ -500,9 +501,9 @@ def test_quantize(self):
         from torchao.quantization.quant_api import float8_weight_only, quantize_
 
         quantize_(m, float8_weight_only())
-        assert m[0].weight.tensor_impl.float8_data.dtype == torch.float8_e4m3fn, (
-            "Post quantization dtype should be torch.float8_e4m3fn"
-        )
+        assert (
+            m[0].weight.tensor_impl.float8_data.dtype == torch.float8_e4m3fn
+        ), "Post quantization dtype should be torch.float8_e4m3fn"
         with torch.no_grad():
             m(x)
 
torchao/float8/float8_ops.py

Lines changed: 55 additions & 46 deletions
@@ -54,6 +54,12 @@ def addmm_float8_unwrapped(
         a_inverse_scale = a_inverse_scale.new_ones(())
         b_inverse_scale = a_inverse_scale.new_ones(())
 
+    # work around torch._scaled_mm not having float32 output type
+    # TODO(pytorch/pytorch#156771): remove this once torch._scaled_mm supports float32 output
+    orig_dtype = output_dtype
+    if orig_dtype in (torch.float16, torch.float32) and is_rowwise_scaling:
+        output_dtype = torch.bfloat16
+
     post_bias = None
     if output_dtype == torch.float32:
         # Bias is not supported by _scaled_mm when output is fp32
@@ -76,6 +82,9 @@
     if post_bias is not None:
         output += post_bias
 
+    if orig_dtype in (torch.float16, torch.float32) and is_rowwise_scaling:
+        output = output.to(orig_dtype)
+
     return output
 
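The two hunks above contain the functional change; the remaining hunks in this file only restructure existing assert statements without changing behavior. For reference, here is a minimal standalone sketch (not code from this commit) of the same round-trip: when the caller asks for an fp16/fp32 output of a rowwise-scaled matmul, compute in bf16 and cast back, because `torch._scaled_mm` only supports bf16 high-precision outputs for row-wise scaling. It assumes a GPU with fp8 support; the rowwise-scaling detection and scale layout are simplified assumptions, and bias handling is omitted.

```python
# Sketch of the bf16 round-trip workaround, not torchao's implementation.
import torch


def rowwise_scaled_mm(a_fp8, b_fp8_t, a_inv_scale, b_inv_scale, output_dtype):
    # a_fp8: (M, K) row-major fp8; b_fp8_t: (K, N) column-major fp8
    # a_inv_scale: (M, 1) fp32; b_inv_scale: (1, N) fp32
    is_rowwise_scaling = a_inv_scale.shape == (a_fp8.shape[0], 1)  # simplified check
    requested_dtype = output_dtype
    if requested_dtype in (torch.float16, torch.float32) and is_rowwise_scaling:
        output_dtype = torch.bfloat16  # compute in bf16 ...
    out = torch._scaled_mm(
        a_fp8,
        b_fp8_t,
        scale_a=a_inv_scale,
        scale_b=b_inv_scale,
        out_dtype=output_dtype,
        use_fast_accum=True,
    )
    if requested_dtype != output_dtype:
        out = out.to(requested_dtype)  # ... then cast back to the requested dtype
    return out
```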

@@ -260,24 +269,24 @@ def float8_cat(aten_op, args, kwargs=None):
     gemm_input_role = chunked_tensors[0]._gemm_input_role
     chunk_data = []
     for chunk in chunked_tensors:
-        assert isinstance(chunk, Float8Tensor), (
-            "Expecting all chunks to be of type Float8Tensor"
-        )
-        assert chunk._orig_dtype == orig_dtype, (
-            "Expecting all chunks to be of the same dtype"
-        )
-        assert chunk._scale is scale, (
-            "Expecting all chunks to have thee same scale as a result of a split"
-        )
-        assert chunk._linear_mm_config is mm_config, (
-            "Expecting all chunks to have thee same mm config as a result of a split"
-        )
-        assert chunk._data.dtype == fp8_dtype, (
-            "Expecting all chunks to be of the same dtype as a result of a split"
-        )
-        assert chunk._gemm_input_role is gemm_input_role, (
-            "Expecting all chunks to have the same gemm_input_role as a result of a split"
-        )
+        assert isinstance(
+            chunk, Float8Tensor
+        ), "Expecting all chunks to be of type Float8Tensor"
+        assert (
+            chunk._orig_dtype == orig_dtype
+        ), "Expecting all chunks to be of the same dtype"
+        assert (
+            chunk._scale is scale
+        ), "Expecting all chunks to have thee same scale as a result of a split"
+        assert (
+            chunk._linear_mm_config is mm_config
+        ), "Expecting all chunks to have thee same mm config as a result of a split"
+        assert (
+            chunk._data.dtype == fp8_dtype
+        ), "Expecting all chunks to be of the same dtype as a result of a split"
+        assert (
+            chunk._gemm_input_role is gemm_input_role
+        ), "Expecting all chunks to have the same gemm_input_role as a result of a split"
         _assert_tensorwise_scale(aten_op, chunk._scale)
         chunk_data.append(chunk._data.view(torch.uint8))
 
@@ -320,9 +329,9 @@ def preprocess_addmm(a: Float8Tensor, b: Float8Tensor):
     )
 
     if scaled_mm_config.pad_inner_dim:
-        assert a._data.size(1) == b._data.size(0), (
-            f"Inner dims must match for mm, got {a._data.size(1)} and {b._data.size(0)}"
-        )
+        assert a._data.size(1) == b._data.size(
+            0
+        ), f"Inner dims must match for mm, got {a._data.size(1)} and {b._data.size(0)}"
         a_data = pad_tensor_for_matmul(a_data, dims=1)
         b_data = pad_tensor_for_matmul(b_data, dims=0)
 
@@ -353,10 +362,10 @@ def float8_mm(aten_op, args, kwargs=None):
     a = args[0]
     b = args[1]
 
-    assert isinstance(a, Float8Tensor) and isinstance(b, Float8Tensor), (
-        "Expecting both Float8Tensor for mm inputs but found {} and {}".format(
-            type(a), type(b)
-        )
+    assert isinstance(a, Float8Tensor) and isinstance(
+        b, Float8Tensor
+    ), "Expecting both Float8Tensor for mm inputs but found {} and {}".format(
+        type(a), type(b)
     )
     a_data, a_scale, b_data, b_scale = preprocess_addmm(a, b)
     output_dtype = a._orig_dtype
@@ -434,9 +443,9 @@ def autocast_to_copy(aten_op, args, kwargs=None):
     """
     _assert_tensorwise_scale(aten_op, args[0]._scale)
     assert isinstance(args[0], Float8Tensor)
-    assert len(kwargs) == 1 and "dtype" in kwargs, (
-        "Only support dtype kwarg for autocast"
-    )
+    assert (
+        len(kwargs) == 1 and "dtype" in kwargs
+    ), "Only support dtype kwarg for autocast"
     assert kwargs["dtype"] in {
         torch.float16,
         torch.bfloat16,
@@ -462,9 +471,9 @@ def allgather_fp8(aten_op, args, kwargs=None):
     """
     _assert_tensorwise_scale(aten_op, args[0]._scale)
     fp8_input = args[0]
-    assert isinstance(fp8_input, Float8Tensor), (
-        f"expecting a Float8Tensor for allgather but found {type(fp8_input)}"
-    )
+    assert isinstance(
+        fp8_input, Float8Tensor
+    ), f"expecting a Float8Tensor for allgather but found {type(fp8_input)}"
 
     fp8_data = fp8_input._data
     fp8_data = fp8_data.contiguous()
@@ -536,21 +545,21 @@ def copy_fp8(aten_op, args, kwargs=None):
         return aten_op(self, src_hp, *args[2:], **kwargs)
     elif isinstance(self, Float8Tensor) and isinstance(src, Float8Tensor):
         _assert_tensorwise_scale(aten_op, src._scale)
-        assert self._orig_dtype == src._orig_dtype, (
-            "Expecting both Float8Tensors to be of the same dtype"
-        )
-        assert self._scale == src._scale, (
-            "Expecting both Float8Tensors to have thee same scale"
-        )
-        assert self._linear_mm_config == src._linear_mm_config, (
-            "Expecting both Float8Tensors to have thee same mm config"
-        )
-        assert self._data.dtype == src._data.dtype, (
-            "Expecting both Float8Tensors to be of the same dtypet"
-        )
-        assert self._gemm_input_role == src._gemm_input_role, (
-            "Expecting both Float8Tensors to have the same gemm_input_role"
-        )
+        assert (
+            self._orig_dtype == src._orig_dtype
+        ), "Expecting both Float8Tensors to be of the same dtype"
+        assert (
+            self._scale == src._scale
+        ), "Expecting both Float8Tensors to have thee same scale"
+        assert (
+            self._linear_mm_config == src._linear_mm_config
+        ), "Expecting both Float8Tensors to have thee same mm config"
+        assert (
+            self._data.dtype == src._data.dtype
+        ), "Expecting both Float8Tensors to be of the same dtypet"
+        assert (
+            self._gemm_input_role == src._gemm_input_role
+        ), "Expecting both Float8Tensors to have the same gemm_input_role"
         fp8_out = aten_op(self._data, src._data, *args[2:], **kwargs)
         return Float8Tensor(
             fp8_out,
