
Commit 44778d0

tp on routed experts working
1 parent efd993f commit 44778d0

File tree

3 files changed (+54 lines, −15 lines)


torchao/prototype/moe_training/conversion_utils.py

Lines changed: 11 additions & 1 deletion
@@ -1,3 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
 from typing import Callable, Optional
 
 from torch import nn
@@ -8,6 +14,8 @@
     register_quantize_module_handler,
 )
 
+logger: logging.Logger = logging.getLogger(__name__)
+
 
 class MoETrainingConfig(AOBaseConfig):
     """
@@ -105,7 +113,9 @@ def post_order_traversal(
                 ScaledGroupedMMTensor(param), requires_grad=param.requires_grad
             )
             setattr(module, param_name, new_param)
-            print(f"Swapped {cur_fqn}.{param_name} to ScaledGroupedMMTensor")
+            logger.info(
+                f"Swapped {cur_fqn}.{param_name} to ScaledGroupedMMTensor"
+            )
 
     post_order_traversal(root_module)
     return root_module
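
With the print statement replaced by a module-level logger, the "Swapped ... to ScaledGroupedMMTensor" messages only show up when logging is configured. Below is a minimal sketch of how a caller might surface them; the logger name simply follows the module path, and this setup snippet is an assumption for illustration, not part of the commit:

import logging

# Assumed setup, not from this commit: emit INFO-level records (including the
# parameter-swap messages logged by conversion_utils) via the root handler.
logging.basicConfig(level=logging.INFO)
logging.getLogger("torchao.prototype.moe_training.conversion_utils").setLevel(logging.INFO)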

torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 39 additions & 10 deletions
@@ -39,7 +39,6 @@ def _scaled_grouped_mm(
3939
and in column-major memory layout.
4040
offs (int32 torch.Tensor): The offsets to use to mark the starting index of each group along dim0 of the A tensor.
4141
out_dtype (Optional[torch.dtype]): The dtype of the output tensor. Currently only torch.bfloat16 is supported.
42-
use_triton_for_per_group_scales (bool): Whether to use custom triton kernels to compute per-group scales. Default is True.
4342
"""
4443
logger.info("Using differentiable _scaled_grouped_mm")
4544
return _Float8GroupedMM.apply(
@@ -61,8 +60,8 @@ def forward(
6160
offs: Optional[torch.Tensor] = None,
6261
out_dtype: Optional[torch.dtype] = torch.bfloat16,
6362
) -> torch.Tensor:
64-
# torchao _scaled_grouped_mm only supports A=2D, B=3D.
65-
assert A.ndim == 2, "A must be 2D"
63+
# torchao _scaled_grouped_mm only supports A=2D|3D + B=3D.
64+
assert A.ndim == 2 or A.ndim == 3, "A must be 2D or 3D"
6665
assert B_t.ndim == 3, "B must be 3D"
6766

6867
assert A.size(-1) % 16 == 0, (
@@ -151,12 +150,25 @@ def forward(
151150
assert _is_column_major(B_t_fp8_col_major), (
152151
"B must be column-major for output = A @ B"
153152
)
153+
154+
# TODO: remove excessive logging once prototype is more mature.
155+
logger.debug(
156+
(
157+
f"forward scaled_grouped_mm: A_fp8_row_major.shape={A_fp8_row_major.shape}, "
158+
f"A_scale.shape={A_scales.squeeze(-1).shape}, "
159+
f"B_t_fp8_col_major.shape={B_t_fp8_col_major.shape}, "
160+
f"B_t_scale.shape={B_t_scales.squeeze(1).shape}, "
161+
f"offs={offs if offs is not None else None}"
162+
)
163+
)
154164
return torch._scaled_grouped_mm(
155165
A_fp8_row_major,
156166
B_t_fp8_col_major,
157-
A_scales.squeeze().reciprocal(),
158-
B_t_scales.squeeze().reciprocal(),
159-
offs,
167+
# Squeeze A scales to: (B, S, 1) => (B, M), or (B*S, 1) => (B*S)
168+
A_scales.squeeze(-1).reciprocal(),
169+
# Squeeze B scales to: (B, 1, N) => (B, N)
170+
B_t_scales.squeeze(1).reciprocal(),
171+
offs=offs,
160172
out_dtype=out_dtype,
161173
use_fast_accum=True,
162174
)
@@ -193,12 +205,20 @@ def backward(ctx, grad_output: torch.Tensor):
193205
assert _is_column_major(B_fp8_col_major), (
194206
"B must be column-major for grad_A = grad_output @ B"
195207
)
208+
logger.debug(
209+
(
210+
f"backward grad_A: grad_output_fp8_row_major.shape={grad_output_fp8_row_major.shape}, "
211+
f"grad_output_scale.shape={grad_output_scales.shape}, "
212+
f"B_fp8_col_major.shape={B_fp8_col_major.shape}, "
213+
f"B_scale.shape={B_scales.shape}, "
214+
)
215+
)
196216
grad_A = torch._scaled_grouped_mm(
197217
grad_output_fp8_row_major,
198218
B_fp8_col_major,
199-
grad_output_scales.squeeze().reciprocal(),
200-
B_scales.squeeze().reciprocal(),
201-
offs,
219+
grad_output_scales.squeeze(-1).reciprocal(),
220+
B_scales.squeeze(1).reciprocal(),
221+
offs=offs,
202222
out_dtype=out_dtype,
203223
use_fast_accum=True,
204224
)
@@ -238,12 +258,21 @@ def backward(ctx, grad_output: torch.Tensor):
238258
assert _is_column_major(A_fp8_col_major), (
239259
"A must be column-major for grad_B = grad_output_t @ A"
240260
)
261+
262+
logger.debug(
263+
(
264+
f"backward grad_B: grad_output_t_fp8_row_major.shape={grad_output_t_fp8_row_major.shape}, "
265+
f"grad_output_t_scale.shape={grad_output_t_scales.shape}, "
266+
f"A_fp8_col_major.shape={A_fp8_col_major.shape}, "
267+
f"A_scale.shape={A_scales.shape}, "
268+
)
269+
)
241270
grad_B = torch._scaled_grouped_mm(
242271
grad_output_t_fp8_row_major,
243272
A_fp8_col_major,
244273
grad_output_t_scales.reciprocal(),
245274
A_scales.reciprocal(),
246-
offs,
275+
offs=offs,
247276
out_dtype=out_dtype,
248277
use_fast_accum=True,
249278
)
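
The switch from a bare .squeeze() to .squeeze(-1) / .squeeze(1) matters now that A may be 3D: .squeeze() drops every size-1 dimension, including a batch dimension of 1, while targeting the known scale axis removes only the per-row or per-column scale dimension. A small illustrative sketch; the shapes below are assumptions for illustration, not the library's internals:

import torch

# Per-row scales for a 3D A of shape (B, S, K) are kept as (B, S, 1);
# per-column scales for a 3D B_t of shape (B, K, N) are kept as (B, 1, N).
B, S, N = 1, 4, 8
A_scales = torch.rand(B, S, 1)
B_t_scales = torch.rand(B, 1, N)

# A bare .squeeze() also drops the batch dim when B == 1, which can break the
# scale layout the grouped mm expects; a targeted squeeze keeps the batch dim.
assert A_scales.squeeze().shape == (S,)        # batch dim lost
assert A_scales.squeeze(-1).shape == (B, S)    # only the trailing 1 removed
assert B_t_scales.squeeze(1).shape == (B, N)   # only the middle 1 removed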

torchao/prototype/moe_training/tensor.py

Lines changed: 4 additions & 4 deletions
@@ -75,12 +75,12 @@ def __torch_function__(cls, func, types, args, kwargs={}):
             # used for shared experts. This is basically the grouped_mm
             # kernel handling a bmm.
             A, B = args[0], args[1]
-            A_is_2d = A.dim() == 2
+            A_is_2d_or_3d = A.dim() in (2, 3)
             B_is_3d = B.dim() == 3
             has_offs = kwargs.get(cls.offs_arg_name) is not None
-            logger.info(f"A.shape={A.shape}, B.shape={B.shape}, has_offs={has_offs}")
-
-            if A_is_2d and B_is_3d:
+            logger.debug(f"A.shape={A.shape}, B.shape={B.shape}, has_offs={has_offs}")
+
+            if A_is_2d_or_3d and B_is_3d:
                 return _scaled_grouped_mm(
                     *args,
                     **kwargs,
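
The dispatch in __torch_function__ now accepts both the 2D token-choice case (A is (total_tokens, K) with offs marking group boundaries, used for routed experts) and the 3D bmm-style case (used for shared experts), as long as the stacked expert weights B are 3D. A standalone sketch of the condition; the helper name and concrete shapes are assumptions used only for illustration:

import torch

def routes_to_scaled_grouped_mm(A: torch.Tensor, B: torch.Tensor) -> bool:
    # Mirrors the updated check above: A may be 2D (tokens, K) or 3D (B, S, K),
    # while the stacked expert weights B must be 3D (num_experts, K, N).
    return A.dim() in (2, 3) and B.dim() == 3

# 2D routed-experts case and 3D shared-experts (bmm-like) case both qualify.
assert routes_to_scaled_grouped_mm(torch.empty(128, 16), torch.empty(4, 16, 32))
assert routes_to_scaled_grouped_mm(torch.empty(2, 64, 16), torch.empty(2, 16, 32))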
