[float8] fix all-gather in 2D with DTensor(WeightWithDynamicFloat8CastTensor) (#590)

weifengpy · web-flow · commit 00529fa510e8 · 2024-08-02T18:12:12.000-07:00
* [float8][2D] fix bug in precomputing scales

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

* [float8] fix all-gather in 2D with DTensor(WeightWithDynamicFloat8CastTensor)

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

* remove record_function after debugging

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

* add ASCII diagram

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
diff --git a/test/float8/test_dtensor.py b/test/float8/test_dtensor.py
@@ -27,6 +27,7 @@
 from torchao.float8 import Float8LinearConfig
 from torchao.float8.float8_linear_utils import convert_to_float8_training
 
+from torchao.float8.config import CastConfig, ScalingType
 from torchao.float8.float8_scaling_utils import NoopFwToFloat8E5M2BwDynamic
 from torchao.float8.float8_tensor import (
     Float8Tensor,
@@ -43,6 +44,11 @@
 from torch.distributed._tensor import distribute_tensor, DTensor, Replicate, Shard
 from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
 from torch.distributed.tensor.parallel import parallelize_module
+from torchao.float8.fsdp_utils import WeightWithDynamicFloat8CastTensor
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    ModelArgs,
+    Transformer,
+)
 from tqdm import tqdm
 
 
@@ -303,6 +309,38 @@ def _test_fp8_mlp_tensor_parallelism_compile(mesh: DeviceMesh, size=16):
     _test_fp8_mlp_tensor_parallelism_base(mesh, size, compile=True)
 
 
+def _test_distribute_fsdp_tensor_subclass(tp_mesh: DeviceMesh):
+    torch.manual_seed(42)
+    model = Transformer(ModelArgs(dropout_p=0.0, weight_tying=False)).cuda()
+    convert_to_float8_training(
+        model,
+        config=Float8LinearConfig(
+            enable_fsdp_float8_all_gather=True,
+            cast_config_weight=CastConfig(scaling_type=ScalingType.DYNAMIC),
+        ),
+    )
+    # test Float8ColwiseParallel: Shard(0) must keep the weight subclass as local tensor
+    colwise_param = distribute_tensor(
+        model.layers[0].attention.wq.weight, tp_mesh, [Shard(0)]
+    )
+    assert (
+        isinstance(colwise_param, DTensor)
+        and isinstance(
+            colwise_param._local_tensor, WeightWithDynamicFloat8CastTensor
+        )
+    ), f"expect DTensor(local_tensor={WeightWithDynamicFloat8CastTensor}) but got {colwise_param}"
+    # test Float8RowwiseParallel: Shard(1) must keep the weight subclass as local tensor
+    rowwise_param = distribute_tensor(
+        model.layers[0].attention.wo.weight, tp_mesh, [Shard(1)]
+    )
+    assert (
+        isinstance(rowwise_param, DTensor)
+        and isinstance(
+            rowwise_param._local_tensor, WeightWithDynamicFloat8CastTensor
+        )
+    ), f"expect DTensor(local_tensor={WeightWithDynamicFloat8CastTensor}) but got {rowwise_param}"
+
+
 if __name__ == "__main__":
     # float8 only works on CUDA H100 so we only test cuda and we follow
     # other test files to not use TestCase but instead just add the test
@@ -315,6 +353,7 @@ def _test_fp8_mlp_tensor_parallelism_compile(mesh: DeviceMesh, size=16):
         _test_dtensor_fp8_autograd,
         _test_fp8_mlp_tensor_parallelism_eager,
         _test_fp8_mlp_tensor_parallelism_compile,
+        _test_distribute_fsdp_tensor_subclass,
     ]
 
     for test in tqdm(tests, desc="Running tests"):
diff --git a/torchao/float8/fsdp_utils.py b/torchao/float8/fsdp_utils.py
@@ -84,8 +84,43 @@ def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None:
     torch.ops.aten.as_strided.default,
     torch.ops.aten._to_copy.default,
     torch.ops.aten._pin_memory.default,
+    torch.ops.aten.split.Tensor,
+    torch.ops.aten.clone.default,
 }
 
+# How Tensor Parallel (TP) and FSDP2 work
+
+# Initialization: apply TP first then FSDP2
+# nn.Linear(weight=torch.Tensor)
+#      |
+#      | apply float8 linear, `convert_to_float8_training`
+#      |
+# Float8Linear(weight=WeightWithDynamicFloat8CastTensor)
+#      |
+#      | apply tensor parallel, `parallelize_module` shards rowwise/colwise
+#      |
+# Float8Linear(weight=DTensor(local_tensor=WeightWithDynamicFloat8CastTensor,
+#                             device_mesh=DeviceMesh([0, 1], mesh_dim_names=('tp',)),
+#                             placements=(Shard(dim=0),)))
+#      |
+#      | apply FSDP2, `fully_shard` shards rowwise (dim=0)
+#      |
+# Float8Linear(weight=DTensor(local_tensor=WeightWithDynamicFloat8CastTensor,
+#                             device_mesh=DeviceMesh([[0, 1], [2, 3]], mesh_dim_names=('dp', 'tp')),
+#                             placements=(Shard(dim=0), Shard(dim=0))))
+
+# Forward and backward: FSDP runs first then TP
+# Float8Linear(weight=DTensor(local_tensor=WeightWithDynamicFloat8CastTensor,
+#                             device_mesh=DeviceMesh([[0, 1], [2, 3]], mesh_dim_names=('dp', 'tp')),
+#                             placements=(Shard(dim=0), Shard(dim=0))))
+#      |
+#      |   FSDP unshards parameters within dp mesh
+#      |
+# Float8Linear(weight=DTensor(local_tensor=WeightWithDynamicFloat8CastTensor,
+#                             device_mesh=DeviceMesh([0, 1], mesh_dim_names=('tp',)),
+#                             placements=(Shard(dim=0),)))
+#      |
+#      |   TP compute with torch.mm(input, weight)
 
 class WeightWithDynamicFloat8CastTensor(torch.Tensor):
     @staticmethod
@@ -195,8 +230,17 @@ def fsdp_post_all_gather(
         (data,) = all_gather_outputs
         (scale,) = metadata
         if out is not None:
-            assert isinstance(out, Float8Tensor), f"{type(out)}"
-            out._scale = scale
+            from torch.distributed._tensor import DTensor
+            if isinstance(out, Float8Tensor):
+                out._scale = scale
+            elif isinstance(out, DTensor) and isinstance(
+                out._local_tensor, Float8Tensor
+            ):
+                out._local_tensor._scale = scale
+            else:
+                raise RuntimeError(
+                    f"out must be a Float8Tensor or DTensor(_local_tensor=Float8Tensor), but got {out}"
+                )
             return
         return Float8Tensor(
             data,