#6491: add fp32_dest_acc for unit test
hschoi4448 committed May 4, 2024
1 parent df1c655 commit 70a4ade
Showing 3 changed files with 239 additions and 49 deletions.
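
In short, each moreh_logsoftmax test below is parametrized over fp32_dest_acc_en and converts that option into an arch-specific compute kernel config that is forwarded to the op call, along the lines of (names as they appear in the diff):

    compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
    tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim, compute_kernel_config=compute_kernel_config)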
@@ -9,7 +9,35 @@
from models.utility_functions import comp_allclose_and_pcc
from loguru import logger
import torch.nn.functional as F
from models.utility_functions import skip_for_wormhole_b0
import ttnn
from models.utility_functions import is_wormhole_b0

compute_kernel_options = [
False, # for grayskull
]
compute_kernel_ids = ["fp32_dest_acc_en=False"]
if is_wormhole_b0():  # Wormhole B0 supports fp32 accumulation in DEST; Grayskull does not
compute_kernel_options.append(True)
compute_kernel_ids.append("fp32_dest_acc_en=True")
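# On Grayskull only the fp32_dest_acc_en=False variant is generated; Wormhole B0
# also appends the fp32_dest_acc_en=True variant, so every test below runs once
# per available option.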


def get_compute_kernel_options(compute_kernel_options):
if is_wormhole_b0():
fp32_dest_acc_en = compute_kernel_options
packer_l1_acc = False
compute_kernel_config = ttnn.WormholeComputeKernelConfig(
math_fidelity=ttnn.MathFidelity.HiFi4,
math_approx_mode=False,
fp32_dest_acc_en=fp32_dest_acc_en,
packer_l1_acc=packer_l1_acc,
)
else:
# Grayskull doesn't support fp32 dest accumulation, but passing a GS config in the test is fine
compute_kernel_config = ttnn.GrayskullComputeKernelConfig(
math_fidelity=ttnn.MathFidelity.HiFi4,
math_approx_mode=True,
)
return compute_kernel_config
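

# A minimal sanity-check sketch (hypothetical helper, not part of this commit):
# assuming the ttnn config classes imported above, it verifies that
# get_compute_kernel_options returns the arch-appropriate config type for every
# parametrized option. Calling it once before the tests would surface a
# misconfigured arch check early.
def _check_compute_kernel_options():
    for option in compute_kernel_options:
        config = get_compute_kernel_options(option)
        if is_wormhole_b0():
            assert isinstance(config, ttnn.WormholeComputeKernelConfig)
        else:
            assert isinstance(config, ttnn.GrayskullComputeKernelConfig)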


@pytest.mark.parametrize(
@@ -25,18 +53,21 @@
((10, 20, 32 * 3, 32 * 5), 2), # multiple tiles per core
),
)
def test_logsoftmax_for_dim_hw(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_for_dim_hw(shape_dim, compute_kernel_options, device):
device.enable_program_cache()

shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)

dev_x = ttl.tensor.Tensor(x, ttl.tensor.DataType.BFLOAT16).to(ttl.tensor.Layout.TILE).to(device)

tt_cpu = F.log_softmax(x, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim, compute_kernel_config=compute_kernel_config)

assert list(tt_npu.get_legacy_shape()) == list(tt_cpu.shape)
tt_dev = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).to_torch().to(torch.bfloat16)
@@ -54,12 +85,15 @@ def test_logsoftmax_for_dim_hw(shape_dim, device):
((2, 3, 32 * 4, 32 * 5), 2),
),
)
def test_logsoftmax_large_algorithm_for_dim_hw(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options, device):
device.enable_program_cache()

shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)

dev_x = ttl.tensor.Tensor(x, ttl.tensor.DataType.BFLOAT16).to(ttl.tensor.Layout.TILE).to(device)
@@ -70,7 +104,9 @@ def test_logsoftmax_large_algorithm_for_dim_hw(shape_dim, device):
if dim == 3
else ttl.operations.primary.MorehSoftmaxOpParallelizationStrategy.LARGE_H
)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim, None, strategy)
tt_npu = ttl.operations.primary.moreh_logsoftmax(
dev_x, dim, None, strategy, compute_kernel_config=compute_kernel_config
)

assert list(tt_npu.get_legacy_shape()) == list(tt_cpu.shape)
tt_dev = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).to_torch().to(torch.bfloat16)
@@ -90,11 +126,14 @@ def test_logsoftmax_large_algorithm_for_dim_hw(shape_dim, device):
((1, 1, 32 * 2 + 10, 32), 2),  # multiple tiles along dim
),
)
def test_logsoftmax_not_multiple_of_32_for_dim_hw(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)

dev_x = (
@@ -105,7 +144,7 @@ def test_logsoftmax_not_multiple_of_32_for_dim_hw(shape_dim, device):
)

tt_cpu = F.log_softmax(x, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim, compute_kernel_config=compute_kernel_config)
tt_npu = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).unpad_from_tile(shape)

assert list(tt_npu.get_legacy_shape()) == list(tt_cpu.shape)
@@ -128,19 +167,22 @@ def test_logsoftmax_not_multiple_of_32_for_dim_hw(shape_dim, device):
((15, 109, 32 * 2, 32 * 2), 0),  # multiple tiles per core
),
)
def test_logsoftmax_for_dim_nc(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_for_dim_nc(shape_dim, compute_kernel_options, device):
device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)

dev_x = (
ttl.tensor.Tensor(x, ttl.tensor.DataType.BFLOAT16).pad_to_tile(float("7")).to(ttl.tensor.Layout.TILE).to(device)
)

tt_cpu = F.log_softmax(x, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim, compute_kernel_config=compute_kernel_config)
tt_npu = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).unpad_from_tile(shape)

assert list(tt_npu.get_legacy_shape()) == list(tt_cpu.shape)
@@ -165,11 +207,14 @@ def test_logsoftmax_for_dim_nc(shape_dim, device):
((10, 20, 32 * 5, 32), 2), # multiple tiles per core
),
)
def test_logsoftmax_backward_for_dim_hw(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_backward_for_dim_hw(shape_dim, compute_kernel_options, device):
device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)

y = F.log_softmax(x, dim)
@@ -179,7 +224,9 @@ def test_logsoftmax_backward_for_dim_hw(shape_dim, device):
dev_dy = ttl.tensor.Tensor(dy, ttl.tensor.DataType.BFLOAT16).to(ttl.tensor.Layout.TILE).to(device)

y.backward(dy)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(dev_y, dev_dy, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(
dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
)

assert list(tt_npu.get_legacy_shape()) == list(x.grad.shape)
tt_dev = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).to_torch().to(torch.bfloat16)
@@ -197,11 +244,14 @@ def test_logsoftmax_backward_for_dim_hw(shape_dim, device):
((2, 3, 32 * 4, 32 * 5), 2),
),
)
def test_logsoftmax_backward_large_algorithm_for_dim_hw(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_backward_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options, device):
device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)

y = F.log_softmax(x, dim)
@@ -216,7 +266,9 @@ def test_logsoftmax_backward_large_algorithm_for_dim_hw(shape_dim, device):
if dim == 3
else ttl.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.LARGE_H
)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(dev_y, dev_dy, dim, None, strategy)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(
dev_y, dev_dy, dim, None, strategy, compute_kernel_config=compute_kernel_config
)

assert list(tt_npu.get_legacy_shape()) == list(x.grad.shape)
tt_dev = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).to_torch().to(torch.bfloat16)
@@ -236,11 +288,14 @@ def test_logsoftmax_backward_large_algorithm_for_dim_hw(shape_dim, device):
((1, 1, 32 * 2 + 10, 32), 2),  # multiple tiles along dim
),
)
def test_logsoftmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)

y = F.log_softmax(x, dim)
@@ -260,7 +315,9 @@ def test_logsoftmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, device):
)

y.backward(dy)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(dev_y, dev_dy, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(
dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
)
tt_npu = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).unpad_from_tile(shape)

assert list(tt_npu.get_legacy_shape()) == list(x.grad.shape)
@@ -283,11 +340,14 @@ def test_logsoftmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, device):
((15, 109, 32 * 2, 32 * 2), 0),  # multiple tiles per core
),
)
def test_logsoftmax_backward_for_dim_nc(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_backward_for_dim_nc(shape_dim, compute_kernel_options, device):
device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)

y = F.log_softmax(x, dim)
@@ -307,7 +367,9 @@ def test_logsoftmax_backward_for_dim_nc(shape_dim, device):
)

y.backward(dy)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(dev_y, dev_dy, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(
dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
)
tt_npu = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).unpad_from_tile(shape)
assert list(tt_npu.get_legacy_shape()) == list(x.grad.shape)
tt_dev = tt_npu.cpu().to_torch().to(torch.bfloat16)