#6491: add fp32_dest_acc for unit test
hschoi4448 committed May 4, 2024
1 parent df1c655 commit 70a4ade
Showing 3 changed files with 239 additions and 49 deletions.
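
In short, each moreh_logsoftmax test below is parametrized over fp32_dest_acc_en and converts that option into an arch-specific compute kernel config that is forwarded to the op call, along the lines of (names as they appear in the diff):

    compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
    tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim, compute_kernel_config=compute_kernel_config)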
@@ -9,7 +9,35 @@
from models.utility_functions import comp_allclose_and_pcc
from loguru import logger
import torch.nn.functional as F
from models.utility_functions import skip_for_wormhole_b0
import ttnn
from models.utility_functions import is_wormhole_b0

compute_kernel_options = [
False, # for grayskull
]
compute_kernel_ids = ["fp32_dest_acc_en=False"]
if is_wormhole_b0():  # Wormhole B0 supports fp32 accumulation in DEST; Grayskull does not
compute_kernel_options.append(True)
compute_kernel_ids.append("fp32_dest_acc_en=True")
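# On Grayskull only the fp32_dest_acc_en=False variant is generated; Wormhole B0
# also appends the fp32_dest_acc_en=True variant, so every test below runs once
# per available option.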


def get_compute_kernel_options(compute_kernel_options):
if is_wormhole_b0():
fp32_dest_acc_en = compute_kernel_options
packer_l1_acc = False
compute_kernel_config = ttnn.WormholeComputeKernelConfig(
math_fidelity=ttnn.MathFidelity.HiFi4,
math_approx_mode=False,
fp32_dest_acc_en=fp32_dest_acc_en,
packer_l1_acc=packer_l1_acc,
)
else:
# Grayskull doesn't support fp32 dest accumulation, but passing a GS config in the test is fine
compute_kernel_config = ttnn.GrayskullComputeKernelConfig(
math_fidelity=ttnn.MathFidelity.HiFi4,
math_approx_mode=True,
)
return compute_kernel_config
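

# A minimal sanity-check sketch (hypothetical helper, not part of this commit):
# assuming the ttnn config classes imported above, it verifies that
# get_compute_kernel_options returns the arch-appropriate config type for every
# parametrized option. Calling it once before the tests would surface a
# misconfigured arch check early.
def _check_compute_kernel_options():
    for option in compute_kernel_options:
        config = get_compute_kernel_options(option)
        if is_wormhole_b0():
            assert isinstance(config, ttnn.WormholeComputeKernelConfig)
        else:
            assert isinstance(config, ttnn.GrayskullComputeKernelConfig)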


@pytest.mark.parametrize(
@@ -25,18 +53,21 @@
((10, 20, 32 * 3, 32 * 5), 2), # multiple tiles per core
),
)
def test_logsoftmax_for_dim_hw(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_for_dim_hw(shape_dim, compute_kernel_options, device):
device.enable_program_cache()

shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)

dev_x = ttl.tensor.Tensor(x, ttl.tensor.DataType.BFLOAT16).to(ttl.tensor.Layout.TILE).to(device)

tt_cpu = F.log_softmax(x, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim, compute_kernel_config=compute_kernel_config)

assert list(tt_npu.get_legacy_shape()) == list(tt_cpu.shape)
tt_dev = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).to_torch().to(torch.bfloat16)
@@ -54,12 +85,15 @@ def test_logsoftmax_for_dim_hw(shape_dim, device):
((2, 3, 32 * 4, 32 * 5), 2),
),
)
def test_logsoftmax_large_algorithm_for_dim_hw(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options, device):
device.enable_program_cache()

shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)

dev_x = ttl.tensor.Tensor(x, ttl.tensor.DataType.BFLOAT16).to(ttl.tensor.Layout.TILE).to(device)
@@ -70,7 +104,9 @@ def test_logsoftmax_large_algorithm_for_dim_hw(shape_dim, device):
if dim == 3
else ttl.operations.primary.MorehSoftmaxOpParallelizationStrategy.LARGE_H
)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim, None, strategy)
tt_npu = ttl.operations.primary.moreh_logsoftmax(
dev_x, dim, None, strategy, compute_kernel_config=compute_kernel_config
)

assert list(tt_npu.get_legacy_shape()) == list(tt_cpu.shape)
tt_dev = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).to_torch().to(torch.bfloat16)
@@ -90,11 +126,14 @@ def test_logsoftmax_large_algorithm_for_dim_hw(shape_dim, device):
((1, 1, 32 * 2 + 10, 32), 2),  # multiple tiles along dim
),
)
def test_logsoftmax_not_multiple_of_32_for_dim_hw(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)

dev_x = (
@@ -105,7 +144,7 @@ def test_logsoftmax_not_multiple_of_32_for_dim_hw(shape_dim, device):
)

tt_cpu = F.log_softmax(x, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim, compute_kernel_config=compute_kernel_config)
tt_npu = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).unpad_from_tile(shape)

assert list(tt_npu.get_legacy_shape()) == list(tt_cpu.shape)
@@ -128,19 +167,22 @@ def test_logsoftmax_not_multiple_of_32_for_dim_hw(shape_dim, device):
((15, 109, 32 * 2, 32 * 2), 0),  # multiple tiles per core
),
)
def test_logsoftmax_for_dim_nc(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_for_dim_nc(shape_dim, compute_kernel_options, device):
device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)

dev_x = (
ttl.tensor.Tensor(x, ttl.tensor.DataType.BFLOAT16).pad_to_tile(float("7")).to(ttl.tensor.Layout.TILE).to(device)
)

tt_cpu = F.log_softmax(x, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax(dev_x, dim, compute_kernel_config=compute_kernel_config)
tt_npu = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).unpad_from_tile(shape)

assert list(tt_npu.get_legacy_shape()) == list(tt_cpu.shape)
@@ -165,11 +207,14 @@ def test_logsoftmax_for_dim_nc(shape_dim, device):
((10, 20, 32 * 5, 32), 2), # multiple tiles per core
),
)
def test_logsoftmax_backward_for_dim_hw(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_backward_for_dim_hw(shape_dim, compute_kernel_options, device):
device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)

y = F.log_softmax(x, dim)
@@ -179,7 +224,9 @@ def test_logsoftmax_backward_for_dim_hw(shape_dim, device):
dev_dy = ttl.tensor.Tensor(dy, ttl.tensor.DataType.BFLOAT16).to(ttl.tensor.Layout.TILE).to(device)

y.backward(dy)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(dev_y, dev_dy, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(
dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
)

assert list(tt_npu.get_legacy_shape()) == list(x.grad.shape)
tt_dev = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).to_torch().to(torch.bfloat16)
@@ -197,11 +244,14 @@ def test_logsoftmax_backward_for_dim_hw(shape_dim, device):
((2, 3, 32 * 4, 32 * 5), 2),
),
)
def test_logsoftmax_backward_large_algorithm_for_dim_hw(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_backward_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options, device):
device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)

y = F.log_softmax(x, dim)
@@ -216,7 +266,9 @@ def test_logsoftmax_backward_large_algorithm_for_dim_hw(shape_dim, device):
if dim == 3
else ttl.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.LARGE_H
)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(dev_y, dev_dy, dim, None, strategy)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(
dev_y, dev_dy, dim, None, strategy, compute_kernel_config=compute_kernel_config
)

assert list(tt_npu.get_legacy_shape()) == list(x.grad.shape)
tt_dev = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).to_torch().to(torch.bfloat16)
@@ -236,11 +288,14 @@ def test_logsoftmax_backward_large_algorithm_for_dim_hw(shape_dim, device):
((1, 1, 32 * 2 + 10, 32), 2),  # multiple tiles along dim
),
)
def test_logsoftmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)

y = F.log_softmax(x, dim)
@@ -260,7 +315,9 @@ def test_logsoftmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, device):
)

y.backward(dy)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(dev_y, dev_dy, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(
dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
)
tt_npu = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).unpad_from_tile(shape)

assert list(tt_npu.get_legacy_shape()) == list(x.grad.shape)
@@ -283,11 +340,14 @@ def test_logsoftmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, device):
((15, 109, 32 * 2, 32 * 2), 0),  # multiple tiles per core
),
)
def test_logsoftmax_backward_for_dim_nc(shape_dim, device):
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_logsoftmax_backward_for_dim_nc(shape_dim, compute_kernel_options, device):
device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)

compute_kernel_config = get_compute_kernel_options(compute_kernel_options)

x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)

y = F.log_softmax(x, dim)
@@ -307,7 +367,9 @@ def test_logsoftmax_backward_for_dim_nc(shape_dim, device):
)

y.backward(dy)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(dev_y, dev_dy, dim)
tt_npu = ttl.operations.primary.moreh_logsoftmax_backward(
dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
)
tt_npu = tt_npu.cpu().to(ttl.tensor.Layout.ROW_MAJOR).unpad_from_tile(shape)
assert list(tt_npu.get_legacy_shape()) == list(x.grad.shape)
tt_dev = tt_npu.cpu().to_torch().to(torch.bfloat16)