#7511: adjust fidelity, subblock index order, and test for matmul

tenstorrent · May 12, 2024 · 06a9d66 · 06a9d66
1 parent 780b0f3
commit 06a9d66
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 5 deletions.
diff --git a/tests/ttnn/unit_tests/test_multi_device.py b/tests/ttnn/unit_tests/test_multi_device.py
@@ -449,15 +449,15 @@ def test_sharded_matmul(t3k_device_mesh):
 
 def test_4b_tensor(device_mesh):
     tensor = ttnn.from_torch(
-        torch.randn(1, 1, 64, 64),
+        torch.randn(1, 1, 32, 32),
         dtype=ttnn.bfloat4_b,
         layout=ttnn.TILE_LAYOUT,
         device=device_mesh,
         mesh_mapper=ReplicateTensorToMesh(device_mesh),
     )
     tensor = ttnn.to_device(tensor, device_mesh)
     x = ttnn.from_torch(
-        torch.randn(1, 1, 64, 64),
+        torch.randn(1, 1, 32, 32),
         dtype=ttnn.bfloat16,
         layout=ttnn.TILE_LAYOUT,
         device=device_mesh,

diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.cpp
@@ -513,8 +513,8 @@ tt::operations::primary::MatmulProgramConfig get_matmul_program_config(const Ten
 tuple<uint32_t, uint32_t> get_subblock_sizes(uint32_t m_tiles_per_core, uint32_t n_tiles_per_core, bool fp32_dest_acc_en) {
     uint32_t out_subblock_h, out_subblock_w;
     for (auto &subblock_hw : SUBBLOCK_HW_CHOICES) {
-        out_subblock_h = std::get<0>(subblock_hw);
-        out_subblock_w = std::get<1>(subblock_hw);
+        out_subblock_w = std::get<0>(subblock_hw);
+        out_subblock_h = std::get<1>(subblock_hw);
 	if ((out_subblock_h * out_subblock_w) <= 4 || !fp32_dest_acc_en) {
 	    if (m_tiles_per_core % out_subblock_h == 0 && n_tiles_per_core % out_subblock_w == 0) {
 		return {out_subblock_h, out_subblock_w};

diff --git a/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp b/tt_eager/tt_dnn/op_library/bmm/bmm_op.hpp
@@ -327,6 +327,8 @@ MatmulProgramConfig create_matmul_program_config(const Tensor& input_tensor_a, c
 namespace bmm_op_utils {
 using namespace tt::tt_metal;
 
+// Keep entries in mirrored pairs ({a, b} alongside {b, a}): different paths read
+// the tuple in opposite index order (0,1 vs 1,0) to meet test PCC requirements.
 constexpr std::array<tuple<uint32_t, uint32_t>, 20> SUBBLOCK_HW_CHOICES = {{
     {4, 2}, {2, 4}, {8, 1}, {1, 8},
     {7, 1}, {1, 7},
@@ -401,7 +403,7 @@ inline Tensor matmul(
             const auto& input_tensor_a = input_tensors.at(0);
             const auto& input_tensor_b = input_tensors.at(1);
             auto arch = input_tensor_a.device()->arch();
-            const auto increase_fidelity = is_program_config_default(program_config) || user_core_coord.has_value();
+            const auto increase_fidelity = is_program_config_default(program_config) && !user_core_coord.has_value();
             auto math_fidelity = increase_fidelity ? MathFidelity::HiFi2 : MathFidelity::LoFi;
             auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, math_fidelity);
             bool broadcast_batch = get_broadcast_batch(input_tensor_a, input_tensor_b, program_config);