From 5ce7a1e3a22435c195e0bf23ed7e70008cf7554a Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <isergachev@nvidia.com>
Date: Wed, 22 May 2024 09:08:38 -0700
Subject: [PATCH] PR #12942: [GPU] Fix cuDNN GEMM test tolerances.

Imported from GitHub PR https://github.com/openxla/xla/pull/12942

Set the absolute tolerances to the maximum absolute difference observed across 20 runs of these tests with different seed values.

Copybara import of the project:

--
c438b08ea7240c23ae98bc8dcf4ef45fa6d2e89c by Ilia Sergachev <isergachev@nvidia.com>:

[GPU] Fix cuDNN GEMM test tolerances.

Set the absolute tolerances to the maximum absolute difference observed across 20 runs of these tests with different seed values.

Merging this change closes #12942

FUTURE_COPYBARA_INTEGRATE_REVIEW=https://github.com/openxla/xla/pull/12942 from openxla:fix_test_cudnn c438b08ea7240c23ae98bc8dcf4ef45fa6d2e89c
PiperOrigin-RevId: 636188097
---
 tensorflow/lite/schema/BUILD                   |  1 -
 .../temporary/linear_layout_compose_asan.patch | 18 ------------------
 third_party/triton/temporary/series.bzl        |  1 -
 third_party/triton/workspace.bzl               |  4 ++--
 .../temporary/linear_layout_compose_asan.patch | 18 ------------------
 .../third_party/triton/temporary/series.bzl    |  1 -
 .../xla/third_party/triton/workspace.bzl       |  4 ++--
 .../xla/xla/service/gpu/fusions/cudnn_test.cc  |  4 ++--
 .../xla/service/gpu/ir_emitter_triton_cuda.cc  | 10 +++++++---
 9 files changed, 13 insertions(+), 48 deletions(-)
 delete mode 100644 third_party/triton/temporary/linear_layout_compose_asan.patch
 delete mode 100644 third_party/xla/third_party/triton/temporary/linear_layout_compose_asan.patch

diff --git a/tensorflow/lite/schema/BUILD b/tensorflow/lite/schema/BUILD
index b3e5fea78978c6..6155575b4048b5 100644
--- a/tensorflow/lite/schema/BUILD
+++ b/tensorflow/lite/schema/BUILD
@@ -15,7 +15,6 @@ filegroup(
     name = "tflite_internal_cc_3p_api_deps_src",
     srcs = [
         ":schema_fbs_srcs",
-        ":schema_utils.cc",
         ":schema_utils.h",
     ],
     visibility = [
diff --git a/third_party/triton/temporary/linear_layout_compose_asan.patch b/third_party/triton/temporary/linear_layout_compose_asan.patch
deleted file mode 100644
index eff83a166ac4a3..00000000000000
--- a/third_party/triton/temporary/linear_layout_compose_asan.patch
+++ /dev/null
@@ -1,18 +0,0 @@
-==== triton/lib/Tools/LinearLayout.cpp#2 - /google/src/cloud/shyshkov/triton_asan/triton/lib/Tools/LinearLayout.cpp ====
-# action=edit type=text
---- triton/lib/Tools/LinearLayout.cpp	2024-05-17 09:15:25.000000000 -0700
-+++ triton/lib/Tools/LinearLayout.cpp	2024-05-21 06:27:58.000000000 -0700
-@@ -397,9 +397,11 @@
-       for (auto [outDim, b] : llvm::zip(getOutDimNames(), basis)) {
-         bases.push_back({outDim, b});
-       }
--      auto newBases = llvm::make_second_range(outer.apply(bases));
-+
-+      auto outerBases =
-+          llvm::to_vector(llvm::make_second_range(outer.apply(bases)));
-       newInDimBases.push_back(
--          std::vector<int32_t>(newBases.begin(), newBases.end()));
-+          std::vector<int32_t>(outerBases.begin(), outerBases.end()));
-     }
-   }
-   return LinearLayout(std::move(newBases), outer.getOutDimNames());
diff --git a/third_party/triton/temporary/series.bzl b/third_party/triton/temporary/series.bzl
index b3d935c048fadd..a929f2c4a017f0 100644
--- a/third_party/triton/temporary/series.bzl
+++ b/third_party/triton/temporary/series.bzl
@@ -6,5 +6,4 @@ internal patch during the next triton integration process.
 """
 
 temporary_patch_list = [
-    "//third_party/triton/temporary:linear_layout_compose_asan.patch",
 ]
diff --git a/third_party/triton/workspace.bzl b/third_party/triton/workspace.bzl
index a257f1f3e44645..00a674f63d005b 100644
--- a/third_party/triton/workspace.bzl
+++ b/third_party/triton/workspace.bzl
@@ -8,8 +8,8 @@ load("//third_party/triton/xla_extensions:series.bzl", "extensions_files_patch_l
 def repo():
     """Imports Triton."""
 
-    TRITON_COMMIT = "cl634675237"
-    TRITON_SHA256 = "7151d057ee8443c2f45cbe18a7435a42f37e18f562e5d238b844b6e09fc560e6"
+    TRITON_COMMIT = "cl635840438"
+    TRITON_SHA256 = "707101b2e8366e63e80150c26f8ab660052099c91ca0c4fa4c713607fa75f318"
     tf_http_archive(
         name = "triton",
         sha256 = TRITON_SHA256,
diff --git a/third_party/xla/third_party/triton/temporary/linear_layout_compose_asan.patch b/third_party/xla/third_party/triton/temporary/linear_layout_compose_asan.patch
deleted file mode 100644
index eff83a166ac4a3..00000000000000
--- a/third_party/xla/third_party/triton/temporary/linear_layout_compose_asan.patch
+++ /dev/null
@@ -1,18 +0,0 @@
-==== triton/lib/Tools/LinearLayout.cpp#2 - /google/src/cloud/shyshkov/triton_asan/triton/lib/Tools/LinearLayout.cpp ====
-# action=edit type=text
---- triton/lib/Tools/LinearLayout.cpp	2024-05-17 09:15:25.000000000 -0700
-+++ triton/lib/Tools/LinearLayout.cpp	2024-05-21 06:27:58.000000000 -0700
-@@ -397,9 +397,11 @@
-       for (auto [outDim, b] : llvm::zip(getOutDimNames(), basis)) {
-         bases.push_back({outDim, b});
-       }
--      auto newBases = llvm::make_second_range(outer.apply(bases));
-+
-+      auto outerBases =
-+          llvm::to_vector(llvm::make_second_range(outer.apply(bases)));
-       newInDimBases.push_back(
--          std::vector<int32_t>(newBases.begin(), newBases.end()));
-+          std::vector<int32_t>(outerBases.begin(), outerBases.end()));
-     }
-   }
-   return LinearLayout(std::move(newBases), outer.getOutDimNames());
diff --git a/third_party/xla/third_party/triton/temporary/series.bzl b/third_party/xla/third_party/triton/temporary/series.bzl
index b3d935c048fadd..a929f2c4a017f0 100644
--- a/third_party/xla/third_party/triton/temporary/series.bzl
+++ b/third_party/xla/third_party/triton/temporary/series.bzl
@@ -6,5 +6,4 @@ internal patch during the next triton integration process.
 """
 
 temporary_patch_list = [
-    "//third_party/triton/temporary:linear_layout_compose_asan.patch",
 ]
diff --git a/third_party/xla/third_party/triton/workspace.bzl b/third_party/xla/third_party/triton/workspace.bzl
index a257f1f3e44645..00a674f63d005b 100644
--- a/third_party/xla/third_party/triton/workspace.bzl
+++ b/third_party/xla/third_party/triton/workspace.bzl
@@ -8,8 +8,8 @@ load("//third_party/triton/xla_extensions:series.bzl", "extensions_files_patch_l
 def repo():
     """Imports Triton."""
 
-    TRITON_COMMIT = "cl634675237"
-    TRITON_SHA256 = "7151d057ee8443c2f45cbe18a7435a42f37e18f562e5d238b844b6e09fc560e6"
+    TRITON_COMMIT = "cl635840438"
+    TRITON_SHA256 = "707101b2e8366e63e80150c26f8ab660052099c91ca0c4fa4c713607fa75f318"
     tf_http_archive(
         name = "triton",
         sha256 = TRITON_SHA256,
diff --git a/third_party/xla/xla/service/gpu/fusions/cudnn_test.cc b/third_party/xla/xla/service/gpu/fusions/cudnn_test.cc
index ba664e1029854e..cc380db0fffbc9 100644
--- a/third_party/xla/xla/service/gpu/fusions/cudnn_test.cc
+++ b/third_party/xla/xla/service/gpu/fusions/cudnn_test.cc
@@ -605,7 +605,7 @@ ENTRY r {
   ROOT r = bf16[192,128]{1,0} fusion(p0, p1), kind=kCustom, calls=fusion1,
     backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}}
 })",
-                            ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
+                            ErrorSpec{/*aabs=*/1, /*arel=*/1e-3}));
 }
 
 TEST_F(CuDnnFusionLevel3Test,
@@ -629,7 +629,7 @@ ENTRY r {
   ROOT r = bf16[4,3,16,128]{2,1,3,0} fusion(p0, p1), kind=kCustom, calls=fusion1,
     backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}}
 })",
-                            ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
+                            ErrorSpec{/*aabs=*/1, /*arel=*/1e-3}));
 }
 
 class ElementwiseTest : public CuDnnFusionExecutionTest,
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_cuda.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_cuda.cc
index ec2c6e8136f6b3..07014ca5852ab7 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_cuda.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_cuda.cc
@@ -75,9 +75,13 @@ absl::Status CreateTritonPipeline(
   pm.addPass(mt::gpu::createOptimizeDotOperandsPass(ccCuda.IsAtLeastAmpere()));
   pm.addPass(mlir::createCSEPass());
 
-  pm.addPass(mt::gpu::createPipelinePass(config.num_stages, config.num_warps,
-                                         config.num_ctas, ccAsInt));
-
+  // Even though we don't run on pre-Ampere architectures anymore, we keep this
+  // check for consistency with the upstream pipeline.
+  if (ccCuda.IsAtLeastAmpere()) {
+    pm.addPass(mt::gpu::createCombineTensorSelectAndIfPass());
+    pm.addPass(mt::gpu::createPipelinePass(config.num_stages, config.num_warps,
+                                           config.num_ctas, ccAsInt));
+  }
   if (!ccCuda.IsAtLeastHopper()) {
     pm.addPass(mt::gpu::createPrefetchPass());
   }