From f9811551ca3c7fd3c0b1a59d7093215e5aba890a Mon Sep 17 00:00:00 2001
From: Anlun Xu
Date: Tue, 21 Feb 2023 13:28:30 -0800
Subject: [PATCH 1/2] [xla:gpu] Serialize autotuning on GPU devices

This change uses UpgradeableReaderMutexLock to protect runtime autotuning.
Before this change, a reader mutex is used, which may cause multiple
instances of runtime autotuning to run concurrently. Removing such
concurrency helps to minimize autotuning noise.

PiperOrigin-RevId: 511290070
---
 tensorflow/compiler/xla/service/gpu/BUILD         |  1 +
 .../xla/service/gpu/gpu_executable.cc             | 17 ++++---
 .../compiler/xla/service/gpu/gpu_executable.h     |  4 +-
 .../compiler/xla/service/gpu/runtime/BUILD        |  2 +
 .../xla/service/gpu/runtime/executable.cc         |  4 +-
 .../xla/service/gpu/runtime/executable.h          |  2 +
 .../compiler/xla/service/gpu/runtime/gemm.cc      | 51 ++++++++++++-------
 7 files changed, 54 insertions(+), 27 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 33a057c05fa09c..4562b89791a519 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -602,6 +602,7 @@ cc_library(
         "while_thunk.h",
     ],
     deps = [
+        ":non_atomically_upgradeable_rw_lock",
         ":backend_configs_cc",
         ":buffer_allocations",
         ":cusolver_context",
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 5cc36892857e05..a5b9099bda1a20 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -46,6 +46,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/collectives.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/cublas_lt_matmul.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/executable.h"
@@ -457,7 +458,8 @@ static Status ExecuteXlaRuntime(const std::string& module_name,
                                 const std::vector<uint8_t>& binary,
                                 const BufferAllocations& buffer_allocations,
                                 const BufferAllocation* temp_buffer,
-                                bool block_host_until_done) {
+                                bool block_host_until_done,
+                                NonAtomicallyUpgradeableRWLock& gpu_lock) {
   uint64_t start_nanos = tsl::Env::Default()->NowNanos();

   tsl::profiler::TraceMe hlo_module_activity(
@@ -474,7 +476,7 @@ static Status ExecuteXlaRuntime(const std::string& module_name,
       });

   auto executed = gpu_runtime_executable.Execute(
-      run_options, asm_text, binary, buffer_allocations, temp_buffer);
+      run_options, asm_text, binary, buffer_allocations, gpu_lock, temp_buffer);
   if (!executed.ok()) return executed;

   return MaybeSyncAndProfile(
@@ -497,7 +499,7 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
   // Lock the GPU with a shared lock so that we don't interfere with autotuning
   // that may be running during JIT compilation while allowing multiple XLA
   // computations to use the same GPU simultaneously.
-  absl::ReaderMutexLock gpu_lock(&GetGpuMutex(executor));
+  NonAtomicallyUpgradeableRWLock gpu_lock(&GetGpuMutex(executor));

   const GpuExecutable::BufferAllocToDeviceMemoryMap* globals;
   {
@@ -628,8 +630,8 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
     buffers_in_result.insert(result_buffer);
   }

-  TF_RETURN_IF_ERROR(ExecuteThunksOrXlaRuntime(run_options, buffer_allocations,
-                                               block_host_until_done));
+  TF_RETURN_IF_ERROR(ExecuteThunksOrXlaRuntime(
+      run_options, buffer_allocations, block_host_until_done, gpu_lock));

   // Free all temporary allocations.
   TF_RETURN_IF_ERROR(
@@ -644,7 +646,8 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(

 Status GpuExecutable::ExecuteThunksOrXlaRuntime(
     const ServiceExecutableRunOptions* run_options,
-    const BufferAllocations& buffer_allocations, bool block_host_until_done) {
+    const BufferAllocations& buffer_allocations, bool block_host_until_done,
+    NonAtomicallyUpgradeableRWLock& gpu_lock) {
   TF_RETURN_IF_ERROR(
       CheckCompatibilityWithServiceExecutableRunOptions(run_options));

@@ -676,7 +679,7 @@ Status GpuExecutable::ExecuteThunksOrXlaRuntime(
     }
     return ExecuteXlaRuntime(module_name_, unique_id, *gpu_runtime_executable_,
                              run_options, text_, binary_, buffer_allocations,
-                             temp_buffer, block_host_until_done);
+                             temp_buffer, block_host_until_done, gpu_lock);
   }

   return FailedPrecondition("Expected XLA gpu executable is not supplied.");
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 989722dfcfca15..d4e326f3fee734 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/executable.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
@@ -200,7 +201,8 @@ class GpuExecutable : public Executable {
   // GPU execution completes.
   Status ExecuteThunksOrXlaRuntime(
       const ServiceExecutableRunOptions* run_options,
-      const BufferAllocations& buffer_allocations, bool block_host_until_done);
+      const BufferAllocations& buffer_allocations, bool block_host_until_done,
+      NonAtomicallyUpgradeableRWLock& gpu_lock);

   using BufferAllocToDeviceMemoryMap =
       absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;
diff --git a/tensorflow/compiler/xla/service/gpu/runtime/BUILD b/tensorflow/compiler/xla/service/gpu/runtime/BUILD
index 0b1e35d7ed9ee3..723d8d725f9ba9 100644
--- a/tensorflow/compiler/xla/service/gpu/runtime/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/runtime/BUILD
@@ -119,6 +119,7 @@ cc_library(
         "//tensorflow/compiler/xla/runtime:module_registry",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service/gpu:buffer_allocations",
+        "//tensorflow/compiler/xla/service/gpu:non_atomically_upgradeable_rw_lock",
         "//tensorflow/compiler/xla/stream_executor/gpu:gpu_stream",
         "//tensorflow/tsl/protobuf:dnn_proto_cc",
         "@com_google_absl//absl/strings",
@@ -160,6 +161,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/gpu:matmul_utils",
         "//tensorflow/compiler/xla/stream_executor:blas",
         "//tensorflow/compiler/xla/stream_executor:device_memory",
+        "//tensorflow/compiler/xla/service/gpu:non_atomically_upgradeable_rw_lock",
     ] + if_cuda_is_configured([
         "//tensorflow/compiler/xla/service/gpu:gemm_algorithm_picker",
         "//tensorflow/compiler/xla/stream_executor/gpu:redzone_allocator",
diff --git a/tensorflow/compiler/xla/service/gpu/runtime/executable.cc b/tensorflow/compiler/xla/service/gpu/runtime/executable.cc
index 028c32b752b38b..8ac2425fbde851 100644
--- a/tensorflow/compiler/xla/service/gpu/runtime/executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/runtime/executable.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/runtime/executable.h"
 #include "tensorflow/compiler/xla/runtime/ffi.h"
 #include "tensorflow/compiler/xla/runtime/jit_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/cholesky.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/conv.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/cublas_lt_matmul.h"
@@ -301,6 +302,7 @@ Status GpuRuntimeExecutable::Execute(
     const ServiceExecutableRunOptions* run_options, const std::string& asm_text,
     const std::vector<uint8_t>& binary,
     const BufferAllocations& buffer_allocations,
+    NonAtomicallyUpgradeableRWLock& gpu_lock,
     const BufferAllocation* temp_alloc) {
   // We pass a pointer to the executable through UserData, so that we can
   // get access to other exported functions from custom call handlers.
@@ -380,7 +382,7 @@ Status GpuRuntimeExecutable::Execute(
   runtime::CustomCall::UserData user_data(
       run_options, &executable, &debug_options_, &temp_buffer, &asm_text,
       &ffi_state.value(), &binary, &kernels, &gemm_configs, &conv_runners,
-      &collectives_, &fft_plans, &send_recv_events,
+      &collectives_, &fft_plans, &send_recv_events, &gpu_lock,
 #if GOOGLE_CUDA
       // Auxiliary data that is available only if compiled with CUDA support.
       &matmul_plans, &graph_instances,
diff --git a/tensorflow/compiler/xla/service/gpu/runtime/executable.h b/tensorflow/compiler/xla/service/gpu/runtime/executable.h
index 93fd9478390fad..8f090f3f5cda3d 100644
--- a/tensorflow/compiler/xla/service/gpu/runtime/executable.h
+++ b/tensorflow/compiler/xla/service/gpu/runtime/executable.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/runtime/jit_executable.h"
 #include "tensorflow/compiler/xla/runtime/module_registry.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/collectives.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/conv.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/cublas_lt_matmul.h"
@@ -104,6 +105,7 @@ class GpuRuntimeExecutable {
                  const std::string& asm_text,
                  const std::vector<uint8_t>& binary,
                  const BufferAllocations& buffer_allocations,
+                 NonAtomicallyUpgradeableRWLock& gpu_lock,
                  const BufferAllocation* temp_alloc = nullptr);

   // Returns object file behind the runtime executable. This object file can
diff --git a/tensorflow/compiler/xla/service/gpu/runtime/gemm.cc b/tensorflow/compiler/xla/service/gpu/runtime/gemm.cc
index 1d20e1e6532898..73d712d2e18f54 100644
--- a/tensorflow/compiler/xla/service/gpu/runtime/gemm.cc
+++ b/tensorflow/compiler/xla/service/gpu/runtime/gemm.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/runtime/custom_call.h"
 #include "tensorflow/compiler/xla/runtime/executable.h"
 #include "tensorflow/compiler/xla/service/gpu/matmul_utils.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/support.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
@@ -43,14 +44,12 @@ using xla::runtime::State;
 using xla::runtime::StridedMemrefView;

 #if GOOGLE_CUDA
-// TODO(anlunx): Runtime autotuning should be protected by an exclusive lock to
-// achieve precision. Right now it is protected by a reader lock acquired by
-// GpuExecutable::ExecuteAsyncOnStreamImpl, so it may run cuncurrently with
-// another runtime autotuning.
+
 Status DoRuntimeAutotuning(se::Stream* stream, GemmConfig& config,
                            se::DeviceMemoryBase lhs, se::DeviceMemoryBase rhs,
                            se::DeviceMemoryBase out, const Shape& output_shape,
-                           double beta, const DebugOptions* debug_options) {
+                           double beta, const DebugOptions* debug_options,
+                           NonAtomicallyUpgradeableRWLock* gpu_lock) {
   VLOG(3) << "Running GEMM runtime autotuning";
   std::vector<se::blas::AlgorithmType> algorithms;
   stream->parent()->GetBlasGemmAlgorithms(stream, &algorithms);
@@ -67,6 +66,11 @@ Status DoRuntimeAutotuning(se::Stream* stream, GemmConfig& config,
       CreateRedzoneAllocator(stream, stream->parent()->GetAllocator(),
                              *debug_options, autotune_config);

+  // Upgrade the reader lock for execution to a writer lock to protect runtime
+  // autotuning.
+  NonAtomicallyUpgradeableRWLock::WriterLock writer_lock =
+      gpu_lock->UpgradeToWriterMutexLock();
+
   TF_ASSIGN_OR_RETURN(
       auto best_algorithm_idx,
       GetBestBlasAlgorithm(
@@ -96,6 +100,7 @@ Status DoRuntimeAutotuning(se::Stream* stream, GemmConfig& config,

 static absl::Status GemmImpl(const ServiceExecutableRunOptions* run_options,
                              const DebugOptions* debug_options,
+                             NonAtomicallyUpgradeableRWLock* gpu_lock,
                              State<GemmConfig> state, StridedMemrefView lhs,
                              StridedMemrefView rhs, StridedMemrefView out,
                              int64_t algorithm, double alpha_real,
@@ -110,27 +115,36 @@ static absl::Status GemmImpl(const ServiceExecutableRunOptions* run_options,
   Shape output_shape = ToShape(out);

   // Get the gemm config from the state.
-  absl::StatusOr<GemmConfig*> config = state.GetOrCreate([&] {
+  absl::StatusOr<GemmConfig*> config_from_state = state.GetOrCreate([&] {
     StatusOr<GemmConfig> gemm_config =
         GetGemmConfig(lhs, rhs, out, algorithm, alpha_real, alpha_imag, beta,
                       dot_dims.lhs_batch, dot_dims.lhs_contract,
                       dot_dims.rhs_batch, dot_dims.rhs_contract);
+    return ToAbsl(gemm_config);
+  });
+
+  if (!config_from_state.ok()) return config_from_state.status();
+  GemmConfig* gemm_config = *config_from_state;
+
+  // Set the gemm algorithm by runtime autotuning. We do runtime autotuning
+  // outside of state.GetOrCreate() because otherwise it would be a potential
+  // deadlock.
+  if (gemm_config->algorithm == stream_executor::blas::kRuntimeAutotuning) {
 #if GOOGLE_CUDA
-    if (!gemm_config.ok()) return ToAbsl(gemm_config);
-    if (gemm_config->algorithm == stream_executor::blas::kRuntimeAutotuning) {
-      auto status =
-          DoRuntimeAutotuning(stream, *gemm_config, lhs_data, rhs_data,
-                              output_data, output_shape, beta, debug_options);
-      if (!status.ok())
-        return absl::StatusOr<GemmConfig>(
-            absl::InternalError(status.ToString()));
+    auto status = DoRuntimeAutotuning(stream, *gemm_config, lhs_data, rhs_data,
+                                      output_data, output_shape, beta,
+                                      debug_options, gpu_lock);
+    if (!status.ok()) {
+      return absl::InternalError(status.ToString());
     }
+#else
+    return absl::InternalError(
+        "Failed to run runtime autotuner because CUDA is not enabled");
 #endif
-    return ToAbsl(gemm_config);
-  });
-  if (!config.ok()) return config.status();
+  }

-  Status executed = RunGemm(**config, lhs_data, rhs_data, output_data, stream);
+  Status executed =
+      RunGemm(*gemm_config, lhs_data, rhs_data, output_data, stream);

   if (!executed.ok()) return ToAbslStatus(executed);

@@ -142,6 +156,7 @@ XLA_RUNTIME_DEFINE_CUSTOM_CALL(
     CustomCall::Bind("xla.gpu.gemm")
         .UserData<const ServiceExecutableRunOptions*>()
         .UserData<const DebugOptions*>()
+        .UserData<NonAtomicallyUpgradeableRWLock*>()
         .State<GemmConfig>("uid")
         .Arg<StridedMemrefView>()  // lhs
         .Arg<StridedMemrefView>()  // rhs

From 737d6dfac356e5cb3a254f182ae89e52ea94bc83 Mon Sep 17 00:00:00 2001
From: "guozhong.zhuang"
Date: Tue, 21 Feb 2023 09:21:35 -0800
Subject: [PATCH 2/2] INT8 tests do not run on Windows with --config=mkl