From f9811551ca3c7fd3c0b1a59d7093215e5aba890a Mon Sep 17 00:00:00 2001
From: Anlun Xu
Date: Tue, 21 Feb 2023 13:28:30 -0800
Subject: [PATCH 1/2] [xla:gpu] Serialize autotuning on GPU devices

This change uses UpgradeableReaderMutexLock to protect runtime autotuning.
Before this change, a reader mutex is used, which may cause multiple
instances of runtime autotuning to run concurrently. Removing such
concurrency helps to minimize autotuning noise.

PiperOrigin-RevId: 511290070
---
 tensorflow/compiler/xla/service/gpu/BUILD         |  1 +
 .../xla/service/gpu/gpu_executable.cc             | 17 ++++---
 .../compiler/xla/service/gpu/gpu_executable.h     |  4 +-
 .../compiler/xla/service/gpu/runtime/BUILD        |  2 +
 .../xla/service/gpu/runtime/executable.cc         |  4 +-
 .../xla/service/gpu/runtime/executable.h          |  2 +
 .../compiler/xla/service/gpu/runtime/gemm.cc      | 51 ++++++++++++-------
 7 files changed, 54 insertions(+), 27 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 33a057c05fa09c..4562b89791a519 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -602,6 +602,7 @@ cc_library(
         "while_thunk.h",
     ],
     deps = [
+        ":non_atomically_upgradeable_rw_lock",
         ":backend_configs_cc",
         ":buffer_allocations",
         ":cusolver_context",
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 5cc36892857e05..a5b9099bda1a20 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -46,6 +46,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/collectives.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/cublas_lt_matmul.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/executable.h"
@@ -457,7 +458,8 @@ static Status ExecuteXlaRuntime(const std::string& module_name,
                                 const std::vector<uint8_t>& binary,
                                 const BufferAllocations& buffer_allocations,
                                 const BufferAllocation* temp_buffer,
-                                bool block_host_until_done) {
+                                bool block_host_until_done,
+                                NonAtomicallyUpgradeableRWLock& gpu_lock) {
   uint64_t start_nanos = tsl::Env::Default()->NowNanos();

   tsl::profiler::TraceMe hlo_module_activity(
@@ -474,7 +476,7 @@ static Status ExecuteXlaRuntime(const std::string& module_name,
       });

   auto executed = gpu_runtime_executable.Execute(
-      run_options, asm_text, binary, buffer_allocations, temp_buffer);
+      run_options, asm_text, binary, buffer_allocations, gpu_lock, temp_buffer);
   if (!executed.ok()) return executed;

   return MaybeSyncAndProfile(
@@ -497,7 +499,7 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
   // Lock the GPU with a shared lock so that we don't interfere with autotuning
   // that may be running during JIT compilation while allowing multiple XLA
   // computations to use the same GPU simultaneously.
-  absl::ReaderMutexLock gpu_lock(&GetGpuMutex(executor));
+  NonAtomicallyUpgradeableRWLock gpu_lock(&GetGpuMutex(executor));

   const GpuExecutable::BufferAllocToDeviceMemoryMap* globals;
   {
@@ -628,8 +630,8 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
     buffers_in_result.insert(result_buffer);
   }

-  TF_RETURN_IF_ERROR(ExecuteThunksOrXlaRuntime(run_options, buffer_allocations,
-                                               block_host_until_done));
+  TF_RETURN_IF_ERROR(ExecuteThunksOrXlaRuntime(
+      run_options, buffer_allocations, block_host_until_done, gpu_lock));

   // Free all temporary allocations.
   TF_RETURN_IF_ERROR(
@@ -644,7 +646,8 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(

 Status GpuExecutable::ExecuteThunksOrXlaRuntime(
     const ServiceExecutableRunOptions* run_options,
-    const BufferAllocations& buffer_allocations, bool block_host_until_done) {
+    const BufferAllocations& buffer_allocations, bool block_host_until_done,
+    NonAtomicallyUpgradeableRWLock& gpu_lock) {
   TF_RETURN_IF_ERROR(
       CheckCompatibilityWithServiceExecutableRunOptions(run_options));

@@ -676,7 +679,7 @@ Status GpuExecutable::ExecuteThunksOrXlaRuntime(
     }
     return ExecuteXlaRuntime(module_name_, unique_id, *gpu_runtime_executable_,
                              run_options, text_, binary_, buffer_allocations,
-                             temp_buffer, block_host_until_done);
+                             temp_buffer, block_host_until_done, gpu_lock);
   }

   return FailedPrecondition("Expected XLA gpu executable is not supplied.");
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 989722dfcfca15..d4e326f3fee734 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/executable.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
@@ -200,7 +201,8 @@ class GpuExecutable : public Executable {
   // GPU execution completes.
   Status ExecuteThunksOrXlaRuntime(
       const ServiceExecutableRunOptions* run_options,
-      const BufferAllocations& buffer_allocations, bool block_host_until_done);
+      const BufferAllocations& buffer_allocations, bool block_host_until_done,
+      NonAtomicallyUpgradeableRWLock& gpu_lock);

   using BufferAllocToDeviceMemoryMap =
       absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;
diff --git a/tensorflow/compiler/xla/service/gpu/runtime/BUILD b/tensorflow/compiler/xla/service/gpu/runtime/BUILD
index 0b1e35d7ed9ee3..723d8d725f9ba9 100644
--- a/tensorflow/compiler/xla/service/gpu/runtime/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/runtime/BUILD
@@ -119,6 +119,7 @@ cc_library(
         "//tensorflow/compiler/xla/runtime:module_registry",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service/gpu:buffer_allocations",
+        "//tensorflow/compiler/xla/service/gpu:non_atomically_upgradeable_rw_lock",
         "//tensorflow/compiler/xla/stream_executor/gpu:gpu_stream",
         "//tensorflow/tsl/protobuf:dnn_proto_cc",
         "@com_google_absl//absl/strings",
@@ -160,6 +161,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/gpu:matmul_utils",
         "//tensorflow/compiler/xla/stream_executor:blas",
         "//tensorflow/compiler/xla/stream_executor:device_memory",
+        "//tensorflow/compiler/xla/service/gpu:non_atomically_upgradeable_rw_lock",
     ] + if_cuda_is_configured([
         "//tensorflow/compiler/xla/service/gpu:gemm_algorithm_picker",
         "//tensorflow/compiler/xla/stream_executor/gpu:redzone_allocator",
diff --git a/tensorflow/compiler/xla/service/gpu/runtime/executable.cc b/tensorflow/compiler/xla/service/gpu/runtime/executable.cc
index 028c32b752b38b..8ac2425fbde851 100644
--- a/tensorflow/compiler/xla/service/gpu/runtime/executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/runtime/executable.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/runtime/executable.h"
 #include "tensorflow/compiler/xla/runtime/ffi.h"
 #include "tensorflow/compiler/xla/runtime/jit_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/cholesky.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/conv.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/cublas_lt_matmul.h"
@@ -301,6 +302,7 @@ Status GpuRuntimeExecutable::Execute(
     const ServiceExecutableRunOptions* run_options, const std::string& asm_text,
     const std::vector<uint8_t>& binary,
     const BufferAllocations& buffer_allocations,
+    NonAtomicallyUpgradeableRWLock& gpu_lock,
     const BufferAllocation* temp_alloc) {
   // We pass a pointer to the executable through UserData, so that we can
   // get access to other exported functions from custom call handlers.
@@ -380,7 +382,7 @@ Status GpuRuntimeExecutable::Execute(
   runtime::CustomCall::UserData user_data(
       run_options, &executable, &debug_options_, &temp_buffer, &asm_text,
       &ffi_state.value(), &binary, &kernels, &gemm_configs, &conv_runners,
-      &collectives_, &fft_plans, &send_recv_events,
+      &collectives_, &fft_plans, &send_recv_events, &gpu_lock,
 #if GOOGLE_CUDA
       // Auxiliary data that is available only if compiled with CUDA support.
       &matmul_plans, &graph_instances,
diff --git a/tensorflow/compiler/xla/service/gpu/runtime/executable.h b/tensorflow/compiler/xla/service/gpu/runtime/executable.h
index 93fd9478390fad..8f090f3f5cda3d 100644
--- a/tensorflow/compiler/xla/service/gpu/runtime/executable.h
+++ b/tensorflow/compiler/xla/service/gpu/runtime/executable.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/runtime/jit_executable.h"
 #include "tensorflow/compiler/xla/runtime/module_registry.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/collectives.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/conv.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/cublas_lt_matmul.h"
@@ -104,6 +105,7 @@ class GpuRuntimeExecutable {
                  const std::string& asm_text,
                  const std::vector<uint8_t>& binary,
                  const BufferAllocations& buffer_allocations,
+                 NonAtomicallyUpgradeableRWLock& gpu_lock,
                  const BufferAllocation* temp_alloc = nullptr);

   // Returns object file behind the runtime executable. This object file can
diff --git a/tensorflow/compiler/xla/service/gpu/runtime/gemm.cc b/tensorflow/compiler/xla/service/gpu/runtime/gemm.cc
index 1d20e1e6532898..73d712d2e18f54 100644
--- a/tensorflow/compiler/xla/service/gpu/runtime/gemm.cc
+++ b/tensorflow/compiler/xla/service/gpu/runtime/gemm.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/runtime/custom_call.h"
 #include "tensorflow/compiler/xla/runtime/executable.h"
 #include "tensorflow/compiler/xla/service/gpu/matmul_utils.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/support.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
@@ -43,14 +44,12 @@ using xla::runtime::State;
 using xla::runtime::StridedMemrefView;

 #if GOOGLE_CUDA
-// TODO(anlunx): Runtime autotuning should be protected by an exclusive lock to
-// achieve precision. Right now it is protected by a reader lock acquired by
-// GpuExecutable::ExecuteAsyncOnStreamImpl, so it may run cuncurrently with
-// another runtime autotuning.
+
 Status DoRuntimeAutotuning(se::Stream* stream, GemmConfig& config,
                            se::DeviceMemoryBase lhs, se::DeviceMemoryBase rhs,
                            se::DeviceMemoryBase out, const Shape& output_shape,
-                           double beta, const DebugOptions* debug_options) {
+                           double beta, const DebugOptions* debug_options,
+                           NonAtomicallyUpgradeableRWLock* gpu_lock) {
   VLOG(3) << "Running GEMM runtime autotuning";
   std::vector<se::blas::AlgorithmType> algorithms;
   stream->parent()->GetBlasGemmAlgorithms(stream, &algorithms);
@@ -67,6 +66,11 @@ Status DoRuntimeAutotuning(se::Stream* stream, GemmConfig& config,
       CreateRedzoneAllocator(stream, stream->parent()->GetAllocator(),
                              *debug_options, autotune_config);

+  // Upgrade the reader lock for execution to a writer lock to protect runtime
+  // autotuning.
+  NonAtomicallyUpgradeableRWLock::WriterLock writer_lock =
+      gpu_lock->UpgradeToWriterMutexLock();
+
   TF_ASSIGN_OR_RETURN(
       auto best_algorithm_idx,
       GetBestBlasAlgorithm(
@@ -96,6 +100,7 @@ Status DoRuntimeAutotuning(se::Stream* stream, GemmConfig& config,

 static absl::Status GemmImpl(const ServiceExecutableRunOptions* run_options,
                              const DebugOptions* debug_options,
+                             NonAtomicallyUpgradeableRWLock* gpu_lock,
                              State<GemmConfig> state, StridedMemrefView lhs,
                              StridedMemrefView rhs, StridedMemrefView out,
                              int64_t algorithm, double alpha_real,
@@ -110,27 +115,36 @@ static absl::Status GemmImpl(const ServiceExecutableRunOptions* run_options,
   Shape output_shape = ToShape(out);

   // Get the gemm config from the state.
-  absl::StatusOr<GemmConfig*> config = state.GetOrCreate([&] {
+  absl::StatusOr<GemmConfig*> config_from_state = state.GetOrCreate([&] {
     StatusOr<GemmConfig> gemm_config =
         GetGemmConfig(lhs, rhs, out, algorithm, alpha_real, alpha_imag, beta,
                       dot_dims.lhs_batch, dot_dims.lhs_contract,
                       dot_dims.rhs_batch, dot_dims.rhs_contract);
+    return ToAbsl(gemm_config);
+  });
+
+  if (!config_from_state.ok()) return config_from_state.status();
+  GemmConfig* gemm_config = *config_from_state;
+
+  // Set the gemm algorithm by runtime autotuning. We do runtime autotuning
+  // outside of state.GetOrCreate() because otherwise it would be a potential
+  // deadlock.
+  if (gemm_config->algorithm == stream_executor::blas::kRuntimeAutotuning) {
 #if GOOGLE_CUDA
-    if (!gemm_config.ok()) return ToAbsl(gemm_config);
-    if (gemm_config->algorithm == stream_executor::blas::kRuntimeAutotuning) {
-      auto status =
-          DoRuntimeAutotuning(stream, *gemm_config, lhs_data, rhs_data,
-                              output_data, output_shape, beta, debug_options);
-      if (!status.ok())
-        return absl::StatusOr<GemmConfig>(
-            absl::InternalError(status.ToString()));
+    auto status = DoRuntimeAutotuning(stream, *gemm_config, lhs_data, rhs_data,
+                                      output_data, output_shape, beta,
+                                      debug_options, gpu_lock);
+    if (!status.ok()) {
+      return absl::InternalError(status.ToString());
     }
+#else
+    return absl::InternalError(
+        "Failed to run runtime autotuner because CUDA is not enabled");
 #endif
-    return ToAbsl(gemm_config);
-  });
-  if (!config.ok()) return config.status();
+  }

-  Status executed = RunGemm(**config, lhs_data, rhs_data, output_data, stream);
+  Status executed =
+      RunGemm(*gemm_config, lhs_data, rhs_data, output_data, stream);

   if (!executed.ok()) return ToAbslStatus(executed);

@@ -142,6 +156,7 @@ XLA_RUNTIME_DEFINE_CUSTOM_CALL(
     CustomCall::Bind("xla.gpu.gemm")
         .UserData<const ServiceExecutableRunOptions*>()
         .UserData<const DebugOptions*>()
+        .UserData<NonAtomicallyUpgradeableRWLock*>()
         .State<GemmConfig>("uid")
         .Arg<StridedMemrefView>()  // lhs
         .Arg<StridedMemrefView>()  // rhs

From 737d6dfac356e5cb3a254f182ae89e52ea94bc83 Mon Sep 17 00:00:00 2001
From: "guozhong.zhuang"
Date: Tue, 21 Feb 2023 09:21:35 -0800
Subject: [PATCH 2/2] INT8 tests do not run on Windows with --config=mkl