Merge pull request #59842 from tensorflow/venkat-patch-1
Merge pull request #59581 from Intel-tensorflow:security_fix_quantiz
learning-to-play committed Mar 1, 2023
2 parents c8eb0b7 + 737d6df commit 37f8b09
Showing 7 changed files with 54 additions and 27 deletions.
1 change: 1 addition & 0 deletions tensorflow/compiler/xla/service/gpu/BUILD
@@ -602,6 +602,7 @@ cc_library(
         "while_thunk.h",
     ],
     deps = [
+        ":non_atomically_upgradeable_rw_lock",
         ":backend_configs_cc",
         ":buffer_allocations",
         ":cusolver_context",
17 changes: 10 additions & 7 deletions tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -46,6 +46,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/collectives.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/cublas_lt_matmul.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/executable.h"
@@ -457,7 +458,8 @@ static Status ExecuteXlaRuntime(const std::string& module_name,
                                 const std::vector<uint8_t>& binary,
                                 const BufferAllocations& buffer_allocations,
                                 const BufferAllocation* temp_buffer,
-                                bool block_host_until_done) {
+                                bool block_host_until_done,
+                                NonAtomicallyUpgradeableRWLock& gpu_lock) {
   uint64_t start_nanos = tsl::Env::Default()->NowNanos();
 
   tsl::profiler::TraceMe hlo_module_activity(
@@ -474,7 +476,7 @@ static Status ExecuteXlaRuntime(const std::string& module_name,
       });
 
   auto executed = gpu_runtime_executable.Execute(
-      run_options, asm_text, binary, buffer_allocations, temp_buffer);
+      run_options, asm_text, binary, buffer_allocations, gpu_lock, temp_buffer);
   if (!executed.ok()) return executed;
 
   return MaybeSyncAndProfile(
@@ -497,7 +499,7 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
   // Lock the GPU with a shared lock so that we don't interfere with autotuning
   // that may be running during JIT compilation while allowing multiple XLA
   // computations to use the same GPU simultaneously.
-  absl::ReaderMutexLock gpu_lock(&GetGpuMutex(executor));
+  NonAtomicallyUpgradeableRWLock gpu_lock(&GetGpuMutex(executor));
 
   const GpuExecutable::BufferAllocToDeviceMemoryMap* globals;
   {
@@ -628,8 +630,8 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
     buffers_in_result.insert(result_buffer);
   }
 
-  TF_RETURN_IF_ERROR(ExecuteThunksOrXlaRuntime(run_options, buffer_allocations,
-                                               block_host_until_done));
+  TF_RETURN_IF_ERROR(ExecuteThunksOrXlaRuntime(
+      run_options, buffer_allocations, block_host_until_done, gpu_lock));
 
   // Free all temporary allocations.
   TF_RETURN_IF_ERROR(
@@ -644,7 +646,8 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
 
 Status GpuExecutable::ExecuteThunksOrXlaRuntime(
     const ServiceExecutableRunOptions* run_options,
-    const BufferAllocations& buffer_allocations, bool block_host_until_done) {
+    const BufferAllocations& buffer_allocations, bool block_host_until_done,
+    NonAtomicallyUpgradeableRWLock& gpu_lock) {
   TF_RETURN_IF_ERROR(
       CheckCompatibilityWithServiceExecutableRunOptions(run_options));
 
@@ -676,7 +679,7 @@ Status GpuExecutable::ExecuteThunksOrXlaRuntime(
     }
     return ExecuteXlaRuntime(module_name_, unique_id, *gpu_runtime_executable_,
                              run_options, text_, binary_, buffer_allocations,
-                             temp_buffer, block_host_until_done);
+                             temp_buffer, block_host_until_done, gpu_lock);
   }
 
   return FailedPrecondition("Expected XLA gpu executable is not supplied.");
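The NonAtomicallyUpgradeableRWLock that replaces absl::ReaderMutexLock above is declared in non_atomically_upgradeable_rw_lock.h, which is not part of this diff. As a minimal sketch of the semantics the call sites rely on (only the class and method names are taken from the diff; the body is an assumption, written over absl::Mutex):

#include "absl/synchronization/mutex.h"

// Illustrative sketch only: holds the mutex in shared (reader) mode for its
// lifetime, and can temporarily trade that shared hold for an exclusive one.
class NonAtomicallyUpgradeableRWLock {
 public:
  explicit NonAtomicallyUpgradeableRWLock(absl::Mutex* mu) : mu_(mu) {
    mu_->ReaderLock();  // Shared hold for the lifetime of this object.
  }
  ~NonAtomicallyUpgradeableRWLock() { mu_->ReaderUnlock(); }

  // RAII guard: while it is alive, the calling thread holds the mutex
  // exclusively instead of in shared mode.
  class WriterLock {
   public:
    explicit WriterLock(absl::Mutex* mu) : mu_(mu) {
      mu_->ReaderUnlock();  // Drop the shared hold (the non-atomic step).
      mu_->Lock();          // Re-acquire exclusively.
    }
    ~WriterLock() {
      mu_->Unlock();      // Give up exclusivity...
      mu_->ReaderLock();  // ...and restore the shared hold for the caller.
    }

   private:
    absl::Mutex* mu_;
  };

  WriterLock UpgradeToWriterMutexLock() { return WriterLock(mu_); }

 private:
  absl::Mutex* mu_;
};

The upgrade is deliberately non-atomic: the shared hold is released before the exclusive hold is acquired, so the upgrade cannot deadlock against other readers, at the cost that another computation may briefly run in between.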
4 changes: 3 additions & 1 deletion tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/executable.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
@@ -200,7 +201,8 @@ class GpuExecutable : public Executable {
   // GPU execution completes.
   Status ExecuteThunksOrXlaRuntime(
       const ServiceExecutableRunOptions* run_options,
-      const BufferAllocations& buffer_allocations, bool block_host_until_done);
+      const BufferAllocations& buffer_allocations, bool block_host_until_done,
+      NonAtomicallyUpgradeableRWLock& gpu_lock);
 
   using BufferAllocToDeviceMemoryMap =
       absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;
2 changes: 2 additions & 0 deletions tensorflow/compiler/xla/service/gpu/runtime/BUILD
@@ -119,6 +119,7 @@ cc_library(
         "//tensorflow/compiler/xla/runtime:module_registry",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service/gpu:buffer_allocations",
+        "//tensorflow/compiler/xla/service/gpu:non_atomically_upgradeable_rw_lock",
         "//tensorflow/compiler/xla/stream_executor/gpu:gpu_stream",
         "//tensorflow/tsl/protobuf:dnn_proto_cc",
         "@com_google_absl//absl/strings",
@@ -160,6 +161,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/gpu:matmul_utils",
         "//tensorflow/compiler/xla/stream_executor:blas",
         "//tensorflow/compiler/xla/stream_executor:device_memory",
+        "//tensorflow/compiler/xla/service/gpu:non_atomically_upgradeable_rw_lock",
     ] + if_cuda_is_configured([
         "//tensorflow/compiler/xla/service/gpu:gemm_algorithm_picker",
         "//tensorflow/compiler/xla/stream_executor/gpu:redzone_allocator",
4 changes: 3 additions & 1 deletion tensorflow/compiler/xla/service/gpu/runtime/executable.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/runtime/executable.h"
 #include "tensorflow/compiler/xla/runtime/ffi.h"
 #include "tensorflow/compiler/xla/runtime/jit_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/cholesky.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/conv.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/cublas_lt_matmul.h"
@@ -301,6 +302,7 @@ Status GpuRuntimeExecutable::Execute(
     const ServiceExecutableRunOptions* run_options, const std::string& asm_text,
     const std::vector<uint8_t>& binary,
     const BufferAllocations& buffer_allocations,
+    NonAtomicallyUpgradeableRWLock& gpu_lock,
     const BufferAllocation* temp_alloc) {
   // We pass a pointer to the executable through UserData, so that we can
   // get access to other exported functions from custom call handlers.
@@ -380,7 +382,7 @@ Status GpuRuntimeExecutable::Execute(
   runtime::CustomCall::UserData user_data(
       run_options, &executable, &debug_options_, &temp_buffer, &asm_text,
       &ffi_state.value(), &binary, &kernels, &gemm_configs, &conv_runners,
-      &collectives_, &fft_plans, &send_recv_events,
+      &collectives_, &fft_plans, &send_recv_events, &gpu_lock,
 #if GOOGLE_CUDA
       // Auxiliary data that is available only if compiled with CUDA support.
       &matmul_plans, &graph_instances,
2 changes: 2 additions & 0 deletions tensorflow/compiler/xla/service/gpu/runtime/executable.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/runtime/jit_executable.h"
 #include "tensorflow/compiler/xla/runtime/module_registry.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/collectives.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/conv.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/cublas_lt_matmul.h"
@@ -104,6 +105,7 @@ class GpuRuntimeExecutable {
                  const std::string& asm_text,
                  const std::vector<uint8_t>& binary,
                  const BufferAllocations& buffer_allocations,
+                 NonAtomicallyUpgradeableRWLock& gpu_lock,
                  const BufferAllocation* temp_alloc = nullptr);
 
   // Returns object file behind the runtime executable. This object file can
51 changes: 33 additions & 18 deletions tensorflow/compiler/xla/service/gpu/runtime/gemm.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/runtime/custom_call.h"
 #include "tensorflow/compiler/xla/runtime/executable.h"
 #include "tensorflow/compiler/xla/service/gpu/matmul_utils.h"
+#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "tensorflow/compiler/xla/service/gpu/runtime/support.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
@@ -43,14 +44,12 @@ using xla::runtime::State;
 using xla::runtime::StridedMemrefView;
 
 #if GOOGLE_CUDA
-// TODO(anlunx): Runtime autotuning should be protected by an exclusive lock to
-// achieve precision. Right now it is protected by a reader lock acquired by
-// GpuExecutable::ExecuteAsyncOnStreamImpl, so it may run cuncurrently with
-// another runtime autotuning.
+
 Status DoRuntimeAutotuning(se::Stream* stream, GemmConfig& config,
                            se::DeviceMemoryBase lhs, se::DeviceMemoryBase rhs,
                            se::DeviceMemoryBase out, const Shape& output_shape,
-                           double beta, const DebugOptions* debug_options) {
+                           double beta, const DebugOptions* debug_options,
+                           NonAtomicallyUpgradeableRWLock* gpu_lock) {
   VLOG(3) << "Running GEMM runtime autotuning";
   std::vector<se::blas::AlgorithmType> algorithms;
   stream->parent()->GetBlasGemmAlgorithms(stream, &algorithms);
@@ -67,6 +66,11 @@ Status DoRuntimeAutotuning(se::Stream* stream, GemmConfig& config,
       CreateRedzoneAllocator(stream, stream->parent()->GetAllocator(),
                              *debug_options, autotune_config);
 
+  // Upgrade the reader lock for execution to a writer lock to protect runtime
+  // autotuning.
+  NonAtomicallyUpgradeableRWLock::WriterLock writer_lock =
+      gpu_lock->UpgradeToWriterMutexLock();
+
   TF_ASSIGN_OR_RETURN(
       auto best_algorithm_idx,
       GetBestBlasAlgorithm(
@@ -96,6 +100,7 @@ Status DoRuntimeAutotuning(se::Stream* stream, GemmConfig& config,
 
 static absl::Status GemmImpl(const ServiceExecutableRunOptions* run_options,
                              const DebugOptions* debug_options,
+                             NonAtomicallyUpgradeableRWLock* gpu_lock,
                              State<GemmConfig> state, StridedMemrefView lhs,
                              StridedMemrefView rhs, StridedMemrefView out,
                              int64_t algorithm, double alpha_real,
@@ -110,27 +115,36 @@ static absl::Status GemmImpl(const ServiceExecutableRunOptions* run_options,
   Shape output_shape = ToShape(out);
 
   // Get the gemm config from the state.
-  absl::StatusOr<GemmConfig*> config = state.GetOrCreate([&] {
+  absl::StatusOr<GemmConfig*> config_from_state = state.GetOrCreate([&] {
     StatusOr<GemmConfig> gemm_config =
         GetGemmConfig(lhs, rhs, out, algorithm, alpha_real, alpha_imag, beta,
                       dot_dims.lhs_batch, dot_dims.lhs_contract,
                       dot_dims.rhs_batch, dot_dims.rhs_contract);
+    return ToAbsl(gemm_config);
+  });
+
+  if (!config_from_state.ok()) return config_from_state.status();
+  GemmConfig* gemm_config = *config_from_state;
+
+  // Set the gemm algorithm by runtime autotuning. We do runtime autotuning
+  // outside of state.GetOrCreate() because otherwise it would be a potential
+  // deadlock.
+  if (gemm_config->algorithm == stream_executor::blas::kRuntimeAutotuning) {
 #if GOOGLE_CUDA
-    if (!gemm_config.ok()) return ToAbsl(gemm_config);
-    if (gemm_config->algorithm == stream_executor::blas::kRuntimeAutotuning) {
-      auto status =
-          DoRuntimeAutotuning(stream, *gemm_config, lhs_data, rhs_data,
-                              output_data, output_shape, beta, debug_options);
-      if (!status.ok())
-        return absl::StatusOr<GemmConfig>(
-            absl::InternalError(status.ToString()));
+    auto status = DoRuntimeAutotuning(stream, *gemm_config, lhs_data, rhs_data,
+                                      output_data, output_shape, beta,
+                                      debug_options, gpu_lock);
+    if (!status.ok()) {
+      return absl::InternalError(status.ToString());
     }
+#else
+    return absl::InternalError(
+        "Failed to run runtime autotuner because CUDA is not enabled");
 #endif
-    return ToAbsl(gemm_config);
-  });
-  if (!config.ok()) return config.status();
+  }
 
-  Status executed = RunGemm(**config, lhs_data, rhs_data, output_data, stream);
+  Status executed =
+      RunGemm(*gemm_config, lhs_data, rhs_data, output_data, stream);
 
   if (!executed.ok()) return ToAbslStatus(executed);
 
@@ -142,6 +156,7 @@ XLA_RUNTIME_DEFINE_CUSTOM_CALL(
     CustomCall::Bind("xla.gpu.gemm")
         .UserData<const ServiceExecutableRunOptions*>()
        .UserData<const DebugOptions*>()
+        .UserData<NonAtomicallyUpgradeableRWLock*>()
         .State<GemmConfig>("uid")
         .Arg<StridedMemrefView>()  // lhs
         .Arg<StridedMemrefView>()  // rhs
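Taken together, the changes thread one lock object from ExecuteAsyncOnStreamImpl through the runtime's custom-call user data down to the GEMM handler, so runtime autotuning runs with exclusive access to the GPU while ordinary execution stays shared, which is what the removed TODO in gemm.cc asked for. A compressed sketch of the resulting control flow, reusing the illustrative wrapper sketched above; names not present in the diff are hypothetical:

absl::Mutex gpu_mutex;  // Stand-in for the per-device mutex behind GetGpuMutex().

void RunOneXlaExecution(bool needs_runtime_autotuning) {
  // ExecuteAsyncOnStreamImpl: take the GPU in shared mode so several XLA
  // computations can use the same device concurrently.
  NonAtomicallyUpgradeableRWLock gpu_lock(&gpu_mutex);

  // ... thunks / XLA runtime run under the shared hold ...

  if (needs_runtime_autotuning) {
    // GemmImpl -> DoRuntimeAutotuning: trade the shared hold for an exclusive
    // one so no concurrent computation perturbs the timing of each algorithm.
    NonAtomicallyUpgradeableRWLock::WriterLock writer_lock =
        gpu_lock.UpgradeToWriterMutexLock();
    // ... time candidate BLAS algorithms and pick the best one ...
  }  // Exclusive hold released, shared hold restored.

  // ... the chosen GEMM and the rest of the program run shared again ...
}  // Shared hold released when gpu_lock goes out of scope.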
