r2.12 cherry-pick: Merge pull request https://github.com/tensorflow/tensorflow/pull/59581 from Intel-tensorflow:security_fix_quantiz #59842

Merged · 2 commits · Mar 1, 2023
1 change: 1 addition & 0 deletions tensorflow/compiler/xla/service/gpu/BUILD
@@ -602,6 +602,7 @@ cc_library(
"while_thunk.h",
],
deps = [
":non_atomically_upgradeable_rw_lock",
":backend_configs_cc",
":buffer_allocations",
":cusolver_context",
17 changes: 10 additions & 7 deletions tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -46,6 +46,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
#include "tensorflow/compiler/xla/service/gpu/runtime/collectives.h"
#include "tensorflow/compiler/xla/service/gpu/runtime/cublas_lt_matmul.h"
#include "tensorflow/compiler/xla/service/gpu/runtime/executable.h"
@@ -457,7 +458,8 @@ static Status ExecuteXlaRuntime(const std::string& module_name,
const std::vector<uint8_t>& binary,
const BufferAllocations& buffer_allocations,
const BufferAllocation* temp_buffer,
bool block_host_until_done) {
bool block_host_until_done,
NonAtomicallyUpgradeableRWLock& gpu_lock) {
uint64_t start_nanos = tsl::Env::Default()->NowNanos();

tsl::profiler::TraceMe hlo_module_activity(
@@ -474,7 +476,7 @@ static Status ExecuteXlaRuntime(const std::string& module_name,
});

auto executed = gpu_runtime_executable.Execute(
run_options, asm_text, binary, buffer_allocations, temp_buffer);
run_options, asm_text, binary, buffer_allocations, gpu_lock, temp_buffer);
if (!executed.ok()) return executed;

return MaybeSyncAndProfile(
@@ -497,7 +499,7 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
// Lock the GPU with a shared lock so that we don't interfere with autotuning
// that may be running during JIT compilation while allowing multiple XLA
// computations to use the same GPU simultaneously.
absl::ReaderMutexLock gpu_lock(&GetGpuMutex(executor));
NonAtomicallyUpgradeableRWLock gpu_lock(&GetGpuMutex(executor));

const GpuExecutable::BufferAllocToDeviceMemoryMap* globals;
{
@@ -628,8 +630,8 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
buffers_in_result.insert(result_buffer);
}

TF_RETURN_IF_ERROR(ExecuteThunksOrXlaRuntime(run_options, buffer_allocations,
block_host_until_done));
TF_RETURN_IF_ERROR(ExecuteThunksOrXlaRuntime(
run_options, buffer_allocations, block_host_until_done, gpu_lock));

// Free all temporary allocations.
TF_RETURN_IF_ERROR(
@@ -644,7 +646,8 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(

Status GpuExecutable::ExecuteThunksOrXlaRuntime(
const ServiceExecutableRunOptions* run_options,
const BufferAllocations& buffer_allocations, bool block_host_until_done) {
const BufferAllocations& buffer_allocations, bool block_host_until_done,
NonAtomicallyUpgradeableRWLock& gpu_lock) {
TF_RETURN_IF_ERROR(
CheckCompatibilityWithServiceExecutableRunOptions(run_options));

@@ -676,7 +679,7 @@ Status GpuExecutable::ExecuteThunksOrXlaRuntime(
}
return ExecuteXlaRuntime(module_name_, unique_id, *gpu_runtime_executable_,
run_options, text_, binary_, buffer_allocations,
temp_buffer, block_host_until_done);
temp_buffer, block_host_until_done, gpu_lock);
}

return FailedPrecondition("Expected XLA gpu executable is not supplied.");
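For readers following along, the key change in this file is the switch from `absl::ReaderMutexLock` to `NonAtomicallyUpgradeableRWLock`, so that the shared GPU lock taken at the start of `ExecuteAsyncOnStreamImpl` can later be upgraded to exclusive ownership while runtime autotuning runs (see the gemm.cc hunk below). The header added by this PR is not shown in the diff, so the following is only a rough sketch of how such a lock could be built on `absl::Mutex`; the names mirror the diff, but the body is an assumption, not the actual XLA implementation.

```cpp
#include "absl/synchronization/mutex.h"

// Minimal sketch, not the actual XLA header: holds a shared (reader) lock for
// its lifetime and can temporarily trade it for an exclusive (writer) lock.
// The upgrade is "non-atomic" because the reader lock is released before the
// writer lock is acquired, so another thread may slip in between.
class NonAtomicallyUpgradeableRWLock {
 public:
  explicit NonAtomicallyUpgradeableRWLock(absl::Mutex* mu) : mu_(mu) {
    mu_->ReaderLock();
  }
  ~NonAtomicallyUpgradeableRWLock() { mu_->ReaderUnlock(); }

  // RAII helper: exclusive ownership while alive; shared ownership is
  // restored when it goes out of scope.
  class WriterLock {
   public:
    explicit WriterLock(absl::Mutex* mu) : mu_(mu) {
      mu_->ReaderUnlock();  // Non-atomic step: give up shared ownership first.
      mu_->WriterLock();
    }
    ~WriterLock() {
      mu_->WriterUnlock();
      mu_->ReaderLock();  // Downgrade back to shared ownership.
    }
    WriterLock(const WriterLock&) = delete;
    WriterLock& operator=(const WriterLock&) = delete;

   private:
    absl::Mutex* mu_;
  };

  WriterLock UpgradeToWriterMutexLock() { return WriterLock(mu_); }

 private:
  absl::Mutex* mu_;
};
```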
4 changes: 3 additions & 1 deletion tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -36,6 +36,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
#include "tensorflow/compiler/xla/service/gpu/runtime/executable.h"
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
@@ -200,7 +201,8 @@ class GpuExecutable : public Executable {
// GPU execution completes.
Status ExecuteThunksOrXlaRuntime(
const ServiceExecutableRunOptions* run_options,
const BufferAllocations& buffer_allocations, bool block_host_until_done);
const BufferAllocations& buffer_allocations, bool block_host_until_done,
NonAtomicallyUpgradeableRWLock& gpu_lock);

using BufferAllocToDeviceMemoryMap =
absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;
2 changes: 2 additions & 0 deletions tensorflow/compiler/xla/service/gpu/runtime/BUILD
@@ -119,6 +119,7 @@ cc_library(
"//tensorflow/compiler/xla/runtime:module_registry",
"//tensorflow/compiler/xla/service:executable",
"//tensorflow/compiler/xla/service/gpu:buffer_allocations",
"//tensorflow/compiler/xla/service/gpu:non_atomically_upgradeable_rw_lock",
"//tensorflow/compiler/xla/stream_executor/gpu:gpu_stream",
"//tensorflow/tsl/protobuf:dnn_proto_cc",
"@com_google_absl//absl/strings",
@@ -160,6 +161,7 @@ cc_library(
"//tensorflow/compiler/xla/service/gpu:matmul_utils",
"//tensorflow/compiler/xla/stream_executor:blas",
"//tensorflow/compiler/xla/stream_executor:device_memory",
"//tensorflow/compiler/xla/service/gpu:non_atomically_upgradeable_rw_lock",
] + if_cuda_is_configured([
"//tensorflow/compiler/xla/service/gpu:gemm_algorithm_picker",
"//tensorflow/compiler/xla/stream_executor/gpu:redzone_allocator",
4 changes: 3 additions & 1 deletion tensorflow/compiler/xla/service/gpu/runtime/executable.cc
@@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/runtime/executable.h"
#include "tensorflow/compiler/xla/runtime/ffi.h"
#include "tensorflow/compiler/xla/runtime/jit_executable.h"
#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
#include "tensorflow/compiler/xla/service/gpu/runtime/cholesky.h"
#include "tensorflow/compiler/xla/service/gpu/runtime/conv.h"
#include "tensorflow/compiler/xla/service/gpu/runtime/cublas_lt_matmul.h"
@@ -301,6 +302,7 @@ Status GpuRuntimeExecutable::Execute(
const ServiceExecutableRunOptions* run_options, const std::string& asm_text,
const std::vector<uint8_t>& binary,
const BufferAllocations& buffer_allocations,
NonAtomicallyUpgradeableRWLock& gpu_lock,
const BufferAllocation* temp_alloc) {
// We pass a pointer to the executable through UserData, so that we can
// get access to other exported functions from custom call handlers.
@@ -380,7 +382,7 @@
runtime::CustomCall::UserData user_data(
run_options, &executable, &debug_options_, &temp_buffer, &asm_text,
&ffi_state.value(), &binary, &kernels, &gemm_configs, &conv_runners,
&collectives_, &fft_plans, &send_recv_events,
&collectives_, &fft_plans, &send_recv_events, &gpu_lock,
#if GOOGLE_CUDA
// Auxiliary data that is available only if compiled with CUDA support.
&matmul_plans, &graph_instances,
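The executable.cc change above threads the lock through `runtime::CustomCall::UserData`, a type-keyed set of pointers that custom-call handlers can request by declaring a `.UserData<T>()` slot in their binding (as the gemm.cc hunk further down does for `NonAtomicallyUpgradeableRWLock*`). The snippet below is only a simplified, self-contained analogue of such a type-keyed registry, not the actual `xla::runtime` API:

```cpp
#include <typeindex>
#include <unordered_map>

// Simplified analogue of a type-keyed UserData container: the caller inserts
// one pointer per type, and handlers later look their dependencies up by type.
class TypeKeyedUserData {
 public:
  template <typename T>
  void Insert(T* value) {
    data_[std::type_index(typeid(T))] = value;
  }

  template <typename T>
  T* Get() const {
    auto it = data_.find(std::type_index(typeid(T)));
    return it == data_.end() ? nullptr : static_cast<T*>(it->second);
  }

 private:
  std::unordered_map<std::type_index, void*> data_;
};
```

In this model, `GpuRuntimeExecutable::Execute` would insert `&gpu_lock` alongside the other pointers it already registers, and the GEMM handler would retrieve it when invoked.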
2 changes: 2 additions & 0 deletions tensorflow/compiler/xla/service/gpu/runtime/executable.h
@@ -28,6 +28,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/runtime/jit_executable.h"
#include "tensorflow/compiler/xla/runtime/module_registry.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
#include "tensorflow/compiler/xla/service/gpu/runtime/collectives.h"
#include "tensorflow/compiler/xla/service/gpu/runtime/conv.h"
#include "tensorflow/compiler/xla/service/gpu/runtime/cublas_lt_matmul.h"
@@ -104,6 +105,7 @@ class GpuRuntimeExecutable {
const std::string& asm_text,
const std::vector<uint8_t>& binary,
const BufferAllocations& buffer_allocations,
NonAtomicallyUpgradeableRWLock& gpu_lock,
const BufferAllocation* temp_alloc = nullptr);

// Returns object file behind the runtime executable. This object file can
51 changes: 33 additions & 18 deletions tensorflow/compiler/xla/service/gpu/runtime/gemm.cc
@@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/runtime/custom_call.h"
#include "tensorflow/compiler/xla/runtime/executable.h"
#include "tensorflow/compiler/xla/service/gpu/matmul_utils.h"
#include "tensorflow/compiler/xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
#include "tensorflow/compiler/xla/service/gpu/runtime/support.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/compiler/xla/service/service_executable_run_options.h"
@@ -43,14 +44,12 @@ using xla::runtime::State;
using xla::runtime::StridedMemrefView;

#if GOOGLE_CUDA
// TODO(anlunx): Runtime autotuning should be protected by an exclusive lock to
// achieve precision. Right now it is protected by a reader lock acquired by
// GpuExecutable::ExecuteAsyncOnStreamImpl, so it may run cuncurrently with
// another runtime autotuning.

Status DoRuntimeAutotuning(se::Stream* stream, GemmConfig& config,
se::DeviceMemoryBase lhs, se::DeviceMemoryBase rhs,
se::DeviceMemoryBase out, const Shape& output_shape,
double beta, const DebugOptions* debug_options) {
double beta, const DebugOptions* debug_options,
NonAtomicallyUpgradeableRWLock* gpu_lock) {
VLOG(3) << "Running GEMM runtime autotuning";
std::vector<se::blas::AlgorithmType> algorithms;
stream->parent()->GetBlasGemmAlgorithms(stream, &algorithms);
@@ -67,6 +66,11 @@ Status DoRuntimeAutotuning(se::Stream* stream, GemmConfig& config,
CreateRedzoneAllocator(stream, stream->parent()->GetAllocator(),
*debug_options, autotune_config);

// Upgrade the reader lock for execution to a writer lock to protect runtime
// autotuning.
NonAtomicallyUpgradeableRWLock::WriterLock writer_lock =
gpu_lock->UpgradeToWriterMutexLock();

TF_ASSIGN_OR_RETURN(
auto best_algorithm_idx,
GetBestBlasAlgorithm(
@@ -96,6 +100,7 @@ Status DoRuntimeAutotuning(se::Stream* stream, GemmConfig& config,

static absl::Status GemmImpl(const ServiceExecutableRunOptions* run_options,
const DebugOptions* debug_options,
NonAtomicallyUpgradeableRWLock* gpu_lock,
State<GemmConfig> state, StridedMemrefView lhs,
StridedMemrefView rhs, StridedMemrefView out,
int64_t algorithm, double alpha_real,
@@ -110,27 +115,36 @@ static absl::Status GemmImpl(const ServiceExecutableRunOptions* run_options,
Shape output_shape = ToShape(out);

// Get the gemm config from the state.
absl::StatusOr<GemmConfig*> config = state.GetOrCreate([&] {
absl::StatusOr<GemmConfig*> config_from_state = state.GetOrCreate([&] {
StatusOr<GemmConfig> gemm_config =
GetGemmConfig(lhs, rhs, out, algorithm, alpha_real, alpha_imag, beta,
dot_dims.lhs_batch, dot_dims.lhs_contract,
dot_dims.rhs_batch, dot_dims.rhs_contract);
return ToAbsl(gemm_config);
});

if (!config_from_state.ok()) return config_from_state.status();
GemmConfig* gemm_config = *config_from_state;

// Set the gemm algorithm by runtime autotuning. We do runtime autotuning
// outside of state.GetOrCreate() because otherwise it would be a potential
// deadlock.
if (gemm_config->algorithm == stream_executor::blas::kRuntimeAutotuning) {
#if GOOGLE_CUDA
if (!gemm_config.ok()) return ToAbsl(gemm_config);
if (gemm_config->algorithm == stream_executor::blas::kRuntimeAutotuning) {
auto status =
DoRuntimeAutotuning(stream, *gemm_config, lhs_data, rhs_data,
output_data, output_shape, beta, debug_options);
if (!status.ok())
return absl::StatusOr<GemmConfig>(
absl::InternalError(status.ToString()));
auto status = DoRuntimeAutotuning(stream, *gemm_config, lhs_data, rhs_data,
output_data, output_shape, beta,
debug_options, gpu_lock);
if (!status.ok()) {
return absl::InternalError(status.ToString());
}
#else
return absl::InternalError(
"Failed to run runtime autotuner because CUDA is not enabled");
#endif
return ToAbsl(gemm_config);
});
if (!config.ok()) return config.status();
}

Status executed = RunGemm(**config, lhs_data, rhs_data, output_data, stream);
Status executed =
RunGemm(*gemm_config, lhs_data, rhs_data, output_data, stream);

if (!executed.ok()) return ToAbslStatus(executed);

@@ -142,6 +156,7 @@ XLA_RUNTIME_DEFINE_CUSTOM_CALL(
CustomCall::Bind("xla.gpu.gemm")
.UserData<const ServiceExecutableRunOptions*>()
.UserData<const DebugOptions*>()
.UserData<NonAtomicallyUpgradeableRWLock*>()
.State<GemmConfig>("uid")
.Arg<StridedMemrefView>() // lhs
.Arg<StridedMemrefView>() // rhs
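Taken together, the gemm.cc changes move runtime autotuning out of the `state.GetOrCreate()` callback (avoiding the potential deadlock noted in the added comment) and wrap it in a temporary upgrade of the shared GPU lock, so concurrent XLA computations cannot perturb the timing measurements while ordinary GEMM execution still runs under the shared lock. Below is a condensed sketch of that control flow; it assumes the `NonAtomicallyUpgradeableRWLock` sketch shown earlier and uses placeholder autotune/launch steps rather than the real XLA helpers:

```cpp
#include "absl/status/status.h"

// Condensed sketch of the GemmImpl flow after this change. `gpu_lock` is held
// as a reader for the whole executable run; only autotuning upgrades it.
absl::Status RunGemmWithOptionalAutotuning(
    bool needs_runtime_autotuning, NonAtomicallyUpgradeableRWLock* gpu_lock) {
  if (needs_runtime_autotuning) {
    // Temporarily take exclusive ownership of the GPU so other computations
    // cannot run while candidate BLAS algorithms are being timed.
    NonAtomicallyUpgradeableRWLock::WriterLock writer_lock =
        gpu_lock->UpgradeToWriterMutexLock();
    // ... benchmark candidate algorithms and record the best one ...
  }  // Exclusive ownership ends here; back to the shared (reader) lock.

  // The GEMM itself runs under the shared lock, so multiple XLA computations
  // can still share the GPU.
  // ... launch the GEMM with the selected algorithm ...
  return absl::OkStatus();
}
```

Scoping the upgrade to an RAII object keeps the exclusive section as small as possible: the writer lock exists only for the duration of the autotuning block and shared access is restored automatically afterwards.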