[XLA:GPU][ROCm] Introduce amdgpu_compiler to XLA. #31489

Merged
14 changes: 4 additions & 10 deletions tensorflow/compiler/xla/service/gpu/BUILD
@@ -1133,8 +1133,7 @@ cc_library(
 cc_library(
     name = "amdgpu_compiler",
     srcs = [
-        # TODO(whchung@gmail.com): Enable in the subsequent PR.
-        # "amdgpu_compiler_registration.cc",
+        "amdgpu_compiler_registration.cc",
     ],
     deps = [
         ":amdgpu_compiler_impl",
@@ -1145,18 +1144,13 @@ cc_library(
 cc_library(
     name = "amdgpu_compiler_impl",
     srcs = [
-        # TODO(whchung@gmail.com) : enable in the subsequent PR.
-        #"amdgpu_compiler.cc",
+        "amdgpu_compiler.cc",
    ],
     hdrs = [
-        # TODO(whchung@gmail.com): enable in the subsequent PR.
-        #"amdgpu_compiler.h"
+        "amdgpu_compiler.h",
     ],
     deps = [
-        # TODO(whchung@gmail.com): Enable these after pending PRs get merged.
-        #":gpu_compiler_impl",
-        #":miopen_conv_algorithm_picker",
-        #"//tensorflow/core:rocm_rocdl_path",
+        "//tensorflow/core/platform:rocm_rocdl_path",
     ],
 )

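Note: depending on ":amdgpu_compiler" (rather than ":amdgpu_compiler_impl") is what compiles amdgpu_compiler_registration.cc, and thus its registration static initializer, into the consuming binary; the impl target carries only the compiler itself.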
155 changes: 155 additions & 0 deletions tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc
@@ -0,0 +1,155 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h"

#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.h"
#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h"
// TODO(whchung@gmail.com): Add gpu_conv_algorithm_picker after its PR merged.
#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
#include "tensorflow/compiler/xla/service/hlo_cse.h"
#include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
#include "tensorflow/compiler/xla/service/hlo_verifier.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/service/tuple_simplifier.h"
#include "tensorflow/core/platform/rocm_rocdl_path.h"

namespace xla {
namespace gpu {

namespace {

// Returns the directory containing ROCm-Device-Libs files. This function is
// called in AMDGPUCompiler's constructor, so it can't return an error.
// Instead, AMDGPUCompiler::Compile will return an error when the requested
// rocdl file doesn't exist in the directory this function returns.
string GetROCDLDir(const HloModuleConfig& config) {
  std::vector<string> potential_rocdl_dirs;
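  // NB: there is no ROCm-specific flag yet, so the CUDA data-dir debug option
  // is reused below to let users point at a custom ROCm-Device-Libs directory.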
  const string datadir = config.debug_options().xla_gpu_cuda_data_dir();
  if (!datadir.empty()) {
    potential_rocdl_dirs.push_back(datadir);
  }
  potential_rocdl_dirs.push_back(tensorflow::RocdlRoot());

  // Tries all potential ROCDL directories in the order they are inserted.
  // Returns the first directory that exists in the file system.
  for (const string& potential_rocdl_dir : potential_rocdl_dirs) {
    if (tensorflow::Env::Default()->IsDirectory(potential_rocdl_dir).ok()) {
      VLOG(2) << "Found ROCm-Device-Libs dir " << potential_rocdl_dir;
      return potential_rocdl_dir;
    }
    VLOG(2) << "Unable to find potential ROCm-Device-Libs dir "
            << potential_rocdl_dir;
  }

  // Last resort: maybe in the current folder.
  return ".";
}

} // namespace

Status AMDGPUCompiler::OptimizeHloConvolutionCanonicalization(
    HloModule* hlo_module, se::StreamExecutor* stream_exec,
    se::DeviceMemoryAllocator* device_allocator) {
  // Convert convolutions into CustomCalls to MIOpen, then canonicalize them
  // (PadInsertion).
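  // NB: the cuDNN-named rewriter/legalization passes are shared GPU passes at
  // this point; on the ROCm path, the conv custom calls they emit are lowered
  // to MIOpen rather than cuDNN.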
  HloPassPipeline pipeline("conv_canonicalization");
  pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
                                            /*allow_mixed_precision=*/false);
  pipeline.AddPass<CudnnConvRewriter>();
  pipeline.AddPass<CudnnConvPaddingLegalization>();

  pipeline.AddPass<HloConstantFolding>();
  TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());

  return Status::OK();
}

Status AMDGPUCompiler::OptimizeHloPostLayoutAssignment(
    HloModule* hlo_module, se::StreamExecutor* stream_exec,
    se::DeviceMemoryAllocator* device_allocator) {
  HloPassPipeline pipeline("post-layout_assignment");
  pipeline.AddInvariantChecker<HloVerifier>(
      /*layout_sensitive=*/true,
      /*allow_mixed_precision=*/false,
      LayoutAssignment::InstructionCanChangeLayout);

  // The LayoutAssignment pass may leave behind kCopy instructions which are
  // duplicate or NOPs, so remove them with algebraic simplification and CSE.
  AlgebraicSimplifierOptions options;
  options.set_is_layout_sensitive(true);
  pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);

  // TODO(whchung@gmail.com): Add gpu_conv_algorithm_picker after its PR merged.

  // Clean up redundant tuples left behind by the preceding passes.
  pipeline.AddPass<TupleSimplifier>();

  pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
  TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());

  return Status::OK();
}

AMDGPUCompiler::AMDGPUCompiler()
    : GpuCompiler(stream_executor::rocm::kROCmPlatformId, amdgpu::kTargetTriple,
                  amdgpu::kDataLayout) {}

GpuVersion AMDGPUCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) {
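  // For AMDGPU, GpuVersion carries a single int, the ISA version number
  // (e.g. gfx803 -> 803), rather than CUDA's (major, minor) compute-capability
  // pair.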
  int isa_version = 0;
  if (!stream_exec->GetDeviceDescription().rocm_amdgpu_isa_version(
          &isa_version)) {
    LOG(WARNING)
        << "Couldn't get AMDGPU ISA version for device; assuming gfx803.";
    isa_version = 803;
  }

  return isa_version;
}

StatusOr<std::pair<std::string, std::vector<uint8>>>
AMDGPUCompiler::CompileTargetBinary(const HloModule* module,
                                    llvm::Module* llvm_module,
                                    GpuVersion gpu_version,
                                    se::StreamExecutor* stream_exec) {
  if (rocdl_dir_.empty()) {
    // Compute rocdl_dir_ just once and cache it in this member.
    rocdl_dir_ = GetROCDLDir(module->config());
  }

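  // "hsaco" is the HSA code object, the AMDGPU analogue of a cubin on the
  // CUDA path; the string half of the returned pair (PTX for NVPTX) is left
  // empty for ROCm.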
  std::vector<uint8> hsaco;
  {
    XLA_SCOPED_LOGGING_TIMER(
        "AMDGPUCompiler::CompileTargetBinary - CompileToHsaco");
    TF_ASSIGN_OR_RETURN(hsaco,
                        amdgpu::CompileToHsaco(llvm_module, gpu_version,
                                               module->config(), rocdl_dir_));
  }

  llvm_ir::DumpIrIfEnabled(*module, *llvm_module, /*optimized=*/false);

  if (user_post_optimization_hook_) {
    user_post_optimization_hook_(*llvm_module);
  }

  return std::pair<std::string, std::vector<uint8>>("", std::move(hsaco));
}

} // namespace gpu
} // namespace xla
70 changes: 70 additions & 0 deletions tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h
@@ -0,0 +1,70 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_AMDGPU_COMPILER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_AMDGPU_COMPILER_H_

#include <memory>
#include <string>
#include <vector>

#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/llvm_compiler.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/gtl/optional.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/thread_annotations.h"

namespace xla {
namespace gpu {

// AMDGPUCompiler generates efficient GPU executables for the AMDGPU target.
class AMDGPUCompiler : public GpuCompiler {
 public:
  AMDGPUCompiler();
  ~AMDGPUCompiler() override {}

  Status OptimizeHloConvolutionCanonicalization(
      HloModule* hlo_module, se::StreamExecutor* stream_exec,
      se::DeviceMemoryAllocator* device_allocator) override;

  Status OptimizeHloPostLayoutAssignment(
      HloModule* hlo_module, se::StreamExecutor* stream_exec,
      se::DeviceMemoryAllocator* device_allocator) override;

  GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override;

  StatusOr<std::pair<std::string, std::vector<uint8>>> CompileTargetBinary(
      const HloModule* hlo_module, llvm::Module* llvm_module,
      GpuVersion gpu_version, se::StreamExecutor* stream_exec) override;

 private:
  // The parent directory of ROCm-Device-Libs IR libraries.
  string rocdl_dir_;

  TF_DISALLOW_COPY_AND_ASSIGN(AMDGPUCompiler);
};

} // namespace gpu
} // namespace xla

#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_AMDGPU_COMPILER_H_
26 changes: 26 additions & 0 deletions tensorflow/compiler/xla/service/gpu/amdgpu_compiler_registration.cc
@@ -0,0 +1,26 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h"

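// Registers AMDGPUCompiler with XLA's compiler registry via a static
// initializer; linking this translation unit into a binary is what makes the
// ROCm compiler discoverable at runtime.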
static bool InitModule() {
  xla::Compiler::RegisterCompilerFactory(
      stream_executor::rocm::kROCmPlatformId,
      []() { return absl::make_unique<xla::gpu::AMDGPUCompiler>(); });
  return true;
}
static bool module_initialized = InitModule();
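
For context, a minimal sketch (not part of this PR) of how a binary that links this registration unit would resolve the compiler; the "ROCM" platform name and the ValueOrDie call are assumptions based on the stream_executor API of this era:

    // Sketch only: look up the ROCm platform, then the compiler that
    // InitModule() registered for it.
    stream_executor::Platform* platform =
        stream_executor::MultiPlatformManager::PlatformWithName("ROCM")
            .ValueOrDie();
    xla::StatusOr<xla::Compiler*> compiler =
        xla::Compiler::GetForPlatform(platform);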