Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ROCm] add ROCm support for XLA RCCL thunk #36106

Merged
merged 2 commits into from Mar 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
28 changes: 25 additions & 3 deletions tensorflow/compiler/xla/service/gpu/BUILD
Expand Up @@ -11,11 +11,13 @@ load(
)
load(
"//tensorflow:tensorflow.bzl",
"if_cuda_or_rocm",
"tf_cc_test",
"tf_copts",
"tf_cuda_library",
)
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
load(
"//tensorflow/core/platform/default:cuda_build_defs.bzl",
"if_cuda_is_configured",
Expand Down Expand Up @@ -399,9 +401,24 @@ filegroup(
),
)

# Use aliases for the NCCL/RCCL dependencies, since nested select() statements
# are not possible.
#
# Library declared with only a name (no srcs or deps). The virtual_nccl and
# virtual_rccl aliases below pass ":empty" as the second argument of
# if_cuda / if_rocm (presumably the value used when that platform is not
# configured -- confirm against the build_defs.bzl definitions).
cc_library(
name = "empty",
)

# Single label standing in for the NCCL dependency. Its target is whatever
# if_cuda(...) yields from "@local_config_nccl//:nccl" and ":empty"
# (presumably the NCCL library on CUDA builds and ":empty" otherwise --
# confirm against if_cuda in @local_config_cuda//cuda:build_defs.bzl).
alias(
name = "virtual_nccl",
actual = if_cuda("@local_config_nccl//:nccl", ":empty"),
)

# Single label standing in for the RCCL dependency. Its target is whatever
# if_rocm(...) yields from "@local_config_rocm//rocm:rccl" and ":empty"
# (presumably the RCCL library on ROCm builds and ":empty" otherwise --
# confirm against if_rocm in @local_config_rocm//rocm:build_defs.bzl).
alias(
name = "virtual_rccl",
actual = if_rocm("@local_config_rocm//rocm:rccl", ":empty"),
)

tf_cuda_library(
name = "nccl_all_reduce_thunk",
srcs = if_cuda(
srcs = if_cuda_or_rocm(
[":nccl_all_reduce_thunk_src"],
["dummy_all_reduce_thunk.cc"],
),
Expand All @@ -420,10 +437,15 @@ tf_cuda_library(
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
] + if_cuda([
"//tensorflow/stream_executor/cuda:cuda_activation",
"//tensorflow/stream_executor/cuda:cuda_gpu_executor",
] + if_nccl([
"@local_config_nccl//:nccl",
]) + if_rocm([
"//tensorflow/stream_executor/rocm:rocm_activation",
"//tensorflow/stream_executor/rocm:rocm_gpu_executor",
]) + if_nccl([
":virtual_nccl",
":virtual_rccl",
]),
)

Expand Down
18 changes: 16 additions & 2 deletions tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc
Expand Up @@ -28,7 +28,11 @@ limitations under the License.
#include "absl/strings/str_join.h"
#include "absl/types/optional.h"
#include "absl/types/span.h"
#if GOOGLE_CUDA
#include "third_party/nccl/nccl.h"
#elif TENSORFLOW_USE_ROCM
#include "rocm/include/rccl/rccl.h"
#endif
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/refcounting_hash_map.h"
#include "tensorflow/compiler/xla/service/collective_ops_utils.h"
Expand All @@ -37,7 +41,17 @@ limitations under the License.
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/blocking_counter.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/gpu/gpu_activation.h"

#if TENSORFLOW_USE_ROCM
// Local hipify of CUDA symbols: when TENSORFLOW_USE_ROCM is set, each cuda*
// name below is redefined to the hip* name with the same suffix, so code in
// this file spelled with the CUDA names (e.g. cudaStream_t) expands to the HIP
// equivalent. The macros apply from this point to the end of the translation
// unit.
// NOTE(review): presumably the hip* declarations are brought in by the rccl.h
// include above -- confirm.
#define cudaError_t hipError_t
#define cudaStream_t hipStream_t
#define cudaGetErrorString hipGetErrorString
#define cudaGetDevice hipGetDevice
#define cudaSetDevice hipSetDevice
#define cudaSuccess hipSuccess
#endif

namespace xla {
namespace gpu {
Expand Down Expand Up @@ -406,7 +420,7 @@ RendezvousNcclAllReduce::SubmitParticipantImpl(
CHECK(allreduce_datatype.has_value());

se::StreamExecutor* executor = participant.stream->parent();
se::cuda::ScopedActivateExecutorContext scoped_context(executor);
se::gpu::ScopedActivateExecutorContext scoped_context(executor);
cudaStream_t* cu_stream = reinterpret_cast<cudaStream_t*>(
participant.stream->implementation()->GpuStreamMemberHack());
VLOG(3) << "Using stream pointer: " << cu_stream
Expand Down