Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ROCm] Enable ROCm support for "softplus_op" #26457

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
34 changes: 34 additions & 0 deletions tensorflow/core/grappler/clusters/utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ limitations under the License.
#include "cuda/include/cudnn.h"
#endif

#if TENSORFLOW_USE_ROCM
#include "rocm/include/hip/hip_runtime.h"
#endif

#ifdef EIGEN_USE_LIBXSMM
#include "include/libxsmm.h"
#endif
Expand Down Expand Up @@ -109,6 +113,36 @@ DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
strings::StrCat(properties.major, ".", properties.minor);
(*device.mutable_environment())["cuda"] = strings::StrCat(CUDA_VERSION);
(*device.mutable_environment())["cudnn"] = strings::StrCat(CUDNN_VERSION);

#elif TENSORFLOW_USE_ROCM
hipDeviceProp_t properties;
hipError_t error =
hipGetDeviceProperties(&properties, platform_gpu_id.value());
if (error != hipSuccess) {
device.set_type("UNKNOWN");
LOG(ERROR) << "Failed to get device properties, error code: " << error;
return device;
}

// ROCM TODO review if numbers here are valid
device.set_vendor("Advanced Micro Devices, Inc");
device.set_model(properties.name);
device.set_frequency(properties.clockRate * 1e-3);
device.set_num_cores(properties.multiProcessorCount);
device.set_num_registers(properties.regsPerBlock);
device.set_l1_cache_size(16 * 1024);
device.set_l2_cache_size(properties.l2CacheSize);
device.set_l3_cache_size(0);
device.set_shared_memory_size_per_multiprocessor(
properties.maxSharedMemoryPerMultiProcessor);
device.set_memory_size(properties.totalGlobalMem);
// 8 is the number of bits per byte. 2 is accounted for
// double data rate (DDR).
device.set_bandwidth(properties.memoryBusWidth / 8 *
properties.memoryClockRate * 2);

(*device.mutable_environment())["architecture"] =
strings::StrCat("gfx", properties.gcnArch);
#endif

return device;
Expand Down
4 changes: 2 additions & 2 deletions tensorflow/core/kernels/softplus_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ void SoftplusGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
TF_CALL_FLOAT_TYPES(REGISTER_KERNELS);
#undef REGISTER_KERNELS

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T) \
Expand Down Expand Up @@ -119,6 +119,6 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
#undef REGISTER_GPU_KERNELS

#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

} // namespace tensorflow
4 changes: 2 additions & 2 deletions tensorflow/core/kernels/softplus_op_gpu.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

Expand All @@ -37,4 +37,4 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);

} // end namespace tensorflow

#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
2 changes: 2 additions & 0 deletions tensorflow/stream_executor/gpu/gpu_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ limitations under the License.

#if TENSORFLOW_USE_ROCM

#define __HIP_DISABLE_CPP_FUNCTIONS__

#include "rocm/include/hip/hip_complex.h"
#include "rocm/include/hip/hip_runtime.h"
#include "rocm/include/hiprand/hiprand.h"
Expand Down
10 changes: 5 additions & 5 deletions tensorflow/stream_executor/platform/default/dso_loader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -158,27 +158,27 @@ port::StatusOr<void*> GetCudnnDsoHandle() {

port::StatusOr<void*> GetRocblasDsoHandle() {
static auto result = new auto(DsoLoader::GetRocblasDsoHandle());
return result;
return *result;
}

port::StatusOr<void*> GetMiopenDsoHandle() {
static auto result = new auto(DsoLoader::GetMiopenDsoHandle());
return result;
return *result;
}

port::StatusOr<void*> GetRocfftDsoHandle() {
static auto result = new auto(DsoLoader::GetRocfftDsoHandle());
return result;
return *result;
}

port::StatusOr<void*> GetRocrandDsoHandle() {
static auto result = new auto(DsoLoader::GetRocrandDsoHandle());
return result;
return *result;
}

port::StatusOr<void*> GetHipDsoHandle() {
static auto result = new auto(DsoLoader::GetHipDsoHandle());
return result;
return *result;
}

} // namespace CachedDsoLoader
Expand Down
1 change: 0 additions & 1 deletion tensorflow/stream_executor/rocm/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,6 @@ cc_library(
"//tensorflow/core:lib_internal",
"//tensorflow/stream_executor:dnn",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:logging_proto_cc",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this intentional?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

very much so.

Leaving it in is what breaks the --config=rocm build.

The CUDA equivalent was removed in the following commit :
5aefc4e

"//tensorflow/stream_executor:plugin_registry",
"//tensorflow/stream_executor:scratch_allocator",
"//tensorflow/stream_executor:stream_executor_pimpl_header",
Expand Down
29 changes: 6 additions & 23 deletions tensorflow/stream_executor/rocm/rocm_driver.cc
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
hipDeviceptr_t location,
uint8 value, size_t size) {
ScopedActivateContext activation{context};
hipError_t res = tensorflow::wrap::hipMemset(location, value, size);
hipError_t res = tensorflow::wrap::hipMemsetD8(location, value, size);
if (res != hipSuccess) {
LOG(ERROR) << "failed to memset memory: " << ToString(res);
return false;
Expand All @@ -514,15 +514,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
size_t uint32_count) {
ScopedActivateContext activation{context};
void* pointer = absl::bit_cast<void*>(location);
unsigned char valueC = static_cast<unsigned char>(value);
uint32_t value32 = (valueC << 24) | (valueC << 16) | (valueC << 8) | (valueC);
if (value32 != value) {
// mismatch indicates case where hipMemsetAsyc can't emulate hipMemSetD32
LOG(ERROR) << "failed to memset memory";
return false;
}
hipError_t res = tensorflow::wrap::hipMemset(pointer, static_cast<int>(value),
uint32_count * 4);
hipError_t res = tensorflow::wrap::hipMemsetD32(pointer, value, uint32_count);
if (res != hipSuccess) {
LOG(ERROR) << "failed to memset memory: " << ToString(res);
return false;
Expand Down Expand Up @@ -553,17 +545,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
GpuStreamHandle stream) {
ScopedActivateContext activation{context};
void* pointer = absl::bit_cast<void*>(location);

// FIXME - need to set a 32-bit value here
unsigned char valueC = static_cast<unsigned char>(value);
uint32_t value32 = (valueC << 24) | (valueC << 16) | (valueC << 8) | (valueC);
if (value32 != value) {
// mismatch indicates case where hipMemsetAsyc can't emulate hipMemSetD32
LOG(ERROR) << "failed to memset memory";
return false;
}
hipError_t res = tensorflow::wrap::hipMemsetAsync(pointer, value,
uint32_count * 4, stream);
hipError_t res =
tensorflow::wrap::hipMemsetD32Async(pointer, value, uint32_count, stream);
if (res != hipSuccess) {
LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
return false;
Expand Down Expand Up @@ -671,7 +654,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
uint64 bytes) {
ScopedActivateContext activated{context};
hipDeviceptr_t result = 0;
hipError_t res = tensorflow::wrap::hipMallocVanilla(&result, bytes);
hipError_t res = tensorflow::wrap::hipMalloc(&result, bytes);
if (res != hipSuccess) {
LOG(ERROR) << "failed to allocate "
<< port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
Expand Down Expand Up @@ -717,7 +700,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
ScopedActivateContext activation{context};
void* host_mem = nullptr;
// "Portable" memory is visible to all ROCM contexts. Safe for our use model.
hipError_t res = tensorflow::wrap::hipHostMallocVanilla(
hipError_t res = tensorflow::wrap::hipHostMalloc(
&host_mem, bytes, hipHostMallocPortable);
if (res != hipSuccess) {
LOG(ERROR) << "failed to alloc " << bytes
Expand Down
22 changes: 7 additions & 15 deletions tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_

#define __HIP_DISABLE_CPP_FUNCTIONS__

#include "rocm/include/hip/hip_runtime.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
Expand Down Expand Up @@ -48,21 +50,6 @@ namespace wrap {
#define TO_STR_(x) #x
#define TO_STR(x) TO_STR_(x)

// hipMalloc and hipHostMalloc are defined as funtion templates in the
// HIP header files, and hence their names get mangled and the attempt
// to resolve their name when trying to dynamically load them will fail
// Updating the HIP header files to make them C functions is underway.
// Until that change flows through, we will workaround the issue by
// creating dummy wrappers for them here

hipError_t hipMallocVanilla(void** ptr, size_t size) {
return hipErrorNotInitialized;
}

hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
return hipErrorNotInitialized;
}

#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName) \
template <typename... Args> \
auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) { \
Expand Down Expand Up @@ -107,9 +94,11 @@ hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
__macro(hipGetDeviceCount) \
__macro(hipGetDeviceProperties) \
__macro(hipHostFree) \
__macro(hipHostMalloc) \
__macro(hipHostRegister) \
__macro(hipHostUnregister) \
__macro(hipInit) \
__macro(hipMalloc) \
__macro(hipMemGetAddressRange) \
__macro(hipMemGetInfo) \
__macro(hipMemcpyDtoD) \
Expand All @@ -119,7 +108,10 @@ hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
__macro(hipMemcpyHtoD) \
__macro(hipMemcpyHtoDAsync) \
__macro(hipMemset) \
__macro(hipMemsetD32) \
__macro(hipMemsetD8) \
__macro(hipMemsetAsync) \
__macro(hipMemsetD32Async) \
__macro(hipModuleGetFunction) \
__macro(hipModuleGetGlobal) \
__macro(hipModuleLaunchKernel) \
Expand Down
1 change: 1 addition & 0 deletions tensorflow/tensorflow.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -1209,6 +1209,7 @@ def tf_cuda_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs)
"@local_config_cuda//cuda:cuda_headers",
]) + if_rocm_is_configured(cuda_deps + [
# rocm_header placeholder
"@local_config_rocm//rocm:rocm_headers",
]),
copts = (copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_rocm(["-DTENSORFLOW_USE_ROCM=1"]) + if_mkl(["-DINTEL_MKL=1"]) + if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) + if_enable_mkl(["-DENABLE_MKL"]) + if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
**kwargs
Expand Down