[ROCm] Forward host function names from gpu prefix to cuda prefix #28568

Merged

Changes from all commits
tensorflow/core/util/gpu_cuda_alias.h (14 changes: 7 additions & 7 deletions)
@@ -17,14 +17,14 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_

 // Several forwarding macros are defined in this file to serve for backward
-// compatibility usage as we migrating from Cuda prefixed function to Gpu
-// prefixed functions. Both Cuda and ROCm can unify under the new Gpu prefix
-// naming scheme. In the migration period, we provide equivalent Cuda* and Gpu*
-// function. Over time, all Cuda* functions will be deprecated.
+// compatibility usage as we migrating from CUDA prefixed function to GPU
+// prefixed functions. Both Cuda and ROCm can unify under the new GPU prefix
+// naming scheme. In the migration period, we provide equivalent CUDA* and GPU*
+// function. Over time, all CUDA* functions will be deprecated.

 namespace tensorflow {

-// CREATE_CUDA_HOST_FUNCTION_ALIAS forward the host function to its Cuda Alias.
+// CREATE_CUDA_HOST_FUNCTION_ALIAS forward the host function to its CUDA Alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) \
   template <typename... Args>                             \
@@ -36,7 +36,7 @@ namespace tensorflow {
 #define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias)
 #endif

-// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forward the device function to its Cuda
+// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forward the device function to its CUDA
 // Alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) \
@@ -49,7 +49,7 @@ namespace tensorflow {
 #define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias)
 #endif

-// CREATE_CUDA_TYPE_ALIAS forward the type to its Cuda Alias.
+// CREATE_CUDA_TYPE_ALIAS forward the type to its CUDA Alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_TYPE_ALIAS(type, cuda_alias) using cuda_alias = type;
 #else
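The macro bodies are truncated in the diff above, but the pattern they implement is a thin perfect-forwarding wrapper: legacy Cuda*-prefixed call sites keep compiling while the implementation lives under the Gpu* name. Below is a minimal self-contained sketch of that pattern, assuming nothing beyond standard C++; it is an illustrative reconstruction, not the verbatim TensorFlow macro, and GpuAnswer/CudaAnswer are hypothetical names.

#include <utility>

// Sketch: generate cuda_alias() as a perfect-forwarding wrapper around
// the Gpu-prefixed func(), preserving its return type via decltype.
#define CREATE_CUDA_HOST_FUNCTION_ALIAS_SKETCH(func, cuda_alias) \
  template <typename... Args>                                    \
  auto cuda_alias(Args&&... args)                                \
      -> decltype(func(std::forward<Args>(args)...)) {           \
    return func(std::forward<Args>(args)...);                    \
  }

// A hypothetical Gpu-prefixed host function.
inline int GpuAnswer(int x) { return x + 42; }

// Expands to a CudaAnswer(...) that forwards to GpuAnswer(...).
CREATE_CUDA_HOST_FUNCTION_ALIAS_SKETCH(GpuAnswer, CudaAnswer);

// Old call sites keep compiling: CudaAnswer(1) forwards to GpuAnswer(1).

On ROCm builds the real macro expands to nothing, so the Cuda* spellings are simply absent there and only the Gpu* names remain.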
tensorflow/core/util/gpu_kernel_helper.h (26 changes: 23 additions & 3 deletions)
@@ -53,16 +53,36 @@ limitations under the License.
 #define gpuSuccess cudaSuccess
 using gpuStream_t = cudaStream_t;
 using gpuError_t = cudaError_t;
 #elif TENSORFLOW_USE_ROCM
 #define gpuSuccess hipSuccess
 using gpuStream_t = hipStream_t;
 using gpuError_t = hipError_t;
 #endif

+#define GetGPUStream(context) context->eigen_gpu_device().stream()
+
 namespace tensorflow {
+#if GOOGLE_CUDA
+// cudaGetErrorString is available to both host and device
+__host__ __device__ inline const char* GpuGetErrorString(cudaError_t error) {
+  return cudaGetErrorString(error);
+#elif TENSORFLOW_USE_ROCM
+// hipGetErrorString is available on host side only
+inline const char* GpuGetErrorString(hipError_t error) {
+  return hipGetErrorString(error);
+#endif
+}
+
+inline const gpuStream_t& GetGpuStream(OpKernelContext* context) {
+  // Returns a raw reference to the current cuda stream. Required by a
+  // number of kernel calls (for which StreamInterface* does not work),
+  // i.e. CUB and certain cublas primitives.
+  const gpuStream_t* ptr = CHECK_NOTNULL(
+      reinterpret_cast<const gpuStream_t*>(context->op_device_context()
+                                               ->stream()
+                                               ->implementation()
+                                               ->GpuStreamMemberHack()));
+  return *ptr;
+}
+
 __host__ __device__ inline tensorflow::bfloat16 CudaLdg(
     const tensorflow::bfloat16* address) {
   tensorflow::bfloat16 return_value;
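A hedged usage sketch of the two helpers this file gains: GpuGetErrorString for platform-neutral error text, and tensorflow::GetGpuStream for the raw stream handle. GPU_ASSERT_OK, FillKernel, and LaunchFill are hypothetical names introduced for illustration; the sketch assumes a CUDA build with gpu_kernel_helper.h included.

// Hypothetical error-check helper built on the additions above.
#define GPU_ASSERT_OK(expr)                        \
  do {                                             \
    gpuError_t gpu_assert_err = (expr);            \
    CHECK_EQ(gpu_assert_err, gpuSuccess)           \
        << GpuGetErrorString(gpu_assert_err);      \
  } while (0)

__global__ void FillKernel(float* out, int n, float value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = value;
}

void LaunchFill(tensorflow::OpKernelContext* context, float* out, int n) {
  // Raw gpuStream_t, as needed by CUB and certain cuBLAS primitives for
  // which the StreamExecutor wrapper does not work.
  const gpuStream_t& stream = tensorflow::GetGpuStream(context);
  FillKernel<<<(n + 255) / 256, 256, 0, stream>>>(out, n, 1.0f);
  GPU_ASSERT_OK(cudaGetLastError());
}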
tensorflow/core/util/gpu_launch_config.h (30 changes: 5 additions & 25 deletions)
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/gpu_cuda_alias.h"

 // Usage of GetGpuLaunchConfig, GetGpu2DLaunchConfig, and
 // GetGpu3DLaunchConfig:
@@ -192,14 +193,7 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
   config.block_count = block_count;
   return config;
 }
-template <typename DeviceFunc>
-CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                     const Eigen::GpuDevice& d, DeviceFunc func,
-                                     size_t dynamic_shared_memory_size,
-                                     int block_size_limit) {
-  return GetGpuLaunchConfig(work_element_count, d, func,
-                            dynamic_shared_memory_size, block_size_limit);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfig, GetCudaLaunchConfig);

 // Calculate the Cuda launch config we should use for a kernel launch. This
 // variant takes the resource limits of func into account to maximize occupancy.
@@ -244,14 +238,7 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
   config.block_count = block_count;
   return config;
 }
-template <typename DeviceFunc>
-CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize(
-    int work_element_count, const Eigen::GpuDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int fixed_block_size) {
-  return GetGpuLaunchConfigFixedBlockSize(work_element_count, d, func,
-                                          dynamic_shared_memory_size,
-                                          fixed_block_size);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfigFixedBlockSize, GetCudaLaunchConfigFixedBlockSize);

 struct Gpu2DLaunchConfig {
   dim3 virtual_thread_count = dim3(0, 0, 0);
@@ -368,15 +355,7 @@ Cuda3DLaunchConfig GetGpu3DLaunchConfig(int xdim, int ydim, int zdim,
   config.block_count = dim3(blocksx, blocksy, blocksz);
   return config;
 }
-template <typename DeviceFunc>
-Cuda3DLaunchConfig GetCuda3DLaunchConfig(int xdim, int ydim, int zdim,
-                                         const Eigen::GpuDevice& d,
-                                         DeviceFunc func,
-                                         size_t dynamic_shared_memory_size,
-                                         int block_size_limit) {
-  return GetGpu3DLaunchConfig(xdim, ydim, zdim, d, func,
-                              dynamic_shared_memory_size, block_size_limit);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu3DLaunchConfig, GetCuda3DLaunchConfig);

 template <typename DeviceFunc>
 Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
@@ -387,6 +366,7 @@ Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
   return GetGpu3DLaunchConfig(xdim, ydim, 1, d, func,
                               dynamic_shared_memory_size, block_size_limit);
 }
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu2DLaunchConfig, GetCuda2DLaunchConfig);

 #if GOOGLE_CUDA
 // Returns a raw reference to the current cuda stream. Required by a
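To see what the alias lines buy: a typical occupancy-aware launch goes through the Gpu* API, while the legacy Cuda* spelling still resolves to the same implementation on CUDA builds. A hedged sketch follows; AddOneKernel and LaunchAddOne are hypothetical, and the field names follow the GpuLaunchConfig struct defined in this header.

__global__ void AddOneKernel(int count, float* data) {
  // Grid-stride loop over the virtual thread count.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count;
       i += gridDim.x * blockDim.x) {
    data[i] += 1.0f;
  }
}

void LaunchAddOne(const Eigen::GpuDevice& d, int count, float* data) {
  // Occupancy-aware config: no dynamic shared memory, no block size limit.
  GpuLaunchConfig cfg = GetGpuLaunchConfig(count, d, AddOneKernel,
                                           /*dynamic_shared_memory_size=*/0,
                                           /*block_size_limit=*/0);
  AddOneKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
      cfg.virtual_thread_count, data);
  // Legacy spelling, still valid on CUDA builds via the alias:
  //   CudaLaunchConfig old = GetCudaLaunchConfig(count, d, AddOneKernel, 0, 0);
}

Replacing the hand-written Cuda* wrappers with CREATE_CUDA_HOST_FUNCTION_ALIAS keeps the forwarding logic in one place, and on ROCm builds the macro expands to nothing, so the Cuda* spellings drop out entirely.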