Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ROCm] Adding ROCm support for the dilation ops #29186

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
4 changes: 2 additions & 2 deletions tensorflow/core/kernels/dilation_ops.cc
Expand Up @@ -467,7 +467,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER);

#undef REGISTER

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER(T) \
REGISTER_KERNEL_BUILDER( \
Expand All @@ -488,6 +488,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER);

#undef REGISTER

#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

} // namespace tensorflow
26 changes: 13 additions & 13 deletions tensorflow/core/kernels/dilation_ops_gpu.cu.cc
Expand Up @@ -15,7 +15,7 @@ limitations under the License.

// See docs in ../ops/nn_ops.cc.

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

Expand All @@ -42,7 +42,7 @@ __global__ void DilationKernel(const int32 nthreads, const T* input_ptr,
int output_cols, int stride_rows,
int stride_cols, int rate_rows, int rate_cols,
int pad_top, int pad_left, T* output_ptr) {
CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
GPU_1D_KERNEL_LOOP(out_idx, nthreads) {
// out_idx = d + depth * (w_out + output_cols * (h_out + output_rows * b))
const int d = out_idx % depth;
const int out_idx2 = out_idx / depth;
Expand Down Expand Up @@ -81,7 +81,7 @@ __global__ void DilationBackpropInputKernel(
int depth, int filter_rows, int filter_cols, int output_rows,
int output_cols, int stride_rows, int stride_cols, int rate_rows,
int rate_cols, int pad_top, int pad_left, T* in_backprop_ptr) {
CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
GPU_1D_KERNEL_LOOP(out_idx, nthreads) {
// out_idx = d + depth * (w_out + output_cols * (h_out + output_rows * b))
const int d = out_idx % depth;
const int out_idx2 = out_idx / depth;
Expand Down Expand Up @@ -116,7 +116,7 @@ __global__ void DilationBackpropInputKernel(
}
}
}
CudaAtomicAdd(
GpuAtomicAdd(
in_backprop_ptr + d +
depth * (w_in_max + input_cols * (h_in_max + input_rows * b)),
out_backprop_ptr[out_idx]);
Expand All @@ -130,7 +130,7 @@ __global__ void DilationBackpropFilterKernel(
int depth, int filter_rows, int filter_cols, int output_rows,
int output_cols, int stride_rows, int stride_cols, int rate_rows,
int rate_cols, int pad_top, int pad_left, T* filter_backprop_ptr) {
CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
GPU_1D_KERNEL_LOOP(out_idx, nthreads) {
// out_idx = d + depth * (w_out + output_cols * (h_out + output_rows * b))
const int d = out_idx % depth;
const int out_idx2 = out_idx / depth;
Expand Down Expand Up @@ -165,7 +165,7 @@ __global__ void DilationBackpropFilterKernel(
}
}
}
CudaAtomicAdd(
GpuAtomicAdd(
filter_backprop_ptr + d + depth * (w_max + filter_cols * h_max),
out_backprop_ptr[out_idx]);
}
Expand Down Expand Up @@ -193,9 +193,9 @@ struct Dilation<GPUDevice, T> {
const int output_cols = output.dimension(2);

const int total_count = batch * output_rows * output_cols * depth;
GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d);
GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d);

TF_CHECK_OK(CudaLaunchKernel(
TF_CHECK_OK(GpuLaunchKernel(
DilationKernel<T>, config.block_count, config.thread_per_block, 0,
d.stream(), config.virtual_thread_count, input.data(), filter.data(),
batch, input_rows, input_cols, depth, filter_rows, filter_cols,
Expand Down Expand Up @@ -229,14 +229,14 @@ struct DilationBackpropInput<GPUDevice, T> {
// Initialize in_backprop with all zeros.
total_count = batch * input_rows * input_cols * depth;
config = GetGpuLaunchConfig(total_count, d);
TF_CHECK_OK(CudaLaunchKernel(SetZero<T>, config.block_count,
TF_CHECK_OK(GpuLaunchKernel(SetZero<T>, config.block_count,
config.thread_per_block, 0, d.stream(),
total_count, in_backprop.data()));

// Accumulate.
total_count = batch * output_rows * output_cols * depth;
config = GetGpuLaunchConfig(total_count, d);
TF_CHECK_OK(CudaLaunchKernel(
TF_CHECK_OK(GpuLaunchKernel(
DilationBackpropInputKernel<T>, config.block_count,
config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
input.data(), filter.data(), out_backprop.data(), batch, input_rows,
Expand Down Expand Up @@ -271,14 +271,14 @@ struct DilationBackpropFilter<GPUDevice, T> {
// Initialize filter_backprop with all zeros.
total_count = filter_rows * filter_cols * depth;
config = GetGpuLaunchConfig(total_count, d);
TF_CHECK_OK(CudaLaunchKernel(SetZero<T>, config.block_count,
TF_CHECK_OK(GpuLaunchKernel(SetZero<T>, config.block_count,
config.thread_per_block, 0, d.stream(),
total_count, filter_backprop.data()));

// Accumulate.
total_count = batch * output_rows * output_cols * depth;
config = GetGpuLaunchConfig(total_count, d);
TF_CHECK_OK(CudaLaunchKernel(
TF_CHECK_OK(GpuLaunchKernel(
DilationBackpropFilterKernel<T>, config.block_count,
config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
input.data(), filter.data(), out_backprop.data(), batch, input_rows,
Expand All @@ -301,4 +301,4 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);

} // namespace tensorflow

#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM