Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ROCm] Adding ROCm support for the dilation ops #29186

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
4 changes: 2 additions & 2 deletions tensorflow/core/kernels/dilation_ops.cc
Expand Up @@ -467,7 +467,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER);

#undef REGISTER

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER(T) \
REGISTER_KERNEL_BUILDER( \
Expand All @@ -488,6 +488,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER);

#undef REGISTER

#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

} // namespace tensorflow
26 changes: 13 additions & 13 deletions tensorflow/core/kernels/dilation_ops_gpu.cu.cc
Expand Up @@ -15,7 +15,7 @@ limitations under the License.

// See docs in ../ops/nn_ops.cc.

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

Expand All @@ -42,7 +42,7 @@ __global__ void DilationKernel(const int32 nthreads, const T* input_ptr,
int output_cols, int stride_rows,
int stride_cols, int rate_rows, int rate_cols,
int pad_top, int pad_left, T* output_ptr) {
CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
GPU_1D_KERNEL_LOOP(out_idx, nthreads) {
// out_idx = d + depth * (w_out + output_cols * (h_out + output_rows * b))
const int d = out_idx % depth;
const int out_idx2 = out_idx / depth;
Expand Down Expand Up @@ -81,7 +81,7 @@ __global__ void DilationBackpropInputKernel(
int depth, int filter_rows, int filter_cols, int output_rows,
int output_cols, int stride_rows, int stride_cols, int rate_rows,
int rate_cols, int pad_top, int pad_left, T* in_backprop_ptr) {
CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
GPU_1D_KERNEL_LOOP(out_idx, nthreads) {
// out_idx = d + depth * (w_out + output_cols * (h_out + output_rows * b))
const int d = out_idx % depth;
const int out_idx2 = out_idx / depth;
Expand Down Expand Up @@ -116,7 +116,7 @@ __global__ void DilationBackpropInputKernel(
}
}
}
CudaAtomicAdd(
GpuAtomicAdd(
in_backprop_ptr + d +
depth * (w_in_max + input_cols * (h_in_max + input_rows * b)),
out_backprop_ptr[out_idx]);
Expand All @@ -130,7 +130,7 @@ __global__ void DilationBackpropFilterKernel(
int depth, int filter_rows, int filter_cols, int output_rows,
int output_cols, int stride_rows, int stride_cols, int rate_rows,
int rate_cols, int pad_top, int pad_left, T* filter_backprop_ptr) {
CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
GPU_1D_KERNEL_LOOP(out_idx, nthreads) {
// out_idx = d + depth * (w_out + output_cols * (h_out + output_rows * b))
const int d = out_idx % depth;
const int out_idx2 = out_idx / depth;
Expand Down Expand Up @@ -165,7 +165,7 @@ __global__ void DilationBackpropFilterKernel(
}
}
}
CudaAtomicAdd(
GpuAtomicAdd(
filter_backprop_ptr + d + depth * (w_max + filter_cols * h_max),
out_backprop_ptr[out_idx]);
}
Expand Down Expand Up @@ -193,9 +193,9 @@ struct Dilation<GPUDevice, T> {
const int output_cols = output.dimension(2);

const int total_count = batch * output_rows * output_cols * depth;
GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d);
GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d);

TF_CHECK_OK(CudaLaunchKernel(
TF_CHECK_OK(GpuLaunchKernel(
DilationKernel<T>, config.block_count, config.thread_per_block, 0,
d.stream(), config.virtual_thread_count, input.data(), filter.data(),
batch, input_rows, input_cols, depth, filter_rows, filter_cols,
Expand Down Expand Up @@ -229,14 +229,14 @@ struct DilationBackpropInput<GPUDevice, T> {
// Initialize in_backprop with all zeros.
total_count = batch * input_rows * input_cols * depth;
config = GetGpuLaunchConfig(total_count, d);
TF_CHECK_OK(CudaLaunchKernel(SetZero<T>, config.block_count,
TF_CHECK_OK(GpuLaunchKernel(SetZero<T>, config.block_count,
config.thread_per_block, 0, d.stream(),
total_count, in_backprop.data()));

// Accumulate.
total_count = batch * output_rows * output_cols * depth;
config = GetGpuLaunchConfig(total_count, d);
TF_CHECK_OK(CudaLaunchKernel(
TF_CHECK_OK(GpuLaunchKernel(
DilationBackpropInputKernel<T>, config.block_count,
config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
input.data(), filter.data(), out_backprop.data(), batch, input_rows,
Expand Down Expand Up @@ -271,14 +271,14 @@ struct DilationBackpropFilter<GPUDevice, T> {
// Initialize filter_backprop with all zeros.
total_count = filter_rows * filter_cols * depth;
config = GetGpuLaunchConfig(total_count, d);
TF_CHECK_OK(CudaLaunchKernel(SetZero<T>, config.block_count,
TF_CHECK_OK(GpuLaunchKernel(SetZero<T>, config.block_count,
config.thread_per_block, 0, d.stream(),
total_count, filter_backprop.data()));

// Accumulate.
total_count = batch * output_rows * output_cols * depth;
config = GetGpuLaunchConfig(total_count, d);
TF_CHECK_OK(CudaLaunchKernel(
TF_CHECK_OK(GpuLaunchKernel(
DilationBackpropFilterKernel<T>, config.block_count,
config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
input.data(), filter.data(), out_backprop.data(), batch, input_rows,
Expand All @@ -301,4 +301,4 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);

} // namespace tensorflow

#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM