[ROCm] Adding ROCm support for the fused_batch_norm op #30237

Merged
19 changes: 11 additions & 8 deletions tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -15,13 +15,16 @@ limitations under the License.

#define EIGEN_USE_THREADS

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
+#if GOOGLE_CUDA
#include "third_party/gpus/cudnn/cudnn.h"
+#endif  // GOOGLE_CUDA

#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/util/stream_executor_util.h"
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
@@ -73,7 +76,7 @@ struct FusedBatchNorm;
template <typename Device, typename T, typename U>
struct FusedBatchNormGrad;

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
using se::DeviceMemory;
using se::ScratchAllocator;
using se::Stream;
@@ -216,7 +219,7 @@ class CudnnBatchNormAllocatorInOutput : public ScratchAllocator {
OpKernelContext* context_; // not owned
int output_index_;
};
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <bool IsSame, typename Y, typename X, typename T>
struct CastIfNecessary {
@@ -423,14 +426,14 @@ struct FusedBatchNormGrad<CPUDevice, T, U> {
}
};

-#ifndef GOOGLE_CUDA
+#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
namespace {
// See implementation under GOOGLE_CUDA #ifdef below.
bool BatchnormSpatialPersistentEnabled() { return false; }
} // namespace
#endif

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace {

@@ -848,7 +851,7 @@ DECLARE_GPU_SPEC(Eigen::half, float);

#undef DECLARE_GPU_SPEC

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace functor

template <typename Device, typename T, typename U>
@@ -1250,7 +1253,7 @@ REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV3")
.TypeConstraint<float>("U"),
FusedBatchNormGradOpV3<CPUDevice, Eigen::half, float>);

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

REGISTER_KERNEL_BUILDER(
Name("FusedBatchNorm").Device(DEVICE_GPU).TypeConstraint<float>("T"),
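The guard changes in this file all follow one pattern: `#if GOOGLE_CUDA` around code that is generic to GPUs becomes `#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM`, while CUDA-only dependencies such as cudnn.h move behind a nested `#if GOOGLE_CUDA`. A minimal sketch of the resulting layout (illustrative file contents, not code from this PR):

```cpp
// Nested-guard layout used by fused_batch_norm_op.cc after this change.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU

#if GOOGLE_CUDA
// CUDA-only dependency: cuDNN has no ROCm counterpart at this layer,
// so its header stays behind the narrower guard.
#include "third_party/gpus/cudnn/cudnn.h"
#endif  // GOOGLE_CUDA

// Dependencies shared by CUDA and ROCm builds sit under the wide guard.
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
```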
22 changes: 12 additions & 10 deletions tensorflow/core/kernels/fused_batch_norm_op.cu.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
+#if GOOGLE_CUDA
#include "third_party/gpus/cuda/include/cuda.h"
+#endif
#include "tensorflow/core/kernels/fused_batch_norm_op.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

@@ -30,7 +32,7 @@ template struct FusedBatchNormFreezeGrad<GPUDevice, Eigen::half, float>;
template <class T>
__global__ void VarianceToInvVarianceKernel(int nthreads, const T* input,
double epsilon, T* output) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  GPU_1D_KERNEL_LOOP(index, nthreads) {
output[index] = rsqrt(input[index] + T(epsilon));
}
}
@@ -39,8 +41,8 @@ template <class T>
void VarianceToInvVariance<T>::operator()(const Eigen::GpuDevice& d,
const T* variance, double epsilon,
int channels, T* inv_variance) {
-  GpuLaunchConfig config = GetCudaLaunchConfig(channels, d);
-  TF_CHECK_OK(CudaLaunchKernel(VarianceToInvVarianceKernel<T>,
+  GpuLaunchConfig config = GetGpuLaunchConfig(channels, d);
+  TF_CHECK_OK(GpuLaunchKernel(VarianceToInvVarianceKernel<T>,
config.block_count, config.thread_per_block, 0,
d.stream(), config.virtual_thread_count,
variance, epsilon, inv_variance));
@@ -49,7 +51,7 @@ void VarianceToInvVariance<T>::operator()(const Eigen::GpuDevice& d,
template <class T>
__global__ void InvVarianceToVarianceKernel(int nthreads, double epsilon,
int sample_size, T* variance) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  GPU_1D_KERNEL_LOOP(index, nthreads) {
T inv_var = variance[index];
T var = __fdividef(1, inv_var * inv_var) - T(epsilon);
// This is for Bessel's correction
@@ -62,8 +64,8 @@ template <class T>
void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d,
double epsilon, int sample_size,
int channels, T* variance) {
-  GpuLaunchConfig config = GetCudaLaunchConfig(channels, d);
-  TF_CHECK_OK(CudaLaunchKernel(InvVarianceToVarianceKernel<T>,
+  GpuLaunchConfig config = GetGpuLaunchConfig(channels, d);
+  TF_CHECK_OK(GpuLaunchKernel(InvVarianceToVarianceKernel<T>,
config.block_count, config.thread_per_block, 0,
d.stream(), config.virtual_thread_count, epsilon,
sample_size, variance));
@@ -277,13 +279,13 @@ struct FusedBatchNormInferenceFunctor<GPUDevice, T, U> {
INNER_DIM_SIZE) \
launched = true; \
\
-    GpuLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( \
+    GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( \
std::is_same<T, Eigen::half>::value ? Eigen::divup(count, 2) : count, d, \
FusedBatchNormInferenceMetaKernel<T, U, DATA_FORMAT, ADD_SIDE_INPUT, \
ACTIVATION>, \
0, kThreadInBlock); \
\
-    TF_CHECK_OK(CudaLaunchKernel( \
+    TF_CHECK_OK(GpuLaunchKernel( \
FusedBatchNormInferenceMetaKernel<T, U, DATA_FORMAT, ADD_SIDE_INPUT, \
ACTIVATION>, \
config.block_count, config.thread_per_block, 0, d.stream(), count, \
@@ -342,4 +344,4 @@ template struct FusedBatchNormInferenceFunctor<GPUDevice, Eigen::half, float>;

#include "tensorflow/core/kernels/fused_batch_norm_op.h"

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
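Every launch-site change in this file is the same substitution: the CUDA-spelled helpers (`CUDA_1D_KERNEL_LOOP`, `GetCudaLaunchConfig`, `CudaLaunchKernel`) become their platform-neutral counterparts from `tensorflow/core/util/gpu_kernel_helper.h`, which resolve to CUDA or HIP depending on the build. A self-contained sketch of the pattern; `ScaleKernel` and `LaunchScale` are made-up names for illustration, not code from this PR:

```cpp
#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {

// Illustrative elementwise kernel, launched the same way as
// VarianceToInvVarianceKernel above.
template <class T>
__global__ void ScaleKernel(int nthreads, const T* in, T scale, T* out) {
  // GPU_1D_KERNEL_LOOP is the portable spelling of CUDA_1D_KERNEL_LOOP;
  // it expands to a grid-stride loop under both CUDA and ROCm.
  GPU_1D_KERNEL_LOOP(index, nthreads) { out[index] = in[index] * scale; }
}

template <class T>
void LaunchScale(const Eigen::GpuDevice& d, const T* in, T scale,
                 int n, T* out) {
  // GetGpuLaunchConfig sizes the grid for n elements, and GpuLaunchKernel
  // launches on the device stream without naming cudaLaunchKernel or
  // hipLaunchKernelGGL directly.
  GpuLaunchConfig config = GetGpuLaunchConfig(n, d);
  TF_CHECK_OK(GpuLaunchKernel(ScaleKernel<T>, config.block_count,
                              config.thread_per_block, 0, d.stream(),
                              config.virtual_thread_count, in, scale, out));
}

}  // namespace tensorflow
```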
4 changes: 2 additions & 2 deletions tensorflow/core/kernels/fused_batch_norm_op.h
@@ -35,7 +35,7 @@ string ToString(FusedBatchNormActivationMode activation_mode);
Status ParseActivationMode(OpKernelConstruction* context,
FusedBatchNormActivationMode* activation_mode);

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// There is a behavior difference between cuDNN v4 and v5 with regard to the
// scaling factor for function cudnnBatchNormalizationForwardInference.
@@ -82,7 +82,7 @@ struct FusedBatchNormInferenceFunctor {
typename TTypes<T, 4>::Tensor out);
};

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Functor used by FusedBatchNormGradOp to do the computations when
// is_training=False. Both CPU and GPU will use this functor.
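One related pattern from fused_batch_norm_op.cc above is worth spelling out: functions with a GPU-specific implementation keep a CPU-side stub, and its guard is now the inverted form `#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM`, so exactly one definition is visible in any build mode and ROCm builds pick up the real GPU version rather than the stub. A rough sketch, with a hypothetical caller (`ConfigureBatchNorm` is not part of this PR):

```cpp
#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
namespace {
// Stub for CPU-only builds. The real implementation sits under the wide
// GPU guard in fused_batch_norm_op.cc (elided from the diff above).
bool BatchnormSpatialPersistentEnabled() { return false; }
}  // namespace
#endif

// Since one definition is always visible, callers need no #ifdefs:
void ConfigureBatchNorm() {
  if (BatchnormSpatialPersistentEnabled()) {
    // e.g. request cuDNN's spatial-persistent batch-norm algorithm here.
  }
}
```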