Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ROCm] Adding ROCm support for the multinomial op #30133

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
12 changes: 6 additions & 6 deletions tensorflow/core/kernels/multinomial_op.cc
Expand Up @@ -53,7 +53,7 @@ struct MultinomialFunctor {
typename TTypes<OutputType>::Matrix output);
};

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
extern template struct MultinomialFunctor<GPUDevice, Eigen::half, int32>;
extern template struct MultinomialFunctor<GPUDevice, float, int32>;
extern template struct MultinomialFunctor<GPUDevice, double, int32>;
Expand All @@ -65,7 +65,7 @@ extern template struct MultinomialFunctor<GPUDevice, float, int64>;
extern template struct MultinomialFunctor<GPUDevice, double, int64>;
extern template struct MultinomialFunctor<GPUDevice, int32, int64>;
extern template struct MultinomialFunctor<GPUDevice, int64, int64>;
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename T, typename OutputType>
struct MultinomialFunctor<CPUDevice, T, OutputType> {
Expand Down Expand Up @@ -253,7 +253,7 @@ TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER(TYPE) \
REGISTER_KERNEL_BUILDER(Name("Multinomial") \
.Device(DEVICE_GPU) \
Expand All @@ -273,7 +273,7 @@ TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER

#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Device, typename T, typename OutputType>
class StatelessMultinomialOp : public MultinomialOp<Device, T, OutputType> {
Expand Down Expand Up @@ -321,7 +321,7 @@ TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER(TYPE) \
REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial") \
.Device(DEVICE_GPU) \
Expand All @@ -343,7 +343,7 @@ TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER

#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

} // end namespace

Expand Down
25 changes: 16 additions & 9 deletions tensorflow/core/kernels/multinomial_op_gpu.cu.cc
Expand Up @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

Expand All @@ -29,6 +29,12 @@ limitations under the License.
#include "tensorflow/core/lib/random/random_distributions.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

#if GOOGLE_CUDA
namespace gpuprim = ::cub;
#elif TENSORFLOW_USE_ROCM
namespace gpuprim = ::hipcub;
#endif

namespace tensorflow {

namespace functor {
Expand All @@ -41,12 +47,12 @@ template <typename OutputType>
__global__ void MultinomialKernel(int32 nthreads, const int32 num_classes,
const int32 num_samples, const float* scores,
const float* maxima, OutputType* output) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
GPU_1D_KERNEL_LOOP(index, nthreads) {
const int maxima_idx = index / num_classes;
if (ldg(maxima + maxima_idx) == ldg(scores + index)) {
using UnsignedOutputType = typename std::make_unsigned<OutputType>::type;
CudaAtomicMax(reinterpret_cast<UnsignedOutputType*>(output + maxima_idx),
static_cast<UnsignedOutputType>(index % num_classes));
GpuAtomicMax(reinterpret_cast<UnsignedOutputType*>(output + maxima_idx),
static_cast<UnsignedOutputType>(index % num_classes));
}
}
}
Expand Down Expand Up @@ -98,8 +104,9 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
// Max-reduce along classes for each (batch, sample).
typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
Constants<GPUDevice> constants;
cub::Max op;
functor::ReduceImpl<float, cub::Max, float*, const float*, ReductionAxes>(
gpuprim::Max op;
functor::ReduceImpl<float, gpuprim::Max, float*, const float*,
ReductionAxes>(
/*ctx=*/ctx, /*out=*/maxima.data(), /*in=*/scores.data(), /*in_rank=*/2,
/*in_dim0=*/batch_size * num_samples,
/*in_dim1=*/num_classes, /*in_dim2=*/1, /*out_rank=*/1,
Expand All @@ -109,8 +116,8 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
output.device(d) = output.constant(0LL);

const int32 work_items = batch_size * num_samples * num_classes;
GpuLaunchConfig config = GetCudaLaunchConfig(work_items, d);
TF_CHECK_OK(CudaLaunchKernel(
GpuLaunchConfig config = GetGpuLaunchConfig(work_items, d);
TF_CHECK_OK(GpuLaunchKernel(
MultinomialKernel<OutputType>, config.block_count,
config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
num_classes, num_samples, scores.data(), maxima.data(), output.data()));
Expand All @@ -133,4 +140,4 @@ template struct MultinomialFunctor<GPUDevice, int64, int64>;
} // namespace functor
} // namespace tensorflow

#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM