Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ROCm] Adding ROCm support for the multinomial op #30133

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
12 changes: 6 additions & 6 deletions tensorflow/core/kernels/multinomial_op.cc
Expand Up @@ -53,7 +53,7 @@ struct MultinomialFunctor {
typename TTypes<OutputType>::Matrix output);
};

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
extern template struct MultinomialFunctor<GPUDevice, Eigen::half, int32>;
extern template struct MultinomialFunctor<GPUDevice, float, int32>;
extern template struct MultinomialFunctor<GPUDevice, double, int32>;
Expand All @@ -65,7 +65,7 @@ extern template struct MultinomialFunctor<GPUDevice, float, int64>;
extern template struct MultinomialFunctor<GPUDevice, double, int64>;
extern template struct MultinomialFunctor<GPUDevice, int32, int64>;
extern template struct MultinomialFunctor<GPUDevice, int64, int64>;
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename T, typename OutputType>
struct MultinomialFunctor<CPUDevice, T, OutputType> {
Expand Down Expand Up @@ -253,7 +253,7 @@ TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER(TYPE) \
REGISTER_KERNEL_BUILDER(Name("Multinomial") \
.Device(DEVICE_GPU) \
Expand All @@ -273,7 +273,7 @@ TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER

#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Device, typename T, typename OutputType>
class StatelessMultinomialOp : public MultinomialOp<Device, T, OutputType> {
Expand Down Expand Up @@ -321,7 +321,7 @@ TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER(TYPE) \
REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial") \
.Device(DEVICE_GPU) \
Expand All @@ -343,7 +343,7 @@ TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER

#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

} // end namespace

Expand Down
25 changes: 16 additions & 9 deletions tensorflow/core/kernels/multinomial_op_gpu.cu.cc
Expand Up @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

Expand All @@ -29,6 +29,12 @@ limitations under the License.
#include "tensorflow/core/lib/random/random_distributions.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

#if GOOGLE_CUDA
namespace gpuprim = ::cub;
#elif TENSORFLOW_USE_ROCM
namespace gpuprim = ::hipcub;
#endif

namespace tensorflow {

namespace functor {
Expand All @@ -41,12 +47,12 @@ template <typename OutputType>
__global__ void MultinomialKernel(int32 nthreads, const int32 num_classes,
const int32 num_samples, const float* scores,
const float* maxima, OutputType* output) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
GPU_1D_KERNEL_LOOP(index, nthreads) {
const int maxima_idx = index / num_classes;
if (ldg(maxima + maxima_idx) == ldg(scores + index)) {
using UnsignedOutputType = typename std::make_unsigned<OutputType>::type;
CudaAtomicMax(reinterpret_cast<UnsignedOutputType*>(output + maxima_idx),
static_cast<UnsignedOutputType>(index % num_classes));
GpuAtomicMax(reinterpret_cast<UnsignedOutputType*>(output + maxima_idx),
static_cast<UnsignedOutputType>(index % num_classes));
}
}
}
Expand Down Expand Up @@ -98,8 +104,9 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
// Max-reduce along classes for each (batch, sample).
typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
Constants<GPUDevice> constants;
cub::Max op;
functor::ReduceImpl<float, cub::Max, float*, const float*, ReductionAxes>(
gpuprim::Max op;
functor::ReduceImpl<float, gpuprim::Max, float*, const float*,
ReductionAxes>(
/*ctx=*/ctx, /*out=*/maxima.data(), /*in=*/scores.data(), /*in_rank=*/2,
/*in_dim0=*/batch_size * num_samples,
/*in_dim1=*/num_classes, /*in_dim2=*/1, /*out_rank=*/1,
Expand All @@ -109,8 +116,8 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
output.device(d) = output.constant(0LL);

const int32 work_items = batch_size * num_samples * num_classes;
GpuLaunchConfig config = GetCudaLaunchConfig(work_items, d);
TF_CHECK_OK(CudaLaunchKernel(
GpuLaunchConfig config = GetGpuLaunchConfig(work_items, d);
TF_CHECK_OK(GpuLaunchKernel(
MultinomialKernel<OutputType>, config.block_count,
config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
num_classes, num_samples, scores.data(), maxima.data(), output.data()));
Expand All @@ -133,4 +140,4 @@ template struct MultinomialFunctor<GPUDevice, int64, int64>;
} // namespace functor
} // namespace tensorflow

#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM