[ROCm] Adding ROCm support for the fused_batch_norm op #30237

Merged
19 changes: 11 additions & 8 deletions tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -15,13 +15,16 @@ limitations under the License.

#define EIGEN_USE_THREADS

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
+#if GOOGLE_CUDA
#include "third_party/gpus/cudnn/cudnn.h"
+#endif  // GOOGLE_CUDA

#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/util/stream_executor_util.h"
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
@@ -73,7 +76,7 @@ struct FusedBatchNorm;
template <typename Device, typename T, typename U>
struct FusedBatchNormGrad;

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
using se::DeviceMemory;
using se::ScratchAllocator;
using se::Stream;
@@ -216,7 +219,7 @@ class CudnnBatchNormAllocatorInOutput : public ScratchAllocator {
OpKernelContext* context_; // not owned
int output_index_;
};
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <bool IsSame, typename Y, typename X, typename T>
struct CastIfNecessary {
@@ -423,14 +426,14 @@ struct FusedBatchNormGrad<CPUDevice, T, U> {
}
};

-#ifndef GOOGLE_CUDA
+#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
namespace {
// See implementation under GOOGLE_CUDA #ifdef below.
bool BatchnormSpatialPersistentEnabled() { return false; }
} // namespace
#endif

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace {

@@ -848,7 +851,7 @@ DECLARE_GPU_SPEC(Eigen::half, float);

#undef DECLARE_GPU_SPEC

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace functor

template <typename Device, typename T, typename U>
@@ -1250,7 +1253,7 @@ REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV3")
.TypeConstraint<float>("U"),
FusedBatchNormGradOpV3<CPUDevice, Eigen::half, float>);

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

REGISTER_KERNEL_BUILDER(
Name("FusedBatchNorm").Device(DEVICE_GPU).TypeConstraint<float>("T"),
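The guard changes in this file all follow one pattern: `#if GOOGLE_CUDA` around code that is generic to GPUs becomes `#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM`, while CUDA-only dependencies such as cudnn.h move behind a nested `#if GOOGLE_CUDA`. A minimal sketch of the resulting layout (illustrative file contents, not code from this PR):

```cpp
// Nested-guard layout used by fused_batch_norm_op.cc after this change.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU

#if GOOGLE_CUDA
// CUDA-only dependency: cuDNN has no ROCm counterpart at this layer,
// so its header stays behind the narrower guard.
#include "third_party/gpus/cudnn/cudnn.h"
#endif  // GOOGLE_CUDA

// Dependencies shared by CUDA and ROCm builds sit under the wide guard.
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
```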
22 changes: 12 additions & 10 deletions tensorflow/core/kernels/fused_batch_norm_op.cu.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
+#if GOOGLE_CUDA
#include "third_party/gpus/cuda/include/cuda.h"
+#endif
#include "tensorflow/core/kernels/fused_batch_norm_op.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

@@ -30,7 +32,7 @@ template struct FusedBatchNormFreezeGrad<GPUDevice, Eigen::half, float>;
template <class T>
__global__ void VarianceToInvVarianceKernel(int nthreads, const T* input,
double epsilon, T* output) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  GPU_1D_KERNEL_LOOP(index, nthreads) {
output[index] = rsqrt(input[index] + T(epsilon));
}
}
@@ -39,8 +41,8 @@ template <class T>
void VarianceToInvVariance<T>::operator()(const Eigen::GpuDevice& d,
const T* variance, double epsilon,
int channels, T* inv_variance) {
-  GpuLaunchConfig config = GetCudaLaunchConfig(channels, d);
-  TF_CHECK_OK(CudaLaunchKernel(VarianceToInvVarianceKernel<T>,
+  GpuLaunchConfig config = GetGpuLaunchConfig(channels, d);
+  TF_CHECK_OK(GpuLaunchKernel(VarianceToInvVarianceKernel<T>,
config.block_count, config.thread_per_block, 0,
d.stream(), config.virtual_thread_count,
variance, epsilon, inv_variance));
@@ -49,7 +51,7 @@ void VarianceToInvVariance<T>::operator()(const Eigen::GpuDevice& d,
template <class T>
__global__ void InvVarianceToVarianceKernel(int nthreads, double epsilon,
int sample_size, T* variance) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  GPU_1D_KERNEL_LOOP(index, nthreads) {
T inv_var = variance[index];
T var = __fdividef(1, inv_var * inv_var) - T(epsilon);
// This is for Bessel's correction
@@ -62,8 +64,8 @@ template <class T>
void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d,
double epsilon, int sample_size,
int channels, T* variance) {
-  GpuLaunchConfig config = GetCudaLaunchConfig(channels, d);
-  TF_CHECK_OK(CudaLaunchKernel(InvVarianceToVarianceKernel<T>,
+  GpuLaunchConfig config = GetGpuLaunchConfig(channels, d);
+  TF_CHECK_OK(GpuLaunchKernel(InvVarianceToVarianceKernel<T>,
config.block_count, config.thread_per_block, 0,
d.stream(), config.virtual_thread_count, epsilon,
sample_size, variance));
@@ -277,13 +279,13 @@ struct FusedBatchNormInferenceFunctor<GPUDevice, T, U> {
INNER_DIM_SIZE) \
launched = true; \
\
-    GpuLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( \
+    GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( \
std::is_same<T, Eigen::half>::value ? Eigen::divup(count, 2) : count, d, \
FusedBatchNormInferenceMetaKernel<T, U, DATA_FORMAT, ADD_SIDE_INPUT, \
ACTIVATION>, \
0, kThreadInBlock); \
\
-    TF_CHECK_OK(CudaLaunchKernel( \
+    TF_CHECK_OK(GpuLaunchKernel( \
FusedBatchNormInferenceMetaKernel<T, U, DATA_FORMAT, ADD_SIDE_INPUT, \
ACTIVATION>, \
config.block_count, config.thread_per_block, 0, d.stream(), count, \
@@ -342,4 +344,4 @@ template struct FusedBatchNormInferenceFunctor<GPUDevice, Eigen::half, float>;

#include "tensorflow/core/kernels/fused_batch_norm_op.h"

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
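Every launch-site change in this file is the same substitution: the CUDA-spelled helpers (`CUDA_1D_KERNEL_LOOP`, `GetCudaLaunchConfig`, `CudaLaunchKernel`) become their platform-neutral counterparts from `tensorflow/core/util/gpu_kernel_helper.h`, which resolve to CUDA or HIP depending on the build. A self-contained sketch of the pattern; `ScaleKernel` and `LaunchScale` are made-up names for illustration, not code from this PR:

```cpp
#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {

// Illustrative elementwise kernel, launched the same way as
// VarianceToInvVarianceKernel above.
template <class T>
__global__ void ScaleKernel(int nthreads, const T* in, T scale, T* out) {
  // GPU_1D_KERNEL_LOOP is the portable spelling of CUDA_1D_KERNEL_LOOP;
  // it expands to a grid-stride loop under both CUDA and ROCm.
  GPU_1D_KERNEL_LOOP(index, nthreads) { out[index] = in[index] * scale; }
}

template <class T>
void LaunchScale(const Eigen::GpuDevice& d, const T* in, T scale,
                 int n, T* out) {
  // GetGpuLaunchConfig sizes the grid for n elements, and GpuLaunchKernel
  // launches on the device stream without naming cudaLaunchKernel or
  // hipLaunchKernelGGL directly.
  GpuLaunchConfig config = GetGpuLaunchConfig(n, d);
  TF_CHECK_OK(GpuLaunchKernel(ScaleKernel<T>, config.block_count,
                              config.thread_per_block, 0, d.stream(),
                              config.virtual_thread_count, in, scale, out));
}

}  // namespace tensorflow
```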
4 changes: 2 additions & 2 deletions tensorflow/core/kernels/fused_batch_norm_op.h
@@ -35,7 +35,7 @@ string ToString(FusedBatchNormActivationMode activation_mode);
Status ParseActivationMode(OpKernelConstruction* context,
FusedBatchNormActivationMode* activation_mode);

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// There is a behavior difference between cuDNN v4 and v5 with regard to the
// scaling factor for function cudnnBatchNormalizationForwardInference.
@@ -82,7 +82,7 @@ struct FusedBatchNormInferenceFunctor {
typename TTypes<T, 4>::Tensor out);
};

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Functor used by FusedBatchNormGradOp to do the computations when
// is_training=False. Both CPU and GPU will use this functor.
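One related pattern from fused_batch_norm_op.cc above is worth spelling out: functions with a GPU-specific implementation keep a CPU-side stub, and its guard is now the inverted form `#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM`, so exactly one definition is visible in any build mode and ROCm builds pick up the real GPU version rather than the stub. A rough sketch, with a hypothetical caller (`ConfigureBatchNorm` is not part of this PR):

```cpp
#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
namespace {
// Stub for CPU-only builds. The real implementation sits under the wide
// GPU guard in fused_batch_norm_op.cc (elided from the diff above).
bool BatchnormSpatialPersistentEnabled() { return false; }
}  // namespace
#endif

// Since one definition is always visible, callers need no #ifdefs:
void ConfigureBatchNorm() {
  if (BatchnormSpatialPersistentEnabled()) {
    // e.g. request cuDNN's spatial-persistent batch-norm algorithm here.
  }
}
```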