[ROCm] Add ROCm support for pooling operators #30231

Merged: 2 commits, Jul 1, 2019
10 changes: 5 additions & 5 deletions tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

@@ -48,7 +48,7 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_t,
const int pad_l, dtype* const bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
// find out the local index
// find out the local offset
const int c = index % channels;
@@ -90,8 +90,8 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
const int pad_l, T* const bottom_diff,
const GPUDevice& d) {
int x_size = num * height * width * channels;
-GpuLaunchConfig config = GetCudaLaunchConfig(x_size, d);
-TF_CHECK_OK(CudaLaunchKernel(
+GpuLaunchConfig config = GetGpuLaunchConfig(x_size, d);
+TF_CHECK_OK(GpuLaunchKernel(
AvePoolBackwardNHWC<T>, config.block_count, config.thread_per_block, 0,
d.stream(), config.virtual_thread_count, top_diff, num, height, width,
channels, pooled_height, pooled_width, kernel_h, kernel_w, stride_h,
@@ -121,4 +121,4 @@ template bool RunAvePoolBackwardNHWC(

} // end namespace tensorflow

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
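For context (this sketch is not part of the diff): the CUDA-prefixed symbols replaced above have GPU-neutral counterparts in tensorflow/core/util/gpu_kernel_helper.h that compile to CUDA or to HIP depending on the build. A minimal, hypothetical example of a kernel written and launched with those helpers, assuming that header and an Eigen::GpuDevice d are available:

// Hypothetical sketch, not taken from this PR: an element-wise kernel using the
// backend-neutral helpers so the same source builds for both CUDA and ROCm.
#include "tensorflow/core/util/gpu_kernel_helper.h"

template <typename T>
__global__ void ScaleKernel(int n, const T* in, T scale, T* out) {
  // GPU_1D_KERNEL_LOOP expands to a grid-stride loop on either backend.
  GPU_1D_KERNEL_LOOP(i, n) { out[i] = in[i] * scale; }
}

template <typename T>
void LaunchScale(const Eigen::GpuDevice& d, int n, const T* in, T scale, T* out) {
  GpuLaunchConfig config = GetGpuLaunchConfig(n, d);
  TF_CHECK_OK(GpuLaunchKernel(ScaleKernel<T>, config.block_count,
                              config.thread_per_block, 0, d.stream(),
                              config.virtual_thread_count, in, scale, out));
}
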
8 changes: 4 additions & 4 deletions tensorflow/core/kernels/cudnn_pooling_gpu.cc
@@ -28,7 +28,7 @@ typedef Eigen::GpuDevice GPUDevice;

namespace tensorflow {

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename T>
void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
@@ -103,7 +103,7 @@ void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
output_desc, &output_data)
.ok();
OP_REQUIRES(context, status,
errors::Internal("cudnn PoolForward launch failed"));
errors::Internal("dnn PoolForward launch failed"));

if (data_format == FORMAT_NHWC) {
auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
@@ -232,7 +232,7 @@ void DnnPooling3dGradOp<T>::Compute(
output_backprop_data, &input_backprop_data)
.ok();
OP_REQUIRES(context, status,
errors::Internal("cudnn PoolBackward launch failed"));
errors::Internal("dnn PoolBackward launch failed"));

if (data_format == FORMAT_NHWC) {
auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
@@ -249,6 +249,6 @@
TF_CALL_float(DEFINE_DNN_OPS) TF_CALL_half(DEFINE_DNN_OPS)
#undef DEFINE_DNN_OPS

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

} // namespace tensorflow
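Aside (illustrative, not part of the diff): the error text changes from "cudnn ..." to "dnn ..." because the same StreamExecutor DNN path now serves two backends, cuDNN on CUDA builds and MIOpen on ROCm builds, so naming cuDNN in the message would be misleading. The guard idiom these files share looks roughly like this:

// Illustrative only: shared GPU code is compiled under the combined condition,
// while anything cuDNN-specific stays behind the CUDA-only guard.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// ... common code path; DNN calls hit cuDNN on CUDA, MIOpen on ROCm ...
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#if GOOGLE_CUDA
// ... cuDNN-only code, e.g. direct use of third_party/gpus/cudnn/cudnn.h ...
#endif  // GOOGLE_CUDA
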
4 changes: 2 additions & 2 deletions tensorflow/core/kernels/cudnn_pooling_gpu.h
@@ -22,15 +22,15 @@ limitations under the License.

#include "tensorflow/core/framework/op_kernel.h"

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/stream_executor.h"
#endif

#include "tensorflow/core/util/padding.h"

namespace tensorflow {

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Runs (avg/max)pooling on GPU.
// Dimension order for all array arguments is: x, y, z.
20 changes: 11 additions & 9 deletions tensorflow/core/kernels/maxpooling_op.cc
@@ -42,10 +42,12 @@ limitations under the License.

#if GOOGLE_CUDA
#include "third_party/gpus/cudnn/cudnn.h"
+#endif // GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

@@ -317,7 +319,7 @@ class MaxPoolingGradOp : public OpKernel {
TensorFormat data_format_;
};

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename T>
static void MaxPoolingBackwardCustomKernel(
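Side note on the change from #ifdef GOOGLE_CUDA to #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM just above: #ifdef only tests whether a single macro is defined, so it cannot express an "either backend" condition, whereas #if evaluates a full constant expression and treats undefined identifiers as 0. A tiny illustration with hypothetical macro names:

// Hypothetical macros, for illustration only.
#define BACKEND_A 1
// BACKEND_B is deliberately left undefined.

#if BACKEND_A || BACKEND_B  // OK: the undefined BACKEND_B evaluates to 0 here.
// ... compiled when either backend macro is defined to a nonzero value ...
#endif

// #ifdef BACKEND_A || BACKEND_B  // Ill-formed: #ifdef takes exactly one identifier.
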
@@ -438,7 +440,7 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
bool propagate_nans_;
};

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// The operation to compute gradient of MaxPool gradients.
// It takes three inputs:
@@ -647,7 +649,7 @@ class MaxPoolingGradGradOp : public OpKernel {
TensorFormat data_format_;
};

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <class T>
class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
@@ -744,7 +746,7 @@ class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
bool use_dnn_;
};

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Device, typename T>
struct LaunchMaxPoolingNoMask;
@@ -1112,7 +1114,7 @@ class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
bool include_batch_in_index_;
};

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T>
class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
public:
@@ -1383,7 +1385,7 @@ struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
}
};

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_MAX_POOL_KERNELS(D, T) \
REGISTER_KERNEL_BUILDER( \
@@ -1430,7 +1432,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS);
#undef REGISTER_CPU_KERNELS

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Forward declarations for the functor specializations for GPU.
namespace functor {
@@ -1509,7 +1511,7 @@ REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")

#undef REGISTER_GPU_ONLY_POOL_KERNELS

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef REGISTER_MAX_POOL_KERNELS
