From bff8831efb6a82edd7a986e94b515151b7e12828 Mon Sep 17 00:00:00 2001
From: zhuoryin
Date: Thu, 9 May 2019 15:54:45 +0000
Subject: [PATCH] Merging latest update from develop-upstream branch

---
 tensorflow/core/util/gpu_device_functions.h |  2 ++
 tensorflow/core/util/gpu_kernel_helper.h    |  8 ++++-
 tensorflow/core/util/gpu_launch_config.h    | 40 +++++++++++----------
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/tensorflow/core/util/gpu_device_functions.h b/tensorflow/core/util/gpu_device_functions.h
index a540e872a50f79..cc80680de7c282 100644
--- a/tensorflow/core/util/gpu_device_functions.h
+++ b/tensorflow/core/util/gpu_device_functions.h
@@ -355,6 +355,7 @@ __device__ inline double GpuShuffleUpSync(unsigned mask, double value,
                               static_cast<uint64_t>(lo));
 #endif
 }
+CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpSync, CudaShuffleUpSync);
 
 // Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
 // in convergence, see comment above for details.
@@ -394,6 +395,7 @@ __device__ inline double GpuShuffleDownSync(unsigned mask, double value,
                               static_cast<uint64_t>(lo));
 #endif
 }
+CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleDownSync, CudaShuffleDownSync);
 
 // Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
 // convergence, see comment above for details.
diff --git a/tensorflow/core/util/gpu_kernel_helper.h b/tensorflow/core/util/gpu_kernel_helper.h
index 85409f90fc851c..acf8226772ea89 100644
--- a/tensorflow/core/util/gpu_kernel_helper.h
+++ b/tensorflow/core/util/gpu_kernel_helper.h
@@ -21,6 +21,7 @@ limitations under the License.
 #if GOOGLE_CUDA
 #include "cuda/include/cuda_fp16.h"
 #endif
+#include "tensorflow/core/util/gpu_cuda_alias.h"
 #include "tensorflow/core/util/gpu_device_functions.h"
 #include "tensorflow/core/util/gpu_launch_config.h"
 
@@ -46,6 +47,7 @@ limitations under the License.
 #define gpuSuccess cudaSuccess
 using gpuStream_t = cudaStream_t;
 using gpuError_t = cudaError_t;
+
 #elif TENSORFLOW_USE_ROCM
 #define gpuSuccess hipSuccess
 using gpuStream_t = hipStream_t;
@@ -59,7 +61,7 @@ __host__ __device__ inline const char* gpuGetErrorString(cudaError_t error){
 #elif TENSORFLOW_USE_ROCM
 // hipGetErrorString is available on host side only
 inline const char* gpuGetErrorString(hipError_t error){
-    return hipGetErrorString(error);
+  return hipGetErrorString(error);
 #endif
 }
 
@@ -142,24 +144,28 @@ __device__ inline Eigen::half GpuShuffleSync(unsigned mask, Eigen::half value,
   return Eigen::half(
       GpuShuffleSync(mask, static_cast<uint16>(value), src_lane, width));
 }
+// Aliased in gpu_device_functions.h
 
 __device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleUpSync(
     unsigned mask, Eigen::half value, int delta, int width = warpSize) {
   return Eigen::half(
       GpuShuffleUpSync(mask, static_cast<uint16>(value), delta, width));
 }
+// Aliased in gpu_device_functions.h
 
 __device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleDownSync(
     unsigned mask, Eigen::half value, int delta, int width = warpSize) {
   return Eigen::half(
       GpuShuffleDownSync(mask, static_cast<uint16>(value), delta, width));
 }
+// Aliased in gpu_device_functions.h
 
 __device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleXorSync(
     unsigned mask, Eigen::half value, int lane_mask, int width = warpSize) {
   return Eigen::half(
       GpuShuffleXorSync(mask, static_cast<uint16>(value), lane_mask, width));
 }
+// Aliased in gpu_device_functions.h
 #endif
 
 namespace gpu_helper {
diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h
index 829cc7a23eb424..15001502a07890 100644
--- a/tensorflow/core/util/gpu_launch_config.h
+++ b/tensorflow/core/util/gpu_launch_config.h
@@ -166,22 +166,24 @@ inline GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
 #elif TENSORFLOW_USE_ROCM
   // ROCM TODO re-enable this after hipOccupancyMaxPotentialBlockSize is
   // implemented
-  //hipError_t err = hipOccupancyMaxPotentialBlockSize(
+  // hipError_t err = hipOccupancyMaxPotentialBlockSize(
   //    &block_count, &thread_per_block, func, dynamic_shared_memory_size,
   //    block_size_limit);
-  //CHECK_EQ(err, hipSuccess);
+  // CHECK_EQ(err, hipSuccess);
 
+  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
+  // that assumes the kernel is quite simple and is largely memory-limited.
   const int physical_thread_count = std::min(
       d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
       work_element_count);
+  // Assume the kernel is simple enough that it is okay to use 1024 threads
+  // per workgroup.
   thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count =
-      std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumGpuMultiProcessors());
+  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
+                         d.getNumGpuMultiProcessors());
 #endif
 
-  block_count =
-      std::min(block_count, DivUp(work_element_count, thread_per_block));
+  block_count = std::min(block_count, DivUp(work_element_count, thread_per_block));
 
   config.virtual_thread_count = work_element_count;
   config.thread_per_block = thread_per_block;
@@ -209,20 +211,23 @@ inline GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
   block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
                          DivUp(work_element_count, fixed_block_size));
 #elif TENSORFLOW_USE_ROCM
-  // ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor is
-  // implemented
-  //hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
+  // ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor
+  // is implemented
+  // hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
   //    &block_count, &thread_per_block, func, dynamic_shared_memory_size,
   //    block_size_limit);
-  //CHECK_EQ(err, hipSuccess);
+  // CHECK_EQ(err, hipSuccess);
 
+  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
+  // that assumes the kernel is quite simple and is largely memory-limited.
   const int physical_thread_count = std::min(
       d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
       work_element_count);
+  // Assume the kernel is simple enough that it is okay to use 1024 threads
+  // per workgroup.
   int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count =
-      std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumGpuMultiProcessors());
+  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
+                         d.getNumGpuMultiProcessors());
 #endif
 
   config.virtual_thread_count = work_element_count;
@@ -309,7 +314,7 @@ inline Gpu3DLaunchConfig GetGpu3DLaunchConfig(
       block_size_limit);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-  // ROCM FIXME re-enable this after hipOccupancyMaxPotentialBlockSize is
+  // ROCM TODO re-enable this after hipOccupancyMaxPotentialBlockSize is
   // implemented
   // hipError_t err = hipOccupancyMaxPotentialBlockSize(
   //    &block_count, &thread_per_block, func, dynamic_shared_memory_size,
@@ -319,9 +324,8 @@ inline Gpu3DLaunchConfig GetGpu3DLaunchConfig(
   const int physical_thread_count =
       d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor();
   thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count =
-      std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumGpuMultiProcessors());
+  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
+                         d.getNumGpuMultiProcessors());
 #endif
 
   int threadsx = std::min({xdim, thread_per_block, xthreadlimit});
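
Note on the Cuda* aliases: the two CREATE_CUDA_DEVICE_FUNCTION_ALIAS lines
added to gpu_device_functions.h keep the legacy CudaShuffleUpSync /
CudaShuffleDownSync names compiling after the Gpu* rename. The macro itself
is defined in tensorflow/core/util/gpu_cuda_alias.h; the sketch below is a
guess at the perfect-forwarding wrapper such a macro plausibly expands to,
not its verbatim definition.

// Hypothetical expansion of
// CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpSync, CudaShuffleUpSync):
// a forwarding device function so old CudaShuffleUpSync(...) call sites
// keep working. The real macro may differ in detail.
#include <utility>

template <typename... Args>
__device__ auto CudaShuffleUpSync(Args&&... args)
    -> decltype(GpuShuffleUpSync(std::forward<Args>(args)...)) {
  return GpuShuffleUpSync(std::forward<Args>(args)...);
}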
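
The context lines ending in static_cast<uint64_t>(lo) are the tail of the
ROCm branch of the double shuffle overloads: warp shuffles move 32-bit
values, so a double is bit-cast to 64 bits, split into high and low words,
each word is shuffled separately, and the two halves are reassembled. A
minimal sketch of that technique, assuming HIP's 32-bit __shfl_up and the
standard __double_as_longlong / __longlong_as_double intrinsics; names and
exact casts in the real header may differ.

__device__ inline double ShuffleUpDoubleSketch(double value, unsigned delta,
                                               int width) {
  auto tmp = __double_as_longlong(value);      // reinterpret the 64 bits
  auto lo = static_cast<unsigned>(tmp);        // low 32-bit word
  auto hi = static_cast<unsigned>(tmp >> 32);  // high 32-bit word
  hi = __shfl_up(static_cast<int>(hi), delta, width);  // shuffle each half
  lo = __shfl_up(static_cast<int>(lo), delta, width);
  return __longlong_as_double(static_cast<unsigned long long>(hi) << 32 |
                              static_cast<unsigned long long>(lo));
}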
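
The ROCm fallback in GetGpuLaunchConfig and GetGpuLaunchConfigFixedBlockSize
replaces the not-yet-implemented occupancy query with plain arithmetic: cap
the planned thread count at the work size, pick the widest workgroup the
device allows (up to 1024), launch at most one block per multiprocessor,
and never launch more blocks than there is work to fill them. Below is a
self-contained host-side sketch of that arithmetic; the device numbers are
made up for illustration (the real ones come from Eigen::GpuDevice).

#include <algorithm>
#include <cstdio>

// Same rounding-up division the header's DivUp performs.
int DivUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int multiprocessors = 60;               // hypothetical device
  const int threads_per_multiprocessor = 2560;  // hypothetical device
  const int max_threads_per_block = 1024;       // hypothetical device
  const int work_element_count = 1 << 20;       // 1M elements

  // Never plan for more threads than there are elements.
  const int physical_thread_count =
      std::min(multiprocessors * threads_per_multiprocessor,
               work_element_count);
  // Memory-bound heuristic: use the widest workgroup allowed, up to 1024.
  const int thread_per_block = std::min(1024, max_threads_per_block);
  // At most one block per multiprocessor...
  int block_count = std::min(DivUp(physical_thread_count, thread_per_block),
                             multiprocessors);
  // ...and never more blocks than the work can occupy.
  block_count =
      std::min(block_count, DivUp(work_element_count, thread_per_block));

  // For these numbers: blocks=60, threads_per_block=1024.
  std::printf("blocks=%d threads_per_block=%d\n", block_count,
              thread_per_block);
}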