From bff8831efb6a82edd7a986e94b515151b7e12828 Mon Sep 17 00:00:00 2001
From: zhuoryin
Date: Thu, 9 May 2019 15:54:45 +0000
Subject: [PATCH] Merging latest update from develop-upstream branch

---
 tensorflow/core/util/gpu_device_functions.h |  2 ++
 tensorflow/core/util/gpu_kernel_helper.h    |  8 ++++-
 tensorflow/core/util/gpu_launch_config.h    | 40 +++++++++++----------
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/tensorflow/core/util/gpu_device_functions.h b/tensorflow/core/util/gpu_device_functions.h
index a540e872a50f79..cc80680de7c282 100644
--- a/tensorflow/core/util/gpu_device_functions.h
+++ b/tensorflow/core/util/gpu_device_functions.h
@@ -355,6 +355,7 @@ __device__ inline double GpuShuffleUpSync(unsigned mask, double value,
                               static_cast<uint64_t>(lo));
 #endif
 }
+CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpSync, CudaShuffleUpSync);
 
 // Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
 // in convergence, see comment above for details.
@@ -394,6 +395,7 @@ __device__ inline double GpuShuffleDownSync(unsigned mask, double value,
                               static_cast<uint64_t>(lo));
 #endif
 }
+CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleDownSync, CudaShuffleDownSync);
 
 // Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
 // convergence, see comment above for details.
diff --git a/tensorflow/core/util/gpu_kernel_helper.h b/tensorflow/core/util/gpu_kernel_helper.h
index 85409f90fc851c..acf8226772ea89 100644
--- a/tensorflow/core/util/gpu_kernel_helper.h
+++ b/tensorflow/core/util/gpu_kernel_helper.h
@@ -21,6 +21,7 @@ limitations under the License.
 #if GOOGLE_CUDA
 #include "cuda/include/cuda_fp16.h"
 #endif
+#include "tensorflow/core/util/gpu_cuda_alias.h"
 #include "tensorflow/core/util/gpu_device_functions.h"
 #include "tensorflow/core/util/gpu_launch_config.h"
 
@@ -46,6 +47,7 @@ limitations under the License.
 #define gpuSuccess cudaSuccess
 using gpuStream_t = cudaStream_t;
 using gpuError_t = cudaError_t;
+
 #elif TENSORFLOW_USE_ROCM
 #define gpuSuccess hipSuccess
 using gpuStream_t = hipStream_t;
@@ -59,7 +61,7 @@ __host__ __device__ inline const char* gpuGetErrorString(cudaError_t error){
 #elif TENSORFLOW_USE_ROCM
 // hipGetErrorString is available on host side only
 inline const char* gpuGetErrorString(hipError_t error){
-    return hipGetErrorString(error);
+  return hipGetErrorString(error);
 #endif
 }
 
@@ -142,24 +144,28 @@ __device__ inline Eigen::half GpuShuffleSync(unsigned mask, Eigen::half value,
   return Eigen::half(
       GpuShuffleSync(mask, static_cast<uint16>(value), src_lane, width));
 }
+// Aliased in gpu_device_functions.h
 
 __device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleUpSync(
     unsigned mask, Eigen::half value, int delta, int width = warpSize) {
   return Eigen::half(
       GpuShuffleUpSync(mask, static_cast<uint16>(value), delta, width));
 }
+// Aliased in gpu_device_functions.h
 
 __device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleDownSync(
     unsigned mask, Eigen::half value, int delta, int width = warpSize) {
   return Eigen::half(
       GpuShuffleDownSync(mask, static_cast<uint16>(value), delta, width));
 }
+// Aliased in gpu_device_functions.h
 
 __device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleXorSync(
     unsigned mask, Eigen::half value, int lane_mask, int width = warpSize) {
   return Eigen::half(
       GpuShuffleXorSync(mask, static_cast<uint16>(value), lane_mask, width));
 }
+// Aliased in gpu_device_functions.h
 #endif
 
 namespace gpu_helper {
diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h
index 829cc7a23eb424..15001502a07890 100644
--- a/tensorflow/core/util/gpu_launch_config.h
+++ b/tensorflow/core/util/gpu_launch_config.h
@@ -166,22 +166,24 @@ inline GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
 #elif TENSORFLOW_USE_ROCM
   // ROCM TODO re-enable this after hipOccupancyMaxPotentialBlockSize is
   // implemented
-  //hipError_t err = hipOccupancyMaxPotentialBlockSize(
+  // hipError_t err = hipOccupancyMaxPotentialBlockSize(
   //    &block_count, &thread_per_block, func, dynamic_shared_memory_size,
   //    block_size_limit);
-  //CHECK_EQ(err, hipSuccess);
+  // CHECK_EQ(err, hipSuccess);
 
+  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
+  // that assumes the kernel is quite simple and is largely memory-limited.
   const int physical_thread_count = std::min(
       d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
       work_element_count);
+  // Assume the kernel is simple enough that it is okay to use 1024 threads
+  // per workgroup.
   thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count =
-      std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumGpuMultiProcessors());
+  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
+                         d.getNumGpuMultiProcessors());
 #endif
 
-  block_count =
-      std::min(block_count, DivUp(work_element_count, thread_per_block));
+  block_count = std::min(block_count, DivUp(work_element_count, thread_per_block));
 
   config.virtual_thread_count = work_element_count;
   config.thread_per_block = thread_per_block;
@@ -209,20 +211,23 @@ inline GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
   block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
                          DivUp(work_element_count, fixed_block_size));
 #elif TENSORFLOW_USE_ROCM
-  // ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor is
-  // implemented
-  //hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
+  // ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor
+  // is implemented
+  // hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
   //    &block_count, &thread_per_block, func, dynamic_shared_memory_size,
   //    block_size_limit);
-  //CHECK_EQ(err, hipSuccess);
+  // CHECK_EQ(err, hipSuccess);
 
+  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
+  // that assumes the kernel is quite simple and is largely memory-limited.
   const int physical_thread_count = std::min(
       d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
       work_element_count);
+  // Assume the kernel is simple enough that it is okay to use 1024 threads
+  // per workgroup.
   int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count =
-      std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumGpuMultiProcessors());
+  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
+                         d.getNumGpuMultiProcessors());
 #endif
 
   config.virtual_thread_count = work_element_count;
@@ -309,7 +314,7 @@ inline Gpu3DLaunchConfig GetGpu3DLaunchConfig(
       block_size_limit);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-  // ROCM FIXME re-enable this after hipOccupancyMaxPotentialBlockSize is
+  // ROCM TODO re-enable this after hipOccupancyMaxPotentialBlockSize is
   // implemented
   // hipError_t err = hipOccupancyMaxPotentialBlockSize(
   //    &block_count, &thread_per_block, func, dynamic_shared_memory_size,
@@ -319,9 +324,8 @@ inline Gpu3DLaunchConfig GetGpu3DLaunchConfig(
   const int physical_thread_count =
       d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor();
   thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count =
-      std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumGpuMultiProcessors());
+  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
+                         d.getNumGpuMultiProcessors());
 #endif
 
   int threadsx = std::min({xdim, thread_per_block, xthreadlimit});
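
Note on the Cuda* aliases: the two CREATE_CUDA_DEVICE_FUNCTION_ALIAS lines
added to gpu_device_functions.h keep the legacy CudaShuffleUpSync /
CudaShuffleDownSync names compiling after the Gpu* rename. The macro itself
is defined in tensorflow/core/util/gpu_cuda_alias.h; the sketch below is a
guess at the perfect-forwarding wrapper such a macro plausibly expands to,
not its verbatim definition.

// Hypothetical expansion of
// CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpSync, CudaShuffleUpSync):
// a forwarding device function so old CudaShuffleUpSync(...) call sites
// keep working. The real macro may differ in detail.
#include <utility>

template <typename... Args>
__device__ auto CudaShuffleUpSync(Args&&... args)
    -> decltype(GpuShuffleUpSync(std::forward<Args>(args)...)) {
  return GpuShuffleUpSync(std::forward<Args>(args)...);
}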
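
The context lines ending in static_cast<uint64_t>(lo) are the tail of the
ROCm branch of the double shuffle overloads: warp shuffles move 32-bit
values, so a double is bit-cast to 64 bits, split into high and low words,
each word is shuffled separately, and the two halves are reassembled. A
minimal sketch of that technique, assuming HIP's 32-bit __shfl_up and the
standard __double_as_longlong / __longlong_as_double intrinsics; names and
exact casts in the real header may differ.

__device__ inline double ShuffleUpDoubleSketch(double value, unsigned delta,
                                               int width) {
  auto tmp = __double_as_longlong(value);      // reinterpret the 64 bits
  auto lo = static_cast<unsigned>(tmp);        // low 32-bit word
  auto hi = static_cast<unsigned>(tmp >> 32);  // high 32-bit word
  hi = __shfl_up(static_cast<int>(hi), delta, width);  // shuffle each half
  lo = __shfl_up(static_cast<int>(lo), delta, width);
  return __longlong_as_double(static_cast<unsigned long long>(hi) << 32 |
                              static_cast<unsigned long long>(lo));
}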
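
The ROCm fallback in GetGpuLaunchConfig and GetGpuLaunchConfigFixedBlockSize
replaces the not-yet-implemented occupancy query with plain arithmetic: cap
the planned thread count at the work size, pick the widest workgroup the
device allows (up to 1024), launch at most one block per multiprocessor,
and never launch more blocks than there is work to fill them. Below is a
self-contained host-side sketch of that arithmetic; the device numbers are
made up for illustration (the real ones come from Eigen::GpuDevice).

#include <algorithm>
#include <cstdio>

// Same rounding-up division the header's DivUp performs.
int DivUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int multiprocessors = 60;               // hypothetical device
  const int threads_per_multiprocessor = 2560;  // hypothetical device
  const int max_threads_per_block = 1024;       // hypothetical device
  const int work_element_count = 1 << 20;       // 1M elements

  // Never plan for more threads than there are elements.
  const int physical_thread_count =
      std::min(multiprocessors * threads_per_multiprocessor,
               work_element_count);
  // Memory-bound heuristic: use the widest workgroup allowed, up to 1024.
  const int thread_per_block = std::min(1024, max_threads_per_block);
  // At most one block per multiprocessor...
  int block_count = std::min(DivUp(physical_thread_count, thread_per_block),
                             multiprocessors);
  // ...and never more blocks than the work can occupy.
  block_count =
      std::min(block_count, DivUp(work_element_count, thread_per_block));

  // For these numbers: blocks=60, threads_per_block=1024.
  std::printf("blocks=%d threads_per_block=%d\n", block_count,
              thread_per_block);
}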