Skip to content

Commit

Permalink
Merging latest update from develop-upstream branch
Browse files Browse the repository at this point in the history
  • Loading branch information
jerryyin committed May 9, 2019
1 parent e4ab2af commit bff8831
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 19 deletions.
2 changes: 2 additions & 0 deletions tensorflow/core/util/gpu_device_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ __device__ inline double GpuShuffleUpSync(unsigned mask, double value,
static_cast<uint64_t>(lo));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpSync, CudaShuffleUpSync);

// Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
// in convergence, see comment above for details.
Expand Down Expand Up @@ -394,6 +395,7 @@ __device__ inline double GpuShuffleDownSync(unsigned mask, double value,
static_cast<uint64_t>(lo));
#endif
}
CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleDownSync, CudaShuffleDownSync);

// Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
Expand Down
8 changes: 7 additions & 1 deletion tensorflow/core/util/gpu_kernel_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ limitations under the License.
#if GOOGLE_CUDA
#include "cuda/include/cuda_fp16.h"
#endif
#include "tensorflow/core/util/gpu_cuda_alias.h"
#include "tensorflow/core/util/gpu_device_functions.h"
#include "tensorflow/core/util/gpu_launch_config.h"

Expand All @@ -46,6 +47,7 @@ limitations under the License.
#define gpuSuccess cudaSuccess
using gpuStream_t = cudaStream_t;
using gpuError_t = cudaError_t;

#elif TENSORFLOW_USE_ROCM
#define gpuSuccess hipSuccess
using gpuStream_t = hipStream_t;
Expand All @@ -59,7 +61,7 @@ __host__ __device__ inline const char* gpuGetErrorString(cudaError_t error){
#elif TENSORFLOW_USE_ROCM
// hipGetErrorString is available on host side only
inline const char* gpuGetErrorString(hipError_t error){
return hipGetErrorString(error);
return hipGetErrorString(error);
#endif
}

Expand Down Expand Up @@ -142,24 +144,28 @@ __device__ inline Eigen::half GpuShuffleSync(unsigned mask, Eigen::half value,
return Eigen::half(
GpuShuffleSync(mask, static_cast<uint16>(value), src_lane, width));
}
// Aliased in gpu_device_functions.h

// Eigen::half overload of GpuShuffleUpSync: routes the half through the
// uint16 overload and wraps the shuffled result back into an Eigen::half.
// All threads named in 'mask' must call this in convergence.
__device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleUpSync(
    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
  const uint16 in_bits = static_cast<uint16>(value);
  const uint16 out_bits = GpuShuffleUpSync(mask, in_bits, delta, width);
  return Eigen::half(out_bits);
}
// Aliased in gpu_device_functions.h

// Eigen::half overload of GpuShuffleDownSync: forwards to the uint16
// overload and reconstructs an Eigen::half from the shuffled value.
// All threads named in 'mask' must call this in convergence.
__device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleDownSync(
    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
  const uint16 in_bits = static_cast<uint16>(value);
  const uint16 out_bits = GpuShuffleDownSync(mask, in_bits, delta, width);
  return Eigen::half(out_bits);
}
// Aliased in gpu_device_functions.h

// Eigen::half overload of GpuShuffleXorSync: delegates to the uint16
// overload and wraps the exchanged value back into an Eigen::half.
// All threads named in 'mask' must call this in convergence.
__device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleXorSync(
    unsigned mask, Eigen::half value, int lane_mask, int width = warpSize) {
  const uint16 in_bits = static_cast<uint16>(value);
  const uint16 out_bits = GpuShuffleXorSync(mask, in_bits, lane_mask, width);
  return Eigen::half(out_bits);
}
// Aliased in gpu_device_functions.h
#endif

namespace gpu_helper {
Expand Down
40 changes: 22 additions & 18 deletions tensorflow/core/util/gpu_launch_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,22 +166,24 @@ inline GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
#elif TENSORFLOW_USE_ROCM
// ROCM TODO re-enable this after hipOccupancyMaxPotentialBlockSize is
// implemented
//hipError_t err = hipOccupancyMaxPotentialBlockSize(
// hipError_t err = hipOccupancyMaxPotentialBlockSize(
// &block_count, &thread_per_block, func, dynamic_shared_memory_size,
// block_size_limit);
//CHECK_EQ(err, hipSuccess);
// CHECK_EQ(err, hipSuccess);

// Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
// that the kernel is quite simple and will largely be memory-limited.
const int physical_thread_count = std::min(
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
work_element_count);
// Assume the kernel is simple enough that it is okay to use 1024 threads
// per workgroup.
thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
block_count =
std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumGpuMultiProcessors());
block_count = std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumGpuMultiProcessors());
#endif

block_count =
std::min(block_count, DivUp(work_element_count, thread_per_block));
block_count = std::min(block_count, DivUp(work_element_count, thread_per_block));

config.virtual_thread_count = work_element_count;
config.thread_per_block = thread_per_block;
Expand Down Expand Up @@ -209,20 +211,23 @@ inline GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
DivUp(work_element_count, fixed_block_size));
#elif TENSORFLOW_USE_ROCM
// ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor is
// implemented
//hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
// ROCM TODO re-enable this after hipOccupancyMaxActiveBlocksPerMultiprocessor
// is implemented
// hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
// &block_count, &thread_per_block, func, dynamic_shared_memory_size,
// block_size_limit);
//CHECK_EQ(err, hipSuccess);
// CHECK_EQ(err, hipSuccess);

// Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
// that the kernel is quite simple and will largely be memory-limited.
const int physical_thread_count = std::min(
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
work_element_count);
// Assume the kernel is simple enough that it is okay to use 1024 threads
// per workgroup.
int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
block_count =
std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumGpuMultiProcessors());
block_count = std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumGpuMultiProcessors());
#endif

config.virtual_thread_count = work_element_count;
Expand Down Expand Up @@ -309,7 +314,7 @@ inline Gpu3DLaunchConfig GetGpu3DLaunchConfig(
block_size_limit);
CHECK_EQ(err, cudaSuccess);
#elif TENSORFLOW_USE_ROCM
// ROCM FIXME re-enable this after hipOccupancyMaxPotentialBlockSize is
// ROCM TODO re-enable this after hipOccupancyMaxPotentialBlockSize is
// implemented
// hipError_t err = hipOccupancyMaxPotentialBlockSize(
// &block_count, &thread_per_block, func, dynamic_shared_memory_size,
Expand All @@ -319,9 +324,8 @@ inline Gpu3DLaunchConfig GetGpu3DLaunchConfig(
const int physical_thread_count =
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor();
thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
block_count =
std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumGpuMultiProcessors());
block_count = std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumGpuMultiProcessors());
#endif

int threadsx = std::min({xdim, thread_per_block, xthreadlimit});
Expand Down

0 comments on commit bff8831

Please sign in to comment.