tensorflow/core/kernels/conv_2d_gpu.h (2 changes: 1 addition & 1 deletion)

@@ -287,7 +287,7 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   // One extra line in the inner dimension to avoid shared memory bank conflicts.
   // This is to mimic the following, but no constructor of T can be invoked.
   // __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA  // || TENSORFLOW_COMPILER_IS_HIP_CLANG
   __shared__ __align__(
       alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
   typedef T(*SharedMemoryTile)[TileSizeJ + 1];
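This hunk narrows the raw-storage path to CUDA only. The trick it guards exists because a `__shared__` variable may not require a constructor to run, so the kernel allocates suitably aligned raw bytes and reinterprets them as the tile. A minimal standalone sketch of the same pattern, not the PR's kernel; the names `DemoKernel`, `TileRows`, and `TileCols` are illustrative:

    // Sketch of the constructor-free shared-memory tile used above.
    template <typename T, int TileRows, int TileCols>
    __global__ void DemoKernel(T* out) {
      // Raw bytes with T's alignment; no T constructor is invoked, which is
      // what "__shared__ T tile[TileRows][TileCols + 1];" would require.
      __shared__ __align__(alignof(T))
          char raw[TileRows * (TileCols + 1) * sizeof(T)];
      // View the bytes as a 2-D tile with one padding column per row.
      typedef T (*Tile)[TileCols + 1];
      Tile tile = reinterpret_cast<Tile>(raw);
      tile[threadIdx.y][threadIdx.x] = T(threadIdx.x);  // per-thread init
      __syncthreads();
      out[threadIdx.y * TileCols + threadIdx.x] = tile[threadIdx.y][threadIdx.x];
    }

Launched, for example, as `DemoKernel<float, 32, 32><<<1, dim3(32, 32)>>>(d_out);`.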
tensorflow/core/kernels/random_op_gpu.h (8 changes: 4 additions & 4 deletions)

@@ -223,10 +223,10 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
     typename Distribution::ResultElementType* data, int64 size,
     Distribution dist) {
   const int32 block_size = d.maxGpuThreadsPerBlock();
-  const int32 num_blocks =
-      (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
-      block_size;
-
+  const int32 num_blocks = max(1,
+      min(d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
+          int(size + block_size - 1)) /
+          block_size);
   TF_CHECK_OK(GpuLaunchKernel(FillPhiloxRandomKernelLaunch<Distribution>,
                               num_blocks, block_size, 0, d.stream(), gen, data,
                               size, dist));
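The new expression caps the grid at the number of blocks actually needed to cover `size` elements (a ceiling division) while keeping the old occupancy-based bound, and the outer `max(1, ...)` guarantees at least one block even when `size` is 0. A host-side illustration with made-up device limits (the constants below are illustrative stand-ins, not values queried from a real device):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t block_size = 1024;      // stand-in for d.maxGpuThreadsPerBlock()
      const int64_t sm_count = 80;          // stand-in for d.getNumGpuMultiProcessors()
      const int64_t threads_per_sm = 2048;  // stand-in for d.maxGpuThreadsPerMultiProcessor()
      const int64_t size = 1000;            // elements to fill

      // Old formula: a full-occupancy grid, regardless of how small size is.
      const int64_t old_blocks = (sm_count * threads_per_sm) / block_size;  // 160

      // New formula: min(occupancy bound, ceil(size / block_size)), at least 1.
      const int64_t new_blocks =
          std::max<int64_t>(1, std::min<int64_t>(sm_count * threads_per_sm,
                                                 size + block_size - 1) /
                                   block_size);  // 1

      std::printf("old = %lld, new = %lld\n", (long long)old_blocks,
                  (long long)new_blocks);
    }

For a 1000-element fill, the old code would launch 160 blocks of 1024 threads; the new code launches a single block, and large fills still get the full occupancy-bound grid.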
tensorflow/core/kernels/reduction_gpu_kernels.cu.h (4 changes: 2 additions & 2 deletions)

@@ -276,7 +276,7 @@ __global__ __launch_bounds__(1024) void ColumnReduceMax16ColumnsKernel(
   // This is to mimic the following, but without any constructors:
   // __shared__ storage_type<value_type> partial_sums[TF_RED_WARPSIZE *
   // (TF_RED_WARPSIZE+1)];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
   __shared__ __align__(alignof(value_type)) char
       partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) *
                        sizeof(value_type)];

@@ -337,7 +337,7 @@ __global__ __launch_bounds__(1024) void ColumnReduceKernel(
   // This is to mimic the following, but without constructors:
   // __shared__ storage_type<value_type> partial_sums[TF_RED_WARPSIZE *
   // (TF_RED_WARPSIZE + 1)];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
   __shared__ __align__(alignof(value_type)) char
       partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) *
                        sizeof(value_type)];
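Both hunks keep the padded `TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1)` layout: the extra column makes the row stride odd, so a warp walking down a column touches distinct shared-memory banks. A contrived sketch of that effect (`WARP` and `TransposeReadDemo` are hypothetical stand-ins, not the PR's reduction kernels):

    #define WARP 32  // stand-in for TF_RED_WARPSIZE on CUDA hardware

    __global__ void TransposeReadDemo(float* out) {
      __shared__ float tile[WARP][WARP + 1];  // "+ 1" pads row stride to 33 floats
      const int x = threadIdx.x, y = threadIdx.y;
      tile[y][x] = float(y * WARP + x);  // row-major writes: conflict-free
      __syncthreads();
      // Transposed reads: the lanes of one warp (fixed y, x = 0..31) access
      // tile[x][y], i.e. addresses 33 floats apart. 33 is odd, so the lanes
      // map to 32 distinct banks; with an unpadded stride of 32 they would
      // all hit the same bank, serializing into a 32-way conflict.
      out[y * WARP + x] = tile[x][y];
    }

    // Launch, e.g., as: TransposeReadDemo<<<1, dim3(WARP, WARP)>>>(d_out);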
tensorflow/core/kernels/training_ops_gpu.cu.cc (4 changes: 2 additions & 2 deletions)

@@ -175,11 +175,11 @@ __device__ std::complex<T> impl_rsqrt(std::complex<T> x) {
   // due to subtraction of two close values. We have to get fancy
   root[0] = sqrt(r * ((std::is_same<T, float>::value && re * r < -0.98)
                           ? rsqrt_helper(im * im * r * r)
-                          : 1 + re * r)) *
+                          : max(T(0.0), 1 + re * r))) *
             root2;
   root[1] = sqrt(r * ((std::is_same<T, float>::value && re * r > 0.98)
                           ? rsqrt_helper(im * im * r * r)
-                          : 1 - re * r)) *
+                          : max(T(0.0), 1 - re * r))) *
             root2 * (im >= 0 ? -1. : 1.);
   return *(reinterpret_cast<std::complex<T>*>(&root));
 }
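The clamp guards against the cancellation the surrounding comment warns about: when the computed `re * r` rounds just below -1 (or just above +1), `1 + re * r` (respectively `1 - re * r`) comes out as a tiny negative number and `sqrt` returns NaN. A host-side demonstration of that failure mode; the value of `re_times_r` is contrived to force the rounding and is not from the PR:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
      // Pretend re * r rounded one ulp past -1.
      const double re_times_r = std::nextafter(-1.0, -2.0);
      const double unclamped = 1.0 + re_times_r;               // ~ -2.2e-16
      const double clamped = std::max(0.0, 1.0 + re_times_r);  // exactly 0.0
      std::printf("sqrt(unclamped) = %g\n", std::sqrt(unclamped));  // nan
      std::printf("sqrt(clamped)   = %g\n", std::sqrt(clamped));    // 0
    }

Clamping to zero is safe here because the exact mathematical value of `1 + re * r` is non-negative; only rounding error can push it below zero.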