tensorflow/core/kernels/conv_2d_gpu.h (2 changes: 1 addition & 1 deletion)

@@ -287,7 +287,7 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   // One extra line in the inner dimension to avoid shared memory bank conflicts.
   // This is to mimic the following, but no constructor of T can be invoked.
   // __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA  // || TENSORFLOW_COMPILER_IS_HIP_CLANG
   __shared__ __align__(
       alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
   typedef T(*SharedMemoryTile)[TileSizeJ + 1];
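This hunk narrows the raw-storage path to CUDA only. The trick it guards exists because a `__shared__` variable may not require a constructor to run, so the kernel allocates suitably aligned raw bytes and reinterprets them as the tile. A minimal standalone sketch of the same pattern, not the PR's kernel; the names `DemoKernel`, `TileRows`, and `TileCols` are illustrative:

    // Sketch of the constructor-free shared-memory tile used above.
    template <typename T, int TileRows, int TileCols>
    __global__ void DemoKernel(T* out) {
      // Raw bytes with T's alignment; no T constructor is invoked, which is
      // what "__shared__ T tile[TileRows][TileCols + 1];" would require.
      __shared__ __align__(alignof(T))
          char raw[TileRows * (TileCols + 1) * sizeof(T)];
      // View the bytes as a 2-D tile with one padding column per row.
      typedef T (*Tile)[TileCols + 1];
      Tile tile = reinterpret_cast<Tile>(raw);
      tile[threadIdx.y][threadIdx.x] = T(threadIdx.x);  // per-thread init
      __syncthreads();
      out[threadIdx.y * TileCols + threadIdx.x] = tile[threadIdx.y][threadIdx.x];
    }

Launched, for example, as `DemoKernel<float, 32, 32><<<1, dim3(32, 32)>>>(d_out);`.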
tensorflow/core/kernels/random_op_gpu.h (8 changes: 4 additions & 4 deletions)

@@ -223,10 +223,10 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
     typename Distribution::ResultElementType* data, int64 size,
     Distribution dist) {
   const int32 block_size = d.maxGpuThreadsPerBlock();
-  const int32 num_blocks =
-      (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
-      block_size;
-
+  const int32 num_blocks = max(1,
+      min(d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
+          int(size + block_size - 1)) /
+          block_size);
   TF_CHECK_OK(GpuLaunchKernel(FillPhiloxRandomKernelLaunch<Distribution>,
                               num_blocks, block_size, 0, d.stream(), gen, data,
                               size, dist));
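The new expression caps the grid at the number of blocks actually needed to cover `size` elements (a ceiling division) while keeping the old occupancy-based bound, and the outer `max(1, ...)` guarantees at least one block even when `size` is 0. A host-side illustration with made-up device limits (the constants below are illustrative stand-ins, not values queried from a real device):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t block_size = 1024;      // stand-in for d.maxGpuThreadsPerBlock()
      const int64_t sm_count = 80;          // stand-in for d.getNumGpuMultiProcessors()
      const int64_t threads_per_sm = 2048;  // stand-in for d.maxGpuThreadsPerMultiProcessor()
      const int64_t size = 1000;            // elements to fill

      // Old formula: a full-occupancy grid, regardless of how small size is.
      const int64_t old_blocks = (sm_count * threads_per_sm) / block_size;  // 160

      // New formula: min(occupancy bound, ceil(size / block_size)), at least 1.
      const int64_t new_blocks =
          std::max<int64_t>(1, std::min<int64_t>(sm_count * threads_per_sm,
                                                 size + block_size - 1) /
                                   block_size);  // 1

      std::printf("old = %lld, new = %lld\n", (long long)old_blocks,
                  (long long)new_blocks);
    }

For a 1000-element fill, the old code would launch 160 blocks of 1024 threads; the new code launches a single block, and large fills still get the full occupancy-bound grid.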
tensorflow/core/kernels/reduction_gpu_kernels.cu.h (4 changes: 2 additions & 2 deletions)

@@ -276,7 +276,7 @@ __global__ __launch_bounds__(1024) void ColumnReduceMax16ColumnsKernel(
   // This is to mimic the following, but without any constructors:
   // __shared__ storage_type<value_type> partial_sums[TF_RED_WARPSIZE *
   // (TF_RED_WARPSIZE+1)];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
   __shared__ __align__(alignof(value_type)) char
       partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) *
                        sizeof(value_type)];

@@ -337,7 +337,7 @@ __global__ __launch_bounds__(1024) void ColumnReduceKernel(
   // This is to mimic the following, but without constructors:
   // __shared__ storage_type<value_type> partial_sums[TF_RED_WARPSIZE *
   // (TF_RED_WARPSIZE + 1)];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
   __shared__ __align__(alignof(value_type)) char
       partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) *
                        sizeof(value_type)];
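Both hunks keep the padded `TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1)` layout: the extra column makes the row stride odd, so a warp walking down a column touches distinct shared-memory banks. A contrived sketch of that effect (`WARP` and `TransposeReadDemo` are hypothetical stand-ins, not the PR's reduction kernels):

    #define WARP 32  // stand-in for TF_RED_WARPSIZE on CUDA hardware

    __global__ void TransposeReadDemo(float* out) {
      __shared__ float tile[WARP][WARP + 1];  // "+ 1" pads row stride to 33 floats
      const int x = threadIdx.x, y = threadIdx.y;
      tile[y][x] = float(y * WARP + x);  // row-major writes: conflict-free
      __syncthreads();
      // Transposed reads: the lanes of one warp (fixed y, x = 0..31) access
      // tile[x][y], i.e. addresses 33 floats apart. 33 is odd, so the lanes
      // map to 32 distinct banks; with an unpadded stride of 32 they would
      // all hit the same bank, serializing into a 32-way conflict.
      out[y * WARP + x] = tile[x][y];
    }

    // Launch, e.g., as: TransposeReadDemo<<<1, dim3(WARP, WARP)>>>(d_out);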
tensorflow/core/kernels/training_ops_gpu.cu.cc (4 changes: 2 additions & 2 deletions)

@@ -175,11 +175,11 @@ __device__ std::complex<T> impl_rsqrt(std::complex<T> x) {
   // due to subtraction of two close values. We have to get fancy
   root[0] = sqrt(r * ((std::is_same<T, float>::value && re * r < -0.98)
                           ? rsqrt_helper(im * im * r * r)
-                          : 1 + re * r)) *
+                          : max(T(0.0), 1 + re * r))) *
             root2;
   root[1] = sqrt(r * ((std::is_same<T, float>::value && re * r > 0.98)
                           ? rsqrt_helper(im * im * r * r)
-                          : 1 - re * r)) *
+                          : max(T(0.0), 1 - re * r))) *
             root2 * (im >= 0 ? -1. : 1.);
   return *(reinterpret_cast<std::complex<T>*>(&root));
 }
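The clamp guards against the cancellation the surrounding comment warns about: when the computed `re * r` rounds just below -1 (or just above +1), `1 + re * r` (respectively `1 - re * r`) comes out as a tiny negative number and `sqrt` returns NaN. A host-side demonstration of that failure mode; the value of `re_times_r` is contrived to force the rounding and is not from the PR:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
      // Pretend re * r rounded one ulp past -1.
      const double re_times_r = std::nextafter(-1.0, -2.0);
      const double unclamped = 1.0 + re_times_r;               // ~ -2.2e-16
      const double clamped = std::max(0.0, 1.0 + re_times_r);  // exactly 0.0
      std::printf("sqrt(unclamped) = %g\n", std::sqrt(unclamped));  // nan
      std::printf("sqrt(clamped)   = %g\n", std::sqrt(clamped));    // 0
    }

Clamping to zero is safe here because the exact mathematical value of `1 + re * r` is non-negative; only rounding error can push it below zero.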