
Commit

Merge pull request #40954 from ROCmSoftwarePlatform:google_upstream_rocm_platform_fix_200630

PiperOrigin-RevId: 323063198
Change-Id: Iea7f29bb0d476428e423950e129fd61e057dac92
tensorflower-gardener committed Jul 24, 2020
2 parents 4e5311d + ddafc33 commit cbc87da
Showing 3 changed files with 40 additions and 7 deletions.
28 changes: 28 additions & 0 deletions tensorflow/core/util/gpu_launch_config.h
@@ -168,10 +168,25 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
block_size_limit);
CHECK_EQ(err, cudaSuccess);
#elif TENSORFLOW_USE_ROCM
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
hipError_t err = hipOccupancyMaxPotentialBlockSize(
&block_count, &thread_per_block, func, dynamic_shared_memory_size,
block_size_limit);
CHECK_EQ(err, hipSuccess);
#else
// Earlier versions of this HIP routine incorrectly returned void.
// TODO: re-enable hipError_t error checking once HIP is fixed.
// The ROCm interface uses unsigned int; range-check, then convert.
uint32_t block_count_uint = 0;
uint32_t thread_per_block_uint = 0;
CHECK_GE(block_size_limit, 0);
uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
func, dynamic_shared_memory_size,
block_size_limit_uint);
block_count = static_cast<int>(block_count_uint);
thread_per_block = static_cast<int>(thread_per_block_uint);
#endif
#endif

block_count =
@@ -201,9 +216,22 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
&block_count, func, fixed_block_size, dynamic_shared_memory_size);
CHECK_EQ(err, cudaSuccess);
#elif TENSORFLOW_USE_ROCM
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
&block_count, func, fixed_block_size, dynamic_shared_memory_size);
CHECK_EQ(err, hipSuccess);
#else
// Apply the heuristic from GetGpuLaunchConfig(int, const Eigen::GpuDevice&):
// assume the kernel is quite simple and will largely be memory-bound.
const int physical_thread_count = std::min(
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
work_element_count);
// Assume the kernel is simple enough that it is okay to use 1024 threads
// per workgroup.
int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
block_count = std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumGpuMultiProcessors());
#endif
#endif
block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
DivUp(work_element_count, fixed_block_size));
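For context, the #else branches above are the pre-hip-clang ROCm fallbacks: the first hunk routes through uint32_t temporaries because the older hipOccupancyMaxPotentialBlockSize took unsigned int arguments, and the second hunk replaces the occupancy query with pure arithmetic. The sketch below reproduces that arithmetic in isolation as a standalone C++ program; the DivUp helper is rewritten here and the device limits are hypothetical sample numbers rather than values from the commit, so read it as a minimal illustration of the heuristic, not TensorFlow's actual implementation.

#include <algorithm>
#include <cstdio>

// Ceiling division, mirroring the DivUp helper the heuristic relies on.
int DivUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  // Hypothetical device limits; in TensorFlow these come from Eigen::GpuDevice.
  const int num_multiprocessors = 60;      // d.getNumGpuMultiProcessors()
  const int max_threads_per_mp = 2048;     // d.maxGpuThreadsPerMultiProcessor()
  const int max_threads_per_block = 1024;  // d.maxGpuThreadsPerBlock()
  const int work_element_count = 1 << 20;  // number of elements to process

  // Cap at what the device can physically run at once:
  // min(60 * 2048, 1048576) = 122880 threads.
  const int physical_thread_count =
      std::min(num_multiprocessors * max_threads_per_mp, work_element_count);

  // Assume a simple, memory-bound kernel, so 1024-thread workgroups are fine.
  const int thread_per_block = std::min(1024, max_threads_per_block);

  // At most one block per multiprocessor:
  // min(DivUp(122880, 1024), 60) = min(120, 60) = 60 blocks.
  const int block_count = std::min(
      DivUp(physical_thread_count, thread_per_block), num_multiprocessors);

  std::printf("block_count=%d thread_per_block=%d\n", block_count,
              thread_per_block);
  return 0;
}

The surrounding function then clamps the result against the available work via the context line that closes the second hunk, block_count = std::min(block_count * d.getNumGpuMultiProcessors(), DivUp(work_element_count, fixed_block_size)), so neither path launches more blocks than there is work to fill.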
10 changes: 4 additions & 6 deletions tensorflow/tools/ci_build/Dockerfile.rocm
@@ -1,6 +1,6 @@
# This Dockerfile provides a starting point for a ROCm installation of
# MIOpen and tensorflow.
FROM ubuntu:xenial
FROM ubuntu:bionic
MAINTAINER Jeff Poznanovic <jeffrey.poznanovic@amd.com>

ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/3.3/
@@ -19,9 +19,9 @@ RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO xenial main > /etc/apt/sources.
# Install misc pkgs
RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential \
clang-3.8 \
clang-format-3.8 \
clang-tidy-3.8 \
clang-6.0 \
clang-format-6.0 \
clang-tidy-6.0 \
cmake \
cmake-qt-gui \
ssh \
@@ -91,8 +91,6 @@ RUN touch ${ROCM_PATH}/.info/version
COPY install/*.sh /install/
ARG DEBIAN_FRONTEND=noninteractive
RUN /install/install_bootstrap_deb_packages.sh
RUN add-apt-repository -y ppa:openjdk-r/ppa && \
add-apt-repository -y ppa:george-edison55/cmake-3.x
RUN /install/install_deb_packages.sh
RUN /install/install_pip_packages.sh
RUN /install/install_bazel.sh
9 changes: 8 additions & 1 deletion third_party/gpus/rocm_configure.bzl
@@ -35,7 +35,7 @@ load(

_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
_GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX"
_ROCM_TOOLKIT_PATH = "ROCM_TOOLKIT_PATH"
_ROCM_TOOLKIT_PATH = "ROCM_PATH"
_TF_ROCM_VERSION = "TF_ROCM_VERSION"
_TF_MIOPEN_VERSION = "TF_MIOPEN_VERSION"
_TF_ROCM_AMDGPU_TARGETS = "TF_ROCM_AMDGPU_TARGETS"
@@ -196,6 +196,13 @@ def _rocm_include_path(repository_ctx, rocm_config, bash_bin):
inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/10.0.0/include")
inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/11.0.0/include")

# Support hcc based on clang 10.0.0 (for ROCm 3.3)
inc_dirs.append(rocm_toolkit_path + "/hcc/compiler/lib/clang/10.0.0/include/")
inc_dirs.append(rocm_toolkit_path + "/hcc/lib/clang/10.0.0/include")

# Add hcc headers
inc_dirs.append(rocm_toolkit_path + "/hcc/include")

return inc_dirs

def _enable_rocm(repository_ctx):
