
Commit

Merge pull request #40954 from ROCmSoftwarePlatform:google_upstream_rocm_platform_fix_200630

PiperOrigin-RevId: 323063198
Change-Id: Iea7f29bb0d476428e423950e129fd61e057dac92
tensorflower-gardener committed Jul 24, 2020
2 parents 4e5311d + ddafc33 commit cbc87da
Showing 3 changed files with 40 additions and 7 deletions.
28 changes: 28 additions & 0 deletions tensorflow/core/util/gpu_launch_config.h
@@ -168,10 +168,25 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
block_size_limit);
CHECK_EQ(err, cudaSuccess);
#elif TENSORFLOW_USE_ROCM
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
hipError_t err = hipOccupancyMaxPotentialBlockSize(
&block_count, &thread_per_block, func, dynamic_shared_memory_size,
block_size_limit);
CHECK_EQ(err, hipSuccess);
#else
// Earlier versions of this HIP routine incorrectly returned void.
// TODO: re-enable hipError_t error checking once HIP is fixed.
// The ROCm interface uses unsigned int; range-check, then convert.
uint32_t block_count_uint = 0;
uint32_t thread_per_block_uint = 0;
CHECK_GE(block_size_limit, 0);
uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
func, dynamic_shared_memory_size,
block_size_limit_uint);
block_count = static_cast<int>(block_count_uint);
thread_per_block = static_cast<int>(thread_per_block_uint);
#endif
#endif

block_count =
@@ -201,9 +216,22 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
&block_count, func, fixed_block_size, dynamic_shared_memory_size);
CHECK_EQ(err, cudaSuccess);
#elif TENSORFLOW_USE_ROCM
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
&block_count, func, fixed_block_size, dynamic_shared_memory_size);
CHECK_EQ(err, hipSuccess);
#else
// Apply the heuristic from GetGpuLaunchConfig(int, const Eigen::GpuDevice&):
// assume the kernel is quite simple and will largely be memory-bound.
const int physical_thread_count = std::min(
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
work_element_count);
// Assume the kernel is simple enough that it is okay to use 1024 threads
// per workgroup.
int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
block_count = std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumGpuMultiProcessors());
#endif
#endif
block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
DivUp(work_element_count, fixed_block_size));
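For context, the #else branches above are the pre-hip-clang ROCm fallbacks: the first hunk routes through uint32_t temporaries because the older hipOccupancyMaxPotentialBlockSize took unsigned int arguments, and the second hunk replaces the occupancy query with pure arithmetic. The sketch below reproduces that arithmetic in isolation as a standalone C++ program; the DivUp helper is rewritten here and the device limits are hypothetical sample numbers rather than values from the commit, so read it as a minimal illustration of the heuristic, not TensorFlow's actual implementation.

#include <algorithm>
#include <cstdio>

// Ceiling division, mirroring the DivUp helper the heuristic relies on.
int DivUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  // Hypothetical device limits; in TensorFlow these come from Eigen::GpuDevice.
  const int num_multiprocessors = 60;      // d.getNumGpuMultiProcessors()
  const int max_threads_per_mp = 2048;     // d.maxGpuThreadsPerMultiProcessor()
  const int max_threads_per_block = 1024;  // d.maxGpuThreadsPerBlock()
  const int work_element_count = 1 << 20;  // number of elements to process

  // Cap at what the device can physically run at once:
  // min(60 * 2048, 1048576) = 122880 threads.
  const int physical_thread_count =
      std::min(num_multiprocessors * max_threads_per_mp, work_element_count);

  // Assume a simple, memory-bound kernel, so 1024-thread workgroups are fine.
  const int thread_per_block = std::min(1024, max_threads_per_block);

  // At most one block per multiprocessor:
  // min(DivUp(122880, 1024), 60) = min(120, 60) = 60 blocks.
  const int block_count = std::min(
      DivUp(physical_thread_count, thread_per_block), num_multiprocessors);

  std::printf("block_count=%d thread_per_block=%d\n", block_count,
              thread_per_block);
  return 0;
}

The surrounding function then clamps the result against the available work via the context line that closes the second hunk, block_count = std::min(block_count * d.getNumGpuMultiProcessors(), DivUp(work_element_count, fixed_block_size)), so neither path launches more blocks than there is work to fill.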
10 changes: 4 additions & 6 deletions tensorflow/tools/ci_build/Dockerfile.rocm
@@ -1,6 +1,6 @@
# This Dockerfile provides a starting point for a ROCm installation of
# MIOpen and tensorflow.
FROM ubuntu:xenial
FROM ubuntu:bionic
MAINTAINER Jeff Poznanovic <jeffrey.poznanovic@amd.com>

ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/3.3/
@@ -19,9 +19,9 @@ RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO xenial main > /etc/apt/sources.
# Install misc pkgs
RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential \
clang-3.8 \
clang-format-3.8 \
clang-tidy-3.8 \
clang-6.0 \
clang-format-6.0 \
clang-tidy-6.0 \
cmake \
cmake-qt-gui \
ssh \
@@ -91,8 +91,6 @@ RUN touch ${ROCM_PATH}/.info/version
COPY install/*.sh /install/
ARG DEBIAN_FRONTEND=noninteractive
RUN /install/install_bootstrap_deb_packages.sh
RUN add-apt-repository -y ppa:openjdk-r/ppa && \
add-apt-repository -y ppa:george-edison55/cmake-3.x
RUN /install/install_deb_packages.sh
RUN /install/install_pip_packages.sh
RUN /install/install_bazel.sh
9 changes: 8 additions & 1 deletion third_party/gpus/rocm_configure.bzl
@@ -35,7 +35,7 @@ load(

_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
_GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX"
_ROCM_TOOLKIT_PATH = "ROCM_TOOLKIT_PATH"
_ROCM_TOOLKIT_PATH = "ROCM_PATH"
_TF_ROCM_VERSION = "TF_ROCM_VERSION"
_TF_MIOPEN_VERSION = "TF_MIOPEN_VERSION"
_TF_ROCM_AMDGPU_TARGETS = "TF_ROCM_AMDGPU_TARGETS"
@@ -196,6 +196,13 @@ def _rocm_include_path(repository_ctx, rocm_config, bash_bin):
inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/10.0.0/include")
inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/11.0.0/include")

# Support hcc based on clang 10.0.0 (for ROCm 3.3)
inc_dirs.append(rocm_toolkit_path + "/hcc/compiler/lib/clang/10.0.0/include/")
inc_dirs.append(rocm_toolkit_path + "/hcc/lib/clang/10.0.0/include")

# Add hcc headers
inc_dirs.append(rocm_toolkit_path + "/hcc/include")

return inc_dirs

def _enable_rocm(repository_ctx):
