Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ROCm] ROCm stream executor implementation updates #30299

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
158 changes: 79 additions & 79 deletions tensorflow/stream_executor/rocm/rocm_blas.cc

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions tensorflow/stream_executor/rocm/rocm_blas.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class GpuExecutor;
// Thread-safe post-initialization.
class ROCMBlas : public blas::BlasSupport {
public:
explicit ROCMBlas(GpuExecutor *parent);
explicit ROCMBlas(GpuExecutor* parent);

// Allocates a rocBLAS handle.
bool Init();
Expand Down Expand Up @@ -98,7 +98,7 @@ class ROCMBlas : public blas::BlasSupport {
// Convenience functions that call DoBlasInternalImpl with different values
// for err_on_failure.
template <typename FuncT, typename... Args>
bool DoBlasInternal(FuncT rocblas_func, Stream *stream,
bool DoBlasInternal(FuncT rocblas_func, Stream* stream,
bool pointer_mode_host, Args... args) {
return DoBlasInternalImpl(rocblas_func, stream, pointer_mode_host,
/*err_on_failure=*/true, args...);
Expand All @@ -114,14 +114,14 @@ class ROCMBlas : public blas::BlasSupport {
// strided flavor
template <typename T>
port::Status AllocateStridedBuffer(
const std::vector<typename RocBlasTypeConversionHelper<T>::mapped_type *>
&raw_ptrs,
const std::vector<typename RocBlasTypeConversionHelper<T>::mapped_type*>&
raw_ptrs,
int batch_count, uint64_t batch_stride,
ScratchAllocator *scratch_allocator, Stream *stream,
ScratchAllocator* scratch_allocator, Stream* stream,
std::unique_ptr<TemporaryDeviceMemory<
typename RocBlasTypeConversionHelper<T>::mapped_type>> *temp_memory,
DeviceMemory<typename RocBlasTypeConversionHelper<T>::mapped_type>
*device_memory);
typename RocBlasTypeConversionHelper<T>::mapped_type>>* temp_memory,
DeviceMemory<typename RocBlasTypeConversionHelper<T>::mapped_type>*
device_memory);

// A helper function to implement DoBlasGemmBatched interfaces for generic
// types.
Expand Down Expand Up @@ -185,7 +185,7 @@ class ROCMBlas : public blas::BlasSupport {

// GpuExecutor which instantiated this ROCMBlas.
// Immutable post-initialization.
GpuExecutor *parent_;
GpuExecutor* parent_;

// rocBLAS library handle on the device.
rocblas_handle blas_ GUARDED_BY(mu_);
Expand Down
87 changes: 44 additions & 43 deletions tensorflow/stream_executor/rocm/rocm_dnn.cc
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ class MIOpenHandle {
namespace wrap {

#ifdef PLATFORM_GOOGLE

#define STREAM_EXECUTOR_MIOPEN_WRAP(__name) \
struct WrapperShim__##__name { \
template <typename... Args> \
Expand Down Expand Up @@ -162,6 +161,7 @@ namespace wrap {
__macro(miopenBatchNormalizationForwardInference) \
__macro(miopenBatchNormalizationForwardTraining) \
__macro(miopenGetConvolutionForwardOutputDim) \
__macro(miopenGetConvolutionNdForwardOutputDim) \
__macro(miopenFindConvolutionForwardAlgorithm) \
__macro(miopenCreateTensorDescriptor) \
__macro(miopenDestroyTensorDescriptor) \
Expand All @@ -183,7 +183,9 @@ namespace wrap {
__macro(miopenConvolutionBackwardBias) \
__macro(miopenConvolutionForwardGetWorkSpaceSize) \
__macro(miopenInitConvolutionDescriptor) \
__macro(miopenInitConvolutionNdDescriptor) \
__macro(miopenGetConvolutionDescriptor) \
__macro(miopenGetConvolutionNdDescriptor) \
__macro(miopenSetConvolutionGroupCount) \
__macro(miopenSet4dTensorDescriptor) \
__macro(miopenGetTensorDescriptor) \
Expand Down Expand Up @@ -282,28 +284,29 @@ uint64 GetHashValue(miopenTensorDescriptor_t tensor_desc) {

// Computes a hash of an N-d MIOpen convolution descriptor covering the
// convolution mode and the per-dimension pad / stride / dilation values,
// so descriptors with identical configurations hash identically.
uint64 GetHashValue(miopenConvolutionDescriptor_t conv_desc) {
  miopenConvolutionMode_t c_mode = miopenConvolution;
  int nd = 0;
  // First query retrieves only the number of spatial dimensions.
  wrap::miopenGetConvolutionNdDescriptor(conv_desc, 0, &nd, nullptr, nullptr,
                                         nullptr, &c_mode);

  std::vector<int> stride(nd);
  std::vector<int> pad(nd);
  std::vector<int> dilation(nd);

  // Second query retrieves the per-dimension parameters.
  wrap::miopenGetConvolutionNdDescriptor(
      conv_desc, nd, &nd, pad.data(), stride.data(), dilation.data(), &c_mode);

  uint64 hash_value = tensorflow::hash<int>()(c_mode);
  // Hash64Combine returns the combined hash; the result must be assigned
  // back to hash_value, otherwise the per-dimension parameters would be
  // silently dropped from the hash (and distinct descriptors would collide).
  auto hash64Combine = [&hash_value](int element) {
    hash_value = tensorflow::Hash64Combine(hash_value,
                                           tensorflow::hash<int>()(element));
  };
  std::for_each(pad.begin(), pad.end(), hash64Combine);
  std::for_each(stride.begin(), stride.end(), hash64Combine);
  std::for_each(dilation.begin(), dilation.end(), hash64Combine);

  return hash_value;
}

// Class to implement a cache of compiled fusion plans.
// Class to implement a cache of compiled fusion plans
class CachedFusionPlans {
public:
// Check if we already have a fusion_plan corresponding to the given hash
Expand Down Expand Up @@ -340,7 +343,7 @@ class CachedFusionPlans {
return found_cached_plan;
}

// Need to figure out the right place to call this routine.
// Need to figure out the right place to call this routine
static void Clear() {
absl::MutexLock lock{&cached_plans_mutex};

Expand All @@ -357,24 +360,24 @@ class CachedFusionPlans {
unsupported_plans.clear();
}

// Is the Fusion plan corresponding to this hash unsupported.
// Is the Fusion plan corresponding to this hash unsupported
static bool IsUnsupportedFusionPlan(uint64 hash) {
absl::MutexLock lock{&cached_plans_mutex};
return unsupported_plans.count(hash) > 0;
}

// Mark the given hash value as corresponding to an unsupported fusion plan.
// Mark the given hash value as corresponding to an unsupported fusion plan
static void MarkFusionPlanUnsupported(uint64 hash) {
absl::MutexLock lock{&cached_plans_mutex};
unsupported_plans.insert(hash);
}

private:
// Mutex to guard access to all data within this class.
// Mutex to guard access to all data within this class
static absl::Mutex cached_plans_mutex;

// Map of hash-value to MIOpen Fusion plan descriptors.
// Need to be able share this across more than one stream and hence static.
// Map of hash-value to MIOpen Fusion plan descriptors
// Need to be able share this across more than one stream and hence static
static std::map<uint64, miopenFusionPlanDescriptor_t> cached_plans;

// Set of hash-values that correspond to MIOpen Fusion plans that will fail
Expand All @@ -386,6 +389,10 @@ absl::Mutex CachedFusionPlans::cached_plans_mutex;
std::map<uint64, miopenFusionPlanDescriptor_t> CachedFusionPlans::cached_plans;
std::set<uint64> CachedFusionPlans::unsupported_plans;

} // namespace

namespace {

// Converts the type-erased handle stored by the executor back into the
// MIOpen library handle it was created as.
miopenHandle_t ToHandle(void* opaque_handle) {
  const auto handle = static_cast<miopenHandle_t>(opaque_handle);
  return handle;
}
Expand Down Expand Up @@ -538,10 +545,6 @@ class ScopedTensorDescriptor {
case dnn::DataLayout::kBatchYXDepth:
case dnn::DataLayout::kBatchDepthYX: {
const int nd = batch_descriptor.ndims() + 2;
if (nd != 4) {
LOG(FATAL) << "miopen only supports 4D tensors, dim=" << nd
<< " not allowed";
}

// MIOpen requires the strides and dims to be ordered as BDYX.
std::vector<int64> strides64 =
Expand All @@ -556,8 +559,8 @@ class ScopedTensorDescriptor {
&CheckedNarrowing<int64, int>);
std::transform(dims64.cbegin(), dims64.cend(), dims.begin(),
&CheckedNarrowing<int64, int>);
status = wrap::miopenSet4dTensorDescriptor(handle_, elem_type, dims[0],
dims[1], dims[2], dims[3]);
status = wrap::miopenSetTensorDescriptor(handle_, elem_type, nd,
dims.data(), strides.data());

if (status != miopenStatusSuccess) {
LOG(FATAL) << "could not convert BatchDescriptor "
Expand Down Expand Up @@ -604,19 +607,14 @@ class ScopedFilterDescriptor {

const int nd = batch_descriptor.ndims() + 2;

if (nd != 4) {
LOG(FATAL) << "miopen only supports 4D filters, dim=" << nd
<< "not allowed" << ToString(status);
}

std::vector<int> dims(2 + filter_descriptor.ndims());
dims[0] = filter_descriptor.output_feature_map_count();
dims[1] = filter_descriptor.input_feature_map_count();
const auto& spatial_dims = filter_descriptor.input_filter_dims();
std::copy(spatial_dims.begin(), spatial_dims.end(), dims.begin() + 2);

status = wrap::miopenSet4dTensorDescriptor(handle_, elem_type, dims[0],
dims[1], dims[2], dims[3]);
status = wrap::miopenSetTensorDescriptor(handle_, elem_type, nd,
dims.data(), nullptr);
if (status != miopenStatusSuccess) {
LOG(FATAL) << "could not set miopen filter descriptor: "
<< ToString(status);
Expand Down Expand Up @@ -667,11 +665,15 @@ class ScopedConvolutionDescriptor {
&CheckedNarrowing<int64, int>);
std::transform(padding64.cbegin(), padding64.cend(), padding.begin(),
&CheckedNarrowing<int64, int>);
std::vector<int> upscale(convolution_descriptor.ndims(), 1);

status = wrap::miopenInitConvolutionDescriptor(
handle_, miopenConvolution, padding[0], padding[1], strides[0],
strides[1], upscale[0], upscale[1]);
std::vector<int> upscale(convolution_descriptor.ndims());
const auto& dilations64 = convolution_descriptor.dilations();
std::transform(dilations64.cbegin(), dilations64.cend(), upscale.begin(),
&CheckedNarrowing<int64, int>);

status = wrap::miopenInitConvolutionNdDescriptor(
handle_, convolution_descriptor.ndims(), padding.data(), strides.data(),
upscale.data(), miopenConvolution);
if (status != miopenStatusSuccess) {
LOG(FATAL) << "could not set miopen convolution descriptor: "
<< ToString(status);
Expand Down Expand Up @@ -4003,9 +4005,8 @@ bool MIOpenSupport::DeriveOutputBatchDescriptor(

int dn = batch_descriptor.ndims() + 2;
std::vector<int> dims(dn); // in BDYX
auto status = wrap::miopenGetConvolutionForwardOutputDim(
conv.handle(), input_nd.handle(), filter.handle(), &dims[0], &dims[1],
&dims[2], &dims[3]);
auto status = wrap::miopenGetConvolutionNdForwardOutputDim(
conv.handle(), input_nd.handle(), filter.handle(), &dn, dims.data());
if (status != miopenStatusSuccess) {
LOG(ERROR) << "could not get output tensor for convolution: "
<< ToString(status);
Expand Down
6 changes: 1 addition & 5 deletions tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,6 @@ limitations under the License.
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/platform/port.h"

#if defined(TENSORFLOW_USE_ROCM)

#endif

namespace tensorflow {
namespace wrap {
#ifdef PLATFORM_GOOGLE
Expand Down Expand Up @@ -83,8 +79,8 @@ namespace wrap {
__macro(hipDeviceTotalMem) \
__macro(hipDriverGetVersion) \
__macro(hipEventCreateWithFlags) \
__macro(hipEventElapsedTime) \
__macro(hipEventDestroy) \
__macro(hipEventElapsedTime) \
__macro(hipEventQuery) \
__macro(hipEventRecord) \
__macro(hipEventSynchronize) \
Expand Down
72 changes: 42 additions & 30 deletions tensorflow/stream_executor/rocm/rocm_fft.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ namespace wrap {
#define STREAM_EXECUTOR_ROCFFT_WRAP(__name) \
struct WrapperShim__##__name { \
template <typename... Args> \
hipfftResult operator()(GpuExecutor *parent, Args... args) { \
hipfftResult operator()(GpuExecutor* parent, Args... args) { \
gpu::ScopedActivateExecutorContext sac{parent}; \
return ::__name(args...); \
} \
Expand Down Expand Up @@ -86,21 +86,33 @@ namespace wrap {

#endif

// clang-format off
// X-macro listing every rocFFT/hipFFT entry point that receives a wrapper
// shim (dynamic-load or direct-call, depending on the build configuration).
// NOTE: the last entry intentionally has no trailing line-continuation
// backslash; a stray trailing '\' would splice whatever line follows the
// macro into its definition.
#define ROCFFT_ROUTINE_EACH(__macro) \
  __macro(hipfftDestroy)             \
  __macro(hipfftSetStream)           \
  __macro(hipfftPlan1d)              \
  __macro(hipfftPlan2d)              \
  __macro(hipfftPlan3d)              \
  __macro(hipfftPlanMany)            \
  __macro(hipfftCreate)              \
  __macro(hipfftSetAutoAllocation)   \
  __macro(hipfftSetWorkArea)         \
  __macro(hipfftGetSize1d)           \
  __macro(hipfftMakePlan1d)          \
  __macro(hipfftGetSize2d)           \
  __macro(hipfftMakePlan2d)          \
  __macro(hipfftGetSize3d)           \
  __macro(hipfftMakePlan3d)          \
  __macro(hipfftGetSizeMany)         \
  __macro(hipfftMakePlanMany)        \
  __macro(hipfftExecD2Z)             \
  __macro(hipfftExecZ2D)             \
  __macro(hipfftExecC2C)             \
  __macro(hipfftExecC2R)             \
  __macro(hipfftExecZ2Z)             \
  __macro(hipfftExecR2C)
// clang-format on

ROCFFT_ROUTINE_EACH(STREAM_EXECUTOR_ROCFFT_WRAP)

Expand Down Expand Up @@ -515,7 +527,7 @@ bool ROCMFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT hipfftExec,
}

auto ret = hipfftExec(parent_, rocm_fft_plan->GetPlan(),
GpuComplex(const_cast<InputT *>(GpuMemory(input))),
GpuComplex(const_cast<InputT*>(GpuMemory(input))),
GpuComplex(GpuMemoryMutable(output)));

if (ret != HIPFFT_SUCCESS) {
Expand All @@ -542,7 +554,7 @@ bool ROCMFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
}

auto ret = hipfftExec(parent_, rocm_fft_plan->GetPlan(),
GpuComplex(const_cast<InputT *>(GpuMemory(input))),
GpuComplex(const_cast<InputT*>(GpuMemory(input))),
GpuComplex(GpuMemoryMutable(output)),
rocm_fft_plan->GetFftDirection());

Expand All @@ -556,21 +568,21 @@ bool ROCMFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,

#define STREAM_EXECUTOR_ROCM_DEFINE_FFT(__type, __fft_type1, __fft_type2, \
__fft_type3) \
bool ROCMFft::DoFft(Stream *stream, fft::Plan *plan, \
const DeviceMemory<std::complex<__type>> &input, \
DeviceMemory<std::complex<__type>> *output) { \
bool ROCMFft::DoFft(Stream* stream, fft::Plan* plan, \
const DeviceMemory<std::complex<__type>>& input, \
DeviceMemory<std::complex<__type>>* output) { \
return DoFftWithDirectionInternal( \
stream, plan, wrap::hipfftExec##__fft_type1, input, output); \
} \
bool ROCMFft::DoFft(Stream *stream, fft::Plan *plan, \
const DeviceMemory<__type> &input, \
DeviceMemory<std::complex<__type>> *output) { \
bool ROCMFft::DoFft(Stream* stream, fft::Plan* plan, \
const DeviceMemory<__type>& input, \
DeviceMemory<std::complex<__type>>* output) { \
return DoFftInternal(stream, plan, wrap::hipfftExec##__fft_type2, input, \
output); \
} \
bool ROCMFft::DoFft(Stream *stream, fft::Plan *plan, \
const DeviceMemory<std::complex<__type>> &input, \
DeviceMemory<__type> *output) { \
bool ROCMFft::DoFft(Stream* stream, fft::Plan* plan, \
const DeviceMemory<std::complex<__type>>& input, \
DeviceMemory<__type>* output) { \
return DoFftInternal(stream, plan, wrap::hipfftExec##__fft_type3, input, \
output); \
}
Expand All @@ -590,9 +602,9 @@ void initialize_rocfft() {
port::Status status =
PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
rocm::kROCmPlatformId, gpu::kRocFftPlugin, "rocFFT",
[](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
gpu::GpuExecutor *rocm_executor =
dynamic_cast<gpu::GpuExecutor *>(parent);
[](internal::StreamExecutorInterface* parent) -> fft::FftSupport* {
gpu::GpuExecutor* rocm_executor =
dynamic_cast<gpu::GpuExecutor*>(parent);
if (rocm_executor == nullptr) {
LOG(ERROR)
<< "Attempting to initialize an instance of the rocFFT "
Expand Down