Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GetConvolveAlgorithms fixup take 2 #13470

Merged
merged 1 commit into from
Oct 3, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
51 changes: 23 additions & 28 deletions tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
Original file line number Diff line number Diff line change
Expand Up @@ -256,9 +256,9 @@ tensorflow::Status ConvolutionThunk::Convolve(
algorithm_config.algorithm_no_scratch().algo_id());
}

std::vector<AlgorithmDesc::Index> ConvolutionThunk::GetAlgorithms(
std::vector<AlgorithmDesc> ConvolutionThunk::GetAlgorithms(
se::StreamExecutor* stream_exec) const {
std::vector<AlgorithmDesc::Index> algorithms;
std::vector<AlgorithmDesc> algorithms;
// TODO(yangzihao): Currently disable the use of winograd nonfused in XLA
// by default. Should send in conv parameters and enable it when
// ShouldIncludeWinogradNonfusedAlgo() returns true.
Expand Down Expand Up @@ -297,32 +297,27 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(

se::dnn::ProfileResult best_result;
se::dnn::ProfileResult best_result_without_scratch;
std::vector<AlgorithmDesc::Index> algorithms =
GetAlgorithms(stream->parent());
for (bool use_tensor_ops : {false, true}) {
for (auto algo_index : algorithms) {
AlgorithmDesc algorithm(algo_index, use_tensor_ops);
ConvolveScratchAllocator scratch_allocator(
buffer_allocations.device_ordinal(),
buffer_allocations.memory_allocator());
se::dnn::ProfileResult profile_result;
bool launch_ok =
Convolve(input_descriptor, input_data, filter_descriptor,
filter_data, output_descriptor, output_data,
convolution_descriptor,
se::dnn::AlgorithmConfig(algorithm, algorithm), stream,
&scratch_allocator, &profile_result)
.ok();
if (launch_ok && profile_result.is_valid()) {
if (profile_result.elapsed_time_in_ms() <
best_result.elapsed_time_in_ms()) {
best_result = profile_result;
}
if (scratch_allocator.TotalAllocatedBytes() == 0 &&
profile_result.elapsed_time_in_ms() <
best_result_without_scratch.elapsed_time_in_ms()) {
best_result_without_scratch = profile_result;
}
std::vector<AlgorithmDesc> algorithms = GetAlgorithms(stream->parent());
for (auto algorithm : algorithms) {
ConvolveScratchAllocator scratch_allocator(
buffer_allocations.device_ordinal(),
buffer_allocations.memory_allocator());
se::dnn::ProfileResult profile_result;
bool launch_ok =
Convolve(input_descriptor, input_data, filter_descriptor, filter_data,
output_descriptor, output_data, convolution_descriptor,
se::dnn::AlgorithmConfig(algorithm, algorithm), stream,
&scratch_allocator, &profile_result)
.ok();
if (launch_ok && profile_result.is_valid()) {
if (profile_result.elapsed_time_in_ms() <
best_result.elapsed_time_in_ms()) {
best_result = profile_result;
}
if (scratch_allocator.TotalAllocatedBytes() == 0 &&
profile_result.elapsed_time_in_ms() <
best_result_without_scratch.elapsed_time_in_ms()) {
best_result_without_scratch = profile_result;
}
}
}
Expand Down
4 changes: 1 addition & 3 deletions tensorflow/compiler/xla/service/gpu/convolution_thunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,7 @@ class ConvolutionThunk : public Thunk {
perftools::gputools::dnn::ProfileResult* profile_result);

// Returns the convolve algorithms that can be used for this ConvolutionThunk.
// TODO(nluehr) GetAlgorithms should return AlgorithmDesc including both
// tensor-op and non-tensor-op variants.
std::vector<perftools::gputools::dnn::AlgorithmDesc::Index> GetAlgorithms(
std::vector<perftools::gputools::dnn::AlgorithmDesc> GetAlgorithms(
perftools::gputools::StreamExecutor* stream_exec) const;

// Fastest cuDNN convolution algorithm for this thunk learned from
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -493,42 +493,37 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
dnn::AlgorithmConfig algorithm_config;
if (cudnn_use_autotune && !AutoTuneConvBiasActivation::GetInstance()->Find(
fused_conv_parameters, &algorithm_config)) {
std::vector<dnn::AlgorithmDesc::Index> algorithms;
std::vector<dnn::AlgorithmDesc> algorithms;
CHECK(stream->parent()->GetConvolveAlgorithms(
fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(),
&algorithms));
dnn::ProfileResult best_result;
dnn::ProfileResult best_result_no_scratch;
// TODO(benbarsdell): Ideally this should not attempt using tensor op math
// if it's not enabled.
for (bool use_tensor_ops : {false, true}) {
for (auto algo_index : algorithms) {
// TODO(zhengxq): profile each algorithm multiple times for better
// accuracy.
dnn::AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
dnn::ProfileResult profile_result;
bool cudnn_launch_status =
stream
->ThenFusedConvolveWithAlgorithm(
conv_input_desc, conv_input_ptr, conv_input_scale,
filter_desc, filter_ptr, conv_desc, side_input_ptr,
side_input_scale, bias_desc, bias_ptr,
dnn::ActivationMode::kRelu, output_desc, &output_ptr,
&scratch_allocator, dnn::AlgorithmConfig(profile_algorithm),
&profile_result)
.ok();
if (cudnn_launch_status) {
if (profile_result.is_valid()) {
if (profile_result.elapsed_time_in_ms() <
best_result.elapsed_time_in_ms()) {
best_result = profile_result;
}
if (scratch_allocator.TotalByteSize() == 0 &&
profile_result.elapsed_time_in_ms() <
best_result_no_scratch.elapsed_time_in_ms()) {
best_result_no_scratch = profile_result;
}
for (auto profile_algorithm : algorithms) {
// TODO(zhengxq): profile each algorithm multiple times for better
// accuracy.
CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
dnn::ProfileResult profile_result;
bool cudnn_launch_status =
stream
->ThenFusedConvolveWithAlgorithm(
conv_input_desc, conv_input_ptr, conv_input_scale,
filter_desc, filter_ptr, conv_desc, side_input_ptr,
side_input_scale, bias_desc, bias_ptr,
dnn::ActivationMode::kRelu, output_desc, &output_ptr,
&scratch_allocator, dnn::AlgorithmConfig(profile_algorithm),
&profile_result)
.ok();
if (cudnn_launch_status) {
if (profile_result.is_valid()) {
if (profile_result.elapsed_time_in_ms() <
best_result.elapsed_time_in_ms()) {
best_result = profile_result;
}
if (scratch_allocator.TotalByteSize() == 0 &&
profile_result.elapsed_time_in_ms() <
best_result_no_scratch.elapsed_time_in_ms()) {
best_result_no_scratch = profile_result;
}
}
}
Expand Down
55 changes: 25 additions & 30 deletions tensorflow/core/kernels/conv_grad_filter_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -816,40 +816,35 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
AlgorithmConfig algorithm_config;
if (cudnn_use_autotune && !AutoTuneConvBwdFilter::GetInstance()->Find(
conv_parameters, &algorithm_config)) {
std::vector<AlgorithmDesc::Index> algorithms;
std::vector<AlgorithmDesc> algorithms;
CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
ProfileResult best_result;
ProfileResult best_result_no_scratch;
// TODO(benbarsdell): Ideally this should not attempt using tensor op math
// if it's not enabled.
for (bool use_tensor_ops : {false, true}) {
for (auto algo_index : algorithms) {
// TODO(zhengxq): profile each algorithm multiple times for better
// accuracy.
AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
CudnnScratchAllocator scratch_allocator(
ConvolveBackwardFilterScratchSize, ctx);
ProfileResult profile_result;
bool cudnn_launch_status =
stream
->ThenConvolveBackwardFilterWithAlgorithm(
input_desc, input_ptr, output_desc, out_backprop_ptr,
conv_desc, filter_desc, &filter_backprop_ptr,
&scratch_allocator, AlgorithmConfig(profile_algorithm),
&profile_result)
.ok();
if (cudnn_launch_status) {
if (profile_result.is_valid()) {
if (profile_result.elapsed_time_in_ms() <
best_result.elapsed_time_in_ms()) {
best_result = profile_result;
}
if (scratch_allocator.TotalByteSize() == 0 &&
profile_result.elapsed_time_in_ms() <
best_result_no_scratch.elapsed_time_in_ms()) {
best_result_no_scratch = profile_result;
}
for (auto profile_algorithm : algorithms) {
// TODO(zhengxq): profile each algorithm multiple times for better
// accuracy.
CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
ctx);
ProfileResult profile_result;
bool cudnn_launch_status =
stream
->ThenConvolveBackwardFilterWithAlgorithm(
input_desc, input_ptr, output_desc, out_backprop_ptr,
conv_desc, filter_desc, &filter_backprop_ptr,
&scratch_allocator, AlgorithmConfig(profile_algorithm),
&profile_result)
.ok();
if (cudnn_launch_status) {
if (profile_result.is_valid()) {
if (profile_result.elapsed_time_in_ms() <
best_result.elapsed_time_in_ms()) {
best_result = profile_result;
}
if (scratch_allocator.TotalByteSize() == 0 &&
profile_result.elapsed_time_in_ms() <
best_result_no_scratch.elapsed_time_in_ms()) {
best_result_no_scratch = profile_result;
}
}
}
Expand Down
53 changes: 24 additions & 29 deletions tensorflow/core/kernels/conv_grad_input_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -870,39 +870,34 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
AlgorithmConfig algorithm_config;
if (cudnn_use_autotune && !AutoTuneConvBwdData::GetInstance()->Find(
conv_parameters, &algorithm_config)) {
std::vector<AlgorithmDesc::Index> algorithms;
std::vector<AlgorithmDesc> algorithms;
CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms(
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
ProfileResult best_result;
ProfileResult best_result_no_scratch;
// TODO(benbarsdell): Ideally this should not attempt using tensor op math
// if it's not enabled.
for (bool use_tensor_ops : {false, true}) {
for (auto algo_index : algorithms) {
// TODO(zhengxq): profile each algorithm multiple times for better
// accuracy.
AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
ctx);
ProfileResult profile_result;
bool cudnn_launch_status =
stream
->ThenConvolveBackwardDataWithAlgorithm(
filter_desc, filter_ptr, output_desc, out_backprop_ptr,
conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator,
AlgorithmConfig(profile_algorithm), &profile_result)
.ok();
if (cudnn_launch_status) {
if (profile_result.is_valid()) {
if (profile_result.elapsed_time_in_ms() <
best_result.elapsed_time_in_ms()) {
best_result = profile_result;
}
if (scratch_allocator.TotalByteSize() == 0 &&
profile_result.elapsed_time_in_ms() <
best_result_no_scratch.elapsed_time_in_ms()) {
best_result_no_scratch = profile_result;
}
for (auto profile_algorithm : algorithms) {
// TODO(zhengxq): profile each algorithm multiple times for better
// accuracy.
CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
ctx);
ProfileResult profile_result;
bool cudnn_launch_status =
stream
->ThenConvolveBackwardDataWithAlgorithm(
filter_desc, filter_ptr, output_desc, out_backprop_ptr,
conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator,
AlgorithmConfig(profile_algorithm), &profile_result)
.ok();
if (cudnn_launch_status) {
if (profile_result.is_valid()) {
if (profile_result.elapsed_time_in_ms() <
best_result.elapsed_time_in_ms()) {
best_result = profile_result;
}
if (scratch_allocator.TotalByteSize() == 0 &&
profile_result.elapsed_time_in_ms() <
best_result_no_scratch.elapsed_time_in_ms()) {
best_result_no_scratch = profile_result;
}
}
}
Expand Down