Skip to content

Commit

Permalink
Add gemmlowp-threadpool multithreading to the depthwiseconv implementation for the quantized path.
Browse files Browse the repository at this point in the history

PiperOrigin-RevId: 239959051
  • Loading branch information
lu-wang-g authored and tensorflower-gardener committed Mar 23, 2019
1 parent 36f817a commit 152095e
Show file tree
Hide file tree
Showing 7 changed files with 379 additions and 41 deletions.
33 changes: 17 additions & 16 deletions tensorflow/lite/kernels/depthwise_conv.cc
Expand Up @@ -21,6 +21,7 @@ limitations under the License.

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/c_api_internal.h"
#include "tensorflow/lite/kernels/gemm_support.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
Expand Down Expand Up @@ -66,13 +67,15 @@ struct OpData {
};

// Allocates the per-op scratch state for this kernel and registers the op as
// a user of the shared gemmlowp context so the thread pool stays alive while
// the op exists. Paired with Free().
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  gemm_support::IncrementUsageCounter(context);
  // Builtin ops carry no serialized payload, so 'buffer' is ignored; a fresh
  // OpData instance transports state from Prepare() to Eval() instead.
  OpData* op_data = new OpData;
  return op_data;
}

// Releases the per-op state allocated in Init() and drops this op's reference
// on the shared gemmlowp context (the counterpart of IncrementUsageCounter).
void Free(TfLiteContext* context, void* buffer) {
  gemm_support::DecrementUsageCounter(context);
  // static_cast is the correct named cast for void* -> OpData* (the type
  // allocated in Init()); reinterpret_cast is unnecessary here.
  delete static_cast<OpData*>(buffer);
}

Expand Down Expand Up @@ -230,17 +233,6 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
auto filter_offset = -filter->params.zero_point;
auto output_offset = output->params.zero_point;

void (*depthwise_conv)(const DepthwiseParams&, const RuntimeShape&,
const uint8*, const RuntimeShape&, const uint8*,
const RuntimeShape&, const int32*, const RuntimeShape&,
uint8*);

if (kernel_type == kReference) {
depthwise_conv = &reference_ops::DepthwiseConv;
} else {
depthwise_conv = &optimized_ops::DepthwiseConv;
}

DepthwiseParams op_params;
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
Expand All @@ -257,11 +249,20 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
depthwise_conv(op_params, GetTensorShape(input),
GetTensorData<uint8_t>(input), GetTensorShape(filter),
GetTensorData<uint8_t>(filter), GetTensorShape(bias),
GetTensorData<int32_t>(bias), GetTensorShape(output),
GetTensorData<uint8_t>(output));
if (kernel_type == kReference) {
reference_ops::DepthwiseConv(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(filter), GetTensorData<uint8_t>(filter),
GetTensorShape(bias), GetTensorData<int32_t>(bias),
GetTensorShape(output), GetTensorData<uint8_t>(output));
} else {
gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
optimized_ops::DepthwiseConv(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(filter), GetTensorData<uint8_t>(filter),
GetTensorShape(bias), GetTensorData<int32_t>(bias),
GetTensorShape(output), GetTensorData<uint8_t>(output), gemm_context);
}
}

void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
Expand Down
167 changes: 167 additions & 0 deletions tensorflow/lite/kernels/depthwise_conv_test.cc
Expand Up @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <cstdarg>
#include <initializer_list>
#include <gtest/gtest.h>
#include "absl/memory/memory.h"
#include "tensorflow/lite/interpreter.h"
Expand Down Expand Up @@ -501,6 +502,172 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) {
ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
}

TEST_P(DepthwiseConvolutionOpTest, MultithreadOnRowUint8GeneralTest) {
  // A tall image (28 rows) with a small batch steers the multithreaded
  // kernel toward partitioning work along the output-row dimension.
  constexpr int kDepth = 1;
  constexpr int kImageWidth = 4;
  constexpr int kImageHeight = 28;
  constexpr int kImageBatchCount = 3;
  constexpr int kFilterSize = 3;
  constexpr int kFilterCount = 1;

  QuantizedDepthwiseConvolutionOpModel model(
      GetRegistration(),
      {TensorType_UINT8,
       {kImageBatchCount, kImageHeight, kImageWidth, kDepth},
       0,
       255},
      {TensorType_UINT8,
       {kDepth, kFilterSize, kFilterSize, kFilterCount},
       0,
       255},
      {TensorType_UINT8, {}, 0, 255}, Padding_VALID);

  // Each batch is a mostly-zero image with a horizontal band of a constant
  // value (1, 2, 3 respectively), so the expected outputs scale linearly.
  // clang-format off
  model.SetInput({
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  });
  // clang-format on

  // Filter (row-major 3x3):
  // | 1 | 2 | 3 |
  // | 4 | 5 | 6 |
  // | 7 | 8 | 9 |
  model.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
  // Bias is intentionally zero so the output reflects the convolution alone.
  model.SetBias({0});
  model.SetNumThreads(4);
  model.Invoke();

  // clang-format off
  EXPECT_THAT(
      model.GetOutput(),
      ElementsAreArray({
          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 24, 24, 39, 39,
          45, 45, 45, 45, 45, 45, 45, 45,
          45, 45, 45, 45, 45, 45, 45, 45,
          45, 45, 45, 45, 21, 21, 6, 6,
          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0,

          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 48, 48, 78, 78,
          90, 90, 90, 90, 90, 90, 90, 90,
          90, 90, 90, 90, 90, 90, 90, 90,
          90, 90, 90, 90, 42, 42, 12, 12,
          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0,

          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 72, 72, 117, 117,
          135, 135, 135, 135, 135, 135, 135, 135,
          135, 135, 135, 135, 135, 135, 135, 135,
          135, 135, 135, 135, 63, 63, 18, 18,
          0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0,
      }));
  // clang-format on
}

TEST_P(DepthwiseConvolutionOpTest, MultithreadOnBatchUint8GeneralTest) {
  // A short image (4 rows) replicated across a larger batch (6) steers the
  // multithreaded kernel toward partitioning work along the batch dimension.
  constexpr int kDepth = 1;
  constexpr int kImageWidth = 8;
  constexpr int kImageHeight = 4;
  constexpr int kImageBatchCount = 6;
  constexpr int kFilterSize = 3;
  constexpr int kFilterCount = 1;

  QuantizedDepthwiseConvolutionOpModel model(
      GetRegistration(),
      {TensorType_UINT8,
       {kImageBatchCount, kImageHeight, kImageWidth, kDepth},
       0,
       255},
      {TensorType_UINT8,
       {kDepth, kFilterSize, kFilterSize, kFilterCount},
       0,
       255},
      {TensorType_UINT8, {}, 0, 255}, Padding_VALID);

  // Every batch holds the same 4x8 image (a zero row, two rows of ones, a
  // zero row), so each batch must produce an identical output slice.
  // clang-format off
  model.SetInput({
      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
  });
  // clang-format on

  // Filter (row-major 3x3):
  // | 1 | 2 | 3 |
  // | 4 | 5 | 6 |
  // | 7 | 8 | 9 |
  model.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
  // Bias is intentionally zero so the output reflects the convolution alone.
  model.SetBias({0});
  model.SetNumThreads(4);
  model.Invoke();

  // clang-format off
  EXPECT_THAT(
      model.GetOutput(),
      ElementsAreArray({
          39, 39, 39, 39, 39, 39,
          21, 21, 21, 21, 21, 21,

          39, 39, 39, 39, 39, 39,
          21, 21, 21, 21, 21, 21,

          39, 39, 39, 39, 39, 39,
          21, 21, 21, 21, 21, 21,

          39, 39, 39, 39, 39, 39,
          21, 21, 21, 21, 21, 21,

          39, 39, 39, 39, 39, 39,
          21, 21, 21, 21, 21, 21,

          39, 39, 39, 39, 39, 39,
          21, 21, 21, 21, 21, 21
      }));
  // clang-format on
}

class PerChannelQuantizedDepthwiseConvolutionOpModel
: public BaseDepthwiseConvolutionOpModel {
public:
Expand Down
12 changes: 8 additions & 4 deletions tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
Expand Up @@ -139,7 +139,8 @@ inline void DispatchDepthwiseConv(
// Call kernel optimized for depthwise convolutions using 3x3 filters.
optimized_ops::depthwise_conv::DepthwiseConv3x3Filter(
params, input_shape, input_data, filter_shape, filter_data,
bias_shape, bias_data, output_shape, output_data);
bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
/*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
return;
#else
break;
Expand Down Expand Up @@ -242,7 +243,8 @@ inline void DispatchDepthwiseConv(
case DepthwiseConvImplementation::kUseGenericKernel: {
optimized_ops::depthwise_conv::DepthwiseConvGeneral(
params, input_shape, input_data, filter_shape, filter_data,
bias_shape, bias_data, output_shape, output_data);
bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
/*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
return;
}
case DepthwiseConvImplementation::kNone:
Expand Down Expand Up @@ -271,13 +273,15 @@ inline void DispatchDepthwiseConv(
optimized_ops::DepthwiseConvWithRounding<
DepthwiseConvOutputRounding::kAwayFromZero>(
params, input_shape, input_data, filter_shape, filter_data,
bias_shape, bias_data, output_shape, output_data);
bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
/*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
return;
case DepthwiseConvOutputRounding::kUpward:
optimized_ops::DepthwiseConvWithRounding<
DepthwiseConvOutputRounding::kUpward>(
params, input_shape, input_data, filter_shape, filter_data,
bias_shape, bias_data, output_shape, output_data);
bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
/*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
return;
default:
break;
Expand Down

0 comments on commit 152095e

Please sign in to comment.