
TANH/Sigmoid 16-bit activation functions using LUT #34589

Merged
merged 8 commits, Apr 7, 2020
156 changes: 145 additions & 11 deletions tensorflow/lite/kernels/activations.cc
@@ -60,7 +60,8 @@ struct OpData {
int input_left_shift = 0;
int32_t input_range_radius = 0;
int diff_min = 0;
uint8_t table[256] = {0};
uint16_t table[256] = {0};
Contributor:
Just to double check, is this safe for the int8/uint8 path when EvalUsingLookupTable is called with "table[0] = vld1q_u8_x4(data->table + 16 * 4 * 0)"?

Contributor Author:
Yes, you are right. I did this implementation before the Neon optimization was added.
I have corrected it, so it will work properly for int8/uint8.
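For context, the int8/uint8 fast path loads the LUT with vld1q_u8_x4(), which reads 64 consecutive bytes, so the element width of the table matters. Below is a minimal, portable sketch of the layout concern (not the fix actually adopted in this PR; OpDataSketch and its field names are made up for illustration):

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical layout: a dedicated 8-bit LUT for the int8/uint8 kernels and a
// separate 16-bit LUT for the new int16 kernels, so the NEON pointer
// arithmetic over one-byte entries stays valid.
struct OpDataSketch {
  uint8_t table_uint8[256] = {0};
  uint16_t table_int16[256] = {0};
};

int main() {
  OpDataSketch data;
  for (int i = 0; i < 4; ++i) {
    // Each 64-byte block covers entries 64*i .. 64*i + 63 of a uint8_t table,
    // so four blocks span all 256 entries, matching the four-table lookup.
    const uint8_t* block = data.table_uint8 + 16 * 4 * i;
    std::printf("block %d -> entries [%td, %td]\n", i,
                block - data.table_uint8, block - data.table_uint8 + 63);
  }
  // The same byte offsets applied to a uint16_t[256] table would only reach
  // entries 0..127 and would interleave the low/high halves of each value.
  return 0;
}
```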

Contributor:
Thanks for making the change.

uint16_t* table_zero = nullptr;
};

struct SoftmaxOpData {
@@ -154,6 +155,54 @@ inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4],
#endif

// TODO(b/143696793): move this to optimized_ops.
// We use a combined sigmoid and tanh look-up table, since
// tanh(x) = 2*sigmoid(2*x) - 1.
// Both functions are symmetric, so the LUT is only needed
// for the absolute value of the input.
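// For example, sigmoid(1) = 0.73106, so tanh(0.5) = 2*0.73106 - 1 = 0.46212,
// which matches std::tanh(0.5).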
void PopulateLookupTableSigmoid(struct OpData* data) {

// Table of sigmoid(i/24) in 0.16 fixed-point format - 256 elements.
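// For example, table[24] corresponds to sigmoid(24/24) = sigmoid(1) = 0.73106,
// and 0.73106 * 65536 rounds to 47911, the 25th entry below.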

auto table = std::initializer_list<uint16_t>({
32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513,
38180, 38841, 39498, 40149, 40794, 41432, 42064, 42688,
43304, 43912, 44511, 45102, 45683, 46255, 46817, 47369,
47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409,
51865, 52311, 52745, 53169, 53581, 53983, 54374, 54755,
55125, 55485, 55834, 56174, 56503, 56823, 57133, 57433,
57724, 58007, 58280, 58544, 58800, 59048, 59288, 59519,
59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110,
61279, 61441, 61599, 61750, 61896, 62036, 62172, 62302,
62428, 62549, 62666, 62778, 62886, 62990, 63090, 63186,
63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835,
63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308,
64357, 64405, 64450, 64494, 64536, 64576, 64614, 64652,
64687, 64721, 64754, 64786, 64816, 64845, 64873, 64900,
64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079,
65097, 65115, 65132, 65149, 65164, 65179, 65194, 65208,
65221, 65234, 65246, 65258, 65269, 65280, 65291, 65301,
65310, 65319, 65328, 65337, 65345, 65352, 65360, 65367,
65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415,
65420, 65425, 65429, 65433, 65438, 65442, 65445, 65449,
65453, 65456, 65459, 65462, 65465, 65468, 65471, 65474,
65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491,
65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504,
65505, 65507, 65508, 65509, 65510, 65511, 65512, 65513,
65514, 65515, 65516, 65517, 65517, 65518, 65519, 65520,
65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524,
65525, 65525, 65526, 65526, 65526, 65527, 65527, 65528,
65528, 65528, 65529, 65529, 65529, 65529, 65530, 65530,
65530, 65530, 65531, 65531, 65531, 65531, 65531, 65532,
65532, 65532, 65532, 65532, 65532, 65533, 65533, 65533,
65533, 65533, 65533, 65533, 65533, 65534, 65534, 65534,
65534, 65534, 65534, 65534, 65534, 65534, 65534, 65535
});

std::copy(table.begin(), table.end(), data->table);

data->table_zero = &data->table[0];
}

void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input,
TfLiteTensor* output) {
const int size =
@@ -211,6 +260,89 @@ void QuantizedReluX(float act_min, float act_max, const TfLiteTensor* input,
GetTensorShape(output), GetTensorData<T>(output));
}

void EvalUsingLookupTableSigmoid16Bit(struct OpData* data, const TfLiteTensor* input,
TfLiteTensor* output) {

const int size = MatchingFlatSize(GetTensorShape(input), GetTensorShape(output));

int16_t* ptr_output_data = GetTensorData<int16_t>(output);
const int16_t* ptr_input_data = GetTensorData<int16_t>(input);

for (int i = 0; i < size; ++i, ptr_output_data++, ptr_input_data++) {
int32_t input_data = *ptr_input_data;

// Scale by 3/4 to expand the range [-8,8] -> [-10.7,10.7];
// the interpolation is done on unsigned values.
uint32_t abs_input_data = 3*abs(input_data);

// We shift right by 9: 2^7 for the input-to-index conversion,
// plus 2^2 because we multiplied by 3 above instead of 3/4.
uint8_t uh = abs_input_data >> 9;
uint32_t ua = data->table_zero[uh];
uint32_t ub = data->table_zero[uh+1];
uint32_t ut = abs_input_data & 0x1ff;

// Interpolation is done using the fractional bits.
uint32_t result = (ua << 9) + ut * (ub - ua);
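// 'ua' and 'ub' are in 0.16 format and 'ut' holds the 9 fractional index
// bits, so 'result' is sigmoid(|x|) in 0.25 format; the shift right by 10
// below (with rounding) brings it to the 0.15 output format (scale 1/32768).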

result = (input_data >=0) ? (result + (1 << 9)) :
((1 << (16 + 9)) - result + (1 << 9) - 1);

// Back to 16-bit.
result >>= 10;

*ptr_output_data = result;
}
}

void EvalUsingLookupTableTanh16Bit(struct OpData* data, const TfLiteTensor* input,
TfLiteTensor* output) {

const int size =
MatchingFlatSize(GetTensorShape(input), GetTensorShape(output));

const int16_t* ptr_input_data = GetTensorData<int16_t>(input);
int16_t* ptr_output_data = GetTensorData<int16_t>(output);

// We use the sigmoid LUT and take into account that
// tanh(x) = 2*sigmoid(2*x) - 1.
for (int i=0; i < size; ++i, ptr_input_data++, ptr_output_data++) {

int32_t input_data = *ptr_input_data;

if (data->input_left_shift == 1) {
input_data <<= 1;
}

// Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
uint32_t abs_input_data = 3*abs(input_data);
uint32_t uh = abs_input_data >> 8;
int32_t result;

if (uh >= 255) {
// Saturate to maximum.
result = 0xFFFF<<8;
} else {

uint32_t ua = data->table_zero[uh];
uint32_t ub = data->table_zero[uh+1];

uint8_t ut = abs_input_data & 0xFF;

result = (ua<<8) + ut*(ub-ua);
}
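// At this point 'result' approximates sigmoid of the rescaled |input| in
// 0.24 format. Subtracting 1 << 23 (i.e. 0.5) below and shifting right by 8
// applies tanh(x) = 2*sigmoid(2*x) - 1 and lands on the 0.15 output format.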

result = (input_data>=0) ? (result - (1<<(14+9)) + (1<<(9-2))) :
(-result + (1<<(14+9)) + (1<<(9-2))-1);

// Convert back to 16-bit.
result >>= (9-1);

*ptr_output_data = result;
}
}

} // namespace

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -418,6 +550,8 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
} else if (input->type == kTfLiteInt8) {
PopulateLookupTable<int8_t>(data, input, output,
[](float value) { return std::tanh(value); });
} else if (input->type == kTfLiteInt16) {
PopulateLookupTableSigmoid(data);
}
}

@@ -509,6 +643,10 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) {
PopulateLookupTable<int8_t>(data, input, output, [](float value) {
return 1.0f / (1.0f + std::exp(-value));
});
} else if (input->type == kTfLiteInt16) {
TF_LITE_ENSURE(context, output->params.scale == 1. / 32768);
TF_LITE_ENSURE(context, output->params.zero_point == 0.);
Contributor:
I think integer "0" is sufficient here.

Contributor Author:
done

PopulateLookupTableSigmoid(data);
}
}

@@ -798,14 +936,12 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
case kTfLiteInt16: {
TanhParams params;
params.input_left_shift = data->input_left_shift;
if (kernel_type == kReference) {
Member:
we intended to keep the reference ones

Contributor:
It looks like we did not fully align on this. From the discussion preceding this change, we understood that we would keep the new LUT-based implementation as the default one, and assumed that "default" == "kReference".

It also appears that, if GEMMLOWP_NEON is not defined, the optimized_ops::Tanh() called in the kFixedPointOptimized case is the same as the reference_ops::Tanh() that used to be called for kReference.

So we just followed the same code pattern as is used for the Int8 and UInt8 versions.

I appreciate now that, later in this file, the kGenericOptimized kernel_type seems to be registered as the default one for Tanh and Logistic (in Register_TANH()).

Do you think we should change to calling different implementations for different kernel_type values? I.e.
kReference -> reference_ops::Tanh()
kFixedPointOptimized -> optimized_ops::Tanh()
kGenericOptimized -> EvalUsingLookupTableTanh16Bit() (default)

Contributor Author:
@renjie-liu Could you please clarify how the current implementation and the suggested LUT-based one should co-exist? Should we make the suggested one the reference, because it is more accurate? The current one looks like a good fit for the kFixedPointOptimized kernel. Which one should be used when the kernel type is not specified?
Thanks!

reference_ops::Tanh(
params, GetTensorShape(input), GetTensorData<int16_t>(input),
GetTensorShape(output), GetTensorData<int16_t>(output));
} else {
if (kernel_type == kFixedPointOptimized) {
optimized_ops::Tanh(
params, GetTensorShape(input), GetTensorData<int16_t>(input),
GetTensorShape(output), GetTensorData<int16_t>(output));
} else {
EvalUsingLookupTableTanh16Bit(data, input, output);
}
return kTfLiteOk;
} break;
@@ -870,14 +1006,12 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
}
case kTfLiteInt16: {
LogisticParams params;
if (kernel_type == kReference) {
reference_ops::Logistic(
params, GetTensorShape(input), GetTensorData<int16_t>(input),
GetTensorShape(output), GetTensorData<int16_t>(output));
} else {
if (kernel_type == kFixedPointOptimized) {
optimized_ops::Logistic(
params, GetTensorShape(input), GetTensorData<int16_t>(input),
GetTensorShape(output), GetTensorData<int16_t>(output));
} else {
EvalUsingLookupTableSigmoid16Bit(data, input, output);
Contributor:
Just to make sure I'm understanding this correctly: this changes TFLite's builtin sigmoid ("kGenericOptimized") from optimized_ops::Logistic() to EvalUsingLookupTableSigmoid16Bit, which is more accurate. And EvalUsingLookupTableSigmoid16Bit is not yet vectorized (unlike EvalUsingLookupTable).

Will EvalUsingLookupTableSigmoid16Bit be optimized in the future?

Contributor (@akarmi, Feb 17, 2020):
Your understanding is correct.

As with other operators with 16-bit activations, our initial plan is to introduce the reference implementations; we have tentative plans for optimized implementations. Our other relevant efforts are currently focused on implementing the corresponding kernels in TensorFlow Lite Micro, but that work depends on first introducing the reference code to TFLite.

Contributor:
Thanks, I was not in some of the discussions so I might be missing context here. Isn't it safer to change just the reference code and leave the two optimized paths unchanged?

Contributor:
The issue with keeping the old kGenericOptimized implementation is that it is the one currently used as the default BuiltinOp (see Register_TANH()). One of the incentives for this PR is to be able to run models quantized with the higher accuracy provided by 16-bit activations using the default/standard TFLite build.

I would suggest that we enable the proposed code by default and address any performance issues in due course. The alternative would be to use the kReference implementation by default, but since that would affect the execution paths for other, potentially more widely used, data types, I think it may not be the best option.
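For readers without the registration context: the standalone toy below (deliberately not TFLite's real types; the names only mirror the pattern used in activations.cc, so treat it as an illustration rather than the actual code) shows why the kGenericOptimized path is the one a default build hits, since the unsuffixed Register_TANH() forwards to the generic-optimized variant.

```cpp
#include <cstdio>

enum KernelType { kReference, kGenericOptimized, kFixedPointOptimized };

struct Registration {
  void (*eval)();
};

template <KernelType kernel_type>
void TanhEval() {
  if (kernel_type == kReference) {
    std::puts("reference_ops::Tanh");
  } else if (kernel_type == kFixedPointOptimized) {
    std::puts("optimized_ops::Tanh");
  } else {
    std::puts("EvalUsingLookupTableTanh16Bit");  // the new LUT path
  }
}

Registration* Register_TANH_REF() {
  static Registration r = {TanhEval<kReference>};
  return &r;
}

Registration* Register_TANH_GENERIC_OPT() {
  static Registration r = {TanhEval<kGenericOptimized>};
  return &r;
}

// The unsuffixed entry point, used when the op is registered as a builtin,
// forwards to the generic-optimized variant.
Registration* Register_TANH() { return Register_TANH_GENERIC_OPT(); }

int main() {
  Register_TANH()->eval();  // prints "EvalUsingLookupTableTanh16Bit"
  return 0;
}
```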

Contributor:
Thanks Anton. I see your point and that makes a lot of sense. The only concern I have is that there are already models that run with the 16-bit builtin Tanh/Sigmoid ops. This change will make them more accurate but at the same time slower, which can be an issue.

May I suggest the following? We can submit the code but only enable it in the reference case. Once the optimization is finished, we can flip the switch so existing users can get better accuracy without performance impact.

Does that sound good? Thanks!

Contributor:
Thanks Jian Li. I appreciate your concerns and generally agree with the proposed course of action. One thing that I would like to do before making the changes you are asking for is to move the new reference implementation to the reference_integer_ops namespace. That would make it possible to re-use the reference implementation in code external to the TFLite runtime.

What do you think?

Contributor:
Thanks for understanding the restrictions.

We all appreciate the improved accuracy this PR brings. What you mentioned works, so please go ahead with the plan (either here or in another PR, whichever you feel more comfortable with).

Thanks!


}
break;
}
18 changes: 12 additions & 6 deletions tensorflow/lite/kernels/activations_test.cc
@@ -741,18 +741,22 @@ TEST_P(TanhOpTest, TanhInt16) {
const float kMax = 32767.f / 32768.f;
QuantizedActivationsOpModel m(
GetRegistration(), BuiltinOperator_TANH,
/*input=*/{TensorType_INT16, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
/*output=*/{TensorType_INT16, {1, 2, 4, 1}, kMin, kMax});
/*input=*/{TensorType_INT16, {1, 2, 8, 1}, 8 * kMin, 8 * kMax},
/*output=*/{TensorType_INT16, {1, 2, 8, 1}, kMin, kMax});
m.SetInput<int16_t>({
0, -6, 2, 4, //
-4, -2, 8, 1, //
7, -8, 3, -5, //
6, -1, -3, 5
});
m.Invoke();
EXPECT_THAT(m.GetDequantizedOutput<int16_t>(),
ElementsAreArray(ArrayFloatNear(
{
0.0, -0.999987, 0.964027, 0.999329, //
-0.999329, -0.96402, 0.99999, 0.76159, //
0.999998337, -0.99999, 0.995054754, -0.999909204, //
0.999999996, -0.76159, -0.995054754, 0.999909204
},
kQuantizedToleranceInt16)));
}
@@ -882,18 +886,20 @@ TEST_P(LogisticOpTest, SigmoidInt16) {
const float kMax = 32767.f / 32768.f;
QuantizedActivationsOpModel m(
GetRegistration(), BuiltinOperator_LOGISTIC,
/*input=*/{TensorType_INT16, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
/*output=*/{TensorType_INT16, {1, 2, 4, 1}, kMin, kMax});
/*input=*/{TensorType_INT16, {1, 2, 6, 1}, 8 * kMin, 8 * kMax},
/*output=*/{TensorType_INT16, {1, 2, 6, 1}, kMin, kMax});
m.SetInput<int16_t>({
0, -6, 2, 4, //
3, -2, 10, 1, //
Member:
why delete this line?

Contributor Author:
@renjie-liu this line has been modified by replacing 10 with 8, so that the input value is within the given range of the input tensor, and I added 4 more numbers (a short range check is sketched after this test).

3, -2, 8, 1, //
5, -8, 7, -3
});
m.Invoke();
EXPECT_THAT(m.GetDequantizedOutput<int16_t>(),
ElementsAreArray(ArrayFloatNear(
{
0.5, 0.002473, 0.880797, 0.982014, //
0.952574, 0.119203, 0.999955, 0.731059, //
0.952574, 0.119203, 0.9995, 0.731059, //
0.993307, 0.0003535, 0.999089, 0.047426 //
},
kQuantizedToleranceInt16)));
}
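As a side note on the range discussion in the thread above, here is a quick standalone check (illustrative only; the 1/4096 scale is implied by the test's [8 * kMin, 8 * kMax] input range) of why an input of 10 cannot be represented by this int16 tensor, while 8 only clamps by one step:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const float scale = 8.0f / 32768.0f;  // int16 input quantized to roughly [-8, 8)
  const int q10 = static_cast<int>(10.0f / scale + 0.5f);  // 40960
  const int q8 = static_cast<int>(8.0f / scale + 0.5f);    // 32768
  std::printf("quant(10) = %d, quant(8) = %d, int16 max = %d\n",
              q10, q8, INT16_MAX);
  // quant(10) = 40960 is far outside int16, so 10 would silently clamp to
  // 32767 (about 7.9998), no longer representing the intended value; 8 clamps
  // by just one step, which is why the test now uses values no larger than 8.
  return 0;
}
```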