Cadence NNLib: Fixed various unit test failures #38368

Merged
merged 1 commit on Apr 10, 2020
2 changes: 1 addition & 1 deletion tensorflow/lite/micro/kernels/xtensa_hifi/conv.cc
@@ -55,7 +55,7 @@ constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 8;
constexpr int kMaxChannels = 256;

// Conv is quantized along dimension 0:
// https://www.tensorflow.org/lite/performance/quantization_spec
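Note on the change above: kMaxChannels presumably sizes the fixed per-channel quantization buffers the kernel keeps in its OpData, one requantization multiplier/shift pair per output channel (Conv is quantized along dimension 0, per the spec linked above), so raising the bound from 8 to 256 gives headroom for the wider filters exercised by the unit tests without dynamic allocation. A minimal sketch of that layout, with illustrative struct and field names rather than the exact kernel code:

// Sketch only: how a fixed kMaxChannels bounds per-channel quantization
// state. Struct and field names are illustrative, not the kernel's own.
#include <cstdint>

constexpr int kMaxChannels = 256;

struct ConvOpDataSketch {
  // One requantization multiplier/shift pair per output channel.
  int32_t per_channel_output_multiplier[kMaxChannels];
  int32_t per_channel_output_shift[kMaxChannels];
};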
2 changes: 1 addition & 1 deletion tensorflow/lite/micro/kernels/xtensa_hifi/depthwise_conv.cc
@@ -58,7 +58,7 @@ constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
// Per channel quantization is not needed for any model on xtensa.
constexpr int kMaxChannels = 8;
constexpr int kMaxChannels = 256;

// Depthwise conv is quantized along dimension 3:
// https://www.tensorflow.org/lite/performance/quantization_spec
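The depthwise kernel gets the same one-line bump; the difference the comments call out is the quantized dimension (3 here versus 0 for Conv). An illustrative helper, not present in the kernel, showing which filter dimension kMaxChannels has to cover in each case:

#include "tensorflow/lite/c/common.h"  // TfLiteIntArray

// Illustrative only: the number of per-channel quantization entries is the
// size of the quantized dimension, which the TFLite quantization spec puts
// at dims 0 for Conv and dims 3 for DepthwiseConv. kMaxChannels must be at
// least this large.
inline int QuantizedChannelCount(const TfLiteIntArray* filter_dims,
                                 bool is_depthwise) {
  return is_depthwise ? filter_dims->data[3] : filter_dims->data[0];
}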
193 changes: 83 additions & 110 deletions tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc
@@ -78,14 +78,15 @@ struct OpData {
*/

static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
TfLiteContext* context, int batch_size, int memory_size, int num_filters,
int num_units, int rank, const TfLiteTensor* weights_time,
const TfLiteTensor* bias, TfLiteFusedActivation activation,
TfLiteTensor* activation_state, TfLiteTensor* scratch,
TfLiteTensor* output) {
float* scratch_bias = GetTensorData<float>(scratch);
if (bias) {
const float* bias_data = GetTensorData<float>(bias);
TfLiteContext* context, int batch_size, int memory_size, int num_filters, int num_units, int rank,
const float* const __restrict__ weights_time_ptr,
const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation,
float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
float* const __restrict__ output_ptr) {
// Compute matmul(activation_state, weights_time).
float* scratch_bias = scratch_ptr;
if (bias_ptr) {
const float* bias_data = bias_ptr;
for (int j = 0; j < num_units; ++j) {
scratch_bias[j] = *bias_data++;
}
@@ -96,15 +96,16 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
}
int err = 0;
for (int b = 0; b < batch_size; ++b) {
const float* weights_time_vec = GetTensorData<float>(weights_time);
const float* weights_time_vec = weights_time_ptr;
const float* mat_ptr =
GetTensorData<float>(activation_state) + b * memory_size * num_filters;
float* output_ptr_batch = GetTensorData<float>(output) + b * num_units;
state_ptr + b * memory_size * num_filters;
float* output_ptr_batch = output_ptr + b * num_units;
for (int j = 0; j < num_units; j++) {
err = xa_nn_matXvec_f32xf32_f32(
output_ptr_batch, mat_ptr, NULL, weights_time_vec, NULL, scratch_bias,
1, memory_size * rank, 0, memory_size * rank, 0);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_matXvec_f32xf32_f32 failed");

output_ptr_batch++;
mat_ptr += memory_size * rank;
weights_time_vec += memory_size * rank;
@@ -113,70 +115,68 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(

// Apply activation.
for (int b = 0; b < batch_size; ++b) {
float* output_ptr_batch = GetTensorData<float>(output) + b * num_units;
float* output_ptr_batch = output_ptr + b * num_units;
for (int i = 0; i < num_units; ++i) {
*output_ptr_batch = ActivationValFloat(activation, *output_ptr_batch);
++output_ptr_batch;
}
}

// Left shift the activation_state to make room for next cycle's activation.
// (alanchiao): explore collapsing this into a single loop.
for (int b = 0; b < batch_size; ++b) {
float* state_ptr_batch =
GetTensorData<float>(activation_state) + b * memory_size * num_filters;
for (int f = 0; f < num_filters; ++f) {
// Shift the vector left:
float* batch_ptr = state_ptr_batch;
float* batch_start = state_ptr_batch + 1;
float* batch_end = state_ptr_batch + memory_size;
while (batch_start != batch_end) {
*batch_ptr++ = *batch_start++;
}
state_ptr_batch[memory_size - 1] = 0.0f;
state_ptr_batch += memory_size;
}
}
return kTfLiteOk;
}

inline TfLiteStatus EvalFloatSVDF(
TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input,
const TfLiteTensor* weights_feature, const TfLiteTensor* weights_time,
const TfLiteTensor* bias, const TfLiteSVDFParams* params,
TfLiteTensor* scratch, TfLiteTensor* activation_state,
TfLiteTensor* output) {
TfLiteTensor* activation_state, TfLiteTensor* output) {
const int rank = params->rank;
const int batch_size = input->dims->data[0];
const int input_size = input->dims->data[1];
const int num_filters = weights_feature->dims->data[0];
const int num_units = num_filters / rank;
const int memory_size = weights_time->dims->data[1];

// Clear the activation (activation_state's leftmost column).
// (ghodrat): Add a test which initialize activation_state with invalid
// values in leftmost column and make sure it passes.
for (int b = 0; b < batch_size; ++b) {
float* state_ptr_batch =
GetTensorData<float>(activation_state) + b * memory_size * num_filters;
const float* weights_feature_ptr = GetTensorData<float>(weights_feature);
const float* weights_time_ptr = GetTensorData<float>(weights_time);
const float* bias_ptr = GetTensorData<float>(bias);
const float* input_ptr = GetTensorData<float>(input);

float* state_ptr = GetTensorData<float>(activation_state);

// TODO(b/132070898): Move this temp variable to the new scratch buffer API
// when ready.
float scratch_tensor[kScratchTensorMaxSize];
float* scratch_ptr = scratch_tensor;

float* output_ptr = GetTensorData<float>(output);

// Left shift the activation_state.
{
float* new_state_start = state_ptr;
const float* old_state_start = state_ptr + 1;
const float* old_state_end =
state_ptr + batch_size * num_filters * memory_size;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}

// Note: no need to clear the latest activation, matmul is not accumulative.

// Compute conv1d(inputs, weights_feature).
// The activation_state's rightmost column is used to save current cycle
// activation. This is achieved by starting at
// GetTensorData<float>(activation_state)[memory_size - 1] and having the
// stride equal to memory_size.

const float* matrix = GetTensorData<float>(weights_feature);
const float* vector = GetTensorData<float>(input);
float* out_scratch = GetTensorData<float>(scratch);
/* NNLib matXvec needs a bias buffer, so using output buffer to
avoid need for extra memory, output buffer size is batch * num_units,
batch is at least 1 so we use size num_units of it */
float* bias_scratch = GetTensorData<float>(output);
float* result = &GetTensorData<float>(activation_state)[memory_size - 1];
float* result_in_batch = result;
// activation. This is achieved by starting at state_ptr[memory_size - 1] and
// having the stride equal to memory_size.

// Perform batched matrix vector multiply operation:
{
const float* matrix = weights_feature_ptr;
const float* vector = input_ptr;
float* result = &state_ptr[memory_size - 1];
float* result_in_batch = result;

float* out_scratch = scratch_ptr;
float* bias_scratch = output_ptr;
for (int i = 0; i < num_units; i++) bias_scratch[i] = 0.0f;

int err = 0;
@@ -196,11 +196,11 @@ inline TfLiteStatus EvalFloatSVDF(
result_in_batch += memory_size;
}
}
}

return ApplyTimeWeightsBiasAndActivation(
context, batch_size, memory_size, num_filters, num_units, rank,
weights_time, bias, params->activation, activation_state, scratch,
output);
context, batch_size, memory_size, num_filters, num_units, rank, weights_time_ptr,
bias_ptr, params->activation, state_ptr, scratch_ptr, output_ptr);
}

void EvalIntegerSVDF(
@@ -217,24 +217,26 @@ void EvalIntegerSVDF(
const int n_unit = n_filter / n_rank;
const int n_memory = weights_time_tensor->dims->data[1];

// (b/132070898): Move these temp variables to the new scratch buffer API
// TODO(b/132070898): Move these temp variables to the new scratch buffer API
// when ready.
int32_t scratch_tensor[kScratchTensorMaxSize];
int32_t scratch_output_tensor[kScratchTensorMaxSize];

// Rewrite last bit of state.
// Shift states.
int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);

// Left shift the activation_state.
{
for (int b = 0; b < n_batch; ++b) {
int16_t* state_ptr_batch =
GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
for (int c = 0; c < n_filter; ++c) {
int16_t* state_ptr = state_ptr_batch + c * n_memory;
state_ptr[n_memory - 1] = 0;
}
int16_t* new_state_start = state_ptr;
const int16_t* old_state_start = state_ptr + 1;
const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}

// Note: no need to clear the latest activation, matmul is not accumulative.

// Feature matmul.
{
int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
@@ -255,6 +257,12 @@ void EvalIntegerSVDF(
dot_prod =
MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
dot_prod = std::min(std::max(output_min, dot_prod), output_max);
// This assumes state is symmetrically quantized. Otherwise last bit of
// state should be initialized to its zero point and accumulate the
// dot_prod.
// Equivalent as the following:
// result_in_batch = zero point, which happens to be zero.
// result_in_batch += dot_prod_56.
*result_in_batch = dot_prod;
result_in_batch += n_memory;
}
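The comment added above explains why the requantized dot product can be stored straight into the int16 state: the state is symmetrically quantized, so its zero point is 0 and no offset needs to be added before the store. A simplified stand-in for that step, using plain floating point in place of TFLite's fixed-point MultiplyByQuantizedMultiplier and ignoring its exact rounding and saturation behavior:

#include <algorithm>
#include <cstdint>
#include <limits>

// Simplified stand-in for MultiplyByQuantizedMultiplier(dot_prod, scale_1_a,
// scale_1_b): effective_scale plays the role of the (multiplier, shift) pair.
int16_t RequantizeToStateSketch(int32_t dot_prod, double effective_scale) {
  const double scaled = static_cast<double>(dot_prod) * effective_scale;
  const int32_t rounded =
      static_cast<int32_t>(scaled >= 0.0 ? scaled + 0.5 : scaled - 0.5);
  // Symmetric state quantization: zero point is 0, so no offset is added
  // before clamping to the int16 state range.
  const int32_t clamped = std::min<int32_t>(
      std::max<int32_t>(rounded, std::numeric_limits<int16_t>::min()),
      std::numeric_limits<int16_t>::max());
  return static_cast<int16_t>(clamped);
}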
@@ -326,26 +334,6 @@ void EvalIntegerSVDF(
GetTensorData<int8_t>(output_tensor)[i] = static_cast<int8_t>(x4);
}
}

// Shift state.
{
for (int b = 0; b < n_batch; ++b) {
int16_t* state_ptr_batch =
GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
for (int f = 0; f < n_filter; ++f) {
// Shift the vector left:
int16_t* batch_ptr = state_ptr_batch;
int16_t* batch_start = state_ptr_batch + 1;
int16_t* batch_end = state_ptr_batch + n_memory;
while (batch_start != batch_end) {
*batch_ptr++ = *batch_start++;
}
state_ptr_batch[n_memory - 1] = 0;
state_ptr_batch += n_memory;
}
}
}
}

} // namespace
@@ -385,12 +373,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const int rank = params->rank;
const int input_size = input->dims->data[1];
const int batch_size = input->dims->data[0];
// Ensure the input size is a multiple of two. This is necessary since
// optimized kernels access the memory in chunks of two, and all accesses
// must be aligned to 16 bits.
// TODO(b/153202598): Remove when padding is allowed in TFLite tensors.
TF_LITE_ENSURE_EQ(context, input_size % 2, 0);

const int num_filters = weights_feature->dims->data[0];
TF_LITE_ENSURE_EQ(context, num_filters % rank, 0);
const int num_units = num_filters / rank;
@@ -446,13 +428,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Validate Scratch Tensors:
// [0] = (shared - see float block below for usage)
// [1] = Output Temp, int8_t, {2, num_units, batch_size}
// (b/132070898): Scratch values are used as stack variables in
// TODO(b/132070898): Scratch values are used as stack variables in
// EvalIntegerSVDF().

// Validate output tensor:
TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt8);
} else {
TF_LITE_ENSURE_EQ(context, node->inputs->size, 6);
TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);

// Validate Input Tensor dtypes:
TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32);
@@ -467,19 +449,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// [0] = Holds dot-product of time-forward calculations in
// ApplyTimeWeightsBiasAndActivation():
// float/int32, {2, batch_size, num_filters}
// (b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented (b/132070898) TfLiteTensor*
// scratch_tensor = GetTemporary(context, node, 0);
TfLiteTensor* scratch_tensor = &context->tensors[node->inputs->data[5]];
TF_LITE_ENSURE_EQ(context, scratch_tensor->type, kTfLiteFloat32);

TF_LITE_ENSURE_EQ(context, NumDimensions(scratch_tensor), 2);
TF_LITE_ENSURE_EQ(context, scratch_tensor->dims->data[0], batch_size);
TF_LITE_ENSURE_EQ(context, scratch_tensor->dims->data[1], num_filters);
// TODO(b/132070898): Scratch values are used as stack variables in
// EvalIntegerSVDF().

// Full-float SVDF only uses the one shared scratch tensor (see above for
// usage).
// (b/132070898): Use input tensor as variable until scratch tensor
// TODO(b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented.
// TF_LITE_ENSURE_EQ(context, node->temporaries->size, 1);
TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
@@ -505,18 +480,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {

switch (weights_feature->type) {
case kTfLiteFloat32: {
// (b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented. TfLiteTensor* scratch =
// GetTemporary(context, node, /*index=*/0);
TfLiteTensor* scratch = &context->tensors[node->inputs->data[5]];
return EvalFloatSVDF(context, node, input, weights_feature, weights_time,
bias, params, scratch, activation_state, output);
// TODO(b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented.
// TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0);
return EvalFloatSVDF(context, node, input, weights_feature, weights_time, bias,
params, activation_state, output);
break;
}

case kTfLiteInt8: {
if (is_full_integer) {
// (b/132070898): Store these values in ::Prepare() instead of
// TODO(b/132070898): Store these values in ::Prepare() instead of
// ::Eval():
// Calculate effective scales.
OpData op_data;
@@ -574,7 +548,6 @@ TfLiteRegistration* Register_SVDF() {
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};

return &r;
}

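The main structural change in svdf.cc is that the old end-of-eval, per-batch, per-filter state shift (with an explicit zero of the last element) is replaced by a single left shift of the whole activation_state buffer before the feature matmul, which then writes the newest activation into the rightmost column (state_ptr[memory_size - 1], stride memory_size). The diff also drops the separate scratch input tensor (node->inputs->size goes from 6 to 5) in favor of a stack array of kScratchTensorMaxSize elements, and removes the input_size-multiple-of-two check. A standalone sketch of the new shift, with an invented helper name; the diff inlines this logic in both the float and integer paths rather than calling a helper:

// Sketch of the reworked state rotation, mirroring the shift loops in the
// diff above. T is float for EvalFloatSVDF and int16_t for EvalIntegerSVDF.
template <typename T>
void ShiftStateLeftSketch(T* state, int batch_size, int num_filters,
                          int memory_size) {
  T* dst = state;
  const T* src = state + 1;
  const T* end = state + batch_size * num_filters * memory_size;
  while (src != end) {
    *dst++ = *src++;
  }
  // No explicit clear of the freed rightmost slots: the feature matmul that
  // follows overwrites them, since it is not accumulative.
}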
4 changes: 2 additions & 2 deletions tensorflow/lite/micro/tools/make/third_party_downloads.inc
@@ -63,6 +63,6 @@ EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776"
EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/6316034d421cbbb59756239908d7c9a99075a3bb.zip"
EMBARC_MLI_MD5 := "db0910cf0e07e43f74ae7a31de485d56"

XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib.zip"
XTENSA_HIFI4_MD5 :="a517b653a75b96d0271e1b99ee2a8c14"
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip"
XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b"