From ce29f31b04b2e9af732acf94e3e86ec4d83317dd Mon Sep 17 00:00:00 2001 From: Fredrik Knutsson Date: Thu, 20 Jun 2019 09:18:23 +0200 Subject: [PATCH] Call CMSIS-NN optimized kernel for depthwise_conv By using TAGS=cmsis-nn, the optimized depthwise conv is called, under the restriction that the kernel meets size requirements. Change-Id: I0a070b37ce7dcd06dd8c747de362b4fd42ed4e5a --- .../micro/kernels/cmsis-nn/depthwise_conv.cc | 64 +++++++++++++++---- .../micro/tools/make/ext_libs/cmsis.inc | 20 +++--- 2 files changed, 60 insertions(+), 24 deletions(-) diff --git a/tensorflow/lite/experimental/micro/kernels/cmsis-nn/depthwise_conv.cc b/tensorflow/lite/experimental/micro/kernels/cmsis-nn/depthwise_conv.cc index 8e3f4b9fe3fbab..a77ebd9fb7118e 100644 --- a/tensorflow/lite/experimental/micro/kernels/cmsis-nn/depthwise_conv.cc +++ b/tensorflow/lite/experimental/micro/kernels/cmsis-nn/depthwise_conv.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" +#include "arm_nnfunctions.h" namespace tflite { namespace ops { @@ -145,22 +146,58 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. 
op_params.output_shift = -data->output_shift; - tflite::reference_ops::DepthwiseConv( - op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), - GetTensorShape(filter), GetTensorData<uint8_t>(filter), - GetTensorShape(bias), GetTensorData<int32_t>(bias), - GetTensorShape(output), GetTensorData<uint8_t>(output)); + +#if defined(ARM_MATH_DSP) + // optimizations utilize loop unrolling which requires the depth + // multiplier and the filter width to be even + RuntimeShape filter_shape = GetTensorShape(filter); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + if (0 == op_params.depth_multiplier % 2 && 0 == filter_width % 2) { + RuntimeShape input_shape = GetTensorShape(input); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + RuntimeShape output_shape = GetTensorShape(output); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + arm_depthwise_conv_u8_basic_ver1(GetTensorData<uint8_t>(input), + input_width, + input_height, + input_depth, + GetTensorData<uint8_t>(filter), + filter_width, + filter_height, + op_params.depth_multiplier, + op_params.padding_values.width, + op_params.padding_values.height, + op_params.stride_width, + op_params.stride_height, + op_params.dilation_width_factor, + op_params.dilation_height_factor, + GetTensorData<int32_t>(bias), + op_params.input_offset, + op_params.weights_offset, + op_params.output_offset, + GetTensorData<uint8_t>(output), + output_width, + output_height, + op_params.quantized_activation_min, + op_params.quantized_activation_max, + op_params.output_shift, + op_params.output_multiplier); + } else +#endif + { + tflite::reference_ops::DepthwiseConv( + op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), + GetTensorShape(filter), GetTensorData<uint8_t>(filter), + GetTensorShape(bias), GetTensorData<int32_t>(bias), + GetTensorShape(output), GetTensorData<uint8_t>(output)); + } } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* 
node) { -#ifdef ARM_CMSIS_NN_M3 - return kTfLiteError; -#elif ARM_CMSIS_NN_M4 - // Todo: call cmsis ops - return kTfLiteError; -#elif ARM_CMSIS_NN_M7 - return kTfLiteError; -#else auto* params = reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data); @@ -200,7 +237,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } return kTfLiteOk; -#endif } } // namespace depthwise_conv diff --git a/tensorflow/lite/experimental/micro/tools/make/ext_libs/cmsis.inc b/tensorflow/lite/experimental/micro/tools/make/ext_libs/cmsis.inc index 779b77f8be6233..c15828db5f040e 100644 --- a/tensorflow/lite/experimental/micro/tools/make/ext_libs/cmsis.inc +++ b/tensorflow/lite/experimental/micro/tools/make/ext_libs/cmsis.inc @@ -1,26 +1,26 @@ ifneq ($(filter cmsis-nn,$(ALL_TAGS)),) # Enable u-arch specfic behaviours ifneq (,$(filter $(TARGET_ARCH), cortex-m3)) - CCFLAGS += -DARM_MATH_CM3 - CXXFLAGS += -DARM_CMSIS_NN_M3 + # CMSIS-NN optimizations not supported endif ifneq (,$(filter $(TARGET_ARCH), cortex-m4)) - CCFLAGS += -DARM_MATH_CM4 - CXXFLAGS += -DARM_CMSIS_NN_M4 + CCFLAGS += -DARM_MATH_DSP + CXXFLAGS += -DARM_MATH_DSP endif ifneq (,$(filter $(TARGET_ARCH), cortex-m7)) - CCFLAGS += -DARM_MATH_CM7 - CXXFLAGS += -DARM_CMSIS_NN_M7 + CCFLAGS += -DARM_MATH_DSP + CXXFLAGS += -DARM_MATH_DSP endif ifneq (,$(filter $(TARGET_ARCH), x86_64)) - # For development purposes - CCFLAGS += -DARM_MATH_CM4 - CXXFLAGS += -DARM_CMSIS_NN_X86_64 + # CMSIS-NN optimizations not supported endif # Setup CMSIS-NN lib and add required header files to microlite lib INCLUDE CMSIS_PATH = $(MAKEFILE_DIR)/downloads/cmsis/ - MICROLITE_CC_SRCS += $(shell find $(CMSIS_PATH)/CMSIS/NN/Source/ -name *.c) + THIRD_PARTY_CC_SRCS += $(shell find $(CMSIS_PATH)/CMSIS/NN/Source/ -name *.c) + THIRD_PARTY_CC_HDRS += $(shell find $(CMSIS_PATH)/CMSIS/Core/Include/ -name *.h) \ + $(shell find $(CMSIS_PATH)/CMSIS/NN/Include/ -name *.h) \ + $(shell find $(CMSIS_PATH)/CMSIS/DSP/Include/ -name *.h) INCLUDES += 
-I$(CMSIS_PATH)/CMSIS/Core/Include \ -I$(CMSIS_PATH)/CMSIS/NN/Include \ -I$(CMSIS_PATH)/CMSIS/DSP/Include