QLinearConv speed up (#3196)
For x86/x64 builds, change the QLinearConv op to use MLAS for the u8u8=s32 GEMM, then requantize the intermediate buffer to u8.
tracysh committed Mar 13, 2020
1 parent 0a1257e commit fe0b2b2
Showing 14 changed files with 573 additions and 319 deletions.
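Per the ONNX QLinearConv definition, the int32 accumulators coming out of the u8u8=s32 GEMM carry a combined scale of x_scale * w_scale, so requantizing back to u8 multiplies by (x_scale * w_scale) / y_scale before re-centering on the output zero point. A minimal scalar sketch of that step, with illustrative names rather than the MLAS API:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// One s32 accumulator -> one u8 output value (illustrative sketch, not MLAS code).
uint8_t RequantizeOne(int32_t acc, int32_t bias, float scale, uint8_t zero_point) {
    // Work in the zero-point-adjusted domain so the clamp bounds are constants.
    float value = static_cast<float>(acc + bias) * scale;
    value = std::max(value, static_cast<float>(0 - zero_point));
    value = std::min(value, static_cast<float>(255 - zero_point));
    // nearbyintf honors the current rounding mode; the SSE2 code below assumes
    // the default round-to-nearest-even.
    return static_cast<uint8_t>(static_cast<int32_t>(std::nearbyintf(value)) + zero_point);
}
```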
38 changes: 16 additions & 22 deletions cmake/CMakeLists.txt
@@ -60,7 +60,6 @@ option(onnxruntime_USE_EIGEN_FOR_BLAS "Use eign for blas" ON)
option(onnxruntime_USE_NNAPI "Build with DNNLibrary for Android NNAPI support" OFF)
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
option(onnxruntime_USE_MKLML "Build DNNL with MKL-ML binary dependency" OFF)
-option(onnxruntime_USE_GEMMLOWP "Build with gemmlowp for quantized gemm" OFF)
option(onnxruntime_USE_FEATURIZERS "Build ML Featurizers support" OFF)
option(onnxruntime_USE_NGRAPH "Build with nGraph support" OFF)
option(onnxruntime_USE_OPENBLAS "Use openblas" OFF)
@@ -206,7 +205,7 @@ if (MSVC)
set(gtest_force_shared_crt ON CACHE BOOL "Use shared (DLL) run-time lib for gtest" FORCE)
endif()
#Always enable exception handling, even for Windows ARM
-SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
+SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
if (onnxruntime_ENABLE_LTO AND NOT onnxruntime_USE_CUDA)
SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL")
SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL")
@@ -235,7 +234,7 @@ else()
string(APPEND CMAKE_C_FLAGS_RELEASE " -march=native -mtune=native")
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -march=native -mtune=native")
string(APPEND CMAKE_C_FLAGS_RELWITHDEBINFO " -march=native -mtune=native")
-endif()
+endif()
endif()

if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
@@ -295,7 +294,7 @@ if(onnxruntime_BUILD_BENCHMARKS)
# We will not need to install benchmark since we link it statically.
set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "Disable benchmark install to avoid overwriting vendor install.")
add_subdirectory(${PROJECT_SOURCE_DIR}/external/onnx/third_party/benchmark EXCLUDE_FROM_ALL)
-endif()
+endif()
endif()

if(NOT WIN32)
@@ -321,16 +320,16 @@ if(Protobuf_FOUND OR Protobuf_FOUND)
message("Use protobuf from preinstalled system lib")
if (onnxruntime_USE_FULL_PROTOBUF)
set(PROTOBUF_LIB protobuf::libprotobuf)
-#We have a check here but most of the cmake users don't know the Protobuf_USE_STATIC_LIBS
+#We have a check here but most of the cmake users don't know the Protobuf_USE_STATIC_LIBS
# variable exists and may leave it in a wrong state.
if(NOT Protobuf_USE_STATIC_LIBS)
-#Indeed here should be a warning, not a fatal error. ONNX Runtime itself can work in such a
+#Indeed here should be a warning, not a fatal error. ONNX Runtime itself can work in such a
#setting but it may cause compatibility issue when ONNX Runtime is integrated with the other ONNX ecosystem softwares.
message(FATAL_ERROR "Please enable Protobuf_USE_STATIC_LIBS")
endif()
else()
set(PROTOBUF_LIB protobuf::libprotobuf-lite)
-endif()
+endif()
else()
message("Use protobuf from submodule")
# use protobuf as a submodule
@@ -397,11 +396,11 @@ set(ONNXRUNTIME_INCLUDE_DIR ${REPO_ROOT}/include/onnxruntime)
add_subdirectory(external/date EXCLUDE_FROM_ALL)

if(onnxruntime_PREFER_SYSTEM_LIB)
-find_package(re2)
+find_package(re2)
endif()

set(SAFEINT_INCLUDE_DIR ${REPO_ROOT}/cmake/external/SafeInt)
-add_library(safeint_interface INTERFACE)
+add_library(safeint_interface INTERFACE)
target_include_directories(safeint_interface INTERFACE ${SAFEINT_INCLUDE_DIR})

if(NOT TARGET re2::re2)
@@ -501,8 +500,8 @@ if (onnxruntime_USE_TVM)
endif()

if (APPLE)
-#onnx/onnx/proto_utils.h:34:16: error: 'SetTotalBytesLimit' is deprecated: Please use the single
-#parameter version of SetTotalBytesLimit(). The second parameter is ignored.
+#onnx/onnx/proto_utils.h:34:16: error: 'SetTotalBytesLimit' is deprecated: Please use the single
+#parameter version of SetTotalBytesLimit(). The second parameter is ignored.
# coded_stream.SetTotalBytesLimit((2048LL << 20) - 1, 512LL << 20);
#TODO: fix the warning in ONNX and re-enable this flag
string(APPEND CMAKE_CXX_FLAGS " -Wno-deprecated")
@@ -540,7 +539,7 @@ if (WIN32)
# disable warning because there are many occurrences from test macros
" /wd6326 " # potential comparison of a constant with another constant
)
-endif()
+endif()

# Treat warning as error if onnxruntime_DEV_MODE is ON
# For cross-compiled ARM64 binaries, there are too many warnings to fix, hence ignore warnings for now
@@ -629,10 +628,6 @@ include_directories(
${REPO_ROOT}/include/onnxruntime/core/session
)

-if(onnxruntime_USE_GEMMLOWP)
-add_definitions(-DUSE_GEMMLOWP=1)
-endif()
-
if (onnxruntime_USE_MKLML)
add_definitions(-DUSE_MKLML=1 -DUSE_MKLML_FOR_BLAS=1)
if (WIN32 OR APPLE)
@@ -694,7 +689,6 @@ if(onnxruntime_USE_OPENVINO)

endif()

-
if (onnxruntime_USE_OPENBLAS)
add_definitions(-DUSE_OPENBLAS=1)
if (WIN32)
@@ -812,7 +806,7 @@ foreach(provider_name ${ONNXRUNTIME_PROVIDER_NAMES})
target_compile_options(onnxruntime_providers_${provider_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /sdl>" "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/sdl>")
else()
target_compile_definitions(onnxruntime_providers_${provider_name} PUBLIC -DNSYNC_ATOMIC_CPP11)
-target_include_directories(onnxruntime_providers_${provider_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT} "${CMAKE_CURRENT_SOURCE_DIR}/external/nsync/public")
+target_include_directories(onnxruntime_providers_${provider_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT} "${CMAKE_CURRENT_SOURCE_DIR}/external/nsync/public")
endif()
endif()
endforeach()
@@ -827,23 +821,23 @@ if(WIN32)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES Shlwapi)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES debug Dbghelp)
else()
-list(APPEND onnxruntime_EXTERNAL_LIBRARIES nsync_cpp)
+list(APPEND onnxruntime_EXTERNAL_LIBRARIES nsync_cpp)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${CMAKE_DL_LIBS} Threads::Threads)
endif()

-# Default version parts for Windows.AI.MachineLearning.dll and onnxruntime.dll in non-ADO pipeline local builds
+# Default version parts for Windows.AI.MachineLearning.dll and onnxruntime.dll in non-ADO pipeline local builds
set(VERSION_MAJOR_PART 0 CACHE STRING "First part of numeric file/product version.")
set(VERSION_MINOR_PART 0 CACHE STRING "Second part of numeric file/product version.")
set(VERSION_BUILD_PART 0 CACHE STRING "Third part of numeric file/product version.")
set(VERSION_PRIVATE_PART 0 CACHE STRING "Fourth part of numeric file/product version.")
set(VERSION_STRING "Internal Build" CACHE STRING "String representation of file/product version.")

if (onnxruntime_USE_WINML)
-# WINML uses and depends on the shared lib. Note: You can build WINML without DML and you will get a
+# WINML uses and depends on the shared lib. Note: You can build WINML without DML and you will get a
# CPU only WINML
if (NOT onnxruntime_BUILD_SHARED_LIB)
message(
-FATAL_ERROR
+FATAL_ERROR
"Option onnxruntime_USE_WINML can only be used when onnxruntime_BUILD_SHARED_LIB is also enabled")
endif()
include(wil.cmake)
12 changes: 12 additions & 0 deletions onnxruntime/core/mlas/inc/mlas.h
@@ -439,3 +439,15 @@ MlasQuantizeLinear(
float Scale,
int8_t ZeroPoint
);

void
MLASCALL
MlasRequantizeOutput(
const int32_t* Input,
uint8_t* Output,
const int32_t* Bias,
size_t M,
size_t N,
float Scale,
uint8_t ZeroPoint
);
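A plausible call site for the new entry point, assuming the in-tree include path and illustrative variable names (this helper is not part of the commit):

```cpp
#include <cstddef>
#include <cstdint>
#include "core/mlas/inc/mlas.h"

// Hypothetical helper: requantize an M x N s32 GEMM result into the u8 output.
void RequantizeGemmResult(const int32_t* gemm_output, uint8_t* y_data,
                          const int32_t* bias_data,  // one entry per row, or nullptr
                          size_t M, size_t N,
                          float x_scale, float w_scale, float y_scale,
                          uint8_t y_zero_point) {
  // The accumulators carry a scale of x_scale * w_scale; fold in 1 / y_scale.
  const float requant_scale = (x_scale * w_scale) / y_scale;
  MlasRequantizeOutput(gemm_output, y_data, bias_data, M, N,
                       requant_scale, y_zero_point);
}
```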
136 changes: 136 additions & 0 deletions onnxruntime/core/mlas/lib/quantize.cpp
@@ -284,3 +284,139 @@ Return Value:
{
return MlasQuantizeLinearKernel<int8_t, -127, 127>(Input, Output, N, Scale, ZeroPoint);
}

#if defined(MLAS_SSE2_INTRINSICS)

MLAS_FORCEINLINE
MLAS_INT32X4
MlasRequantizeOutputVector(
MLAS_INT32X4 IntegerVector,
MLAS_INT32X4 BiasVector,
MLAS_FLOAT32X4 ScaleVector,
MLAS_FLOAT32X4 MinimumValueVector,
MLAS_FLOAT32X4 MaximumValueVector,
MLAS_INT32X4 ZeroPointVector
)
{
IntegerVector = _mm_add_epi32(IntegerVector, BiasVector);
MLAS_FLOAT32X4 FloatVector = _mm_cvtepi32_ps(IntegerVector);

//
// Scale the input vector and clamp the values to the minimum and maximum
// range (adjusted by the zero point value).
//

FloatVector = MlasMultiplyFloat32x4(FloatVector, ScaleVector);

// N.B. MINPS and MAXPS return the value from the second vector if the
// value from the first vector is a NaN.
FloatVector = _mm_max_ps(FloatVector, MinimumValueVector);
FloatVector = _mm_min_ps(FloatVector, MaximumValueVector);

//
// Convert the float values to integer using "round to nearest even" and
// then shift the output range using the zero point value.
//

// N.B. Assumes MXCSR has been configured with the default rounding mode of
// "round to nearest even".
IntegerVector = _mm_cvtps_epi32(FloatVector);
IntegerVector = _mm_add_epi32(IntegerVector, ZeroPointVector);

return IntegerVector;
}

void
MLASCALL
MlasRequantizeOutput(
const int32_t* Input,
uint8_t* Output,
const int32_t* Bias,
size_t M,
size_t N,
float Scale,
uint8_t ZeroPoint
)
/*++
Routine Description:
This routine requantizes the intermediate buffer to the output buffer,
optionally adding the supplied bias.
Arguments:
Input - Supplies the input matrix.
Output - Supplies the output matrix.
Bias - Supplies the optional bias vector to be added to the input buffer
before requantization.
M - Supplies the number of elements of the bias vector and the number of
rows in the output matrix.
N - Supplies the number of columns of the output matrix.
Scale - Supplies the quantization scale.
ZeroPoint - Supplies the quantization zero point value.
Return Value:
None.
--*/
{
MLAS_FLOAT32X4 ScaleVector = MlasBroadcastFloat32x4(Scale);
MLAS_FLOAT32X4 MinimumValueVector = MlasBroadcastFloat32x4(float(0 - ZeroPoint));
MLAS_FLOAT32X4 MaximumValueVector = MlasBroadcastFloat32x4(float(255 - ZeroPoint));
MLAS_INT32X4 ZeroPointVector = MlasBroadcastInt32x4(ZeroPoint);
MLAS_INT32X4 BiasVector = _mm_setzero_si128();

//
// Step through each row of the output matrix.
//

while (M-- > 0) {

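// The optional bias is per-row: broadcast this row's value to all four
// lanes (BiasVector stays zero when no bias is supplied).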
if (Bias != nullptr) {
BiasVector = MlasBroadcastInt32x4(*Bias++);
}

size_t n = N;

while (n >= 4) {

MLAS_INT32X4 IntegerVector = _mm_loadu_si128((const __m128i *)Input);
IntegerVector = MlasRequantizeOutputVector(IntegerVector, BiasVector,
ScaleVector, MinimumValueVector, MaximumValueVector, ZeroPointVector);

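// Each lane is now clamped to [0, 255], so two unsigned-saturating int16
// packs collapse the four s32 lanes into the low 32 bits of the vector.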
IntegerVector = _mm_packus_epi16(IntegerVector, IntegerVector);
IntegerVector = _mm_packus_epi16(IntegerVector, IntegerVector);

*((int32_t*)Output) = _mm_cvtsi128_si32(IntegerVector);

Input += 4;
Output += 4;
n -= 4;
}

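// Process the remaining 1-3 columns one at a time through the same vector
// path, with a single active lane.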
while (n > 0) {

MLAS_INT32X4 IntegerVector = _mm_cvtsi32_si128(*Input);
IntegerVector = MlasRequantizeOutputVector(IntegerVector, BiasVector,
ScaleVector, MinimumValueVector, MaximumValueVector, ZeroPointVector);

*Output = (uint8_t)_mm_cvtsi128_si32(IntegerVector);

Input += 1;
Output += 1;
n -= 1;
}
}
}

#endif
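As a concrete check of the arithmetic with illustrative values: for an accumulator of 1000, a bias of 24, a scale of 0.02, and a zero point of 128, the clamp bounds are -128 and 127; (1000 + 24) * 0.02 = 20.48, which rounds to 20 and maps to the output byte 20 + 128 = 148.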
17 changes: 9 additions & 8 deletions onnxruntime/core/providers/cpu/nn/conv_integer.cc
@@ -107,14 +107,15 @@ Status ConvInteger::Compute(OpKernelContext* context) const {
concurrency::ThreadPool* thread_pool = context->GetOperatorThreadPool();

const auto* Xdata = X->template Data<uint8_t>();
+const auto* Wdata = W->template Data<uint8_t>();
auto* Ydata = Y->template MutableData<int32_t>();

for (int image_id = 0; image_id < N; ++image_id) {
for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) {
if (col_buffer_data != nullptr) {
if (kernel_rank == 2) {
math::Im2col<uint8_t, StorageOrder::NCHW>()(
-Xdata + group_id * X_offset,
+Xdata,
C / conv_attrs_.group,
input_shape[0],
input_shape[1],
@@ -132,7 +133,7 @@
input_offset);
} else {
math::Im2colNd<uint8_t, StorageOrder::NCHW>()(
-Xdata + group_id * X_offset,
+Xdata,
X->Shape().GetDims().data() + 1,
col_buffer_shape.data(),
C * input_image_size,
@@ -151,19 +152,19 @@
QGemmu8u8_s32(static_cast<int>(M / conv_attrs_.group),
static_cast<int>(output_image_size),
static_cast<int>(kernel_dim),
-W->template Data<uint8_t>() + group_id * W_offset,
+Wdata + group_id * W_offset,
static_cast<int>(kernel_dim),
filter_offset,
-col_buffer_data == nullptr ? Xdata + group_id * X_offset : col_buffer_data,
+col_buffer_data == nullptr ? Xdata : col_buffer_data,
static_cast<int>(output_image_size),
input_offset,
-Ydata + group_id * Y_offset,
+Ydata,
static_cast<int>(output_image_size),
thread_pool);
}

-Xdata += X_offset * conv_attrs_.group;
-Ydata += Y_offset * conv_attrs_.group;
+Xdata += X_offset;
+Ydata += Y_offset;
}
}

return Status::OK();
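The pointer arithmetic in this file is equivalence-preserving: the old code indexed each group from a per-image base and advanced the base a whole image at a time, while the new code advances one group per iteration. A sketch of why the address sequences match (illustrative, not code from the file):

```cpp
// For image i in [0, N) and group g in [0, G), both forms visit
//   base0 + (i * G + g) * X_offset
// old: Xdata + g * X_offset inside the loop, Xdata += G * X_offset per image
// new: Xdata used directly, Xdata += X_offset after every (i, g) iteration
```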
(Diffs for the remaining 10 files are not shown.)
