QLinearConv speed up (#3196)
For x86/x64 builds, change the QLinearConv op to use MLAS for the u8u8=s32 GEMM, then requantize the intermediate buffer to u8.
tracysh committed Mar 13, 2020
1 parent 0a1257e commit fe0b2b2
Showing 14 changed files with 573 additions and 319 deletions.
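Per the ONNX QLinearConv definition, the int32 accumulators coming out of the u8u8=s32 GEMM carry a combined scale of x_scale * w_scale, so requantizing back to u8 multiplies by (x_scale * w_scale) / y_scale before re-centering on the output zero point. A minimal scalar sketch of that step, with illustrative names rather than the MLAS API:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// One s32 accumulator -> one u8 output value (illustrative sketch, not MLAS code).
uint8_t RequantizeOne(int32_t acc, int32_t bias, float scale, uint8_t zero_point) {
    // Work in the zero-point-adjusted domain so the clamp bounds are constants.
    float value = static_cast<float>(acc + bias) * scale;
    value = std::max(value, static_cast<float>(0 - zero_point));
    value = std::min(value, static_cast<float>(255 - zero_point));
    // nearbyintf honors the current rounding mode; the SSE2 code below assumes
    // the default round-to-nearest-even.
    return static_cast<uint8_t>(static_cast<int32_t>(std::nearbyintf(value)) + zero_point);
}
```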
38 changes: 16 additions & 22 deletions cmake/CMakeLists.txt
@@ -60,7 +60,6 @@ option(onnxruntime_USE_EIGEN_FOR_BLAS "Use eign for blas" ON)
option(onnxruntime_USE_NNAPI "Build with DNNLibrary for Android NNAPI support" OFF)
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
option(onnxruntime_USE_MKLML "Build DNNL with MKL-ML binary dependency" OFF)
-option(onnxruntime_USE_GEMMLOWP "Build with gemmlowp for quantized gemm" OFF)
option(onnxruntime_USE_FEATURIZERS "Build ML Featurizers support" OFF)
option(onnxruntime_USE_NGRAPH "Build with nGraph support" OFF)
option(onnxruntime_USE_OPENBLAS "Use openblas" OFF)
@@ -206,7 +205,7 @@ if (MSVC)
set(gtest_force_shared_crt ON CACHE BOOL "Use shared (DLL) run-time lib for gtest" FORCE)
endif()
#Always enable exception handling, even for Windows ARM
-SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
+SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
if (onnxruntime_ENABLE_LTO AND NOT onnxruntime_USE_CUDA)
SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL")
SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL")
@@ -235,7 +234,7 @@ else()
string(APPEND CMAKE_C_FLAGS_RELEASE " -march=native -mtune=native")
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -march=native -mtune=native")
string(APPEND CMAKE_C_FLAGS_RELWITHDEBINFO " -march=native -mtune=native")
-endif()
+endif()
endif()

if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
@@ -295,7 +294,7 @@ if(onnxruntime_BUILD_BENCHMARKS)
# We will not need to install benchmark since we link it statically.
set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "Disable benchmark install to avoid overwriting vendor install.")
add_subdirectory(${PROJECT_SOURCE_DIR}/external/onnx/third_party/benchmark EXCLUDE_FROM_ALL)
-endif()
+endif()
endif()

if(NOT WIN32)
@@ -321,16 +320,16 @@ if(Protobuf_FOUND OR Protobuf_FOUND)
message("Use protobuf from preinstalled system lib")
if (onnxruntime_USE_FULL_PROTOBUF)
set(PROTOBUF_LIB protobuf::libprotobuf)
-#We have a check here but most of the cmake users don't know the Protobuf_USE_STATIC_LIBS
+#We have a check here but most of the cmake users don't know the Protobuf_USE_STATIC_LIBS
# variable exists and may leave it in a wrong state.
if(NOT Protobuf_USE_STATIC_LIBS)
-#Indeed here should be a warning, not a fatal error. ONNX Runtime itself can work in such a
+#Indeed here should be a warning, not a fatal error. ONNX Runtime itself can work in such a
#setting but it may cause compatibility issue when ONNX Runtime is integrated with the other ONNX ecosystem softwares.
message(FATAL_ERROR "Please enable Protobuf_USE_STATIC_LIBS")
endif()
else()
set(PROTOBUF_LIB protobuf::libprotobuf-lite)
-endif()
+endif()
else()
message("Use protobuf from submodule")
# use protobuf as a submodule
@@ -397,11 +396,11 @@ set(ONNXRUNTIME_INCLUDE_DIR ${REPO_ROOT}/include/onnxruntime)
add_subdirectory(external/date EXCLUDE_FROM_ALL)

if(onnxruntime_PREFER_SYSTEM_LIB)
-find_package(re2)
+find_package(re2)
endif()

set(SAFEINT_INCLUDE_DIR ${REPO_ROOT}/cmake/external/SafeInt)
-add_library(safeint_interface INTERFACE)
+add_library(safeint_interface INTERFACE)
target_include_directories(safeint_interface INTERFACE ${SAFEINT_INCLUDE_DIR})

if(NOT TARGET re2::re2)
@@ -501,8 +500,8 @@ if (onnxruntime_USE_TVM)
endif()

if (APPLE)
-#onnx/onnx/proto_utils.h:34:16: error: 'SetTotalBytesLimit' is deprecated: Please use the single
-#parameter version of SetTotalBytesLimit(). The second parameter is ignored.
+#onnx/onnx/proto_utils.h:34:16: error: 'SetTotalBytesLimit' is deprecated: Please use the single
+#parameter version of SetTotalBytesLimit(). The second parameter is ignored.
# coded_stream.SetTotalBytesLimit((2048LL << 20) - 1, 512LL << 20);
#TODO: fix the warning in ONNX and re-enable this flag
string(APPEND CMAKE_CXX_FLAGS " -Wno-deprecated")
@@ -540,7 +539,7 @@ if (WIN32)
# disable warning because there are many occurrences from test macros
" /wd6326 " # potential comparison of a constant with another constant
)
-endif()
+endif()

# Treat warning as error if onnxruntime_DEV_MODE is ON
# For cross-compiled ARM64 binaries, there are too many warnings to fix, hence ignore warnings for now
@@ -629,10 +628,6 @@ include_directories(
${REPO_ROOT}/include/onnxruntime/core/session
)

-if(onnxruntime_USE_GEMMLOWP)
-add_definitions(-DUSE_GEMMLOWP=1)
-endif()
-
if (onnxruntime_USE_MKLML)
add_definitions(-DUSE_MKLML=1 -DUSE_MKLML_FOR_BLAS=1)
if (WIN32 OR APPLE)
@@ -694,7 +689,6 @@ if(onnxruntime_USE_OPENVINO)

endif()

-
if (onnxruntime_USE_OPENBLAS)
add_definitions(-DUSE_OPENBLAS=1)
if (WIN32)
@@ -812,7 +806,7 @@ foreach(provider_name ${ONNXRUNTIME_PROVIDER_NAMES})
target_compile_options(onnxruntime_providers_${provider_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /sdl>" "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/sdl>")
else()
target_compile_definitions(onnxruntime_providers_${provider_name} PUBLIC -DNSYNC_ATOMIC_CPP11)
-target_include_directories(onnxruntime_providers_${provider_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT} "${CMAKE_CURRENT_SOURCE_DIR}/external/nsync/public")
+target_include_directories(onnxruntime_providers_${provider_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT} "${CMAKE_CURRENT_SOURCE_DIR}/external/nsync/public")
endif()
endif()
endforeach()
@@ -827,23 +821,23 @@ if(WIN32)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES Shlwapi)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES debug Dbghelp)
else()
-list(APPEND onnxruntime_EXTERNAL_LIBRARIES nsync_cpp)
+list(APPEND onnxruntime_EXTERNAL_LIBRARIES nsync_cpp)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${CMAKE_DL_LIBS} Threads::Threads)
endif()

-# Default version parts for Windows.AI.MachineLearning.dll and onnxruntime.dll in non-ADO pipeline local builds
+# Default version parts for Windows.AI.MachineLearning.dll and onnxruntime.dll in non-ADO pipeline local builds
set(VERSION_MAJOR_PART 0 CACHE STRING "First part of numeric file/product version.")
set(VERSION_MINOR_PART 0 CACHE STRING "Second part of numeric file/product version.")
set(VERSION_BUILD_PART 0 CACHE STRING "Third part of numeric file/product version.")
set(VERSION_PRIVATE_PART 0 CACHE STRING "Fourth part of numeric file/product version.")
set(VERSION_STRING "Internal Build" CACHE STRING "String representation of file/product version.")

if (onnxruntime_USE_WINML)
-# WINML uses and depends on the shared lib. Note: You can build WINML without DML and you will get a
+# WINML uses and depends on the shared lib. Note: You can build WINML without DML and you will get a
# CPU only WINML
if (NOT onnxruntime_BUILD_SHARED_LIB)
message(
-FATAL_ERROR
+FATAL_ERROR
"Option onnxruntime_USE_WINML can only be used when onnxruntime_BUILD_SHARED_LIB is also enabled")
endif()
include(wil.cmake)
12 changes: 12 additions & 0 deletions onnxruntime/core/mlas/inc/mlas.h
@@ -439,3 +439,15 @@ MlasQuantizeLinear(
float Scale,
int8_t ZeroPoint
);

void
MLASCALL
MlasRequantizeOutput(
const int32_t* Input,
uint8_t* Output,
const int32_t* Bias,
size_t M,
size_t N,
float Scale,
uint8_t ZeroPoint
);
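A plausible call site for the new entry point, assuming the in-tree include path and illustrative variable names (this helper is not part of the commit):

```cpp
#include <cstddef>
#include <cstdint>
#include "core/mlas/inc/mlas.h"

// Hypothetical helper: requantize an M x N s32 GEMM result into the u8 output.
void RequantizeGemmResult(const int32_t* gemm_output, uint8_t* y_data,
                          const int32_t* bias_data,  // one entry per row, or nullptr
                          size_t M, size_t N,
                          float x_scale, float w_scale, float y_scale,
                          uint8_t y_zero_point) {
  // The accumulators carry a scale of x_scale * w_scale; fold in 1 / y_scale.
  const float requant_scale = (x_scale * w_scale) / y_scale;
  MlasRequantizeOutput(gemm_output, y_data, bias_data, M, N,
                       requant_scale, y_zero_point);
}
```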
136 changes: 136 additions & 0 deletions onnxruntime/core/mlas/lib/quantize.cpp
@@ -284,3 +284,139 @@ Return Value:
{
return MlasQuantizeLinearKernel<int8_t, -127, 127>(Input, Output, N, Scale, ZeroPoint);
}

#if defined(MLAS_SSE2_INTRINSICS)

MLAS_FORCEINLINE
MLAS_INT32X4
MlasRequantizeOutputVector(
MLAS_INT32X4 IntegerVector,
MLAS_INT32X4 BiasVector,
MLAS_FLOAT32X4 ScaleVector,
MLAS_FLOAT32X4 MinimumValueVector,
MLAS_FLOAT32X4 MaximumValueVector,
MLAS_INT32X4 ZeroPointVector
)
{
IntegerVector = _mm_add_epi32(IntegerVector, BiasVector);
MLAS_FLOAT32X4 FloatVector = _mm_cvtepi32_ps(IntegerVector);

//
// Scale the input vector and clamp the values to the minimum and maximum
// range (adjusted by the zero point value).
//

FloatVector = MlasMultiplyFloat32x4(FloatVector, ScaleVector);

// N.B. MINPS and MAXPS return the value from the second vector if the
// value from the first vector is a NaN.
FloatVector = _mm_max_ps(FloatVector, MinimumValueVector);
FloatVector = _mm_min_ps(FloatVector, MaximumValueVector);

//
// Convert the float values to integer using "round to nearest even" and
// then shift the output range using the zero point value.
//

// N.B. Assumes MXCSR has been configured with the default rounding mode of
// "round to nearest even".
IntegerVector = _mm_cvtps_epi32(FloatVector);
IntegerVector = _mm_add_epi32(IntegerVector, ZeroPointVector);

return IntegerVector;
}

void
MLASCALL
MlasRequantizeOutput(
const int32_t* Input,
uint8_t* Output,
const int32_t* Bias,
size_t M,
size_t N,
float Scale,
uint8_t ZeroPoint
)
/*++
Routine Description:
This routine requantizes the intermediate buffer to the output buffer,
optionally adding the supplied bias.
Arguments:
Input - Supplies the input matrix.
Output - Supplies the output matrix.
Bias - Supplies the optional bias vector to be added to the input buffer
before requantization.
M - Supplies the number of elements of the bias vector and the number of
rows in the output matrix.
N - Supplies the number of columns of the output matrix.
Scale - Supplies the quantization scale.
ZeroPoint - Supplies the quantization zero point value.
Return Value:
None.
--*/
{
MLAS_FLOAT32X4 ScaleVector = MlasBroadcastFloat32x4(Scale);
MLAS_FLOAT32X4 MinimumValueVector = MlasBroadcastFloat32x4(float(0 - ZeroPoint));
MLAS_FLOAT32X4 MaximumValueVector = MlasBroadcastFloat32x4(float(255 - ZeroPoint));
MLAS_INT32X4 ZeroPointVector = MlasBroadcastInt32x4(ZeroPoint);
MLAS_INT32X4 BiasVector = _mm_setzero_si128();

//
// Step through each row of the output matrix.
//

while (M-- > 0) {

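// The optional bias is per-row: broadcast this row's value to all four
// lanes (BiasVector stays zero when no bias is supplied).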
if (Bias != nullptr) {
BiasVector = MlasBroadcastInt32x4(*Bias++);
}

size_t n = N;

while (n >= 4) {

MLAS_INT32X4 IntegerVector = _mm_loadu_si128((const __m128i *)Input);
IntegerVector = MlasRequantizeOutputVector(IntegerVector, BiasVector,
ScaleVector, MinimumValueVector, MaximumValueVector, ZeroPointVector);

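// Each lane is now clamped to [0, 255], so two unsigned-saturating int16
// packs collapse the four s32 lanes into the low 32 bits of the vector.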
IntegerVector = _mm_packus_epi16(IntegerVector, IntegerVector);
IntegerVector = _mm_packus_epi16(IntegerVector, IntegerVector);

*((int32_t*)Output) = _mm_cvtsi128_si32(IntegerVector);

Input += 4;
Output += 4;
n -= 4;
}

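// Process the remaining 1-3 columns one at a time through the same vector
// path, with a single active lane.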
while (n > 0) {

MLAS_INT32X4 IntegerVector = _mm_cvtsi32_si128(*Input);
IntegerVector = MlasRequantizeOutputVector(IntegerVector, BiasVector,
ScaleVector, MinimumValueVector, MaximumValueVector, ZeroPointVector);

*Output = (uint8_t)_mm_cvtsi128_si32(IntegerVector);

Input += 1;
Output += 1;
n -= 1;
}
}
}

#endif
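As a concrete check of the arithmetic with illustrative values: for an accumulator of 1000, a bias of 24, a scale of 0.02, and a zero point of 128, the clamp bounds are -128 and 127; (1000 + 24) * 0.02 = 20.48, which rounds to 20 and maps to the output byte 20 + 128 = 148.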
17 changes: 9 additions & 8 deletions onnxruntime/core/providers/cpu/nn/conv_integer.cc
@@ -107,14 +107,15 @@ Status ConvInteger::Compute(OpKernelContext* context) const {
concurrency::ThreadPool* thread_pool = context->GetOperatorThreadPool();

const auto* Xdata = X->template Data<uint8_t>();
+const auto* Wdata = W->template Data<uint8_t>();
auto* Ydata = Y->template MutableData<int32_t>();

for (int image_id = 0; image_id < N; ++image_id) {
for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) {
if (col_buffer_data != nullptr) {
if (kernel_rank == 2) {
math::Im2col<uint8_t, StorageOrder::NCHW>()(
-Xdata + group_id * X_offset,
+Xdata,
C / conv_attrs_.group,
input_shape[0],
input_shape[1],
@@ -132,7 +133,7 @@
input_offset);
} else {
math::Im2colNd<uint8_t, StorageOrder::NCHW>()(
-Xdata + group_id * X_offset,
+Xdata,
X->Shape().GetDims().data() + 1,
col_buffer_shape.data(),
C * input_image_size,
@@ -151,19 +152,19 @@
QGemmu8u8_s32(static_cast<int>(M / conv_attrs_.group),
static_cast<int>(output_image_size),
static_cast<int>(kernel_dim),
-W->template Data<uint8_t>() + group_id * W_offset,
+Wdata + group_id * W_offset,
static_cast<int>(kernel_dim),
filter_offset,
-col_buffer_data == nullptr ? Xdata + group_id * X_offset : col_buffer_data,
+col_buffer_data == nullptr ? Xdata : col_buffer_data,
static_cast<int>(output_image_size),
input_offset,
-Ydata + group_id * Y_offset,
+Ydata,
static_cast<int>(output_image_size),
thread_pool);
}

-Xdata += X_offset * conv_attrs_.group;
-Ydata += Y_offset * conv_attrs_.group;
+Xdata += X_offset;
+Ydata += Y_offset;
}
}

return Status::OK();
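The pointer arithmetic in this file is equivalence-preserving: the old code indexed each group from a per-image base and advanced the base a whole image at a time, while the new code advances one group per iteration. A sketch of why the address sequences match (illustrative, not code from the file):

```cpp
// For image i in [0, N) and group g in [0, G), both forms visit
//   base0 + (i * G + g) * X_offset
// old: Xdata + g * X_offset inside the loop, Xdata += G * X_offset per image
// new: Xdata used directly, Xdata += X_offset after every (i, g) iteration
```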
(Diffs for the remaining 10 files are not shown.)
