Skip to content

Commit

Permalink
Marginal improvement by pipelining loads on NEON
Browse files Browse the repository at this point in the history
The ld1{4 reg} variant saves us instructions
and only adds 3 cycles of latency to load 3
more neon/asimd registers worth of data.
  • Loading branch information
KungFuJesus authored and Dead2 committed Feb 1, 2022
1 parent 403a9ae commit 9146bd4
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 18 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Expand Up @@ -628,6 +628,10 @@ if(WITH_OPTIM)
endif()
add_feature_info(NEON_ADLER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"")
add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"")
check_neon_ld4_intrinsics()
if(NEON_HAS_LD4)
add_definitions(-DARM_NEON_HASLD4)
endif()
else()
set(WITH_NEON OFF)
endif()
Expand Down
21 changes: 10 additions & 11 deletions arch/arm/slide_hash_neon.c
Expand Up @@ -16,10 +16,12 @@
#endif
#include "../../zbuild.h"
#include "../../deflate.h"
#include "../../fallback_builtins.h"

/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
Z_REGISTER uint16x8_t v, *p;
Z_REGISTER uint16x8_t v;
uint16x8x4_t p0, p1;
Z_REGISTER size_t n;

size_t size = entries*sizeof(table[0]);
Expand All @@ -28,18 +30,15 @@ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize
Assert(sizeof(Pos) == 2, "Wrong Pos size");
v = vdupq_n_u16(wsize);

p = (uint16x8_t *)table;
n = size / (sizeof(uint16x8_t) * 8);
do {
p[0] = vqsubq_u16(p[0], v);
p[1] = vqsubq_u16(p[1], v);
p[2] = vqsubq_u16(p[2], v);
p[3] = vqsubq_u16(p[3], v);
p[4] = vqsubq_u16(p[4], v);
p[5] = vqsubq_u16(p[5], v);
p[6] = vqsubq_u16(p[6], v);
p[7] = vqsubq_u16(p[7], v);
p += 8;
p0 = vld1q_u16_x4(table);
p1 = vld1q_u16_x4(table+32);
vqsubq_u16_x4_x1(p0, p0, v);
vqsubq_u16_x4_x1(p1, p1, v);
vst1q_u16_x4(table, p0);
vst1q_u16_x4(table+32, p1);
table += 64;
} while (--n);
}

Expand Down
28 changes: 28 additions & 0 deletions cmake/detect-intrinsics.cmake
Expand Up @@ -151,6 +151,34 @@ macro(check_neon_compiler_flag)
set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_neon_ld4_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if("${ARCH}" MATCHES "aarch64")
set(NEONFLAG "-march=armv8-a+simd")
else()
set(NEONFLAG "-mfpu=neon")
endif()
endif()
endif()
# Check whether compiler supports loading 4 neon vecs into a register range
set(CMAKE_REQUIRED_FLAGS "${NEONFLAG}")
check_c_source_compiles(
"#ifdef _M_ARM64
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
int main(void) {
int stack_var[16];
int32x4x4_t v = vld1q_s32_x4(stack_var);
(void)v;
return 0;
}"
NEON_HAS_LD4)
set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_pclmulqdq_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
Expand Down
44 changes: 44 additions & 0 deletions configure
Expand Up @@ -1169,6 +1169,29 @@ EOF
fi
}

check_neon_ld4_intrinsics() {
cat > $test.c << EOF
#ifdef _M_ARM64
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
int main(void) {
int stack_var[16];
int32x4x4_t v = vld1q_s32_x4(stack_var);
(void)v;
return 0;
}
EOF
if try $CC -c $CFLAGS -march=native $test.c; then
NEON_HAS_LD4=1
echo "check whether compiler supports 4 wide register loads ... Yes." | tee -a configure.log
else
NEON_HAS_LD4=0
echo "check whether compiler supports 4 wide register loads ... No." | tee -a configure.log
fi
}

check_pclmulqdq_intrinsics() {
# Check whether compiler supports PCLMULQDQ intrinsics
cat > $test.c << EOF
Expand Down Expand Up @@ -1658,6 +1681,7 @@ EOF
if test $without_optimizations -eq 0; then
check_acle_compiler_flag
check_neon_compiler_flag
check_neon_ld4_intrinsics
fi

case "${ARCH}" in
Expand Down Expand Up @@ -1700,6 +1724,11 @@ EOF
neonflag="-mfpu=neon"
fi

if test $NEON_HAS_LD4 -eq 1; then
CFLAGS="${CFLAGS} -DARM_NEON_HASLD4"
SFLAGS="${SFLAGS} -DARM_NEON_HASLD4"
fi

CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"

Expand All @@ -1722,6 +1751,11 @@ EOF
neonflag="-mfpu=neon"
fi

if test $NEON_HAS_LD4 -eq 1; then
CFLAGS="${CFLAGS} -DARM_NEON_HASLD4"
SFLAGS="${SFLAGS} -DARM_NEON_HASLD4"
fi

CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"

Expand Down Expand Up @@ -1750,6 +1784,11 @@ EOF
neonflag="-mfpu=neon"
fi

if test $NEON_HAS_LD4 -eq 1; then
CFLAGS="${CFLAGS} -DARM_NEON_HASLD4"
SFLAGS="${SFLAGS} -DARM_NEON_HASLD4"
fi

CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"

Expand Down Expand Up @@ -1793,6 +1832,11 @@ EOF
fi
fi

if test $NEON_HAS_LD4 -eq 1; then
CFLAGS="${CFLAGS} -DARM_NEON_HASLD4"
SFLAGS="${SFLAGS} -DARM_NEON_HASLD4"
fi

if test $buildacle -eq 1; then
if test $native -eq 0; then
ARCH="${ARCH}+crc"
Expand Down
45 changes: 42 additions & 3 deletions fallback_builtins.h
Expand Up @@ -37,10 +37,10 @@ static __forceinline unsigned long long __builtin_ctzll(uint64_t value) {
return trailing_zero;
}
#define HAVE_BUILTIN_CTZLL
#endif
#endif // Microsoft AMD64

#endif
#endif
#endif // Microsoft AMD64/IA64/x86/ARM/ARM64 test
#endif // _MSC_VER & !clang

/* Unfortunately GCC didn't support these things until version 10 */
#ifdef __AVX2__
Expand All @@ -63,4 +63,43 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) {
#endif // gcc version 10 test

#endif // __AVX2__

#ifdef ARM_NEON_SLIDEHASH

#define vqsubq_u16_x4_x1(out, a, b) do { \
out.val[0] = vqsubq_u16(a.val[0], b); \
out.val[1] = vqsubq_u16(a.val[1], b); \
out.val[2] = vqsubq_u16(a.val[2], b); \
out.val[3] = vqsubq_u16(a.val[3], b); \
} while (0)

/* Have to check for hard float ABI on GCC/clang, but not
* on MSVC (we don't compile for the soft float ABI on windows)
*/
#if !defined(ARM_NEON_HASLD4) && (defined(__ARM_FP) || defined(_MSC_VER))

#ifdef _M_ARM64
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif

static inline uint16x8x4_t vld1q_u16_x4(uint16_t *a) {
uint16x8x4_t ret = (uint16x8x4_t) {{
vld1q_u16(a),
vld1q_u16(a+8),
vld1q_u16(a+16),
vld1q_u16(a+24)}};
return ret;
}

static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
vst1q_u16(p, a.val[0]);
vst1q_u16(p + 8, a.val[1]);
vst1q_u16(p + 16, a.val[2]);
vst1q_u16(p + 24, a.val[3]);
}
#endif // HASLD4 check and hard float
#endif // ARM_NEON_SLIDEHASH

#endif // include guard FALLBACK_BUILTINS_H
5 changes: 3 additions & 2 deletions win32/Makefile.a64
Expand Up @@ -25,12 +25,13 @@ RC = rc
CP = copy /y
CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC)
WFLAGS = \
-D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
-D_CRT_SECURE_NO_DEPRECATE \
-D_CRT_NONSTDC_NO_DEPRECATE \
-DARM_NEON_HASLD4 \
-DARM_FEATURES \
-DUNALIGNED_OK \
-DUNALIGNED64_OK \
-D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
-DARM_FEATURES \
#
LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
ARFLAGS = -nologo
Expand Down
5 changes: 3 additions & 2 deletions win32/Makefile.arm
Expand Up @@ -25,11 +25,12 @@ RC = rc
CP = copy /y
CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC)
WFLAGS = \
-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
-D_CRT_SECURE_NO_DEPRECATE \
-D_CRT_NONSTDC_NO_DEPRECATE \
-DUNALIGNED_OK \
-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
-DARM_FEATURES \
-DARM_NEON_HASLD4 \
-DUNALIGNED_OK \
#
LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
ARFLAGS = -nologo
Expand Down

0 comments on commit 9146bd4

Please sign in to comment.