diff --git a/benchmark/atoi-corpus.h b/benchmark/atoi-corpus.h index e682d3e2..78ed2aba 100644 --- a/benchmark/atoi-corpus.h +++ b/benchmark/atoi-corpus.h @@ -127,12 +127,21 @@ struct CorpusStringLength { #define AVX2_STRLEN_CORPUS_X_LIST /* nothing */ #endif +#if ZOO_CONFIGURED_TO_USE_NEON() +#define NEON_STRLEN_CORPUS_X_LIST \ + X(ZOO_NEON, zoo::neon_strlen) +#else +#define NEON_STRLEN_CORPUS_X_LIST /* nothing */ +#endif + + #define STRLEN_CORPUS_X_LIST \ X(LIBC_STRLEN, strlen) \ X(ZOO_STRLEN, zoo::c_strLength) \ X(ZOO_NATURAL_STRLEN, zoo::c_strLength_natural) \ X(GENERIC_GLIBC_STRLEN, STRLEN_old) \ - AVX2_STRLEN_CORPUS_X_LIST + AVX2_STRLEN_CORPUS_X_LIST \ + NEON_STRLEN_CORPUS_X_LIST #define X(Typename, FunctionToCall) \ struct Invoke##Typename { int operator()(const char *p) { return FunctionToCall(p); } }; diff --git a/benchmark/atoi.cpp b/benchmark/atoi.cpp index 1cc394d2..8d002b08 100644 --- a/benchmark/atoi.cpp +++ b/benchmark/atoi.cpp @@ -325,3 +325,52 @@ STRLEN_old (const char *str) } } } + + +#if ZOO_CONFIGURED_TO_USE_NEON() + +#include + +namespace zoo { + +/// \note uses the key technique of shifting by 4 and narrowing from 16 to 8 bit lanes in +/// aarch64/strlen.S at +/// https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/aarch64/strlen.S;h=ab2a576cdb5665e596b791299af3f4abecb73c0e;hb=HEAD +std::size_t neon_strlen(const char *str) { + const uint8x16_t zero = vdupq_n_u8(0); + size_t offset = 0; + uint8x16_t data; + auto [alignedBase, misalignment] = blockAlignedLoad(str, &data); + + auto compareAndConvertResultsToNibbles = [&]() { + auto cmp = vceqq_u8(data, zero); + // The result looks like, in hexadecimal digits, like this: + // [ AA, BB, CC, DD, EE, FF, GG, HH, ... ] with each + // variable A, B, ... either 0xF or 0x0. + // instead of 16x8 bit results, we can see that as + // 8 16 bit results like this + // [ AABB, CCDD, EEFF, GGHH, ... ] + // If we shift out a nibble from each element (shift right by 4): + // [ ABB0, CDD0, EFF0, GHH0, ... ] + // Narrowing from 16 to eight, we would get + // [ AB, CD, EF, GH, ... ] + auto straddle8bitLanePairAndNarrowToBytes = vshrn_n_u16(cmp, 4); + return vget_lane_u64(vreinterpret_u64_u8(straddle8bitLanePairAndNarrowToBytes), 0); + }; + auto nibbles = compareAndConvertResultsToNibbles(); + auto misalignmentNibbleMask = (~uint64_t(0)) << (misalignment * 4); + nibbles &= misalignmentNibbleMask; + for(;;) { + if(nibbles) { + auto trailingZeroBits = __builtin_ctz(nibbles); + auto nonNullByteCount = trailingZeroBits / 4; + return alignedBase + offset + nonNullByteCount - str; + } + alignedBase += sizeof(uint8x16_t); + memcpy(&data, alignedBase, sizeof(uint8x16_t)); + nibbles = compareAndConvertResultsToNibbles(); + } +} + +} +#endif diff --git a/benchmark/atoi.h b/benchmark/atoi.h index d506bf29..8c1d14b5 100644 --- a/benchmark/atoi.h +++ b/benchmark/atoi.h @@ -18,6 +18,10 @@ std::size_t c_strLength_natural(const char *s); std::size_t avx2_strlen(const char* str); #endif +#if ZOO_CONFIGURED_TO_USE_NEON() +std::size_t neon_strlen(const char* str); +#endif + } std::size_t diff --git a/inc/zoo/pp/platform.h b/inc/zoo/pp/platform.h index c0682595..b35b7b38 100644 --- a/inc/zoo/pp/platform.h +++ b/inc/zoo/pp/platform.h @@ -7,6 +7,12 @@ #define ZOO_CONFIGURED_TO_USE_AVX() 0 #endif +#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && (defined(__aarch64__) || defined(_M_ARM64)) +#define ZOO_CONFIGURED_TO_USE_NEON() 1 +#else +#define ZOO_CONFIGURED_TO_USE_NEON() 0 +#endif + #ifdef _MSC_VER #define MSVC_EMPTY_BASES __declspec(empty_bases) #else