Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion benchmark/atoi-corpus.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,21 @@ struct CorpusStringLength {
#define AVX2_STRLEN_CORPUS_X_LIST /* nothing */
#endif

#if ZOO_CONFIGURED_TO_USE_NEON()
#define NEON_STRLEN_CORPUS_X_LIST \
X(ZOO_NEON, zoo::neon_strlen)
#else
#define NEON_STRLEN_CORPUS_X_LIST /* nothing */
#endif


#define STRLEN_CORPUS_X_LIST \
X(LIBC_STRLEN, strlen) \
X(ZOO_STRLEN, zoo::c_strLength) \
X(ZOO_NATURAL_STRLEN, zoo::c_strLength_natural) \
X(GENERIC_GLIBC_STRLEN, STRLEN_old) \
AVX2_STRLEN_CORPUS_X_LIST
AVX2_STRLEN_CORPUS_X_LIST \
NEON_STRLEN_CORPUS_X_LIST

#define X(Typename, FunctionToCall) \
struct Invoke##Typename { int operator()(const char *p) { return FunctionToCall(p); } };
Expand Down
49 changes: 49 additions & 0 deletions benchmark/atoi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,3 +325,52 @@ STRLEN_old (const char *str)
}
}
}


#if ZOO_CONFIGURED_TO_USE_NEON()

#include <arm_neon.h>

namespace zoo {

/// \note uses the key technique of shifting by 4 and narrowing from 16 to 8 bit lanes in
/// aarch64/strlen.S at
/// https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/aarch64/strlen.S;h=ab2a576cdb5665e596b791299af3f4abecb73c0e;hb=HEAD
std::size_t neon_strlen(const char *str) {
const uint8x16_t zero = vdupq_n_u8(0);
size_t offset = 0;
uint8x16_t data;
auto [alignedBase, misalignment] = blockAlignedLoad(str, &data);

auto compareAndConvertResultsToNibbles = [&]() {
auto cmp = vceqq_u8(data, zero);
// The result looks like, in hexadecimal digits, like this:
// [ AA, BB, CC, DD, EE, FF, GG, HH, ... ] with each
// variable A, B, ... either 0xF or 0x0.
// instead of 16x8 bit results, we can see that as
// 8 16 bit results like this
// [ AABB, CCDD, EEFF, GGHH, ... ]
// If we shift out a nibble from each element (shift right by 4):
// [ ABB0, CDD0, EFF0, GHH0, ... ]
// Narrowing from 16 to eight, we would get
// [ AB, CD, EF, GH, ... ]
auto straddle8bitLanePairAndNarrowToBytes = vshrn_n_u16(cmp, 4);
return vget_lane_u64(vreinterpret_u64_u8(straddle8bitLanePairAndNarrowToBytes), 0);
};
auto nibbles = compareAndConvertResultsToNibbles();
auto misalignmentNibbleMask = (~uint64_t(0)) << (misalignment * 4);
nibbles &= misalignmentNibbleMask;
for(;;) {
if(nibbles) {
auto trailingZeroBits = __builtin_ctz(nibbles);
auto nonNullByteCount = trailingZeroBits / 4;
return alignedBase + offset + nonNullByteCount - str;
}
alignedBase += sizeof(uint8x16_t);
memcpy(&data, alignedBase, sizeof(uint8x16_t));
nibbles = compareAndConvertResultsToNibbles();
}
}

}
#endif
4 changes: 4 additions & 0 deletions benchmark/atoi.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ std::size_t c_strLength_natural(const char *s);
std::size_t avx2_strlen(const char* str);
#endif

#if ZOO_CONFIGURED_TO_USE_NEON()
std::size_t neon_strlen(const char* str);
#endif

}

std::size_t
Expand Down
6 changes: 6 additions & 0 deletions inc/zoo/pp/platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
#define ZOO_CONFIGURED_TO_USE_AVX() 0
#endif

#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && (defined(__aarch64__) || defined(_M_ARM64))
#define ZOO_CONFIGURED_TO_USE_NEON() 1
#else
#define ZOO_CONFIGURED_TO_USE_NEON() 0
#endif

#ifdef _MSC_VER
#define MSVC_EMPTY_BASES __declspec(empty_bases)
#else
Expand Down