diff --git a/benchmark/atoi-corpus.h b/benchmark/atoi-corpus.h index 16f0b6e0..78ed2aba 100644 --- a/benchmark/atoi-corpus.h +++ b/benchmark/atoi-corpus.h @@ -1,4 +1,5 @@ #include "atoi.h" +#include "zoo/pp/platform.h" #include #include @@ -119,17 +120,33 @@ struct CorpusStringLength { } }; +#if ZOO_CONFIGURED_TO_USE_AVX() +#define AVX2_STRLEN_CORPUS_X_LIST \ + X(ZOO_AVX, zoo::avx2_strlen) +#else +#define AVX2_STRLEN_CORPUS_X_LIST /* nothing */ +#endif + +#if ZOO_CONFIGURED_TO_USE_NEON() +#define NEON_STRLEN_CORPUS_X_LIST \ + X(ZOO_NEON, zoo::neon_strlen) +#else +#define NEON_STRLEN_CORPUS_X_LIST /* nothing */ +#endif + + #define STRLEN_CORPUS_X_LIST \ X(LIBC_STRLEN, strlen) \ X(ZOO_STRLEN, zoo::c_strLength) \ X(ZOO_NATURAL_STRLEN, zoo::c_strLength_natural) \ - X(ZOO_MANUAL_STRLEN, zoo::c_strLength_manualComparison) \ - X(ZOO_AVX, zoo::avx2_strlen) \ - X(GENERIC_GLIBC_STRLEN, STRLEN_old) + X(GENERIC_GLIBC_STRLEN, STRLEN_old) \ + AVX2_STRLEN_CORPUS_X_LIST \ + NEON_STRLEN_CORPUS_X_LIST #define X(Typename, FunctionToCall) \ struct Invoke##Typename { int operator()(const char *p) { return FunctionToCall(p); } }; PARSE8BYTES_CORPUS_X_LIST STRLEN_CORPUS_X_LIST + #undef X diff --git a/benchmark/atoi.cpp b/benchmark/atoi.cpp index 1b7ebb90..8d002b08 100644 --- a/benchmark/atoi.cpp +++ b/benchmark/atoi.cpp @@ -1,11 +1,17 @@ -#include "zoo/swar/SWAR.h" +#include "atoi.h" + #include "zoo/swar/associative_iteration.h" +#if ZOO_CONFIGURED_TO_USE_AVX() #include +#endif #include #include #include +#include + +#include // Copied from Daniel Lemire's GitHub at // https://lemire.me/blog/2018/10/03/quickly-parsing-eight-digits/ @@ -23,7 +29,7 @@ uint32_t parse_eight_digits_swar(const char *chars) { // Note: eight digits can represent from 0 to (10^9) - 1, the logarithm base 2 // of 10^9 is slightly less than 30, thus, only 30 bits are needed. -auto lemire_as_zoo_swar(const char *chars) { +uint32_t lemire_as_zoo_swar(const char *chars) { uint64_t bytes; memcpy(&bytes, chars, 8); auto allCharacterZero = zoo::meta::BitmaskMaker::value; @@ -52,16 +58,96 @@ auto lemire_as_zoo_swar(const char *chars) { return uint32_t(by10001base2to32.value() >> 32); } +std::size_t spaces_glibc(const char *ptr) { + auto rv = 0; + while(isspace(ptr[rv])) { ++rv; } + return rv; +} + namespace zoo { +//constexpr +std::size_t leadingSpacesCount(swar::SWAR<8, uint64_t> bytes) noexcept { + /* + space (0x20, ' ') + form feed (0x0c, '\f') + line feed (0x0a, '\n') + carriage return (0x0d, '\r') + horizontal tab (0x09, '\t') + vertical tab (0x0b, '\v')* + constexpr std::array SpaceCharacters = { + 0b10'0000, //0x20 space + 0b00'1101, // 0xD \r + 0b00'1100, // 0xC \f + 0b00'1011, // 0xB \v + 0b00'1010, // 0xA \n + 0b00'1001 // 9 \t + }, + ExpressedAsEscapeCodes = { ' ', '\r', '\f', '\v', '\n', '\t' }; + static_assert(SpaceCharacters == ExpressedAsEscapeCodes); */ + using S = swar::SWAR<8, uint64_t>; + constexpr S Space{meta::BitmaskMaker::value}; + auto space = swar::equals(bytes, Space); + auto otherWhiteSpace = + swar::constantIsGreaterEqual<'\r'>(bytes) & + ~swar::constantIsGreaterEqual<'\t' - 1>(bytes); + auto whiteSpace = space | otherWhiteSpace; + auto notWhiteSpace = S{S::MostSignificantBit} ^ whiteSpace; + return notWhiteSpace.lsbIndex(); +} + +/// @brief Loads the "block" containing the pointer, by proper alignment +/// @tparam PtrT Pointer type for loading +/// @tparam Block as the name indicates +/// @param pointerInsideBlock the potentially misaligned pointer +/// @param b where the loaded bytes will be put +/// @return a pair to indicate the aligned pointer to the base of the block +/// and the misalignment, in bytes, of the source pointer +template +std::tuple +blockAlignedLoad(PtrT *pointerInsideBlock, Block *b) { + uintptr_t asUint = reinterpret_cast(pointerInsideBlock); + constexpr auto Alignment = alignof(Block), Size = sizeof(Block); + static_assert(Alignment == Size); + auto misalignment = asUint % Alignment; + auto *base = reinterpret_cast(asUint - misalignment); + memcpy(b, base, Size); + return { base, misalignment }; +} + +/// \brief Helper function to fix the non-string part of block +template +S adjustMisalignmentFor_strlen(S data, int misalignment) { + // The speculative load has the valid data in the higher lanes. + // To use the same algorithm as the rest of the implementation, simply + // populate with ones the lower part, in that way there won't be nulls. + constexpr typename S::type Zero{0}; + auto + zeroesInMisalignedOnesInValid = + (~Zero) // all ones + << (misalignment * 8), // assumes 8 bits per char + onesInMisalignedZeroesInValid = ~zeroesInMisalignedOnesInValid; + return data | S{onesInMisalignedZeroesInValid}; +} + std::size_t c_strLength(const char *s) { - using S = swar::SWAR<8, std::size_t>; + using S = swar::SWAR<8, uint64_t>; constexpr auto MSBs = S{S::MostSignificantBit}, Ones = S{S::LeastSignificantBit}; - S bytes; - for(auto base = s;; base += 8) { - memcpy(&bytes.m_v, base, 8); + constexpr auto BytesPerIteration = sizeof(S::type); + S initialBytes; + + auto indexOfFirstTrue = [](auto bs) { return bs.lsbIndex(); }; + + // Misalignment must be taken into account because a SWAR read is + // speculative, it might read bytes outside of the actual string. + // It is safe to read within the page where the string occurs, and to + // guarantee that, simply make aligned reads because the size of the SWAR + // base size will always divide the memory page size + auto [alignedBase, misalignment] = blockAlignedLoad(s, &initialBytes); + auto bytes = adjustMisalignmentFor_strlen(initialBytes, misalignment); + for(;;) { auto firstNullTurnsOnMSB = bytes - Ones; // The first lane with a null will borrow and set its MSB on when // subtracted one. @@ -74,24 +160,28 @@ std::size_t c_strLength(const char *s) { auto cheapestInversionOfMSBs = ~bytes; auto firstMSBsOnIsFirstNull = firstNullTurnsOnMSB & cheapestInversionOfMSBs; - auto onlyMSBs = zoo::swar::convertToBooleanSWAR(firstMSBsOnIsFirstNull); - if(onlyMSBs) { // there is a null! - auto firstNullIndex = onlyMSBs.lsbIndex(); - return firstNullIndex + (base - s); + auto onlyMSBs = swar::convertToBooleanSWAR(firstMSBsOnIsFirstNull); + if(onlyMSBs) { + return alignedBase + indexOfFirstTrue(onlyMSBs) - s; } + alignedBase += BytesPerIteration; + memcpy(&bytes, alignedBase, BytesPerIteration); } } std::size_t c_strLength_natural(const char *s) { - using S = swar::SWAR<8, std::size_t>; - S bytes; - for(auto base = s;; base += 8) { - memcpy(&bytes.m_v, base, 8); + using S = swar::SWAR<8, std::uint64_t>; + S initialBytes; + auto [base, misalignment] = blockAlignedLoad(s, &initialBytes); + auto bytes = adjustMisalignmentFor_strlen(initialBytes, misalignment); + for(;;) { auto nulls = zoo::swar::equals(bytes, S{0}); if(nulls) { // there is a null! auto firstNullIndex = nulls.lsbIndex(); - return firstNullIndex + (base - s); + return firstNullIndex + base - s; } + base += sizeof(S); + memcpy(&bytes.m_v, base, 8); } } @@ -117,29 +207,47 @@ std::size_t c_strLength_manualComparison(const char *s) { } } +#if ZOO_CONFIGURED_TO_USE_AVX() + +/// \note Partially generated by Chat GPT 4 size_t avx2_strlen(const char* str) { const __m256i zero = _mm256_setzero_si256(); // Vector of 32 zero bytes size_t offset = 0; + __m256i data; + auto [alignedBase, misalignment] = blockAlignedLoad(str, &data); - // Loop over the string in blocks of 32 bytes - for (;; offset += 32) { - // Load 32 bytes of the string into a __m256i vector - __m256i data;// = _mm256_load_si256((const __m256i*)(str + offset)); - memcpy(&data, str + offset, 32); - // Compare each byte with '\0' - __m256i cmp = _mm256_cmpeq_epi8(data, zero); - // Create a mask indicating which bytes are '\0' - int mask = _mm256_movemask_epi8(cmp); + // AVX does not offer a practical way to generate a mask of all ones in + // the least significant positions, thus we cant invoke adjustFor_strlen. + // We will do the first iteration as a special case to explicitly take into + // account misalignment + auto maskOfMask = (~uint64_t(0)) << misalignment; + auto compareAndMask = + [&]() { + // Compare each byte with '\0' + __m256i cmp = _mm256_cmpeq_epi8(data, zero); + // Create a mask indicating which bytes are '\0' + return _mm256_movemask_epi8(cmp); + }; + auto mask = compareAndMask(); + mask &= maskOfMask; + + // Loop over the string in blocks of 32 bytes + for (;;) { // If mask is not zero, we found a '\0' byte if (mask) { - // Calculate the index of the first '\0' byte using ctz (Count Trailing Zeros) - return offset + __builtin_ctz(mask); + // Calculate the index of the first '\0' byte counting trailing 0s + auto nunNullByteCount = __builtin_ctz(mask); + return alignedBase + offset + nunNullByteCount - str; } + offset += 32; + memcpy(&data, alignedBase + offset, 32); + mask = compareAndMask(); } // Unreachable, but included to avoid compiler warnings return offset; } +#endif } @@ -217,3 +325,52 @@ STRLEN_old (const char *str) } } } + + +#if ZOO_CONFIGURED_TO_USE_NEON() + +#include + +namespace zoo { + +/// \note uses the key technique of shifting by 4 and narrowing from 16 to 8 bit lanes in +/// aarch64/strlen.S at +/// https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/aarch64/strlen.S;h=ab2a576cdb5665e596b791299af3f4abecb73c0e;hb=HEAD +std::size_t neon_strlen(const char *str) { + const uint8x16_t zero = vdupq_n_u8(0); + size_t offset = 0; + uint8x16_t data; + auto [alignedBase, misalignment] = blockAlignedLoad(str, &data); + + auto compareAndConvertResultsToNibbles = [&]() { + auto cmp = vceqq_u8(data, zero); + // The result looks like, in hexadecimal digits, like this: + // [ AA, BB, CC, DD, EE, FF, GG, HH, ... ] with each + // variable A, B, ... either 0xF or 0x0. + // instead of 16x8 bit results, we can see that as + // 8 16 bit results like this + // [ AABB, CCDD, EEFF, GGHH, ... ] + // If we shift out a nibble from each element (shift right by 4): + // [ ABB0, CDD0, EFF0, GHH0, ... ] + // Narrowing from 16 to eight, we would get + // [ AB, CD, EF, GH, ... ] + auto straddle8bitLanePairAndNarrowToBytes = vshrn_n_u16(cmp, 4); + return vget_lane_u64(vreinterpret_u64_u8(straddle8bitLanePairAndNarrowToBytes), 0); + }; + auto nibbles = compareAndConvertResultsToNibbles(); + auto misalignmentNibbleMask = (~uint64_t(0)) << (misalignment * 4); + nibbles &= misalignmentNibbleMask; + for(;;) { + if(nibbles) { + auto trailingZeroBits = __builtin_ctz(nibbles); + auto nonNullByteCount = trailingZeroBits / 4; + return alignedBase + offset + nonNullByteCount - str; + } + alignedBase += sizeof(uint8x16_t); + memcpy(&data, alignedBase, sizeof(uint8x16_t)); + nibbles = compareAndConvertResultsToNibbles(); + } +} + +} +#endif diff --git a/benchmark/atoi.h b/benchmark/atoi.h index 3180c510..8c1d14b5 100644 --- a/benchmark/atoi.h +++ b/benchmark/atoi.h @@ -1,15 +1,26 @@ -#include "stdint.h" +#include "zoo/swar/SWAR.h" +#include "zoo/pp/platform.h" + #include uint32_t parse_eight_digits_swar(const char *chars); uint32_t lemire_as_zoo_swar(const char *chars); +std::size_t spaces_glibc(const char *ptr); + namespace zoo { +std::size_t leadingSpacesCount(swar::SWAR<8, uint64_t> bytes) noexcept; std::size_t c_strLength(const char *s); std::size_t c_strLength_natural(const char *s); -std::size_t c_strLength_manualComparison(const char *s); + +#if ZOO_CONFIGURED_TO_USE_AVX() std::size_t avx2_strlen(const char* str); +#endif + +#if ZOO_CONFIGURED_TO_USE_NEON() +std::size_t neon_strlen(const char* str); +#endif } diff --git a/benchmark/catch2swar-demo.cpp b/benchmark/catch2swar-demo.cpp index 8f7e33e8..3bcaf204 100644 --- a/benchmark/catch2swar-demo.cpp +++ b/benchmark/catch2swar-demo.cpp @@ -17,6 +17,22 @@ TEST_CASE("Atoi benchmarks", "[atoi][swar]") { auto seed = rd(); CAPTURE(seed); std::mt19937 g(seed); + SECTION("Simple comparison of two strings") { + auto TwoStrings = "Str1\0Much longer string here, even for AVX2"; + auto zoolength1 = zoo::c_strLength(TwoStrings); + auto strlen1 = strlen(TwoStrings); + REQUIRE(zoolength1 == strlen1); + auto skipFst = TwoStrings + strlen1 + 1; + auto zl2 = zoo::c_strLength(skipFst); + auto strlen2 = strlen(skipFst); + REQUIRE(zl2 == strlen2); + #if ZOO_CONFIGURED_TO_USE_AVX() + auto avx1 = zoo::avx2_strlen(TwoStrings); + REQUIRE(avx1 == strlen1); + auto avx2 = zoo::avx2_strlen(skipFst); + REQUIRE(avx2 == strlen2); + #endif + } auto corpus8D = Corpus8DecimalDigits::makeCorpus(g); auto corpusStrlen = CorpusStringLength::makeCorpus(g); #define X(Type, Fun) \ @@ -34,9 +50,10 @@ TEST_CASE("Atoi benchmarks", "[atoi][swar]") { REQUIRE(fromLIBC == fromZoo); REQUIRE(fromZOO_STRLEN == fromLIBC_STRLEN); REQUIRE(fromLIBC_STRLEN == fromZOO_NATURAL_STRLEN); - REQUIRE(fromZOO_NATURAL_STRLEN == fromZOO_MANUAL_STRLEN); REQUIRE(fromGENERIC_GLIBC_STRLEN == fromZOO_NATURAL_STRLEN); - REQUIRE(fromZOO_AVX == fromZOO_STRLEN); + #if ZOO_CONFIGURED_TO_USE_AVX() + REQUIRE(fromZOO_AVX == fromZOO_STRLEN); + #endif auto haveTheRoleOfMemoryBarrier = -1; #define X(Type, Fun) \ diff --git a/inc/zoo/pp/platform.h b/inc/zoo/pp/platform.h index 5bb79483..b35b7b38 100644 --- a/inc/zoo/pp/platform.h +++ b/inc/zoo/pp/platform.h @@ -1,14 +1,22 @@ #ifndef ZOO_PLATFORM_MACROS_H #define ZOO_PLATFORM_MACROS_H -#ifdef _MSC_VER - -#define MSVC_EMPTY_BASES __declspec(empty_bases) +#ifdef __AVX2__ +#define ZOO_CONFIGURED_TO_USE_AVX() 1 +#else +#define ZOO_CONFIGURED_TO_USE_AVX() 0 +#endif +#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && (defined(__aarch64__) || defined(_M_ARM64)) +#define ZOO_CONFIGURED_TO_USE_NEON() 1 #else +#define ZOO_CONFIGURED_TO_USE_NEON() 0 +#endif +#ifdef _MSC_VER +#define MSVC_EMPTY_BASES __declspec(empty_bases) +#else #define MSVC_EMPTY_BASES - #endif #endif diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 7e1f4476..5b3db31b 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -37,8 +37,12 @@ constexpr std::make_unsigned_t msbIndex(T v) noexcept { /// Index into the bits of the type T that contains the LSB. template constexpr std::make_unsigned_t lsbIndex(T v) noexcept { - // ~v & (v - 1) turns on all trailing zeroes, zeroes the rest - return meta::logFloor(1 + (~v & (v - 1))); + #ifdef _MSC_VER + // ~v & (v - 1) turns on all trailing zeroes, zeroes the rest + return meta::logFloor(1 + (~v & (v - 1))); + #else + return ~v ? __builtin_ctzll(v) : sizeof(T) * 8; + #endif } /// Core abstraction around SIMD Within A Register (SWAR). Specifies 'lanes' diff --git a/scripts/mock-includes.sh b/scripts/mock-includes.sh new file mode 100644 index 00000000..381abfeb --- /dev/null +++ b/scripts/mock-includes.sh @@ -0,0 +1,15 @@ +COMPILER=$1 +OUTPUT=$2 + +shift +shift + +${COMPILER} -nostdinc -nostdinc++ -E -I${OUTPUT} $* 2>&1 > /dev/null | \ + sed -n 's/^\(.*\)fatal error: '"'"'\(.*\)'"'"' file not found\(.*\)$/\2/p' | + while read FILE + do + ADDENDUM="${OUTPUT}/$FILE" + mkdir -p $(dirname $ADDENDUM) + echo $ADDENDUM + echo "__include_directive__ <$FILE>" > ${ADDENDUM} + done diff --git a/scripts/redirective.sh b/scripts/redirective.sh new file mode 100644 index 00000000..4b6ac454 --- /dev/null +++ b/scripts/redirective.sh @@ -0,0 +1,7 @@ +#! /usr/bin/bash + +COMPILER=$1 + +shift + +${COMPILER} -D__include_directive__='#include' -E -CC $*