Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions benchmark/atoi-corpus.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "atoi.h"
#include "zoo/pp/platform.h"

#include <vector>
#include <string>
Expand Down Expand Up @@ -119,17 +120,33 @@ struct CorpusStringLength {
}
};

#if ZOO_CONFIGURED_TO_USE_AVX()
#define AVX2_STRLEN_CORPUS_X_LIST \
X(ZOO_AVX, zoo::avx2_strlen)
#else
#define AVX2_STRLEN_CORPUS_X_LIST /* nothing */
#endif

#if ZOO_CONFIGURED_TO_USE_NEON()
#define NEON_STRLEN_CORPUS_X_LIST \
X(ZOO_NEON, zoo::neon_strlen)
#else
#define NEON_STRLEN_CORPUS_X_LIST /* nothing */
#endif


#define STRLEN_CORPUS_X_LIST \
X(LIBC_STRLEN, strlen) \
X(ZOO_STRLEN, zoo::c_strLength) \
X(ZOO_NATURAL_STRLEN, zoo::c_strLength_natural) \
X(ZOO_MANUAL_STRLEN, zoo::c_strLength_manualComparison) \
X(ZOO_AVX, zoo::avx2_strlen) \
X(GENERIC_GLIBC_STRLEN, STRLEN_old)
X(GENERIC_GLIBC_STRLEN, STRLEN_old) \
AVX2_STRLEN_CORPUS_X_LIST \
NEON_STRLEN_CORPUS_X_LIST

#define X(Typename, FunctionToCall) \
struct Invoke##Typename { int operator()(const char *p) { return FunctionToCall(p); } };

PARSE8BYTES_CORPUS_X_LIST
STRLEN_CORPUS_X_LIST

#undef X
209 changes: 183 additions & 26 deletions benchmark/atoi.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
#include "zoo/swar/SWAR.h"
#include "atoi.h"

#include "zoo/swar/associative_iteration.h"

#if ZOO_CONFIGURED_TO_USE_AVX()
#include <immintrin.h>
#endif

#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>

#include <tuple>

// Copied from Daniel Lemire's GitHub at
// https://lemire.me/blog/2018/10/03/quickly-parsing-eight-digits/
Expand All @@ -23,7 +29,7 @@ uint32_t parse_eight_digits_swar(const char *chars) {

// Note: eight digits can represent from 0 to (10^9) - 1, the logarithm base 2
// of 10^9 is slightly less than 30, thus, only 30 bits are needed.
auto lemire_as_zoo_swar(const char *chars) {
uint32_t lemire_as_zoo_swar(const char *chars) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Truly astonishing to see you return a concrete type. :D

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is an exported symbol, I have to manually set the type in the header, I may as well set it manually in the implementation. Good to see that you noticed

uint64_t bytes;
memcpy(&bytes, chars, 8);
auto allCharacterZero = zoo::meta::BitmaskMaker<uint64_t, '0', 8>::value;
Expand Down Expand Up @@ -52,16 +58,96 @@ auto lemire_as_zoo_swar(const char *chars) {
return uint32_t(by10001base2to32.value() >> 32);
}

std::size_t spaces_glibc(const char *ptr) {
auto rv = 0;
while(isspace(ptr[rv])) { ++rv; }
return rv;
}

namespace zoo {

//constexpr
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Raises the question of why it isn't constexpr

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would not be exportable.
Also, all of SIMD support is un-usable in constexpr. I should delete the comment, but that's the reason why we have to dial down constexpr quite a lot in many parts of the library.

std::size_t leadingSpacesCount(swar::SWAR<8, uint64_t> bytes) noexcept {
/*
space (0x20, ' ')
form feed (0x0c, '\f')
line feed (0x0a, '\n')
carriage return (0x0d, '\r')
horizontal tab (0x09, '\t')
vertical tab (0x0b, '\v')*
constexpr std::array<char, 6> SpaceCharacters = {
0b10'0000, //0x20 space
0b00'1101, // 0xD \r
0b00'1100, // 0xC \f
0b00'1011, // 0xB \v
0b00'1010, // 0xA \n
0b00'1001 // 9 \t
},
ExpressedAsEscapeCodes = { ' ', '\r', '\f', '\v', '\n', '\t' };
static_assert(SpaceCharacters == ExpressedAsEscapeCodes); */
using S = swar::SWAR<8, uint64_t>;
constexpr S Space{meta::BitmaskMaker<uint64_t, ' ', 8>::value};
auto space = swar::equals(bytes, Space);
auto otherWhiteSpace =
swar::constantIsGreaterEqual<'\r'>(bytes) &
~swar::constantIsGreaterEqual<'\t' - 1>(bytes);
auto whiteSpace = space | otherWhiteSpace;
auto notWhiteSpace = S{S::MostSignificantBit} ^ whiteSpace;
return notWhiteSpace.lsbIndex();
}

/// @brief Loads the "block" containing the pointer, by proper alignment
/// @tparam PtrT Pointer type for loading
/// @tparam Block as the name indicates
/// @param pointerInsideBlock the potentially misaligned pointer
/// @param b where the loaded bytes will be put
/// @return a pair to indicate the aligned pointer to the base of the block
/// and the misalignment, in bytes, of the source pointer
template<typename PtrT, typename Block>
std::tuple<PtrT *, int>
blockAlignedLoad(PtrT *pointerInsideBlock, Block *b) {
uintptr_t asUint = reinterpret_cast<uintptr_t>(pointerInsideBlock);
constexpr auto Alignment = alignof(Block), Size = sizeof(Block);
static_assert(Alignment == Size);
auto misalignment = asUint % Alignment;
auto *base = reinterpret_cast<PtrT *>(asUint - misalignment);
memcpy(b, base, Size);
return { base, misalignment };
}

/// \brief Helper function to fix the non-string part of block
template<typename S>
S adjustMisalignmentFor_strlen(S data, int misalignment) {
// The speculative load has the valid data in the higher lanes.
// To use the same algorithm as the rest of the implementation, simply
// populate with ones the lower part, in that way there won't be nulls.
constexpr typename S::type Zero{0};
auto
zeroesInMisalignedOnesInValid =
(~Zero) // all ones
<< (misalignment * 8), // assumes 8 bits per char
onesInMisalignedZeroesInValid = ~zeroesInMisalignedOnesInValid;
return data | S{onesInMisalignedZeroesInValid};
}

std::size_t c_strLength(const char *s) {
using S = swar::SWAR<8, std::size_t>;
using S = swar::SWAR<8, uint64_t>;
constexpr auto
MSBs = S{S::MostSignificantBit},
Ones = S{S::LeastSignificantBit};
S bytes;
for(auto base = s;; base += 8) {
memcpy(&bytes.m_v, base, 8);
constexpr auto BytesPerIteration = sizeof(S::type);
S initialBytes;

auto indexOfFirstTrue = [](auto bs) { return bs.lsbIndex(); };

// Misalignment must be taken into account because a SWAR read is
// speculative, it might read bytes outside of the actual string.
// It is safe to read within the page where the string occurs, and to
// guarantee that, simply make aligned reads because the size of the SWAR
// base size will always divide the memory page size
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this comment go in a SWAR-library level document?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I have not yet created such a document. Because this is introductory code we can keep it like this, right? once we have a body of improvements we can populate other parts of the library such as design documentation.
The most critical piece that only lives as a comment is the one describing associative iteration.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be valuable to set up a Doxygen generation from this and host it somewhere?
Starting to have some documentation infra could motivate more and better documentation.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once in a blue moon I try Doxygen. Sadly, it just does not understand my code. I still use doxygen comments in the code.

auto [alignedBase, misalignment] = blockAlignedLoad(s, &initialBytes);
auto bytes = adjustMisalignmentFor_strlen(initialBytes, misalignment);
for(;;) {
auto firstNullTurnsOnMSB = bytes - Ones;
// The first lane with a null will borrow and set its MSB on when
// subtracted one.
Expand All @@ -74,24 +160,28 @@ std::size_t c_strLength(const char *s) {
auto cheapestInversionOfMSBs = ~bytes;
auto firstMSBsOnIsFirstNull =
firstNullTurnsOnMSB & cheapestInversionOfMSBs;
auto onlyMSBs = zoo::swar::convertToBooleanSWAR(firstMSBsOnIsFirstNull);
if(onlyMSBs) { // there is a null!
auto firstNullIndex = onlyMSBs.lsbIndex();
return firstNullIndex + (base - s);
auto onlyMSBs = swar::convertToBooleanSWAR(firstMSBsOnIsFirstNull);
if(onlyMSBs) {
return alignedBase + indexOfFirstTrue(onlyMSBs) - s;
}
alignedBase += BytesPerIteration;
memcpy(&bytes, alignedBase, BytesPerIteration);
}
}

std::size_t c_strLength_natural(const char *s) {
using S = swar::SWAR<8, std::size_t>;
S bytes;
for(auto base = s;; base += 8) {
memcpy(&bytes.m_v, base, 8);
using S = swar::SWAR<8, std::uint64_t>;
S initialBytes;
auto [base, misalignment] = blockAlignedLoad(s, &initialBytes);
auto bytes = adjustMisalignmentFor_strlen(initialBytes, misalignment);
for(;;) {
auto nulls = zoo::swar::equals(bytes, S{0});
if(nulls) { // there is a null!
auto firstNullIndex = nulls.lsbIndex();
return firstNullIndex + (base - s);
return firstNullIndex + base - s;
}
base += sizeof(S);
memcpy(&bytes.m_v, base, 8);
}
}

Expand All @@ -117,29 +207,47 @@ std::size_t c_strLength_manualComparison(const char *s) {
}
}

#if ZOO_CONFIGURED_TO_USE_AVX()

/// \note Partially generated by Chat GPT 4
size_t avx2_strlen(const char* str) {
const __m256i zero = _mm256_setzero_si256(); // Vector of 32 zero bytes
size_t offset = 0;
__m256i data;
auto [alignedBase, misalignment] = blockAlignedLoad(str, &data);

// Loop over the string in blocks of 32 bytes
for (;; offset += 32) {
// Load 32 bytes of the string into a __m256i vector
__m256i data;// = _mm256_load_si256((const __m256i*)(str + offset));
memcpy(&data, str + offset, 32);
// Compare each byte with '\0'
__m256i cmp = _mm256_cmpeq_epi8(data, zero);
// Create a mask indicating which bytes are '\0'
int mask = _mm256_movemask_epi8(cmp);
// AVX does not offer a practical way to generate a mask of all ones in
// the least significant positions, thus we cant invoke adjustFor_strlen.
// We will do the first iteration as a special case to explicitly take into
// account misalignment
Comment on lines +219 to +222
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Were any of the comments generated by ChatGPT? I think that could be good to know too.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I opened a can of worms by crediting ChatGPT 4.
Yes, some comments are from the tool, the others mine, and I forgot which is who's.
What's your opinion about crediting the AI? I think it is sufficient to indicate that a tool was involved and the result is hybrid.
What I wanted was to let the reader know I did not have to do the heavy lifting of selecting the intrinsics.
The ARM/NEON flavor was a different experience, though, I did not manage to get usable code from the tool, I had to decode the GLIBC assembler to learn the counter-intuitive technique of, to narrow bytes to nibbles, to start by making byte-pairs to shift them by 4 bits (a nibble) so the upper and lower nibbles straddle the byte boundary, then reduce to bytes, which in reality is a pair of nibbles.
If I indicate what comments were mine, sure, but when does it stop? do I also indicate my prompts? what about my user configuration, so that people can replicate exactly? then I'd also have to indicate the day when I had the session, since the tool can change radically one day to the next.

auto maskOfMask = (~uint64_t(0)) << misalignment;

auto compareAndMask =
[&]() {
// Compare each byte with '\0'
__m256i cmp = _mm256_cmpeq_epi8(data, zero);
// Create a mask indicating which bytes are '\0'
return _mm256_movemask_epi8(cmp);
};
auto mask = compareAndMask();
mask &= maskOfMask;

// Loop over the string in blocks of 32 bytes
for (;;) {
// If mask is not zero, we found a '\0' byte
if (mask) {
// Calculate the index of the first '\0' byte using ctz (Count Trailing Zeros)
return offset + __builtin_ctz(mask);
// Calculate the index of the first '\0' byte counting trailing 0s
auto nunNullByteCount = __builtin_ctz(mask);
return alignedBase + offset + nunNullByteCount - str;
}
offset += 32;
memcpy(&data, alignedBase + offset, 32);
mask = compareAndMask();
}
// Unreachable, but included to avoid compiler warnings
return offset;
}
#endif

}

Expand Down Expand Up @@ -217,3 +325,52 @@ STRLEN_old (const char *str)
}
}
}


#if ZOO_CONFIGURED_TO_USE_NEON()

#include <arm_neon.h>

namespace zoo {

/// \note uses the key technique of shifting by 4 and narrowing from 16 to 8 bit lanes in
/// aarch64/strlen.S at
/// https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/aarch64/strlen.S;h=ab2a576cdb5665e596b791299af3f4abecb73c0e;hb=HEAD
std::size_t neon_strlen(const char *str) {
const uint8x16_t zero = vdupq_n_u8(0);
size_t offset = 0;
uint8x16_t data;
auto [alignedBase, misalignment] = blockAlignedLoad(str, &data);

auto compareAndConvertResultsToNibbles = [&]() {
auto cmp = vceqq_u8(data, zero);
// The result looks like, in hexadecimal digits, like this:
// [ AA, BB, CC, DD, EE, FF, GG, HH, ... ] with each
// variable A, B, ... either 0xF or 0x0.
// instead of 16x8 bit results, we can see that as
// 8 16 bit results like this
// [ AABB, CCDD, EEFF, GGHH, ... ]
// If we shift out a nibble from each element (shift right by 4):
// [ ABB0, CDD0, EFF0, GHH0, ... ]
// Narrowing from 16 to eight, we would get
// [ AB, CD, EF, GH, ... ]
auto straddle8bitLanePairAndNarrowToBytes = vshrn_n_u16(cmp, 4);
return vget_lane_u64(vreinterpret_u64_u8(straddle8bitLanePairAndNarrowToBytes), 0);
};
auto nibbles = compareAndConvertResultsToNibbles();
auto misalignmentNibbleMask = (~uint64_t(0)) << (misalignment * 4);
nibbles &= misalignmentNibbleMask;
for(;;) {
if(nibbles) {
auto trailingZeroBits = __builtin_ctz(nibbles);
auto nonNullByteCount = trailingZeroBits / 4;
return alignedBase + offset + nonNullByteCount - str;
}
alignedBase += sizeof(uint8x16_t);
memcpy(&data, alignedBase, sizeof(uint8x16_t));
nibbles = compareAndConvertResultsToNibbles();
}
}

}
#endif
15 changes: 13 additions & 2 deletions benchmark/atoi.h
Original file line number Diff line number Diff line change
@@ -1,15 +1,26 @@
#include "stdint.h"
#include "zoo/swar/SWAR.h"
#include "zoo/pp/platform.h"

#include <cstdlib>

uint32_t parse_eight_digits_swar(const char *chars);
uint32_t lemire_as_zoo_swar(const char *chars);

std::size_t spaces_glibc(const char *ptr);

namespace zoo {

std::size_t leadingSpacesCount(swar::SWAR<8, uint64_t> bytes) noexcept;
std::size_t c_strLength(const char *s);
std::size_t c_strLength_natural(const char *s);
std::size_t c_strLength_manualComparison(const char *s);

#if ZOO_CONFIGURED_TO_USE_AVX()
std::size_t avx2_strlen(const char* str);
#endif

#if ZOO_CONFIGURED_TO_USE_NEON()
std::size_t neon_strlen(const char* str);
#endif

}

Expand Down
21 changes: 19 additions & 2 deletions benchmark/catch2swar-demo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,22 @@ TEST_CASE("Atoi benchmarks", "[atoi][swar]") {
auto seed = rd();
CAPTURE(seed);
std::mt19937 g(seed);
SECTION("Simple comparison of two strings") {
auto TwoStrings = "Str1\0Much longer string here, even for AVX2";
auto zoolength1 = zoo::c_strLength(TwoStrings);
auto strlen1 = strlen(TwoStrings);
REQUIRE(zoolength1 == strlen1);
auto skipFst = TwoStrings + strlen1 + 1;
auto zl2 = zoo::c_strLength(skipFst);
auto strlen2 = strlen(skipFst);
REQUIRE(zl2 == strlen2);
#if ZOO_CONFIGURED_TO_USE_AVX()
auto avx1 = zoo::avx2_strlen(TwoStrings);
REQUIRE(avx1 == strlen1);
auto avx2 = zoo::avx2_strlen(skipFst);
REQUIRE(avx2 == strlen2);
#endif
}
auto corpus8D = Corpus8DecimalDigits::makeCorpus(g);
auto corpusStrlen = CorpusStringLength::makeCorpus(g);
#define X(Type, Fun) \
Expand All @@ -34,9 +50,10 @@ TEST_CASE("Atoi benchmarks", "[atoi][swar]") {
REQUIRE(fromLIBC == fromZoo);
REQUIRE(fromZOO_STRLEN == fromLIBC_STRLEN);
REQUIRE(fromLIBC_STRLEN == fromZOO_NATURAL_STRLEN);
REQUIRE(fromZOO_NATURAL_STRLEN == fromZOO_MANUAL_STRLEN);
REQUIRE(fromGENERIC_GLIBC_STRLEN == fromZOO_NATURAL_STRLEN);
REQUIRE(fromZOO_AVX == fromZOO_STRLEN);
#if ZOO_CONFIGURED_TO_USE_AVX()
REQUIRE(fromZOO_AVX == fromZOO_STRLEN);
#endif

auto haveTheRoleOfMemoryBarrier = -1;
#define X(Type, Fun) \
Expand Down
Loading