diff --git a/.gitignore b/.gitignore index c8f9a2b8..e9113fc9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ -# Vscode does not like to build outside of the source tree -# (multiple glitches) - -.vscode -test/.vscode -build -.cache +# Vscode does not like to build outside of the source tree +# (multiple glitches) + +.vscode +test/.vscode +build +.cache diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 7e1f4476..800deecf 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -15,7 +15,7 @@ namespace zoo { namespace swar { using u64 = uint64_t; using u32 = uint32_t; using u16 = uint16_t; -using u8 = uint8_t; +using u8 = std::uint8_t; template constexpr uint64_t popcount(uint64_t a) noexcept { @@ -58,7 +58,10 @@ struct SWAR { SignificantBitsCount = BitWidth - PaddingBitsCount, AllOnes = ~std::make_unsigned_t{0} >> PaddingBitsCount, LeastSignificantBit = meta::BitmaskMaker{1}, NBits>::value, - MostSignificantBit = LeastSignificantBit << (NBits - 1); + MostSignificantBit = LeastSignificantBit << (NBits - 1), + // Use LowerBits in favor of ~MostSignificantBit to not pollute + // "don't care" bits when non-power-of-two bit lane sizes are supported + LowerBits = MostSignificantBit - LeastSignificantBit; SWAR() = default; constexpr explicit SWAR(T v): m_v(v) {} @@ -129,20 +132,24 @@ struct SWAR { /// \brief as the name suggests /// \param protectiveMask should clear the bits that would cross the lane. - /// The bits that will be cleared are directly related to the count of shifts, it is natural to maintain - /// the protective mask by the caller, otherwise, the mask will be computed on all invocations. - /// We are not sure the optimizer would maintain this mask somewhere, if it was to recalculate it it would be disastrous for performance. 
- constexpr SWAR - shiftIntraLaneLeft(int bitCount, SWAR protectiveMask) const noexcept { - return SWAR{(*this & protectiveMask).value() << bitCount}; - } - - /// \param protectiveMask should clear the bits that would cross the lane - /// \sa shiftIntraLaneLeft - constexpr SWAR - shiftIntraLaneRight(int bitCount, SWAR protectiveMask) const noexcept { - return SWAR{(*this & protectiveMask).value() >> bitCount}; - } + /// The bits that will be cleared are directly related to the count of + /// shifts, it is natural to maintain the protective mask by the caller, + /// otherwise, the mask would have to be computed in all invocations. + /// We are not sure the optimizer would maintain this mask somewhere, if it + /// were to recalculate it, it would be disastrous for performance + /// \note the \c static_cast are necessary because of narrowing conversions + #define SHIFT_INTRALANE_OP_X_LIST X(Left, <<) X(Right, >>) + #define X(name, op) \ + constexpr SWAR \ + shiftIntraLane##name(int bitCount, SWAR protectiveMask) const noexcept { \ + T shiftC = static_cast(bitCount); \ + auto V = (*this & protectiveMask).value(); \ + auto rv = static_cast(V op shiftC); \ + return SWAR{rv}; \ + } + SHIFT_INTRALANE_OP_X_LIST + #undef X + #undef SHIFT_INTRALANE_OP_X_LIST constexpr SWAR multiply(T multiplier) const noexcept { return SWAR{m_v * multiplier}; } diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index e63c2869..7c6cab3b 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -3,6 +3,42 @@ #include "zoo/swar/SWAR.h" +//#define ZOO_DEVELOPMENT_DEBUGGING +#ifdef ZOO_DEVELOPMENT_DEBUGGING +#include + +inline std::ostream &binary(std::ostream &out, uint64_t input, int count) { + while(count--) { + out << (1 & input); + input >>= 1; + } + return out; +} + +template +std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR s) { + using S = zoo::swar::SWAR; + auto shiftCounter = sizeof(B) * 8 / NB; + out 
<< "<|"; + auto v = s.value(); + do { + binary(out, v, NB) << '|'; + + } while(--shiftCounter); + return out << ">"; +} + +#define ZOO_TO_STRING(a) #a +// std::endl is needed within the context of debugging: flush the line +#define ZOO_TRACEABLE_EXP_IMPL(F, L, ...) std::cout << '"' << (__VA_ARGS__) << "\", \"" << F << ':' << L << "\", \"" << ZOO_TO_STRING(__VA_ARGS__) << "\"" << std::endl; +#define ZOO_TRACEABLE_EXPRESSION(...) ZOO_TRACEABLE_EXP_IMPL(__FILE__, __LINE__, __VA_ARGS__) + +#else + +#define ZOO_TRACEABLE_EXPRESSION(...) __VA_ARGS__ + +#endif + namespace zoo::swar { /// \note This code should be substituted by an application of "progressive" algebraic iteration @@ -11,32 +47,196 @@ template constexpr SWAR parallelSuffix(SWAR input) { using S = SWAR; auto - shiftClearingMask = S{~S::MostSignificantBit}, + shiftClearingMask = S{static_cast(~S::MostSignificantBit)}, doubling = input, result = S{0}; auto bitsToXOR = NB, power = 1; + + #define ZTE(...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__) for(;;) { + ZTE(doubling); if(1 & bitsToXOR) { - result = result ^ doubling; - doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask); + ZTE(result = result ^ doubling); + ZTE(doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask)); } - bitsToXOR >>= 1; + ZTE(bitsToXOR >>= 1); if(!bitsToXOR) { break; } auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); - doubling = doubling ^ shifted; + ZTE(shifted); + ZTE(doubling = doubling ^ shifted); // 01...1 // 001...1 // 00001...1 // 000000001...1 shiftClearingMask = - shiftClearingMask & S{shiftClearingMask.value() >> power}; - power <<= 1; + shiftClearingMask & + S{static_cast(shiftClearingMask.value() >> power)}; + ZTE(power <<= 1); } + ZTE(input); + #undef ZTE return S{result}; } +/* +Binary compress: A fascinating algorithm. + +Warren (Hacker's Delight) believes Guy L. 
Steele is the author of the following +binary compression operation, equivalent to Intel's BMI2 instruction PEXT of +"Parallel Extraction" + +From a "mask", a selector of bits from an input, we want to put them together in +the output. + +For example's sake, this is the selector: +Note: this follows the usual 'big endian' convention of denoting the most +significant bit first: +0001 0011 0111 0111 0110 1110 1100 1010 +Imagine the input is the 32-bit or 32-boolean variable expression +abcd efgh ijkl mnop qrst uvxy zABC DEFG +We want the selection + d gh jkl nop rs uvx zA D F +To be compressed into the output +0000 0000 0000 00dg hjkl nopr suvx zADF + +This algorithm will virtually calculate the count of positions that the selected +bits travel to the right, by constructing the binary encoding of that count: +It will identify the positions that will travel an odd number of positions to +the right, these are those whose position-travel-count have the units set. +It will then move those positions by one position to the right, and eliminate +them from the yet-to-move positions. Because it eliminates the positions that +would move an odd count, there remains only positions that move an even number +of positions. Now it finds the positions that move an odd count of /pairs/ of +positions, it moves them 2 positions. This is equivalent to finding the +positions that would have the bit for 2 set in the count of positions to move +right. +Then an odd count of /quartets/ of positions, and moves them 4; +8, 16, 32, ... 
+ + +Complete example (32 bits) +Selection mask: +0001 0011 0111 0111 0110 1110 1100 1010 +Input (each letter or variable is a boolean, that can have 0 or 1) +abcd efgh ijkl mnop qrst uvxy zABC DEFG +Selection (using spaces) + d gh jkl nop rs uvx zA D F +Desired result: + dghjklnoprsuvxzADF + +0000 1001 1011 1011 1011 0111 0110 0101 shiftLeft 1 +1111 0110 0100 0100 0100 1000 1001 1010 forParallelSuffix + + 10 1101 1101 +/* +Complete example (32 bits) +Selection mask: +0001 0011 0111 0111 0110 1110 1100 1010 +Input (each letter or variable is a boolean, that can have 0 or 1) +abcd efgh ijkl mnop qrst uvxy zABC DEFG +Selection (using spaces) + d gh jkl nop rs uvx zA D F +Desired result: + dghjklnoprsuvxzADF + +0001 0011 0111 0111 0110 1110 1100 1010 compressionMask +1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask +1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1 + == groupsize of ~compressionMask +This indicates the positions that have a 0 immediately to the right in + compressionMask +4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the + current position in forParallelSuffix, + last decimal digit +0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of + forParallelSuffix +We have just identified the positions that need to move an odd number of +positions. Filter them with positions with a bit set in compressionMask: +0001 0011 0111 0111 0110 1110 1100 1010 compressionMask +---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of + compressionMask by 1 == groupSize +0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits + that will move) +---- ---- --11 1--- --1- -111 ---- ---1 mv >> 1 == groupSize +0001 0011 0011 1111 0010 0111 1100 1001 pseudo-compressed compressionMask. 
+0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
+                                        forParallelSuffix
+1011 0111 0000 1111 0010 0001 1101 1001 ~mp == ~parallel suffix (bits not moved)
+1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero
+                                        immediately to their right)
+1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved =>
+                                        had even zeroes to their right)
+At this point, we have removed from compressionMask the positions that moved an
+odd number of positions and moved them 1 position,
+then, we only keep positions that move an even number of positions.
+Now, we will repeat these steps but for groups of two zeroes, then 4 zeroes, ...
+*/
+
+template
+constexpr SWAR
+compress(SWAR input, SWAR compressionMask) {
+    // This solution uses the parallel suffix operation as a primary tool:
+    // For every bit position it indicates an odd number of ones to the right,
+    // including itself.
+    // Because we want to detect the "oddness" of groups of zeroes to the right,
+    // we flip the compression mask. To not count the bit position itself,
+    // we shift by one.
+    #define ZTE(...) 
ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__) + ZTE(input); + ZTE(compressionMask); + using S = SWAR; + auto result = input & compressionMask; + auto groupSize = 1; + auto + shiftLeftMask = S{S::LowerBits}, + shiftRightMask = S{S::LowerBits << 1}; + ZTE(~compressionMask); + auto forParallelSuffix = // this is called "mk" in the book + (~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask); + ZTE(forParallelSuffix); + // note: forParallelSuffix denotes positions with a zero + // immediately to the right in 'compressionMask' + for(;;) { + ZTE(groupSize); + ZTE(shiftLeftMask); + ZTE(shiftRightMask); + ZTE(result); + auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book + parallelSuffix(forParallelSuffix); + ZTE(oddCountOfGroupsOfZerosToTheRight); + // compress the bits just identified in both the result and the mask + auto moving = compressionMask & oddCountOfGroupsOfZerosToTheRight; + ZTE(moving); + compressionMask = + (compressionMask ^ moving) | // clear the moving + moving.shiftIntraLaneRight(groupSize, shiftRightMask); + ZTE(compressionMask); + auto movingFromInput = result & moving; + result = + (result ^ movingFromInput) | // clear the moving from the result + movingFromInput.shiftIntraLaneRight(groupSize, shiftRightMask); + auto nextGroupSize = groupSize << 1; + if(NB <= nextGroupSize) { + break; + } + auto evenCountOfGroupsOfZerosToTheRight = + ~oddCountOfGroupsOfZerosToTheRight; + forParallelSuffix = + forParallelSuffix & evenCountOfGroupsOfZerosToTheRight; + auto newShiftLeftMask = + shiftLeftMask.shiftIntraLaneRight(groupSize, shiftRightMask); + shiftRightMask = + shiftRightMask.shiftIntraLaneLeft(groupSize, shiftLeftMask); + shiftLeftMask = newShiftLeftMask; + groupSize = nextGroupSize; + } + ZTE(result); + #undef ZTE + return result; +} + /// \todo because of the desirability of "accumuating" the XORs at the MSB, /// the parallel suffix operation is more suitable. 
template diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index af5a1016..83038df0 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -101,6 +101,17 @@ TEST_CASE( } } +TEST_CASE("Compress/Expand", "[swar]") { + unsigned + Mask = 0b0001'0011'0111'0111'0110'1110'1100'1010, + ToMove = 0b0101'0101'0101'0101'0101'0101'0101'0101, + // Selection: 1 01 101 101 10 010 01 0 0 + result = 0b0001'0'1'1'0'1'1'0'1'10'0'10'0'1'0'0; + using S1_32 = SWAR<32, uint32_t>; + auto q = compress(S1_32{ToMove}, S1_32{Mask}); + CHECK(result == q.value()); +} + static_assert(1 == popcount<5>(0x100ull)); static_assert(1 == popcount<5>(0x010ull)); static_assert(1 == popcount<5>(0x001ull));