From 87ddc04056ed16c22191e761671265cd576a37f4 Mon Sep 17 00:00:00 2001 From: Eddie Date: Thu, 22 Feb 2024 18:02:56 -0800 Subject: [PATCH 1/4] Compress/expand --- .gitignore | 11 +- inc/zoo/swar/SWAR.h | 13 +- inc/zoo/swar/associative_iteration.h | 715 +++++++++++++++++---------- test/swar/BasicOperations.cpp | 12 + 4 files changed, 484 insertions(+), 267 deletions(-) diff --git a/.gitignore b/.gitignore index 64d2f48d..b7a554bb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ -# Vscode does not like to build outside of the source tree -# (multiple glitches) - -.vscode -test/.vscode +# Vscode does not like to build outside of the source tree +# (multiple glitches) + +.vscode +test/.vscode +build diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index d25fd457..4d47dc17 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -15,7 +15,7 @@ namespace zoo { namespace swar { using u64 = uint64_t; using u32 = uint32_t; using u16 = uint16_t; -using u8 = uint8_t; +using u8 = std::uint8_t; template constexpr uint64_t popcount(uint64_t a) noexcept { @@ -58,7 +58,10 @@ struct SWAR { SignificantBitsCount = BitWidth - PaddingBitsCount, AllOnes = ~std::make_unsigned_t{0} >> PaddingBitsCount, LeastSignificantBit = meta::BitmaskMaker{1}, NBits>::value, - MostSignificantBit = LeastSignificantBit << (NBits - 1); + MostSignificantBit = LeastSignificantBit << (NBits - 1), + // Use LowerBits in favor of ~MostSignificantBit to not pollute + // "don't care" bits when non-power-of-two bit lane sizes are supported + LowerBits = MostSignificantBit - LeastSignificantBit; SWAR() = default; constexpr explicit SWAR(T v): m_v(v) {} @@ -134,14 +137,16 @@ struct SWAR { /// We are not sure the optimizer would maintain this mask somewhere, if it was to recalculate it it would be disastrous for performance. 
constexpr SWAR shiftIntraLaneLeft(int bitCount, SWAR protectiveMask) const noexcept { - return SWAR{(*this & protectiveMask).value() << bitCount}; + T shiftC = static_cast(bitCount); // could be a narrowing conversion + auto V = (*this & protectiveMask).value(); + return SWAR{static_cast(V << shiftC)}; } /// \param protectiveMask should clear the bits that would cross the lane /// \sa shiftIntraLaneLeft constexpr SWAR shiftIntraLaneRight(int bitCount, SWAR protectiveMask) const noexcept { - return SWAR{(*this & protectiveMask).value() >> bitCount}; + return SWAR{(*this & protectiveMask).value() >> T{bitCount}}; } T m_v; diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index b2201823..53e81425 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -1,258 +1,457 @@ -#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H -#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H - -#include "zoo/swar/SWAR.h" - -namespace zoo::swar { - -/// \note This code should be substituted by an application of "progressive" algebraic iteration -/// \note There is also parallelPrefix (to be implemented) -template -constexpr SWAR parallelSuffix(SWAR input) { - using S = SWAR; - auto - shiftClearingMask = S{~S::MostSignificantBit}, - doubling = input, - result = S{0}; - auto - bitsToXOR = NB, - power = 1; - for(;;) { - if(1 & bitsToXOR) { - result = result ^ doubling; - doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask); - } - bitsToXOR >>= 1; - if(!bitsToXOR) { break; } - auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); - doubling = doubling ^ shifted; - // 01...1 - // 001...1 - // 00001...1 - // 000000001...1 - shiftClearingMask = - shiftClearingMask & S{shiftClearingMask.value() >> power}; - power <<= 1; - } - return S{result}; -} - -/// \todo because of the desirability of "accumuating" the XORs at the MSB, -/// the parallel suffix operation is more suitable. 
-template -constexpr SWAR parity(SWAR input) { - using S = SWAR; - auto preResult = parallelSuffix(input); - auto onlyMSB = preResult.value() & S::MostSignificantBit; - return S{onlyMSB}; -} - - -namespace impl { -template -constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR msb, SWAR lsb) { - auto msbCopiedDown = msb - lsb; - auto msbReintroduced = msbCopiedDown | msb; - return msbReintroduced; -} -} - -template -constexpr auto makeLaneMaskFromLSB(SWAR input) { - using S = SWAR; - auto lsb = input & S{S::LeastSignificantBit}; - auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)}; - return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb); -} - -template -constexpr auto makeLaneMaskFromMSB(SWAR input) { - using S = SWAR; - auto msb = input & S{S::MostSignificantBit}; - auto msbCopiedToLSB = S{msb.value() >> (NB - 1)}; - return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB); -} - -template -struct ArithmeticResultTriplet { - SWAR result; - BooleanSWAR carry, overflow; -}; - - -template -constexpr ArithmeticResultTriplet -fullAddition(SWAR s1, SWAR s2) { - using S = SWAR; - constexpr auto - SignBit = S{S::MostSignificantBit}, - LowerBits = SignBit - S{S::LeastSignificantBit}; - // prevent overflow by clearing the most significant bits - auto - s1prime = LowerBits & s1, - s2prime = LowerBits & s2, - resultPrime = s1prime + s2prime, - s1Sign = SignBit & s1, - s2Sign = SignBit & s2, - signPrime = SignBit & resultPrime, - result = resultPrime ^ s1Sign ^ s2Sign, - // carry is set whenever at least two of the sign bits of s1, s2, - // signPrime are set - carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime), - // overflow: the inputs have the same sign and different to result - // same sign: s1Sign ^ s2Sign - overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result); - using BS = BooleanSWAR; - return { result, BS{carry.value()}, BS{overflow.value()} }; -}; - - -template -constexpr auto negate(SWAR input) { - using S = SWAR; - constexpr auto Ones = 
S{S::LeastSignificantBit}; - return fullAddition(~input, Ones).result; -} - -/// \brief Performs a generalized iterated application of an associative operator to a base -/// -/// In algebra, the repeated application of an operator to a "base" has different names depending on the -/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition", -/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n -/// power". -/// The general term in algebra is "iteration", hence the naming of this function. -/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we -/// keep the option to use the imprecise language of "product, base and exponent". "Iteration" has a very -/// different meaning in programming and especially different in C++. -/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this -/// function leverages the associative property of the operator to "halve" the count of iterations at each step. 
-/// \note There is a symmetrical operation to be implemented of associative iteration in the -/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb, -/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)" -/// \tparam Operator a callable with three arguments: the left and right arguments to the operation -/// and the count to be used, the "count" is an artifact of this generalization -/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not -/// be a number, the iteration count is part of the execution context to apply the operator -/// \param forSquaring is an artifact of this generalization -/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows -/// there are fewer iterations than what the type of exponent would allow -template< - typename Base, typename IterationCount, typename Operator, - // the critical use of associativity is that it allows halving the - // iteration count - typename CountHalver -> -constexpr auto associativeOperatorIterated_regressive( - Base base, Base neutral, IterationCount count, IterationCount forSquaring, - Operator op, unsigned log2Count, CountHalver ch -) { - auto result = neutral; - if(!log2Count) { return result; } - for(;;) { - result = op(result, base, count); - if(!--log2Count) { break; } - result = op(result, result, forSquaring); - count = ch(count); - } - return result; -} - -template -constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( - SWAR multiplicand, SWAR multiplier -) { - using S = SWAR; - - auto operation = [](auto left, auto right, auto counts) { - auto addendums = makeLaneMaskFromMSB(counts); - return left + (addendums & right); - }; - - auto halver = [](auto counts) { - auto msbCleared = counts & ~S{S::MostSignificantBit}; - return S{msbCleared.value() << 1}; - }; - - multiplier = S{multiplier.value() << (NB - 
ActualBits)}; - return associativeOperatorIterated_regressive( - multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation, - ActualBits, halver - ); -} - -/// \note Not removed yet because it is an example of "progressive" associative exponentiation -template -constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated( - SWAR multiplicand, - SWAR multiplier -) { - using S = SWAR; - constexpr auto LeastBit = S::LeastSignificantBit; - auto multiplicandDoubling = multiplicand.value(); - auto mplier = multiplier.value(); - auto product = S{0}; - for(auto count = ActualBits;;) { - auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier}); - product = product + (multiplicandDoublingMask & S{multiplicandDoubling}); - if(!--count) { break; } - multiplicandDoubling <<= 1; - auto leastBitCleared = mplier & ~LeastBit; - mplier = leastBitCleared >> 1; - } - return product; -} - -template -constexpr auto multiplication_OverflowUnsafe( - SWAR multiplicand, - SWAR multiplier -) { - return - multiplication_OverflowUnsafe_SpecificBitCount( - multiplicand, multiplier - ); -} - -template -struct SWAR_Pair{ - SWAR even, odd; -}; - -template -constexpr SWAR doublingMask() { - using S = SWAR; - static_assert(0 == S::Lanes % 2, "Only even number of elements supported"); - using D = SWAR; - return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit}; -} - -template -constexpr auto doublePrecision(SWAR input) { - using S = SWAR; - static_assert( - 0 == S::NSlots % 2, - "Precision can only be doubled for SWARs of even element count" - ); - using RV = SWAR; - constexpr auto DM = doublingMask(); - return SWAR_Pair{ - RV{(input & DM).value()}, - RV{(input.value() >> NB) & DM.value()} - }; -} - -template -constexpr auto halvePrecision(SWAR even, SWAR odd) { - using S = SWAR; - static_assert(0 == NB % 2, "Only even lane-bitcounts supported"); - using RV = SWAR; - constexpr auto HalvingMask = doublingMask(); - auto - evenHalf = RV{even.value()} & HalvingMask, - 
oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2}; - return evenHalf | oddHalf; -} - -} - -#endif +#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H +#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H + +#include "zoo/swar/SWAR.h" + +#define ZOO_DEVELOPMENT_DEBUGGING +#ifdef ZOO_DEVELOPMENT_DEBUGGING +#include + +inline std::ostream &binary(std::ostream &out, uint64_t input, int count) { + while(count--) { + out << (1 & input); + input >>= 1; + } + return out; +} + +template +std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR s) { + using S = zoo::swar::SWAR; + auto shiftCounter = sizeof(B) * 8 / NB; + out << "<|"; + auto v = s.value(); + do { + binary(out, v, NB) << '|'; + + } while(--shiftCounter); + return out << ">"; +} + +#define ZOO_TO_STRING(a) #a +// std::endl is needed within the context of debugging: flush the line +#define ZOO_TRACEABLE_EXP_IMPL(F, L, ...) std::cout << '"' << (__VA_ARGS__) << "\", \"" << F << ':' << L << "\", \"" << ZOO_TO_STRING(__VA_ARGS__) << "\"" << std::endl; +#define ZOO_TRACEABLE_EXPRESSION(...) ZOO_TRACEABLE_EXP_IMPL(__FILE__, __LINE__, __VA_ARGS__) + +#else + +#define ZOO_TRACEABLE_EXPRESSION(...) __VA_ARGS__ + +#endif + + +namespace zoo::swar { + +/// \note This code should be substituted by an application of "progressive" algebraic iteration +/// \note There is also parallelPrefix (to be implemented) +template +constexpr SWAR parallelSuffix(SWAR input) { + using S = SWAR; + auto + shiftClearingMask = S{static_cast(~S::MostSignificantBit)}, + doubling = input, + result = S{0}; + auto + bitsToXOR = NB, + power = 1; + #define ZTE(...) 
__VA_ARGS__ + for(;;) { + ZTE(doubling); + if(1 & bitsToXOR) { + ZTE(result = result ^ doubling); + ZTE(doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask)); + } + ZTE(bitsToXOR >>= 1); + if(!bitsToXOR) { break; } + auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); + ZTE(shifted); + ZTE(doubling = doubling ^ shifted); + // 01...1 + // 001...1 + // 00001...1 + // 000000001...1 + shiftClearingMask = + shiftClearingMask & + S{static_cast(shiftClearingMask.value() >> power)}; + ZTE(power <<= 1); + } + ZTE(input); + #undef ZTE + return S{result}; +} + +template +constexpr SWAR +compress(SWAR input, SWAR compressionMask) { + // the only bits turned on in the result are the bits set in the input that + // are moved down (shifted right) + + // Following Henry S. Warren Jr.'s Hacker's Delight, Section 7-4 + // The compression moves bits right as many positions as there are zeroes + // in the mask "below" it (or to the right). + // We can count the zeroes in the mask in a logarithmic way: + // First detect an odd count of zeroes, move those bits in the input one + // position down (right). + // Then an odd count of *pairs* of zeroes, moving them 2 positions right. + // Then an odd count of *quartets* (nibbles) of zeroes, shifting them 4 + // right. + // An odd count of octects (bytes) of zeroes, shifting right 8, + // Odd count of 16 zeroes, >> 16 + // ... + // + // This solution will use the parallel suffix operation as a primary tool: + // For every bit postion it indicates an odd number of ones to the right, + // including itself. + // Because we want to detect the "oddity" of groups of zeroes to the right, + // we flip the compression mask. To not count the bit position itself, + // we shift by one. 
+ #define ZTE ZOO_TRACEABLE_EXPRESSION + ZTE(input); + ZTE(compressionMask); + using S = SWAR; + auto result = input; + auto groupSize = 1; + auto shiftLeftMask = S{S::LowerBits}; + auto shiftRightMask = S{S::LowerBits << 1}; + auto forParallelSuffix = // this is called "mk" in the book + (~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask); + ZTE(forParallelSuffix); + // note: forParallelSuffix denotes positions with a zero + // immediately to the right in the 'mask' + auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book + parallelSuffix(forParallelSuffix); + ZTE(oddCountOfGroupsOfZerosToTheRight); + // compress the bits just identified in both the result and the mask + auto movingFromMask = compressionMask & oddCountOfGroupsOfZerosToTheRight; + ZTE(movingFromMask); + auto movingFromInput = result & oddCountOfGroupsOfZerosToTheRight; + /*compressionMask = + (compressionMask ^ movingFromMask) | + movingFromMask.shiftIntraLaneRight(groupSize, shiftRightMask);*/ + result = + (result ^ movingFromInput) | + movingFromInput.shiftIntraLaneLeft(groupSize, shiftRightMask); + + auto evenCountOfGroupsOfZerosToTheRight = + ~oddCountOfGroupsOfZerosToTheRight; + + //auto moved = toMove.shiftIntraLaneRight(1, ~S{S::LeastSignificantBit}); + //result = result ^ moved; + return result; + #undef ZTE +} + +/* +Complete example (32 bits) +Selection mask: +0001 0011 0111 0111 0110 1110 1100 1010 +Input (each letter or variable is a boolean, that can have 0 or 1) +abcd efgh ijkl mnop qrst uvxy zABC DEFG +Selection (using spaces) + d gh jkl nop rs uvx zA D F +Desired result: + dghjklnoprsuvxzADF + +0000 1001 1011 1011 1011 0111 0110 0101 shiftLeft 1 +1111 0110 0100 0100 0100 1000 1001 1010 forParallelSuffix + + 10 1101 1101 +/* +Complete example (32 bits) +Selection mask: +0001 0011 0111 0111 0110 1110 1100 1010 +Input (each letter or variable is a boolean, that can have 0 or 1) +abcd efgh ijkl mnop qrst uvxy zABC DEFG +Selection (using spaces) + d gh jkl nop rs 
uvx zA D F +Desired result: + dghjklnoprsuvxzADF + +0001 0011 0111 0111 0110 1110 1100 1010 compressionMask +1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask +1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1 == groupsize of ~compressionMask +This indicates the positions that have a 0 immediately to the right in compressionMask +4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the current position in forParallelSuffix, last decimal digit +0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of forParallelSuffix +we have just identified the positions that need to move an odd number of positions +filter those positions to positions that have a bit set in the compressionMask: +0001 0011 0111 0111 0110 1110 1100 1010 compressionMask +---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of the compressionMask by 1 == groupSize +0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits that will move) +---- ---- --11 1--- --1- -111 ---- ---1 mv >> 1 == groupSize +0001 0011 0011 1111 0010 0111 1100 1001 pseudo-compressed compressionMask. +0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of forParallelSuffix +1011 0111 0000 1111 0010 0001 1101 1001 ~mp == ~parallel suffix (bits not moved) +1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero immediately to their right) +1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved => had even zeroes to their right) +At this point, we have removed from compressionMask the positions that moved an odd number of positions and moved them 1 position, +then, we only keep positions that move an even number of positions. +Now, we will repeat these steps but for groups of two zeroes + + +Binary compress: A fascinating algorithm. +Warren (Hacker's Delight) believes Guy L. 
Steele is the author of the following binary compression algorithm: +From a "mask", a selector of bits from an input, we want to put them together in the output. +For example's sake, this is the selector: +Note: this follows the usual 'big endian' convention of denoting the most significant bit first +0001 0011 0111 0111 0110 1110 1100 1010 +Imagine the input is the 32-bit or 32-boolean variable expression +abcd efgh ijkl mnop qrst uvxy zABC DEFG +We want the selection + d gh jkl nop rs uvx zA D F +To be compressed into the output +0000 0000 0000 00dg hjkl nopr suvx zADF +This algorithm will virtually calculate the count of positions that the selected bits travel to the right, +by constructing the binary encoding of that count: +It will identify the positions that will travel an odd number of positions to the right, these are those +whose position-travel-count have the units set. +It will move those positions by one position to the right, and eliminate them from the yet-to-move positions. +Because it eliminates the positions that would move an odd count, there remains only positions that move +an even number of positions. Now it finds the positions that move an odd count of /pairs/ of positions, +and moves them 2 positions. +then an odd count of /quartets/ of positions, and moves them 4; +8, 16, 32, ... + +*/ + + +/// \todo because of the desirability of "accumuating" the XORs at the MSB, +/// the parallel suffix operation is more suitable. 
+template +constexpr SWAR parity(SWAR input) { + using S = SWAR; + auto preResult = parallelSuffix(input); + auto onlyMSB = preResult.value() & S::MostSignificantBit; + return S{onlyMSB}; +} + + +namespace impl { +template +constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR msb, SWAR lsb) { + auto msbCopiedDown = msb - lsb; + auto msbReintroduced = msbCopiedDown | msb; + return msbReintroduced; +} +} + +template +constexpr auto makeLaneMaskFromLSB(SWAR input) { + using S = SWAR; + auto lsb = input & S{S::LeastSignificantBit}; + auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)}; + return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb); +} + +template +constexpr auto makeLaneMaskFromMSB(SWAR input) { + using S = SWAR; + auto msb = input & S{S::MostSignificantBit}; + auto msbCopiedToLSB = S{msb.value() >> (NB - 1)}; + return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB); +} + +template +struct ArithmeticResultTriplet { + SWAR result; + BooleanSWAR carry, overflow; +}; + + +template +constexpr ArithmeticResultTriplet +fullAddition(SWAR s1, SWAR s2) { + using S = SWAR; + constexpr auto + SignBit = S{S::MostSignificantBit}, + LowerBits = SignBit - S{S::LeastSignificantBit}; + // prevent overflow by clearing the most significant bits + auto + s1prime = LowerBits & s1, + s2prime = LowerBits & s2, + resultPrime = s1prime + s2prime, + s1Sign = SignBit & s1, + s2Sign = SignBit & s2, + signPrime = SignBit & resultPrime, + result = resultPrime ^ s1Sign ^ s2Sign, + // carry is set whenever at least two of the sign bits of s1, s2, + // signPrime are set + carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime), + // overflow: the inputs have the same sign and different to result + // same sign: s1Sign ^ s2Sign + overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result); + using BS = BooleanSWAR; + return { result, BS{carry.value()}, BS{overflow.value()} }; +}; + + +template +constexpr auto negate(SWAR input) { + using S = SWAR; + constexpr auto Ones = 
S{S::LeastSignificantBit}; + return fullAddition(~input, Ones).result; +} + +/// \brief Performs a generalized iterated application of an associative operator to a base +/// +/// In algebra, the repeated application of an operator to a "base" has different names depending on the +/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition", +/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n +/// power". +/// The general term in algebra is "iteration", hence the naming of this function. +/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we +/// keep the option to use the imprecise language of "product, base and exponent". "Iteration" has a very +/// different meaning in programming and especially different in C++. +/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this +/// function leverages the associative property of the operator to "halve" the count of iterations at each step. 
+/// \note There is a symmetrical operation to be implemented of associative iteration in the +/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb, +/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)" +/// \tparam Operator a callable with three arguments: the left and right arguments to the operation +/// and the count to be used, the "count" is an artifact of this generalization +/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not +/// be a number, the iteration count is part of the execution context to apply the operator +/// \param forSquaring is an artifact of this generalization +/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows +/// there are fewer iterations than what the type of exponent would allow +template< + typename Base, typename IterationCount, typename Operator, + // the critical use of associativity is that it allows halving the + // iteration count + typename CountHalver +> +constexpr auto associativeOperatorIterated_regressive( + Base base, Base neutral, IterationCount count, IterationCount forSquaring, + Operator op, unsigned log2Count, CountHalver ch +) { + auto result = neutral; + if(!log2Count) { return result; } + for(;;) { + result = op(result, base, count); + if(!--log2Count) { break; } + result = op(result, result, forSquaring); + count = ch(count); + } + return result; +} + +template +constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( + SWAR multiplicand, SWAR multiplier +) { + using S = SWAR; + + auto operation = [](auto left, auto right, auto counts) { + auto addendums = makeLaneMaskFromMSB(counts); + return left + (addendums & right); + }; + + auto halver = [](auto counts) { + auto msbCleared = counts & ~S{S::MostSignificantBit}; + return S{msbCleared.value() << 1}; + }; + + multiplier = S{multiplier.value() << (NB - 
ActualBits)}; + return associativeOperatorIterated_regressive( + multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation, + ActualBits, halver + ); +} + +/// \note Not removed yet because it is an example of "progressive" associative exponentiation +template +constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated( + SWAR multiplicand, + SWAR multiplier +) { + using S = SWAR; + constexpr auto LeastBit = S::LeastSignificantBit; + auto multiplicandDoubling = multiplicand.value(); + auto mplier = multiplier.value(); + auto product = S{0}; + for(auto count = ActualBits;;) { + auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier}); + product = product + (multiplicandDoublingMask & S{multiplicandDoubling}); + if(!--count) { break; } + multiplicandDoubling <<= 1; + auto leastBitCleared = mplier & ~LeastBit; + mplier = leastBitCleared >> 1; + } + return product; +} + +template +constexpr auto multiplication_OverflowUnsafe( + SWAR multiplicand, + SWAR multiplier +) { + return + multiplication_OverflowUnsafe_SpecificBitCount( + multiplicand, multiplier + ); +} + +template +struct SWAR_Pair{ + SWAR even, odd; +}; + +template +constexpr SWAR doublingMask() { + using S = SWAR; + static_assert(0 == S::Lanes % 2, "Only even number of elements supported"); + using D = SWAR; + return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit}; +} + +template +constexpr auto doublePrecision(SWAR input) { + using S = SWAR; + static_assert( + 0 == S::NSlots % 2, + "Precision can only be doubled for SWARs of even element count" + ); + using RV = SWAR; + constexpr auto DM = doublingMask(); + return SWAR_Pair{ + RV{(input & DM).value()}, + RV{(input.value() >> NB) & DM.value()} + }; +} + +template +constexpr auto halvePrecision(SWAR even, SWAR odd) { + using S = SWAR; + static_assert(0 == NB % 2, "Only even lane-bitcounts supported"); + using RV = SWAR; + constexpr auto HalvingMask = doublingMask(); + auto + evenHalf = RV{even.value()} & HalvingMask, + 
oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2}; + return evenHalf | oddHalf; +} + +/* +template +constexpr auto compress(SWAR input, SWAR mask) { + using S = SWAR; + // Follows Henry S. Warren's "Hacker's Delight" 7-4 + auto movers = input & mask; + // The mechanism detects positions with an odd number of zeroes to the + // right. + // To count odd zeroes, invert the mask + // The "parallel suffix" gives this, but including the position, to exclude + // the position, shift left by one + auto preOddZeroesToTheRight = ~S{~mask.value() << 1}; + auto oddZeroesToTheRight = parallelSuffix(preOddZeroesToTheRight); + auto moveSelector1 = oddZeroesToTheRight & mask; + auto shiftRightMask = ~S::LeastSignificantBit; + auto move1 = moveSelector1 & movers; + auto result = (moveSelector1 ^ move1) | movers.shiftIntraLaneRight(1, shiftRightMask); + return result; +} +*/ + +} + +#endif diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index af5a1016..3ef469d4 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -101,6 +101,18 @@ TEST_CASE( } } +TEST_CASE("Compress/Expand", "[swar]") { + unsigned Mask = 0b0001'0011'0111'0111'0110'1110'1100'1010; + unsigned ToMove = 0x55555555; + using S1_32 = SWAR<32, uint32_t>; + auto q = compress(S1_32{ToMove}, S1_32{Mask}); + CHECK(0 != q.value()); + using S2_8 = SWAR<2, uint8_t>; + auto r = compress(S2_8{0b10'10'10'10}, S2_8{0b11'10'00'00}); + S2_8 expected{0b10'01'00'00}; + CHECK(expected.value() == r.value()); +} + static_assert(1 == popcount<5>(0x100ull)); static_assert(1 == popcount<5>(0x010ull)); static_assert(1 == popcount<5>(0x001ull)); From efeb812bd0d8b01750c6e6233e253e5fb205c0aa Mon Sep 17 00:00:00 2001 From: Eddie Date: Fri, 23 Feb 2024 11:06:11 -0800 Subject: [PATCH 2/4] Compress tested successfully --- inc/zoo/swar/SWAR.h | 34 +++-- inc/zoo/swar/associative_iteration.h | 218 +++++++++++++++------------ test/swar/BasicOperations.cpp | 13 +- 3 files changed, 143 
insertions(+), 122 deletions(-) diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 4d47dc17..cd44906c 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -132,22 +132,24 @@ struct SWAR { /// \brief as the name suggests /// \param protectiveMask should clear the bits that would cross the lane. - /// The bits that will be cleared are directly related to the count of shifts, it is natural to maintain - /// the protective mask by the caller, otherwise, the mask will be computed on all invocations. - /// We are not sure the optimizer would maintain this mask somewhere, if it was to recalculate it it would be disastrous for performance. - constexpr SWAR - shiftIntraLaneLeft(int bitCount, SWAR protectiveMask) const noexcept { - T shiftC = static_cast(bitCount); // could be a narrowing conversion - auto V = (*this & protectiveMask).value(); - return SWAR{static_cast(V << shiftC)}; - } - - /// \param protectiveMask should clear the bits that would cross the lane - /// \sa shiftIntraLaneLeft - constexpr SWAR - shiftIntraLaneRight(int bitCount, SWAR protectiveMask) const noexcept { - return SWAR{(*this & protectiveMask).value() >> T{bitCount}}; - } + /// The bits that will be cleared are directly related to the count of + /// shifts, it is natural to maintain the protective mask by the caller, + /// otherwise, the mask would have to be computed in all invocations. 
+ /// We are not sure the optimizer would maintain this mask somewhere, if it + /// were to recalculate it, it would be disastrous for performance + /// \note the \c static_cast are necessary because of narrowing conversions + #define SHIFT_INTRALANE_OP_X_LIST X(Left, <<) X(Right, >>) + #define X(name, op) \ + constexpr SWAR \ + shiftIntraLane##name(int bitCount, SWAR protectiveMask) const noexcept { \ + T shiftC = static_cast(bitCount); \ + auto V = (*this & protectiveMask).value(); \ + auto rv = static_cast(V op shiftC); \ + return SWAR{rv}; \ + } + SHIFT_INTRALANE_OP_X_LIST + #undef X + #undef SHIFT_INTRALANE_OP_X_LIST T m_v; }; diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 53e81425..ba29b350 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -80,68 +80,42 @@ constexpr SWAR parallelSuffix(SWAR input) { return S{result}; } -template -constexpr SWAR -compress(SWAR input, SWAR compressionMask) { - // the only bits turned on in the result are the bits set in the input that - // are moved down (shifted right) - - // Following Henry S. Warren Jr.'s Hacker's Delight, Section 7-4 - // The compression moves bits right as many positions as there are zeroes - // in the mask "below" it (or to the right). - // We can count the zeroes in the mask in a logarithmic way: - // First detect an odd count of zeroes, move those bits in the input one - // position down (right). - // Then an odd count of *pairs* of zeroes, moving them 2 positions right. - // Then an odd count of *quartets* (nibbles) of zeroes, shifting them 4 - // right. - // An odd count of octects (bytes) of zeroes, shifting right 8, - // Odd count of 16 zeroes, >> 16 - // ... - // - // This solution will use the parallel suffix operation as a primary tool: - // For every bit postion it indicates an odd number of ones to the right, - // including itself. 
- // Because we want to detect the "oddity" of groups of zeroes to the right, - // we flip the compression mask. To not count the bit position itself, - // we shift by one. - #define ZTE ZOO_TRACEABLE_EXPRESSION - ZTE(input); - ZTE(compressionMask); - using S = SWAR; - auto result = input; - auto groupSize = 1; - auto shiftLeftMask = S{S::LowerBits}; - auto shiftRightMask = S{S::LowerBits << 1}; - auto forParallelSuffix = // this is called "mk" in the book - (~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask); - ZTE(forParallelSuffix); - // note: forParallelSuffix denotes positions with a zero - // immediately to the right in the 'mask' - auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book - parallelSuffix(forParallelSuffix); - ZTE(oddCountOfGroupsOfZerosToTheRight); - // compress the bits just identified in both the result and the mask - auto movingFromMask = compressionMask & oddCountOfGroupsOfZerosToTheRight; - ZTE(movingFromMask); - auto movingFromInput = result & oddCountOfGroupsOfZerosToTheRight; - /*compressionMask = - (compressionMask ^ movingFromMask) | - movingFromMask.shiftIntraLaneRight(groupSize, shiftRightMask);*/ - result = - (result ^ movingFromInput) | - movingFromInput.shiftIntraLaneLeft(groupSize, shiftRightMask); - - auto evenCountOfGroupsOfZerosToTheRight = - ~oddCountOfGroupsOfZerosToTheRight; - - //auto moved = toMove.shiftIntraLaneRight(1, ~S{S::LeastSignificantBit}); - //result = result ^ moved; - return result; - #undef ZTE -} - /* +Binary compress: A fascinating algorithm. + +Warren (Hacker's Delight) believes Guy L. Steele is the author of the following +binary compression operation, equivalent to Intel's BMI2 instruction PEXT of +"Parallel Extraction" + +From a "mask", a selector of bits from an input, we want to put them together in +the output. 
+ +For example's sake, this is the selector: +Note: this follows the usual 'big endian' convention of denoting the most +significant bit first: +0001 0011 0111 0111 0110 1110 1100 1010 +Imagine the input is the 32-bit or 32-boolean variable expression +abcd efgh ijkl mnop qrst uvxy zABC DEFG +We want the selection + d gh jkl nop rs uvx zA D F +To be compressed into the output +0000 0000 0000 00dg hjkl nopr suvx zADF + +This algorithm will virtually calculate the count of positions that the selected +bits travel to the right, by constructing the binary encoding of that count: +It will identify the positions that will travel an odd number of positions to +the right, these are those whose position-travel-count have the units set. +It will then move those positions by one position to the right, and eliminate +them from the yet-to-move positions. Because it eliminates the positions that +would move an odd count, there remains only positions that move an even number +of positions. Now it finds the positions that move an odd count of /pairs/ of +positions, it moves them 2 positions. This is equivalent to finding the +positions that would have the bit for 2 set in the count of positions to move +right. +Then an odd count of /quartets/ of positions, and moves them 4; +8, 16, 32, ... 
+ + Complete example (32 bits) Selection mask: 0001 0011 0111 0111 0110 1110 1100 1010 @@ -169,51 +143,97 @@ Desired result: 0001 0011 0111 0111 0110 1110 1100 1010 compressionMask 1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask -1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1 == groupsize of ~compressionMask -This indicates the positions that have a 0 immediately to the right in compressionMask -4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the current position in forParallelSuffix, last decimal digit -0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of forParallelSuffix -we have just identified the positions that need to move an odd number of positions -filter those positions to positions that have a bit set in the compressionMask: +1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1 + == groupsize of ~compressionMask +This indicates the positions that have a 0 immediately to the right in + compressionMask +4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the + current position in forParallelSuffix, + last decimal digit +0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of + forParallelSuffix +We have just identified the positions that need to move an odd number of +positions. 
Filter them with positions with a bit set in compressionMask: 0001 0011 0111 0111 0110 1110 1100 1010 compressionMask ----- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of the compressionMask by 1 == groupSize -0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits that will move) +---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of + compressionMask by 1 == groupSize +0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits + that will move) ---- ---- --11 1--- --1- -111 ---- ---1 mv >> 1 == groupSize 0001 0011 0011 1111 0010 0111 1100 1001 pseudo-compressed compressionMask. -0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of forParallelSuffix +0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of + forParallelSuffix 1011 0111 0000 1111 0010 0001 1101 1001 ~mp == ~parallel suffix (bits not moved) -1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero immediately to their right) -1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved => had even zeroes to their right) -At this point, we have removed from compressionMask the positions that moved an odd number of positions and moved them 1 position, +1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero + immediately to their right) +1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved => + had even zeroes to their right) +At this point, we have removed from compressionMask the positions that moved an +odd number of positions and moved them 1 position, then, we only keep positions that move an even number of positions. -Now, we will repeat these steps but for groups of two zeroes - - -Binary compress: A fascinating algorithm. -Warren (Hacker's Delight) believes Guy L. 
Steele is the author of the following binary compression algorithm: -From a "mask", a selector of bits from an input, we want to put them together in the output. -For example's sake, this is the selector: -Note: this follows the usual 'big endian' convention of denoting the most significant bit first -0001 0011 0111 0111 0110 1110 1100 1010 -Imagine the input is the 32-bit or 32-boolean variable expression -abcd efgh ijkl mnop qrst uvxy zABC DEFG -We want the selection - d gh jkl nop rs uvx zA D F -To be compressed into the output -0000 0000 0000 00dg hjkl nopr suvx zADF -This algorithm will virtually calculate the count of positions that the selected bits travel to the right, -by constructing the binary encoding of that count: -It will identify the positions that will travel an odd number of positions to the right, these are those -whose position-travel-count have the units set. -It will move those positions by one position to the right, and eliminate them from the yet-to-move positions. -Because it eliminates the positions that would move an odd count, there remains only positions that move -an even number of positions. Now it finds the positions that move an odd count of /pairs/ of positions, -and moves them 2 positions. -then an odd count of /quartets/ of positions, and moves them 4; -8, 16, 32, ... - +Now, we will repeat these steps but for groups of two zeroes, then 4 zeroes, ... */ +template +constexpr SWAR +compress(SWAR input, SWAR compressionMask) { + // This solution uses the parallel suffix operation as a primary tool: + // For every bit position it indicates an odd number of ones to the right, + // including itself. + // Because we want to detect the "oddness" of groups of zeroes to the right, + // we flip the compression mask. To not count the bit position itself, + // we shift by one.
+ // #define ZTE ZOO_TRACEABLE_EXPRESSION + ZTE(input); + ZTE(compressionMask); + using S = SWAR; + auto result = input & compressionMask; + auto groupSize = 1; + auto + shiftLeftMask = S{S::LowerBits}, + shiftRightMask = S{S::LowerBits << 1}; + ZTE(~compressionMask); + auto forParallelSuffix = // this is called "mk" in the book + (~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask); + ZTE(forParallelSuffix); + // note: forParallelSuffix denotes positions with a zero + // immediately to the right in 'compressionMask' + do { + ZTE(groupSize); + ZTE(shiftLeftMask); + ZTE(shiftRightMask); + ZTE(result); + auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book + parallelSuffix(forParallelSuffix); + ZTE(oddCountOfGroupsOfZerosToTheRight); + // compress the bits just identified in both the result and the mask + auto moving = compressionMask & oddCountOfGroupsOfZerosToTheRight; + ZTE(moving); + compressionMask = + (compressionMask ^ moving) | // clear the moving + moving.shiftIntraLaneRight(groupSize, shiftRightMask); + ZTE(compressionMask); + auto movingFromInput = result & moving; + result = + (result ^ movingFromInput) | // clear the moving from the result + movingFromInput.shiftIntraLaneRight(groupSize, shiftRightMask); + + auto evenCountOfGroupsOfZerosToTheRight = + ~oddCountOfGroupsOfZerosToTheRight; + forParallelSuffix = + forParallelSuffix & evenCountOfGroupsOfZerosToTheRight; + auto newShiftLeftMask = + shiftLeftMask.shiftIntraLaneRight(groupSize, shiftRightMask); + shiftRightMask = + shiftRightMask.shiftIntraLaneLeft(groupSize, shiftLeftMask); + shiftLeftMask = newShiftLeftMask; + groupSize <<= 1; + } while(groupSize < NB); + ZTE(result); + #undef ZTE + return result; +} + /// \todo because of the desirability of "accumuating" the XORs at the MSB, /// the parallel suffix operation is more suitable. 
diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 3ef469d4..83038df0 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -102,15 +102,14 @@ TEST_CASE( } TEST_CASE("Compress/Expand", "[swar]") { - unsigned Mask = 0b0001'0011'0111'0111'0110'1110'1100'1010; - unsigned ToMove = 0x55555555; + unsigned + Mask = 0b0001'0011'0111'0111'0110'1110'1100'1010, + ToMove = 0b0101'0101'0101'0101'0101'0101'0101'0101, + // Selection: 1 01 101 101 10 010 01 0 0 + result = 0b0001'0'1'1'0'1'1'0'1'10'0'10'0'1'0'0; using S1_32 = SWAR<32, uint32_t>; auto q = compress(S1_32{ToMove}, S1_32{Mask}); - CHECK(0 != q.value()); - using S2_8 = SWAR<2, uint8_t>; - auto r = compress(S2_8{0b10'10'10'10}, S2_8{0b11'10'00'00}); - S2_8 expected{0b10'01'00'00}; - CHECK(expected.value() == r.value()); + CHECK(result == q.value()); } static_assert(1 == popcount<5>(0x100ull)); From 2d2d9a8baa289d963c08cf5d072c0d4317fb153a Mon Sep 17 00:00:00 2001 From: Eddie Date: Fri, 23 Feb 2024 11:20:58 -0800 Subject: [PATCH 3/4] Cleaning --- inc/zoo/swar/associative_iteration.h | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index ba29b350..f5c1ad7e 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -3,7 +3,7 @@ #include "zoo/swar/SWAR.h" -#define ZOO_DEVELOPMENT_DEBUGGING +//#define ZOO_DEVELOPMENT_DEBUGGING #ifdef ZOO_DEVELOPMENT_DEBUGGING #include @@ -54,7 +54,7 @@ constexpr SWAR parallelSuffix(SWAR input) { auto bitsToXOR = NB, power = 1; - #define ZTE(...) __VA_ARGS__ + #define ZTE(...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__) for(;;) { ZTE(doubling); if(1 & bitsToXOR) { @@ -183,7 +183,7 @@ compress(SWAR input, SWAR compressionMask) { // Because we want to detect the "oddness" of groups of zeroes to the right, // we flip the compression mask. 
To not count the bit position itself, // we shift by one. - // #define ZTE ZOO_TRACEABLE_EXPRESSION + #define ZTE(...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__) ZTE(input); ZTE(compressionMask); using S = SWAR; @@ -451,27 +451,6 @@ constexpr auto halvePrecision(SWAR even, SWAR odd) { return evenHalf | oddHalf; } -/* -template -constexpr auto compress(SWAR input, SWAR mask) { - using S = SWAR; - // Follows Henry S. Warren's "Hacker's Delight" 7-4 - auto movers = input & mask; - // The mechanism detects positions with an odd number of zeroes to the - // right. - // To count odd zeroes, invert the mask - // The "parallel suffix" gives this, but including the position, to exclude - // the position, shift left by one - auto preOddZeroesToTheRight = ~S{~mask.value() << 1}; - auto oddZeroesToTheRight = parallelSuffix(preOddZeroesToTheRight); - auto moveSelector1 = oddZeroesToTheRight & mask; - auto shiftRightMask = ~S::LeastSignificantBit; - auto move1 = moveSelector1 & movers; - auto result = (moveSelector1 ^ move1) | movers.shiftIntraLaneRight(1, shiftRightMask); - return result; -} -*/ - } #endif From d8c2875b75c2347bd88537865ff1387f257bcd92 Mon Sep 17 00:00:00 2001 From: Eddie Date: Fri, 23 Feb 2024 18:31:54 -0800 Subject: [PATCH 4/4] Minor improvement --- inc/zoo/swar/associative_iteration.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index d45ad3aa..7c6cab3b 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -198,7 +198,7 @@ compress(SWAR input, SWAR compressionMask) { ZTE(forParallelSuffix); // note: forParallelSuffix denotes positions with a zero // immediately to the right in 'compressionMask' - do { + for(;;) { ZTE(groupSize); ZTE(shiftLeftMask); ZTE(shiftRightMask); @@ -217,7 +217,10 @@ compress(SWAR input, SWAR compressionMask) { result = (result ^ movingFromInput) | // clear the moving from the result 
movingFromInput.shiftIntraLaneRight(groupSize, shiftRightMask); - + auto nextGroupSize = groupSize << 1; + if(NB <= nextGroupSize) { + break; + } auto evenCountOfGroupsOfZerosToTheRight = ~oddCountOfGroupsOfZerosToTheRight; forParallelSuffix = @@ -227,8 +230,8 @@ compress(SWAR input, SWAR compressionMask) { shiftRightMask = shiftRightMask.shiftIntraLaneLeft(groupSize, shiftLeftMask); shiftLeftMask = newShiftLeftMask; - groupSize <<= 1; - } while(groupSize < NB); + groupSize = nextGroupSize; + } ZTE(result); #undef ZTE return result;