diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index d2bdd6e8..d25fd457 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -56,7 +56,9 @@ struct SWAR { NSlots = Lanes, PaddingBitsCount = BitWidth % NBits, SignificantBitsCount = BitWidth - PaddingBitsCount, - AllOnes = ~std::make_unsigned_t{0} >> PaddingBitsCount; + AllOnes = ~std::make_unsigned_t{0} >> PaddingBitsCount, + LeastSignificantBit = meta::BitmaskMaker{1}, NBits>::value, + MostSignificantBit = LeastSignificantBit << (NBits - 1); SWAR() = default; constexpr explicit SWAR(T v): m_v(v) {} @@ -68,7 +70,7 @@ struct SWAR { X(SWAR, ~) //constexpr SWAR operator~() const noexcept { return SWAR{~m_v}; } #define SWAR_BINARY_OPERATORS_X_LIST \ - X(SWAR, &) X(SWAR, ^) X(SWAR, |) X(SWAR, -) X(SWAR, +) X(SWAR, *) + X(SWAR, &) X(SWAR, ^) X(SWAR, |) X(SWAR, -) X(SWAR, +) #define X(rt, op) constexpr rt operator op() const noexcept { return rt(op m_v); } SWAR_UNARY_OPERATORS_X_LIST @@ -106,6 +108,17 @@ struct SWAR { return SWAR(m_v | (T(1) << (index * NBits + bit))); } + constexpr auto blitElement(int index, T value) const noexcept { + auto elementMask = ((T(1) << NBits) - 1) << (index * NBits); + return SWAR((m_v & ~elementMask) | (value << (index * NBits))); + } + + constexpr SWAR blitElement(int index, SWAR other) const noexcept { + constexpr auto OneElementMask = SWAR(~(~T(0) << NBits)); + auto IsolationMask = OneElementMask.shiftLanesLeft(index); + return (*this & ~IsolationMask) | (other & IsolationMask); + } + constexpr SWAR shiftLanesLeft(int laneCount) const noexcept { return SWAR(value() << (NBits * laneCount)); } @@ -114,15 +127,21 @@ struct SWAR { return SWAR(value() >> (NBits * laneCount)); } - constexpr auto blitElement(int index, T value) const noexcept { - auto elementMask = ((T(1) << NBits) - 1) << (index * NBits); - return SWAR((m_v & ~elementMask) | (value << (index * NBits))); + /// \brief Shifts the bits kept by protectiveMask left by bitCount positions + /// \param protectiveMask should clear the bits that would cross the lane. 
+ /// The bits that will be cleared are directly related to the count of shifts, it is natural to maintain + /// the protective mask by the caller, otherwise, the mask will be computed on all invocations. + /// We are not sure the optimizer would maintain this mask somewhere; if it were to recalculate it, it would be disastrous for performance. + constexpr SWAR + shiftIntraLaneLeft(int bitCount, SWAR protectiveMask) const noexcept { + return SWAR{(*this & protectiveMask).value() << bitCount}; } - constexpr SWAR blitElement(int index, SWAR other) const noexcept { - constexpr auto OneElementMask = SWAR(~(~T(0) << NBits)); - auto IsolationMask = OneElementMask.shiftLanesLeft(index); - return (*this & ~IsolationMask) | (other & IsolationMask); + /// \param protectiveMask should clear the bits that would cross the lane + /// \sa shiftIntraLaneLeft + constexpr SWAR + shiftIntraLaneRight(int bitCount, SWAR protectiveMask) const noexcept { + return SWAR{(*this & protectiveMask).value() >> bitCount}; } T m_v; @@ -299,7 +318,7 @@ constexpr auto broadcast(SWAR v) { /// BooleanSWAR treats the MSB of each SWAR lane as the boolean associated with that lane. template struct BooleanSWAR: SWAR { - // Booleanness is stored in MSB of a given swar. 
+ // Booleanness is stored in the MSBs static constexpr auto MaskLaneMSB = broadcast(SWAR(T(1) << (NBits -1))); constexpr explicit BooleanSWAR(T v): SWAR(v) {} diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index b4480e54..db826eda 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -4,9 +4,334 @@ #include +namespace zoo::swar { + +/// \note This code should be substituted by an application of "progressive" algebraic iteration +/// \note There is also parallelPrefix (to be implemented) +template +constexpr SWAR parallelSuffix(SWAR input) { + using S = SWAR; + auto + shiftClearingMask = S{~S::MostSignificantBit}, + doubling = input, + result = S{0}; + auto + bitsToXOR = NB, + power = 1; + for(;;) { + if(1 & bitsToXOR) { + result = result ^ doubling; + doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask); + } + bitsToXOR >>= 1; + if(!bitsToXOR) { break; } + auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); + doubling = doubling ^ shifted; + // 01...1 + // 001...1 + // 00001...1 + // 000000001...1 + shiftClearingMask = + shiftClearingMask & S{shiftClearingMask.value() >> power}; + power <<= 1; + } + return S{result}; +} + +/// \todo because of the desirability of "accumulating" the XORs at the MSB, +/// the parallel suffix operation is more suitable. +template +constexpr SWAR parity(SWAR input) { + using S = SWAR; + auto preResult = parallelSuffix(input); + auto onlyMSB = preResult.value() & S::MostSignificantBit; + return S{onlyMSB}; +} + + +/* +Execution trace at two points: +1. when checking `if(1 & count)` +2. when checking `if(!count)` +If the variable did not change from the last value, it may be omitted +input Count x d power mask +1 1 0 x0 1 01111... + 0 x0 +2 2 0 x0 1 01111... + 1 0 x0 1 01111... + 1 0 x01 2 00111... + 0 x01 +3 3 0 x0 1 01111... + 1 x0 x1 1 01111... + 1 x0 x12 2 00111... + 0 x012 x23 +4 4 0 x0 1 01111... + 2 0 x0 1 01111... + 2 0 x01 2 00111... 
+ 1 0 x01 + 1 0 x0123 4 00001... + 0 x0123 x01234567 +5 5 0 x0 1 01111... + 2 x0 x1 + 2 x0 x12 2 00111... + 1 + 1 x1234 4 00001... + 0 x01234 +6 6 0 x0 1 01111... + 3 + 3 x01 2 00111... + 1 x01 x23 + 1 x2345 4 00001... + 0 x012345 x6789 +7 7 0 x0 1 01...... + 3 x0 x1 + 3 x12 2 001..... + 1 x012 x34 + 1 x3456 4 00001... + 0 x0-6 x789A +25 = 16 + 8 + 1 +25 25 0 x0 1 01111... + 12 x0 x1 + 12 x12 2 00111... + 6 + 6 x1234 4 {0}4 + 3 + 3 x1-8 8 {0}8 + 1 x0-8 x9-16 + 1 x9-24 16 {0}16 + 0 x0-24 x25-? +*/ + +template +struct ArithmeticResultTriplet { + SWAR result; + BooleanSWAR carry, overflow; +}; + +namespace impl { +template +constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR msb, SWAR lsb) { + auto msbCopiedDown = msb - lsb; + auto msbReintroduced = msbCopiedDown | msb; + return msbReintroduced; +} +} + +template +constexpr auto makeLaneMaskFromLSB(SWAR input) { + using S = SWAR; + auto lsb = input & S{S::LeastSignificantBit}; + auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)}; + return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb); +} + +template +constexpr auto makeLaneMaskFromMSB(SWAR input) { + using S = SWAR; + auto msb = input & S{S::MostSignificantBit}; + auto msbCopiedToLSB = S{msb.value() >> (NB - 1)}; + return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB); +} + +template +constexpr auto fullAddition(SWAR s1, SWAR s2) { + using S = SWAR; + constexpr auto + SignBit = S{S::MostSignificantBit}, + LowerBits = SignBit - S{S::LeastSignificantBit}; + // prevent overflow by clearing the most significant bits + auto + s1prime = LowerBits & s1, + s2prime = LowerBits & s2, + resultPrime = s1prime + s2prime, + s1Sign = SignBit & s1, + s2Sign = SignBit & s2, + signPrime = SignBit & resultPrime, + result = resultPrime ^ s1Sign ^ s2Sign, + // carry is set whenever at least two of the sign bits of s1, s2, + // signPrime are set + carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime), + // overflow: the inputs have the same sign and different 
to result + // same sign: s1Sign ^ s2Sign ^ SignBit + overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result); + return ArithmeticResultTriplet(result, carry, overflow); +} + +/// \brief Performs a generalized iterated application of an associative operator to a base +/// +/// In algebra, the repeated application of an operator to a "base" has different names depending on the +/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition", +/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n power" +/// the generic term we use is "iteration" for naming this function. +/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we +/// keep the option to use the imprecise language of "product, base and exponent". "Iteration" has a very +/// different meaning in programming and especially different in C++. +/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this +/// function leverages the associative property of the operator to "halve" the count of iterations at each step. 
+/// \note There is a symmetrical operation to be implemented of associative iteration in the +/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb, +/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)" +/// \tparam Operator a callable with three arguments: the left and right arguments to the operation +/// and the count to be used, the "count" is an artifact of this generalization +/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not +/// be a number, the iteration count is part of the execution context to apply the operator +/// \param forSquaring is an artifact of this generalization +/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows +/// there are fewer iterations than what the type of exponent would allow +template< + typename Base, typename IterationCount, typename Operator, + // the critical use of associativity is that it allows halving the + // iteration count + typename CountHalver +> +constexpr auto associativeOperatorIterated_regressive( + Base base, Base neutral, IterationCount count, IterationCount forSquaring, + Operator op, unsigned log2Count, CountHalver ch +) { + auto result = neutral; + if(!log2Count) { return result; } + for(;;) { + result = op(result, base, count); + if(!--log2Count) { break; } + result = op(result, result, forSquaring); + count = ch(count); + } + return result; +} + +template +constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( + SWAR multiplicand, SWAR multiplier +) { + using S = SWAR; + + auto operation = [](auto left, auto right, auto counts) { + auto addendums = makeElementMaskFromMSB(counts); /* NOTE(review): the only such helper visible in this patch is makeLaneMaskFromMSB; confirm makeElementMaskFromMSB is declared elsewhere */ + return left + (addendums & right); + }; + + auto halver = [](auto counts) { + auto msbCleared = counts & 
~S{S::MostSignificantBit}; + return S{msbCleared.value() << 1}; + }; + + multiplier = S{multiplier.value() << (NB - 
ActualBits)}; + return associativeOperatorIterated_regressive( + multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation, + ActualBits, halver + ); +} + +/// \note Not removed yet because it is an example of "progressive" associative exponentiation +template +constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated( + SWAR multiplicand, + SWAR multiplier +) { + using S = SWAR; + constexpr auto LeastBit = S::LeastSignificantBit; + auto multiplicandDoubling = multiplicand.value(); + auto mplier = multiplier.value(); + auto product = S{0}; + for(auto count = ActualBits;;) { + auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier}); + product = product + (multiplicandDoublingMask & S{multiplicandDoubling}); + if(!--count) { break; } + multiplicandDoubling <<= 1; + auto leastBitCleared = mplier & ~LeastBit; + mplier = leastBitCleared >> 1; + } + return product; +} + +template +constexpr auto multiplication_OverflowUnsafe( + SWAR multiplicand, + SWAR multiplier +) { + return + multiplication_OverflowUnsafe_SpecificBitCount( + multiplicand, multiplier + ); +} + +template +struct SWAR_Pair{ + SWAR even, odd; +}; + +template +constexpr SWAR doublingMask() { + using S = SWAR; + static_assert(0 == S::Lanes % 2, "Only even number of elements supported"); + using D = SWAR; + return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit}; +} + +template +constexpr auto doublePrecision(SWAR input) { + using S = SWAR; + static_assert( + 0 == S::NSlots % 2, + "Precision can only be doubled for SWARs of even element count" + ); + using RV = SWAR; + constexpr auto DM = doublingMask(); + return SWAR_Pair{ + RV{(input & DM).value()}, + RV{(input.value() >> NB) & DM.value()} + }; +} + +template +constexpr auto halvePrecision(SWAR even, SWAR odd) { + using S = SWAR; + static_assert(0 == NB % 2, "Only even lane-bitcounts supported"); + using RV = SWAR; + constexpr auto HalvingMask = doublingMask(); + auto + evenHalf = RV{even.value()} & HalvingMask, + 
oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2}; + return evenHalf | oddHalf; +} + +} + using namespace zoo; using namespace zoo::swar; +namespace Multiplication { + +static_assert(0x0F0F0F0F == doublingMask<4, uint32_t>().value()); + +constexpr auto PrecisionFixtureTest = 0x89ABCDEF; +constexpr auto Doubled = + doublePrecision(SWAR<4, uint32_t>{PrecisionFixtureTest}); + +static_assert(0x090B0D0F == Doubled.even.value()); +static_assert(0x080A0C0E == Doubled.odd.value()); +static_assert(PrecisionFixtureTest == halvePrecision(Doubled.even, Doubled.odd).value()); + +constexpr SWAR<8, u32> Micand{0x5030201}; +constexpr SWAR<8, u32> Mplier{0xA050301}; + +// expected: +// 5*0xA = 5*10 = 50 = 0x32, +// 3*5 = 15 = 0xF, +// 3*2 = 6, +// 1*1 = 1 +constexpr auto Expected = 0x320F0601; + +static_assert( + Expected == multiplication_OverflowUnsafe(Micand, Mplier).value() +); +static_assert( + 0x320F0601 != // intentionally use a too-small bit count + multiplication_OverflowUnsafe_SpecificBitCount<3>(Micand, Mplier).value() +); + +} + #define HE(nbits, t, v0, v1) \ static_assert(horizontalEquality(\ SWAR(v0),\ @@ -21,6 +346,29 @@ HE(3, u8, 0xFF, 0x7); HE(2, u8, 0xAA, 0x2); #undef HE +TEST_CASE("Old version", "[deprecated][swar]") { + SWAR<8, u32> Micand{0x5030201}; + SWAR<8, u32> Mplier{0xA050301}; + auto Expected = 0x320F0601; + auto result = + multiplication_OverflowUnsafe_SpecificBitCount_deprecated<4>( + Micand, Mplier + ); + CHECK(Expected == result.value()); +} + +TEST_CASE("Parity", "[swar]") { + // For each nibble, E indicates (E)ven and O (O)dd parities + // EEOEEOOO + auto Examples = 0xFF13A7E4; + SWAR<4, u32> casesBy4{Examples}; + SWAR<8, u32> casesBy8{Examples}; + auto by4 = parity(casesBy4); + auto by8 = parity(casesBy8); + CHECK(by4.value() == 0x00800888); + CHECK(by8.value() == 0x00808000); +} + TEST_CASE( "Isolate", "[swar]"