diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h new file mode 100644 index 00000000..b2201823 --- /dev/null +++ b/inc/zoo/swar/associative_iteration.h @@ -0,0 +1,258 @@ +#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H +#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H + +#include "zoo/swar/SWAR.h" + +namespace zoo::swar { + +/// \note This code should be substituted by an application of "progressive" algebraic iteration +/// \note There is also parallelPrefix (to be implemented) +template +constexpr SWAR parallelSuffix(SWAR input) { + using S = SWAR; + auto + shiftClearingMask = S{~S::MostSignificantBit}, + doubling = input, + result = S{0}; + auto + bitsToXOR = NB, + power = 1; + for(;;) { + if(1 & bitsToXOR) { + result = result ^ doubling; + doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask); + } + bitsToXOR >>= 1; + if(!bitsToXOR) { break; } + auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); + doubling = doubling ^ shifted; + // 01...1 + // 001...1 + // 00001...1 + // 000000001...1 + shiftClearingMask = + shiftClearingMask & S{shiftClearingMask.value() >> power}; + power <<= 1; + } + return S{result}; +} + +/// \todo because of the desirability of "accumuating" the XORs at the MSB, +/// the parallel suffix operation is more suitable. +template +constexpr SWAR parity(SWAR input) { + using S = SWAR; + auto preResult = parallelSuffix(input); + auto onlyMSB = preResult.value() & S::MostSignificantBit; + return S{onlyMSB}; +} + + +namespace impl { +template +constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR msb, SWAR lsb) { + auto msbCopiedDown = msb - lsb; + auto msbReintroduced = msbCopiedDown | msb; + return msbReintroduced; +} +} + +template +constexpr auto makeLaneMaskFromLSB(SWAR input) { + using S = SWAR; + auto lsb = input & S{S::LeastSignificantBit}; + auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)}; + return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb); +} + +template +constexpr auto makeLaneMaskFromMSB(SWAR input) { + using S = SWAR; + auto msb = input & S{S::MostSignificantBit}; + auto msbCopiedToLSB = S{msb.value() >> (NB - 1)}; + return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB); +} + +template +struct ArithmeticResultTriplet { + SWAR result; + BooleanSWAR carry, overflow; +}; + + +template +constexpr ArithmeticResultTriplet +fullAddition(SWAR s1, SWAR s2) { + using S = SWAR; + constexpr auto + SignBit = S{S::MostSignificantBit}, + LowerBits = SignBit - S{S::LeastSignificantBit}; + // prevent overflow by clearing the most significant bits + auto + s1prime = LowerBits & s1, + s2prime = LowerBits & s2, + resultPrime = s1prime + s2prime, + s1Sign = SignBit & s1, + s2Sign = SignBit & s2, + signPrime = SignBit & resultPrime, + result = resultPrime ^ s1Sign ^ s2Sign, + // carry is set whenever at least two of the sign bits of s1, s2, + // signPrime are set + carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime), + // overflow: the inputs have the same sign and different to result + // same sign: s1Sign ^ s2Sign + overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result); + using BS = BooleanSWAR; + return { result, BS{carry.value()}, BS{overflow.value()} }; +}; + + +template +constexpr auto negate(SWAR input) { + using S = SWAR; + constexpr auto Ones = S{S::LeastSignificantBit}; + return fullAddition(~input, Ones).result; +} + +/// \brief Performs a generalized iterated application of an associative operator to a base +/// +/// In algebra, the repeated application of an operator to a "base" has different names depending on the +/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition", +/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n +/// power". +/// The general term in algebra is "iteration", hence the naming of this function. +/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we +/// keep the option to use the imprecise language of "product, base and exponent". "Iteration" has a very +/// different meaning in programming and especially different in C++. +/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this +/// function leverages the associative property of the operator to "halve" the count of iterations at each step. +/// \note There is a symmetrical operation to be implemented of associative iteration in the +/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb, +/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)" +/// \tparam Operator a callable with three arguments: the left and right arguments to the operation +/// and the count to be used, the "count" is an artifact of this generalization +/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not +/// be a number, the iteration count is part of the execution context to apply the operator +/// \param forSquaring is an artifact of this generalization +/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows +/// there are fewer iterations than what the type of exponent would allow +template< + typename Base, typename IterationCount, typename Operator, + // the critical use of associativity is that it allows halving the + // iteration count + typename CountHalver +> +constexpr auto associativeOperatorIterated_regressive( + Base base, Base neutral, IterationCount count, IterationCount forSquaring, + Operator op, unsigned log2Count, CountHalver ch +) { + auto result = neutral; + if(!log2Count) { return result; } + for(;;) { + result = op(result, base, count); + if(!--log2Count) { break; } + result = op(result, result, forSquaring); + count = ch(count); + } + return result; +} + +template +constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( + SWAR multiplicand, SWAR multiplier +) { + using S = SWAR; + + auto operation = [](auto left, auto right, auto counts) { + auto addendums = makeLaneMaskFromMSB(counts); + return left + (addendums & right); + }; + + auto halver = [](auto counts) { + auto msbCleared = counts & ~S{S::MostSignificantBit}; + return S{msbCleared.value() << 1}; + }; + + multiplier = S{multiplier.value() << (NB - ActualBits)}; + return associativeOperatorIterated_regressive( + multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation, + ActualBits, halver + ); +} + +/// \note Not removed yet because it is an example of "progressive" associative exponentiation +template +constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated( + SWAR multiplicand, + SWAR multiplier +) { + using S = SWAR; + constexpr auto LeastBit = S::LeastSignificantBit; + auto multiplicandDoubling = multiplicand.value(); + auto mplier = multiplier.value(); + auto product = S{0}; + for(auto count = ActualBits;;) { + auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier}); + product = product + (multiplicandDoublingMask & S{multiplicandDoubling}); + if(!--count) { break; } + multiplicandDoubling <<= 1; + auto leastBitCleared = mplier & ~LeastBit; + mplier = leastBitCleared >> 1; + } + return product; +} + +template +constexpr auto multiplication_OverflowUnsafe( + SWAR multiplicand, + SWAR multiplier +) { + return + multiplication_OverflowUnsafe_SpecificBitCount( + multiplicand, multiplier + ); +} + +template +struct SWAR_Pair{ + SWAR even, odd; +}; + +template +constexpr SWAR doublingMask() { + using S = SWAR; + static_assert(0 == S::Lanes % 2, "Only even number of elements supported"); + using D = SWAR; + return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit}; +} + +template +constexpr auto doublePrecision(SWAR input) { + using S = SWAR; + static_assert( + 0 == S::NSlots % 2, + "Precision can only be doubled for SWARs of even element count" + ); + using RV = SWAR; + constexpr auto DM = doublingMask(); + return SWAR_Pair{ + RV{(input & DM).value()}, + RV{(input.value() >> NB) & DM.value()} + }; +} + +template +constexpr auto halvePrecision(SWAR even, SWAR odd) { + using S = SWAR; + static_assert(0 == NB % 2, "Only even lane-bitcounts supported"); + using RV = SWAR; + constexpr auto HalvingMask = doublingMask(); + auto + evenHalf = RV{even.value()} & HalvingMask, + oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2}; + return evenHalf | oddHalf; +} + +} + +#endif diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index ef940992..af5a1016 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -1,307 +1,18 @@ -#include "zoo/swar/SWAR.h" +#include "zoo/swar/associative_iteration.h" #include "catch2/catch.hpp" #include -namespace zoo::swar { - -/// \note This code should be substituted by an application of "progressive" algebraic iteration -/// \note There is also parallelPrefix (to be implemented) -template -constexpr SWAR parallelSuffix(SWAR input) { - using S = SWAR; - auto - shiftClearingMask = S{~S::MostSignificantBit}, - doubling = input, - result = S{0}; - auto - bitsToXOR = NB, - power = 1; - for(;;) { - if(1 & bitsToXOR) { - result = result ^ doubling; - doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask); - } - bitsToXOR >>= 1; - if(!bitsToXOR) { break; } - auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); - doubling = doubling ^ shifted; - // 01...1 - // 001...1 - // 00001...1 - // 000000001...1 - shiftClearingMask = - shiftClearingMask & S{shiftClearingMask.value() >> power}; - power <<= 1; - } - return S{result}; -} - -/// \todo because of the desirability of "accumuating" the XORs at the MSB, -/// the parallel suffix operation is more suitable. -template -constexpr SWAR parity(SWAR input) { - using S = SWAR; - auto preResult = parallelSuffix(input); - auto onlyMSB = preResult.value() & S::MostSignificantBit; - return S{onlyMSB}; -} - - -/* -Execution trace at two points: -1. when checking `if(1 & count)` -2. when checking `if(!count)` -If the variable did not change from the last value, it may be ommitted -input Count x d power mask -1 1 0 x0 1 01111... - 0 x0 -2 2 0 x0 1 01111... - 1 0 x0 1 01111... - 1 0 x01 2 00111... - 0 x01 -3 3 0 x0 1 01111... - 1 x0 x1 1 01111... - 1 x0 x12 2 00111... - 0 x012 x23 -4 4 0 x0 1 01111... - 2 0 x0 1 01111... - 2 0 x01 2 00111... - 1 0 x01 - 1 0 x0123 4 00001... - 0 x0123 x01234567 -5 5 0 x0 1 01111... - 2 x0 x1 - 2 x0 x12 2 00111... - 1 - 1 x1234 4 00001... - 0 x01234 -6 6 0 x0 1 01111... - 3 - 3 x01 2 00111... - 1 x01 x23 - 1 x2345 4 00001... - 0 x012345 x6789 -7 7 0 x0 1 01...... - 3 x0 x1 - 3 x12 2 001..... - 1 x012 x34 - 1 x3456 4 00001... - 0 x0-6 x789A -25 = 16 + 8 + 1 -25 25 0 x0 1 01111... - 12 x0 x1 - 12 x12 2 00111... - 6 - 6 x1234 4 {0}4 - 3 - 3 x1-8 8 {0}8 - 1 x0-8 x9-16 - 1 x9-24 16 {0}16 - 0 x0-24 x25-? -*/ - -template -struct ArithmeticResultTriplet { - SWAR result; - BooleanSWAR carry, overflow; -}; - -namespace impl { -template -constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR msb, SWAR lsb) { - auto msbCopiedDown = msb - lsb; - auto msbReintroduced = msbCopiedDown | msb; - return msbReintroduced; -} -} - -template -constexpr auto makeLaneMaskFromLSB(SWAR input) { - using S = SWAR; - auto lsb = input & S{S::LeastSignificantBit}; - auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)}; - return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb); -} - -template -constexpr auto makeLaneMaskFromMSB(SWAR input) { - using S = SWAR; - auto msb = input & S{S::MostSignificantBit}; - auto msbCopiedToLSB = S{msb.value() >> (NB - 1)}; - return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB); -} - -template -constexpr auto fullAddition(SWAR s1, SWAR s2) { - using S = SWAR; - constexpr auto - SignBit = S{S::MostSignificantBit}, - LowerBits = SignBit - S{S::LeastSignificantBit}; - // prevent overflow by clearing the most significant bits - auto - s1prime = LowerBits & s1, - s2prime = LowerBits & s2, - resultPrime = s1prime + s2prime, - s1Sign = SignBit & s1, - s2Sign = SignBit & s2, - signPrime = SignBit & resultPrime, - result = resultPrime ^ s1Sign ^ s2Sign, - // carry is set whenever at least two of the sign bits of s1, s2, - // signPrime are set - carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime), - // overflow: the inputs have the same sign and different to result - // same sign: s1Sign ^ s2Sign - overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result); - return ArithmeticResultTriplet(result, carry, overflow); -} - -/// \brief Performs a generalized iterated application of an associative operator to a base -/// -/// In algebra, the repeated application of an operator to a "base" has different names depending on the -/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition", -/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n power" -/// the generic term we use is "iteration" for naming this function. -/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we -/// keep the option to use the imprecise language of "product, base and exponent". "Iteration" has a very -/// different meaning in programming and especially different in C++. -/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this -/// function leverages the associative property of the operator to "halve" the count of iterations at each step. -/// \note There is a symmetrical operation to be implemented of associative iteration in the -/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb, -/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)" -/// \tparam Operator a callable with three arguments: the left and right arguments to the operation -/// and the count to be used, the "count" is an artifact of this generalization -/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not -/// be a number, the iteration count is part of the execution context to apply the operator -/// \param forSquaring is an artifact of this generalization -/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows -/// there are fewer iterations than what the type of exponent would allow -template< - typename Base, typename IterationCount, typename Operator, - // the critical use of associativity is that it allows halving the - // iteration count - typename CountHalver -> -constexpr auto associativeOperatorIterated_regressive( - Base base, Base neutral, IterationCount count, IterationCount forSquaring, - Operator op, unsigned log2Count, CountHalver ch -) { - auto result = neutral; - if(!log2Count) { return result; } - for(;;) { - result = op(result, base, count); - if(!--log2Count) { break; } - result = op(result, result, forSquaring); - count = ch(count); - } - return result; -} - -template -constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( - SWAR multiplicand, SWAR multiplier -) { - using S = SWAR; - - auto operation = [](auto left, auto right, auto counts) { - auto addendums = makeLaneMaskFromMSB(counts); - return left + (addendums & right); - }; - - auto halver = [](auto counts) { - auto msbCleared = counts & ~S{S::MostSignificantBit}; - return S{msbCleared.value() << 1}; - }; - - multiplier = S{multiplier.value() << (NB - ActualBits)}; - return associativeOperatorIterated_regressive( - multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation, - ActualBits, halver - ); -} - -/// \note Not removed yet because it is an example of "progressive" associative exponentiation -template -constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated( - SWAR multiplicand, - SWAR multiplier -) { - using S = SWAR; - constexpr auto LeastBit = S::LeastSignificantBit; - auto multiplicandDoubling = multiplicand.value(); - auto mplier = multiplier.value(); - auto product = S{0}; - for(auto count = ActualBits;;) { - auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier}); - product = product + (multiplicandDoublingMask & S{multiplicandDoubling}); - if(!--count) { break; } - multiplicandDoubling <<= 1; - auto leastBitCleared = mplier & ~LeastBit; - mplier = leastBitCleared >> 1; - } - return product; -} - -template -constexpr auto multiplication_OverflowUnsafe( - SWAR multiplicand, - SWAR multiplier -) { - return - multiplication_OverflowUnsafe_SpecificBitCount( - multiplicand, multiplier - ); -} - -template -struct SWAR_Pair{ - SWAR even, odd; -}; - -template -constexpr SWAR doublingMask() { - using S = SWAR; - static_assert(0 == S::Lanes % 2, "Only even number of elements supported"); - using D = SWAR; - return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit}; -} - -template -constexpr auto doublePrecision(SWAR input) { - using S = SWAR; - static_assert( - 0 == S::NSlots % 2, - "Precision can only be doubled for SWARs of even element count" - ); - using RV = SWAR; - constexpr auto DM = doublingMask(); - return SWAR_Pair{ - RV{(input & DM).value()}, - RV{(input.value() >> NB) & DM.value()} - }; -} - -template -constexpr auto halvePrecision(SWAR even, SWAR odd) { - using S = SWAR; - static_assert(0 == NB % 2, "Only even lane-bitcounts supported"); - using RV = SWAR; - constexpr auto HalvingMask = doublingMask(); - auto - evenHalf = RV{even.value()} & HalvingMask, - oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2}; - return evenHalf | oddHalf; -} - -} using namespace zoo; using namespace zoo::swar; namespace Multiplication { +using S4_64 = SWAR<4, uint64_t>; + +static_assert(~int64_t(0) == negate(S4_64{S4_64::LeastSignificantBit}).value()); static_assert(0x0F0F0F0F == doublingMask<4, uint32_t>().value()); constexpr auto PrecisionFixtureTest = 0x89ABCDEF;