From 03ac5a1c5d5802c24b02d23cf090a569d320b0e2 Mon Sep 17 00:00:00 2001 From: thecppzoo Date: Tue, 16 Jan 2024 19:20:42 -0800 Subject: [PATCH 01/10] Better comments --- inc/zoo/map/RobinHood.h | 107 ++++++++++++++++++++------- inc/zoo/swar/associative_iteration.h | 39 +++++++++- 2 files changed, 120 insertions(+), 26 deletions(-) diff --git a/inc/zoo/map/RobinHood.h b/inc/zoo/map/RobinHood.h index a58235a1..6217ee1b 100644 --- a/inc/zoo/map/RobinHood.h +++ b/inc/zoo/map/RobinHood.h @@ -17,6 +17,43 @@ #include #endif +/*! \file RobinHood.h +\brief User entry point to the implementation of hash tables using the "Robin +Hood" invariant. + +The "Robin Hood" monicker means that each key has a preferred or "home" slot +in the hash table. If, upon insertion, the key can not be inserted into its +home slot, then the insertion would look to insert it as close as possible to +the home slot. + +In this code base, the acronym PSL is used frequently, it means "Probe Sequence +Length", this is the distance from the preferred or "home" slot and the current +search position. +For a practical reason, a key inserted into its home has a PSL of 1, in this +way, the metadata indicates with a PSL of 0 that no key is in the slot, +or that the slot is free. + +The invariant is that a key won't be inserted further away from its home than +the key in the current slot. That is, a key is "richer" than another if it is +closer to its "home", the insertion mechanism would "evict" a key that would be +richer than the key being inserted. In this regard, the "Robin Hood" metaphor +is realized: the insertion "steals" from the rich to give it to the poor. + +\note All of this codebase makes the unchecked assumption that the byte ordering +is LITTLE ENDIAN + +\todo complement with the other theoretical and practical comments relevant, +including: +1. How the table is not stable with regards to insertions and deletions, +2. How an insertion can cascade into very long chains of evictions/reinsertions +3. 
The theoretical guarantee that the longest PSL is in the order of Log(N) +4. How it seems that in practice the theoretical guarantee is not achieved. +... + +\todo determine a moment to endure the version control pain of making the +indentation consistent. +*/ + namespace zoo { namespace rh { @@ -31,6 +68,7 @@ struct RelocationStackExhausted: RobinHoodException { using RobinHoodException::RobinHoodException; }; +/// \brief The canonical backend (implementation) template struct RH_Backend { using Metadata = impl::Metadata; @@ -38,31 +76,8 @@ struct RH_Backend { constexpr static inline auto Width = Metadata::NBits; Metadata *md_; - /*! \brief SWAR check for a potential match - The invariant in Robin Hood is that the element being looked for, the "needle", is "richer" - than the elements already present, the "haystack". - "Richer" means that the PSL is smaller. - A PSL of 0 can only happen in the haystack, to indicate the slot is empty, this is "richest". - The first time the needle has a PSL greater than the haystacks' means the matching will fail, - because the hypothetical prior insertion would have "stolen" that slot. - If there is an equal, it would start a sequence of potential matches. To determine an actual match: - 1. A cheap SWAR check of hoisted hashes - 2. If there are still potential matches (now also the hoisted hashes), fall back to non-SWAR, - or iterative and expensive "deep equality" test for each potential match, outside of this function - - The above makes it very important to detect the first case in which the PSL is greater equal to the needle. - We call this the "deadline". - Because we assume the LITTLE ENDIAN byte ordering, the first element would be the least significant - non-false Boolean SWAR. - - Note about performance: - Every "early exit" faces a big justification hurdle, the proportion of cases - they intercept to be large enough that the branch prediction penalty of the entropy introduced is - overcompensated. 
- */ - - /// Boolean SWAR true in the first element/lane of the needle strictly poorer than its corresponding - /// haystack + /// Boolean SWAR true in the first element/lane of the needle strictly + /// poorer than its corresponding haystack constexpr static auto firstInvariantBreakage(Metadata needle, Metadata haystack) { auto nPSL = needle.PSLs(); @@ -97,6 +112,36 @@ struct RH_Backend { return std::tuple{Metadata{nPSL.PSLs() | needlePSLsToSaturate}, bool(saturation)}; // saturated at any point, last swar to check. } + + /*! \brief SWAR check for a potential match + + The invariant in Robin Hood is that the element being looked for, the + "needle", is at least as "rich" as the elements already present (the + "haystack"). + "Richer" means that the PSL is smaller. + A PSL of 0 can only happen in the haystack, to indicate the slot is empty, + this is "richest". + The first time the needle has a PSL greater than the haystacks' means the + matching will fail, because the hypothetical prior insertion would have + "stolen" that slot. + If there is an equal, it would start a sequence of potential matches. To + determine an actual match: + 1. A cheap SWAR check of hoisted hashes + 2. If there are still potential matches (now also the hoisted hashes), fall + back to non-SWAR, or iterative and expensive "deep equality" test for each + potential match, outside of this function. + + The above makes it very important to detect the first case in which the PSL + is greater equal to the needle. We call this the "deadline". + Because we assume the LITTLE ENDIAN byte ordering, the first element would + be the least significant non-false Boolean SWAR. + + Note about performance: + Every "early exit" faces a big justification hurdle, the proportion of cases + they intercept to be large enough that the branch prediction penalty of the + entropy introduced is overcompensated. 
+ */ + constexpr static impl::MatchResult potentialMatches( Metadata needle, Metadata haystack @@ -212,6 +257,9 @@ RH_Backend::findMisaligned_assumesSkarupkeTail( } } +/// \brief The slots in the table may have a key-value pair or not, this +/// optionality is not suitably captured by any standard library component, +/// hence we need to implement our own. template struct KeyValuePairWrapper { using type = std::pair; @@ -243,6 +291,15 @@ struct KeyValuePairWrapper { const auto &value() const noexcept { return const_cast(this)->value(); } }; +/// \brief Frontend with the "Skarupke Tail" +/// +/// Normally we need to explicitly check for whether key searches have reached +/// the end of the table. Malte Skarupke devised a tail of table entries to +/// make this explicit check unnecessary: Regardless of the end of the table, +/// a search must terminate in failure if the maximum PSL is reached, then, +/// by just adding the maximum PSL entries to the table, while keeping the +/// slot indexing function the same, searches at the end of the table will never +/// attempt to go past the real end, but return not-found within the tail. template< typename K, typename MV, diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index b2201823..bfdc2912 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -80,6 +80,39 @@ struct ArithmeticResultTriplet { }; +/// \brief "Safe" addition, meaning non-corrupting unsigned overflow addition +/// and producing the flags for unsigned overflow (carry) and signed overflow. +/// This is the function to perform signed addition (that relies on supporting +/// unsigned overflow) +/// +/// This function is called "full addition" because it can perform the addition +/// with all the bits of the inputs by making sure the overflowing (in the +/// unsigned sense) does not cross the lane boundary. 
+/// This function has less performance than "optimistic" addition (operator+). +/// The mechanism to manage potential overflow naturally allows the calculation +/// of the carry and signed overflow flags for no extra performance cost. +/// +/// The performance relies on the optimizer removing the calculation of +/// the carry or signed overflow if they are not used. +/// +/// When interpreted as unsigned addition, carrying out of the result is +/// overflow. +/// +/// The carry bit is essential to increase the precision of the results in +/// normal arithmetic, but in unsigned SWAR it is preferable to double the +/// precision before executing addition, thus guaranteeing no overflow will +/// occur and using the more performing operator+ addition. Hence, +/// the carry flag is mostly useful in SWAR for detection of unsigned overflow. +/// +/// The signed integer interpretation is the technique of two's complement, that +/// routinely overflows (as interpreted as unsigned). Signed overflow may only +/// occur if the inputs have the same sign, it is detected when the sign of the +/// result is opposite that of the inputs. +/// +/// \todo The library is not explicit with regards to the fact that +/// operator+ is only useful with the unsigned interpretation. 
A decision +/// must be made to either keep the library as is, or to promote full addition +/// to operator+, and the rationale for the decision template constexpr ArithmeticResultTriplet fullAddition(SWAR s1, SWAR s2) { @@ -106,7 +139,11 @@ fullAddition(SWAR s1, SWAR s2) { return { result, BS{carry.value()}, BS{overflow.value()} }; }; - +/// \brief Negation is useful only for the signed integer interpretation +/// @tparam B +/// @tparam NB +/// @param input +/// @return template constexpr auto negate(SWAR input) { using S = SWAR; From 4475b13e065270c38f0bd85320fd457477b996b6 Mon Sep 17 00:00:00 2001 From: Eddie Date: Tue, 16 Jan 2024 19:31:43 -0800 Subject: [PATCH 02/10] More ocmments, dos2unix --- inc/zoo/swar/associative_iteration.h | 589 +++++++++++++-------------- 1 file changed, 294 insertions(+), 295 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index bfdc2912..8da3c838 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -1,295 +1,294 @@ -#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H -#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H - -#include "zoo/swar/SWAR.h" - -namespace zoo::swar { - -/// \note This code should be substituted by an application of "progressive" algebraic iteration -/// \note There is also parallelPrefix (to be implemented) -template -constexpr SWAR parallelSuffix(SWAR input) { - using S = SWAR; - auto - shiftClearingMask = S{~S::MostSignificantBit}, - doubling = input, - result = S{0}; - auto - bitsToXOR = NB, - power = 1; - for(;;) { - if(1 & bitsToXOR) { - result = result ^ doubling; - doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask); - } - bitsToXOR >>= 1; - if(!bitsToXOR) { break; } - auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); - doubling = doubling ^ shifted; - // 01...1 - // 001...1 - // 00001...1 - // 000000001...1 - shiftClearingMask = - shiftClearingMask & S{shiftClearingMask.value() >> power}; - power 
<<= 1; - } - return S{result}; -} - -/// \todo because of the desirability of "accumuating" the XORs at the MSB, -/// the parallel suffix operation is more suitable. -template -constexpr SWAR parity(SWAR input) { - using S = SWAR; - auto preResult = parallelSuffix(input); - auto onlyMSB = preResult.value() & S::MostSignificantBit; - return S{onlyMSB}; -} - - -namespace impl { -template -constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR msb, SWAR lsb) { - auto msbCopiedDown = msb - lsb; - auto msbReintroduced = msbCopiedDown | msb; - return msbReintroduced; -} -} - -template -constexpr auto makeLaneMaskFromLSB(SWAR input) { - using S = SWAR; - auto lsb = input & S{S::LeastSignificantBit}; - auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)}; - return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb); -} - -template -constexpr auto makeLaneMaskFromMSB(SWAR input) { - using S = SWAR; - auto msb = input & S{S::MostSignificantBit}; - auto msbCopiedToLSB = S{msb.value() >> (NB - 1)}; - return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB); -} - -template -struct ArithmeticResultTriplet { - SWAR result; - BooleanSWAR carry, overflow; -}; - - -/// \brief "Safe" addition, meaning non-corrupting unsigned overflow addition -/// and producing the flags for unsigned overflow (carry) and signed overflow. -/// This is the function to perform signed addition (that relies on supporting -/// unsigned overflow) -/// -/// This function is called "full addition" because it can perform the addition -/// with all the bits of the inputs by making sure the overflowing (in the -/// unsigned sense) does not cross the lane boundary. -/// This function has less performance than "optimistic" addition (operator+). -/// The mechanism to manage potential overflow naturally allows the calculation -/// of the carry and signed overflow flags for no extra performance cost. 
-/// -/// The performance relies on the optimizer removing the calculation of -/// the carry or signed overflow if they are not used. -/// -/// When interpreted as unsigned addition, carrying out of the result is -/// overflow. -/// -/// The carry bit is essential to increase the precision of the results in -/// normal arithmetic, but in unsigned SWAR it is preferable to double the -/// precision before executing addition, thus guaranteeing no overflow will -/// occur and using the more performing operator+ addition. Hence, -/// the carry flag is mostly useful in SWAR for detection of unsigned overflow. -/// -/// The signed integer interpretation is the technique of two's complement, that -/// routinely overflows (as interpreted as unsigned). Signed overflow may only -/// occur if the inputs have the same sign, it is detected when the sign of the -/// result is opposite that of the inputs. -/// -/// \todo The library is not explicit with regards to the fact that -/// operator+ is only useful with the unsigned interpretation. 
A decision -/// must be made to either keep the library as is, or to promote full addition -/// to operator+, and the rationale for the decision -template -constexpr ArithmeticResultTriplet -fullAddition(SWAR s1, SWAR s2) { - using S = SWAR; - constexpr auto - SignBit = S{S::MostSignificantBit}, - LowerBits = SignBit - S{S::LeastSignificantBit}; - // prevent overflow by clearing the most significant bits - auto - s1prime = LowerBits & s1, - s2prime = LowerBits & s2, - resultPrime = s1prime + s2prime, - s1Sign = SignBit & s1, - s2Sign = SignBit & s2, - signPrime = SignBit & resultPrime, - result = resultPrime ^ s1Sign ^ s2Sign, - // carry is set whenever at least two of the sign bits of s1, s2, - // signPrime are set - carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime), - // overflow: the inputs have the same sign and different to result - // same sign: s1Sign ^ s2Sign - overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result); - using BS = BooleanSWAR; - return { result, BS{carry.value()}, BS{overflow.value()} }; -}; - -/// \brief Negation is useful only for the signed integer interpretation -/// @tparam B -/// @tparam NB -/// @param input -/// @return -template -constexpr auto negate(SWAR input) { - using S = SWAR; - constexpr auto Ones = S{S::LeastSignificantBit}; - return fullAddition(~input, Ones).result; -} - -/// \brief Performs a generalized iterated application of an associative operator to a base -/// -/// In algebra, the repeated application of an operator to a "base" has different names depending on the -/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition", -/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n -/// power". -/// The general term in algebra is "iteration", hence the naming of this function. 
-/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we -/// keep the option to use the imprecise language of "product, base and exponent". "Iteration" has a very -/// different meaning in programming and especially different in C++. -/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this -/// function leverages the associative property of the operator to "halve" the count of iterations at each step. -/// \note There is a symmetrical operation to be implemented of associative iteration in the -/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb, -/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)" -/// \tparam Operator a callable with three arguments: the left and right arguments to the operation -/// and the count to be used, the "count" is an artifact of this generalization -/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not -/// be a number, the iteration count is part of the execution context to apply the operator -/// \param forSquaring is an artifact of this generalization -/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows -/// there are fewer iterations than what the type of exponent would allow -template< - typename Base, typename IterationCount, typename Operator, - // the critical use of associativity is that it allows halving the - // iteration count - typename CountHalver -> -constexpr auto associativeOperatorIterated_regressive( - Base base, Base neutral, IterationCount count, IterationCount forSquaring, - Operator op, unsigned log2Count, CountHalver ch -) { - auto result = neutral; - if(!log2Count) { return result; } - for(;;) { - result = op(result, base, count); - if(!--log2Count) { break; } - result = op(result, result, forSquaring); - count = 
ch(count); - } - return result; -} - -template -constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( - SWAR multiplicand, SWAR multiplier -) { - using S = SWAR; - - auto operation = [](auto left, auto right, auto counts) { - auto addendums = makeLaneMaskFromMSB(counts); - return left + (addendums & right); - }; - - auto halver = [](auto counts) { - auto msbCleared = counts & ~S{S::MostSignificantBit}; - return S{msbCleared.value() << 1}; - }; - - multiplier = S{multiplier.value() << (NB - ActualBits)}; - return associativeOperatorIterated_regressive( - multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation, - ActualBits, halver - ); -} - -/// \note Not removed yet because it is an example of "progressive" associative exponentiation -template -constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated( - SWAR multiplicand, - SWAR multiplier -) { - using S = SWAR; - constexpr auto LeastBit = S::LeastSignificantBit; - auto multiplicandDoubling = multiplicand.value(); - auto mplier = multiplier.value(); - auto product = S{0}; - for(auto count = ActualBits;;) { - auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier}); - product = product + (multiplicandDoublingMask & S{multiplicandDoubling}); - if(!--count) { break; } - multiplicandDoubling <<= 1; - auto leastBitCleared = mplier & ~LeastBit; - mplier = leastBitCleared >> 1; - } - return product; -} - -template -constexpr auto multiplication_OverflowUnsafe( - SWAR multiplicand, - SWAR multiplier -) { - return - multiplication_OverflowUnsafe_SpecificBitCount( - multiplicand, multiplier - ); -} - -template -struct SWAR_Pair{ - SWAR even, odd; -}; - -template -constexpr SWAR doublingMask() { - using S = SWAR; - static_assert(0 == S::Lanes % 2, "Only even number of elements supported"); - using D = SWAR; - return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit}; -} - -template -constexpr auto doublePrecision(SWAR input) { - using S = SWAR; - static_assert( - 0 == 
S::NSlots % 2, - "Precision can only be doubled for SWARs of even element count" - ); - using RV = SWAR; - constexpr auto DM = doublingMask(); - return SWAR_Pair{ - RV{(input & DM).value()}, - RV{(input.value() >> NB) & DM.value()} - }; -} - -template -constexpr auto halvePrecision(SWAR even, SWAR odd) { - using S = SWAR; - static_assert(0 == NB % 2, "Only even lane-bitcounts supported"); - using RV = SWAR; - constexpr auto HalvingMask = doublingMask(); - auto - evenHalf = RV{even.value()} & HalvingMask, - oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2}; - return evenHalf | oddHalf; -} - -} - -#endif +#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H +#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H + +#include "zoo/swar/SWAR.h" + +namespace zoo::swar { + +/// \note This code should be substituted by an application of "progressive" algebraic iteration +/// \note There is also parallelPrefix (to be implemented) +template +constexpr SWAR parallelSuffix(SWAR input) { + using S = SWAR; + auto + shiftClearingMask = S{~S::MostSignificantBit}, + doubling = input, + result = S{0}; + auto + bitsToXOR = NB, + power = 1; + for(;;) { + if(1 & bitsToXOR) { + result = result ^ doubling; + doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask); + } + bitsToXOR >>= 1; + if(!bitsToXOR) { break; } + auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); + doubling = doubling ^ shifted; + // 01...1 + // 001...1 + // 00001...1 + // 000000001...1 + shiftClearingMask = + shiftClearingMask & S{shiftClearingMask.value() >> power}; + power <<= 1; + } + return S{result}; +} + +/// \todo because of the desirability of "accumuating" the XORs at the MSB, +/// the parallel suffix operation is more suitable. 
+template +constexpr SWAR parity(SWAR input) { + using S = SWAR; + auto preResult = parallelSuffix(input); + auto onlyMSB = preResult.value() & S::MostSignificantBit; + return S{onlyMSB}; +} + + +namespace impl { +template +constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR msb, SWAR lsb) { + auto msbCopiedDown = msb - lsb; + auto msbReintroduced = msbCopiedDown | msb; + return msbReintroduced; +} +} + +template +constexpr auto makeLaneMaskFromLSB(SWAR input) { + using S = SWAR; + auto lsb = input & S{S::LeastSignificantBit}; + auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)}; + return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb); +} + +template +constexpr auto makeLaneMaskFromMSB(SWAR input) { + using S = SWAR; + auto msb = input & S{S::MostSignificantBit}; + auto msbCopiedToLSB = S{msb.value() >> (NB - 1)}; + return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB); +} + +template +struct ArithmeticResultTriplet { + SWAR result; + BooleanSWAR carry, overflow; +}; + + +/// \brief "Safe" addition, meaning non-corrupting unsigned overflow addition +/// and producing the flags for unsigned overflow (carry) and signed overflow. +/// This is the function to perform signed addition (that relies on supporting +/// unsigned overflow) +/// +/// This function is called "full addition" because it can perform the addition +/// with all the bits of the inputs by making sure the overflowing (in the +/// unsigned sense) does not cross the lane boundary. +/// This function has less performance than "optimistic" addition (operator+). +/// The mechanism to manage potential overflow naturally allows the calculation +/// of the carry and signed overflow flags for no extra performance cost. +/// +/// The performance relies on the optimizer removing the calculation of +/// the carry or signed overflow if they are not used. +/// +/// When interpreted as unsigned addition, carrying out of the result is +/// overflow. 
+/// +/// The carry bit is essential to increase the precision of the results in +/// normal arithmetic, but in unsigned SWAR it is preferable to double the +/// precision before executing addition, thus guaranteeing no overflow will +/// occur and using the more performing operator+ addition. Hence, +/// the carry flag is mostly useful in SWAR for detection of unsigned overflow. +/// +/// The signed integer interpretation is the technique of two's complement, that +/// routinely overflows (as interpreted as unsigned). Signed overflow may only +/// occur if the inputs have the same sign, it is detected when the sign of the +/// result is opposite that of the inputs. +/// +/// \todo The library is not explicit with regards to the fact that +/// operator+ is only useful with the unsigned interpretation. A decision +/// must be made to either keep the library as is, or to promote full addition +/// to operator+, and the rationale for the decision +/// +/// \todo What is the right place for this function? 
+/// It was added here because in practice multiplication overflows, as a draft +template +constexpr ArithmeticResultTriplet +fullAddition(SWAR s1, SWAR s2) { + using S = SWAR; + constexpr auto + SignBit = S{S::MostSignificantBit}, + LowerBits = SignBit - S{S::LeastSignificantBit}; + // prevent overflow by clearing the most significant bits + auto + s1prime = LowerBits & s1, + s2prime = LowerBits & s2, + resultPrime = s1prime + s2prime, + s1Sign = SignBit & s1, + s2Sign = SignBit & s2, + signPrime = SignBit & resultPrime, + result = resultPrime ^ s1Sign ^ s2Sign, + // carry is set whenever at least two of the sign bits of s1, s2, + // signPrime are set + carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime), + // overflow: the inputs have the same sign and different to result + // same sign: s1Sign ^ s2Sign + overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result); + using BS = BooleanSWAR; + return { result, BS{carry.value()}, BS{overflow.value()} }; +}; + +/// \brief Negation is useful only for the signed integer interpretation +template +constexpr auto negate(SWAR input) { + using S = SWAR; + constexpr auto Ones = S{S::LeastSignificantBit}; + return fullAddition(~input, Ones).result; +} + +/// \brief Performs a generalized iterated application of an associative operator to a base +/// +/// In algebra, the repeated application of an operator to a "base" has different names depending on the +/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition", +/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n +/// power". +/// The general term in algebra is "iteration", hence the naming of this function. +/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we +/// keep the option to use the imprecise language of "product, base and exponent". 
"Iteration" has a very +/// different meaning in programming and especially different in C++. +/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this +/// function leverages the associative property of the operator to "halve" the count of iterations at each step. +/// \note There is a symmetrical operation to be implemented of associative iteration in the +/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb, +/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)" +/// \tparam Operator a callable with three arguments: the left and right arguments to the operation +/// and the count to be used, the "count" is an artifact of this generalization +/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not +/// be a number, the iteration count is part of the execution context to apply the operator +/// \param forSquaring is an artifact of this generalization +/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows +/// there are fewer iterations than what the type of exponent would allow +template< + typename Base, typename IterationCount, typename Operator, + // the critical use of associativity is that it allows halving the + // iteration count + typename CountHalver +> +constexpr auto associativeOperatorIterated_regressive( + Base base, Base neutral, IterationCount count, IterationCount forSquaring, + Operator op, unsigned log2Count, CountHalver ch +) { + auto result = neutral; + if(!log2Count) { return result; } + for(;;) { + result = op(result, base, count); + if(!--log2Count) { break; } + result = op(result, result, forSquaring); + count = ch(count); + } + return result; +} + +template +constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( + SWAR multiplicand, SWAR multiplier +) { + using S = SWAR; + + auto operation = 
[](auto left, auto right, auto counts) { + auto addendums = makeLaneMaskFromMSB(counts); + return left + (addendums & right); + }; + + auto halver = [](auto counts) { + auto msbCleared = counts & ~S{S::MostSignificantBit}; + return S{msbCleared.value() << 1}; + }; + + multiplier = S{multiplier.value() << (NB - ActualBits)}; + return associativeOperatorIterated_regressive( + multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation, + ActualBits, halver + ); +} + +/// \note Not removed yet because it is an example of "progressive" associative exponentiation +template +constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated( + SWAR multiplicand, + SWAR multiplier +) { + using S = SWAR; + constexpr auto LeastBit = S::LeastSignificantBit; + auto multiplicandDoubling = multiplicand.value(); + auto mplier = multiplier.value(); + auto product = S{0}; + for(auto count = ActualBits;;) { + auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier}); + product = product + (multiplicandDoublingMask & S{multiplicandDoubling}); + if(!--count) { break; } + multiplicandDoubling <<= 1; + auto leastBitCleared = mplier & ~LeastBit; + mplier = leastBitCleared >> 1; + } + return product; +} + +template +constexpr auto multiplication_OverflowUnsafe( + SWAR multiplicand, + SWAR multiplier +) { + return + multiplication_OverflowUnsafe_SpecificBitCount( + multiplicand, multiplier + ); +} + +template +struct SWAR_Pair{ + SWAR even, odd; +}; + +template +constexpr SWAR doublingMask() { + using S = SWAR; + static_assert(0 == S::Lanes % 2, "Only even number of elements supported"); + using D = SWAR; + return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit}; +} + +template +constexpr auto doublePrecision(SWAR input) { + using S = SWAR; + static_assert( + 0 == S::NSlots % 2, + "Precision can only be doubled for SWARs of even element count" + ); + using RV = SWAR; + constexpr auto DM = doublingMask(); + return SWAR_Pair{ + RV{(input & DM).value()}, + 
RV{(input.value() >> NB) & DM.value()} + }; +} + +template +constexpr auto halvePrecision(SWAR even, SWAR odd) { + using S = SWAR; + static_assert(0 == NB % 2, "Only even lane-bitcounts supported"); + using RV = SWAR; + constexpr auto HalvingMask = doublingMask(); + auto + evenHalf = RV{even.value()} & HalvingMask, + oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2}; + return evenHalf | oddHalf; +} + +} + +#endif From b1b8b5e0506900bbb815f8e53708c225a0f36fc5 Mon Sep 17 00:00:00 2001 From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com> Date: Wed, 17 Jan 2024 00:09:41 -0400 Subject: [PATCH 03/10] Update inc/zoo/swar/associative_iteration.h grammar nit --- inc/zoo/swar/associative_iteration.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 8da3c838..f047b9bb 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -86,7 +86,7 @@ struct ArithmeticResultTriplet { /// unsigned overflow) /// /// This function is called "full addition" because it can perform the addition -/// with all the bits of the inputs by making sure the overflowing (in the +/// with all the bits of the inputs by making sure the overflow (in the /// unsigned sense) does not cross the lane boundary. /// This function has less performance than "optimistic" addition (operator+). 
/// The mechanism to manage potential overflow naturally allows the calculation From 2d0b540b40fcc9f342611f30f29be733fbce8114 Mon Sep 17 00:00:00 2001 From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com> Date: Wed, 17 Jan 2024 00:14:57 -0400 Subject: [PATCH 04/10] Update inc/zoo/swar/associative_iteration.h --- inc/zoo/swar/associative_iteration.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index f047b9bb..35bba6bc 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -101,7 +101,7 @@ struct ArithmeticResultTriplet { /// The carry bit is essential to increase the precision of the results in /// normal arithmetic, but in unsigned SWAR it is preferable to double the /// precision before executing addition, thus guaranteeing no overflow will -/// occur and using the more performing operator+ addition. Hence, +/// occur and using the more performant operator+ addition. Hence, /// the carry flag is mostly useful in SWAR for detection of unsigned overflow. 
/// /// The signed integer interpretation is the technique of two's complement, that From e3f652b0814130cb37390f9354f26147db0a6e2a Mon Sep 17 00:00:00 2001 From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com> Date: Wed, 17 Jan 2024 00:19:12 -0400 Subject: [PATCH 05/10] Update inc/zoo/swar/associative_iteration.h --- inc/zoo/swar/associative_iteration.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 35bba6bc..01aea576 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -102,7 +102,7 @@ struct ArithmeticResultTriplet { /// normal arithmetic, but in unsigned SWAR it is preferable to double the /// precision before executing addition, thus guaranteeing no overflow will /// occur and using the more performant operator+ addition. Hence, -/// the carry flag is mostly useful in SWAR for detection of unsigned overflow. +/// the carry and overflow flags are mostly useful in SWAR for detection of unsigned overflow (as for unsigned addition they are semantically identical). /// /// The signed integer interpretation is the technique of two's complement, that /// routinely overflows (as interpreted as unsigned). Signed overflow may only From 8f59029e77b17d83aa592d05bf34d081b926e82f Mon Sep 17 00:00:00 2001 From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com> Date: Wed, 17 Jan 2024 00:23:47 -0400 Subject: [PATCH 06/10] Update inc/zoo/swar/associative_iteration.h --- inc/zoo/swar/associative_iteration.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 01aea576..97dd4089 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -104,7 +104,7 @@ struct ArithmeticResultTriplet { /// occur and using the more performant operator+ addition.
Hence, /// the carry and overflow flags are mostly useful in SWAR for detection of unsigned overflow (as for unsigned addition they are semantically identical). /// -/// The signed integer interpretation is the technique of two's complement, that +/// The signed integer interpretation is two's complement, which /// routinely overflows (as interpreted as unsigned). Signed overflow may only /// occur if the inputs have the same sign, it is detected when the sign of the /// result is opposite that of the inputs. From 933877e6ff21df3ac3bb4af5dbf95c8101deb3e9 Mon Sep 17 00:00:00 2001 From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com> Date: Wed, 17 Jan 2024 00:25:44 -0400 Subject: [PATCH 07/10] Update inc/zoo/swar/associative_iteration.h --- inc/zoo/swar/associative_iteration.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 97dd4089..1ea526cc 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -105,7 +105,7 @@ struct ArithmeticResultTriplet { /// the carry and overflow flags are mostly useful in SWAR for detection of unsigned overflow (as for unsigned addition they are semantically identical). /// /// The signed integer interpretation is two's complement, which -/// routinely overflows (as interpreted as unsigned). Signed overflow may only +/// routinely overflows (when interpreted as unsigned). Signed overflow may only /// occur if the inputs have the same sign, it is detected when the sign of the /// result is opposite that of the inputs.
/// From 69ca47ca0b64b29470d87903f52fe9312a4907f6 Mon Sep 17 00:00:00 2001 From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com> Date: Wed, 17 Jan 2024 00:29:04 -0400 Subject: [PATCH 08/10] Update inc/zoo/map/RobinHood.h --- inc/zoo/map/RobinHood.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inc/zoo/map/RobinHood.h b/inc/zoo/map/RobinHood.h index 6217ee1b..8beeb1d8 100644 --- a/inc/zoo/map/RobinHood.h +++ b/inc/zoo/map/RobinHood.h @@ -299,7 +299,7 @@ struct KeyValuePairWrapper { /// a search must terminate in failure if the maximum PSL is reached, then, /// by just adding the maximum PSL entries to the table, while keeping the /// slot indexing function the same, searches at the end of the table will never -/// atempt to go past the real end, but return not-found within the tail. +/// attempt to go past the real end, but return not-found within the tail. template< typename K, typename MV, From 2b7d06ac14632cb49740597bd7cebfa062dd4796 Mon Sep 17 00:00:00 2001 From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com> Date: Wed, 17 Jan 2024 00:41:19 -0400 Subject: [PATCH 09/10] Apply suggestions from code review --- inc/zoo/map/RobinHood.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inc/zoo/map/RobinHood.h b/inc/zoo/map/RobinHood.h index 8beeb1d8..c156bb23 100644 --- a/inc/zoo/map/RobinHood.h +++ b/inc/zoo/map/RobinHood.h @@ -124,8 +124,8 @@ struct RH_Backend { The first time the needle has a PSL greater than the haystacks' means the matching will fail, because the hypothetical prior insertion would have "stolen" that slot. - If there is an equal, it would start a sequence of potential matches. To - determine an actual match: + If the PSLs are equal, it starts a sequence of potential matches. To + determine if there is an actual match, perform: 1. A cheap SWAR check of hoisted hashes 2. 
If there are still potential matches (now also the hoisted hashes), fall back to non-SWAR, or iterative and expensive "deep equality" test for each @@ -133,13 +133,13 @@ struct RH_Backend { The above makes it very important to detect the first case in which the PSL is greater equal to the needle. We call this the "deadline". - Because we assume the LITTLE ENDIAN byte ordering, the first element would + We assume the LITTLE ENDIAN byte ordering: the first element will be the least significant non-false Boolean SWAR. Note about performance: Every "early exit" faces a big justification hurdle, the proportion of cases - they intercept to be large enough that the branch prediction penalty of the - entropy introduced is overcompensated. + they intercept must be large enough that the branch prediction penalty of the + entropy introduced (by the early exit) is overcompensated. */ constexpr static impl::MatchResult From 836792229a232d9c79a18a97cda1d79ee9c18bed Mon Sep 17 00:00:00 2001 From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com> Date: Wed, 17 Jan 2024 00:43:08 -0400 Subject: [PATCH 10/10] Apply suggestions from code review --- inc/zoo/map/RobinHood.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inc/zoo/map/RobinHood.h b/inc/zoo/map/RobinHood.h index c156bb23..6270ec74 100644 --- a/inc/zoo/map/RobinHood.h +++ b/inc/zoo/map/RobinHood.h @@ -295,9 +295,9 @@ struct KeyValuePairWrapper { /// /// Normally we need to explicitly check for whether key searches have reached /// the end of the table. 
Malte Skarupke devised a tail of table entries to -/// make this explicit check innecessary: Regardless of the end of the table, +/// make this explicit check unnecessary: Regardless of the end of the table, /// a search must terminate in failure if the maximum PSL is reached, then, -/// by just adding the maximum PSL entries to the table, while keeping the +/// by just adding maximum-PSL-many extra entries to the table, while keeping the /// slot indexing function the same, searches at the end of the table will never /// attempt to go past the real end, but return not-found within the tail. template<