From 03ac5a1c5d5802c24b02d23cf090a569d320b0e2 Mon Sep 17 00:00:00 2001
From: thecppzoo <thecppzoo@gmail.com>
Date: Tue, 16 Jan 2024 19:20:42 -0800
Subject: [PATCH 01/10] Better comments

---
 inc/zoo/map/RobinHood.h              | 107 ++++++++++++++++++++-------
 inc/zoo/swar/associative_iteration.h |  39 +++++++++-
 2 files changed, 120 insertions(+), 26 deletions(-)
diff --git a/inc/zoo/map/RobinHood.h b/inc/zoo/map/RobinHood.h
index a58235a1..6217ee1b 100644
--- a/inc/zoo/map/RobinHood.h
+++ b/inc/zoo/map/RobinHood.h
@@ -17,6 +17,43 @@
     #include <assert>
 #endif
 
+/*! \file RobinHood.h
+\brief User entry point to the implementation of hash tables using the "Robin
+Hood" invariant.
+
+The "Robin Hood" monicker means that each key has a preferred or "home" slot
+in the hash table.  If, upon insertion, the key can not be inserted into its
+home slot, then the insertion would look to insert it as close as possible to
+the home slot.
+
+In this code base, the acronym PSL is used frequently; it means "Probe Sequence
+Length": the distance between the preferred or "home" slot and the current
+search position.
+For a practical reason, a key inserted into its home has a PSL of 1, in this
+way, the metadata indicates with a PSL of 0 that no key is in the slot,
+or that the slot is free.
+
+The invariant is that a key won't be inserted further away from its home than
+the key in the current slot.  That is, a key is "richer" than another if it is
+closer to its "home", the insertion mechanism would "evict" a key that would be
+richer than the key being inserted.  In this regard, the "Robin Hood" metaphor
+is realized: the insertion "steals" from the rich to give it to the poor.
+
+\note All of this codebase makes the unchecked assumption that the byte ordering
+is LITTLE ENDIAN
+
+\todo complement with the other theoretical and practical comments relevant,
+including:
+1. How the table is not stable with regards to insertions and deletions,
+2. How an insertion can cascade into very long chains of evictions/reinsertions
+3. The theoretical guarantee that the longest PSL is in the order of Log(N)
+4. How it seems that in practice the theoretical guarantee is not achieved.
+...
+
+\todo determine a moment to endure the version control pain of making the
+indentation consistent.
+*/
+
 namespace zoo {
 namespace rh {
 
@@ -31,6 +68,7 @@ struct RelocationStackExhausted: RobinHoodException {
     using RobinHoodException::RobinHoodException;
 };
 
+/// \brief The canonical backend (implementation)
 template<int PSL_Bits, int HashBits, typename U = std::uint64_t>
 struct RH_Backend {
     using Metadata = impl::Metadata<PSL_Bits, HashBits, U>;
@@ -38,31 +76,8 @@ struct RH_Backend {
     constexpr static inline auto Width = Metadata::NBits;
     Metadata *md_;
 
-    /*! \brief SWAR check for a potential match
-    The invariant in Robin Hood is that the element being looked for, the "needle", is "richer"
-    than the elements already present, the "haystack".
-    "Richer" means that the PSL is smaller.
-    A PSL of 0 can only happen in the haystack, to indicate the slot is empty, this is "richest".
-    The first time the needle has a PSL greater than the haystacks' means the matching will fail,
-    because the hypothetical prior insertion would have "stolen" that slot.
-    If there is an equal, it would start a sequence of potential matches.  To determine an actual match:
-    1. A cheap SWAR check of hoisted hashes
-    2. If there are still potential matches (now also the hoisted hashes), fall back to non-SWAR,
-    or iterative and expensive "deep equality" test for each potential match, outside of this function
-
-    The above makes it very important to detect the first case in which the PSL is greater equal to the needle.
-    We call this the "deadline".
-    Because we assume the LITTLE ENDIAN byte ordering, the first element would be the least significant
-    non-false Boolean SWAR.
-
-    Note about performance:
-    Every "early exit" faces a big justification hurdle, the proportion of cases
-    they intercept to be large enough that the branch prediction penalty of the entropy introduced is
-    overcompensated.
-    */
-
-    /// Boolean SWAR true in the first element/lane of the needle strictly poorer than its corresponding
-    /// haystack
+    /// Boolean SWAR true in the first element/lane of the needle strictly
+    /// poorer than its corresponding haystack
     constexpr static auto
     firstInvariantBreakage(Metadata needle, Metadata haystack) {
         auto nPSL = needle.PSLs();
@@ -97,6 +112,36 @@ struct RH_Backend {
          return std::tuple{Metadata{nPSL.PSLs() | needlePSLsToSaturate}, bool(saturation)};  // saturated at any point, last swar to check.
      }
 
+
+    /*! \brief SWAR check for a potential match
+
+    The invariant in Robin Hood is that the element being looked for, the
+    "needle", is at least as "rich" as the elements already present (the
+    "haystack").
+    "Richer" means that the PSL is smaller.
+    A PSL of 0 can only happen in the haystack, to indicate the slot is empty,
+    this is "richest".
+    The first time the needle has a PSL greater than the haystacks' means the
+    matching will fail, because the hypothetical prior insertion would have
+    "stolen" that slot.
+    If there is an equal, it would start a sequence of potential matches.  To
+    determine an actual match:
+    1. A cheap SWAR check of hoisted hashes
+    2. If there are still potential matches (now also the hoisted hashes), fall
+    back to non-SWAR, or iterative and expensive "deep equality" test for each
+    potential match, outside of this function.
+
+    The above makes it very important to detect the first case in which the PSL
+    is greater equal to the needle.  We call this the "deadline".
+    Because we assume the LITTLE ENDIAN byte ordering, the first element would
+    be the least significant non-false Boolean SWAR.
+
+    Note about performance:
+    Every "early exit" faces a big justification hurdle, the proportion of cases
+    they intercept to be large enough that the branch prediction penalty of the
+    entropy introduced is overcompensated.
+    */
+
     constexpr static impl::MatchResult<PSL_Bits, HashBits, U>
     potentialMatches(
         Metadata needle, Metadata haystack
@@ -212,6 +257,9 @@ RH_Backend<PSL_Bits, HashBits, U>::findMisaligned_assumesSkarupkeTail(
         }
     }
 
+/// \brief The slots in the table may have a key-value pair or not, this
+/// optionality is not suitably captured by any standard library component,
+/// hence we need to implement our own.
 template<typename K, typename MV>
 struct KeyValuePairWrapper {
     using type = std::pair<K, MV>;
@@ -243,6 +291,15 @@ struct KeyValuePairWrapper {
     const auto &value() const noexcept { return const_cast<KeyValuePairWrapper *>(this)->value(); }
 };
 
+/// \brief Frontend with the "Skarupke Tail"
+///
+/// Normally we need to explicitly check for whether key searches have reached
+/// the end of the table.  Malte Skarupke devised a tail of table entries to
+/// make this explicit check unnecessary: Regardless of the end of the table,
+/// a search must terminate in failure if the maximum PSL is reached, then,
+/// by just adding the maximum PSL entries to the table, while keeping the
+/// slot indexing function the same, searches at the end of the table will never
+/// attempt to go past the real end, but return not-found within the tail.
 template<
     typename K,
     typename MV,
diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h
index b2201823..bfdc2912 100644
--- a/inc/zoo/swar/associative_iteration.h
+++ b/inc/zoo/swar/associative_iteration.h
@@ -80,6 +80,39 @@ struct ArithmeticResultTriplet {
 };
 
 
+/// \brief "Safe" addition, meaning non-corrupting unsigned overflow addition
+/// and producing the flags for unsigned overflow (carry) and signed overflow.
+/// This is the function to perform signed addition (that relies on supporting
+/// unsigned overflow)
+///
+/// This function is called "full addition" because it can perform the addition
+/// with all the bits of the inputs by making sure the overflowing (in the
+/// unsigned sense) does not cross the lane boundary.
+/// This function has less performance than "optimistic" addition (operator+).
+/// The mechanism to manage potential overflow naturally allows the calculation
+/// of the carry and signed overflow flags for no extra performance cost.
+///
+/// The performance relies on the optimizer removing the calculation of
+/// the carry or signed overflow if they are not used.
+///
+/// When interpreted as unsigned addition, carrying out of the result is
+/// overflow.
+///
+/// The carry bit is essential to increase the precision of the results in
+/// normal arithmetic, but in unsigned SWAR it is preferable to double the
+/// precision before executing addition, thus guaranteeing no overflow will
+/// occur and using the more performing operator+ addition.  Hence,
+/// the carry flag is mostly useful in SWAR for detection of unsigned overflow.
+///
+/// The signed integer interpretation is the technique of two's complement, that
+/// routinely overflows (as interpreted as unsigned).  Signed overflow may only
+/// occur if the inputs have the same sign, it is detected when the sign of the
+/// result is opposite that of the inputs.
+///
+/// \todo The library is not explicit with regards to the fact that
+/// operator+ is only useful with the unsigned interpretation.  A decision
+/// must be made to either keep the library as is, or to promote full addition
+/// to operator+, and the rationale for the decision
 template<int NB, typename B>
 constexpr ArithmeticResultTriplet<NB, B>
 fullAddition(SWAR<NB, B> s1, SWAR<NB, B> s2) {
@@ -106,7 +139,11 @@ fullAddition(SWAR<NB, B> s1, SWAR<NB, B> s2) {
     return { result, BS{carry.value()}, BS{overflow.value()} };
 };
 
-
+/// \brief Negation is useful only for the signed integer interpretation
+/// @tparam B 
+/// @tparam NB 
+/// @param input 
+/// @return 
 template<int NB, typename B>
 constexpr auto negate(SWAR<NB, B> input) {
     using S = SWAR<NB, B>;

From 4475b13e065270c38f0bd85320fd457477b996b6 Mon Sep 17 00:00:00 2001
From: Eddie <eddie see email elsewhere>
Date: Tue, 16 Jan 2024 19:31:43 -0800
Subject: [PATCH 02/10] More comments, dos2unix

---
 inc/zoo/swar/associative_iteration.h | 589 +++++++++++++--------------
 1 file changed, 294 insertions(+), 295 deletions(-)

diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h
index bfdc2912..8da3c838 100644
--- a/inc/zoo/swar/associative_iteration.h
+++ b/inc/zoo/swar/associative_iteration.h
@@ -1,295 +1,294 @@
-#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H
-#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H
-
-#include "zoo/swar/SWAR.h"
-
-namespace zoo::swar {
-
-/// \note This code should be substituted by an application of "progressive" algebraic iteration
-/// \note There is also parallelPrefix (to be implemented)
-template<int NB, typename B>
-constexpr SWAR<NB, B> parallelSuffix(SWAR<NB, B> input) {
-    using S = SWAR<NB, B>;
-    auto
-        shiftClearingMask = S{~S::MostSignificantBit},
-        doubling = input,
-        result = S{0};
-    auto
-        bitsToXOR = NB,
-        power = 1;
-    for(;;) {
-        if(1 & bitsToXOR) {
-            result = result ^ doubling;
-            doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
-        }
-        bitsToXOR >>= 1;
-        if(!bitsToXOR) { break; }
-        auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
-        doubling = doubling ^ shifted;
-        // 01...1
-        // 001...1
-        // 00001...1
-        // 000000001...1
-        shiftClearingMask =
-            shiftClearingMask & S{shiftClearingMask.value() >> power};
-        power <<= 1;
-    }
-    return S{result};
-}
-
-/// \todo because of the desirability of "accumuating" the XORs at the MSB,
-/// the parallel suffix operation is more suitable.
-template<int NB, typename B>
-constexpr SWAR<NB, B> parity(SWAR<NB, B> input) {
-    using S = SWAR<NB, B>;
-    auto preResult = parallelSuffix(input);
-    auto onlyMSB = preResult.value() & S::MostSignificantBit;
-    return S{onlyMSB};
-}
-
-
-namespace impl {
-template<int NB, typename B>
-constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR<NB, B> msb, SWAR<NB, B> lsb) {
-    auto msbCopiedDown = msb - lsb;
-    auto msbReintroduced = msbCopiedDown | msb;
-    return msbReintroduced;
-}
-}
-
-template<int NB, typename B>
-constexpr auto makeLaneMaskFromLSB(SWAR<NB, B> input) {
-    using S = SWAR<NB, B>;
-    auto lsb = input & S{S::LeastSignificantBit};
-    auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)};
-    return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb);
-}
-
-template<int NB, typename B>
-constexpr auto makeLaneMaskFromMSB(SWAR<NB, B> input) {
-    using S = SWAR<NB, B>;
-    auto msb = input & S{S::MostSignificantBit};
-    auto msbCopiedToLSB = S{msb.value() >> (NB - 1)};
-    return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB);
-}
-
-template<int NB, typename B>
-struct ArithmeticResultTriplet {
-    SWAR<NB, B> result;
-    BooleanSWAR<NB, B> carry, overflow;
-};
-
-
-/// \brief "Safe" addition, meaning non-corrupting unsigned overflow addition
-/// and producing the flags for unsigned overflow (carry) and signed overflow.
-/// This is the function to perform signed addition (that relies on supporting
-/// unsigned overflow)
-///
-/// This function is called "full addition" because it can perform the addition
-/// with all the bits of the inputs by making sure the overflowing (in the
-/// unsigned sense) does not cross the lane boundary.
-/// This function has less performance than "optimistic" addition (operator+).
-/// The mechanism to manage potential overflow naturally allows the calculation
-/// of the carry and signed overflow flags for no extra performance cost.
-///
-/// The performance relies on the optimizer removing the calculation of
-/// the carry or signed overflow if they are not used.
-///
-/// When interpreted as unsigned addition, carrying out of the result is
-/// overflow.
-///
-/// The carry bit is essential to increase the precision of the results in
-/// normal arithmetic, but in unsigned SWAR it is preferable to double the
-/// precision before executing addition, thus guaranteeing no overflow will
-/// occur and using the more performing operator+ addition.  Hence,
-/// the carry flag is mostly useful in SWAR for detection of unsigned overflow.
-///
-/// The signed integer interpretation is the technique of two's complement, that
-/// routinely overflows (as interpreted as unsigned).  Signed overflow may only
-/// occur if the inputs have the same sign, it is detected when the sign of the
-/// result is opposite that of the inputs.
-///
-/// \todo The library is not explicit with regards to the fact that
-/// operator+ is only useful with the unsigned interpretation.  A decision
-/// must be made to either keep the library as is, or to promote full addition
-/// to operator+, and the rationale for the decision
-template<int NB, typename B>
-constexpr ArithmeticResultTriplet<NB, B>
-fullAddition(SWAR<NB, B> s1, SWAR<NB, B> s2) {
-    using S = SWAR<NB, B>;
-    constexpr auto
-        SignBit = S{S::MostSignificantBit},
-        LowerBits = SignBit - S{S::LeastSignificantBit};
-    // prevent overflow by clearing the most significant bits
-    auto
-        s1prime = LowerBits & s1,
-        s2prime = LowerBits & s2,
-        resultPrime = s1prime + s2prime,
-        s1Sign = SignBit & s1,
-        s2Sign = SignBit & s2,
-        signPrime = SignBit & resultPrime,
-        result = resultPrime ^ s1Sign ^ s2Sign,
-        // carry is set whenever at least two of the sign bits of s1, s2,
-        // signPrime are set
-        carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime),
-        // overflow: the inputs have the same sign and different to result
-        // same sign: s1Sign ^ s2Sign
-        overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result);
-    using BS = BooleanSWAR<NB, B>;
-    return { result, BS{carry.value()}, BS{overflow.value()} };
-};
-
-/// \brief Negation is useful only for the signed integer interpretation
-/// @tparam B 
-/// @tparam NB 
-/// @param input 
-/// @return 
-template<int NB, typename B>
-constexpr auto negate(SWAR<NB, B> input) {
-    using S = SWAR<NB, B>;
-    constexpr auto Ones = S{S::LeastSignificantBit};
-    return fullAddition(~input, Ones).result;
-}
-
-/// \brief Performs a generalized iterated application of an associative operator to a base
-///
-/// In algebra, the repeated application of an operator to a "base" has different names depending on the
-/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition",
-/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n
-/// power".
-/// The general term in algebra is "iteration", hence the naming of this function.
-/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we
-/// keep the option to use the imprecise language of "product, base and exponent".  "Iteration" has a very
-/// different meaning in programming and especially different in C++.
-/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this
-/// function leverages the associative property of the operator to "halve" the count of iterations at each step.
-/// \note There is a symmetrical operation to be implemented of associative iteration in the
-/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb,
-/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)"
-/// \tparam Operator a callable with three arguments: the left and right arguments to the operation
-/// and the count to be used, the "count" is an artifact of this generalization
-/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not
-/// be a number, the iteration count is part of the execution context to apply the operator
-/// \param forSquaring is an artifact of this generalization
-/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows
-/// there are fewer iterations than what the type of exponent would allow
-template<
-    typename Base, typename IterationCount, typename Operator,
-    // the critical use of associativity is that it allows halving the
-    // iteration count
-    typename CountHalver
->
-constexpr auto associativeOperatorIterated_regressive(
-    Base base, Base neutral, IterationCount count, IterationCount forSquaring,
-    Operator op, unsigned log2Count, CountHalver ch
-) {
-    auto result = neutral;
-    if(!log2Count) { return result; }
-    for(;;) {
-        result = op(result, base, count);
-        if(!--log2Count) { break; }
-        result = op(result, result, forSquaring);
-        count = ch(count);
-    }
-    return result;
-}
-
-template<int ActualBits, int NB, typename T>
-constexpr auto multiplication_OverflowUnsafe_SpecificBitCount(
-    SWAR<NB, T> multiplicand, SWAR<NB, T> multiplier
-) {
-    using S = SWAR<NB, T>;
-
-    auto operation = [](auto left, auto right, auto counts) {
-        auto addendums = makeLaneMaskFromMSB(counts);
-        return left + (addendums & right);
-    };
-
-    auto halver = [](auto counts) {
-        auto msbCleared = counts & ~S{S::MostSignificantBit};
-        return S{msbCleared.value() << 1};
-    };
-
-    multiplier = S{multiplier.value() << (NB - ActualBits)};
-    return associativeOperatorIterated_regressive(
-        multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation,
-        ActualBits, halver
-    );
-}
-
-/// \note Not removed yet because it is an example of "progressive" associative exponentiation
-template<int ActualBits, int NB, typename T>
-constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated(
-    SWAR<NB, T> multiplicand,
-    SWAR<NB, T> multiplier
-) {
-    using S = SWAR<NB, T>;
-    constexpr auto LeastBit = S::LeastSignificantBit;
-    auto multiplicandDoubling = multiplicand.value();
-    auto mplier = multiplier.value();
-    auto product = S{0};
-    for(auto count = ActualBits;;) {
-        auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier});
-        product = product + (multiplicandDoublingMask & S{multiplicandDoubling});
-        if(!--count) { break; }
-        multiplicandDoubling <<= 1;
-        auto leastBitCleared = mplier & ~LeastBit;
-        mplier = leastBitCleared >> 1;
-    }
-    return product;
-}
-
-template<int NB, typename T>
-constexpr auto multiplication_OverflowUnsafe(
-    SWAR<NB, T> multiplicand,
-    SWAR<NB, T> multiplier
-) {
-    return
-        multiplication_OverflowUnsafe_SpecificBitCount<NB>(
-            multiplicand, multiplier
-        );
-}
-
-template<int NB, typename T>
-struct SWAR_Pair{
-    SWAR<NB, T> even, odd;
-};
-
-template<int NB, typename T>
-constexpr SWAR<NB, T> doublingMask() {
-    using S = SWAR<NB, T>;
-    static_assert(0 == S::Lanes % 2, "Only even number of elements supported");
-    using D = SWAR<NB * 2, T>;
-    return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit};
-}
-
-template<int NB, typename T>
-constexpr auto doublePrecision(SWAR<NB, T> input) {
-    using S = SWAR<NB, T>;
-    static_assert(
-        0 == S::NSlots % 2,
-        "Precision can only be doubled for SWARs of even element count"
-    );
-    using RV = SWAR<NB * 2, T>;
-    constexpr auto DM = doublingMask<NB, T>();
-    return SWAR_Pair<NB * 2, T>{
-        RV{(input & DM).value()},
-        RV{(input.value() >> NB) & DM.value()}
-    };
-}
-
-template<int NB, typename T>
-constexpr auto halvePrecision(SWAR<NB, T> even, SWAR<NB, T> odd) {
-    using S = SWAR<NB, T>;
-    static_assert(0 == NB % 2, "Only even lane-bitcounts supported");
-    using RV = SWAR<NB/2, T>;
-    constexpr auto HalvingMask = doublingMask<NB/2, T>();
-    auto
-        evenHalf = RV{even.value()} & HalvingMask,
-        oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2};
-    return evenHalf | oddHalf;
-}
-
-}
-
-#endif
+#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H
+#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H
+
+#include "zoo/swar/SWAR.h"
+
+namespace zoo::swar {
+
+/// \note This code should be substituted by an application of "progressive" algebraic iteration
+/// \note There is also parallelPrefix (to be implemented)
+template<int NB, typename B>
+constexpr SWAR<NB, B> parallelSuffix(SWAR<NB, B> input) {
+    using S = SWAR<NB, B>;
+    auto
+        shiftClearingMask = S{~S::MostSignificantBit},
+        doubling = input,
+        result = S{0};
+    auto
+        bitsToXOR = NB,
+        power = 1;
+    for(;;) {
+        if(1 & bitsToXOR) {
+            result = result ^ doubling;
+            doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
+        }
+        bitsToXOR >>= 1;
+        if(!bitsToXOR) { break; }
+        auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
+        doubling = doubling ^ shifted;
+        // 01...1
+        // 001...1
+        // 00001...1
+        // 000000001...1
+        shiftClearingMask =
+            shiftClearingMask & S{shiftClearingMask.value() >> power};
+        power <<= 1;
+    }
+    return S{result};
+}
+
+/// \todo because of the desirability of "accumulating" the XORs at the MSB,
+/// the parallel suffix operation is more suitable.
+template<int NB, typename B>
+constexpr SWAR<NB, B> parity(SWAR<NB, B> input) {
+    using S = SWAR<NB, B>;
+    auto preResult = parallelSuffix(input);
+    auto onlyMSB = preResult.value() & S::MostSignificantBit;
+    return S{onlyMSB};
+}
+
+
+namespace impl {
+template<int NB, typename B>
+constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR<NB, B> msb, SWAR<NB, B> lsb) {
+    auto msbCopiedDown = msb - lsb;
+    auto msbReintroduced = msbCopiedDown | msb;
+    return msbReintroduced;
+}
+}
+
+template<int NB, typename B>
+constexpr auto makeLaneMaskFromLSB(SWAR<NB, B> input) {
+    using S = SWAR<NB, B>;
+    auto lsb = input & S{S::LeastSignificantBit};
+    auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)};
+    return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb);
+}
+
+template<int NB, typename B>
+constexpr auto makeLaneMaskFromMSB(SWAR<NB, B> input) {
+    using S = SWAR<NB, B>;
+    auto msb = input & S{S::MostSignificantBit};
+    auto msbCopiedToLSB = S{msb.value() >> (NB - 1)};
+    return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB);
+}
+
+template<int NB, typename B>
+struct ArithmeticResultTriplet {
+    SWAR<NB, B> result;
+    BooleanSWAR<NB, B> carry, overflow;
+};
+
+
+/// \brief "Safe" addition, meaning non-corrupting unsigned overflow addition
+/// and producing the flags for unsigned overflow (carry) and signed overflow.
+/// This is the function to perform signed addition (that relies on supporting
+/// unsigned overflow)
+///
+/// This function is called "full addition" because it can perform the addition
+/// with all the bits of the inputs by making sure the overflowing (in the
+/// unsigned sense) does not cross the lane boundary.
+/// This function has less performance than "optimistic" addition (operator+).
+/// The mechanism to manage potential overflow naturally allows the calculation
+/// of the carry and signed overflow flags for no extra performance cost.
+///
+/// The performance relies on the optimizer removing the calculation of
+/// the carry or signed overflow if they are not used.
+///
+/// When interpreted as unsigned addition, carrying out of the result is
+/// overflow.
+///
+/// The carry bit is essential to increase the precision of the results in
+/// normal arithmetic, but in unsigned SWAR it is preferable to double the
+/// precision before executing addition, thus guaranteeing no overflow will
+/// occur and using the more performing operator+ addition.  Hence,
+/// the carry flag is mostly useful in SWAR for detection of unsigned overflow.
+///
+/// The signed integer interpretation is the technique of two's complement, that
+/// routinely overflows (as interpreted as unsigned).  Signed overflow may only
+/// occur if the inputs have the same sign, it is detected when the sign of the
+/// result is opposite that of the inputs.
+///
+/// \todo The library is not explicit with regards to the fact that
+/// operator+ is only useful with the unsigned interpretation.  A decision
+/// must be made to either keep the library as is, or to promote full addition
+/// to operator+, and the rationale for the decision
+///
+/// \todo What is the right place for this function?
+/// It was added here because in practice multiplication overflows, as a draft
+template<int NB, typename B>
+constexpr ArithmeticResultTriplet<NB, B>
+fullAddition(SWAR<NB, B> s1, SWAR<NB, B> s2) {
+    using S = SWAR<NB, B>;
+    constexpr auto
+        SignBit = S{S::MostSignificantBit},
+        LowerBits = SignBit - S{S::LeastSignificantBit};
+    // prevent overflow by clearing the most significant bits
+    auto
+        s1prime = LowerBits & s1,
+        s2prime = LowerBits & s2,
+        resultPrime = s1prime + s2prime,
+        s1Sign = SignBit & s1,
+        s2Sign = SignBit & s2,
+        signPrime = SignBit & resultPrime,
+        result = resultPrime ^ s1Sign ^ s2Sign,
+        // carry is set whenever at least two of the sign bits of s1, s2,
+        // signPrime are set
+        carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime),
+        // overflow: the inputs have the same sign and different to result
+        // same sign: s1Sign ^ s2Sign
+        overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result);
+    using BS = BooleanSWAR<NB, B>;
+    return { result, BS{carry.value()}, BS{overflow.value()} };
+};
+
+/// \brief Negation is useful only for the signed integer interpretation
+template<int NB, typename B>
+constexpr auto negate(SWAR<NB, B> input) {
+    using S = SWAR<NB, B>;
+    constexpr auto Ones = S{S::LeastSignificantBit};
+    return fullAddition(~input, Ones).result;
+}
+
+/// \brief Performs a generalized iterated application of an associative operator to a base
+///
+/// In algebra, the repeated application of an operator to a "base" has different names depending on the
+/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition",
+/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n
+/// power".
+/// The general term in algebra is "iteration", hence the naming of this function.
+/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we
+/// keep the option to use the imprecise language of "product, base and exponent".  "Iteration" has a very
+/// different meaning in programming and especially different in C++.
+/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this
+/// function leverages the associative property of the operator to "halve" the count of iterations at each step.
+/// \note There is a symmetrical operation to be implemented of associative iteration in the
+/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb,
+/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)"
+/// \tparam Operator a callable with three arguments: the left and right arguments to the operation
+/// and the count to be used, the "count" is an artifact of this generalization
+/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not
+/// be a number, the iteration count is part of the execution context to apply the operator
+/// \param forSquaring is an artifact of this generalization
+/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows
+/// there are fewer iterations than what the type of exponent would allow
+template<
+    typename Base, typename IterationCount, typename Operator,
+    // the critical use of associativity is that it allows halving the
+    // iteration count
+    typename CountHalver
+>
+constexpr auto associativeOperatorIterated_regressive(
+    Base base, Base neutral, IterationCount count, IterationCount forSquaring,
+    Operator op, unsigned log2Count, CountHalver ch
+) {
+    auto result = neutral;
+    if(!log2Count) { return result; }
+    for(;;) {
+        result = op(result, base, count);
+        if(!--log2Count) { break; }
+        result = op(result, result, forSquaring);
+        count = ch(count);
+    }
+    return result;
+}
+
+template<int ActualBits, int NB, typename T>
+constexpr auto multiplication_OverflowUnsafe_SpecificBitCount(
+    SWAR<NB, T> multiplicand, SWAR<NB, T> multiplier
+) {
+    using S = SWAR<NB, T>;
+
+    auto operation = [](auto left, auto right, auto counts) {
+        auto addendums = makeLaneMaskFromMSB(counts);
+        return left + (addendums & right);
+    };
+
+    auto halver = [](auto counts) {
+        auto msbCleared = counts & ~S{S::MostSignificantBit};
+        return S{msbCleared.value() << 1};
+    };
+
+    multiplier = S{multiplier.value() << (NB - ActualBits)};
+    return associativeOperatorIterated_regressive(
+        multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation,
+        ActualBits, halver
+    );
+}
+
+/// \note Not removed yet because it is an example of "progressive" associative exponentiation
+template<int ActualBits, int NB, typename T>
+constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated(
+    SWAR<NB, T> multiplicand,
+    SWAR<NB, T> multiplier
+) {
+    using S = SWAR<NB, T>;
+    constexpr auto LeastBit = S::LeastSignificantBit;
+    auto multiplicandDoubling = multiplicand.value();
+    auto mplier = multiplier.value();
+    auto product = S{0};
+    for(auto count = ActualBits;;) {
+        auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier});
+        product = product + (multiplicandDoublingMask & S{multiplicandDoubling});
+        if(!--count) { break; }
+        multiplicandDoubling <<= 1;
+        auto leastBitCleared = mplier & ~LeastBit;
+        mplier = leastBitCleared >> 1;
+    }
+    return product;
+}
+
+template<int NB, typename T>
+constexpr auto multiplication_OverflowUnsafe(
+    SWAR<NB, T> multiplicand,
+    SWAR<NB, T> multiplier
+) {
+    return
+        multiplication_OverflowUnsafe_SpecificBitCount<NB>(
+            multiplicand, multiplier
+        );
+}
+
+template<int NB, typename T>
+struct SWAR_Pair{
+    SWAR<NB, T> even, odd;
+};
+
+template<int NB, typename T>
+constexpr SWAR<NB, T> doublingMask() {
+    using S = SWAR<NB, T>;
+    static_assert(0 == S::Lanes % 2, "Only even number of elements supported");
+    using D = SWAR<NB * 2, T>;
+    return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit};
+}
+
+template<int NB, typename T>
+constexpr auto doublePrecision(SWAR<NB, T> input) {
+    using S = SWAR<NB, T>;
+    static_assert(
+        0 == S::NSlots % 2,
+        "Precision can only be doubled for SWARs of even element count"
+    );
+    using RV = SWAR<NB * 2, T>;
+    constexpr auto DM = doublingMask<NB, T>();
+    return SWAR_Pair<NB * 2, T>{
+        RV{(input & DM).value()},
+        RV{(input.value() >> NB) & DM.value()}
+    };
+}
+
+template<int NB, typename T>
+constexpr auto halvePrecision(SWAR<NB, T> even, SWAR<NB, T> odd) {
+    using S = SWAR<NB, T>;
+    static_assert(0 == NB % 2, "Only even lane-bitcounts supported");
+    using RV = SWAR<NB/2, T>;
+    constexpr auto HalvingMask = doublingMask<NB/2, T>();
+    auto
+        evenHalf = RV{even.value()} & HalvingMask,
+        oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2};
+    return evenHalf | oddHalf;
+}
+
+}
+
+#endif

From b1b8b5e0506900bbb815f8e53708c225a0f36fc5 Mon Sep 17 00:00:00 2001
From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com>
Date: Wed, 17 Jan 2024 00:09:41 -0400
Subject: [PATCH 03/10] Update inc/zoo/swar/associative_iteration.h

grammar nit
---
 inc/zoo/swar/associative_iteration.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h
index 8da3c838..f047b9bb 100644
--- a/inc/zoo/swar/associative_iteration.h
+++ b/inc/zoo/swar/associative_iteration.h
@@ -86,7 +86,7 @@ struct ArithmeticResultTriplet {
 /// unsigned overflow)
 ///
 /// This function is called "full addition" because it can perform the addition
-/// with all the bits of the inputs by making sure the overflowing (in the
+/// with all the bits of the inputs by making sure the overflow (in the
 /// unsigned sense) does not cross the lane boundary.
 /// This function has less performance than "optimistic" addition (operator+).
 /// The mechanism to manage potential overflow naturally allows the calculation

From 2d0b540b40fcc9f342611f30f29be733fbce8114 Mon Sep 17 00:00:00 2001
From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com>
Date: Wed, 17 Jan 2024 00:14:57 -0400
Subject: [PATCH 04/10] Update inc/zoo/swar/associative_iteration.h

---
 inc/zoo/swar/associative_iteration.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h
index f047b9bb..35bba6bc 100644
--- a/inc/zoo/swar/associative_iteration.h
+++ b/inc/zoo/swar/associative_iteration.h
@@ -101,7 +101,7 @@ struct ArithmeticResultTriplet {
 /// The carry bit is essential to increase the precision of the results in
 /// normal arithmetic, but in unsigned SWAR it is preferable to double the
 /// precision before executing addition, thus guaranteeing no overflow will
-/// occur and using the more performing operator+ addition.  Hence,
+/// occur and using the more performant operator+ addition.  Hence,
 /// the carry flag is mostly useful in SWAR for detection of unsigned overflow.
 ///
 /// The signed integer interpretation is the technique of two's complement, that

From e3f652b0814130cb37390f9354f26147db0a6e2a Mon Sep 17 00:00:00 2001
From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com>
Date: Wed, 17 Jan 2024 00:19:12 -0400
Subject: [PATCH 05/10] Update inc/zoo/swar/associative_iteration.h

---
 inc/zoo/swar/associative_iteration.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h
index 35bba6bc..01aea576 100644
--- a/inc/zoo/swar/associative_iteration.h
+++ b/inc/zoo/swar/associative_iteration.h
@@ -102,7 +102,7 @@ struct ArithmeticResultTriplet {
 /// normal arithmetic, but in unsigned SWAR it is preferable to double the
 /// precision before executing addition, thus guaranteeing no overflow will
 /// occur and using the more performant operator+ addition.  Hence,
-/// the carry flag is mostly useful in SWAR for detection of unsigned overflow.
+/// the carry and overflow flags are mostly useful in SWAR for detection of unsigned overflow (as for unsigned addition they are semantically identical.
 ///
 /// The signed integer interpretation is the technique of two's complement, that
 /// routinely overflows (as interpreted as unsigned).  Signed overflow may only

From 8f59029e77b17d83aa592d05bf34d081b926e82f Mon Sep 17 00:00:00 2001
From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com>
Date: Wed, 17 Jan 2024 00:23:47 -0400
Subject: [PATCH 06/10] Update inc/zoo/swar/associative_iteration.h

---
 inc/zoo/swar/associative_iteration.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h
index 01aea576..97dd4089 100644
--- a/inc/zoo/swar/associative_iteration.h
+++ b/inc/zoo/swar/associative_iteration.h
@@ -104,7 +104,7 @@ struct ArithmeticResultTriplet {
 /// occur and using the more performant operator+ addition.  Hence,
 /// the carry and overflow flags are mostly useful in SWAR for detection of unsigned overflow (as for unsigned addition they are semantically identical.
 ///
-/// The signed integer interpretation is the technique of two's complement, that
+/// The signed integer interpretation is two's complement, which
 /// routinely overflows (as interpreted as unsigned).  Signed overflow may only
 /// occur if the inputs have the same sign, it is detected when the sign of the
 /// result is opposite that of the inputs.

From 933877e6ff21df3ac3bb4af5dbf95c8101deb3e9 Mon Sep 17 00:00:00 2001
From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com>
Date: Wed, 17 Jan 2024 00:25:44 -0400
Subject: [PATCH 07/10] Update inc/zoo/swar/associative_iteration.h

---
 inc/zoo/swar/associative_iteration.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h
index 97dd4089..1ea526cc 100644
--- a/inc/zoo/swar/associative_iteration.h
+++ b/inc/zoo/swar/associative_iteration.h
@@ -105,7 +105,7 @@ struct ArithmeticResultTriplet {
 /// the carry and overflow flags are mostly useful in SWAR for detection of unsigned overflow (as for unsigned addition they are semantically identical.
 ///
 /// The signed integer interpretation is two's complement, which
-/// routinely overflows (as interpreted as unsigned).  Signed overflow may only
+/// routinely overflows (when interpreted as unsigned).  Signed overflow may only
 /// occur if the inputs have the same sign, it is detected when the sign of the
 /// result is opposite that of the inputs.
 ///

From 69ca47ca0b64b29470d87903f52fe9312a4907f6 Mon Sep 17 00:00:00 2001
From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com>
Date: Wed, 17 Jan 2024 00:29:04 -0400
Subject: [PATCH 08/10] Update inc/zoo/map/RobinHood.h

---
 inc/zoo/map/RobinHood.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inc/zoo/map/RobinHood.h b/inc/zoo/map/RobinHood.h
index 6217ee1b..8beeb1d8 100644
--- a/inc/zoo/map/RobinHood.h
+++ b/inc/zoo/map/RobinHood.h
@@ -299,7 +299,7 @@ struct KeyValuePairWrapper {
 /// a search must terminate in failure if the maximum PSL is reached, then,
 /// by just adding the maximum PSL entries to the table, while keeping the
 /// slot indexing function the same, searches at the end of the table will never
-/// atempt to go past the real end, but return not-found within the tail.
+/// attempt to go past the real end, but return not-found within the tail.
 template<
     typename K,
     typename MV,

From 2b7d06ac14632cb49740597bd7cebfa062dd4796 Mon Sep 17 00:00:00 2001
From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com>
Date: Wed, 17 Jan 2024 00:41:19 -0400
Subject: [PATCH 09/10] Apply suggestions from code review

---
 inc/zoo/map/RobinHood.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/inc/zoo/map/RobinHood.h b/inc/zoo/map/RobinHood.h
index 8beeb1d8..c156bb23 100644
--- a/inc/zoo/map/RobinHood.h
+++ b/inc/zoo/map/RobinHood.h
@@ -124,8 +124,8 @@ struct RH_Backend {
     The first time the needle has a PSL greater than the haystacks' means the
     matching will fail, because the hypothetical prior insertion would have
     "stolen" that slot.
-    If there is an equal, it would start a sequence of potential matches.  To
-    determine an actual match:
+    If the PSLs are equal, it starts a sequence of potential matches.  To
+    determine if there is an actual match, perform:
     1. A cheap SWAR check of hoisted hashes
     2. If there are still potential matches (now also the hoisted hashes), fall
     back to non-SWAR, or iterative and expensive "deep equality" test for each
@@ -133,13 +133,13 @@ struct RH_Backend {
 
     The above makes it very important to detect the first case in which the PSL
     is greater equal to the needle.  We call this the "deadline".
-    Because we assume the LITTLE ENDIAN byte ordering, the first element would
+    We assume the LITTLE ENDIAN byte ordering: the first element will
     be the least significant non-false Boolean SWAR.
 
     Note about performance:
     Every "early exit" faces a big justification hurdle, the proportion of cases
-    they intercept to be large enough that the branch prediction penalty of the
-    entropy introduced is overcompensated.
+    they intercept must be large enough that the branch prediction penalty of the
+    entropy introduced (by the early exit) is overcompensated.
     */
 
     constexpr static impl::MatchResult<PSL_Bits, HashBits, U>

From 836792229a232d9c79a18a97cda1d79ee9c18bed Mon Sep 17 00:00:00 2001
From: Scottbruceheart <105394870+Scottbruceheart@users.noreply.github.com>
Date: Wed, 17 Jan 2024 00:43:08 -0400
Subject: [PATCH 10/10] Apply suggestions from code review

---
 inc/zoo/map/RobinHood.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inc/zoo/map/RobinHood.h b/inc/zoo/map/RobinHood.h
index c156bb23..6270ec74 100644
--- a/inc/zoo/map/RobinHood.h
+++ b/inc/zoo/map/RobinHood.h
@@ -295,9 +295,9 @@ struct KeyValuePairWrapper {
 ///
 /// Normally we need to explicitly check for whether key searches have reached
 /// the end of the table.  Malte Skarupke devised a tail of table entries to
-/// make this explicit check innecessary: Regardless of the end of the table,
+/// make this explicit check unnecessary: Regardless of the end of the table,
 /// a search must terminate in failure if the maximum PSL is reached, then,
-/// by just adding the maximum PSL entries to the table, while keeping the
+/// by adding as many extra table entries as the maximum PSL, while keeping the
 /// slot indexing function the same, searches at the end of the table will never
 /// attempt to go past the real end, but return not-found within the tail.
 template<