From 87ddc04056ed16c22191e761671265cd576a37f4 Mon Sep 17 00:00:00 2001 From: Eddie Date: Thu, 22 Feb 2024 18:02:56 -0800 Subject: [PATCH 1/4] Compress/expand --- .gitignore | 11 +- inc/zoo/swar/SWAR.h | 13 +- inc/zoo/swar/associative_iteration.h | 715 +++++++++++++++++---------- test/swar/BasicOperations.cpp | 12 + 4 files changed, 484 insertions(+), 267 deletions(-) diff --git a/.gitignore b/.gitignore index 64d2f48d..b7a554bb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ -# Vscode does not like to build outside of the source tree -# (multiple glitches) - -.vscode -test/.vscode +# Vscode does not like to build outside of the source tree +# (multiple glitches) + +.vscode +test/.vscode +build diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index d25fd457..4d47dc17 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -15,7 +15,7 @@ namespace zoo { namespace swar { using u64 = uint64_t; using u32 = uint32_t; using u16 = uint16_t; -using u8 = uint8_t; +using u8 = std::uint8_t; template constexpr uint64_t popcount(uint64_t a) noexcept { @@ -58,7 +58,10 @@ struct SWAR { SignificantBitsCount = BitWidth - PaddingBitsCount, AllOnes = ~std::make_unsigned_t{0} >> PaddingBitsCount, LeastSignificantBit = meta::BitmaskMaker{1}, NBits>::value, - MostSignificantBit = LeastSignificantBit << (NBits - 1); + MostSignificantBit = LeastSignificantBit << (NBits - 1), + // Use LowerBits in favor of ~MostSignificantBit to not pollute + // "don't care" bits when non-power-of-two bit lane sizes are supported + LowerBits = MostSignificantBit - LeastSignificantBit; SWAR() = default; constexpr explicit SWAR(T v): m_v(v) {} @@ -134,14 +137,16 @@ struct SWAR { /// We are not sure the optimizer would maintain this mask somewhere, if it was to recalculate it it would be disastrous for performance. 
constexpr SWAR shiftIntraLaneLeft(int bitCount, SWAR protectiveMask) const noexcept { - return SWAR{(*this & protectiveMask).value() << bitCount}; + T shiftC = static_cast(bitCount); // could be a narrowing conversion + auto V = (*this & protectiveMask).value(); + return SWAR{static_cast(V << shiftC)}; } /// \param protectiveMask should clear the bits that would cross the lane /// \sa shiftIntraLaneLeft constexpr SWAR shiftIntraLaneRight(int bitCount, SWAR protectiveMask) const noexcept { - return SWAR{(*this & protectiveMask).value() >> bitCount}; + return SWAR{(*this & protectiveMask).value() >> T{bitCount}}; } T m_v; diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index b2201823..53e81425 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -1,258 +1,457 @@ -#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H -#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H - -#include "zoo/swar/SWAR.h" - -namespace zoo::swar { - -/// \note This code should be substituted by an application of "progressive" algebraic iteration -/// \note There is also parallelPrefix (to be implemented) -template -constexpr SWAR parallelSuffix(SWAR input) { - using S = SWAR; - auto - shiftClearingMask = S{~S::MostSignificantBit}, - doubling = input, - result = S{0}; - auto - bitsToXOR = NB, - power = 1; - for(;;) { - if(1 & bitsToXOR) { - result = result ^ doubling; - doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask); - } - bitsToXOR >>= 1; - if(!bitsToXOR) { break; } - auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); - doubling = doubling ^ shifted; - // 01...1 - // 001...1 - // 00001...1 - // 000000001...1 - shiftClearingMask = - shiftClearingMask & S{shiftClearingMask.value() >> power}; - power <<= 1; - } - return S{result}; -} - -/// \todo because of the desirability of "accumuating" the XORs at the MSB, -/// the parallel suffix operation is more suitable. 
-template -constexpr SWAR parity(SWAR input) { - using S = SWAR; - auto preResult = parallelSuffix(input); - auto onlyMSB = preResult.value() & S::MostSignificantBit; - return S{onlyMSB}; -} - - -namespace impl { -template -constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR msb, SWAR lsb) { - auto msbCopiedDown = msb - lsb; - auto msbReintroduced = msbCopiedDown | msb; - return msbReintroduced; -} -} - -template -constexpr auto makeLaneMaskFromLSB(SWAR input) { - using S = SWAR; - auto lsb = input & S{S::LeastSignificantBit}; - auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)}; - return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb); -} - -template -constexpr auto makeLaneMaskFromMSB(SWAR input) { - using S = SWAR; - auto msb = input & S{S::MostSignificantBit}; - auto msbCopiedToLSB = S{msb.value() >> (NB - 1)}; - return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB); -} - -template -struct ArithmeticResultTriplet { - SWAR result; - BooleanSWAR carry, overflow; -}; - - -template -constexpr ArithmeticResultTriplet -fullAddition(SWAR s1, SWAR s2) { - using S = SWAR; - constexpr auto - SignBit = S{S::MostSignificantBit}, - LowerBits = SignBit - S{S::LeastSignificantBit}; - // prevent overflow by clearing the most significant bits - auto - s1prime = LowerBits & s1, - s2prime = LowerBits & s2, - resultPrime = s1prime + s2prime, - s1Sign = SignBit & s1, - s2Sign = SignBit & s2, - signPrime = SignBit & resultPrime, - result = resultPrime ^ s1Sign ^ s2Sign, - // carry is set whenever at least two of the sign bits of s1, s2, - // signPrime are set - carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime), - // overflow: the inputs have the same sign and different to result - // same sign: s1Sign ^ s2Sign - overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result); - using BS = BooleanSWAR; - return { result, BS{carry.value()}, BS{overflow.value()} }; -}; - - -template -constexpr auto negate(SWAR input) { - using S = SWAR; - constexpr auto Ones = 
S{S::LeastSignificantBit}; - return fullAddition(~input, Ones).result; -} - -/// \brief Performs a generalized iterated application of an associative operator to a base -/// -/// In algebra, the repeated application of an operator to a "base" has different names depending on the -/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition", -/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n -/// power". -/// The general term in algebra is "iteration", hence the naming of this function. -/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we -/// keep the option to use the imprecise language of "product, base and exponent". "Iteration" has a very -/// different meaning in programming and especially different in C++. -/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this -/// function leverages the associative property of the operator to "halve" the count of iterations at each step. 
-/// \note There is a symmetrical operation to be implemented of associative iteration in the -/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb, -/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)" -/// \tparam Operator a callable with three arguments: the left and right arguments to the operation -/// and the count to be used, the "count" is an artifact of this generalization -/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not -/// be a number, the iteration count is part of the execution context to apply the operator -/// \param forSquaring is an artifact of this generalization -/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows -/// there are fewer iterations than what the type of exponent would allow -template< - typename Base, typename IterationCount, typename Operator, - // the critical use of associativity is that it allows halving the - // iteration count - typename CountHalver -> -constexpr auto associativeOperatorIterated_regressive( - Base base, Base neutral, IterationCount count, IterationCount forSquaring, - Operator op, unsigned log2Count, CountHalver ch -) { - auto result = neutral; - if(!log2Count) { return result; } - for(;;) { - result = op(result, base, count); - if(!--log2Count) { break; } - result = op(result, result, forSquaring); - count = ch(count); - } - return result; -} - -template -constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( - SWAR multiplicand, SWAR multiplier -) { - using S = SWAR; - - auto operation = [](auto left, auto right, auto counts) { - auto addendums = makeLaneMaskFromMSB(counts); - return left + (addendums & right); - }; - - auto halver = [](auto counts) { - auto msbCleared = counts & ~S{S::MostSignificantBit}; - return S{msbCleared.value() << 1}; - }; - - multiplier = S{multiplier.value() << (NB - 
ActualBits)}; - return associativeOperatorIterated_regressive( - multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation, - ActualBits, halver - ); -} - -/// \note Not removed yet because it is an example of "progressive" associative exponentiation -template -constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated( - SWAR multiplicand, - SWAR multiplier -) { - using S = SWAR; - constexpr auto LeastBit = S::LeastSignificantBit; - auto multiplicandDoubling = multiplicand.value(); - auto mplier = multiplier.value(); - auto product = S{0}; - for(auto count = ActualBits;;) { - auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier}); - product = product + (multiplicandDoublingMask & S{multiplicandDoubling}); - if(!--count) { break; } - multiplicandDoubling <<= 1; - auto leastBitCleared = mplier & ~LeastBit; - mplier = leastBitCleared >> 1; - } - return product; -} - -template -constexpr auto multiplication_OverflowUnsafe( - SWAR multiplicand, - SWAR multiplier -) { - return - multiplication_OverflowUnsafe_SpecificBitCount( - multiplicand, multiplier - ); -} - -template -struct SWAR_Pair{ - SWAR even, odd; -}; - -template -constexpr SWAR doublingMask() { - using S = SWAR; - static_assert(0 == S::Lanes % 2, "Only even number of elements supported"); - using D = SWAR; - return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit}; -} - -template -constexpr auto doublePrecision(SWAR input) { - using S = SWAR; - static_assert( - 0 == S::NSlots % 2, - "Precision can only be doubled for SWARs of even element count" - ); - using RV = SWAR; - constexpr auto DM = doublingMask(); - return SWAR_Pair{ - RV{(input & DM).value()}, - RV{(input.value() >> NB) & DM.value()} - }; -} - -template -constexpr auto halvePrecision(SWAR even, SWAR odd) { - using S = SWAR; - static_assert(0 == NB % 2, "Only even lane-bitcounts supported"); - using RV = SWAR; - constexpr auto HalvingMask = doublingMask(); - auto - evenHalf = RV{even.value()} & HalvingMask, - 
oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2}; - return evenHalf | oddHalf; -} - -} - -#endif +#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H +#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H + +#include "zoo/swar/SWAR.h" + +#define ZOO_DEVELOPMENT_DEBUGGING +#ifdef ZOO_DEVELOPMENT_DEBUGGING +#include + +inline std::ostream &binary(std::ostream &out, uint64_t input, int count) { + while(count--) { + out << (1 & input); + input >>= 1; + } + return out; +} + +template +std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR s) { + using S = zoo::swar::SWAR; + auto shiftCounter = sizeof(B) * 8 / NB; + out << "<|"; + auto v = s.value(); + do { + binary(out, v, NB) << '|'; + + } while(--shiftCounter); + return out << ">"; +} + +#define ZOO_TO_STRING(a) #a +// std::endl is needed within the context of debugging: flush the line +#define ZOO_TRACEABLE_EXP_IMPL(F, L, ...) std::cout << '"' << (__VA_ARGS__) << "\", \"" << F << ':' << L << "\", \"" << ZOO_TO_STRING(__VA_ARGS__) << "\"" << std::endl; +#define ZOO_TRACEABLE_EXPRESSION(...) ZOO_TRACEABLE_EXP_IMPL(__FILE__, __LINE__, __VA_ARGS__) + +#else + +#define ZOO_TRACEABLE_EXPRESSION(...) __VA_ARGS__ + +#endif + + +namespace zoo::swar { + +/// \note This code should be substituted by an application of "progressive" algebraic iteration +/// \note There is also parallelPrefix (to be implemented) +template +constexpr SWAR parallelSuffix(SWAR input) { + using S = SWAR; + auto + shiftClearingMask = S{static_cast(~S::MostSignificantBit)}, + doubling = input, + result = S{0}; + auto + bitsToXOR = NB, + power = 1; + #define ZTE(...) 
__VA_ARGS__ + for(;;) { + ZTE(doubling); + if(1 & bitsToXOR) { + ZTE(result = result ^ doubling); + ZTE(doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask)); + } + ZTE(bitsToXOR >>= 1); + if(!bitsToXOR) { break; } + auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); + ZTE(shifted); + ZTE(doubling = doubling ^ shifted); + // 01...1 + // 001...1 + // 00001...1 + // 000000001...1 + shiftClearingMask = + shiftClearingMask & + S{static_cast(shiftClearingMask.value() >> power)}; + ZTE(power <<= 1); + } + ZTE(input); + #undef ZTE + return S{result}; +} + +template +constexpr SWAR +compress(SWAR input, SWAR compressionMask) { + // the only bits turned on in the result are the bits set in the input that + // are moved down (shifted right) + + // Following Henry S. Warren Jr.'s Hacker's Delight, Section 7-4 + // The compression moves bits right as many positions as there are zeroes + // in the mask "below" it (or to the right). + // We can count the zeroes in the mask in a logarithmic way: + // First detect an odd count of zeroes, move those bits in the input one + // position down (right). + // Then an odd count of *pairs* of zeroes, moving them 2 positions right. + // Then an odd count of *quartets* (nibbles) of zeroes, shifting them 4 + // right. + // An odd count of octects (bytes) of zeroes, shifting right 8, + // Odd count of 16 zeroes, >> 16 + // ... + // + // This solution will use the parallel suffix operation as a primary tool: + // For every bit postion it indicates an odd number of ones to the right, + // including itself. + // Because we want to detect the "oddity" of groups of zeroes to the right, + // we flip the compression mask. To not count the bit position itself, + // we shift by one. 
+ #define ZTE ZOO_TRACEABLE_EXPRESSION + ZTE(input); + ZTE(compressionMask); + using S = SWAR; + auto result = input; + auto groupSize = 1; + auto shiftLeftMask = S{S::LowerBits}; + auto shiftRightMask = S{S::LowerBits << 1}; + auto forParallelSuffix = // this is called "mk" in the book + (~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask); + ZTE(forParallelSuffix); + // note: forParallelSuffix denotes positions with a zero + // immediately to the right in the 'mask' + auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book + parallelSuffix(forParallelSuffix); + ZTE(oddCountOfGroupsOfZerosToTheRight); + // compress the bits just identified in both the result and the mask + auto movingFromMask = compressionMask & oddCountOfGroupsOfZerosToTheRight; + ZTE(movingFromMask); + auto movingFromInput = result & oddCountOfGroupsOfZerosToTheRight; + /*compressionMask = + (compressionMask ^ movingFromMask) | + movingFromMask.shiftIntraLaneRight(groupSize, shiftRightMask);*/ + result = + (result ^ movingFromInput) | + movingFromInput.shiftIntraLaneLeft(groupSize, shiftRightMask); + + auto evenCountOfGroupsOfZerosToTheRight = + ~oddCountOfGroupsOfZerosToTheRight; + + //auto moved = toMove.shiftIntraLaneRight(1, ~S{S::LeastSignificantBit}); + //result = result ^ moved; + return result; + #undef ZTE +} + +/* +Complete example (32 bits) +Selection mask: +0001 0011 0111 0111 0110 1110 1100 1010 +Input (each letter or variable is a boolean, that can have 0 or 1) +abcd efgh ijkl mnop qrst uvxy zABC DEFG +Selection (using spaces) + d gh jkl nop rs uvx zA D F +Desired result: + dghjklnoprsuvxzADF + +0000 1001 1011 1011 1011 0111 0110 0101 shiftLeft 1 +1111 0110 0100 0100 0100 1000 1001 1010 forParallelSuffix + + 10 1101 1101 +/* +Complete example (32 bits) +Selection mask: +0001 0011 0111 0111 0110 1110 1100 1010 +Input (each letter or variable is a boolean, that can have 0 or 1) +abcd efgh ijkl mnop qrst uvxy zABC DEFG +Selection (using spaces) + d gh jkl nop rs 
uvx zA D F +Desired result: + dghjklnoprsuvxzADF + +0001 0011 0111 0111 0110 1110 1100 1010 compressionMask +1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask +1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1 == groupsize of ~compressionMask +This indicates the positions that have a 0 immediately to the right in compressionMask +4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the current position in forParallelSuffix, last decimal digit +0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of forParallelSuffix +we have just identified the positions that need to move an odd number of positions +filter those positions to positions that have a bit set in the compressionMask: +0001 0011 0111 0111 0110 1110 1100 1010 compressionMask +---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of the compressionMask by 1 == groupSize +0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits that will move) +---- ---- --11 1--- --1- -111 ---- ---1 mv >> 1 == groupSize +0001 0011 0011 1111 0010 0111 1100 1001 pseudo-compressed compressionMask. +0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of forParallelSuffix +1011 0111 0000 1111 0010 0001 1101 1001 ~mp == ~parallel suffix (bits not moved) +1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero immediately to their right) +1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved => had even zeroes to their right) +At this point, we have removed from compressionMask the positions that moved an odd number of positions and moved them 1 position, +then, we only keep positions that move an even number of positions. +Now, we will repeat these steps but for groups of two zeroes + + +Binary compress: A fascinating algorithm. +Warren (Hacker's Delight) believes Guy L. 
Steele is the author of the following binary compression algorithm: +From a "mask", a selector of bits from an input, we want to put them together in the output. +For example's sake, this is the selector: +Note: this follows the usual 'big endian' convention of denoting the most significant bit first +0001 0011 0111 0111 0110 1110 1100 1010 +Imagine the input is the 32-bit or 32-boolean variable expression +abcd efgh ijkl mnop qrst uvxy zABC DEFG +We want the selection + d gh jkl nop rs uvx zA D F +To be compressed into the output +0000 0000 0000 00dg hjkl nopr suvx zADF +This algorithm will virtually calculate the count of positions that the selected bits travel to the right, +by constructing the binary encoding of that count: +It will identify the positions that will travel an odd number of positions to the right, these are those +whose position-travel-count have the units set. +It will move those positions by one position to the right, and eliminate them from the yet-to-move positions. +Because it eliminates the positions that would move an odd count, there remains only positions that move +an even number of positions. Now it finds the positions that move an odd count of /pairs/ of positions, +and moves them 2 positions. +then an odd count of /quartets/ of positions, and moves them 4; +8, 16, 32, ... + +*/ + + +/// \todo because of the desirability of "accumuating" the XORs at the MSB, +/// the parallel suffix operation is more suitable. 
+template +constexpr SWAR parity(SWAR input) { + using S = SWAR; + auto preResult = parallelSuffix(input); + auto onlyMSB = preResult.value() & S::MostSignificantBit; + return S{onlyMSB}; +} + + +namespace impl { +template +constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR msb, SWAR lsb) { + auto msbCopiedDown = msb - lsb; + auto msbReintroduced = msbCopiedDown | msb; + return msbReintroduced; +} +} + +template +constexpr auto makeLaneMaskFromLSB(SWAR input) { + using S = SWAR; + auto lsb = input & S{S::LeastSignificantBit}; + auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)}; + return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb); +} + +template +constexpr auto makeLaneMaskFromMSB(SWAR input) { + using S = SWAR; + auto msb = input & S{S::MostSignificantBit}; + auto msbCopiedToLSB = S{msb.value() >> (NB - 1)}; + return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB); +} + +template +struct ArithmeticResultTriplet { + SWAR result; + BooleanSWAR carry, overflow; +}; + + +template +constexpr ArithmeticResultTriplet +fullAddition(SWAR s1, SWAR s2) { + using S = SWAR; + constexpr auto + SignBit = S{S::MostSignificantBit}, + LowerBits = SignBit - S{S::LeastSignificantBit}; + // prevent overflow by clearing the most significant bits + auto + s1prime = LowerBits & s1, + s2prime = LowerBits & s2, + resultPrime = s1prime + s2prime, + s1Sign = SignBit & s1, + s2Sign = SignBit & s2, + signPrime = SignBit & resultPrime, + result = resultPrime ^ s1Sign ^ s2Sign, + // carry is set whenever at least two of the sign bits of s1, s2, + // signPrime are set + carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime), + // overflow: the inputs have the same sign and different to result + // same sign: s1Sign ^ s2Sign + overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result); + using BS = BooleanSWAR; + return { result, BS{carry.value()}, BS{overflow.value()} }; +}; + + +template +constexpr auto negate(SWAR input) { + using S = SWAR; + constexpr auto Ones = 
S{S::LeastSignificantBit}; + return fullAddition(~input, Ones).result; +} + +/// \brief Performs a generalized iterated application of an associative operator to a base +/// +/// In algebra, the repeated application of an operator to a "base" has different names depending on the +/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition", +/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n +/// power". +/// The general term in algebra is "iteration", hence the naming of this function. +/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we +/// keep the option to use the imprecise language of "product, base and exponent". "Iteration" has a very +/// different meaning in programming and especially different in C++. +/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this +/// function leverages the associative property of the operator to "halve" the count of iterations at each step. 
+/// \note There is a symmetrical operation to be implemented of associative iteration in the +/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb, +/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)" +/// \tparam Operator a callable with three arguments: the left and right arguments to the operation +/// and the count to be used, the "count" is an artifact of this generalization +/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not +/// be a number, the iteration count is part of the execution context to apply the operator +/// \param forSquaring is an artifact of this generalization +/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows +/// there are fewer iterations than what the type of exponent would allow +template< + typename Base, typename IterationCount, typename Operator, + // the critical use of associativity is that it allows halving the + // iteration count + typename CountHalver +> +constexpr auto associativeOperatorIterated_regressive( + Base base, Base neutral, IterationCount count, IterationCount forSquaring, + Operator op, unsigned log2Count, CountHalver ch +) { + auto result = neutral; + if(!log2Count) { return result; } + for(;;) { + result = op(result, base, count); + if(!--log2Count) { break; } + result = op(result, result, forSquaring); + count = ch(count); + } + return result; +} + +template +constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( + SWAR multiplicand, SWAR multiplier +) { + using S = SWAR; + + auto operation = [](auto left, auto right, auto counts) { + auto addendums = makeLaneMaskFromMSB(counts); + return left + (addendums & right); + }; + + auto halver = [](auto counts) { + auto msbCleared = counts & ~S{S::MostSignificantBit}; + return S{msbCleared.value() << 1}; + }; + + multiplier = S{multiplier.value() << (NB - 
ActualBits)}; + return associativeOperatorIterated_regressive( + multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation, + ActualBits, halver + ); +} + +/// \note Not removed yet because it is an example of "progressive" associative exponentiation +template +constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated( + SWAR multiplicand, + SWAR multiplier +) { + using S = SWAR; + constexpr auto LeastBit = S::LeastSignificantBit; + auto multiplicandDoubling = multiplicand.value(); + auto mplier = multiplier.value(); + auto product = S{0}; + for(auto count = ActualBits;;) { + auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier}); + product = product + (multiplicandDoublingMask & S{multiplicandDoubling}); + if(!--count) { break; } + multiplicandDoubling <<= 1; + auto leastBitCleared = mplier & ~LeastBit; + mplier = leastBitCleared >> 1; + } + return product; +} + +template +constexpr auto multiplication_OverflowUnsafe( + SWAR multiplicand, + SWAR multiplier +) { + return + multiplication_OverflowUnsafe_SpecificBitCount( + multiplicand, multiplier + ); +} + +template +struct SWAR_Pair{ + SWAR even, odd; +}; + +template +constexpr SWAR doublingMask() { + using S = SWAR; + static_assert(0 == S::Lanes % 2, "Only even number of elements supported"); + using D = SWAR; + return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit}; +} + +template +constexpr auto doublePrecision(SWAR input) { + using S = SWAR; + static_assert( + 0 == S::NSlots % 2, + "Precision can only be doubled for SWARs of even element count" + ); + using RV = SWAR; + constexpr auto DM = doublingMask(); + return SWAR_Pair{ + RV{(input & DM).value()}, + RV{(input.value() >> NB) & DM.value()} + }; +} + +template +constexpr auto halvePrecision(SWAR even, SWAR odd) { + using S = SWAR; + static_assert(0 == NB % 2, "Only even lane-bitcounts supported"); + using RV = SWAR; + constexpr auto HalvingMask = doublingMask(); + auto + evenHalf = RV{even.value()} & HalvingMask, + 
oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2}; + return evenHalf | oddHalf; +} + +/* +template +constexpr auto compress(SWAR input, SWAR mask) { + using S = SWAR; + // Follows Henry S. Warren's "Hacker's Delight" 7-4 + auto movers = input & mask; + // The mechanism detects positions with an odd number of zeroes to the + // right. + // To count odd zeroes, invert the mask + // The "parallel suffix" gives this, but including the position, to exclude + // the position, shift left by one + auto preOddZeroesToTheRight = ~S{~mask.value() << 1}; + auto oddZeroesToTheRight = parallelSuffix(preOddZeroesToTheRight); + auto moveSelector1 = oddZeroesToTheRight & mask; + auto shiftRightMask = ~S::LeastSignificantBit; + auto move1 = moveSelector1 & movers; + auto result = (moveSelector1 ^ move1) | movers.shiftIntraLaneRight(1, shiftRightMask); + return result; +} +*/ + +} + +#endif diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index af5a1016..3ef469d4 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -101,6 +101,18 @@ TEST_CASE( } } +TEST_CASE("Compress/Expand", "[swar]") { + unsigned Mask = 0b0001'0011'0111'0111'0110'1110'1100'1010; + unsigned ToMove = 0x55555555; + using S1_32 = SWAR<32, uint32_t>; + auto q = compress(S1_32{ToMove}, S1_32{Mask}); + CHECK(0 != q.value()); + using S2_8 = SWAR<2, uint8_t>; + auto r = compress(S2_8{0b10'10'10'10}, S2_8{0b11'10'00'00}); + S2_8 expected{0b10'01'00'00}; + CHECK(expected.value() == r.value()); +} + static_assert(1 == popcount<5>(0x100ull)); static_assert(1 == popcount<5>(0x010ull)); static_assert(1 == popcount<5>(0x001ull)); From efeb812bd0d8b01750c6e6233e253e5fb205c0aa Mon Sep 17 00:00:00 2001 From: Eddie Date: Fri, 23 Feb 2024 11:06:11 -0800 Subject: [PATCH 2/4] Compress tested successfully --- inc/zoo/swar/SWAR.h | 34 +++-- inc/zoo/swar/associative_iteration.h | 218 +++++++++++++++------------ test/swar/BasicOperations.cpp | 13 +- 3 files changed, 143 
insertions(+), 122 deletions(-) diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 4d47dc17..cd44906c 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -132,22 +132,24 @@ struct SWAR { /// \brief as the name suggests /// \param protectiveMask should clear the bits that would cross the lane. - /// The bits that will be cleared are directly related to the count of shifts, it is natural to maintain - /// the protective mask by the caller, otherwise, the mask will be computed on all invocations. - /// We are not sure the optimizer would maintain this mask somewhere, if it was to recalculate it it would be disastrous for performance. - constexpr SWAR - shiftIntraLaneLeft(int bitCount, SWAR protectiveMask) const noexcept { - T shiftC = static_cast(bitCount); // could be a narrowing conversion - auto V = (*this & protectiveMask).value(); - return SWAR{static_cast(V << shiftC)}; - } - - /// \param protectiveMask should clear the bits that would cross the lane - /// \sa shiftIntraLaneLeft - constexpr SWAR - shiftIntraLaneRight(int bitCount, SWAR protectiveMask) const noexcept { - return SWAR{(*this & protectiveMask).value() >> T{bitCount}}; - } + /// The bits that will be cleared are directly related to the count of + /// shifts, it is natural to maintain the protective mask by the caller, + /// otherwise, the mask would have to be computed in all invocations. 
+ /// We are not sure the optimizer would maintain this mask somewhere, if it + /// were to recalculate it, it would be disastrous for performance + /// \note the \c static_cast are necessary because of narrowing conversions + #define SHIFT_INTRALANE_OP_X_LIST X(Left, <<) X(Right, >>) + #define X(name, op) \ + constexpr SWAR \ + shiftIntraLane##name(int bitCount, SWAR protectiveMask) const noexcept { \ + T shiftC = static_cast(bitCount); \ + auto V = (*this & protectiveMask).value(); \ + auto rv = static_cast(V op shiftC); \ + return SWAR{rv}; \ + } + SHIFT_INTRALANE_OP_X_LIST + #undef X + #undef SHIFT_INTRALANE_OP_X_LIST T m_v; }; diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 53e81425..ba29b350 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -80,68 +80,42 @@ constexpr SWAR parallelSuffix(SWAR input) { return S{result}; } -template -constexpr SWAR -compress(SWAR input, SWAR compressionMask) { - // the only bits turned on in the result are the bits set in the input that - // are moved down (shifted right) - - // Following Henry S. Warren Jr.'s Hacker's Delight, Section 7-4 - // The compression moves bits right as many positions as there are zeroes - // in the mask "below" it (or to the right). - // We can count the zeroes in the mask in a logarithmic way: - // First detect an odd count of zeroes, move those bits in the input one - // position down (right). - // Then an odd count of *pairs* of zeroes, moving them 2 positions right. - // Then an odd count of *quartets* (nibbles) of zeroes, shifting them 4 - // right. - // An odd count of octects (bytes) of zeroes, shifting right 8, - // Odd count of 16 zeroes, >> 16 - // ... - // - // This solution will use the parallel suffix operation as a primary tool: - // For every bit postion it indicates an odd number of ones to the right, - // including itself. 
- // Because we want to detect the "oddity" of groups of zeroes to the right, - // we flip the compression mask. To not count the bit position itself, - // we shift by one. - #define ZTE ZOO_TRACEABLE_EXPRESSION - ZTE(input); - ZTE(compressionMask); - using S = SWAR; - auto result = input; - auto groupSize = 1; - auto shiftLeftMask = S{S::LowerBits}; - auto shiftRightMask = S{S::LowerBits << 1}; - auto forParallelSuffix = // this is called "mk" in the book - (~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask); - ZTE(forParallelSuffix); - // note: forParallelSuffix denotes positions with a zero - // immediately to the right in the 'mask' - auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book - parallelSuffix(forParallelSuffix); - ZTE(oddCountOfGroupsOfZerosToTheRight); - // compress the bits just identified in both the result and the mask - auto movingFromMask = compressionMask & oddCountOfGroupsOfZerosToTheRight; - ZTE(movingFromMask); - auto movingFromInput = result & oddCountOfGroupsOfZerosToTheRight; - /*compressionMask = - (compressionMask ^ movingFromMask) | - movingFromMask.shiftIntraLaneRight(groupSize, shiftRightMask);*/ - result = - (result ^ movingFromInput) | - movingFromInput.shiftIntraLaneLeft(groupSize, shiftRightMask); - - auto evenCountOfGroupsOfZerosToTheRight = - ~oddCountOfGroupsOfZerosToTheRight; - - //auto moved = toMove.shiftIntraLaneRight(1, ~S{S::LeastSignificantBit}); - //result = result ^ moved; - return result; - #undef ZTE -} - /* +Binary compress: A fascinating algorithm. + +Warren (Hacker's Delight) believes Guy L. Steele is the author of the following +binary compression operation, equivalent to Intel's BMI2 instruction PEXT of +"Parallel Extraction" + +From a "mask", a selector of bits from an input, we want to put them together in +the output. 
+ +For example's sake, this is the selector: +Note: this follows the usual 'big endian' convention of denoting the most +significant bit first: +0001 0011 0111 0111 0110 1110 1100 1010 +Imagine the input is the 32-bit or 32-boolean variable expression +abcd efgh ijkl mnop qrst uvxy zABC DEFG +We want the selection + d gh jkl nop rs uvx zA D F +To be compressed into the output +0000 0000 0000 00dg hjkl nopr suvx zADF + +This algorithm will virtually calculate the count of positions that the selected +bits travel to the right, by constructing the binary encoding of that count: +It will identify the positions that will travel an odd number of positions to +the right, these are those whose position-travel-count have the units set. +It will then move those positions by one position to the right, and eliminate +them from the yet-to-move positions. Because it eliminates the positions that +would move an odd count, there remains only positions that move an even number +of positions. Now it finds the positions that move an odd count of /pairs/ of +positions, it moves them 2 positions. This is equivalent to finding the +positions that would have the bit for 2 set in the count of positions to move +right. +Then an odd count of /quartets/ of positions, and moves them 4; +8, 16, 32, ... 
+ + Complete example (32 bits) Selection mask: 0001 0011 0111 0111 0110 1110 1100 1010 @@ -169,51 +143,97 @@ Desired result: 0001 0011 0111 0111 0110 1110 1100 1010 compressionMask 1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask -1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1 == groupsize of ~compressionMask -This indicates the positions that have a 0 immediately to the right in compressionMask -4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the current position in forParallelSuffix, last decimal digit -0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of forParallelSuffix -we have just identified the positions that need to move an odd number of positions -filter those positions to positions that have a bit set in the compressionMask: +1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1 + == groupsize of ~compressionMask +This indicates the positions that have a 0 immediately to the right in + compressionMask +4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the + current position in forParallelSuffix, + last decimal digit +0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of + forParallelSuffix +We have just identified the positions that need to move an odd number of +positions. 
Filter them with positions with a bit set in compressionMask: 0001 0011 0111 0111 0110 1110 1100 1010 compressionMask ----- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of the compressionMask by 1 == groupSize -0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits that will move) +---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of + compressionMask by 1 == groupSize +0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits + that will move) ---- ---- --11 1--- --1- -111 ---- ---1 mv >> 1 == groupSize 0001 0011 0011 1111 0010 0111 1100 1001 pseudo-compressed compressionMask. -0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of forParallelSuffix +0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of + forParallelSuffix 1011 0111 0000 1111 0010 0001 1101 1001 ~mp == ~parallel suffix (bits not moved) -1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero immediately to their right) -1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved => had even zeroes to their right) -At this point, we have removed from compressionMask the positions that moved an odd number of positions and moved them 1 position, +1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero + immediately to their right) +1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved => + had even zeroes to their right) +At this point, we have removed from compressionMask the positions that moved an +odd number of positions and moved them 1 position, then, we only keep positions that move an even number of positions. -Now, we will repeat these steps but for groups of two zeroes - - -Binary compress: A fascinating algorithm. -Warren (Hacker's Delight) believes Guy L. 
Steele is the author of the following binary compression algorithm: -From a "mask", a selector of bits from an input, we want to put them together in the output. -For example's sake, this is the selector: -Note: this follows the usual 'big endian' convention of denoting the most significant bit first -0001 0011 0111 0111 0110 1110 1100 1010 -Imagine the input is the 32-bit or 32-boolean variable expression -abcd efgh ijkl mnop qrst uvxy zABC DEFG -We want the selection - d gh jkl nop rs uvx zA D F -To be compressed into the output -0000 0000 0000 00dg hjkl nopr suvx zADF -This algorithm will virtually calculate the count of positions that the selected bits travel to the right, -by constructing the binary encoding of that count: -It will identify the positions that will travel an odd number of positions to the right, these are those -whose position-travel-count have the units set. -It will move those positions by one position to the right, and eliminate them from the yet-to-move positions. -Because it eliminates the positions that would move an odd count, there remains only positions that move -an even number of positions. Now it finds the positions that move an odd count of /pairs/ of positions, -and moves them 2 positions. -then an odd count of /quartets/ of positions, and moves them 4; -8, 16, 32, ... - +Now, we will repeat these steps but for groups of two zeroes, then 4 zeroes, ... */ +template +constexpr SWAR +compress(SWAR input, SWAR compressionMask) { + // This solution uses the parallel suffix operation as a primary tool: + // For every bit position it indicates an odd number of ones to the right, + // including itself. + // Because we want to detect the "oddness" of groups of zeroes to the right, + // we flip the compression mask. To not count the bit position itself, + // we shift by one.
+ // #define ZTE ZOO_TRACEABLE_EXPRESSION + ZTE(input); + ZTE(compressionMask); + using S = SWAR; + auto result = input & compressionMask; + auto groupSize = 1; + auto + shiftLeftMask = S{S::LowerBits}, + shiftRightMask = S{S::LowerBits << 1}; + ZTE(~compressionMask); + auto forParallelSuffix = // this is called "mk" in the book + (~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask); + ZTE(forParallelSuffix); + // note: forParallelSuffix denotes positions with a zero + // immediately to the right in 'compressionMask' + do { + ZTE(groupSize); + ZTE(shiftLeftMask); + ZTE(shiftRightMask); + ZTE(result); + auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book + parallelSuffix(forParallelSuffix); + ZTE(oddCountOfGroupsOfZerosToTheRight); + // compress the bits just identified in both the result and the mask + auto moving = compressionMask & oddCountOfGroupsOfZerosToTheRight; + ZTE(moving); + compressionMask = + (compressionMask ^ moving) | // clear the moving + moving.shiftIntraLaneRight(groupSize, shiftRightMask); + ZTE(compressionMask); + auto movingFromInput = result & moving; + result = + (result ^ movingFromInput) | // clear the moving from the result + movingFromInput.shiftIntraLaneRight(groupSize, shiftRightMask); + + auto evenCountOfGroupsOfZerosToTheRight = + ~oddCountOfGroupsOfZerosToTheRight; + forParallelSuffix = + forParallelSuffix & evenCountOfGroupsOfZerosToTheRight; + auto newShiftLeftMask = + shiftLeftMask.shiftIntraLaneRight(groupSize, shiftRightMask); + shiftRightMask = + shiftRightMask.shiftIntraLaneLeft(groupSize, shiftLeftMask); + shiftLeftMask = newShiftLeftMask; + groupSize <<= 1; + } while(groupSize < NB); + ZTE(result); + #undef ZTE + return result; +} + /// \todo because of the desirability of "accumuating" the XORs at the MSB, /// the parallel suffix operation is more suitable. 
diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 3ef469d4..83038df0 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -102,15 +102,14 @@ TEST_CASE( } TEST_CASE("Compress/Expand", "[swar]") { - unsigned Mask = 0b0001'0011'0111'0111'0110'1110'1100'1010; - unsigned ToMove = 0x55555555; + unsigned + Mask = 0b0001'0011'0111'0111'0110'1110'1100'1010, + ToMove = 0b0101'0101'0101'0101'0101'0101'0101'0101, + // Selection: 1 01 101 101 10 010 01 0 0 + result = 0b0001'0'1'1'0'1'1'0'1'10'0'10'0'1'0'0; using S1_32 = SWAR<32, uint32_t>; auto q = compress(S1_32{ToMove}, S1_32{Mask}); - CHECK(0 != q.value()); - using S2_8 = SWAR<2, uint8_t>; - auto r = compress(S2_8{0b10'10'10'10}, S2_8{0b11'10'00'00}); - S2_8 expected{0b10'01'00'00}; - CHECK(expected.value() == r.value()); + CHECK(result == q.value()); } static_assert(1 == popcount<5>(0x100ull)); From 2d2d9a8baa289d963c08cf5d072c0d4317fb153a Mon Sep 17 00:00:00 2001 From: Eddie Date: Fri, 23 Feb 2024 11:20:58 -0800 Subject: [PATCH 3/4] Cleaning --- inc/zoo/swar/associative_iteration.h | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index ba29b350..f5c1ad7e 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -3,7 +3,7 @@ #include "zoo/swar/SWAR.h" -#define ZOO_DEVELOPMENT_DEBUGGING +//#define ZOO_DEVELOPMENT_DEBUGGING #ifdef ZOO_DEVELOPMENT_DEBUGGING #include @@ -54,7 +54,7 @@ constexpr SWAR parallelSuffix(SWAR input) { auto bitsToXOR = NB, power = 1; - #define ZTE(...) __VA_ARGS__ + #define ZTE(...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__) for(;;) { ZTE(doubling); if(1 & bitsToXOR) { @@ -183,7 +183,7 @@ compress(SWAR input, SWAR compressionMask) { // Because we want to detect the "oddness" of groups of zeroes to the right, // we flip the compression mask. 
To not count the bit position itself, // we shift by one. - // #define ZTE ZOO_TRACEABLE_EXPRESSION + #define ZTE(...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__) ZTE(input); ZTE(compressionMask); using S = SWAR; @@ -451,27 +451,6 @@ constexpr auto halvePrecision(SWAR even, SWAR odd) { return evenHalf | oddHalf; } -/* -template -constexpr auto compress(SWAR input, SWAR mask) { - using S = SWAR; - // Follows Henry S. Warren's "Hacker's Delight" 7-4 - auto movers = input & mask; - // The mechanism detects positions with an odd number of zeroes to the - // right. - // To count odd zeroes, invert the mask - // The "parallel suffix" gives this, but including the position, to exclude - // the position, shift left by one - auto preOddZeroesToTheRight = ~S{~mask.value() << 1}; - auto oddZeroesToTheRight = parallelSuffix(preOddZeroesToTheRight); - auto moveSelector1 = oddZeroesToTheRight & mask; - auto shiftRightMask = ~S::LeastSignificantBit; - auto move1 = moveSelector1 & movers; - auto result = (moveSelector1 ^ move1) | movers.shiftIntraLaneRight(1, shiftRightMask); - return result; -} -*/ - } #endif From d8c2875b75c2347bd88537865ff1387f257bcd92 Mon Sep 17 00:00:00 2001 From: Eddie Date: Fri, 23 Feb 2024 18:31:54 -0800 Subject: [PATCH 4/4] Minor improvement --- inc/zoo/swar/associative_iteration.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index d45ad3aa..7c6cab3b 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -198,7 +198,7 @@ compress(SWAR input, SWAR compressionMask) { ZTE(forParallelSuffix); // note: forParallelSuffix denotes positions with a zero // immediately to the right in 'compressionMask' - do { + for(;;) { ZTE(groupSize); ZTE(shiftLeftMask); ZTE(shiftRightMask); @@ -217,7 +217,10 @@ compress(SWAR input, SWAR compressionMask) { result = (result ^ movingFromInput) | // clear the moving from the result 
movingFromInput.shiftIntraLaneRight(groupSize, shiftRightMask); - + auto nextGroupSize = groupSize << 1; + if(NB <= nextGroupSize) { + break; + } auto evenCountOfGroupsOfZerosToTheRight = ~oddCountOfGroupsOfZerosToTheRight; forParallelSuffix = @@ -227,8 +230,8 @@ compress(SWAR input, SWAR compressionMask) { shiftRightMask = shiftRightMask.shiftIntraLaneLeft(groupSize, shiftLeftMask); shiftLeftMask = newShiftLeftMask; - groupSize <<= 1; - } while(groupSize < NB); + groupSize = nextGroupSize; + } ZTE(result); #undef ZTE return result;