Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Vscode does not like to build outside of the source tree
# (multiple glitches)
.vscode
test/.vscode
build
.cache
# Vscode does not like to build outside of the source tree
# (multiple glitches)

.vscode
test/.vscode
build
.cache
39 changes: 23 additions & 16 deletions inc/zoo/swar/SWAR.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace zoo { namespace swar {
using u64 = uint64_t;
using u32 = uint32_t;
using u16 = uint16_t;
using u8 = uint8_t;
using u8 = std::uint8_t;

template<int LogNBits>
constexpr uint64_t popcount(uint64_t a) noexcept {
Expand Down Expand Up @@ -58,7 +58,10 @@ struct SWAR {
SignificantBitsCount = BitWidth - PaddingBitsCount,
AllOnes = ~std::make_unsigned_t<T>{0} >> PaddingBitsCount,
LeastSignificantBit = meta::BitmaskMaker<T, std::make_unsigned_t<T>{1}, NBits>::value,
MostSignificantBit = LeastSignificantBit << (NBits - 1);
MostSignificantBit = LeastSignificantBit << (NBits - 1),
// Use LowerBits in favor of ~MostSignificantBit to not pollute
// "don't care" bits when non-power-of-two bit lane sizes are supported
LowerBits = MostSignificantBit - LeastSignificantBit;

SWAR() = default;
constexpr explicit SWAR(T v): m_v(v) {}
Expand Down Expand Up @@ -129,20 +132,24 @@ struct SWAR {

/// \brief as the name suggests
/// \param protectiveMask should clear the bits that would cross the lane.
/// The bits that will be cleared are directly related to the count of shifts, it is natural to maintain
/// the protective mask by the caller, otherwise, the mask will be computed on all invocations.
/// We are not sure the optimizer would maintain this mask somewhere, if it was to recalculate it it would be disastrous for performance.
constexpr SWAR
shiftIntraLaneLeft(int bitCount, SWAR protectiveMask) const noexcept {
return SWAR{(*this & protectiveMask).value() << bitCount};
}

/// \param protectiveMask should clear the bits that would cross the lane
/// \sa shiftIntraLaneLeft
constexpr SWAR
shiftIntraLaneRight(int bitCount, SWAR protectiveMask) const noexcept {
return SWAR{(*this & protectiveMask).value() >> bitCount};
}
/// The bits that will be cleared are directly related to the count of
/// shifts, it is natural to maintain the protective mask by the caller,
/// otherwise, the mask would have to be computed in all invocations.
/// We are not sure the optimizer would maintain this mask somewhere, if it
/// were to recalculate it, it would be disastrous for performance
/// \note the \c static_cast are necessary because of narrowing conversions
/// Shifts every lane left by \c bitCount after clearing, through
/// \c protectiveMask, the bits that would otherwise cross into the next lane.
constexpr SWAR
shiftIntraLaneLeft(int bitCount, SWAR protectiveMask) const noexcept {
    // the casts undo the integral promotions applied by the shift operator
    const T count = static_cast<T>(bitCount);
    const auto kept = (*this & protectiveMask).value();
    return SWAR{static_cast<T>(kept << count)};
}

/// Shifts every lane right by \c bitCount after clearing, through
/// \c protectiveMask, the bits that would otherwise cross into the next lane.
/// \sa shiftIntraLaneLeft
constexpr SWAR
shiftIntraLaneRight(int bitCount, SWAR protectiveMask) const noexcept {
    // the casts undo the integral promotions applied by the shift operator
    const T count = static_cast<T>(bitCount);
    const auto kept = (*this & protectiveMask).value();
    return SWAR{static_cast<T>(kept >> count)};
}

constexpr SWAR
multiply(T multiplier) const noexcept { return SWAR{m_v * multiplier}; }
Expand Down
214 changes: 207 additions & 7 deletions inc/zoo/swar/associative_iteration.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,42 @@

#include "zoo/swar/SWAR.h"

//#define ZOO_DEVELOPMENT_DEBUGGING
#ifdef ZOO_DEVELOPMENT_DEBUGGING
#include <iostream>

/// \brief Writes the \c count lowest bits of \c input to \c out, one digit per
/// bit, starting with the least significant bit.
/// \return the same stream, to allow chaining
inline std::ostream &binary(std::ostream &out, uint64_t input, int count) {
    for(; count; --count, input >>= 1) {
        const uint64_t lowestBit = input & 1;
        out << lowestBit;
    }
    return out;
}

/// \brief Debugging aid: prints a SWAR as \c <|lane|lane|...|> where each lane
/// is written by \c binary (least significant bit first), starting with the
/// lane held in the lowest NB bits of the value.
/// \param out stream to write to
/// \param s the SWAR whose \c value() is printed
/// \return \c out
template<int NB, typename B>
std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR<NB, B> s) {
    auto shiftCounter = sizeof(B) * 8 / NB;
    out << "<|";
    auto v = s.value();
    do {
        binary(out, v, NB) << '|';
        // Move the next lane into the low bits; without this every lane would
        // print the same bits.  Skip the shift after the last lane: when NB
        // is the full width of B the shift would be undefined behavior.
        if(1 < shiftCounter) { v >>= NB; }
    } while(--shiftCounter);
    return out << ">";
}

// Turns its single argument into a string literal.
#define ZOO_TO_STRING(a) #a
// Evaluates the expression and prints one trace record to std::cout:
//   "<value of the expression>", "<file>:<line>", "<text of the expression>"
// Note the expansion is a complete statement, it ends with a semicolon.
// std::endl is needed within the context of debugging: flush the line
#define ZOO_TRACEABLE_EXP_IMPL(F, L, ...) std::cout << '"' << (__VA_ARGS__) << "\", \"" << F << ':' << L << "\", \"" << ZOO_TO_STRING(__VA_ARGS__) << "\"" << std::endl;
// Entry point for tracing: supplies the file and line of the place of use.
#define ZOO_TRACEABLE_EXPRESSION(...) ZOO_TRACEABLE_EXP_IMPL(__FILE__, __LINE__, __VA_ARGS__)

#else

// Without ZOO_DEVELOPMENT_DEBUGGING the macro expands to the bare expression:
// it is still evaluated, nothing is printed.
#define ZOO_TRACEABLE_EXPRESSION(...) __VA_ARGS__

#endif

namespace zoo::swar {

/// \note This code should be substituted by an application of "progressive" algebraic iteration
Expand All @@ -11,32 +47,196 @@ template<int NB, typename B>
constexpr SWAR<NB, B> parallelSuffix(SWAR<NB, B> input) {
using S = SWAR<NB, B>;
auto
shiftClearingMask = S{~S::MostSignificantBit},
shiftClearingMask = S{static_cast<B>(~S::MostSignificantBit)},
doubling = input,
result = S{0};
auto
bitsToXOR = NB,
power = 1;

#define ZTE(...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__)
for(;;) {
ZTE(doubling);
if(1 & bitsToXOR) {
result = result ^ doubling;
doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
ZTE(result = result ^ doubling);
ZTE(doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask));
}
bitsToXOR >>= 1;
ZTE(bitsToXOR >>= 1);
if(!bitsToXOR) { break; }
auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
doubling = doubling ^ shifted;
ZTE(shifted);
ZTE(doubling = doubling ^ shifted);
// 01...1
// 001...1
// 00001...1
// 000000001...1
shiftClearingMask =
shiftClearingMask & S{shiftClearingMask.value() >> power};
power <<= 1;
shiftClearingMask &
S{static_cast<B>(shiftClearingMask.value() >> power)};
ZTE(power <<= 1);
}
ZTE(input);
#undef ZTE
return S{result};
}

/*
Binary compress: A fascinating algorithm.

Warren (Hacker's Delight) believes Guy L. Steele is the author of the following
binary compression operation, equivalent to Intel's BMI2 instruction PEXT of
"Parallel Extraction"

From a "mask", a selector of bits from an input, we want to put them together in
the output.

For example's sake, this is the selector:
Note: this follows the usual 'big endian' convention of denoting the most
significant bit first:
0001 0011 0111 0111 0110 1110 1100 1010
Imagine the input is the 32-bit or 32-boolean variable expression
abcd efgh ijkl mnop qrst uvxy zABC DEFG
We want the selection
d gh jkl nop rs uvx zA D F
To be compressed into the output
0000 0000 0000 00dg hjkl nopr suvx zADF

This algorithm will virtually calculate the count of positions that the selected
bits travel to the right, by constructing the binary encoding of that count:
It will identify the positions that will travel an odd number of positions to
the right; these are the ones whose travel count has its units (lowest) bit set.
It will then move those positions by one position to the right, and eliminate
them from the yet-to-move positions. Because it eliminates the positions that
would move an odd count, there remains only positions that move an even number
of positions. Now it finds the positions that move an odd count of /pairs/ of
positions, it moves them 2 positions. This is equivalent to finding the
positions that would have the bit for 2 set in the count of positions to move
right.
Then an odd count of /quartets/ of positions, and moves them 4;
8, 16, 32, ...


Complete example (32 bits)
Selection mask:
0001 0011 0111 0111 0110 1110 1100 1010
Input (each letter or variable is a boolean, that can have 0 or 1)
abcd efgh ijkl mnop qrst uvxy zABC DEFG
Selection (using spaces)
d gh jkl nop rs uvx zA D F
Desired result:
dghjklnoprsuvxzADF

0000 1001 1011 1011 1011 0111 0110 0101 shiftLeft 1
1111 0110 0100 0100 0100 1000 1001 1010 forParallelSuffix

10 1101 1101
(The same example, restated and worked through step by step:)
Complete example (32 bits)
Selection mask:
0001 0011 0111 0111 0110 1110 1100 1010
Input (each letter or variable is a boolean, that can have 0 or 1)
abcd efgh ijkl mnop qrst uvxy zABC DEFG
Selection (using spaces)
d gh jkl nop rs uvx zA D F
Desired result:
dghjklnoprsuvxzADF

0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1
== groupsize of ~compressionMask
This indicates the positions that have a 0 immediately to the right in
compressionMask
4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the
current position in forParallelSuffix,
last decimal digit
0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
forParallelSuffix
We have just identified the positions that need to move an odd number of
positions. Filter them with positions with a bit set in compressionMask:
0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of
compressionMask by 1 == groupSize
0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits
that will move)
---- ---- --11 1--- --1- -111 ---- ---1 mv >> 1 == groupSize
0001 0011 0011 1111 0010 0111 1100 1001 pseudo-compressed compressionMask.
0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
forParallelSuffix
1011 0111 0000 1111 0010 0001 1101 1001 ~mp == ~parallel suffix (bits not moved)
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero
immediately to their right)
1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved =>
had even zeroes to their right)
At this point, we have removed from compressionMask the positions that moved an
odd number of positions and moved them 1 position,
then, we only keep positions that move an even number of positions.
Now, we will repeat these steps but for groups of two zeroes, then 4 zeroes, ...
*/

/// \brief Binary compress: gathers the bits of \c input selected by
/// \c compressionMask and packs them together, moving them to the right.
///
/// The shifts are done with \c shiftIntraLaneLeft / \c shiftIntraLaneRight
/// using masks derived from \c S::LowerBits, so the moved bits are kept from
/// crossing lane boundaries.
/// \param input the bits to select from
/// \param compressionMask a set bit selects the bit at the same position in
/// \c input
/// \return the selected bits after having been moved right; the positions not
/// selected are cleared up front (\c input \c & \c compressionMask)
/// \note the \c ZTE lines only trace when ZOO_DEVELOPMENT_DEBUGGING is
/// defined, otherwise they expand to the bare expression
template<int NB, typename B>
constexpr SWAR<NB, B>
compress(SWAR<NB, B> input, SWAR<NB, B> compressionMask) {
// This solution uses the parallel suffix operation as a primary tool:
// For every bit position it indicates an odd number of ones to the right,
// including itself.
// Because we want to detect the "oddness" of groups of zeroes to the right,
// we flip the compression mask. To not count the bit position itself,
// we shift by one.
#define ZTE(...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__)
ZTE(input);
ZTE(compressionMask);
using S = SWAR<NB, B>;
// only the selected bits participate, everything else is cleared
auto result = input & compressionMask;
// distance moved in the current iteration: 1, 2, 4, ...
auto groupSize = 1;
// protective masks for shifting by groupSize in each direction; they are
// updated at the end of every iteration to match the next groupSize
auto
shiftLeftMask = S{S::LowerBits},
shiftRightMask = S{S::LowerBits << 1};
ZTE(~compressionMask);
auto forParallelSuffix = // this is called "mk" in the book
(~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask);
ZTE(forParallelSuffix);
// note: forParallelSuffix denotes positions with a zero
// immediately to the right in 'compressionMask'
for(;;) {
ZTE(groupSize);
ZTE(shiftLeftMask);
ZTE(shiftRightMask);
ZTE(result);
auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book
parallelSuffix(forParallelSuffix);
ZTE(oddCountOfGroupsOfZerosToTheRight);
// compress the bits just identified in both the result and the mask
auto moving = compressionMask & oddCountOfGroupsOfZerosToTheRight;
ZTE(moving);
compressionMask =
(compressionMask ^ moving) | // clear the moving
moving.shiftIntraLaneRight(groupSize, shiftRightMask);
ZTE(compressionMask);
auto movingFromInput = result & moving;
result =
(result ^ movingFromInput) | // clear the moving from the result
movingFromInput.shiftIntraLaneRight(groupSize, shiftRightMask);
// stop once the next distance would not fit within a lane of NB bits
auto nextGroupSize = groupSize << 1;
if(NB <= nextGroupSize) {
break;
}
// keep only the positions that did not move in this iteration
auto evenCountOfGroupsOfZerosToTheRight =
~oddCountOfGroupsOfZerosToTheRight;
forParallelSuffix =
forParallelSuffix & evenCountOfGroupsOfZerosToTheRight;
// each new mask is computed from the old value of the other one, hence the
// temporary: shiftLeftMask must not change before shiftRightMask uses it
auto newShiftLeftMask =
shiftLeftMask.shiftIntraLaneRight(groupSize, shiftRightMask);
shiftRightMask =
shiftRightMask.shiftIntraLaneLeft(groupSize, shiftLeftMask);
shiftLeftMask = newShiftLeftMask;
groupSize = nextGroupSize;
}
ZTE(result);
#undef ZTE
return result;
}

/// \todo because of the desirability of "accumulating" the XORs at the MSB,
/// the parallel suffix operation is more suitable.
template<int NB, typename B>
Expand Down
11 changes: 11 additions & 0 deletions test/swar/BasicOperations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,17 @@ TEST_CASE(
}
}

TEST_CASE("Compress/Expand", "[swar]") {
    // A single 32-bit lane: the whole word is compressed as one unit
    using S1_32 = SWAR<32, uint32_t>;

    // Which bit positions of the input are to be gathered
    const unsigned Mask = 0b0001'0011'0111'0111'0110'1110'1100'1010;
    // Alternating pattern, makes the selected bits easy to follow
    const unsigned ToMove = 0b0101'0101'0101'0101'0101'0101'0101'0101;
    // Selection: 1 01 101 101 10 010 01 0 0
    // Expected: the selected bits, in order, packed at the low end
    const unsigned result = 0b0001'0'1'1'0'1'1'0'1'10'0'10'0'1'0'0;

    auto q = compress(S1_32{ToMove}, S1_32{Mask});
    CHECK(result == q.value());
}

static_assert(1 == popcount<5>(0x100ull));
static_assert(1 == popcount<5>(0x010ull));
static_assert(1 == popcount<5>(0x001ull));
Expand Down