Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions encodings/fastlanes/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,11 @@ required-features = ["_test-harness"]
name = "bit_transpose"
harness = false
required-features = ["_test-harness"]

[[bench]]
name = "bitpack_constant"
harness = false

[[bench]]
name = "bitpack_compare"
harness = false
108 changes: 108 additions & 0 deletions encodings/fastlanes/benches/bitpack_compare.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Compare an already-packed `BitPackedArray` against a constant value. Compares the
//! out-of-range fast path (constant outside `[0, 2^bit_width - 1]`) against an explicit
//! "decompress, then compare" baseline.
//!
//! Sized to finish quickly. Run with `cargo bench -p vortex-fastlanes --bench bitpack_compare`.

#![expect(clippy::unwrap_used)]
#![expect(clippy::cast_possible_truncation)]

use divan::Bencher;
use divan::counter::ItemsCount;
use vortex_array::ArrayRef;
use vortex_array::ExecutionCtx;
use vortex_array::IntoArray;
use vortex_array::LEGACY_SESSION;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::BoolArray;
use vortex_array::arrays::ConstantArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::builtins::ArrayBuiltins;
use vortex_array::scalar_fn::fns::operators::Operator;
use vortex_array::validity::Validity;
use vortex_buffer::BufferMut;
use vortex_fastlanes::BitPackedData;

fn main() {
divan::main();
}

const LENS: &[usize] = &[1024, 64 * 1024];
const BIT_WIDTHS: &[u8] = &[4, 16];

/// Build a packed array of varied in-range values, plus an out-of-range constant RHS for
/// the fast-path benches.
fn build_inputs<const BW: u8>(len: usize) -> (ArrayRef, ArrayRef, ExecutionCtx) {
let mut ctx = LEGACY_SESSION.create_execution_ctx();
let buf: BufferMut<u32> = (0..len).map(|i| (i as u32) % (1 << BW)).collect();
let array = BitPackedData::encode(
&PrimitiveArray::new(buf.freeze(), Validity::NonNullable).into_array(),
BW,
&mut ctx,
)
.unwrap()
.into_array();
// 1 << BW is just past the packable range, so the out-of-range fast path fires.
let constant = 1u32 << BW;
let rhs = ConstantArray::new(constant, len).into_array();
(array, rhs, ctx)
}

#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
fn fast_eq_out_of_range<const BW: u8>(bencher: Bencher, len: usize) {
let (array, rhs, mut ctx) = build_inputs::<BW>(len);
bencher.counter(ItemsCount::new(len)).bench_local(|| {
array
.clone()
.binary(rhs.clone(), Operator::Eq)
.unwrap()
.execute::<BoolArray>(&mut ctx)
.unwrap()
});
}

#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
fn baseline_eq<const BW: u8>(bencher: Bencher, len: usize) {
let (array, rhs, mut ctx) = build_inputs::<BW>(len);
bencher.counter(ItemsCount::new(len)).bench_local(|| {
// What the fallback would do: materialize the unpacked primitive, then run Arrow
// compare on it.
let primitive = array.clone().execute::<PrimitiveArray>(&mut ctx).unwrap();
primitive
.into_array()
.binary(rhs.clone(), Operator::Eq)
.unwrap()
.execute::<BoolArray>(&mut ctx)
.unwrap()
});
}

#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
fn fast_lt_out_of_range<const BW: u8>(bencher: Bencher, len: usize) {
let (array, rhs, mut ctx) = build_inputs::<BW>(len);
bencher.counter(ItemsCount::new(len)).bench_local(|| {
array
.clone()
.binary(rhs.clone(), Operator::Lt)
.unwrap()
.execute::<BoolArray>(&mut ctx)
.unwrap()
});
}

#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
fn baseline_lt<const BW: u8>(bencher: Bencher, len: usize) {
let (array, rhs, mut ctx) = build_inputs::<BW>(len);
bencher.counter(ItemsCount::new(len)).bench_local(|| {
let primitive = array.clone().execute::<PrimitiveArray>(&mut ctx).unwrap();
primitive
.into_array()
.binary(rhs.clone(), Operator::Lt)
.unwrap()
.execute::<BoolArray>(&mut ctx)
.unwrap()
});
}
53 changes: 53 additions & 0 deletions encodings/fastlanes/benches/bitpack_constant.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Compare the fast constant bit-packing path against the standard `bitpack_encode`
//! pipeline on a uniform-constant input.
//!
//! Sized to finish quickly. Run with `cargo bench -p vortex-fastlanes --bench bitpack_constant`.

#![expect(clippy::unwrap_used)]

use divan::Bencher;
use divan::black_box;
use divan::counter::ItemsCount;
use vortex_array::LEGACY_SESSION;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::validity::Validity;
use vortex_buffer::BufferMut;
use vortex_fastlanes::bitpack_compress::bitpack_encode;
use vortex_fastlanes::bitpack_compress::bitpack_encode_constant;

fn main() {
divan::main();
}

const LENS: &[usize] = &[1024, 64 * 1024];
const BIT_WIDTHS: &[u8] = &[4, 16];

const CONSTANT: u32 = 7;

#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
fn full_encode<const BW: u8>(bencher: Bencher, len: usize) {
let buf: BufferMut<u32> = (0..len).map(|_| CONSTANT).collect();
let arr = PrimitiveArray::new(buf.freeze(), Validity::NonNullable);
let mut ctx = LEGACY_SESSION.create_execution_ctx();

bencher
.counter(ItemsCount::new(len))
.bench_local(|| bitpack_encode(black_box(&arr), black_box(BW), None, &mut ctx).unwrap());
}

#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
fn fast_encode<const BW: u8>(bencher: Bencher, len: usize) {
bencher.counter(ItemsCount::new(len)).bench_local(|| {
bitpack_encode_constant::<u32>(
black_box(CONSTANT),
black_box(BW),
black_box(len),
Validity::NonNullable,
)
.unwrap()
});
}
149 changes: 149 additions & 0 deletions encodings/fastlanes/docs/inrange_compare_plan.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# Plan: in-range constant compare for bit-packed arrays

## Status today

`encodings/fastlanes/src/bitpacking/compute/compare.rs` accelerates
`BitPackedArray op ConstantArray` only when `c` is **outside** the packable range
`[0, 2^bit_width - 1]`. That case reduces every packed lane to the same boolean
under `op`, so the result is a `ConstantArray<bool>` (no work on the buffer) or a
`BitBuffer` filled with that constant plus a per-position overlay at any patched
indices.

**In-range** constants (those that could equal a packed lane) fall through to the
canonical "decompress to `PrimitiveArray`, then run Arrow's vectorized compare"
path. For `Eq`/`NotEq`/`Lt`/`Lte`/`Gt`/`Gte` this is correct but does two SIMD
passes' worth of work (unpack + compare) and writes the unpacked primitive to
memory along the way.

## Why the obvious approach (FastLanes `unpack_cmp`) doesn't win

`fastlanes::BitPackingCompare::unchecked_unpack_cmp` fuses unpack + compare and
emits `[bool; 1024]` without materializing the primitive array. It is
`#[inline(never)]` and applies the comparator closure to every element
individually. We tried wiring it in: at 65 K u32 elements (bit_width 4) the
fused path measured ~170 µs against ~91 µs for the canonical "unpack then
Arrow compare" path. Both Arrow's primitive compare and FastLanes' `unpack` are
heavily SIMD-vectorized; the per-element closure call defeats vectorization in
`unpack_cmp`. Reverted in commit `cc586c6`.

## Proposal: bit-parallel compare on the packed buffer

Pack the constant into a 1024-element template once (we already have a
constant-only pack kernel in `bitpack_compress::bitpack_constant`, which
synthesizes the FastLanes bit pattern analytically — no `BitPacking::pack`
call). Then for each 1024-chunk of the input, do the comparison directly on the
packed bytes via SIMD/SWAR. No materialization, less memory traffic
(`~3W·128` bytes per chunk vs `12·1024` bytes for unpack + compare on `u32`),
and the loop is fully vectorizable.

### Equality (`Eq` / `NotEq`)

The clean case.

```
diff = packed_chunk ^ c_packed_chunk // SIMD XOR per word
eq_per_element = "every W-bit slot of diff is zero"
```

Per the FastLanes layout, lane `l`'s `W` output words contain bits
`[k·T, (k+1)·T)` of the per-lane stream `c, c, c, …` for `k ∈ 0..W`. After
XOR with the same-layout `c_packed`, element `r`'s `W` bits land at known
positions inside the lane's `W` output words.

* **`W` divides `T` (W ∈ {1, 2, 4, 8, 16, 32} for u32):** each element's `W`
bits are contained in a single output word. The classic SWAR "any byte is
zero" trick works for `W = 8`:
```
let v = diff_word;
let zero_byte = !v & (v.wrapping_sub(0x01010101)) & 0x80808080;
// bit 7 of each byte set iff that byte was 0
```
Analogous masks `0x55555555` (`W=2`), `0x11111111` (`W=4`),
`0x00010001` (`W=16`) cover the other power-of-2 widths.
* **`W` does not divide `T` (e.g. 3, 5, 7, 9, 11, 13, 15):** elements straddle
word boundaries. The "OR-reduce W shifted copies" idea still applies but the
mask depends on the rotation; easiest implementation is per-width SWAR
unrolled at compile time via `match_each_bit_width!`.

Pack the resulting per-element bits into the output `BitBuffer`. We already do
this for the out-of-range short-circuit's patches overlay; the same code
applies.

### Ordering (`Lt` / `Lte` / `Gt` / `Gte`)

The harder case. Two routes; pick one per width.

#### Route A — SWAR less-than (preferred for `W ∈ {8, 16, 32}`)

For `W = 8` and `u32` storage, each output word holds 4 packed elements as
bytes. The classic SWAR unsigned less-than is:

```
let A = packed_word;
let B = c_packed_word; // a constant per chunk
let mask = 0x80808080;
let lt = ((A | mask) - (B & !mask)) ^ ((A ^ B) | mask);
let lt_bits = lt & mask; // high bit per byte = 1 iff A_byte < B_byte
```

Extract bit 7 of each byte (e.g. `_pext_u32(lt_bits, 0x80808080)` on BMI2, or a
shift-and-mask sequence) and pack into the result `BitBuffer`. `W = 16` uses
`0x80008000`; `W = 32` is the trivial single-element-per-word case.

Derive the other three operators from `Lt`:
* `Gt(a, c) = Lt(c, a)` → swap operands.
* `Lte(a, c) = !Gt(a, c)` → SWAR less-than with swapped operands, then invert.
* `Gte(a, c) = !Lt(a, c)` → invert.

For `W = 4` the same SWAR pattern works on nibbles with mask `0x88888888`.

#### Route B — bit-sliced compare (covers all `W`)

Generic alternative: for each output word, treat the contained `W`-bit slots
as a vertical stack of `T_bits / W` slot values, and run the standard
bit-sliced comparator on the lane's `W` output words at once. This is
layout-aware (uses the FastLanes lane order) but doesn't need per-width
SWAR masks. Slower than Route A on widths Route A supports, but simpler to
write and works uniformly.

### Patches and validity

Same overlay pattern as the out-of-range path: compute the per-position
ordering bit from the packed buffer, then for each `(idx, value)` patch set the
bit at `idx - patches.offset()` to `op(value, c)`. Apply the validity mask
at the end via `BoolArray::new(bits, validity)`.

### Sliced arrays

`lhs.offset() != 0` means the first chunk's packed bytes do not align with
element 0; defer to the canonical path until we have proper offset handling
inside the SWAR loop (drop the first `offset` bits before writing).

## Order of work

1. **`Eq` in-range via XOR + SWAR zero-detect.** Add the per-width SWAR masks
for `W ∈ {1, 2, 4, 8, 16, 32}` first; widths in between can fall through
to canonical until step 3. NotEq is the same kernel inverted.
2. **`Gt` / `Gte` in-range via SWAR less-than.** Land `W ∈ {8, 16, 32}`,
derive the four ordering operators from a single `Lt(a, b)` primitive.
3. **Non-power-of-2 widths.** Pick Route B (bit-sliced compare) or
per-width SWAR; benchmark.
4. **Sliced offsets and patches.** Handle `offset != 0` inside the SWAR loop
so we don't fall back on sliced inputs.

Each step is independently shippable; the kernel already returns `Ok(None)`
for any case it doesn't accelerate, so the canonical path remains the
correctness fallback throughout.

## Benchmarks to land alongside

Add cases to `benches/bitpack_compare.rs` for an **in-range** constant
(currently only the out-of-range fast path is benched there). Compare:

* the SWAR fast path
* the canonical "execute to `PrimitiveArray`, then Arrow compare" baseline

across `bit_width ∈ {4, 8, 16}` and `len ∈ {1 024, 65 536}` for both `Eq`
and `Gt`. We need to beat the baseline at 64 K to be worth landing —
otherwise the canonical path's SIMD throughput is already the right answer
and we should drop this idea.
10 changes: 10 additions & 0 deletions encodings/fastlanes/public-api.lock
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@ pub mod vortex_fastlanes::bitpack_compress

pub fn vortex_fastlanes::bitpack_compress::bit_width_histogram(vortex_array::array::view::ArrayView<'_, vortex_array::arrays::primitive::vtable::Primitive>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<alloc::vec::Vec<usize>>

pub fn vortex_fastlanes::bitpack_compress::bitpack_constant<T: vortex_array::dtype::ptype::NativePType + fastlanes::bitpacking::BitPacking>(T, u8, usize) -> vortex_buffer::buffer::Buffer<T>

pub fn vortex_fastlanes::bitpack_compress::bitpack_encode(&vortex_array::arrays::primitive::vtable::PrimitiveArray, u8, core::option::Option<&[usize]>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_fastlanes::BitPackedArray>

pub fn vortex_fastlanes::bitpack_compress::bitpack_encode_constant<T: vortex_array::dtype::ptype::NativePType + fastlanes::bitpacking::BitPacking + num_traits::cast::ToPrimitive>(T, u8, usize, vortex_array::validity::Validity) -> vortex_error::VortexResult<vortex_fastlanes::BitPackedArray>

pub unsafe fn vortex_fastlanes::bitpack_compress::bitpack_encode_unchecked(vortex_array::arrays::primitive::vtable::PrimitiveArray, u8) -> vortex_error::VortexResult<vortex_fastlanes::BitPackedArray>

pub fn vortex_fastlanes::bitpack_compress::bitpack_primitive<T: vortex_array::dtype::ptype::NativePType + fastlanes::bitpacking::BitPacking>(&[T], u8) -> vortex_buffer::buffer::Buffer<T>
Expand Down Expand Up @@ -190,6 +194,10 @@ impl vortex_array::arrays::slice::SliceReduce for vortex_fastlanes::BitPacked

pub fn vortex_fastlanes::BitPacked::slice(vortex_array::array::view::ArrayView<'_, Self>, core::ops::range::Range<usize>) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::erased::ArrayRef>>

impl vortex_array::scalar_fn::fns::binary::compare::CompareKernel for vortex_fastlanes::BitPacked

pub fn vortex_fastlanes::BitPacked::compare(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::operators::CompareOperator, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::erased::ArrayRef>>

impl vortex_array::scalar_fn::fns::cast::kernel::CastKernel for vortex_fastlanes::BitPacked

pub fn vortex_fastlanes::BitPacked::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::erased::ArrayRef>>
Expand Down Expand Up @@ -220,6 +228,8 @@ pub fn vortex_fastlanes::BitPackedData::try_new(vortex_array::buffer::BufferHand

pub fn vortex_fastlanes::BitPackedData::unpacked_chunks<T: vortex_fastlanes::unpack_iter::BitPacked>(&self, &vortex_array::dtype::DType, usize) -> vortex_error::VortexResult<vortex_fastlanes::unpack_iter::BitUnpackedChunks<T>>

pub fn vortex_fastlanes::BitPackedData::value_fits_bit_width<T: vortex_array::dtype::ptype::NativePType + num_traits::cast::ToPrimitive>(&self, T) -> core::option::Option<bool>

impl core::clone::Clone for vortex_fastlanes::BitPackedData

pub fn vortex_fastlanes::BitPackedData::clone(&self) -> vortex_fastlanes::BitPackedData
Expand Down
Loading
Loading