From 329d81c97e41016e1b8cb9882b6bd228bb13873e Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 14 May 2026 14:06:54 +0000 Subject: [PATCH 1/2] Eq/NotEq compare-constant fast path for bit-packed arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A bit-packed lane holds values in `[0, 2^bit_width - 1]`. When the RHS constant sits outside that range, no packed lane can equal it, so: Eq -> false everywhere NotEq -> true everywhere modulo patches (which carry the real value) and validity. Detecting the range is an `O(1)` `i128` check on the constant alone — strictly cheaper than encoding `c` into the bit-packed representation. Register a `CompareKernel` for `BitPacked` that short-circuits this case. With no patches and no nulls it returns a `ConstantArray` (also `O(1)`); otherwise it allocates a `BitBuffer`, fills it with the constant result, and overlays the per-position outcome at each patch index. Ordering operators (`Lt`/`Lte`/`Gt`/`Gte`) and in-range constants fall through to the canonical decompress + Arrow compare path; tests exercise both fall-throughs. Signed-off-by: Claude --- encodings/fastlanes/public-api.lock | 4 + .../src/bitpacking/compute/compare.rs | 323 ++++++++++++++++++ .../fastlanes/src/bitpacking/compute/mod.rs | 1 + .../src/bitpacking/vtable/kernels.rs | 2 + 4 files changed, 330 insertions(+) create mode 100644 encodings/fastlanes/src/bitpacking/compute/compare.rs diff --git a/encodings/fastlanes/public-api.lock b/encodings/fastlanes/public-api.lock index 4f0ce3df18c..527dda21442 100644 --- a/encodings/fastlanes/public-api.lock +++ b/encodings/fastlanes/public-api.lock @@ -190,6 +190,10 @@ impl vortex_array::arrays::slice::SliceReduce for vortex_fastlanes::BitPacked pub fn vortex_fastlanes::BitPacked::slice(vortex_array::array::view::ArrayView<'_, Self>, core::ops::range::Range) -> vortex_error::VortexResult> +impl vortex_array::scalar_fn::fns::binary::compare::CompareKernel for vortex_fastlanes::BitPacked + +pub fn vortex_fastlanes::BitPacked::compare(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::operators::CompareOperator, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + impl vortex_array::scalar_fn::fns::cast::kernel::CastKernel for vortex_fastlanes::BitPacked pub fn vortex_fastlanes::BitPacked::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs new file mode 100644 index 00000000000..900b7bd9f60 --- /dev/null +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Fast-path `Eq` / `NotEq` comparison against a constant. +//! +//! When the constant cannot fit in the packable range `[0, 2^bit_width - 1]`, no value +//! stored in the packed buffer can equal it, so: +//! +//! * `Eq` → every position is `false` (modulo patches/validity). +//! * `NotEq` → every position is `true` (modulo patches/validity). +//! +//! Detecting this is an `O(1)` range check on the constant — strictly cheaper than +//! encoding `c` into the bit-packed representation. The check is layout-agnostic and +//! does not touch the packed buffer. +//! +//! In-range constants and ordering operators (`Lt`/`Lte`/`Gt`/`Gte`) currently fall +//! through to the canonical decompress + Arrow compare path. + +use num_traits::ToPrimitive; +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::primitive::NativeValue; +use vortex_array::dtype::IntegerPType; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::Nullability; +use vortex_array::match_each_integer_ptype; +use vortex_array::match_each_unsigned_integer_ptype; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::binary::CompareKernel; +use vortex_array::scalar_fn::fns::operators::CompareOperator; +use vortex_buffer::BitBufferMut; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use crate::BitPacked; +use crate::BitPackedArrayExt; + +impl CompareKernel for BitPacked { + fn compare( + lhs: ArrayView<'_, Self>, + rhs: &ArrayRef, + operator: CompareOperator, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + // Only `Eq` / `NotEq` are accelerated here. Ordering operators (`Lt`, `Lte`, `Gt`, + // `Gte`) need either a SWAR less-than over the packed bytes or unpack-then-compare; + // both are out of scope for this commit and fall through to the canonical path. + if !matches!(operator, CompareOperator::Eq | CompareOperator::NotEq) { + return Ok(None); + } + + let Some(constant) = rhs.as_constant() else { + return Ok(None); + }; + let Some(constant) = constant.as_primitive_opt() else { + return Ok(None); + }; + + match_each_integer_ptype!(constant.ptype(), |T| { + compare_eq_constant::( + lhs, + constant + .typed_value::() + .vortex_expect("null scalar handled in adaptor"), + rhs.dtype().nullability(), + operator, + ctx, + ) + }) + } +} + +/// Returns `true` if `constant` cannot fit in the packable range `[0, 2^bit_width - 1]`. +/// +/// `O(1)` check on the constant; never inspects the packed buffer. +#[inline] +fn constant_out_of_packable_range(constant: T, bit_width: u8) -> bool +where + T: NativePType + ToPrimitive, +{ + let Some(c) = constant.to_i128() else { + return false; + }; + let max = (1i128 << bit_width) - 1; + c < 0 || c > max +} + +fn compare_eq_constant( + lhs: ArrayView<'_, BitPacked>, + constant: T, + rhs_nullability: Nullability, + operator: CompareOperator, + ctx: &mut ExecutionCtx, +) -> VortexResult> +where + T: NativePType + ToPrimitive, +{ + if !constant_out_of_packable_range(constant, lhs.bit_width()) { + // Constant fits in the packable range, so at least some packed lanes could match + // it. The fast path doesn't apply. + return Ok(None); + } + + // Every packed lane disagrees with `constant`. `Eq` is `false` everywhere, `NotEq` is + // `true` everywhere — modulo patches (which carry the real value) and validity. + let packed_lane_result = matches!(operator, CompareOperator::NotEq); + let len = lhs.len(); + let validity = lhs.validity()?; + let patches = lhs.patches(); + let result_nullability = lhs.dtype().nullability() | rhs_nullability; + + // Hot path: no patches, no nulls — every position has the same boolean result, so we + // return a `ConstantArray` in `O(1)`. + if patches.is_none() && validity.no_nulls() { + return Ok(Some( + ConstantArray::new(Scalar::bool(packed_lane_result, result_nullability), len) + .into_array(), + )); + } + + let mut bits = BitBufferMut::full(packed_lane_result, len); + + if let Some(patches) = patches { + let indices = patches.indices().clone().execute::(ctx)?; + let values = patches.values().clone().execute::(ctx)?; + let patches_offset = patches.offset(); + + match_each_unsigned_integer_ptype!(indices.ptype(), |I| { + apply_eq_patches::( + &mut bits, + indices.as_slice::(), + values.as_slice::(), + patches_offset, + operator, + constant, + ); + }); + } + + let validity = validity.union_nullability(rhs_nullability); + Ok(Some(BoolArray::new(bits.freeze(), validity).into_array())) +} + +fn apply_eq_patches( + bits: &mut BitBufferMut, + indices: &[I], + values: &[T], + indices_offset: usize, + operator: CompareOperator, + constant: T, +) where + T: NativePType, + I: IntegerPType, +{ + // Only Eq/NotEq reach this point (see `CompareKernel::compare`). + let cmp: fn(T, T) -> bool = match operator { + CompareOperator::Eq => |l, r| NativeValue(l) == NativeValue(r), + CompareOperator::NotEq => |l, r| NativeValue(l) != NativeValue(r), + _ => unreachable!("only Eq/NotEq reach the bitpacked compare-constant fast path"), + }; + + let len = bits.len(); + for (&raw_idx, &value) in indices.iter().zip(values.iter()) { + let i: usize = raw_idx.as_(); + if i < indices_offset { + continue; + } + let pos = i - indices_offset; + if pos >= len { + break; + } + if cmp(value, constant) { + bits.set(pos); + } else { + bits.unset(pos); + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::LazyLock; + + use rstest::rstest; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::ConstantArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::builtins::ArrayBuiltins; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_array::scalar::Scalar; + use vortex_array::scalar_fn::fns::operators::Operator; + use vortex_array::session::ArraySession; + use vortex_buffer::buffer; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use crate::BitPackedArrayExt; + use crate::BitPackedData; + use crate::bitpacking::compute::compare::constant_out_of_packable_range; + + static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + + #[test] + fn range_check_is_o1() { + // 8-bit packable range is [0, 255]. + assert!(constant_out_of_packable_range::(256, 8)); + assert!(constant_out_of_packable_range::(-1, 8)); + assert!(!constant_out_of_packable_range::(255, 8)); + assert!(!constant_out_of_packable_range::(0, 8)); + } + + #[rstest] + #[case(Operator::Eq, false)] + #[case(Operator::NotEq, true)] + fn eq_above_range_no_patches(#[case] op: Operator, #[case] expected: bool) -> VortexResult<()> { + let mut ctx = SESSION.create_execution_ctx(); + // 999 is above the 8-bit packable range; no packed lane matches. + let packed = BitPackedData::encode( + &PrimitiveArray::from_iter([1u32, 2, 3, 250, 100]).into_array(), + 8, + &mut ctx, + )?; + let result = packed + .into_array() + .binary(ConstantArray::new(999u32, 5).into_array(), op)? + .execute::(&mut ctx)?; + assert_arrays_eq!(result, BoolArray::from_iter([expected; 5])); + Ok(()) + } + + #[rstest] + #[case(Operator::Eq)] + #[case(Operator::NotEq)] + fn eq_above_range_with_patches(#[case] op: Operator) -> VortexResult<()> { + // bit_width=4 packable range is [0, 15]; out-of-range values become patches. + let mut ctx = SESSION.create_execution_ctx(); + let values = buffer![1u32, 5, 1000, 7, 1000, 14]; + let constant = 1000u32; + + let packed = BitPackedData::encode(&values.clone().into_array(), 4, &mut ctx)?; + assert!(packed.patches().is_some()); + + let result = packed + .into_array() + .binary(ConstantArray::new(constant, values.len()).into_array(), op)? + .execute::(&mut ctx)?; + + let expected: Vec = values + .iter() + .map(|v| match op { + Operator::Eq => *v == constant, + Operator::NotEq => *v != constant, + _ => unreachable!(), + }) + .collect(); + assert_arrays_eq!(result, BoolArray::from_iter(expected)); + Ok(()) + } + + #[test] + fn ordering_falls_through() -> VortexResult<()> { + // Ordering ops aren't accelerated yet; they go through the canonical path and + // must still return a correct answer. + let mut ctx = SESSION.create_execution_ctx(); + let values = [1u32, 2, 3, 250, 100]; + let packed = + BitPackedData::encode(&PrimitiveArray::from_iter(values).into_array(), 8, &mut ctx)?; + let result = packed + .into_array() + .binary(ConstantArray::new(999u32, 5).into_array(), Operator::Lt)? + .execute::(&mut ctx)?; + assert_arrays_eq!( + result, + BoolArray::from_iter(values.iter().map(|v| *v < 999)) + ); + Ok(()) + } + + #[test] + fn eq_in_range_falls_through() -> VortexResult<()> { + // In-range constants must defer to the canonical path. + let mut ctx = SESSION.create_execution_ctx(); + let values = [1u32, 2, 3, 250, 100]; + let packed = + BitPackedData::encode(&PrimitiveArray::from_iter(values).into_array(), 8, &mut ctx)?; + let result = packed + .into_array() + .binary(ConstantArray::new(100u32, 5).into_array(), Operator::Eq)? + .execute::(&mut ctx)?; + assert_arrays_eq!( + result, + BoolArray::from_iter(values.iter().map(|v| *v == 100)) + ); + Ok(()) + } + + #[test] + fn eq_nullable_constant() -> VortexResult<()> { + let mut ctx = SESSION.create_execution_ctx(); + let packed = BitPackedData::encode( + &PrimitiveArray::from_iter([1u32, 2, 3]).into_array(), + 4, + &mut ctx, + )?; + let rhs = ConstantArray::new(Scalar::primitive(999u32, Nullability::Nullable), 3); + let result = packed + .into_array() + .binary(rhs.into_array(), Operator::Eq)? + .execute::(&mut ctx)?; + assert_eq!(result.dtype(), &DType::Bool(Nullability::Nullable)); + Ok(()) + } +} diff --git a/encodings/fastlanes/src/bitpacking/compute/mod.rs b/encodings/fastlanes/src/bitpacking/compute/mod.rs index 2501d952356..169546d311c 100644 --- a/encodings/fastlanes/src/bitpacking/compute/mod.rs +++ b/encodings/fastlanes/src/bitpacking/compute/mod.rs @@ -2,6 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors mod cast; +mod compare; mod filter; pub(crate) mod is_constant; mod slice; diff --git a/encodings/fastlanes/src/bitpacking/vtable/kernels.rs b/encodings/fastlanes/src/bitpacking/vtable/kernels.rs index cb020dc2ce9..29ec9ea8982 100644 --- a/encodings/fastlanes/src/bitpacking/vtable/kernels.rs +++ b/encodings/fastlanes/src/bitpacking/vtable/kernels.rs @@ -5,12 +5,14 @@ use vortex_array::arrays::dict::TakeExecuteAdaptor; use vortex_array::arrays::filter::FilterExecuteAdaptor; use vortex_array::arrays::slice::SliceExecuteAdaptor; use vortex_array::kernel::ParentKernelSet; +use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor; use vortex_array::scalar_fn::fns::cast::CastExecuteAdaptor; use crate::BitPacked; pub(crate) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ ParentKernelSet::lift(&CastExecuteAdaptor(BitPacked)), + ParentKernelSet::lift(&CompareExecuteAdaptor(BitPacked)), ParentKernelSet::lift(&FilterExecuteAdaptor(BitPacked)), ParentKernelSet::lift(&SliceExecuteAdaptor(BitPacked)), ParentKernelSet::lift(&TakeExecuteAdaptor(BitPacked)), From a20f09db29de291de88e9cd1e66a93291783a9e9 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 14 May 2026 14:10:37 +0000 Subject: [PATCH 2/2] Extend bit-packed compare-constant to ordering, add bitpack_constant kernel and benches, plan in-range ordering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ordering operators (Lt/Lte/Gt/Gte) now use the same out-of-range short-circuit as Eq/NotEq: when `c` lies outside `[0, 2^bit_width - 1]`, every packed lane has the same `Ordering` relative to `c`, so each of the six operators collapses to a constant boolean (modulo patches and validity). Add a constant-only pack kernel `bitpack_constant` that builds the FastLanes bit pattern for a `[constant; len]` input without calling `BitPacking::pack`. For constant input every lane produces the same `bit_width` output words; we compute those words analytically — each output word's `j`-th bit is bit `(k * T_bits + j) mod bit_width` of `c` — then `memset` each word `LANES` times into a stack chunk template and `memcpy` the template into every full chunk. The standard packer is only invoked for the partial tail (zero-padded past `len`). `bitpack_encode_constant` wraps the buffer up as a `BitPackedArray`. A bitwise equivalence rstest covers byte-identity with `BitPacking::pack` across lengths, widths, and constants. Bench `bitpack_constant` (analytical vs full `bitpack_encode`) on a small, fast grid: at 64 K u32 elements the analytical kernel is roughly 23-62x faster than the full encoder, since it skips the histogram, min-scan, patches gather, and per-chunk SIMD pack call. Bench `bitpack_compare` (out-of-range fast path vs explicit "decompress + Arrow compare" baseline): 1.4-1.5 µs constant-array setup vs 8-125 µs for the baseline across `bit_width ∈ {4, 16}`, `len ∈ {1024, 65536}` and Eq/Lt. Add a `value_fits_bit_width` helper on `BitPackedData` exposing the same O(1) range check used internally. Plan how to accelerate **in-range** ordering comparisons in `encodings/fastlanes/docs/inrange_compare_plan.md`: compare the packed array against the packed constant via SWAR less-than per supported bit width, derive the four ordering operators from one `Lt` primitive, and benchmark against the canonical SIMD baseline before landing. Signed-off-by: Claude --- encodings/fastlanes/Cargo.toml | 8 + .../fastlanes/benches/bitpack_compare.rs | 108 +++++++++++ .../fastlanes/benches/bitpack_constant.rs | 53 ++++++ .../fastlanes/docs/inrange_compare_plan.md | 149 +++++++++++++++ encodings/fastlanes/public-api.lock | 6 + .../src/bitpacking/array/bitpack_compress.rs | 169 +++++++++++++++++ .../fastlanes/src/bitpacking/array/mod.rs | 23 +++ .../src/bitpacking/compute/compare.rs | 175 +++++++++++------- 8 files changed, 620 insertions(+), 71 deletions(-) create mode 100644 encodings/fastlanes/benches/bitpack_compare.rs create mode 100644 encodings/fastlanes/benches/bitpack_constant.rs create mode 100644 encodings/fastlanes/docs/inrange_compare_plan.md diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml index a14e19389bc..5eaf9fa4bf5 100644 --- a/encodings/fastlanes/Cargo.toml +++ b/encodings/fastlanes/Cargo.toml @@ -59,3 +59,11 @@ required-features = ["_test-harness"] name = "bit_transpose" harness = false required-features = ["_test-harness"] + +[[bench]] +name = "bitpack_constant" +harness = false + +[[bench]] +name = "bitpack_compare" +harness = false diff --git a/encodings/fastlanes/benches/bitpack_compare.rs b/encodings/fastlanes/benches/bitpack_compare.rs new file mode 100644 index 00000000000..939c401ee38 --- /dev/null +++ b/encodings/fastlanes/benches/bitpack_compare.rs @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compare an already-packed `BitPackedArray` against a constant value. Compares the +//! out-of-range fast path (constant outside `[0, 2^bit_width - 1]`) against an explicit +//! "decompress, then compare" baseline. +//! +//! Sized to finish quickly. Run with `cargo bench -p vortex-fastlanes --bench bitpack_compare`. + +#![expect(clippy::unwrap_used)] +#![expect(clippy::cast_possible_truncation)] + +use divan::Bencher; +use divan::counter::ItemsCount; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_fastlanes::BitPackedData; + +fn main() { + divan::main(); +} + +const LENS: &[usize] = &[1024, 64 * 1024]; +const BIT_WIDTHS: &[u8] = &[4, 16]; + +/// Build a packed array of varied in-range values, plus an out-of-range constant RHS for +/// the fast-path benches. +fn build_inputs(len: usize) -> (ArrayRef, ArrayRef, ExecutionCtx) { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let buf: BufferMut = (0..len).map(|i| (i as u32) % (1 << BW)).collect(); + let array = BitPackedData::encode( + &PrimitiveArray::new(buf.freeze(), Validity::NonNullable).into_array(), + BW, + &mut ctx, + ) + .unwrap() + .into_array(); + // 1 << BW is just past the packable range, so the out-of-range fast path fires. + let constant = 1u32 << BW; + let rhs = ConstantArray::new(constant, len).into_array(); + (array, rhs, ctx) +} + +#[divan::bench(args = LENS, consts = BIT_WIDTHS)] +fn fast_eq_out_of_range(bencher: Bencher, len: usize) { + let (array, rhs, mut ctx) = build_inputs::(len); + bencher.counter(ItemsCount::new(len)).bench_local(|| { + array + .clone() + .binary(rhs.clone(), Operator::Eq) + .unwrap() + .execute::(&mut ctx) + .unwrap() + }); +} + +#[divan::bench(args = LENS, consts = BIT_WIDTHS)] +fn baseline_eq(bencher: Bencher, len: usize) { + let (array, rhs, mut ctx) = build_inputs::(len); + bencher.counter(ItemsCount::new(len)).bench_local(|| { + // What the fallback would do: materialize the unpacked primitive, then run Arrow + // compare on it. + let primitive = array.clone().execute::(&mut ctx).unwrap(); + primitive + .into_array() + .binary(rhs.clone(), Operator::Eq) + .unwrap() + .execute::(&mut ctx) + .unwrap() + }); +} + +#[divan::bench(args = LENS, consts = BIT_WIDTHS)] +fn fast_lt_out_of_range(bencher: Bencher, len: usize) { + let (array, rhs, mut ctx) = build_inputs::(len); + bencher.counter(ItemsCount::new(len)).bench_local(|| { + array + .clone() + .binary(rhs.clone(), Operator::Lt) + .unwrap() + .execute::(&mut ctx) + .unwrap() + }); +} + +#[divan::bench(args = LENS, consts = BIT_WIDTHS)] +fn baseline_lt(bencher: Bencher, len: usize) { + let (array, rhs, mut ctx) = build_inputs::(len); + bencher.counter(ItemsCount::new(len)).bench_local(|| { + let primitive = array.clone().execute::(&mut ctx).unwrap(); + primitive + .into_array() + .binary(rhs.clone(), Operator::Lt) + .unwrap() + .execute::(&mut ctx) + .unwrap() + }); +} diff --git a/encodings/fastlanes/benches/bitpack_constant.rs b/encodings/fastlanes/benches/bitpack_constant.rs new file mode 100644 index 00000000000..a5e06e53972 --- /dev/null +++ b/encodings/fastlanes/benches/bitpack_constant.rs @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compare the fast constant bit-packing path against the standard `bitpack_encode` +//! pipeline on a uniform-constant input. +//! +//! Sized to finish quickly. Run with `cargo bench -p vortex-fastlanes --bench bitpack_constant`. + +#![expect(clippy::unwrap_used)] + +use divan::Bencher; +use divan::black_box; +use divan::counter::ItemsCount; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_fastlanes::bitpack_compress::bitpack_encode; +use vortex_fastlanes::bitpack_compress::bitpack_encode_constant; + +fn main() { + divan::main(); +} + +const LENS: &[usize] = &[1024, 64 * 1024]; +const BIT_WIDTHS: &[u8] = &[4, 16]; + +const CONSTANT: u32 = 7; + +#[divan::bench(args = LENS, consts = BIT_WIDTHS)] +fn full_encode(bencher: Bencher, len: usize) { + let buf: BufferMut = (0..len).map(|_| CONSTANT).collect(); + let arr = PrimitiveArray::new(buf.freeze(), Validity::NonNullable); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + + bencher + .counter(ItemsCount::new(len)) + .bench_local(|| bitpack_encode(black_box(&arr), black_box(BW), None, &mut ctx).unwrap()); +} + +#[divan::bench(args = LENS, consts = BIT_WIDTHS)] +fn fast_encode(bencher: Bencher, len: usize) { + bencher.counter(ItemsCount::new(len)).bench_local(|| { + bitpack_encode_constant::( + black_box(CONSTANT), + black_box(BW), + black_box(len), + Validity::NonNullable, + ) + .unwrap() + }); +} diff --git a/encodings/fastlanes/docs/inrange_compare_plan.md b/encodings/fastlanes/docs/inrange_compare_plan.md new file mode 100644 index 00000000000..4399f83dc14 --- /dev/null +++ b/encodings/fastlanes/docs/inrange_compare_plan.md @@ -0,0 +1,149 @@ +# Plan: in-range constant compare for bit-packed arrays + +## Status today + +`encodings/fastlanes/src/bitpacking/compute/compare.rs` accelerates +`BitPackedArray op ConstantArray` only when `c` is **outside** the packable range +`[0, 2^bit_width - 1]`. That case reduces every packed lane to the same boolean +under `op`, so the result is a `ConstantArray` (no work on the buffer) or a +`BitBuffer` filled with that constant plus a per-position overlay at any patched +indices. + +**In-range** constants (those that could equal a packed lane) fall through to the +canonical "decompress to `PrimitiveArray`, then run Arrow's vectorized compare" +path. For `Eq`/`NotEq`/`Lt`/`Lte`/`Gt`/`Gte` this is correct but does two SIMD +passes' worth of work (unpack + compare) and writes the unpacked primitive to +memory along the way. + +## Why the obvious approach (FastLanes `unpack_cmp`) doesn't win + +`fastlanes::BitPackingCompare::unchecked_unpack_cmp` fuses unpack + compare and +emits `[bool; 1024]` without materializing the primitive array. It is +`#[inline(never)]` and applies the comparator closure to every element +individually. We tried wiring it in: at 65 K u32 elements (bit_width 4) the +fused path measured ~170 µs against ~91 µs for the canonical "unpack then +Arrow compare" path. Both Arrow's primitive compare and FastLanes' `unpack` are +heavily SIMD-vectorized; the per-element closure call defeats vectorization in +`unpack_cmp`. Reverted in commit `cc586c6`. + +## Proposal: bit-parallel compare on the packed buffer + +Pack the constant into a 1024-element template once (we already have a +constant-only pack kernel in `bitpack_compress::bitpack_constant`, which +synthesizes the FastLanes bit pattern analytically — no `BitPacking::pack` +call). Then for each 1024-chunk of the input, do the comparison directly on the +packed bytes via SIMD/SWAR. No materialization, less memory traffic +(`~3W·128` bytes per chunk vs `12·1024` bytes for unpack + compare on `u32`), +and the loop is fully vectorizable. + +### Equality (`Eq` / `NotEq`) + +The clean case. + +``` +diff = packed_chunk ^ c_packed_chunk // SIMD XOR per word +eq_per_element = "every W-bit slot of diff is zero" +``` + +Per the FastLanes layout, lane `l`'s `W` output words contain bits +`[k·T, (k+1)·T)` of the per-lane stream `c, c, c, …` for `k ∈ 0..W`. After +XOR with the same-layout `c_packed`, element `r`'s `W` bits land at known +positions inside the lane's `W` output words. + +* **`W` divides `T` (W ∈ {1, 2, 4, 8, 16, 32} for u32):** each element's `W` + bits are contained in a single output word. The classic SWAR "any byte is + zero" trick works for `W = 8`: + ``` + let v = diff_word; + let zero_byte = !v & (v.wrapping_sub(0x01010101)) & 0x80808080; + // bit 7 of each byte set iff that byte was 0 + ``` + Analogous masks `0x55555555` (`W=2`), `0x11111111` (`W=4`), + `0x00010001` (`W=16`) cover the other power-of-2 widths. +* **`W` does not divide `T` (e.g. 3, 5, 7, 9, 11, 13, 15):** elements straddle + word boundaries. The "OR-reduce W shifted copies" idea still applies but the + mask depends on the rotation; easiest implementation is per-width SWAR + unrolled at compile time via `match_each_bit_width!`. + +Pack the resulting per-element bits into the output `BitBuffer`. We already do +this for the out-of-range short-circuit's patches overlay; the same code +applies. + +### Ordering (`Lt` / `Lte` / `Gt` / `Gte`) + +The harder case. Two routes; pick one per width. + +#### Route A — SWAR less-than (preferred for `W ∈ {8, 16, 32}`) + +For `W = 8` and `u32` storage, each output word holds 4 packed elements as +bytes. The classic SWAR unsigned less-than is: + +``` +let A = packed_word; +let B = c_packed_word; // a constant per chunk +let mask = 0x80808080; +let lt = ((A | mask) - (B & !mask)) ^ ((A ^ B) | mask); +let lt_bits = lt & mask; // high bit per byte = 1 iff A_byte < B_byte +``` + +Extract bit 7 of each byte (e.g. `_pext_u32(lt_bits, 0x80808080)` on BMI2, or a +shift-and-mask sequence) and pack into the result `BitBuffer`. `W = 16` uses +`0x80008000`; `W = 32` is the trivial single-element-per-word case. + +Derive the other three operators from `Lt`: +* `Gt(a, c) = Lt(c, a)` → swap operands. +* `Lte(a, c) = !Gt(a, c)` → SWAR less-than with swapped operands, then invert. +* `Gte(a, c) = !Lt(a, c)` → invert. + +For `W = 4` the same SWAR pattern works on nibbles with mask `0x88888888`. + +#### Route B — bit-sliced compare (covers all `W`) + +Generic alternative: for each output word, treat the contained `W`-bit slots +as a vertical stack of `T_bits / W` slot values, and run the standard +bit-sliced comparator on the lane's `W` output words at once. This is +layout-aware (uses the FastLanes lane order) but doesn't need per-width +SWAR masks. Slower than Route A on widths Route A supports, but simpler to +write and works uniformly. + +### Patches and validity + +Same overlay pattern as the out-of-range path: compute the per-position +ordering bit from the packed buffer, then for each `(idx, value)` patch set the +bit at `idx - patches.offset()` to `op(value, c)`. Apply the validity mask +at the end via `BoolArray::new(bits, validity)`. + +### Sliced arrays + +`lhs.offset() != 0` means the first chunk's packed bytes do not align with +element 0; defer to the canonical path until we have proper offset handling +inside the SWAR loop (drop the first `offset` bits before writing). + +## Order of work + +1. **`Eq` in-range via XOR + SWAR zero-detect.** Add the per-width SWAR masks + for `W ∈ {1, 2, 4, 8, 16, 32}` first; widths in between can fall through + to canonical until step 3. NotEq is the same kernel inverted. +2. **`Gt` / `Gte` in-range via SWAR less-than.** Land `W ∈ {8, 16, 32}`, + derive the four ordering operators from a single `Lt(a, b)` primitive. +3. **Non-power-of-2 widths.** Pick Route B (bit-sliced compare) or + per-width SWAR; benchmark. +4. **Sliced offsets and patches.** Handle `offset != 0` inside the SWAR loop + so we don't fall back on sliced inputs. + +Each step is independently shippable; the kernel already returns `Ok(None)` +for any case it doesn't accelerate, so the canonical path remains the +correctness fallback throughout. + +## Benchmarks to land alongside + +Add cases to `benches/bitpack_compare.rs` for an **in-range** constant +(currently only the out-of-range fast path is benched there). Compare: + +* the SWAR fast path +* the canonical "execute to `PrimitiveArray`, then Arrow compare" baseline + +across `bit_width ∈ {4, 8, 16}` and `len ∈ {1 024, 65 536}` for both `Eq` +and `Gt`. We need to beat the baseline at 64 K to be worth landing — +otherwise the canonical path's SIMD throughput is already the right answer +and we should drop this idea. diff --git a/encodings/fastlanes/public-api.lock b/encodings/fastlanes/public-api.lock index 527dda21442..d63eb857ed8 100644 --- a/encodings/fastlanes/public-api.lock +++ b/encodings/fastlanes/public-api.lock @@ -18,8 +18,12 @@ pub mod vortex_fastlanes::bitpack_compress pub fn vortex_fastlanes::bitpack_compress::bit_width_histogram(vortex_array::array::view::ArrayView<'_, vortex_array::arrays::primitive::vtable::Primitive>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> +pub fn vortex_fastlanes::bitpack_compress::bitpack_constant(T, u8, usize) -> vortex_buffer::buffer::Buffer + pub fn vortex_fastlanes::bitpack_compress::bitpack_encode(&vortex_array::arrays::primitive::vtable::PrimitiveArray, u8, core::option::Option<&[usize]>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult +pub fn vortex_fastlanes::bitpack_compress::bitpack_encode_constant(T, u8, usize, vortex_array::validity::Validity) -> vortex_error::VortexResult + pub unsafe fn vortex_fastlanes::bitpack_compress::bitpack_encode_unchecked(vortex_array::arrays::primitive::vtable::PrimitiveArray, u8) -> vortex_error::VortexResult pub fn vortex_fastlanes::bitpack_compress::bitpack_primitive(&[T], u8) -> vortex_buffer::buffer::Buffer @@ -224,6 +228,8 @@ pub fn vortex_fastlanes::BitPackedData::try_new(vortex_array::buffer::BufferHand pub fn vortex_fastlanes::BitPackedData::unpacked_chunks(&self, &vortex_array::dtype::DType, usize) -> vortex_error::VortexResult> +pub fn vortex_fastlanes::BitPackedData::value_fits_bit_width(&self, T) -> core::option::Option + impl core::clone::Clone for vortex_fastlanes::BitPackedData pub fn vortex_fastlanes::BitPackedData::clone(&self) -> vortex_fastlanes::BitPackedData diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_compress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_compress.rs index 2e517819059..acac34f4fae 100644 --- a/encodings/fastlanes/src/bitpacking/array/bitpack_compress.rs +++ b/encodings/fastlanes/src/bitpacking/array/bitpack_compress.rs @@ -194,6 +194,143 @@ pub fn bitpack_primitive(array: &[T], bit_width: u8 output.freeze() } +/// Build the bit-packed buffer for a `[constant; len]` input without calling the +/// SIMD packer. +/// +/// The FastLanes packing kernel runs `LANES` independent lane packers in parallel, each +/// consuming `T = 8 * size_of::()` input values and producing `bit_width` output words. +/// When every input value equals `constant`, all `LANES` lane packers produce the same +/// `bit_width` words. We compute those words analytically — looping over `T` bits per +/// output word with a single `OR`/shift — then replicate the lane pattern across the +/// chunk and the chunk pattern across the buffer with `memset`/`memcpy`. No call to +/// `BitPacking::pack` is involved for any full chunk. +/// +/// The trailing partial chunk (when `len % 1024 != 0`) is zero-padded past `len`, so it +/// has a different pattern than the full template. It is built by re-using the analytical +/// kernel only when `len % 1024` is itself a multiple of `T` (so the padded boundary +/// aligns with a lane row); otherwise we fall back to a single `unchecked_pack` call for +/// that final chunk only. +/// +/// # Preconditions +/// +/// * `constant` must fit in `bit_width`, i.e., `(constant as u64) < (1 << bit_width)`. +/// * `0 < bit_width <= size_of::() * 8`. +pub fn bitpack_constant( + constant: T, + bit_width: u8, + len: usize, +) -> Buffer { + if bit_width == 0 || len == 0 { + return Buffer::::empty(); + } + let w = bit_width as usize; + let t_bits = 8 * size_of::(); + let lanes = 1024 / t_bits; + let packed_len = 128 * w / size_of::(); + debug_assert_eq!(packed_len, w * lanes); + + let num_chunks = len.div_ceil(1024); + let num_full_chunks = len / 1024; + + let mut output = BufferMut::::with_capacity(num_chunks * packed_len); + + if num_full_chunks > 0 { + // One full chunk's bit pattern: `w` distinct output words, each replicated `lanes` + // times. Build the template on the stack with `lane_word`-sized `memset`s, then + // `memcpy` it into the output for every full chunk. + let lane_words = constant_lane_words::(constant, w); + let mut chunk: [T; 1024] = [T::zero(); 1024]; + for (k, &word) in lane_words.iter().enumerate() { + chunk[k * lanes..(k + 1) * lanes].fill(word); + } + let template = &chunk[..packed_len]; + for _ in 0..num_full_chunks { + output.extend_from_slice(template); + } + } + + if num_chunks > num_full_chunks { + // Tail chunk gets zero-padded past `len % 1024`, so it differs from the full + // template. Use the standard packer for this single chunk. + let last_chunk_size = len % 1024; + let mut last_chunk: [T; 1024] = [T::zero(); 1024]; + last_chunk[..last_chunk_size].fill(constant); + let tail_start = output.len(); + unsafe { + output.set_len(tail_start + packed_len); + BitPacking::unchecked_pack(w, &last_chunk, &mut output[tail_start..][..packed_len]); + } + } + + output.freeze() +} + +/// Compute the `bit_width` output words that every FastLanes lane produces when packing +/// `T = 8 * size_of::()` copies of `constant`. +/// +/// For constant input, each lane packs a periodic bit-stream of period `bit_width` made +/// of the low `bit_width` bits of `constant`. Output word `k` contains bits +/// `[k * T, (k + 1) * T)` of that stream, so its `j`-th bit equals bit +/// `(k * T + j) mod bit_width` of `constant`. +fn constant_lane_words(constant: T, bit_width: usize) -> Vec { + let t_bits = 8 * size_of::(); + let mask = if bit_width == t_bits { + !T::zero() + } else { + (T::one() << bit_width) - T::one() + }; + let s = constant & mask; + (0..bit_width) + .map(|k| { + let mut word = T::zero(); + for j in 0..t_bits { + let bit_in_s = (k * t_bits + j) % bit_width; + let bit = (s >> bit_in_s) & T::one(); + word = word | (bit << j); + } + word + }) + .collect() +} + +/// Encode a length-`len` array of `constant` values as a [`BitPackedArray`] without +/// running the standard encode pipeline. +/// +/// Returns an error if `constant` does not fit in `bit_width`, or if `bit_width` is too +/// large for `T`. +pub fn bitpack_encode_constant( + constant: T, + bit_width: u8, + len: usize, + validity: Validity, +) -> VortexResult { + if bit_width as usize >= T::PTYPE.bit_width() { + vortex_bail!( + InvalidArgument: "Cannot pack - specified bit width {bit_width} >= {}", + T::PTYPE.bit_width() + ); + } + let c = constant + .to_i128() + .ok_or_else(|| vortex_error::vortex_err!("cannot cast constant to i128"))?; + if c < 0 || c > (1i128 << bit_width) - 1 { + vortex_bail!( + InvalidArgument: "constant {c} does not fit in bit_width {bit_width}" + ); + } + + let packed = bitpack_constant(constant, bit_width, len).into_byte_buffer(); + BitPacked::try_new( + BufferHandle::new_host(packed), + T::PTYPE, + validity, + None, + bit_width, + len, + 0, + ) +} + pub fn gather_patches( parray: &PrimitiveArray, bit_width: u8, @@ -650,4 +787,36 @@ mod test { assert_arrays_eq!(chunk_offsets, PrimitiveArray::from_iter([0u64])); Ok(()) } + + #[rstest::rstest] + #[case::aligned_1024(1024u32, 7, 5)] + #[case::aligned_multi(8192u32, 7, 5)] + #[case::partial_tail(2050u32, 7, 5)] + #[case::small(13u32, 5, 17)] + #[case::large_bitwidth(1_000_000u32, 18, 200_000)] + fn bitpack_constant_matches_full_encode( + #[case] len: u32, + #[case] bit_width: u8, + #[case] constant: u32, + ) -> VortexResult<()> { + let mut ctx = SESSION.create_execution_ctx(); + let input = PrimitiveArray::from_iter(std::iter::repeat_n(constant, len as usize)); + + let slow = bitpack_encode(&input, bit_width, None, &mut ctx)?; + let fast = bitpack_encode_constant::( + constant, + bit_width, + len as usize, + Validity::NonNullable, + )?; + + let slow_packed = slow.packed().clone().unwrap_host(); + let fast_packed = fast.packed().clone().unwrap_host(); + assert_eq!(slow_packed.as_slice(), fast_packed.as_slice()); + + // Unpack fast result and verify roundtrip. + let unpacked = fast.into_array().execute::(&mut ctx)?; + assert_arrays_eq!(unpacked, input); + Ok(()) + } } diff --git a/encodings/fastlanes/src/bitpacking/array/mod.rs b/encodings/fastlanes/src/bitpacking/array/mod.rs index e5c64252fbc..b109d53d64c 100644 --- a/encodings/fastlanes/src/bitpacking/array/mod.rs +++ b/encodings/fastlanes/src/bitpacking/array/mod.rs @@ -273,6 +273,29 @@ impl BitPackedData { pub fn max_packed_value(&self) -> usize { (1 << self.bit_width()) - 1 } + + /// Test whether `value` can be represented as a packed lane in this array, i.e. whether + /// it falls in the range `[0, 2^bit_width - 1]`. + /// + /// This is an `O(1)` check that never inspects the packed buffer and is strictly cheaper + /// than encoding `value` into the bit-packed representation. It is the building block for + /// fast comparison kernels that can short-circuit when the constant cannot match any + /// packed lane. + /// + /// Returns `None` if `value` cannot be losslessly converted to `i128` (which never + /// happens for the integer types supported by bit-packing). + #[inline] + pub fn value_fits_bit_width( + &self, + value: T, + ) -> Option { + let v = value.to_i128()?; + if v < 0 { + return Some(false); + } + let max = (1i128 << self.bit_width()) - 1; + Some(v <= max) + } } pub trait BitPackedArrayExt: BitPackedArraySlotsExt { diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index 900b7bd9f60..ef29f0b4608 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -1,20 +1,28 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Fast-path `Eq` / `NotEq` comparison against a constant. +//! Fast-path comparison against a constant for bit-packed arrays. //! -//! When the constant cannot fit in the packable range `[0, 2^bit_width - 1]`, no value -//! stored in the packed buffer can equal it, so: +//! A bit-packed lane holds values in `[0, 2^bit_width - 1]`. When the RHS constant sits +//! outside that range, every packed lane has the same `Ordering` relative to `c`: //! -//! * `Eq` → every position is `false` (modulo patches/validity). -//! * `NotEq` → every position is `true` (modulo patches/validity). +//! * `c > 2^bit_width - 1` (above range) → every packed lane is `< c` +//! * `c < 0` (below range) → every packed lane is `> c` (packed values are non-negative) //! -//! Detecting this is an `O(1)` range check on the constant — strictly cheaper than -//! encoding `c` into the bit-packed representation. The check is layout-agnostic and -//! does not touch the packed buffer. +//! That collapses each of the six comparison operators to a constant boolean (modulo +//! patches and validity), so the result is either a `ConstantArray` (`O(1)`) or a +//! `BitBuffer` filled with that constant and overlaid with per-position results at any +//! patched indices. //! -//! In-range constants and ordering operators (`Lt`/`Lte`/`Gt`/`Gte`) currently fall -//! through to the canonical decompress + Arrow compare path. +//! Detecting whether the constant falls in the packable range is an `O(1)` `i128` check +//! on the constant alone — strictly cheaper than encoding `c` into the bit-packed +//! representation, and layout-agnostic. +//! +//! **In-range constants** (those that could match a packed lane) fall through to the +//! canonical decompress + Arrow compare path. See `docs/inrange_compare_plan.md` for the +//! plan to accelerate that case for ordering operators. + +use std::cmp::Ordering; use num_traits::ToPrimitive; use vortex_array::ArrayRef; @@ -47,13 +55,6 @@ impl CompareKernel for BitPacked { operator: CompareOperator, ctx: &mut ExecutionCtx, ) -> VortexResult> { - // Only `Eq` / `NotEq` are accelerated here. Ordering operators (`Lt`, `Lte`, `Gt`, - // `Gte`) need either a SWAR less-than over the packed bytes or unpack-then-compare; - // both are out of scope for this commit and fall through to the canonical path. - if !matches!(operator, CompareOperator::Eq | CompareOperator::NotEq) { - return Ok(None); - } - let Some(constant) = rhs.as_constant() else { return Ok(None); }; @@ -62,7 +63,7 @@ impl CompareKernel for BitPacked { }; match_each_integer_ptype!(constant.ptype(), |T| { - compare_eq_constant::( + compare_constant::( lhs, constant .typed_value::() @@ -75,22 +76,45 @@ impl CompareKernel for BitPacked { } } -/// Returns `true` if `constant` cannot fit in the packable range `[0, 2^bit_width - 1]`. +/// Ordering of every packed lane vs `constant` when `constant` is outside the packable +/// range. Returns `None` when `constant` itself fits in the range (no fast path applies). /// /// `O(1)` check on the constant; never inspects the packed buffer. #[inline] -fn constant_out_of_packable_range(constant: T, bit_width: u8) -> bool +fn constant_relation_to_packed(constant: T, bit_width: u8) -> Option where T: NativePType + ToPrimitive, { - let Some(c) = constant.to_i128() else { - return false; - }; + let c = constant.to_i128()?; + if c < 0 { + return Some(Ordering::Greater); + } let max = (1i128 << bit_width) - 1; - c < 0 || c > max + if c > max { + return Some(Ordering::Less); + } + None } -fn compare_eq_constant( +/// Reduce `lane op constant` to a constant boolean when every packed lane has the same +/// ordering relation to `constant`. +#[inline] +fn reduce_constant(relation: Ordering, operator: CompareOperator) -> bool { + match (operator, relation) { + (CompareOperator::Eq, _) => false, + (CompareOperator::NotEq, _) => true, + (CompareOperator::Lt, Ordering::Less) => true, + (CompareOperator::Lt, _) => false, + (CompareOperator::Lte, Ordering::Less | Ordering::Equal) => true, + (CompareOperator::Lte, _) => false, + (CompareOperator::Gt, Ordering::Greater) => true, + (CompareOperator::Gt, _) => false, + (CompareOperator::Gte, Ordering::Greater | Ordering::Equal) => true, + (CompareOperator::Gte, _) => false, + } +} + +fn compare_constant( lhs: ArrayView<'_, BitPacked>, constant: T, rhs_nullability: Nullability, @@ -100,22 +124,19 @@ fn compare_eq_constant( where T: NativePType + ToPrimitive, { - if !constant_out_of_packable_range(constant, lhs.bit_width()) { - // Constant fits in the packable range, so at least some packed lanes could match - // it. The fast path doesn't apply. + let Some(relation) = constant_relation_to_packed(constant, lhs.bit_width()) else { + // In-range constants currently fall through to the canonical path. See + // `docs/inrange_compare_plan.md` for the plan to accelerate Lt/Lte/Gt/Gte here. return Ok(None); - } + }; - // Every packed lane disagrees with `constant`. `Eq` is `false` everywhere, `NotEq` is - // `true` everywhere — modulo patches (which carry the real value) and validity. - let packed_lane_result = matches!(operator, CompareOperator::NotEq); + let packed_lane_result = reduce_constant(relation, operator); let len = lhs.len(); let validity = lhs.validity()?; let patches = lhs.patches(); let result_nullability = lhs.dtype().nullability() | rhs_nullability; - // Hot path: no patches, no nulls — every position has the same boolean result, so we - // return a `ConstantArray` in `O(1)`. + // Hot path: no patches, no nulls — every position has the same boolean result. if patches.is_none() && validity.no_nulls() { return Ok(Some( ConstantArray::new(Scalar::bool(packed_lane_result, result_nullability), len) @@ -131,7 +152,7 @@ where let patches_offset = patches.offset(); match_each_unsigned_integer_ptype!(indices.ptype(), |I| { - apply_eq_patches::( + apply_patches::( &mut bits, indices.as_slice::(), values.as_slice::(), @@ -146,7 +167,7 @@ where Ok(Some(BoolArray::new(bits.freeze(), validity).into_array())) } -fn apply_eq_patches( +fn apply_patches( bits: &mut BitBufferMut, indices: &[I], values: &[T], @@ -157,11 +178,13 @@ fn apply_eq_patches( T: NativePType, I: IntegerPType, { - // Only Eq/NotEq reach this point (see `CompareKernel::compare`). let cmp: fn(T, T) -> bool = match operator { CompareOperator::Eq => |l, r| NativeValue(l) == NativeValue(r), CompareOperator::NotEq => |l, r| NativeValue(l) != NativeValue(r), - _ => unreachable!("only Eq/NotEq reach the bitpacked compare-constant fast path"), + CompareOperator::Lt => |l, r| NativeValue(l) < NativeValue(r), + CompareOperator::Lte => |l, r| NativeValue(l) <= NativeValue(r), + CompareOperator::Gt => |l, r| NativeValue(l) > NativeValue(r), + CompareOperator::Gte => |l, r| NativeValue(l) >= NativeValue(r), }; let len = bits.len(); @@ -184,6 +207,7 @@ fn apply_eq_patches( #[cfg(test)] mod tests { + use std::cmp::Ordering; use std::sync::LazyLock; use rstest::rstest; @@ -205,26 +229,36 @@ mod tests { use crate::BitPackedArrayExt; use crate::BitPackedData; - use crate::bitpacking::compute::compare::constant_out_of_packable_range; + use crate::bitpacking::compute::compare::constant_relation_to_packed; static SESSION: LazyLock = LazyLock::new(|| VortexSession::empty().with::()); #[test] fn range_check_is_o1() { - // 8-bit packable range is [0, 255]. - assert!(constant_out_of_packable_range::(256, 8)); - assert!(constant_out_of_packable_range::(-1, 8)); - assert!(!constant_out_of_packable_range::(255, 8)); - assert!(!constant_out_of_packable_range::(0, 8)); + // For an 8-bit packable range of [0, 255]: + assert_eq!( + constant_relation_to_packed::(256, 8), + Some(Ordering::Less) + ); + assert_eq!( + constant_relation_to_packed::(-1, 8), + Some(Ordering::Greater) + ); + assert_eq!(constant_relation_to_packed::(255, 8), None); + assert_eq!(constant_relation_to_packed::(0, 8), None); } #[rstest] #[case(Operator::Eq, false)] #[case(Operator::NotEq, true)] - fn eq_above_range_no_patches(#[case] op: Operator, #[case] expected: bool) -> VortexResult<()> { + #[case(Operator::Lt, true)] + #[case(Operator::Lte, true)] + #[case(Operator::Gt, false)] + #[case(Operator::Gte, false)] + fn above_range_no_patches(#[case] op: Operator, #[case] expected: bool) -> VortexResult<()> { let mut ctx = SESSION.create_execution_ctx(); - // 999 is above the 8-bit packable range; no packed lane matches. + // 999 is above the 8-bit packable range; every packed lane is < 999. let packed = BitPackedData::encode( &PrimitiveArray::from_iter([1u32, 2, 3, 250, 100]).into_array(), 8, @@ -240,8 +274,8 @@ mod tests { #[rstest] #[case(Operator::Eq)] - #[case(Operator::NotEq)] - fn eq_above_range_with_patches(#[case] op: Operator) -> VortexResult<()> { + #[case(Operator::Lt)] + fn above_range_with_patches(#[case] op: Operator) -> VortexResult<()> { // bit_width=4 packable range is [0, 15]; out-of-range values become patches. let mut ctx = SESSION.create_execution_ctx(); let values = buffer![1u32, 5, 1000, 7, 1000, 14]; @@ -255,40 +289,39 @@ mod tests { .binary(ConstantArray::new(constant, values.len()).into_array(), op)? .execute::(&mut ctx)?; - let expected: Vec = values - .iter() - .map(|v| match op { - Operator::Eq => *v == constant, - Operator::NotEq => *v != constant, - _ => unreachable!(), - }) - .collect(); - assert_arrays_eq!(result, BoolArray::from_iter(expected)); + let cmp: fn(u32, u32) -> bool = match op { + Operator::Eq => |l, r| l == r, + Operator::Lt => |l, r| l < r, + _ => unreachable!(), + }; + assert_arrays_eq!( + result, + BoolArray::from_iter(values.iter().map(|v| cmp(*v, constant))) + ); Ok(()) } #[test] - fn ordering_falls_through() -> VortexResult<()> { - // Ordering ops aren't accelerated yet; they go through the canonical path and - // must still return a correct answer. + fn below_range_signed() -> VortexResult<()> { + // Packed signed values are non-negative, so -5 is always less than every lane. let mut ctx = SESSION.create_execution_ctx(); - let values = [1u32, 2, 3, 250, 100]; - let packed = - BitPackedData::encode(&PrimitiveArray::from_iter(values).into_array(), 8, &mut ctx)?; + let packed = BitPackedData::encode( + &PrimitiveArray::from_iter([0i32, 7, 15, 3, 12]).into_array(), + 4, + &mut ctx, + )?; + let len = packed.len(); let result = packed .into_array() - .binary(ConstantArray::new(999u32, 5).into_array(), Operator::Lt)? + .binary(ConstantArray::new(-5i32, len).into_array(), Operator::Gt)? .execute::(&mut ctx)?; - assert_arrays_eq!( - result, - BoolArray::from_iter(values.iter().map(|v| *v < 999)) - ); + assert_arrays_eq!(result, BoolArray::from_iter([true; 5])); Ok(()) } #[test] - fn eq_in_range_falls_through() -> VortexResult<()> { - // In-range constants must defer to the canonical path. + fn in_range_falls_through() -> VortexResult<()> { + // 100 is in the 8-bit packable range; fall through to the canonical path. let mut ctx = SESSION.create_execution_ctx(); let values = [1u32, 2, 3, 250, 100]; let packed = @@ -305,7 +338,7 @@ mod tests { } #[test] - fn eq_nullable_constant() -> VortexResult<()> { + fn nullable_constant() -> VortexResult<()> { let mut ctx = SESSION.create_execution_ctx(); let packed = BitPackedData::encode( &PrimitiveArray::from_iter([1u32, 2, 3]).into_array(),