From 9a56591e6d15cf58466c7b9f1842af559bc8537a Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 22 May 2026 10:50:55 +0100 Subject: [PATCH 1/7] Add bit-packed cast benchmark Signed-off-by: Joe Isaacs --- encodings/fastlanes/Cargo.toml | 5 + encodings/fastlanes/benches/cast_bitpacked.rs | 128 ++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 encodings/fastlanes/benches/cast_bitpacked.rs diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml index 7b4cb5da069..08c96c481d7 100644 --- a/encodings/fastlanes/Cargo.toml +++ b/encodings/fastlanes/Cargo.toml @@ -63,3 +63,8 @@ required-features = ["_test-harness"] [[bench]] name = "bitpack_compare" harness = false + +[[bench]] +name = "cast_bitpacked" +harness = false +required-features = ["_test-harness"] diff --git a/encodings/fastlanes/benches/cast_bitpacked.rs b/encodings/fastlanes/benches/cast_bitpacked.rs new file mode 100644 index 00000000000..ad4ee7e4e1a --- /dev/null +++ b/encodings/fastlanes/benches/cast_bitpacked.rs @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Benchmarks the cost of widening a bit-packed narrow integer column to a wider integer type on +//! decompression (e.g. `u16 -> u32`). +//! +//! Two strategies are compared: +//! +//! - `cast_execute`: the real public path, `array.cast(u32).execute()`. +//! - `canonicalize_then_cast`: explicitly canonicalizes to a full-length `u16` `PrimitiveArray` and +//! then casts that to `u32`. + +#![expect(clippy::unwrap_used)] + +use std::sync::LazyLock; + +use divan::Bencher; +use rand::RngExt; +use rand::SeedableRng; +use rand::prelude::StdRng; +use vortex_array::ArrayRef; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::ChunkedArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::session::ArraySession; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_error::VortexExpect; +use vortex_fastlanes::BitPackedArray; +use vortex_fastlanes::BitPackedData; +use vortex_session::VortexSession; + +fn main() { + divan::main(); +} + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +const U32: DType = DType::Primitive(PType::U32, Nullability::NonNullable); + +// (chunk_len, chunk_count, fraction_patched) +const ARGS: &[(usize, usize, f64)] = &[ + (65_536, 1, 0.00), + (65_536, 1, 0.01), + (65_536, 16, 0.00), + (65_536, 16, 0.01), + (1_048_576, 1, 0.00), + (1_048_576, 1, 0.01), +]; + +/// Build a single bit-packed `u16` chunk. Most values fit in `bit_width` bits; `fraction_patched` +/// of them are large enough to require patches. +fn make_chunk(rng: &mut StdRng, len: usize, fraction_patched: f64) -> BitPackedArray { + let bit_width = 9u8; + let cap = 1u16 << bit_width; + let values = (0..len) + .map(|_| { + if rng.random_bool(fraction_patched) { + rng.random_range(cap..u16::MAX) + } else { + rng.random_range(0..cap) + } + }) + .collect::>(); + let array = PrimitiveArray::new(values, Validity::NonNullable); + BitPackedData::encode( + &array.into_array(), + bit_width, + &mut SESSION.create_execution_ctx(), + ) + .vortex_expect("encode") +} + +fn make_chunks(len: usize, count: usize, fraction_patched: f64) -> Vec { + let mut rng = StdRng::seed_from_u64(0); + (0..count) + .map(|_| make_chunk(&mut rng, len, fraction_patched)) + .collect() +} + +fn single(chunks: &[BitPackedArray]) -> ArrayRef { + if chunks.len() == 1 { + chunks[0].clone().into_array() + } else { + ChunkedArray::from_iter(chunks.iter().map(|c| c.clone().into_array())).into_array() + } +} + +/// The real public path: `array.cast(u32).execute()`. +#[cfg(not(codspeed))] +#[divan::bench(args = ARGS)] +fn cast_execute(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) { + let chunks = make_chunks(chunk_len, chunk_count, frac); + bencher + .with_inputs(|| (single(&chunks), SESSION.create_execution_ctx())) + .bench_refs(|(array, ctx)| { + array + .clone() + .cast(U32) + .unwrap() + .execute::(ctx) + .unwrap() + }); +} + +/// Baseline: canonicalize to a full-length `u16` array, then cast that primitive array to `u32`. +#[cfg(not(codspeed))] +#[divan::bench(args = ARGS)] +fn canonicalize_then_cast(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) { + let chunks = make_chunks(chunk_len, chunk_count, frac); + bencher + .with_inputs(|| (single(&chunks), SESSION.create_execution_ctx())) + .bench_refs(|(array, ctx)| { + let canonical = array.clone().execute::(ctx).unwrap(); + canonical + .into_array() + .cast(U32) + .unwrap() + .execute::(ctx) + .unwrap() + }); +} From 3c153b9ae03a3fafa9874404acb80cbe25e86e81 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 22 May 2026 11:02:32 +0100 Subject: [PATCH 2/7] u Signed-off-by: Joe Isaacs --- encodings/fastlanes/benches/cast_bitpacked.rs | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/encodings/fastlanes/benches/cast_bitpacked.rs b/encodings/fastlanes/benches/cast_bitpacked.rs index ad4ee7e4e1a..90a6d5854fe 100644 --- a/encodings/fastlanes/benches/cast_bitpacked.rs +++ b/encodings/fastlanes/benches/cast_bitpacked.rs @@ -92,8 +92,6 @@ fn single(chunks: &[BitPackedArray]) -> ArrayRef { } } -/// The real public path: `array.cast(u32).execute()`. -#[cfg(not(codspeed))] #[divan::bench(args = ARGS)] fn cast_execute(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) { let chunks = make_chunks(chunk_len, chunk_count, frac); @@ -108,21 +106,3 @@ fn cast_execute(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, .unwrap() }); } - -/// Baseline: canonicalize to a full-length `u16` array, then cast that primitive array to `u32`. -#[cfg(not(codspeed))] -#[divan::bench(args = ARGS)] -fn canonicalize_then_cast(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) { - let chunks = make_chunks(chunk_len, chunk_count, frac); - bencher - .with_inputs(|| (single(&chunks), SESSION.create_execution_ctx())) - .bench_refs(|(array, ctx)| { - let canonical = array.clone().execute::(ctx).unwrap(); - canonical - .into_array() - .cast(U32) - .unwrap() - .execute::(ctx) - .unwrap() - }); -} From 468cc1c26dc55099598c3fa9716343221e7c45c1 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 22 May 2026 11:12:48 +0100 Subject: [PATCH 3/7] Add bit-packed widening cast pushdown Signed-off-by: Joe Isaacs --- .../bitpacking/array/bitpack_decompress.rs | 63 +++++++- .../src/bitpacking/array/unpack_iter.rs | 70 ++++++++- .../fastlanes/src/bitpacking/compute/cast.rs | 138 +++++++++++++++++- .../fastlanes/src/for/array/for_decompress.rs | 2 +- 4 files changed, 256 insertions(+), 17 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs index 6570d26bbc9..881fbb27ebe 100644 --- a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs +++ b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs @@ -79,12 +79,62 @@ pub(crate) fn unpack_into_primitive_builder( Ok(()) } +/// Unpack a bit-packed array of physical type `F` directly into a wider primitive type `T`, +/// casting each value during decompression. +/// +/// This is the "cast pushdown" path: rather than canonicalizing to a full-length `F`-typed +/// `PrimitiveArray` and then casting it to `T` (two full-length buffers, with the `F` intermediate +/// written out to RAM), we unpack each 1024-element FastLanes chunk into a small cache-resident +/// scratch buffer and cast-copy straight into the `T` output. Only the `T` output buffer is +/// allocated and touched in RAM. +/// +/// The caller must ensure all valid values are representable in `T` (it is intended for widening +/// casts such as `u16 -> u32`); narrowing or sign-changing casts are not validated here. +pub(crate) fn unpack_and_cast_into_builder( + array: ArrayView<'_, BitPacked>, + builder: &mut PrimitiveBuilder, + ctx: &mut ExecutionCtx, +) -> VortexResult<()> +where + F: BitPackedUnpack + AsPrimitive, + T: NativePType, +{ + if array.is_empty() { + return Ok(()); + } + + let len = array.len(); + let mut uninit_range = builder.uninit_range(len); + + // SAFETY: We initialize all `len` values below via `decode_map_into` and the patch loop. + unsafe { + uninit_range.append_mask(array.validity()?.execute_mask(len, ctx)?); + } + + // SAFETY: `decode_map_into` writes a value to every slot in this range. + let uninit_slice = unsafe { uninit_range.slice_uninit_mut(0, len) }; + + let mut chunks = array.unpacked_chunks::()?; + chunks.decode_map_into(uninit_slice, |v: F| v.as_()); + + if let Some(patches) = array.patches() { + apply_patches_to_uninit_range_map(&mut uninit_range, &patches, ctx, |v: F| v.as_())?; + } + + // SAFETY: A correct validity mask of `len` values was set via `append_mask`, and the same + // number of values was initialized via `decode_map_into` (and overwritten by patches). + unsafe { + uninit_range.finish(); + } + Ok(()) +} + pub fn apply_patches_to_uninit_range( dst: &mut UninitRange, patches: &Patches, ctx: &mut ExecutionCtx, ) -> VortexResult<()> { - apply_patches_to_uninit_range_fn(dst, patches, ctx, |x| x) + apply_patches_to_uninit_range_fn(dst, patches, ctx, |v: T| v) } pub fn apply_patches_to_uninit_range_fn T>( @@ -92,13 +142,22 @@ pub fn apply_patches_to_uninit_range_fn T>( patches: &Patches, ctx: &mut ExecutionCtx, f: F, +) -> VortexResult<()> { + apply_patches_to_uninit_range_map(dst, patches, ctx, f) +} + +pub(crate) fn apply_patches_to_uninit_range_map T>( + dst: &mut UninitRange, + patches: &Patches, + ctx: &mut ExecutionCtx, + f: F, ) -> VortexResult<()> { assert_eq!(patches.array_len(), dst.len()); let indices = patches.indices().clone().execute::(ctx)?; let values = patches.values().clone().execute::(ctx)?; assert!(values.all_valid(ctx)?, "Patch values must be all valid"); - let values = values.as_slice::(); + let values = values.as_slice::(); match_each_unsigned_integer_ptype!(indices.ptype(), |P| { for (index, &value) in indices.as_slice::

().iter().zip_eq(values) { diff --git a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs index 2f7187d26f1..c3476e5daf9 100644 --- a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs +++ b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs @@ -190,8 +190,7 @@ impl> UnpackedChunks { if let Some(initial) = self.initial() { local_idx = initial.len(); - // TODO(connor): use `maybe_uninit_write_slice` feature when it gets stabilized. - // https://github.com/rust-lang/rust/issues/79995 + // TODO(connor): use maybe_uninit_write_slice when it gets stabilized. // SAFETY: &[T] and &[MaybeUninit] have the same layout. let init_initial: &[MaybeUninit] = unsafe { mem::transmute(initial) }; output[..local_idx].copy_from_slice(init_initial); @@ -202,14 +201,65 @@ impl> UnpackedChunks { // Handle trailing partial chunk if present if let Some(trailer) = self.trailer() { - // TODO(connor): use `maybe_uninit_write_slice` feature when it gets stabilized. - // https://github.com/rust-lang/rust/issues/79995 + // TODO(connor): use maybe_uninit_write_slice when it gets stabilized. // SAFETY: &[T] and &[MaybeUninit] have the same layout. let init_trailer: &[MaybeUninit] = unsafe { mem::transmute(trailer) }; output[local_idx..][..init_trailer.len()].copy_from_slice(init_trailer); } } + /// Decode all chunks (initial, full, and trailer), mapping each unpacked value through f. + pub(crate) fn decode_map_into( + &mut self, + output: &mut [MaybeUninit], + mut f: impl FnMut(T) -> U, + ) { + debug_assert_eq!(output.len(), self.len); + let mut local_idx = 0; + + if let Some(initial) = self.initial() { + let chunk_len = initial.len(); + write_map(initial, &mut output[..chunk_len], &mut f); + local_idx += chunk_len; + } + + if self.num_chunks != 1 { + let first_chunk_is_sliced = self.first_chunk_is_sliced(); + let last_chunk_is_sliced = self.last_chunk_is_sliced(); + let full_chunks_range = + (first_chunk_is_sliced as usize)..(self.num_chunks - last_chunk_is_sliced as usize); + + let packed_slice: &[T::Physical] = buffer_as_slice(&self.packed); + let elems_per_chunk = self.elems_per_chunk(); + for i in full_chunks_range { + let chunk = &packed_slice[i * elems_per_chunk..][..elems_per_chunk]; + unsafe { + let dst: &mut [T::Physical] = mem::transmute(&mut self.buffer[..]); + self.strategy.unpack_chunk(self.bit_width, chunk, dst); + let unpacked: &[T] = mem::transmute(&self.buffer[..]); + write_map( + unpacked, + &mut output[local_idx..local_idx + CHUNK_SIZE], + &mut f, + ); + } + local_idx += CHUNK_SIZE; + } + } + + if let Some(trailer) = self.trailer() { + let chunk_len = trailer.len(); + write_map( + trailer, + &mut output[local_idx..local_idx + chunk_len], + &mut f, + ); + local_idx += chunk_len; + } + + debug_assert_eq!(local_idx, self.len); + } + /// Unpack full chunks into output range starting at the given index. /// Returns the next local index to write to. fn decode_full_chunks_into_at( @@ -217,14 +267,12 @@ impl> UnpackedChunks { output: &mut [MaybeUninit], start_idx: usize, ) -> usize { - // If there's only one chunk it has been handled already by `initial` method + // If there is only one chunk it has been handled already by initial. if self.num_chunks == 1 { - // Return the start_idx since initial already wrote everything. return start_idx; } let first_chunk_is_sliced = self.first_chunk_is_sliced(); - let last_chunk_is_sliced = self.last_chunk_is_sliced(); let full_chunks_range = (first_chunk_is_sliced as usize)..(self.num_chunks - last_chunk_is_sliced as usize); @@ -238,7 +286,7 @@ impl> UnpackedChunks { unsafe { let uninit_dst = &mut output[local_idx..local_idx + CHUNK_SIZE]; - // SAFETY: &[T] and &[MaybeUninit] have the same layout + // SAFETY: &[T] and &[MaybeUninit] have the same layout. let dst: &mut [T::Physical] = mem::transmute(uninit_dst); self.strategy.unpack_chunk(self.bit_width, chunk, dst); } @@ -340,6 +388,12 @@ fn buffer_as_slice(buffer: &ByteBuffer) -> &[T] { unsafe { std::slice::from_raw_parts(packed_ptr, packed_len) } } +fn write_map(src: &[T], dst: &mut [MaybeUninit], f: &mut impl FnMut(T) -> U) { + for (dst, &src) in dst.iter_mut().zip(src.iter()) { + dst.write(f(src)); + } +} + pub trait BitPacked: PhysicalPType {} impl BitPacked for i8 {} diff --git a/encodings/fastlanes/src/bitpacking/compute/cast.rs b/encodings/fastlanes/src/bitpacking/compute/cast.rs index 3cb810e0442..1b5346c48f7 100644 --- a/encodings/fastlanes/src/bitpacking/compute/cast.rs +++ b/encodings/fastlanes/src/bitpacking/compute/cast.rs @@ -5,8 +5,11 @@ use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; +use vortex_array::builders::PrimitiveBuilder; use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::DType; +use vortex_array::dtype::PType; +use vortex_array::match_each_integer_ptype; use vortex_array::scalar_fn::fns::cast::CastKernel; use vortex_array::scalar_fn::fns::cast::CastReduce; use vortex_array::validity::Validity; @@ -14,6 +17,18 @@ use vortex_error::VortexResult; use crate::bitpacking::BitPacked; use crate::bitpacking::array::BitPackedArrayExt; +use crate::bitpacking::array::bitpack_decompress::unpack_and_cast_into_builder; + +/// Returns `true` if casting `src` to `tgt` is a widening integer cast for which every value a +/// bit-packed array can hold is guaranteed to be representable in `tgt` (so no per-value bounds +/// check is needed). This holds when `tgt` is strictly wider and either the source is unsigned +/// (always non-negative, fits in any wider type) or the target is also signed (sign-extension). +fn is_widening_int_cast(src: PType, tgt: PType) -> bool { + src.is_int() + && tgt.is_int() + && tgt.byte_width() > src.byte_width() + && (src.is_unsigned_int() || tgt.is_signed_int()) +} fn build_with_validity( array: ArrayView<'_, BitPacked>, @@ -56,14 +71,41 @@ impl CastKernel for BitPacked { dtype: &DType, ctx: &mut ExecutionCtx, ) -> VortexResult> { - if !array.dtype().eq_ignore_nullability(dtype) { + // Nullability-only change: keep the values bit-packed, just adjust validity. + if array.dtype().eq_ignore_nullability(dtype) { + let new_validity = + array + .validity()? + .cast_nullability(dtype.nullability(), array.len(), ctx)?; + return build_with_validity(array, dtype, new_validity).map(Some); + } + + // Widening integer cast: unpack each FastLanes chunk into a cache-resident scratch buffer + // and cast-copy straight into the wide output, avoiding a full-length intermediate buffer + // and the generic cast kernel's bounds-check scan (unnecessary when widening). + let DType::Primitive(tgt, tgt_nullability) = dtype else { + return Ok(None); + }; + let (tgt, tgt_nullability) = (*tgt, *tgt_nullability); + let src = array.dtype().as_ptype(); + if !is_widening_int_cast(src, tgt) { return Ok(None); } - let new_validity = - array - .validity()? - .cast_nullability(dtype.nullability(), array.len(), ctx)?; - build_with_validity(array, dtype, new_validity).map(Some) + + // Surface the standard error if a nullable source with nulls is cast to a non-nullable + // type; on success the per-value validity is handled inside the unpack below. + array + .validity()? + .cast_nullability(tgt_nullability, array.len(), ctx)?; + + let result = match_each_integer_ptype!(tgt, |T| { + let mut builder = PrimitiveBuilder::::with_capacity(tgt_nullability, array.len()); + match_each_integer_ptype!(src, |F| { + unpack_and_cast_into_builder::(array, &mut builder, ctx)?; + }); + builder.finish_into_primitive().into_array() + }); + Ok(Some(result)) } } @@ -79,9 +121,12 @@ mod tests { use vortex_array::builtins::ArrayBuiltins; use vortex_array::compute::conformance::cast::test_cast_conformance; use vortex_array::dtype::DType; + use vortex_array::dtype::NativePType; use vortex_array::dtype::Nullability; use vortex_array::dtype::PType; + use vortex_array::match_each_integer_ptype; use vortex_buffer::buffer; + use vortex_error::VortexResult; use crate::BitPackedArray; use crate::BitPackedData; @@ -124,6 +169,87 @@ mod tests { ); } + /// End-to-end check that the real engine path `array.cast(target).execute()` routes through the + /// bit-packed widening pushdown and matches a plain primitive cast over the same values, across + /// every supported integer pair, several chunk-boundary lengths, and a sliced (offset > 0) case. + #[test] + fn test_cast_bitpacked_widening_via_execute() -> VortexResult<()> { + fn values(len: usize) -> PrimitiveArray { + PrimitiveArray::from_iter((0..len).map(|i| { + let value = if i % 17 == 0 { 31 } else { i % 8 }; + ::from_usize(value) + .expect("test values fit every integer ptype") + })) + } + + fn supported(src: PType, tgt: PType) -> bool { + src.is_int() + && tgt.is_int() + && tgt.byte_width() > src.byte_width() + && (src.is_unsigned_int() || tgt.is_signed_int()) + } + + let ptypes = [ + PType::I8, + PType::I16, + PType::I32, + PType::I64, + PType::U8, + PType::U16, + PType::U32, + PType::U64, + ]; + // Lengths exercise empty, sub-chunk, exact chunk, chunk+1, and multi-chunk-with-trailer. + let lengths = [0, 1, 7, 1023, 1024, 1025, 2051]; + + for src in ptypes { + for tgt in ptypes { + if !supported(src, tgt) { + continue; + } + + for len in lengths { + let source = match_each_integer_ptype!(src, |S| { values::(len) }); + let source_ref = source.into_array(); + let target = DType::Primitive(tgt, Nullability::NonNullable); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + + // Reference: plain primitive cast of the same values. + let reference = source_ref + .clone() + .cast(target.clone())? + .execute::(&mut ctx)?; + + // Candidate: bit-pack, then cast through the real engine. This dispatches to + // `BitPacked`'s `CastKernel` widening pushdown. + let packed = bp(&source_ref, 3).into_array(); + let casted = packed + .cast(target.clone())? + .execute::(&mut ctx)?; + assert_arrays_eq!(casted, reference); + + // Also exercise the sliced/offset path (offset > 0, trailer present). + if len >= 4 { + let lo = len / 4; + let hi = len - len / 4; + let sliced = bp(&source_ref, 3).into_array().slice(lo..hi)?; + let casted = sliced + .cast(target.clone())? + .execute::(&mut ctx)?; + let reference = source_ref + .clone() + .slice(lo..hi)? + .cast(target.clone())? + .execute::(&mut ctx)?; + assert_arrays_eq!(casted, reference); + } + } + } + } + + Ok(()) + } + #[rstest] #[case(bp(&buffer![0u8, 10, 20, 30, 40, 50, 60, 63].into_array(), 6))] #[case(bp(&buffer![0u16, 100, 200, 300, 400, 500].into_array(), 9))] diff --git a/encodings/fastlanes/src/for/array/for_decompress.rs b/encodings/fastlanes/src/for/array/for_decompress.rs index a26d6e9053e..6b43ac85c24 100644 --- a/encodings/fastlanes/src/for/array/for_decompress.rs +++ b/encodings/fastlanes/src/for/array/for_decompress.rs @@ -123,7 +123,7 @@ pub(crate) fn fused_decompress< &mut uninit_range, &patches, ctx, - |v| v.wrapping_add(&ref_), + |v: T| v.wrapping_add(&ref_), )?; }; From 56d3c9ee567a51a0cbae5431f2c9311c1ed7044c Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 22 May 2026 13:19:06 +0100 Subject: [PATCH 4/7] fix Signed-off-by: Joe Isaacs --- .../bitpacking/array/bitpack_decompress.rs | 94 +++++-------------- .../fastlanes/src/bitpacking/compute/cast.rs | 5 +- .../fastlanes/src/bitpacking/vtable/mod.rs | 5 +- .../fastlanes/src/for/array/for_decompress.rs | 2 +- 4 files changed, 29 insertions(+), 77 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs index 881fbb27ebe..6c7da310d95 100644 --- a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs +++ b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs @@ -37,67 +37,31 @@ pub fn unpack_primitive_array( ctx: &mut ExecutionCtx, ) -> VortexResult { let mut builder = PrimitiveBuilder::with_capacity(array.dtype().nullability(), array.len()); - unpack_into_primitive_builder::(array, &mut builder, ctx)?; + unpack_map_into_builder::(array, &mut builder, ctx, |v| v)?; assert_eq!(builder.len(), array.len()); Ok(builder.finish_into_primitive()) } -pub(crate) fn unpack_into_primitive_builder( - array: ArrayView<'_, BitPacked>, - // TODO(ngates): do we want to use fastlanes alignment for this buffer? - builder: &mut PrimitiveBuilder, - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - // If the array is empty, then we don't need to add anything to the builder. - if array.is_empty() { - return Ok(()); - } - - let mut uninit_range = builder.uninit_range(array.len()); - - // SAFETY: We later initialize the the uninitialized range of values with `copy_from_slice`. - unsafe { - // Append a dense null Mask. - uninit_range.append_mask(array.validity()?.execute_mask(array.as_ref().len(), ctx)?); - } - - // SAFETY: `decode_into` will initialize all values in this range. - let uninit_slice = unsafe { uninit_range.slice_uninit_mut(0, array.len()) }; - - let mut bit_packed_iter = array.unpacked_chunks()?; - bit_packed_iter.decode_into(uninit_slice); - - if let Some(patches) = array.patches() { - apply_patches_to_uninit_range(&mut uninit_range, &patches, ctx)?; - }; - - // SAFETY: We have set a correct validity mask via `append_mask` with `array.len()` values and - // initialized the same number of values needed via `decode_into`. - unsafe { - uninit_range.finish(); - } - Ok(()) -} - -/// Unpack a bit-packed array of physical type `F` directly into a wider primitive type `T`, -/// casting each value during decompression. +/// Unpack a bit-packed array of physical type `F` into a `PrimitiveBuilder`, applying `map` +/// to each value during decompression. /// -/// This is the "cast pushdown" path: rather than canonicalizing to a full-length `F`-typed -/// `PrimitiveArray` and then casting it to `T` (two full-length buffers, with the `F` intermediate -/// written out to RAM), we unpack each 1024-element FastLanes chunk into a small cache-resident -/// scratch buffer and cast-copy straight into the `T` output. Only the `T` output buffer is -/// allocated and touched in RAM. +/// Pass `|v| v` (with `F = T`) for plain decompression, `|v: F| v.as_()` for a widening cast, +/// or any other element-wise transform. Each 1024-element FastLanes chunk is unpacked into a +/// cache-resident scratch buffer and written through `map` directly into the `T` output, so when +/// `F != T` no full-length `F`-typed intermediate is materialized. /// -/// The caller must ensure all valid values are representable in `T` (it is intended for widening -/// casts such as `u16 -> u32`); narrowing or sign-changing casts are not validated here. -pub(crate) fn unpack_and_cast_into_builder( +/// The caller must ensure that every valid source value is representable in `T` under `map`; no +/// per-value bounds check is performed. +pub(crate) fn unpack_map_into_builder( array: ArrayView<'_, BitPacked>, builder: &mut PrimitiveBuilder, ctx: &mut ExecutionCtx, + map: M, ) -> VortexResult<()> where - F: BitPackedUnpack + AsPrimitive, + F: BitPackedUnpack, T: NativePType, + M: Fn(F) -> T, { if array.is_empty() { return Ok(()); @@ -115,10 +79,10 @@ where let uninit_slice = unsafe { uninit_range.slice_uninit_mut(0, len) }; let mut chunks = array.unpacked_chunks::()?; - chunks.decode_map_into(uninit_slice, |v: F| v.as_()); + chunks.decode_map_into(uninit_slice, &map); if let Some(patches) = array.patches() { - apply_patches_to_uninit_range_map(&mut uninit_range, &patches, ctx, |v: F| v.as_())?; + apply_patches_to_uninit_range(&mut uninit_range, &patches, ctx, &map)?; } // SAFETY: A correct validity mask of `len` values was set via `append_mask`, and the same @@ -129,24 +93,7 @@ where Ok(()) } -pub fn apply_patches_to_uninit_range( - dst: &mut UninitRange, - patches: &Patches, - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - apply_patches_to_uninit_range_fn(dst, patches, ctx, |v: T| v) -} - -pub fn apply_patches_to_uninit_range_fn T>( - dst: &mut UninitRange, - patches: &Patches, - ctx: &mut ExecutionCtx, - f: F, -) -> VortexResult<()> { - apply_patches_to_uninit_range_map(dst, patches, ctx, f) -} - -pub(crate) fn apply_patches_to_uninit_range_map T>( +pub fn apply_patches_to_uninit_range T>( dst: &mut UninitRange, patches: &Patches, ctx: &mut ExecutionCtx, @@ -394,10 +341,11 @@ mod tests { let bitpacked = encode(&empty, 0); let mut builder = PrimitiveBuilder::::new(Nullability::NonNullable); - unpack_into_primitive_builder( + unpack_map_into_builder::<_, _, _>( bitpacked.as_view(), &mut builder, &mut SESSION.create_execution_ctx(), + |v| v, )?; let result = builder.finish_into_primitive(); @@ -422,10 +370,11 @@ mod tests { // Unpack into a new builder. let mut builder = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 5); - unpack_into_primitive_builder( + unpack_map_into_builder::<_, _, _>( bitpacked.as_view(), &mut builder, &mut SESSION.create_execution_ctx(), + |v| v, )?; let result = builder.finish_into_primitive(); @@ -459,10 +408,11 @@ mod tests { // Unpack into a new builder. let mut builder = PrimitiveBuilder::::with_capacity(Nullability::NonNullable, 100); - unpack_into_primitive_builder( + unpack_map_into_builder::<_, _, _>( bitpacked.as_view(), &mut builder, &mut SESSION.create_execution_ctx(), + |v| v, )?; let result = builder.finish_into_primitive(); diff --git a/encodings/fastlanes/src/bitpacking/compute/cast.rs b/encodings/fastlanes/src/bitpacking/compute/cast.rs index 1b5346c48f7..10060eb57e2 100644 --- a/encodings/fastlanes/src/bitpacking/compute/cast.rs +++ b/encodings/fastlanes/src/bitpacking/compute/cast.rs @@ -1,6 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use num_traits::AsPrimitive; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; @@ -17,7 +18,7 @@ use vortex_error::VortexResult; use crate::bitpacking::BitPacked; use crate::bitpacking::array::BitPackedArrayExt; -use crate::bitpacking::array::bitpack_decompress::unpack_and_cast_into_builder; +use crate::bitpacking::array::bitpack_decompress::unpack_map_into_builder; /// Returns `true` if casting `src` to `tgt` is a widening integer cast for which every value a /// bit-packed array can hold is guaranteed to be representable in `tgt` (so no per-value bounds @@ -101,7 +102,7 @@ impl CastKernel for BitPacked { let result = match_each_integer_ptype!(tgt, |T| { let mut builder = PrimitiveBuilder::::with_capacity(tgt_nullability, array.len()); match_each_integer_ptype!(src, |F| { - unpack_and_cast_into_builder::(array, &mut builder, ctx)?; + unpack_map_into_builder::(array, &mut builder, ctx, |v: F| v.as_())?; }); builder.finish_into_primitive().into_array() }); diff --git a/encodings/fastlanes/src/bitpacking/vtable/mod.rs b/encodings/fastlanes/src/bitpacking/vtable/mod.rs index 912dd4ff44b..9c6a0e1221b 100644 --- a/encodings/fastlanes/src/bitpacking/vtable/mod.rs +++ b/encodings/fastlanes/src/bitpacking/vtable/mod.rs @@ -44,7 +44,7 @@ use crate::BitPackedArrayExt; use crate::BitPackedData; use crate::BitPackedDataParts; use crate::bitpack_decompress::unpack_array; -use crate::bitpack_decompress::unpack_into_primitive_builder; +use crate::bitpack_decompress::unpack_map_into_builder; use crate::bitpacking::array::BitPackedSlots; use crate::bitpacking::array::BitPackedSlotsView; use crate::bitpacking::array::PATCH_SLOTS; @@ -239,13 +239,14 @@ impl VTable for BitPacked { ctx: &mut ExecutionCtx, ) -> VortexResult<()> { match_each_integer_ptype!(array.dtype().as_ptype(), |T| { - unpack_into_primitive_builder::( + unpack_map_into_builder::( array, builder .as_any_mut() .downcast_mut() .vortex_expect("bit packed array must canonicalize into a primitive array"), ctx, + |v| v, ) }) } diff --git a/encodings/fastlanes/src/for/array/for_decompress.rs b/encodings/fastlanes/src/for/array/for_decompress.rs index 6b43ac85c24..073864411fb 100644 --- a/encodings/fastlanes/src/for/array/for_decompress.rs +++ b/encodings/fastlanes/src/for/array/for_decompress.rs @@ -119,7 +119,7 @@ pub(crate) fn fused_decompress< unpacked.decode_into(uninit_slice); if let Some(patches) = bp.patches() { - bitpack_decompress::apply_patches_to_uninit_range_fn( + bitpack_decompress::apply_patches_to_uninit_range( &mut uninit_range, &patches, ctx, From 3875d5318495cd52e19ba549d08fc82b1207df74 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 22 May 2026 13:24:00 +0100 Subject: [PATCH 5/7] fix Signed-off-by: Joe Isaacs --- .../src/bitpacking/array/bitpack_decompress.rs | 16 ++++++++-------- encodings/fastlanes/src/bitpacking/vtable/mod.rs | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs index 6c7da310d95..ec14bdae19b 100644 --- a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs +++ b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs @@ -37,7 +37,7 @@ pub fn unpack_primitive_array( ctx: &mut ExecutionCtx, ) -> VortexResult { let mut builder = PrimitiveBuilder::with_capacity(array.dtype().nullability(), array.len()); - unpack_map_into_builder::(array, &mut builder, ctx, |v| v)?; + unpack_map_into_builder(array, &mut builder, ctx, |v: T| v)?; assert_eq!(builder.len(), array.len()); Ok(builder.finish_into_primitive()) } @@ -93,7 +93,7 @@ where Ok(()) } -pub fn apply_patches_to_uninit_range T>( +pub(crate) fn apply_patches_to_uninit_range T>( dst: &mut UninitRange, patches: &Patches, ctx: &mut ExecutionCtx, @@ -341,11 +341,11 @@ mod tests { let bitpacked = encode(&empty, 0); let mut builder = PrimitiveBuilder::::new(Nullability::NonNullable); - unpack_map_into_builder::<_, _, _>( + unpack_map_into_builder( bitpacked.as_view(), &mut builder, &mut SESSION.create_execution_ctx(), - |v| v, + |v: u32| v, )?; let result = builder.finish_into_primitive(); @@ -370,11 +370,11 @@ mod tests { // Unpack into a new builder. let mut builder = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 5); - unpack_map_into_builder::<_, _, _>( + unpack_map_into_builder( bitpacked.as_view(), &mut builder, &mut SESSION.create_execution_ctx(), - |v| v, + |v: u32| v, )?; let result = builder.finish_into_primitive(); @@ -408,11 +408,11 @@ mod tests { // Unpack into a new builder. let mut builder = PrimitiveBuilder::::with_capacity(Nullability::NonNullable, 100); - unpack_map_into_builder::<_, _, _>( + unpack_map_into_builder( bitpacked.as_view(), &mut builder, &mut SESSION.create_execution_ctx(), - |v| v, + |v: u32| v, )?; let result = builder.finish_into_primitive(); diff --git a/encodings/fastlanes/src/bitpacking/vtable/mod.rs b/encodings/fastlanes/src/bitpacking/vtable/mod.rs index 9c6a0e1221b..bca5dd541e3 100644 --- a/encodings/fastlanes/src/bitpacking/vtable/mod.rs +++ b/encodings/fastlanes/src/bitpacking/vtable/mod.rs @@ -239,14 +239,14 @@ impl VTable for BitPacked { ctx: &mut ExecutionCtx, ) -> VortexResult<()> { match_each_integer_ptype!(array.dtype().as_ptype(), |T| { - unpack_map_into_builder::( + unpack_map_into_builder( array, builder .as_any_mut() .downcast_mut() .vortex_expect("bit packed array must canonicalize into a primitive array"), ctx, - |v| v, + |v: T| v, ) }) } From cd1b4c22e557215acdf442676f5f46218509df71 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 22 May 2026 13:50:13 +0100 Subject: [PATCH 6/7] fix Signed-off-by: Joe Isaacs --- encodings/fastlanes/public-api.lock | 4 ---- 1 file changed, 4 deletions(-) diff --git a/encodings/fastlanes/public-api.lock b/encodings/fastlanes/public-api.lock index 4f0ce3df18c..79c2a0c111d 100644 --- a/encodings/fastlanes/public-api.lock +++ b/encodings/fastlanes/public-api.lock @@ -34,10 +34,6 @@ pub fn vortex_fastlanes::bitpack_compress::gather_patches(&vortex_array::arrays: pub mod vortex_fastlanes::bitpack_decompress -pub fn vortex_fastlanes::bitpack_decompress::apply_patches_to_uninit_range(&mut vortex_array::builders::primitive::UninitRange<'_, T>, &vortex_array::patches::Patches, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> - -pub fn vortex_fastlanes::bitpack_decompress::apply_patches_to_uninit_range_fn T>(&mut vortex_array::builders::primitive::UninitRange<'_, T>, &vortex_array::patches::Patches, &mut vortex_array::executor::ExecutionCtx, F) -> vortex_error::VortexResult<()> - pub fn vortex_fastlanes::bitpack_decompress::count_exceptions(u8, &[usize]) -> usize pub fn vortex_fastlanes::bitpack_decompress::unpack_array(vortex_array::array::view::ArrayView<'_, vortex_fastlanes::BitPacked>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult From 48a473f1212dbce260554c1905de117eb16cc9bd Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 22 May 2026 15:06:18 +0100 Subject: [PATCH 7/7] fix Signed-off-by: Joe Isaacs --- .../bitpacking/array/bitpack_decompress.rs | 59 ++++++++++++++++--- .../src/bitpacking/array/unpack_iter.rs | 12 ++-- .../fastlanes/src/bitpacking/vtable/mod.rs | 5 +- 3 files changed, 57 insertions(+), 19 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs index ec14bdae19b..5b81580d7a6 100644 --- a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs +++ b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::mem::MaybeUninit; + use fastlanes::BitPacking; use itertools::Itertools; use num_traits::AsPrimitive; @@ -21,6 +23,7 @@ use vortex_error::VortexResult; use crate::BitPacked; use crate::BitPackedArrayExt; use crate::unpack_iter::BitPacked as BitPackedUnpack; +use crate::unpack_iter::BitUnpackedChunks; /// Unpacks a bit-packed array into a primitive array. pub fn unpack_array( @@ -37,18 +40,38 @@ pub fn unpack_primitive_array( ctx: &mut ExecutionCtx, ) -> VortexResult { let mut builder = PrimitiveBuilder::with_capacity(array.dtype().nullability(), array.len()); - unpack_map_into_builder(array, &mut builder, ctx, |v: T| v)?; + unpack_into_primitive_builder::(array, &mut builder, ctx)?; assert_eq!(builder.len(), array.len()); Ok(builder.finish_into_primitive()) } +/// Unpack a bit-packed array directly into a same-typed `PrimitiveBuilder`. +/// +/// This is the fast path for ordinary decompression: full FastLanes chunks are unpacked straight +/// into the final output buffer, avoiding the scratch chunk and copy needed by mapped decode. +pub(crate) fn unpack_into_primitive_builder( + array: ArrayView<'_, BitPacked>, + builder: &mut PrimitiveBuilder, + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + unpack_into_builder_with( + array, + builder, + ctx, + |v: T| v, + |chunks, output, _| { + chunks.decode_into(output); + }, + ) +} + /// Unpack a bit-packed array of physical type `F` into a `PrimitiveBuilder`, applying `map` /// to each value during decompression. /// -/// Pass `|v| v` (with `F = T`) for plain decompression, `|v: F| v.as_()` for a widening cast, -/// or any other element-wise transform. Each 1024-element FastLanes chunk is unpacked into a -/// cache-resident scratch buffer and written through `map` directly into the `T` output, so when -/// `F != T` no full-length `F`-typed intermediate is materialized. +/// Use [`unpack_into_primitive_builder`] for same-type plain decompression. This mapped path is +/// for widening casts or other element-wise transforms: each 1024-element FastLanes chunk is +/// unpacked into a cache-resident scratch buffer and written through `map` directly into the `T` +/// output, so when `F != T` no full-length `F`-typed intermediate is materialized. /// /// The caller must ensure that every valid source value is representable in `T` under `map`; no /// per-value bounds check is performed. @@ -62,6 +85,24 @@ where F: BitPackedUnpack, T: NativePType, M: Fn(F) -> T, +{ + unpack_into_builder_with(array, builder, ctx, map, |chunks, output, map| { + chunks.decode_map_into(output, map); + }) +} + +fn unpack_into_builder_with( + array: ArrayView<'_, BitPacked>, + builder: &mut PrimitiveBuilder, + ctx: &mut ExecutionCtx, + map: M, + decode: D, +) -> VortexResult<()> +where + F: BitPackedUnpack, + T: NativePType, + M: Fn(F) -> T, + D: FnOnce(&mut BitUnpackedChunks, &mut [MaybeUninit], &M), { if array.is_empty() { return Ok(()); @@ -70,23 +111,23 @@ where let len = array.len(); let mut uninit_range = builder.uninit_range(len); - // SAFETY: We initialize all `len` values below via `decode_map_into` and the patch loop. + // SAFETY: We initialize all `len` values below via `decode` and the patch loop. unsafe { uninit_range.append_mask(array.validity()?.execute_mask(len, ctx)?); } - // SAFETY: `decode_map_into` writes a value to every slot in this range. + // SAFETY: `decode` writes a value to every slot in this range. let uninit_slice = unsafe { uninit_range.slice_uninit_mut(0, len) }; let mut chunks = array.unpacked_chunks::()?; - chunks.decode_map_into(uninit_slice, &map); + decode(&mut chunks, uninit_slice, &map); if let Some(patches) = array.patches() { apply_patches_to_uninit_range(&mut uninit_range, &patches, ctx, &map)?; } // SAFETY: A correct validity mask of `len` values was set via `append_mask`, and the same - // number of values was initialized via `decode_map_into` (and overwritten by patches). + // number of values was initialized via `decode` (and overwritten by patches). unsafe { uninit_range.finish(); } diff --git a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs index c3476e5daf9..a02dfb6b998 100644 --- a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs +++ b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs @@ -181,12 +181,11 @@ impl> UnpackedChunks { }) } - /// Decode all chunks (initial, full, and trailer) into the output range. - /// This consolidates the logic for handling all three chunk types in one place. + /// Decode all chunks (initial, full, and trailer) directly into the output range. pub fn decode_into(&mut self, output: &mut [MaybeUninit]) { + debug_assert_eq!(output.len(), self.len); let mut local_idx = 0; - // Handle initial partial chunk if present if let Some(initial) = self.initial() { local_idx = initial.len(); @@ -196,16 +195,17 @@ impl> UnpackedChunks { output[..local_idx].copy_from_slice(init_initial); } - // Handle full chunks local_idx = self.decode_full_chunks_into_at(output, local_idx); - // Handle trailing partial chunk if present if let Some(trailer) = self.trailer() { // TODO(connor): use maybe_uninit_write_slice when it gets stabilized. // SAFETY: &[T] and &[MaybeUninit] have the same layout. let init_trailer: &[MaybeUninit] = unsafe { mem::transmute(trailer) }; output[local_idx..][..init_trailer.len()].copy_from_slice(init_trailer); + local_idx += init_trailer.len(); } + + debug_assert_eq!(local_idx, self.len); } /// Decode all chunks (initial, full, and trailer), mapping each unpacked value through f. @@ -261,13 +261,11 @@ impl> UnpackedChunks { } /// Unpack full chunks into output range starting at the given index. - /// Returns the next local index to write to. fn decode_full_chunks_into_at( &mut self, output: &mut [MaybeUninit], start_idx: usize, ) -> usize { - // If there is only one chunk it has been handled already by initial. if self.num_chunks == 1 { return start_idx; } diff --git a/encodings/fastlanes/src/bitpacking/vtable/mod.rs b/encodings/fastlanes/src/bitpacking/vtable/mod.rs index bca5dd541e3..912dd4ff44b 100644 --- a/encodings/fastlanes/src/bitpacking/vtable/mod.rs +++ b/encodings/fastlanes/src/bitpacking/vtable/mod.rs @@ -44,7 +44,7 @@ use crate::BitPackedArrayExt; use crate::BitPackedData; use crate::BitPackedDataParts; use crate::bitpack_decompress::unpack_array; -use crate::bitpack_decompress::unpack_map_into_builder; +use crate::bitpack_decompress::unpack_into_primitive_builder; use crate::bitpacking::array::BitPackedSlots; use crate::bitpacking::array::BitPackedSlotsView; use crate::bitpacking::array::PATCH_SLOTS; @@ -239,14 +239,13 @@ impl VTable for BitPacked { ctx: &mut ExecutionCtx, ) -> VortexResult<()> { match_each_integer_ptype!(array.dtype().as_ptype(), |T| { - unpack_map_into_builder( + unpack_into_primitive_builder::( array, builder .as_any_mut() .downcast_mut() .vortex_expect("bit packed array must canonicalize into a primitive array"), ctx, - |v: T| v, ) }) }