diff --git a/Cargo.lock b/Cargo.lock index 045c72176fd..d29c91edf62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9362,6 +9362,7 @@ dependencies = [ "itertools 0.14.0", "memmap2", "num-traits", + "rand 0.10.1", "rstest", "serde", "simdutf8", diff --git a/vortex-array/benches/cast_primitive.rs b/vortex-array/benches/cast_primitive.rs index 86895fb2ce7..63eeb350e7a 100644 --- a/vortex-array/benches/cast_primitive.rs +++ b/vortex-array/benches/cast_primitive.rs @@ -18,18 +18,16 @@ fn main() { divan::main(); } -const N: usize = 100_000; +// Sizes used for the fallible-path benches below. Kept small enough to fit in L2 so +// the kernel cost shows up clearly rather than being hidden by DRAM bandwidth. +const SIZES: &[usize] = &[65_536]; -#[divan::bench] -fn cast_u16_to_u32(bencher: Bencher) { +#[divan::bench(args = SIZES)] +fn cast_u16_to_u32(bencher: Bencher, n: usize) { let mut rng = StdRng::seed_from_u64(42); - #[expect(clippy::cast_possible_truncation)] - let arr = PrimitiveArray::from_option_iter((0..N).map(|i| { - if rng.random_bool(0.5) { - None - } else { - Some(i as u16) - } + let arr = PrimitiveArray::from_option_iter((0..n).map(|i| { + #[expect(clippy::cast_possible_truncation)] + rng.random_bool(0.5).then(|| i as u16) })) .into_array(); // Pre-compute min/max so values_fit_in is a cache hit during the benchmark. @@ -46,3 +44,38 @@ fn cast_u16_to_u32(bencher: Bencher) { .execute::(&mut LEGACY_SESSION.create_execution_ctx()) }); } + +/// Narrowing fallible cast that goes through `try_map_with_mask`. Inputs are bounded +/// so every value fits, isolating the kernel's per-lane checked-cast overhead. +#[divan::bench(args = SIZES)] +fn cast_u32_to_u8(bencher: Bencher, n: usize) { + let mut rng = StdRng::seed_from_u64(42); + let arr = PrimitiveArray::from_option_iter((0..n).map(|_| { + rng.random_bool(0.7) + .then(|| rng.random_range(0..u8::MAX) as u32) + })) + .into_array(); + bencher.with_inputs(|| arr.clone()).bench_refs(|a| { + #[expect(clippy::unwrap_used)] + a.cast(DType::Primitive(PType::U8, Nullability::Nullable)) + .unwrap() + .execute::(&mut LEGACY_SESSION.create_execution_ctx()) + }); +} + +/// Sign-change cast i32 → u32. Values are non-negative so the kernel succeeds +/// but still pays the per-lane `try_from` check. +#[divan::bench(args = SIZES)] +fn cast_i32_to_u32(bencher: Bencher, n: usize) { + let mut rng = StdRng::seed_from_u64(42); + let arr = PrimitiveArray::from_option_iter( + (0..n).map(|_| rng.random_bool(0.7).then(|| rng.random_range(0..i32::MAX))), + ) + .into_array(); + bencher.with_inputs(|| arr.clone()).bench_refs(|a| { + #[expect(clippy::unwrap_used)] + a.cast(DType::Primitive(PType::U32, Nullability::Nullable)) + .unwrap() + .execute::(&mut LEGACY_SESSION.create_execution_ctx()) + }); +} diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs index 10c0b8d6eba..82bbb1c0d23 100644 --- a/vortex-array/src/arrays/primitive/compute/cast.rs +++ b/vortex-array/src/arrays/primitive/compute/cast.rs @@ -5,6 +5,9 @@ use num_traits::AsPrimitive; use num_traits::NumCast; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; +use vortex_buffer::lane_kernels::IndexedSinkExt; +use vortex_buffer::lane_kernels::IndexedSourceExt; +use vortex_buffer::lane_kernels::ReinterpretSink; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; @@ -102,9 +105,7 @@ impl CastKernel for Primitive { } } -/// Cast values from `F` to `T`. For infallible casts this is a pure pass; for fallible casts -/// each valid value goes through a checked `NumCast::from` and the kernel bails if any of them -/// overflow `T`. Invalid positions use the wrapping `as` cast since their values are masked out. +/// Cast Primitive values from `F` to `T`. fn cast_values( array: ArrayView<'_, Primitive>, new_validity: Validity, @@ -114,53 +115,101 @@ where F: NativePType + AsPrimitive, T: NativePType, { - let values = array.as_slice::(); - - // Fast path: statically infallible, or cached min/max prove every valid value fits in `T`. - // The cached check never triggers a stats computation — if the bounds aren't already known - // we fall through to the per-lane loop below. - if values_always_fit(F::PTYPE, T::PTYPE) || values_fit_in(array, T::PTYPE, ctx, false) { - return Ok(PrimitiveArray::new(cast::(values), new_validity).into_array()); - } - - // TODO(joe): if the values source and target have the same bit-width we can - // mutate in place. - - // Fallible: invalid lanes are pre-multiplied to zero so the checked cast always succeeds for - // them; valid lanes go through `NumCast::from` and the whole cast bails on the first overflow. - let mask = array.validity()?.execute_mask(array.len(), ctx)?; let overflow = || { vortex_err!( Compute: "Cannot cast {} to {} — value exceeds target range", F::PTYPE, T::PTYPE, ) }; - let buffer: Buffer = match &mask { - Mask::AllTrue(_) => BufferMut::try_from_trusted_len_iter( + + // Returns `true` if every value of `from` is representable in `to` without loss. + fn casts_losslessly_to(from: PType, to: PType) -> bool { + from.least_supertype(to) == Some(to) + } + + // Skip the fallible kernel when type widening or (cached) min/max prove every value fits. + let target_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable); + let infallible = casts_losslessly_to(F::PTYPE, T::PTYPE) + || cached_values_fit_in(array, &target_dtype) == Some(true); + + let len = array.len(); + + // If F and T have the same byte width, try to take unique ownership of the buffer. + let same_bit_width = F::PTYPE.byte_width() == T::PTYPE.byte_width(); + let owned: Option> = if same_bit_width { + array.into_owned().try_into_buffer_mut::().ok() + } else { + None + }; + let values: &[F] = array.as_slice::(); + + if infallible { + return match owned { + Some(mut buf) => { + ReinterpretSink::::new(buf.as_mut_slice()).map_into_in_place(|v: F| v.as_()); + // SAFETY: same size + alignment for NativePType + let result: BufferMut = unsafe { buf.transmute::() }; + Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array()) + } + None => { + let mut buffer = BufferMut::::with_capacity(len); + values.map_into(&mut buffer.spare_capacity_mut()[..len], |v| v.as_()); + // SAFETY: map_into initializes every lane. + unsafe { buffer.set_len(len) }; + Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array()) + } + }; + } + + let mask = array.validity()?.execute_mask(len, ctx)?; + + let buffer: Buffer = match (&mask, owned) { + (Mask::AllTrue(_), Some(mut buf)) => { + ReinterpretSink::::new(buf.as_mut_slice()) + .try_map_in_place(|v: F| ::from(v)) + .map_err(|_| overflow())?; + // SAFETY: same size + alignment for NativePType + let result: BufferMut = unsafe { buf.transmute::() }; + result.freeze() + } + (Mask::AllTrue(_), None) => { + let mut buffer = BufferMut::::with_capacity(len); values - .iter() - .map(|&v| ::from(v).ok_or_else(overflow)), - )? - .freeze(), - Mask::AllFalse(_) => BufferMut::::zeroed(values.len()).freeze(), - Mask::Values(m) => BufferMut::try_from_trusted_len_iter( - values.iter().zip(m.bit_buffer().iter()).map(|(&v, valid)| { - let factor = if valid { F::one() } else { F::zero() }; - ::from(v * factor).ok_or_else(overflow) - }), - )? - .freeze(), + .try_map_into(&mut buffer.spare_capacity_mut()[..len], |v| { + ::from(v) + }) + .map_err(|_| overflow())?; + // SAFETY: initialized every lane. + unsafe { buffer.set_len(len) }; + buffer.freeze() + } + (Mask::AllFalse(_), _) => BufferMut::::zeroed(len).freeze(), + (Mask::Values(m), Some(mut buf)) => { + ReinterpretSink::::new(buf.as_mut_slice()) + .try_map_masked_in_place(m.bit_buffer(), |v: F| ::from(v)) + .map_err(|_| overflow())?; + // SAFETY: same size + alignment for NativePType + let result: BufferMut = unsafe { buf.transmute::() }; + result.freeze() + } + (Mask::Values(m), None) => { + let mut buffer = BufferMut::::with_capacity(len); + values + .try_map_masked_into( + m.bit_buffer(), + &mut buffer.spare_capacity_mut()[..len], + |v| ::from(v), + ) + .map_err(|_| overflow())?; + // SAFETY: initialized every lane. + unsafe { buffer.set_len(len) }; + buffer.freeze() + } }; Ok(PrimitiveArray::new(buffer, new_validity).into_array()) } -/// Out-of-range values at invalid positions are truncated/wrapped by `as`, which is fine because -/// they are masked out by validity. -fn cast, T: NativePType>(array: &[F]) -> Buffer { - BufferMut::from_trusted_len_iter(array.iter().map(|&src| src.as_())).freeze() -} - fn reinterpret( array: ArrayView<'_, Primitive>, new_ptype: PType, @@ -178,23 +227,6 @@ fn reinterpret( .into_array() } -/// Returns `true` if every value of `src` is guaranteed representable in `target` without -/// overflow. Precision may be lost (e.g. large integers cast to `f32`), but the cast can never -/// produce an out-of-range result. -fn values_always_fit(src: PType, target: PType) -> bool { - if src == target { - return true; - } - if src.is_int() && target.is_int() { - return target.byte_width() > src.byte_width() - && (src.is_unsigned_int() || target.is_signed_int()); - } - if src.is_float() && target.is_float() { - return target.byte_width() > src.byte_width(); - } - src.is_int() && matches!(target, PType::F32 | PType::F64) -} - /// Returns `true` if all valid values in `array` are representable as `target_ptype`. /// /// Cached min/max statistics are consulted first. If either bound is missing, the function either diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml index ae9d7e6cc05..31b9d1c8570 100644 --- a/vortex-buffer/Cargo.toml +++ b/vortex-buffer/Cargo.toml @@ -39,6 +39,7 @@ workspace = true [dev-dependencies] divan = { workspace = true } num-traits = { workspace = true } +rand = { workspace = true } rstest = { workspace = true } [[bench]] @@ -48,3 +49,7 @@ harness = false [[bench]] name = "vortex_bitbuffer" harness = false + +[[bench]] +name = "lane_kernels" +harness = false diff --git a/vortex-buffer/benches/lane_kernels.rs b/vortex-buffer/benches/lane_kernels.rs new file mode 100644 index 00000000000..60ab967e1ed --- /dev/null +++ b/vortex-buffer/benches/lane_kernels.rs @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Coverage benchmark for the lane-kernel variants used by primitive casts, +//! bit-packing paths, and `LaneZip` binary kernels. +//! +//! `add_checked` parity assertions (run at startup) verify that the bit-packed +//! fail-tracking scheme: +//! - propagates valid-lane overflow as `Err`, and +//! - suppresses null-lane overflow without the closure ever inspecting `valid`. + +#![expect(clippy::unwrap_used)] + +use std::mem::MaybeUninit; + +use divan::Bencher; +use num_traits::AsPrimitive; +use num_traits::NumCast; +use rand::SeedableRng; +use rand::prelude::*; +use rand::rngs::StdRng; +use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; +use vortex_buffer::Buffer; +use vortex_buffer::lane_kernels::IndexedSinkExt; +use vortex_buffer::lane_kernels::IndexedSourceExt; +use vortex_buffer::lane_kernels::LaneZip; +use vortex_buffer::lane_kernels::ReinterpretSink; + +fn main() { + assert_overflow_parity(); + assert_null_overflow_suppressed(); + divan::main(); +} + +const SIZES: &[usize] = &[65_536]; + +// ----------------------------------------------------------------------------- +// Cast fixture (u64/u16/i32 lanes + a single validity mask). +// ----------------------------------------------------------------------------- + +struct CastFixture { + values_u64: Buffer, + values_u16: Buffer, + /// Positive `i32` values (always representable as `u32`). Used by the + /// in-place-vs-out-of-place cast bench. + values_i32: Buffer, + mask: BitBuffer, +} + +fn cast_fixture(n: usize) -> CastFixture { + let mut rng = StdRng::seed_from_u64(0xC457_1D3E); + + let raw_values: Vec = (0..n) + .map(|_| rng.random_range(0..(u32::MAX as u64))) + .collect(); + let raw_valid: Vec = (0..n).map(|_| rng.random_bool(0.8)).collect(); + + #[expect(clippy::cast_possible_truncation)] + let values_u16 = raw_values + .iter() + .copied() + .map(|v| v as u16) + .collect::>(); + + // Positive i32 values (top bit cleared) — every value fits in u32. + #[expect(clippy::cast_possible_truncation)] + let values_i32 = raw_values + .iter() + .copied() + .map(|v| (v as i32) & i32::MAX) + .collect::>(); + + CastFixture { + values_u64: raw_values.into(), + values_u16, + values_i32, + mask: BitBufferMut::from_iter(raw_valid).freeze(), + } +} + +fn uninit_out(n: usize) -> Vec> { + let mut out = Vec::with_capacity(n); + // SAFETY: A `MaybeUninit` does not require initialization. + unsafe { + out.set_len(n); + } + out +} + +// ----------------------------------------------------------------------------- +// Cast benches (single-input, source -> output). +// ----------------------------------------------------------------------------- + +#[divan::bench(args = SIZES)] +fn try_map_into_narrow_u64_u32(bencher: Bencher, n: usize) { + let f = cast_fixture(n); + + bencher + .with_inputs(|| (f.values_u64.clone(), uninit_out::(n))) + .bench_values(|(values, mut out)| { + values + .as_slice() + .try_map_into(out.as_mut_slice(), ::from) + .unwrap(); + out + }); +} + +#[divan::bench(args = SIZES)] +fn map_with_mask_narrow_u64_u32(bencher: Bencher, n: usize) { + let f = cast_fixture(n); + + bencher + .with_inputs(|| (f.values_u64.clone(), uninit_out::(n))) + .bench_values(|(values, mut out)| { + values.as_slice().map_into(&mut out, |v| v.as_()); + out + }); +} + +/// `try_map_masked_into_widen_u16_u32` and `map_with_mask_widen_u16_u32` have the +/// same runtime — for always-true map operations `try_map_masked_into` is +/// sufficient. +#[divan::bench(args = SIZES)] +fn try_map_masked_into_widen_u16_u32(bencher: Bencher, n: usize) { + let f = cast_fixture(n); + + bencher + .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) + .bench_values(|(values, mask, mut out)| { + values + .as_slice() + .try_map_masked_into(&mask, out.as_mut_slice(), ::from) + .unwrap(); + out + }); +} + +#[divan::bench(args = SIZES)] +fn map_with_mask_widen_u16_u32(bencher: Bencher, n: usize) { + let f = cast_fixture(n); + + bencher + .with_inputs(|| (f.values_u16.clone(), uninit_out::(n))) + .bench_values(|(values, mut out)| { + values.as_slice().map_into(out.as_mut_slice(), |v| v.as_()); + out + }); +} + +// ----------------------------------------------------------------------------- +// In-place vs out-of-place fallible cast i32 → u32 (same byte width). +// +// `try_map_masked_in_place` mutates the input via `ReinterpretSink` and +// transmutes the wrapper — no output allocation. `try_map_masked_into` allocates +// a fresh `BufferMut` and writes through it. Input values are all positive +// `i32` so every lane succeeds; the two kernels do the same arithmetic, so any +// delta is allocation + memory-traffic overhead. +// ----------------------------------------------------------------------------- + +#[divan::bench(args = SIZES)] +fn try_map_masked_into_narrow_i32_u32(bencher: Bencher, n: usize) { + let f = cast_fixture(n); + + bencher + .with_inputs(|| (f.values_i32.clone(), f.mask.clone(), uninit_out::(n))) + .bench_values(|(values, mask, mut out)| { + values + .as_slice() + .try_map_masked_into(&mask, out.as_mut_slice(), ::from) + .unwrap(); + out + }); +} + +#[divan::bench(args = SIZES)] +fn try_map_masked_in_place_narrow_i32_u32(bencher: Bencher, n: usize) { + let f = cast_fixture(n); + + bencher + .with_inputs(|| (f.values_i32.as_slice().to_vec(), f.mask.clone())) + .bench_values(|(mut values, mask)| { + ReinterpretSink::::new(values.as_mut_slice()) + .try_map_masked_in_place(&mask, ::from) + .unwrap(); + values + }); +} + +// ----------------------------------------------------------------------------- +// LaneZip binary kernel: checked `u32 + u32 -> u32` over two nullable columns. +// +// Per-lane `is_none()` flags are bit-packed and AND-ed with the chunk validity +// word, so null-lane overflow is filtered without the closure inspecting `valid`. +// Verified at startup via parity assertions (`assert_overflow_parity` and +// `assert_null_overflow_suppressed`). +// ----------------------------------------------------------------------------- + +const ADD_LHS_VALID_RATE: f64 = 0.7; +const ADD_RHS_VALID_RATE: f64 = 0.8; + +struct AddFixture { + /// Valid lanes carry bounded values; null lanes hold `u32::MAX` so a kernel + /// that ignores validity would `Err` on them. The implementation under test + /// must suppress that. + lhs: Buffer, + rhs: Buffer, + lhs_mask: BitBuffer, + rhs_mask: BitBuffer, +} + +fn add_fixture(n: usize) -> AddFixture { + let mut lhs_rng = StdRng::seed_from_u64(0); + let mut rhs_rng = StdRng::seed_from_u64(1); + let mut lvr = StdRng::seed_from_u64(2); + let mut rvr = StdRng::seed_from_u64(3); + + let lhs_valid: Vec = (0..n) + .map(|_| lvr.random_bool(ADD_LHS_VALID_RATE)) + .collect(); + let rhs_valid: Vec = (0..n) + .map(|_| rvr.random_bool(ADD_RHS_VALID_RATE)) + .collect(); + + let lhs: Buffer = (0..n) + .map(|i| { + if lhs_valid[i] { + lhs_rng.random_range(0..u16::MAX as u32) + } else { + u32::MAX + } + }) + .collect(); + let rhs: Buffer = (0..n) + .map(|i| { + if rhs_valid[i] { + rhs_rng.random_range(0..u16::MAX as u32) + } else { + u32::MAX + } + }) + .collect(); + + let lhs_mask = BitBufferMut::from_iter(lhs_valid).freeze(); + let rhs_mask = BitBufferMut::from_iter(rhs_valid).freeze(); + + AddFixture { + lhs, + rhs, + lhs_mask, + rhs_mask, + } +} + +#[divan::bench(args = SIZES)] +fn lanezip_checked_add_u32(bencher: Bencher, n: usize) { + let f = add_fixture(n); + bencher + .with_inputs(|| { + ( + f.lhs.clone(), + f.rhs.clone(), + f.lhs_mask.clone(), + f.rhs_mask.clone(), + ) + }) + .bench_refs(|(lhs, rhs, lm, rm)| { + let combined = lm as &BitBuffer & rm as &BitBuffer; + let mut out = uninit_out::(n); + LaneZip::new(lhs.as_slice(), rhs.as_slice()) + .try_map_masked_into(&combined, out.as_mut_slice(), |(a, b)| a.checked_add(b)) + .unwrap(); + (combined, out) + }); +} + +// ----------------------------------------------------------------------------- +// Parity assertions — must pass before divan runs benches. +// ----------------------------------------------------------------------------- + +/// Overflow at a valid lane must propagate as `Err`. +fn assert_overflow_parity() { + let lhs: Vec = vec![1, 2, u32::MAX, 4]; + let rhs: Vec = vec![10, 20, 1, 40]; + let valid = vec![true; 4]; + + let mask = BitBufferMut::from_iter(valid).freeze(); + let mut out: Vec> = (0..4).map(|_| MaybeUninit::uninit()).collect(); + let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into( + &mask, + out.as_mut_slice(), + |(a, b)| a.checked_add(b), + ); + assert!(r.is_err(), "bitpack should Err on overflow"); +} + +/// Overflow at a null lane must NOT propagate. +fn assert_null_overflow_suppressed() { + // Lane 2 is null and holds an overflowing value; valid lanes are safe. + let lhs: Vec = vec![1, 2, u32::MAX, 4]; + let rhs: Vec = vec![10, 20, 1, 40]; + let valid = vec![true, true, false, true]; + + let mask = BitBufferMut::from_iter(valid).freeze(); + let mut out = uninit_out::(4); + let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into( + &mask, + out.as_mut_slice(), + |(a, b)| a.checked_add(b), + ); + assert!(r.is_ok(), "bitpack: null-lane overflow leaked"); +} diff --git a/vortex-buffer/src/lane_kernels.rs b/vortex-buffer/src/lane_kernels.rs new file mode 100644 index 00000000000..d1e0d9b5f2e --- /dev/null +++ b/vortex-buffer/src/lane_kernels.rs @@ -0,0 +1,1076 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Elementwise lane kernels over indexed sources. +//! +//! Replaces `&[T]` with an [`IndexedSource`] trait: each lane read is +//! `unsafe fn get_unchecked(i) -> Item`, independent across iterations. For `&[T]` +//! this inlines to the same indexed load as the slice kernel; for `LaneZip(&[A], &[B])` +//! it gives two independent indexed reads per lane — both shapes the auto-vectorizer +//! handles. +//! +//! See `vortex-buffer/HISTORY.md` for the iterator-API investigation that motivated +//! this design. +//! +//! The output is always a caller-provided `&mut` slice — these kernels never allocate. +//! Both kernels handle a mask with a non-byte-aligned offset and with a logical `len` +//! shorter than the underlying byte buffer, via [`BitBuffer::chunks`]. + +use std::marker::PhantomData; +use std::mem::MaybeUninit; +use std::mem::align_of; +use std::mem::size_of; + +use crate::BitBuffer; + +/// A length-known source supporting unchecked indexed reads. +/// +/// Implemented for `&[T]` (with `T: Copy`) and for [`LaneZip`] over two `IndexedSource`s. +/// The kernels in this module require this trait instead of `Iterator` so that lane +/// reads carry no inter-iteration data dependency — the autovectorizer treats each +/// lane independently. +pub trait IndexedSource { + /// The per-lane item type. Must be `Copy` so the kernels can pass it through + /// the closure by value without extra moves. + type Item: Copy; + /// Logical lane count. + fn len(&self) -> usize; + /// Returns true when there are no lanes. + fn is_empty(&self) -> bool { + self.len() == 0 + } + /// Read the lane at `i` without bounds checking. + /// + /// # Safety + /// + /// `i` must be strictly less than `self.len()`. + unsafe fn get_unchecked(&self, i: usize) -> Self::Item; +} + +impl IndexedSource for &[T] { + type Item = T; + #[inline] + fn len(&self) -> usize { + <[T]>::len(self) + } + #[inline] + unsafe fn get_unchecked(&self, i: usize) -> T { + // SAFETY: caller guarantees i < self.len(). + unsafe { *<[T]>::get_unchecked(self, i) } + } +} + +impl IndexedSource for &mut [T] { + type Item = T; + #[inline] + fn len(&self) -> usize { + <[T]>::len(self) + } + #[inline] + unsafe fn get_unchecked(&self, i: usize) -> T { + // SAFETY: caller guarantees i < self.len(). + unsafe { *<[T]>::get_unchecked(self, i) } + } +} + +/// An [`IndexedSource`] that also supports unchecked indexed writes — the binding +/// for in-place kernels. +/// +/// `Write` is the type written by `set_unchecked` and may differ from +/// `IndexedSource::Item` (the read type). For the canonical `&mut [T]` impl +/// both are `T`. The decoupling is what makes [`ReinterpretSink`] possible — +/// a wrapper that reads `F` and writes `T` over the same backing memory when +/// the two have identical size and alignment. +/// +/// Implemented for `&mut [T]`; not implemented for [`LaneZip`] (you can't write a +/// `(A, B)` pair back to two separate sources via a single index). +pub trait IndexedSink: IndexedSource { + /// The per-lane write type. Equal to `::Item` for + /// `&mut [T]`; different for [`ReinterpretSink`]. + type Write: Copy; + + /// Write `value` into lane `i` without bounds checking. + /// + /// # Safety + /// + /// `i` must be strictly less than `self.len()`. + unsafe fn set_unchecked(&mut self, i: usize, value: Self::Write); +} + +impl IndexedSink for &mut [T] { + type Write = T; + #[inline] + unsafe fn set_unchecked(&mut self, i: usize, value: T) { + // SAFETY: caller guarantees i < self.len(). + unsafe { *<[T]>::get_unchecked_mut(self, i) = value }; + } +} + +/// A sink that reads `F`-values and writes `T`-values over the same backing +/// slice of `F`, reinterpreting each `T` as `F`-bits on write. +/// +/// Requires `size_of::() == size_of::()` and `align_of::() == align_of::()`. +/// Both hold for any pair of `NativePType` primitives with equal byte width +/// (e.g. `u32` ↔ `f32`, `u64` ↔ `i64`, `f64` ↔ `u64`). +/// +/// Use this when an in-place kernel needs to convert lanes between two +/// types of identical width without allocating a second buffer. After the +/// kernel completes every slot holds a valid `T`-bit pattern; the caller +/// can recover a typed view via `BufferMut::transmute::()`. +pub struct ReinterpretSink<'a, F, T> { + slice: &'a mut [F], + _phantom: PhantomData, +} + +impl<'a, F, T> ReinterpretSink<'a, F, T> { + /// Construct a `ReinterpretSink` from `&mut [F]`. + /// + /// # Panics + /// + /// Panics if `size_of::() != size_of::()` or + /// `align_of::() != align_of::()`. + pub fn new(slice: &'a mut [F]) -> Self { + assert_eq!( + size_of::(), + size_of::(), + "ReinterpretSink requires F and T to have the same size", + ); + assert_eq!( + align_of::(), + align_of::(), + "ReinterpretSink requires F and T to have the same alignment", + ); + Self { + slice, + _phantom: PhantomData, + } + } +} + +impl IndexedSource for ReinterpretSink<'_, F, T> { + type Item = F; + #[inline] + fn len(&self) -> usize { + self.slice.len() + } + #[inline] + unsafe fn get_unchecked(&self, i: usize) -> F { + // SAFETY: caller guarantees i < self.slice.len(). Pointer arithmetic + // avoids method-resolution ambiguity between `<[F]>::get_unchecked` and + // `IndexedSource::get_unchecked`. + unsafe { *self.slice.as_ptr().add(i) } + } +} + +impl IndexedSink for ReinterpretSink<'_, F, T> { + type Write = T; + #[inline] + unsafe fn set_unchecked(&mut self, i: usize, value: T) { + // SAFETY: caller guarantees i < self.slice.len(); `new` enforces + // size_of::() == size_of::() and align_of::() == align_of::(), + // so the F-slot can hold a `T` without overflow or misalignment. + unsafe { + let ptr = self.slice.as_mut_ptr().add(i) as *mut T; + ptr.write(value); + } + } +} + +/// Pair of two [`IndexedSource`]s of equal length. Yields `(A::Item, B::Item)` per lane. +/// +/// Use this to drive a binary kernel from two columns. Length equality is enforced +/// at construction. +pub struct LaneZip(pub A, pub B); + +impl LaneZip { + /// Build a `LaneZip` from two equal-length sources. + /// + /// # Panics + /// + /// Panics if the two operands have different lengths. + pub fn new(a: A, b: B) -> Self { + assert_eq!( + a.len(), + b.len(), + "LaneZip operands must have the same length" + ); + Self(a, b) + } +} + +impl IndexedSource for LaneZip { + type Item = (A::Item, B::Item); + #[inline] + fn len(&self) -> usize { + debug_assert_eq!(self.0.len(), self.1.len()); + self.0.len() + } + #[inline] + unsafe fn get_unchecked(&self, i: usize) -> (A::Item, B::Item) { + // SAFETY: caller guarantees i < self.len(); `new` enforces matching lengths. + unsafe { (self.0.get_unchecked(i), self.1.get_unchecked(i)) } + } +} + +/// Extension trait providing lane-kernel methods on any [`IndexedSource`]. +/// +/// All methods have default implementations and are inherited via the blanket +/// `impl IndexedSourceExt for S` below. Bring the trait into +/// scope (`use vortex_buffer::lane_ops_indexed::IndexedSourceExt;`) to call +/// them with method syntax: `values.try_map_masked_into(&mask, &mut out, f)`. +pub trait IndexedSourceExt: IndexedSource + Sized { + /// Fallible variant of [`map_with_mask`]. `f` returns `Option`; `None` + /// indicates a per-lane failure (e.g. range overflow on a narrowing cast). + /// + /// **Null-lane failures are filtered automatically.** The closure is called on + /// every lane regardless of validity; if a null lane's stored value causes `f(v)` + /// to return `None`, the kernel does *not* propagate that as `Err`. The per-lane + /// `is_none()` flags are bit-packed into a `u64` at the lane's position, then + /// AND-combined with the chunk's validity bitmap — null-lane bits vanish. + /// + /// The closure shape is the same as [`try_map_into`] (`FnMut(Item) -> Option`); + /// the mask parameter is what makes this kernel mask-aware. Callers that need to + /// distinguish null lanes inside the closure (e.g. to short-circuit an expensive + /// computation) should construct their own per-lane validity check externally; for + /// the common case, the kernel's automatic filter is sufficient. + /// + /// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` + /// write `R::default()` into `out`, but the contents of `out` must not be relied + /// upon when this function returns `Err`. + /// + /// [`map_with_mask`]: IndexedSourceExt::map_with_mask + /// [`try_map_into`]: IndexedSourceExt::try_map_into + /// + /// # Panics + /// + /// Panics if `self.len() != mask.len()` or `out.len() != self.len()`. + #[inline] + fn try_map_masked_into( + self, + mask: &BitBuffer, + out: &mut [MaybeUninit], + mut f: F, + ) -> Result<(), usize> + where + R: Copy + Default, + F: FnMut(Self::Item) -> Option, + { + #[inline(always)] + fn chunk( + values: &S, + out: &mut [MaybeUninit], + f: &mut F, + src_chunk: u64, + base: usize, + count: usize, + ) -> Option + where + S: IndexedSource, + R: Copy + Default, + F: FnMut(S::Item) -> Option, + { + let mut fail_bits: u64 = 0; + for bit_idx in 0..count { + let idx = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. + let val = unsafe { values.get_unchecked(idx) }; + let opt = f(val); + fail_bits |= (opt.is_none() as u64) << bit_idx; + let result = opt.unwrap_or_default(); + unsafe { out.get_unchecked_mut(idx).write(result) }; + } + let valid_failures = fail_bits & src_chunk; + (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize) + } + + let values = self; + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + assert_eq!(out.len(), len, "out must have the same length as values"); + + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + if let Some(idx) = chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64) { + return Err(idx); + } + } + if remainder != 0 + && let Some(idx) = chunk( + &values, + out, + &mut f, + chunks.remainder_bits(), + chunks_count * 64, + remainder, + ) + { + return Err(idx); + } + Ok(()) + } + + /// Apply `f(value)` lane-by-lane with **no validity awareness at all** — every + /// closure invocation is treated as "happened", regardless of whether the lane + /// is null. Use this only when the input is known non-nullable. + /// + /// # Panics + /// + /// Panics if `out.len() != self.len()`. + #[inline] + fn map_into(self, out: &mut [MaybeUninit], mut f: F) + where + F: FnMut(Self::Item) -> R, + { + #[inline(always)] + fn chunk( + values: &S, + out: &mut [MaybeUninit], + f: &mut F, + base: usize, + count: usize, + ) where + S: IndexedSource, + F: FnMut(S::Item) -> R, + { + for bit_idx in 0..count { + let idx = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. + let val = unsafe { values.get_unchecked(idx) }; + unsafe { out.get_unchecked_mut(idx).write(f(val)) }; + } + } + + let values = self; + let len = values.len(); + assert_eq!(out.len(), len, "out must have the same length as values"); + + let chunks_count = len / 64; + let remainder = len % 64; + + for chunk_idx in 0..chunks_count { + chunk(&values, out, &mut f, chunk_idx * 64, 64); + } + if remainder != 0 { + chunk(&values, out, &mut f, chunks_count * 64, remainder); + } + } + + /// Fallible map with **no validity awareness at all** — every `None` returned + /// by the closure is treated as a failure, even at null lanes. + /// + /// # Use this only for non-nullable inputs. + /// + /// For nullable inputs with a fallible closure, use [`try_map_masked_into`] — + /// it has the same value-only closure shape (and the same perf win) but + /// **correctly suppresses null-lane failures** via per-chunk + /// `fail_bits & mask_chunk`. + /// + /// Using this kernel on a nullable input where a null lane's stored value + /// would cause `f` to return `None` will produce a spurious `Err`. This is a + /// correctness footgun on purpose — the name and this doc are how the API + /// signals "you must know your input has no nulls." + /// + /// On failure returns `Err(failing_lane_index)`. + /// + /// [`try_map_masked_into`]: IndexedSourceExt::try_map_masked_into + /// + /// # Panics + /// + /// Panics if `out.len() != self.len()`. + #[inline] + fn try_map_into(self, out: &mut [MaybeUninit], mut f: F) -> Result<(), usize> + where + R: Copy + Default, + F: FnMut(Self::Item) -> Option, + { + /// Returns `true` if any lane in `[base, base+count)` failed (OR-reduced); + /// the cold attribution path is called at the kernel level so it can be + /// inlined separately for full vs remainder. + #[inline(always)] + fn chunk( + values: &S, + out: &mut [MaybeUninit], + f: &mut F, + base: usize, + count: usize, + ) -> bool + where + S: IndexedSource, + R: Copy + Default, + F: FnMut(S::Item) -> Option, + { + let mut fail_acc: u64 = 0; + for bit_idx in 0..count { + let idx = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. + let val = unsafe { values.get_unchecked(idx) }; + let opt = f(val); + fail_acc |= opt.is_none() as u64; + let result = opt.unwrap_or_default(); + unsafe { out.get_unchecked_mut(idx).write(result) }; + } + fail_acc != 0 + } + + let values = self; + let len = values.len(); + assert_eq!(out.len(), len, "out must have the same length as values"); + + let chunks_count = len / 64; + let remainder = len % 64; + + for chunk_idx in 0..chunks_count { + let base = chunk_idx * 64; + if chunk(&values, out, &mut f, base, 64) { + return Err(attribute_failure_no_mask(&values, base, 64, &mut f)); + } + } + if remainder != 0 { + let base = chunks_count * 64; + if chunk(&values, out, &mut f, base, remainder) { + return Err(attribute_failure_no_mask(&values, base, remainder, &mut f)); + } + } + Ok(()) + } +} + +impl IndexedSourceExt for S {} + +/// Shared cold scan: walks a chunk, returns the first lane index where +/// `lane_fails(bit_idx, value)` returns `true`. Used by +/// [`attribute_failure_no_mask`]. +/// +/// Caller guarantees `base + chunk_len <= values.len()`. +#[cold] +#[inline(never)] +fn cold_scan( + values: &S, + base: usize, + chunk_len: usize, + mut lane_fails: impl FnMut(usize /* bit_idx */, S::Item) -> bool, +) -> usize +where + S: IndexedSource, +{ + for bit_idx in 0..chunk_len { + let idx = base + bit_idx; + // SAFETY: caller guarantees idx < values.len(). + let val = unsafe { values.get_unchecked(idx) }; + if lane_fails(bit_idx, val) { + return idx; + } + } + unreachable!("cold_scan called without a failing lane") +} + +/// Cold attribution for the no-mask variant. Replays `f` over the chunk to find +/// the first lane that returns `None`. +#[inline] +fn attribute_failure_no_mask(values: &S, base: usize, chunk_len: usize, f: &mut F) -> usize +where + S: IndexedSource, + F: FnMut(S::Item) -> Option, +{ + cold_scan(values, base, chunk_len, |_bit_idx, val| f(val).is_none()) +} + +/// Extension trait providing in-place lane-kernel methods on any [`IndexedSink`]. +/// +/// All methods have default implementations and are inherited via the blanket +/// `impl IndexedSinkExt for S` below. Bring the trait into scope +/// (`use vortex_buffer::lane_ops_indexed::IndexedSinkExt;`) to call them with +/// method syntax. +pub trait IndexedSinkExt: IndexedSink + Sized { + /// In-place counterpart of [`IndexedSourceExt::map_into`]. Each lane + /// is replaced with `f(self[i])`. + /// + /// The closure reads `Self::Item` and returns `Self::Write`. For the common + /// case `Self = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and + /// write types can differ (e.g. read `f32`, write `u32`) over the same + /// backing memory when sizes and alignments match. + /// + /// As with [`IndexedSourceExt::map_into`], use this only when the + /// input is known non-nullable. + #[inline] + fn map_into_in_place(self, mut f: F) + where + F: FnMut(Self::Item) -> Self::Write, + { + #[inline(always)] + fn chunk(values: &mut S, f: &mut F, base: usize, count: usize) + where + S: IndexedSink, + F: FnMut(S::Item) -> S::Write, + { + for bit_idx in 0..count { + let idx = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. + let val = unsafe { values.get_unchecked(idx) }; + let result = f(val); + // SAFETY: caller guarantees base + count <= len. + unsafe { values.set_unchecked(idx, result) }; + } + } + + let mut values = self; + let len = values.len(); + let chunks_count = len / 64; + let remainder = len % 64; + + for chunk_idx in 0..chunks_count { + chunk(&mut values, &mut f, chunk_idx * 64, 64); + } + if remainder != 0 { + chunk(&mut values, &mut f, chunks_count * 64, remainder); + } + } + + /// In-place counterpart of [`IndexedSourceExt::try_map_into`]. Each + /// lane is replaced with `f(self[i])`, or `Self::Write::default()` when `f` + /// returns `None`. On failure returns `Err(first_failing_lane)`; the buffer + /// state on `Err` is unspecified. + /// + /// ## Error attribution + /// + /// Per-lane `is_none()` flags are bit-packed into a `u64` at the lane's + /// position — `fail_bits |= (opt.is_none() as u64) << bit_idx`. After the + /// 64-lane loop, `trailing_zeros()` of `fail_bits` recovers the first + /// failing lane index. `OR + shift` per lane is friendlier to the + /// autovectorizer than `min`/`csel` — see [`try_map_masked_in_place`] for + /// the same scheme over a masked variant. + /// + /// [`try_map_masked_in_place`]: IndexedSinkExt::try_map_masked_in_place + #[inline] + fn try_map_in_place(self, mut f: F) -> Result<(), usize> + where + Self::Write: Default, + F: FnMut(Self::Item) -> Option, + { + #[inline(always)] + fn chunk(values: &mut S, base: usize, count: usize, f: &mut F) -> Option + where + S: IndexedSink, + S::Write: Default, + F: FnMut(S::Item) -> Option, + { + let mut fail_bits: u64 = 0; + for bit_idx in 0..count { + let idx = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. + let val = unsafe { values.get_unchecked(idx) }; + let opt = f(val); + fail_bits |= (opt.is_none() as u64) << bit_idx; + let result = opt.unwrap_or_default(); + // SAFETY: caller guarantees base + count <= len. + unsafe { values.set_unchecked(idx, result) }; + } + (fail_bits != 0).then_some(base + fail_bits.trailing_zeros() as usize) + } + + let mut values = self; + let len = values.len(); + let chunks_count = len / 64; + let remainder = len % 64; + + for chunk_idx in 0..chunks_count { + if let Some(failing) = chunk(&mut values, chunk_idx * 64, 64, &mut f) { + return Err(failing); + } + } + if remainder != 0 + && let Some(failing) = chunk(&mut values, chunks_count * 64, remainder, &mut f) + { + return Err(failing); + } + Ok(()) + } + + /// In-place counterpart of [`IndexedSourceExt::try_map_masked_into`]. Each + /// lane of `self` is replaced with `f(self[i])`, or `Self::Write::default()` + /// if `f` returned `None`. On failure returns `Err(first_failing_lane)`; + /// lanes before that point have been written, and lanes within the failing + /// chunk hold their unwrapped-or-default result. The buffer state on `Err` + /// is intentionally unspecified. + /// + /// **Null-lane failures are filtered automatically** — same semantics as + /// [`try_map_masked_into`]. The closure has no `valid` parameter; the kernel + /// AND-combines `is_none()` with the chunk's validity bitmap before folding + /// it into the attribution accumulator. + /// + /// ## Error attribution + /// + /// Per-lane `(is_none && valid)` flags are folded into `first_fail` via a + /// branchless `min` of `(if is_none && valid { i as u32 } else { u32::MAX })`. + /// After the 64-lane loop, `first_fail` holds the smallest valid failing index + /// in the chunk (or `MAX` if none). Vectorizes to NEON `bsl.16b` + `umin.4s` + /// on AArch64. The cold replay scheme used by [`try_map_masked_into`] isn't + /// viable here because the original input values have already been + /// overwritten by the time we would attribute the failure. + /// + /// ## Why in-place is slower at cache-resident sizes + /// + /// At sizes that fit in L1/L2 the in-place kernel is ~1.5× slower than the + /// out-of-place kernel despite having half the memory traffic, because + /// input and output share memory and the compiler must be conservative + /// reordering loads/stores across iterations. At sizes that exceed L2 the + /// in-place kernel wins back the gap by avoiding the second buffer's DRAM + /// read+write traffic. + /// + /// [`try_map_masked_into`]: IndexedSourceExt::try_map_masked_into + /// + /// # Panics + /// + /// Panics if `self.len() != mask.len()`. + #[inline] + fn try_map_masked_in_place(self, mask: &BitBuffer, mut f: F) -> Result<(), usize> + where + Self::Write: Default, + F: FnMut(Self::Item) -> Option, + { + /// Bit-pack `is_none()` flags per lane, then AND with `src_chunk` post-loop to + /// drop null-lane failures — identical scheme to [`try_map_masked_into`]. The + /// per-lane attribution work is `OR + shift` (no `min`/`csel`), giving LLVM more + /// freedom to vectorize the value pipeline. + #[inline(always)] + fn chunk( + values: &mut S, + src_chunk: u64, + base: usize, + count: usize, + f: &mut F, + ) -> Option + where + S: IndexedSink, + S::Write: Default, + F: FnMut(S::Item) -> Option, + { + let mut fail_bits: u64 = 0; + for bit_idx in 0..count { + let idx = base + bit_idx; + // SAFETY: caller guarantees `base + count <= values.len()`. + let val = unsafe { values.get_unchecked(idx) }; + let opt = f(val); + fail_bits |= (opt.is_none() as u64) << bit_idx; + let result = opt.unwrap_or_default(); + unsafe { values.set_unchecked(idx, result) }; + } + let valid_failures = fail_bits & src_chunk; + (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize) + } + + let mut values = self; + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + if let Some(failing) = chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) { + return Err(failing); + } + } + if remainder != 0 + && let Some(failing) = chunk( + &mut values, + chunks.remainder_bits(), + chunks_count * 64, + remainder, + &mut f, + ) + { + return Err(failing); + } + Ok(()) + } +} + +impl IndexedSinkExt for S {} + +#[cfg(test)] +#[allow(clippy::cast_possible_truncation)] +mod tests { + use super::*; + use crate::BitBufferMut; + + fn write_t(out: Vec>) -> Vec { + // SAFETY: tests always fully initialize the buffer. + unsafe { std::mem::transmute(out) } + } + + #[test] + fn try_map_masked_into_all_ok() { + let values: Vec = (0..200).collect(); + let mask = BitBuffer::new_set(200); + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); + assert!(res.is_ok()); + let got = write_t(out); + assert_eq!(got, (0..200u32).collect::>()); + } + + #[test] + fn try_map_masked_into_overflow_fails() { + // Put an overflowing value at lane 137 — the kernel must report Err(137). + let mut values: Vec = (0..200).collect(); + values[137] = (u32::MAX as u64) + 1; + let mask = BitBuffer::new_set(200); + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); + assert_eq!(res, Err(137)); + } + + #[test] + fn try_map_masked_into_overflow_reports_first_failing_lane() { + // Multiple failing lanes — must report the lowest index. + let mut values: Vec = (0..200).collect(); + values[50] = u64::MAX; + values[51] = u64::MAX; + values[137] = u64::MAX; + let mask = BitBuffer::new_set(200); + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); + assert_eq!(res, Err(50)); + } + + #[test] + fn try_map_masked_into_value_only_closure_filters_null_overflow() { + // `|v, _|` closure that ignores validity. A null lane with an overflowing + // value MUST NOT cause Err — the kernel's cold-path mask filter rescues us. + let mut values: Vec = (0..200).collect(); + values[5] = u64::MAX; // null lane with overflowing value + values[42] = u64::MAX; // null lane with overflowing value + let mask = { + let mut m = BitBufferMut::with_capacity(200); + for i in 0..200 { + m.append(i != 5 && i != 42); + } + m.freeze() + }; + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); + assert!( + res.is_ok(), + "null-lane overflow should be filtered by the cold path" + ); + } + + #[test] + fn try_map_masked_into_value_only_closure_reports_first_valid_failure() { + // Valid lane overflow must propagate — and the reported index must be + // the lowest VALID failing lane, even if earlier null lanes also "failed" + // their unconditional cast. + let mut values: Vec = (0..200).collect(); + values[5] = u64::MAX; // null lane — filtered out + values[42] = u64::MAX; // null lane — filtered out + values[77] = u64::MAX; // VALID lane — should be reported + values[100] = u64::MAX; // VALID lane — higher index, ignored + let mask = { + let mut m = BitBufferMut::with_capacity(200); + for i in 0..200 { + m.append(i != 5 && i != 42); + } + m.freeze() + }; + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); + assert_eq!(res, Err(77)); + } + + #[test] + fn try_map_masked_into_null_lane_bypasses_check() { + // Null lanes are neutralized by `valid as u64` before the range check, so an + // out-of-range value at a null lane must NOT trigger failure. + let mut values: Vec = (0..200).collect(); + values[5] = u64::MAX; + let mask = { + let mut m = BitBufferMut::with_capacity(200); + for i in 0..200 { + m.append(i != 5); + } + m.freeze() + }; + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); + assert!(res.is_ok()); + let got = write_t(out); + assert_eq!(got[5], 0); // null-lane wrote default + assert_eq!(got[6], 6); + } + + #[test] + fn try_map_masked_into_branchful_matches_branchless() { + let mut values: Vec = (0..130).map(|i| i as u64 * 7).collect(); + values[2] = u64::MAX; + values[65] = u32::MAX as u64; + let mask = { + let mut m = BitBufferMut::with_capacity(130); + for i in 0..130 { + m.append(!matches!(i, 2 | 17 | 99)); + } + m.freeze() + }; + + let mut branchless = vec![MaybeUninit::::uninit(); 130]; + let mut branchful = vec![MaybeUninit::::uninit(); 130]; + values + .as_slice() + .try_map_masked_into(&mask, &mut branchless, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }) + .unwrap(); + values + .as_slice() + .try_map_masked_into(&mask, &mut branchful, |v| u32::try_from(v).ok()) + .unwrap(); + + assert_eq!(write_t(branchful), write_t(branchless)); + } + + #[test] + fn try_map_masked_into_partial_chunk() { + let values: Vec = (0..130).collect(); + let mask = BitBuffer::new_set(130); + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); + assert!(res.is_ok()); + let got = write_t(out); + assert_eq!(got.len(), 130); + assert_eq!(got[129], 129); + } + + #[test] + fn try_map_masked_into_sliced_mask_unaligned_offset() { + // The mask's first byte is not word-aligned: slice off 13 bits, so the + // underlying BitChunks iterator must shift across byte boundaries on every + // 64-bit chunk it yields. + let big = BitBuffer::new_set(256); + let mask = big.slice(13..143); // logical len = 130, bit offset = 13 % 8 = 5 + assert_eq!(mask.len(), 130); + + let values: Vec = (0..130).collect(); + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); + assert!(res.is_ok()); + let got = write_t(out); + assert_eq!(got, (0..130u32).collect::>()); + } + + #[test] + fn try_map_masked_into_sliced_mask_with_overflow() { + // Sliced mask + overflowing value — the cold attribution path must report + // the correct lane index in the sliced (post-offset) coordinate space. + let big = BitBuffer::new_set(256); + let mask = big.slice(13..143); + assert_eq!(mask.len(), 130); + + let mut values: Vec = (0..130).collect(); + values[77] = u64::MAX; + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); + assert_eq!(res, Err(77)); + } + + #[test] + fn try_map_masked_into_sliced_mask_null_lanes() { + // Mix sliced offset with a non-trivial validity pattern. Null lanes must + // not contribute to fail_acc, even when their underlying value would overflow. + let mut m = BitBufferMut::with_capacity(256); + for i in 0..256 { + m.append(i % 3 != 0); + } + let big = m.freeze(); + let mask = big.slice(13..143); + assert_eq!(mask.len(), 130); + + // After the 13-lane slice, original index `13 + j` becomes lane `j`. + // Lane `j` is valid iff `(13 + j) % 3 != 0`. + let mut values: Vec = (0..130).collect(); + // Pick a lane that is INVALID in the sliced coords: 13+2 = 15, 15 % 3 == 0 → invalid. + // Stuff in an overflowing value; it must be neutralized by `* valid as u64`. + values[2] = u64::MAX; + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); + assert!(res.is_ok(), "null lane should bypass the range check"); + } + + #[test] + fn try_map_masked_into_overflow_in_remainder() { + // Overflow in the trailing partial chunk (not aligned to 64). + let mut values: Vec = (0..130).collect(); + values[129] = (u32::MAX as u64) + 1; + let mask = BitBuffer::new_set(130); + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); + assert_eq!(res, Err(129)); + } + + #[test] + fn try_map_masked_in_place_all_ok() { + let mut values: Vec = (0..200).collect(); + let mask = BitBuffer::new_set(200); + let res = values + .as_mut_slice() + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); + assert!(res.is_ok()); + let expected: Vec = (0..200u32).map(|v| v * 2).collect(); + assert_eq!(values, expected); + } + + #[test] + fn try_map_masked_in_place_first_failing_chunk_wins() { + let mut values: Vec = (0..200).collect(); + values[83] = u32::MAX; + values[150] = u32::MAX; + let mask = BitBuffer::new_set(200); + let res = values + .as_mut_slice() + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); + assert_eq!(res, Err(83)); + } + + #[test] + fn try_map_masked_in_place_within_chunk_reports_lowest() { + let mut values: Vec = (0..200).collect(); + values[80] = u32::MAX; + values[100] = u32::MAX; + let mask = BitBuffer::new_set(200); + let res = values + .as_mut_slice() + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); + assert_eq!(res, Err(80)); + } + + #[test] + fn try_map_masked_in_place_single_failure_lane_exact() { + let mut values: Vec = (0..200).collect(); + values[42] = u32::MAX; + let mask = BitBuffer::new_set(200); + let res = values + .as_mut_slice() + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); + assert_eq!(res, Err(42)); + } + + #[test] + fn try_map_masked_in_place_null_bypass() { + let mut values: Vec = (0..200).collect(); + values[5] = u32::MAX; + let mask = { + let mut m = BitBufferMut::with_capacity(200); + for i in 0..200 { + m.append(i != 5); + } + m.freeze() + }; + let res = values + .as_mut_slice() + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); + assert!(res.is_ok(), "null-lane overflow should be filtered"); + // Null lane was overwritten with default (0). + assert_eq!(values[5], 0); + assert_eq!(values[6], 12); + } + + #[test] + fn try_map_masked_in_place_remainder_overflow() { + let mut values: Vec = (0..130).collect(); + values[129] = u32::MAX; + let mask = BitBuffer::new_set(130); + let res = values + .as_mut_slice() + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); + assert_eq!(res, Err(129)); + } + + #[test] + fn try_map_masked_in_place_sliced_mask() { + let big = BitBuffer::new_set(256); + let mask = big.slice(13..143); + assert_eq!(mask.len(), 130); + + let mut values: Vec = (0..130).collect(); + values[77] = u32::MAX; + let res = values + .as_mut_slice() + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); + assert_eq!(res, Err(77)); + } + + #[test] + fn reinterpret_sink_same_width_f32_u32() { + // Read f32, write u32-bits in place. After transmuting the slice back to u32 we + // should see exactly the bit patterns the closure produced. + let mut buf: Vec = (0..130).map(|i| i as f32).collect(); + let mask = BitBuffer::new_set(130); + ReinterpretSink::::new(buf.as_mut_slice()) + .try_map_masked_in_place(&mask, |f| Some(f.to_bits().wrapping_add(1))) + .unwrap(); + // SAFETY: same size + alignment for f32 and u32; every slot now holds a u32 written by + // the closure. + let as_u32: &[u32] = + unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const u32, buf.len()) }; + for (i, &got) in as_u32.iter().enumerate() { + assert_eq!(got, (i as f32).to_bits().wrapping_add(1), "lane {i}"); + } + } + + #[test] + fn reinterpret_sink_failure_reports_lane() { + // Closure fails at a specific lane; the kernel must report that lane index. + let mut buf: Vec = (0..200).map(|i| i as f32).collect(); + let mask = BitBuffer::new_set(200); + let res = ReinterpretSink::::new(buf.as_mut_slice()).try_map_masked_in_place( + &mask, + |f| { + if f as u32 == 137 { + None + } else { + Some(f as u32) + } + }, + ); + assert_eq!(res, Err(137)); + } + + #[test] + fn try_map_masked_in_place_partial_chunk_success() { + let mut values: Vec = (0..130).collect(); + let mask = BitBuffer::new_set(130); + let res = values + .as_mut_slice() + .try_map_masked_in_place(&mask, |v| Some(v + 1)); + assert!(res.is_ok()); + assert_eq!(values[0], 1); + assert_eq!(values[63], 64); + assert_eq!(values[64], 65); + assert_eq!(values[129], 130); + } +} diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs index 8319fffa387..667a1f11a9d 100644 --- a/vortex-buffer/src/lib.rs +++ b/vortex-buffer/src/lib.rs @@ -62,6 +62,7 @@ mod buffer_mut; mod bytes; mod r#const; mod debug; +pub mod lane_kernels; mod macros; #[cfg(feature = "memmap2")] mod memmap2;