From 7b5828f82478c88ae555685ffb3babe34a6932fc Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 26 May 2026 20:16:13 +0100 Subject: [PATCH 01/21] wip Signed-off-by: Joe Isaacs --- Cargo.lock | 4 + vortex-array/Cargo.toml | 1 + .../src/arrays/primitive/compute/cast.rs | 24 +- vortex-buffer/Cargo.toml | 10 + vortex-buffer/benches/cast_to.rs | 323 ++++++++ vortex-buffer/src/lane_ops.rs | 713 ++++++++++++++++++ vortex-buffer/src/lib.rs | 2 + 7 files changed, 1070 insertions(+), 7 deletions(-) create mode 100644 vortex-buffer/benches/cast_to.rs create mode 100644 vortex-buffer/src/lane_ops.rs diff --git a/Cargo.lock b/Cargo.lock index 045c72176fd..11afc6996a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9355,13 +9355,17 @@ dependencies = [ name = "vortex-buffer" version = "0.1.0" dependencies = [ + "arrow-array", "arrow-buffer", + "arrow-cast", + "arrow-schema", "bitvec", "bytes", "codspeed-divan-compat", "itertools 0.14.0", "memmap2", "num-traits", + "rand 0.10.1", "rstest", "serde", "simdutf8", diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index 666a23c02c4..e5233ce7cc6 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -218,3 +218,4 @@ harness = false [[bench]] name = "to_arrow" harness = false + diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs index 10c0b8d6eba..bbe2f89322d 100644 --- a/vortex-array/src/arrays/primitive/compute/cast.rs +++ b/vortex-array/src/arrays/primitive/compute/cast.rs @@ -5,6 +5,7 @@ use num_traits::AsPrimitive; use num_traits::NumCast; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; +use vortex_buffer::try_map_with_mask; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; @@ -143,13 +144,22 @@ where )? 
.freeze(), Mask::AllFalse(_) => BufferMut::::zeroed(values.len()).freeze(), - Mask::Values(m) => BufferMut::try_from_trusted_len_iter( - values.iter().zip(m.bit_buffer().iter()).map(|(&v, valid)| { - let factor = if valid { F::one() } else { F::zero() }; - ::from(v * factor).ok_or_else(overflow) - }), - )? - .freeze(), + Mask::Values(m) => { + let mut buffer = BufferMut::::with_capacity(values.len()); + try_map_with_mask( + values, + m.bit_buffer(), + &mut buffer.spare_capacity_mut()[..values.len()], + |v, valid| { + let factor = if valid { F::one() } else { F::zero() }; + ::from(v * factor) + }, + ) + .map_err(|_| overflow())?; + // SAFETY: try_map_with_mask returned Ok, so it initialized every lane. + unsafe { buffer.set_len(values.len()) }; + buffer.freeze() + } }; Ok(PrimitiveArray::new(buffer, new_validity).into_array()) diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml index ae9d7e6cc05..850aec4ec19 100644 --- a/vortex-buffer/Cargo.toml +++ b/vortex-buffer/Cargo.toml @@ -37,8 +37,14 @@ vortex-error = { workspace = true } workspace = true [dev-dependencies] +# TEMP: arrow-{array,cast,schema} are only used by the cast_to bench for cross-impl +# performance comparisons. Drop them when the bench is removed. +arrow-array = { workspace = true } +arrow-cast = { workspace = true } +arrow-schema = { workspace = true } divan = { workspace = true } num-traits = { workspace = true } +rand = { workspace = true } rstest = { workspace = true } [[bench]] @@ -48,3 +54,7 @@ harness = false [[bench]] name = "vortex_bitbuffer" harness = false + +[[bench]] +name = "cast_to" +harness = false diff --git a/vortex-buffer/benches/cast_to.rs b/vortex-buffer/benches/cast_to.rs new file mode 100644 index 00000000000..c070f65d3a0 --- /dev/null +++ b/vortex-buffer/benches/cast_to.rs @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Cast `u64 -> u32` over a nullable column, three ways: +//! +//! 1. 
`kernel_map_with_mask` — uses `map_with_mask`. Writes truncated values into a +//! pre-allocated `&mut [MaybeUninit]`. Null lanes write `0` via the branchless +//! `v * valid as u64` trick, mirroring `primitive/compute/cast.rs:147`. +//! 2. `iter_zip` — `values.iter().zip(mask.iter())` collected through +//! `BufferMut::from_trusted_len_iter`. This is the shape the current Vortex cast uses. +//! Not benchmarked in this file; only its checked form (item 5) is measured. +//! 3. `arrow_cast` — `arrow_cast::cast_with_options` with `safe = true` against a +//! `UInt64Array`, allocating a new `UInt32Array`. +//! +//! Plus three fallible variants that error on overflow: +//! +//! 4. `kernel_try_map_with_mask` — `try_map_with_mask` with `|v, valid| (v <= MAX).then_some(...)`. +//! Unconditional cast + parallel range check OR-reduced into a u64 fail accumulator. +//! 5. `iter_zip_checked` — `BufferMut::try_from_trusted_len_iter` returning Err on overflow. +//! 6. `arrow_cast_checked` — `arrow_cast::cast_with_options` with `safe = false` (errors on overflow). +//! +//! The `*_unaligned` benchmarks repeat a variant with a mask (or Arrow null buffer) sliced to +//! a bit offset of `SLICE_OFFSET`; `kernel_try_from_branchful` repeats item 4 with an +//! `if valid { .. } else { .. }` closure. +//! +//! Inputs are bounded to fit in `u32`, so the fallible variants always succeed and we +//! measure the cost of the range check on the success path. + +#![expect(clippy::unwrap_used)] + +use std::mem::MaybeUninit; + +use arrow_array::UInt64Array; +use arrow_buffer::NullBuffer; +use arrow_buffer::ScalarBuffer; +use arrow_cast::CastOptions; +use arrow_cast::cast_with_options; +use arrow_schema::DataType; +use divan::Bencher; +use rand::SeedableRng; +use rand::prelude::*; +use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_buffer::map_with_mask; +use vortex_buffer::try_map_with_mask; + +/// Benchmark entry point; delegates to `divan::main`. +fn main() { + divan::main(); +} + +/// Input lengths passed to every benchmark via `args = SIZES`. +const SIZES: &[usize] = &[4_096, 65_536, 1_048_576]; +/// Probability that a lane is valid (passed to `random_bool` in `fixture`). +const VALID_RATE: f64 = 0.7; +/// Seed for the value generator in `fixture`. +const DATA_SEED: u64 = 0; +/// Seed for the validity generator in `fixture`. +const VALID_SEED: u64 = 1; + +// Non-byte-aligned bit offset → forces BitChunks::iter() to shift across byte +// boundaries on every chunk it yields.
+const SLICE_OFFSET: usize = 5; + +/// Inputs shared by every benchmark for one input size; built by `fixture`. +struct Fixture { + /// Source values; `fixture` draws them from `0..u32::MAX`, so each one fits in `u32`. + values: Buffer, + /// `offset() == 0`, underlying byte buffer starts on a byte boundary. + mask_aligned: BitBuffer, + /// Same validity bits but sliced so `offset() == SLICE_OFFSET`. + mask_unaligned: BitBuffer, + /// Arrow array over the same values and validity bits, used by the `arrow_cast*` benchmarks. + arrow_arr: UInt64Array, + /// Same as `arrow_arr` but its NullBuffer has a non-byte-aligned bit offset, + /// constructed by building an oversized array and slicing. + arrow_arr_unaligned: UInt64Array, +} + +/// Builds the benchmark inputs for length `n`. +/// +/// Values and validity bits come from two generators seeded with `DATA_SEED` and +/// `VALID_SEED`; each lane is valid with probability `VALID_RATE`. The aligned and +/// unaligned masks (and Arrow arrays) carry the same validity pattern. +fn fixture(n: usize) -> Fixture { + let mut data_rng = StdRng::seed_from_u64(DATA_SEED); + let mut valid_rng = StdRng::seed_from_u64(VALID_SEED); + // Exclusive upper bound: every generated value is < u32::MAX, so the checked casts succeed. + let raw_values: Vec = (0..n) + .map(|_| data_rng.random_range(0..u32::MAX as u64)) + .collect(); + let raw_valid: Vec = (0..n).map(|_| valid_rng.random_bool(VALID_RATE)).collect(); + + let values: Buffer = raw_values.iter().copied().collect(); + + let mask_aligned = { + let mut m = BitBufferMut::with_capacity(n); + for &v in &raw_valid { + m.append(v); + } + m.freeze() + }; + + // Build n + SLICE_OFFSET bits then slice off the leading SLICE_OFFSET, so the + // remaining `n` lanes carry the SAME validity pattern as the aligned mask. + let mask_unaligned = { + let mut m = BitBufferMut::with_capacity(n + SLICE_OFFSET); + for _ in 0..SLICE_OFFSET { + m.append(false); // filler — sliced away + } + for &v in &raw_valid { + m.append(v); + } + m.freeze().slice(SLICE_OFFSET..SLICE_OFFSET + n) + }; + // NOTE(review): `debug_assert!` only runs when debug assertions are enabled; benchmark + // builds are usually optimized with them off, so these checks may never execute — confirm. + debug_assert_eq!(mask_unaligned.offset(), SLICE_OFFSET); + debug_assert_eq!(mask_unaligned.len(), n); + + let arrow_arr = UInt64Array::new( + ScalarBuffer::from(raw_values.clone()), + Some(NullBuffer::from(raw_valid.clone())), + ); + + // Oversized array → slice off SLICE_OFFSET lanes so the resulting array's + // NullBuffer has `offset() == SLICE_OFFSET`. The remaining `n` lanes hold the + // same validity pattern as `arrow_arr`.
+ let arrow_arr_unaligned = { + let mut padded_values: Vec = vec![0; SLICE_OFFSET]; + padded_values.extend_from_slice(&raw_values); + let mut padded_valid: Vec = vec![false; SLICE_OFFSET]; + padded_valid.extend_from_slice(&raw_valid); + let oversized = UInt64Array::new( + ScalarBuffer::from(padded_values), + Some(NullBuffer::from(padded_valid)), + ); + use arrow_array::Array; + let sliced = oversized.slice(SLICE_OFFSET, n); + // The closure parameter `n` below is the null buffer; it shadows the length `n`. + debug_assert_eq!( + sliced.nulls().map(|n| n.offset()).unwrap_or(0) % 8, + SLICE_OFFSET + ); + sliced + }; + + Fixture { + values, + mask_aligned, + mask_unaligned, + arrow_arr, + arrow_arr_unaligned, + } +} + +/// Options for the unchecked Arrow benchmarks (`safe: true`). +const CAST_OPTS: CastOptions<'static> = CastOptions { + safe: true, + format_options: arrow_cast::display::FormatOptions::new(), +}; + +/// Options for the checked Arrow benchmarks (`safe: false`, which the module docs +/// describe as erroring on overflow). +const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions { + safe: false, + format_options: arrow_cast::display::FormatOptions::new(), +}; + +/// Infallible kernel on the aligned mask: the closure zeroes null lanes with +/// `v * valid as u64` and truncates the result to `u32`. +#[divan::bench(args = SIZES)] +fn kernel_map_with_mask(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + // Owned uninit-slot vector, sized once outside the timed region. + let mut out: Vec> = Vec::with_capacity(n); + // SAFETY: every lane is written before any read inside the kernel.
+ unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_aligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + (v * valid as u64) as u32 + }); + }); +} + +/// Arrow baseline: `cast_with_options` to `UInt32` with `CAST_OPTS` on the aligned array. +#[divan::bench(args = SIZES)] +fn arrow_cast(bencher: Bencher, n: usize) { + // NOTE(review): redundant — `n` is already used by `fixture(n)` on the next line. + let _ = n; + let f = fixture(n); + bencher + .with_inputs(|| f.arrow_arr.clone()) + .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap()); +} + +/// Same as `arrow_cast`, but on the array whose null buffer was sliced to a bit offset. +#[divan::bench(args = SIZES)] +fn arrow_cast_unaligned(bencher: Bencher, n: usize) { + // NOTE(review): redundant — `n` is already used by `fixture(n)` on the next line. + let _ = n; + let f = fixture(n); + bencher + .with_inputs(|| f.arrow_arr_unaligned.clone()) + .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap()); +} + +/// Fallible kernel on the aligned mask: null lanes are zeroed first, then the value is +/// range-checked against `u32::MAX` and `None` is returned on overflow. +#[divan::bench(args = SIZES)] +fn kernel_try_map_with_mask(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + // SAFETY: every lane is written before any read inside the kernel. + unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_aligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }) + .unwrap(); + }); +} + +/// Same kernel, but the mask has `offset() == 5` so `BitChunks::iter()` must shift +/// across byte boundaries on every chunk. Quantifies the cost of unaligned mask access.
+#[divan::bench(args = SIZES)] +fn kernel_try_map_with_mask_unaligned(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + // SAFETY: capacity is `n` and the slots are `MaybeUninit`; the kernel writes every + // lane before anything reads it. + unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_unaligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }) + .unwrap(); + }); +} + +/// Unaligned-mask counterpart of `kernel_map_with_mask` (infallible). Pair with the +/// aligned variant above to isolate the mask-iteration cost from the closure. +#[divan::bench(args = SIZES)] +fn kernel_map_with_mask_unaligned(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + // SAFETY: capacity is `n` and the slots are `MaybeUninit`; the kernel writes every + // lane before anything reads it. + unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_unaligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + (v * valid as u64) as u32 + }); + }); +} + +/// Same fallible kernel (`try_map_with_mask`, aligned mask) but with the branchful +/// idiomatic closure. Tests whether autovectorization survives a per-lane +/// `if valid { ... } else { ... }` shape.
+#[divan::bench(args = SIZES)] +fn kernel_try_from_branchful(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + // SAFETY: capacity is `n` and the slots are `MaybeUninit`; the kernel writes every + // lane before anything reads it. + unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_aligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + if valid { + u32::try_from(v).ok() + } else { + Some(0_u32) + } + }) + .unwrap(); + }); +} + +/// Iterator baseline (aligned mask): zips values with mask bits and collects through +/// `BufferMut::try_from_trusted_len_iter`, yielding `Err(())` on overflow. +#[divan::bench(args = SIZES)] +fn iter_zip_checked(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| (f.values.clone(), f.mask_aligned.clone())) + .bench_refs(|(values, mask)| { + let buf: Buffer = BufferMut::try_from_trusted_len_iter( + values.iter().zip(mask.iter()).map(|(&v, valid)| { + let scaled = v * valid as u64; + if scaled <= u32::MAX as u64 { + Ok(scaled as u32) + } else { + Err(()) + } + }), + ) + .unwrap() + .freeze(); + buf + }); +} + +/// Same as `iter_zip_checked`, but iterating the mask sliced to a non-zero bit offset. +#[divan::bench(args = SIZES)] +fn iter_zip_checked_unaligned(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| (f.values.clone(), f.mask_unaligned.clone())) + .bench_refs(|(values, mask)| { + let buf: Buffer = BufferMut::try_from_trusted_len_iter( + values.iter().zip(mask.iter()).map(|(&v, valid)| { + let scaled = v * valid as u64; + if scaled <= u32::MAX as u64 { + Ok(scaled as u32) + } else { + Err(()) + } + }), + ) + .unwrap() + .freeze(); + buf + }); +} + +/// Arrow baseline with `CAST_OPTS_CHECKED` (`safe: false`) on the aligned array. +#[divan::bench(args = SIZES)] +fn arrow_cast_checked(bencher: Bencher, n: usize) { + // NOTE(review): redundant — `n` is already used by `fixture(n)` on the next line. + let _ = n; + let f = fixture(n); + bencher + .with_inputs(|| f.arrow_arr.clone()) + .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap()); +} + +/// Same as `arrow_cast_checked`, but on the array whose null buffer was sliced to a bit offset. +#[divan::bench(args = SIZES)] +fn arrow_cast_checked_unaligned(bencher: Bencher, n: usize) { + // NOTE(review): redundant — `n` is already used by `fixture(n)` on the next line. + let _ = n; + let f = fixture(n); + bencher + .with_inputs(|| f.arrow_arr_unaligned.clone()) + .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32,
&CAST_OPTS_CHECKED).unwrap()); +} diff --git a/vortex-buffer/src/lane_ops.rs b/vortex-buffer/src/lane_ops.rs new file mode 100644 index 00000000000..b145633465b --- /dev/null +++ b/vortex-buffer/src/lane_ops.rs @@ -0,0 +1,713 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Elementwise kernels that combine a `[T]` slice with a `BitBuffer` validity mask. +//! +//! The output is always a caller-provided `&mut` slice — these kernels never allocate. +//! The mask-taking kernels handle a mask with a non-byte-aligned offset and with a logical +//! `len` shorter than the underlying byte buffer, via [`BitBuffer::chunks`]. + +use std::mem::MaybeUninit; + +use crate::BitBuffer; + +/// Apply `f(value, valid)` lane-by-lane, writing `out[i] = f(values[i], mask[i])`. +/// +/// All three inputs must have the same length. The output type `R` may differ from the +/// input type `T` — this kernel is the building block for both same-type transforms +/// (fill_null) and cross-type ones (cast). The caller is responsible for marking `out` +/// initialized (e.g. by calling `BufferMut::set_len` after this returns). +/// +/// # Panics +/// +/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`. +#[inline] +pub fn map_with_mask(values: &[T], mask: &BitBuffer, out: &mut [MaybeUninit], mut f: F) +where + T: Copy, + F: FnMut(T, bool) -> R, +{ + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + assert_eq!(out.len(), len, "out must have the same length as values"); + + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + let base = chunk_idx * 64; + // Inner loop is fixed-size 64 so the compiler can autovectorize + // for branchless closures like `|v, valid| v * (valid as T)`.
+ for bit_idx in 0..64 { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: chunks.iter() yields chunks_count full words, so i < chunks_count * 64 <= len. + // NOTE(review): this assumes `mask.chunks().iter()` yields exactly `len / 64` words; + // the chunk iterator is defined outside this file — confirm. + let v = unsafe { *values.get_unchecked(i) }; + // SAFETY: same bound on `i`; `out.len() == len` was asserted above. + unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; + } + } + + if remainder != 0 { + // Trailing lanes that do not fill a whole 64-bit word. + let src_chunk = chunks.remainder_bits(); + let base = chunks_count * 64; + for bit_idx in 0..remainder { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i = chunks_count * 64 + bit_idx < chunks_count * 64 + remainder = len. + let v = unsafe { *values.get_unchecked(i) }; + // SAFETY: same bound on `i`; `out.len() == len` was asserted above. + unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; + } + } +} + +/// Fallible variant of [`map_with_mask`]. `f` returns `Option`; `None` indicates a +/// per-lane failure (e.g. range overflow on a narrowing cast). +/// +/// The kernel does not short-circuit on the first failure inside a chunk: it processes +/// whole 64-lane chunks with `is_none()` flags OR-reduced into a single accumulator, +/// then checks after each chunk. On failure, a cold scalar attribution pass replays the +/// closure over that chunk to identify the first failing lane. The hot loop stays +/// autovectorizable — the per-lane cost is one OR on top of the cast. +/// +/// Because of the replay, `f` can be called more than once for lanes of the failing +/// chunk. A stateful `f` that does not return `None` again during the replay reaches the +/// `unreachable!` in the attribution pass, so `f` should be deterministic per lane. +/// +/// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` write +/// `R::default()` into `out`, but the contents of `out` must not be relied upon when +/// this function returns `Err`: the kernel returns right after the failing chunk, so +/// lanes in later chunks are never written and `out` must not be marked initialized. +/// +/// # Panics +/// +/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
+#[inline] +pub fn try_map_with_mask( + values: &[T], + mask: &BitBuffer, + out: &mut [MaybeUninit], + mut f: F, +) -> Result<(), usize> +where + T: Copy, + R: Copy + Default, + F: FnMut(T, bool) -> Option, +{ + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + assert_eq!(out.len(), len, "out must have the same length as values"); + + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + let base = chunk_idx * 64; + // Per-chunk accumulator — does not escape the SIMD inner loop. + let mut fail_acc: u64 = 0; + for bit_idx in 0..64 { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < chunks_count * 64 <= len. + // NOTE(review): this assumes `mask.chunks().iter()` yields exactly `len / 64` words; + // the chunk iterator is defined outside this file — confirm. + let v = unsafe { *values.get_unchecked(i) }; + let opt = f(v, bit); + fail_acc |= opt.is_none() as u64; + // A failed lane is still written (with `R::default()`) rather than skipped. + let r = opt.unwrap_or_default(); + // SAFETY: i < len. + unsafe { out.get_unchecked_mut(i).write(r) }; + } + if fail_acc != 0 { + // Early return: lanes in later chunks are left unwritten. `f` is replayed over + // this chunk to find the first failing lane. + return Err(attribute_failure(values, src_chunk, base, 64, &mut f)); + } + } + + if remainder != 0 { + // Trailing lanes that do not fill a whole 64-bit word. + let src_chunk = chunks.remainder_bits(); + let base = chunks_count * 64; + let mut fail_acc: u64 = 0; + for bit_idx in 0..remainder { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < len. + let v = unsafe { *values.get_unchecked(i) }; + let opt = f(v, bit); + fail_acc |= opt.is_none() as u64; + let r = opt.unwrap_or_default(); + // SAFETY: i < len. + unsafe { out.get_unchecked_mut(i).write(r) }; + } + if fail_acc != 0 { + return Err(attribute_failure( + values, src_chunk, base, remainder, &mut f, + )); + } + } + + Ok(()) +} + +/// Cold path: identify the first lane in a chunk where `f` returned `None`. +/// +/// Called only after the hot loop has detected that at least one lane failed.
+/// Walks the chunk scalar-style; not autovectorized, but that's fine — it only +/// runs once per error and the error path is supposed to be exceptional. +/// +/// Returns the index into `values` of the first lane in `[base, base + chunk_len)` for +/// which `f` returns `None`. `src_chunk` holds that chunk's validity bits, lane `base` in +/// bit 0. +/// +/// NOTE(review): `f` is invoked again here. A stateful `FnMut` that does not fail again on +/// replay falls through to the `unreachable!` at the end. +#[cold] +#[inline(never)] +fn attribute_failure( + values: &[T], + src_chunk: u64, + base: usize, + chunk_len: usize, + f: &mut F, +) -> usize +where + T: Copy, + F: FnMut(T, bool) -> Option, +{ + for bit_idx in 0..chunk_len { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees i < values.len(). + // NOTE(review): this is a safe fn, so soundness rests on every caller passing an + // in-bounds `[base, base + chunk_len)` range. + let v = unsafe { *values.get_unchecked(i) }; + if f(v, bit).is_none() { + return i; + } + } + // Unreachable: hot loop's OR-reduction said at least one lane in [base, base+chunk_len) failed. + unreachable!("attribute_failure called without a failing lane") +} + +/// Apply `f(value) -> bool` lane-by-lane, packing into `out` as `u64` words. +/// +/// This is the validity-free sibling of [`map_with_mask_to_bits`]. Use it when the +/// predicate is a pure function of the value (e.g. compare-to-constant on a primitive +/// buffer) and combine the validity bitmap in a separate pass — splitting the work +/// this way lets the value-compare loop autovectorize cleanly. +/// +/// Lane `i` is stored in bit `i % 64` of `out[i / 64]`, least-significant bit first. +/// +/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word +/// beyond `len % 64` are written as `0`. +/// +/// # Panics +/// +/// Panics if `out.len() != values.len().div_ceil(64)`. +#[inline] +pub fn map_to_bits(values: &[T], out: &mut [u64], mut f: F) +where + T: Copy, + F: FnMut(T) -> bool, +{ + let len = values.len(); + assert_eq!( + out.len(), + len.div_ceil(64), + "out must have len.div_ceil(64) words", + ); + + let chunks_count = len / 64; + let remainder = len % 64; + + for chunk_idx in 0..chunks_count { + let base = chunk_idx * 64; + let mut packed = 0u64; + for bit_idx in 0..64 { + // SAFETY: base + bit_idx < chunks_count * 64 <= len.
+ let v = unsafe { *values.get_unchecked(base + bit_idx) }; + packed |= (f(v) as u64) << bit_idx; + } + // SAFETY: chunk_idx < chunks_count <= out.len(). + unsafe { *out.get_unchecked_mut(chunk_idx) = packed }; + } + + if remainder != 0 { + let base = chunks_count * 64; + // Starts at zero, so bits at and above `remainder` stay unset in the final word. + let mut packed = 0u64; + for bit_idx in 0..remainder { + // SAFETY: base + bit_idx < len. + let v = unsafe { *values.get_unchecked(base + bit_idx) }; + packed |= (f(v) as u64) << bit_idx; + } + // SAFETY: chunks_count < out.len() because remainder != 0. + unsafe { *out.get_unchecked_mut(chunks_count) = packed }; + } +} + +/// Apply `f(value, valid) -> bool` lane-by-lane, packing into `out` as `u64` words. +/// +/// Lane `i` is stored in bit `i % 64` of `out[i / 64]`, least-significant bit first. +/// +/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word +/// beyond `len % 64` are written as `0`. +/// +/// # Panics +/// +/// Panics if `values.len() != mask.len()` or `out.len() != values.len().div_ceil(64)`. +#[inline] +pub fn map_with_mask_to_bits(values: &[T], mask: &BitBuffer, out: &mut [u64], mut f: F) +where + T: Copy, + F: FnMut(T, bool) -> bool, +{ + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + assert_eq!( + out.len(), + len.div_ceil(64), + "out must have len.div_ceil(64) words", + ); + + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + let base = chunk_idx * 64; + let mut packed = 0u64; + for bit_idx in 0..64 { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < chunks_count * 64 <= len. + // NOTE(review): this assumes `mask.chunks().iter()` yields exactly `len / 64` words; + // the chunk iterator is defined outside this file — confirm. + let v = unsafe { *values.get_unchecked(i) }; + packed |= (f(v, bit) as u64) << bit_idx; + } + // SAFETY: chunk_idx < chunks_count <= out.len().
+ unsafe { *out.get_unchecked_mut(chunk_idx) = packed }; + } + + if remainder != 0 { + let src_chunk = chunks.remainder_bits(); + let base = chunks_count * 64; + // Starts at zero, so bits at and above `remainder` stay unset in the final word. + let mut packed = 0u64; + for bit_idx in 0..remainder { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < len. + let v = unsafe { *values.get_unchecked(i) }; + packed |= (f(v, bit) as u64) << bit_idx; + } + // SAFETY: chunks_count < out.len() because remainder != 0. + unsafe { *out.get_unchecked_mut(chunks_count) = packed }; + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::BitBufferMut; + + /// Test helper: reinterprets a fully written output vector as a vector of plain values. + /// + /// NOTE(review): the generic parameters are not visible here; presumably this maps + /// `Vec<MaybeUninit<T>>` to `Vec<T>`. `transmute` between two `Vec` instantiations relies + /// on unspecified `Vec` layout; the std docs recommend rebuilding the vector with + /// `Vec::from_raw_parts` instead — confirm and consider switching. + fn write_t(out: Vec>) -> Vec { + // SAFETY: tests always fully initialize the buffer. + unsafe { std::mem::transmute(out) } + } + + #[test] + fn map_with_mask_aligned() { + let values: Vec = (0..10).collect(); + let mask = { + let mut m = BitBufferMut::with_capacity(10); + for i in 0..10 { + m.append(i % 2 == 0); + } + m.freeze() + }; + let mut out = vec![MaybeUninit::::uninit(); 10]; + map_with_mask( + &values, + &mask, + &mut out, + |v, valid| if valid { v } else { -1 }, + ); + assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]); + } + + #[test] + fn map_with_mask_partial_chunk() { + // 130 lanes — two full u64 words + a 2-bit remainder. + let values: Vec = (0..130).collect(); + let mask = BitBuffer::new_set(130); + let mut out = vec![MaybeUninit::::uninit(); 130]; + map_with_mask( + &values, + &mask, + &mut out, + |v, valid| if valid { v + 1 } else { 0 }, + ); + let got = write_t(out); + assert_eq!(got.len(), 130); + assert_eq!(got[0], 1); + assert_eq!(got[63], 64); + assert_eq!(got[64], 65); + assert_eq!(got[129], 130); + } + + #[test] + fn map_with_mask_offset_mask() { + // Build a 128-bit all-true mask, then slice off the first 5 bits to force offset=5.
+ let big = BitBuffer::new_set(128); + let sliced = big.slice(5..70); // logical len = 65, offset = 5 + assert_eq!(sliced.len(), 65); + assert_eq!(sliced.offset(), 5); + + let values: Vec = (0..65).collect(); + let mut out = vec![MaybeUninit::::uninit(); 65]; + map_with_mask( + &values, + &sliced, + &mut out, + |v, valid| if valid { v } else { u32::MAX }, + ); + let got = write_t(out); + assert_eq!(got, (0..65).collect::>()); + } + + #[test] + fn map_with_mask_offset_past_word() { + // Slicing past a full word still works. `BitBuffer::slice` normalizes the + // logical offset to `offset % 8` and bumps the underlying byte pointer, + // so `offset()` won't equal 70 here — what we exercise is that the kernel + // walks the chunked u64 view (which BitChunks handles internally). + let big = BitBuffer::new_set(256); + let sliced = big.slice(70..200); + assert_eq!(sliced.len(), 130); + + let values: Vec = (0..130).map(|i| i as i16).collect(); + let mut out = vec![MaybeUninit::::uninit(); 130]; + map_with_mask( + &values, + &sliced, + &mut out, + |v, valid| if valid { v } else { -1 }, + ); + let got = write_t(out); + assert_eq!(got, (0..130).map(|i| i as i16).collect::>()); + } + + #[test] + fn map_with_mask_empty() { + let values: Vec = vec![]; + let mask = BitBuffer::new_unset(0); + let mut out: Vec> = vec![]; + map_with_mask(&values, &mask, &mut out, |v, _| v); + } + + #[test] + fn map_with_mask_null_to_zero_branchless() { + // The trick from primitive/compute/cast.rs:147 — multiply by valid as T. 
+ let values: Vec = (1..=100).collect(); + let mask = { + let mut m = BitBufferMut::with_capacity(100); + for i in 0..100 { + m.append(i % 3 != 0); + } + m.freeze() + }; + let mut out = vec![MaybeUninit::::uninit(); 100]; + map_with_mask(&values, &mask, &mut out, |v, valid| v * (valid as i64)); + let got = write_t(out); + for (i, &x) in got.iter().enumerate() { + if i % 3 == 0 { + assert_eq!(x, 0); + } else { + assert_eq!(x, (i + 1) as i64); + } + } + } + + #[test] + fn map_with_mask_to_bits_aligned() { + let values: Vec = (0..128).collect(); + let mask = BitBuffer::new_set(128); + let mut out = vec![0u64; 2]; + map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v % 2 == 0); + // Even numbers in [0, 128) set, odd unset. + for word_idx in 0..2 { + let word = out[word_idx]; + for bit in 0..64 { + let i = word_idx * 64 + bit; + let expected = i % 2 == 0; + assert_eq!((word >> bit) & 1 == 1, expected, "lane {i}"); + } + } + } + + #[test] + fn map_with_mask_to_bits_partial_chunk() { + // 130 lanes — three u64 words, last word has only 2 valid bits. + let values: Vec = (0..130).collect(); + let mask = BitBuffer::new_set(130); + let mut out = vec![0u64; 130usize.div_ceil(64)]; + assert_eq!(out.len(), 3); + map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v >= 64); + // Bits 64..128 set in word 1; bits 128..130 set in word 2. 
+ assert_eq!(out[0], 0); + assert_eq!(out[1], u64::MAX); + assert_eq!(out[2], 0b11); + } + + #[test] + fn map_with_mask_to_bits_offset() { + let big = BitBuffer::new_set(256); + let sliced = big.slice(13..143); // offset=13, len=130 + assert_eq!(sliced.len(), 130); + let values: Vec = (0..130).map(|i| (i % 4) as u8).collect(); + let mut out = vec![0u64; 130usize.div_ceil(64)]; + map_with_mask_to_bits(&values, &sliced, &mut out, |v, valid| valid && v == 0); + for i in 0..130 { + let word = out[i / 64]; + let bit = (word >> (i % 64)) & 1 == 1; + assert_eq!(bit, i % 4 == 0, "lane {i}"); + } + } + + #[test] + fn try_map_with_mask_all_ok() { + let values: Vec = (0..200).collect(); + let mask = BitBuffer::new_set(200); + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert!(res.is_ok()); + let got = write_t(out); + assert_eq!(got, (0..200u32).collect::>()); + } + + #[test] + fn try_map_with_mask_overflow_fails() { + // Put an overflowing value at lane 137 — the kernel must report Err(137). + let mut values: Vec = (0..200).collect(); + values[137] = (u32::MAX as u64) + 1; + let mask = BitBuffer::new_set(200); + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert_eq!(res, Err(137)); + } + + #[test] + fn try_map_with_mask_overflow_reports_first_failing_lane() { + // Multiple failing lanes — must report the lowest index. 
+ let mut values: Vec = (0..200).collect(); + values[50] = u64::MAX; + values[51] = u64::MAX; + values[137] = u64::MAX; + let mask = BitBuffer::new_set(200); + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert_eq!(res, Err(50)); + } + + #[test] + fn try_map_with_mask_null_lane_bypasses_check() { + // Null lanes are neutralized by `valid as u64` before the range check, so an + // out-of-range value at a null lane must NOT trigger failure. + let mut values: Vec = (0..200).collect(); + values[5] = u64::MAX; + let mask = { + let mut m = BitBufferMut::with_capacity(200); + for i in 0..200 { + m.append(i != 5); + } + m.freeze() + }; + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert!(res.is_ok()); + let got = write_t(out); + assert_eq!(got[5], 0); // null-lane wrote default + assert_eq!(got[6], 6); + } + + #[test] + fn try_map_with_mask_branchful_matches_branchless() { + let mut values: Vec = (0..130).map(|i| i as u64 * 7).collect(); + values[2] = u64::MAX; + values[65] = u32::MAX as u64; + let mask = { + let mut m = BitBufferMut::with_capacity(130); + for i in 0..130 { + m.append(!matches!(i, 2 | 17 | 99)); + } + m.freeze() + }; + + let mut branchless = vec![MaybeUninit::::uninit(); 130]; + let mut branchful = vec![MaybeUninit::::uninit(); 130]; + try_map_with_mask(&values, &mask, &mut branchless, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }) + .unwrap(); + try_map_with_mask(&values, &mask, &mut branchful, |v, valid| { + if valid { + u32::try_from(v).ok() + } else { + Some(0) + } + }) + .unwrap(); + + assert_eq!(write_t(branchful), write_t(branchless)); + } + + #[test] + 
fn try_map_with_mask_partial_chunk() { + let values: Vec = (0..130).collect(); + let mask = BitBuffer::new_set(130); + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert!(res.is_ok()); + let got = write_t(out); + assert_eq!(got.len(), 130); + assert_eq!(got[129], 129); + } + + #[test] + fn try_map_with_mask_sliced_mask_unaligned_offset() { + // The mask's first byte is not word-aligned: slice off 13 bits, so the + // underlying BitChunks iterator must shift across byte boundaries on every + // 64-bit chunk it yields. + let big = BitBuffer::new_set(256); + let mask = big.slice(13..143); // logical len = 130, bit offset = 13 % 8 = 5 + assert_eq!(mask.len(), 130); + + let values: Vec = (0..130).collect(); + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert!(res.is_ok()); + let got = write_t(out); + assert_eq!(got, (0..130u32).collect::>()); + } + + #[test] + fn try_map_with_mask_sliced_mask_with_overflow() { + // Sliced mask + overflowing value — the cold attribution path must report + // the correct lane index in the sliced (post-offset) coordinate space. + let big = BitBuffer::new_set(256); + let mask = big.slice(13..143); + assert_eq!(mask.len(), 130); + + let mut values: Vec = (0..130).collect(); + values[77] = u64::MAX; + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert_eq!(res, Err(77)); + } + + #[test] + fn try_map_with_mask_sliced_mask_null_lanes() { + // Mix sliced offset with a non-trivial validity pattern. 
Null lanes must + // not contribute to fail_acc, even when their underlying value would overflow. + let mut m = BitBufferMut::with_capacity(256); + for i in 0..256 { + m.append(i % 3 != 0); + } + let big = m.freeze(); + let mask = big.slice(13..143); + assert_eq!(mask.len(), 130); + + // After the 13-lane slice, original index `13 + j` becomes lane `j`. + // Lane `j` is valid iff `(13 + j) % 3 != 0`. + let mut values: Vec = (0..130).collect(); + // Pick a lane that is INVALID in the sliced coords: 13+2 = 15, 15 % 3 == 0 → invalid. + // Stuff in an overflowing value; it must be neutralized by `* valid as u64`. + values[2] = u64::MAX; + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert!(res.is_ok(), "null lane should bypass the range check"); + } + + #[test] + fn try_map_with_mask_overflow_in_remainder() { + // Overflow in the trailing partial chunk (not aligned to 64). 
+ let mut values: Vec = (0..130).collect(); + values[129] = (u32::MAX as u64) + 1; + let mask = BitBuffer::new_set(130); + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert_eq!(res, Err(129)); + } + + #[test] + fn map_to_bits_aligned() { + let values: Vec = (0..128).collect(); + let mut out = vec![0u64; 2]; + map_to_bits(&values, &mut out, |v| v % 2 == 0); + for word_idx in 0..2 { + for bit in 0..64 { + let i = word_idx * 64 + bit; + let expected = i % 2 == 0; + assert_eq!((out[word_idx] >> bit) & 1 == 1, expected, "lane {i}"); + } + } + } + + #[test] + fn map_to_bits_partial_chunk() { + let values: Vec = (0..130).collect(); + let mut out = vec![0u64; 130usize.div_ceil(64)]; + assert_eq!(out.len(), 3); + map_to_bits(&values, &mut out, |v| v >= 64); + assert_eq!(out[0], 0); + assert_eq!(out[1], u64::MAX); + assert_eq!(out[2], 0b11); + } + + #[test] + fn map_to_bits_empty() { + let values: Vec = vec![]; + let mut out: Vec = vec![]; + map_to_bits(&values, &mut out, |v| v > 0); + } + + #[test] + fn map_to_bits_matches_fused_with_all_valid_mask() { + // map_to_bits + AND with an all-true mask must equal map_with_mask_to_bits. + let values: Vec = (0..200).map(|i| i % 7).collect(); + let mask = BitBuffer::new_set(200); + + let mut a = vec![0u64; 200usize.div_ceil(64)]; + map_with_mask_to_bits(&values, &mask, &mut a, |v, valid| valid && v == 3); + + let mut b = vec![0u64; 200usize.div_ceil(64)]; + map_to_bits(&values, &mut b, |v| v == 3); + + assert_eq!(a, b); + } + + #[test] + fn map_with_mask_to_bits_validity_kills_lane() { + // Even if predicate is true, null lanes should produce false. 
+ let values: Vec = vec![1; 70]; + let mask = { + let mut m = BitBufferMut::with_capacity(70); + for i in 0..70 { + m.append(i >= 32); // first 32 lanes are null + } + m.freeze() + }; + let mut out = vec![0u64; 70usize.div_ceil(64)]; + map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v == 1); + for i in 0..70 { + let bit = (out[i / 64] >> (i % 64)) & 1 == 1; + assert_eq!(bit, i >= 32, "lane {i}"); + } + } +} diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs index 8319fffa387..592762d7a26 100644 --- a/vortex-buffer/src/lib.rs +++ b/vortex-buffer/src/lib.rs @@ -52,6 +52,7 @@ pub use buffer::*; pub use buffer_mut::*; pub use bytes::*; pub use r#const::*; +pub use lane_ops::*; pub use string::*; mod alignment; #[cfg(feature = "arrow")] @@ -62,6 +63,7 @@ mod buffer_mut; mod bytes; mod r#const; mod debug; +mod lane_ops; mod macros; #[cfg(feature = "memmap2")] mod memmap2; From 85ef2f8893f0479e971ac340366735aebcc7b709 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 10:44:19 +0100 Subject: [PATCH 02/21] wip Signed-off-by: Joe Isaacs --- Cargo.lock | 1 + vortex-array/benches/cast_primitive.rs | 47 + .../src/arrays/primitive/compute/cast.rs | 86 +- vortex-buffer/Cargo.toml | 21 +- vortex-buffer/benches/cast_to_indexed.rs | 467 ++++++ vortex-buffer/src/lane_ops_indexed.rs | 1261 +++++++++++++++++ vortex-buffer/src/lib.rs | 6 + 7 files changed, 1837 insertions(+), 52 deletions(-) create mode 100644 vortex-buffer/benches/cast_to_indexed.rs create mode 100644 vortex-buffer/src/lane_ops_indexed.rs diff --git a/Cargo.lock b/Cargo.lock index 11afc6996a2..9bb032d0d35 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9355,6 +9355,7 @@ dependencies = [ name = "vortex-buffer" version = "0.1.0" dependencies = [ + "arrow-arith", "arrow-array", "arrow-buffer", "arrow-cast", diff --git a/vortex-array/benches/cast_primitive.rs b/vortex-array/benches/cast_primitive.rs index 86895fb2ce7..0b67571e93d 100644 --- 
a/vortex-array/benches/cast_primitive.rs +++ b/vortex-array/benches/cast_primitive.rs @@ -20,6 +20,10 @@ fn main() { const N: usize = 100_000; +// Sizes used for the fallible-path benches below. Kept small enough to fit in L2 so +// the kernel cost shows up clearly rather than being hidden by DRAM bandwidth. +const SIZES: &[usize] = &[65_536]; + #[divan::bench] fn cast_u16_to_u32(bencher: Bencher) { let mut rng = StdRng::seed_from_u64(42); @@ -46,3 +50,46 @@ fn cast_u16_to_u32(bencher: Bencher) { .execute::(&mut LEGACY_SESSION.create_execution_ctx()) }); } + +/// Narrowing fallible cast that goes through `try_map_with_mask`. Inputs are bounded +/// so every value fits, isolating the kernel's per-lane checked-cast overhead. +#[divan::bench(args = SIZES)] +fn cast_u32_to_u8(bencher: Bencher, n: usize) { + let mut rng = StdRng::seed_from_u64(42); + #[expect(clippy::cast_possible_truncation)] + let arr = PrimitiveArray::from_option_iter((0..n).map(|_| { + if rng.random_bool(0.7) { + Some(rng.random_range(0..u8::MAX) as u32) + } else { + None + } + })) + .into_array(); + bencher.with_inputs(|| arr.clone()).bench_refs(|a| { + #[expect(clippy::unwrap_used)] + a.cast(DType::Primitive(PType::U8, Nullability::Nullable)) + .unwrap() + .execute::(&mut LEGACY_SESSION.create_execution_ctx()) + }); +} + +/// Sign-change cast i32 → u32. Values are non-negative so the kernel succeeds +/// but still pays the per-lane `try_from` check. 
+#[divan::bench(args = SIZES)] +fn cast_i32_to_u32(bencher: Bencher, n: usize) { + let mut rng = StdRng::seed_from_u64(42); + let arr = PrimitiveArray::from_option_iter((0..n).map(|_| { + if rng.random_bool(0.7) { + Some(rng.random_range(0..i32::MAX)) + } else { + None + } + })) + .into_array(); + bencher.with_inputs(|| arr.clone()).bench_refs(|a| { + #[expect(clippy::unwrap_used)] + a.cast(DType::Primitive(PType::U32, Nullability::Nullable)) + .unwrap() + .execute::(&mut LEGACY_SESSION.create_execution_ctx()) + }); +} diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs index bbe2f89322d..dd5abc2f164 100644 --- a/vortex-array/src/arrays/primitive/compute/cast.rs +++ b/vortex-array/src/arrays/primitive/compute/cast.rs @@ -1,11 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use num_traits::AsPrimitive; use num_traits::NumCast; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; -use vortex_buffer::try_map_with_mask; +use vortex_buffer::lane_ops_indexed::try_map_no_validity; +use vortex_buffer::lane_ops_indexed::try_map_with_mask; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; @@ -103,32 +103,28 @@ impl CastKernel for Primitive { } } -/// Cast values from `F` to `T`. For infallible casts this is a pure pass; for fallible casts -/// each valid value goes through a checked `NumCast::from` and the kernel bails if any of them -/// overflow `T`. Invalid positions use the wrapping `as` cast since their values are masked out. +/// Cast values from `F` to `T`. Always routes through the fallible lane-op kernels with +/// `NumCast::from`. The kernel branches once on the mask shape: +/// +/// - `Mask::AllTrue` → [`try_map_no_validity`] — no per-lane validity work. +/// - `Mask::AllFalse` → bulk zero — the closure is never invoked. 
+/// - `Mask::Values` → [`try_map_with_mask`] — the closure attempts the checked cast +/// first and consults `valid` only when it fails, mapping a failed null lane to +/// zero so out-of-range null-lane values don't trigger spurious errors. +/// +/// For statically-infallible casts (e.g. widening) LLVM proves `NumCast::from` always +/// returns `Some` and strips the fail-tracking machinery, generating the same bare +/// `ushll` widen loop the old hand-written `as_()` fast path produced. fn cast_values( array: ArrayView<'_, Primitive>, new_validity: Validity, ctx: &mut ExecutionCtx, ) -> VortexResult where - F: NativePType + AsPrimitive, + F: NativePType, T: NativePType, { let values = array.as_slice::(); - - // Fast path: statically infallible, or cached min/max prove every valid value fits in `T`. - // The cached check never triggers a stats computation — if the bounds aren't already known - // we fall through to the per-lane loop below. - if values_always_fit(F::PTYPE, T::PTYPE) || values_fit_in(array, T::PTYPE, ctx, false) { - return Ok(PrimitiveArray::new(cast::(values), new_validity).into_array()); - } - - // TODO(joe): if the values source and target have the same bit-width we can - // mutate in place. - - // Fallible: invalid lanes are pre-multiplied to zero so the checked cast always succeeds for - // them; valid lanes go through `NumCast::from` and the whole cast bails on the first overflow. let mask = array.validity()?.execute_mask(array.len(), ctx)?; let overflow = || { vortex_err!( @@ -136,13 +132,20 @@ where F::PTYPE, T::PTYPE, ) }; + let buffer: Buffer = match &mask { - Mask::AllTrue(_) => BufferMut::try_from_trusted_len_iter( - values - .iter() - .map(|&v| ::from(v).ok_or_else(overflow)), - )? 
- .freeze(), + Mask::AllTrue(_) => { + let mut buffer = BufferMut::::with_capacity(values.len()); + try_map_no_validity( + values, + &mut buffer.spare_capacity_mut()[..values.len()], + |v| ::from(v), + ) + .map_err(|_| overflow())?; + // SAFETY: try_map_no_validity returned Ok, so it initialized every lane. + unsafe { buffer.set_len(values.len()) }; + buffer.freeze() + } Mask::AllFalse(_) => BufferMut::::zeroed(values.len()).freeze(), Mask::Values(m) => { let mut buffer = BufferMut::::with_capacity(values.len()); @@ -150,9 +153,15 @@ where values, m.bit_buffer(), &mut buffer.spare_capacity_mut()[..values.len()], + // Lazy validity: only consult `valid` on the failure branch. For + // widening / statically-infallible casts, `NumCast::from` is always + // `Some` so the `or_else` is provably dead — LLVM DCEs the validity + // path entirely, giving the same codegen as the maskless kernel. + // For narrowing, `valid` is only read at lanes that actually + // overflowed (a cold check on top of the cast). |v, valid| { - let factor = if valid { F::one() } else { F::zero() }; - ::from(v * factor) + ::from(v) + .or_else(|| (!valid).then(T::zero)) }, ) .map_err(|_| overflow())?; @@ -165,12 +174,6 @@ where Ok(PrimitiveArray::new(buffer, new_validity).into_array()) } -/// Out-of-range values at invalid positions are truncated/wrapped by `as`, which is fine because -/// they are masked out by validity. -fn cast, T: NativePType>(array: &[F]) -> Buffer { - BufferMut::from_trusted_len_iter(array.iter().map(|&src| src.as_())).freeze() -} - fn reinterpret( array: ArrayView<'_, Primitive>, new_ptype: PType, @@ -188,23 +191,6 @@ fn reinterpret( .into_array() } -/// Returns `true` if every value of `src` is guaranteed representable in `target` without -/// overflow. Precision may be lost (e.g. large integers cast to `f32`), but the cast can never -/// produce an out-of-range result. 
-fn values_always_fit(src: PType, target: PType) -> bool { - if src == target { - return true; - } - if src.is_int() && target.is_int() { - return target.byte_width() > src.byte_width() - && (src.is_unsigned_int() || target.is_signed_int()); - } - if src.is_float() && target.is_float() { - return target.byte_width() > src.byte_width(); - } - src.is_int() && matches!(target, PType::F32 | PType::F64) -} - /// Returns `true` if all valid values in `array` are representable as `target_ptype`. /// /// Cached min/max statistics are consulted first. If either bound is missing, the function either diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml index 850aec4ec19..42c882004bd 100644 --- a/vortex-buffer/Cargo.toml +++ b/vortex-buffer/Cargo.toml @@ -37,8 +37,9 @@ vortex-error = { workspace = true } workspace = true [dev-dependencies] -# TEMP: arrow-{array,cast,schema} are only used by the cast_to bench for cross-impl -# performance comparisons. Drop them when the bench is removed. +# TEMP: arrow-* are only used by the cast_to / add_checked benches for cross-impl +# performance comparisons. Drop them when the benches are removed. +arrow-arith = { workspace = true } arrow-array = { workspace = true } arrow-cast = { workspace = true } arrow-schema = { workspace = true } @@ -58,3 +59,19 @@ harness = false [[bench]] name = "cast_to" harness = false + +[[bench]] +name = "cast_to_indexed" +harness = false + +[[bench]] +name = "cast_iter_all" +harness = false + +[[bench]] +name = "cast_in_place" +harness = false + +[[bench]] +name = "add_checked" +harness = false diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs new file mode 100644 index 00000000000..b2abe29b890 --- /dev/null +++ b/vortex-buffer/benches/cast_to_indexed.rs @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! 
Mirror of `cast_to.rs` driving the kernels through [`vortex_buffer::lane_ops_indexed`] +//! (the `IndexedSource` trait) plus isolation benches that decompose the cost of the +//! kernel structure vs. the cast vs. the mask access. +//! +//! See `vortex-buffer/HISTORY.md` for the iterator-API investigation that motivated +//! this design: a stateful `ExactSizeIterator` variant of these kernels was ~+100% +//! slower because per-lane `next()` calls create a 64-deep dependency chain across +//! iterations that blocks vectorization. The `IndexedSource` trait uses +//! `unsafe fn get_unchecked(i)` reads — independent across iterations — and inlines +//! to the same indexed load as the slice kernel. + +#![expect(clippy::unwrap_used)] + +use std::mem::MaybeUninit; + +use arrow_array::UInt64Array; +use arrow_buffer::NullBuffer; +use arrow_buffer::ScalarBuffer; +use arrow_cast::CastOptions; +use arrow_cast::cast_with_options; +use arrow_schema::DataType; +use divan::Bencher; +use rand::SeedableRng; +use rand::prelude::*; +use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_buffer::lane_ops_indexed::map_with_mask as indexed_map_with_mask; +use vortex_buffer::lane_ops_indexed::try_map_validity_filtered as indexed_try_map_validity_filtered; +use vortex_buffer::lane_ops_indexed::try_map_with_mask as indexed_try_map_with_mask; + +fn main() { + divan::main(); +} + +const SIZES: &[usize] = &[4_096, 65_536, 1_048_576]; +const VALID_RATE: f64 = 0.7; +const DATA_SEED: u64 = 0; +const VALID_SEED: u64 = 1; + +// Non-byte-aligned bit offset → forces BitChunks::iter() to shift across byte +// boundaries on every chunk it yields. +const SLICE_OFFSET: usize = 5; + +struct Fixture { + values: Buffer, + /// `offset() == 0`, underlying byte buffer starts on a byte boundary. + mask_aligned: BitBuffer, + /// Same validity bits but sliced so `offset() == SLICE_OFFSET`. 
+ mask_unaligned: BitBuffer, + arrow_arr: UInt64Array, + /// Same as `arrow_arr` but its NullBuffer has a non-byte-aligned bit offset, + /// constructed by building an oversized array and slicing. + arrow_arr_unaligned: UInt64Array, +} + +fn fixture(n: usize) -> Fixture { + let mut data_rng = StdRng::seed_from_u64(DATA_SEED); + let mut valid_rng = StdRng::seed_from_u64(VALID_SEED); + let raw_values: Vec = (0..n) + .map(|_| data_rng.random_range(0..u32::MAX as u64)) + .collect(); + let raw_valid: Vec = (0..n).map(|_| valid_rng.random_bool(VALID_RATE)).collect(); + + let values: Buffer = raw_values.iter().copied().collect(); + + let mask_aligned = { + let mut m = BitBufferMut::with_capacity(n); + for &v in &raw_valid { + m.append(v); + } + m.freeze() + }; + + let mask_unaligned = { + let mut m = BitBufferMut::with_capacity(n + SLICE_OFFSET); + for _ in 0..SLICE_OFFSET { + m.append(false); + } + for &v in &raw_valid { + m.append(v); + } + m.freeze().slice(SLICE_OFFSET..SLICE_OFFSET + n) + }; + debug_assert_eq!(mask_unaligned.offset(), SLICE_OFFSET); + debug_assert_eq!(mask_unaligned.len(), n); + + let arrow_arr = UInt64Array::new( + ScalarBuffer::from(raw_values.clone()), + Some(NullBuffer::from(raw_valid.clone())), + ); + + let arrow_arr_unaligned = { + let mut padded_values: Vec = vec![0; SLICE_OFFSET]; + padded_values.extend_from_slice(&raw_values); + let mut padded_valid: Vec = vec![false; SLICE_OFFSET]; + padded_valid.extend_from_slice(&raw_valid); + let oversized = UInt64Array::new( + ScalarBuffer::from(padded_values), + Some(NullBuffer::from(padded_valid)), + ); + use arrow_array::Array; + let sliced = oversized.slice(SLICE_OFFSET, n); + debug_assert_eq!( + sliced.nulls().map(|n| n.offset()).unwrap_or(0) % 8, + SLICE_OFFSET + ); + sliced + }; + + Fixture { + values, + mask_aligned, + mask_unaligned, + arrow_arr, + arrow_arr_unaligned, + } +} + +const CAST_OPTS: CastOptions<'static> = CastOptions { + safe: true, + format_options: 
arrow_cast::display::FormatOptions::new(), +}; + +const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions { + safe: false, + format_options: arrow_cast::display::FormatOptions::new(), +}; + +#[divan::bench(args = SIZES)] +fn arrow_cast(bencher: Bencher, n: usize) { + let _ = n; + let f = fixture(n); + bencher + .with_inputs(|| f.arrow_arr.clone()) + .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap()); +} + +#[divan::bench(args = SIZES)] +fn arrow_cast_unaligned(bencher: Bencher, n: usize) { + let _ = n; + let f = fixture(n); + bencher + .with_inputs(|| f.arrow_arr_unaligned.clone()) + .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap()); +} + +#[divan::bench(args = SIZES)] +fn iter_zip_checked(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| (f.values.clone(), f.mask_aligned.clone())) + .bench_refs(|(values, mask)| { + let buf: Buffer = BufferMut::try_from_trusted_len_iter( + values.iter().zip(mask.iter()).map(|(&v, valid)| { + let scaled = v * valid as u64; + if scaled <= u32::MAX as u64 { + Ok(scaled as u32) + } else { + Err(()) + } + }), + ) + .unwrap() + .freeze(); + buf + }); +} + +#[divan::bench(args = SIZES)] +fn iter_zip_checked_unaligned(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| (f.values.clone(), f.mask_unaligned.clone())) + .bench_refs(|(values, mask)| { + let buf: Buffer = BufferMut::try_from_trusted_len_iter( + values.iter().zip(mask.iter()).map(|(&v, valid)| { + let scaled = v * valid as u64; + if scaled <= u32::MAX as u64 { + Ok(scaled as u32) + } else { + Err(()) + } + }), + ) + .unwrap() + .freeze(); + buf + }); +} + +#[divan::bench(args = SIZES)] +fn arrow_cast_checked(bencher: Bencher, n: usize) { + let _ = n; + let f = fixture(n); + bencher + .with_inputs(|| f.arrow_arr.clone()) + .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap()); +} + +#[divan::bench(args = SIZES)] +fn 
arrow_cast_checked_unaligned(bencher: Bencher, n: usize) { + let _ = n; + let f = fixture(n); + bencher + .with_inputs(|| f.arrow_arr_unaligned.clone()) + .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap()); +} + +// ----------------------------------------------------------------------------- +// Isolation benches: drop the mask, isolate the cast u64 -> u32 to see whether +// the iterator cost is intrinsic or comes from the surrounding kernel structure. +// ----------------------------------------------------------------------------- + +/// Plain slice indexing, no mask. Upper bound on what the iter variants must beat. +#[divan::bench(args = SIZES)] +fn iso_slice_cast(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + unsafe { out.set_len(n) }; + (f.values.clone(), out) + }) + .bench_refs(|(values, out)| { + let v = values.as_slice(); + let o = out.as_mut_slice(); + assert_eq!(v.len(), o.len()); + for i in 0..v.len() { + // SAFETY: bounds checked by the assert above. + unsafe { o.get_unchecked_mut(i).write(*v.get_unchecked(i) as u32) }; + } + }); +} + +/// Per-lane iterator zip, no mask. Tests whether `slice::Iter::next` autovectorizes +/// when nothing else is in the way. +#[divan::bench(args = SIZES)] +fn iso_iter_cast(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + unsafe { out.set_len(n) }; + (f.values.clone(), out) + }) + .bench_refs(|(values, out)| { + for (slot, &v) in out.iter_mut().zip(values.iter()) { + slot.write(v as u32); + } + }); +} + +/// `chunks_exact(64)` + `try_into::<&[u64; 64]>` so the outer iter advances once per +/// 64 lanes and the inner loop indexes a fixed-size array. Tests whether moving the +/// iterator state from per-lane to per-chunk fixes vectorization. 
+#[divan::bench(args = SIZES)] +fn iso_iter_chunks_64(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + unsafe { out.set_len(n) }; + (f.values.clone(), out) + }) + .bench_refs(|(values, out)| { + let v = values.as_slice(); + let o = out.as_mut_slice(); + assert_eq!(v.len(), o.len()); + for (v_chunk, o_chunk) in v.chunks_exact(64).zip(o.chunks_exact_mut(64)) { + let v_arr: &[u64; 64] = v_chunk.try_into().unwrap(); + let o_arr: &mut [MaybeUninit; 64] = o_chunk.try_into().unwrap(); + for bit_idx in 0..64 { + o_arr[bit_idx].write(v_arr[bit_idx] as u32); + } + } + // Ignore the tail — SIZES are all multiples of 64. + }); +} + +// ----------------------------------------------------------------------------- +// Indexed-source variant (lane_ops_indexed). The kernel takes an `IndexedSource` whose +// `&[T]` impl is `unsafe fn get_unchecked(i) -> T` — same indexed load as the slice +// kernel, but the trait also supports binary inputs via `LaneZip`. 
+// ----------------------------------------------------------------------------- + +#[divan::bench(args = SIZES)] +fn indexed_kernel_map_with_mask(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_aligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + indexed_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + (v * valid as u64) as u32 + }); + }); +} + +#[divan::bench(args = SIZES)] +fn indexed_kernel_map_with_mask_unaligned(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_unaligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + indexed_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + (v * valid as u64) as u32 + }); + }); +} + +#[divan::bench(args = SIZES)] +fn indexed_kernel_try_map_with_mask(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_aligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + indexed_try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }) + .unwrap(); + }); +} + +#[divan::bench(args = SIZES)] +fn indexed_kernel_try_map_with_mask_unaligned(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_unaligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + indexed_try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) 
+ }) + .unwrap(); + }); +} + +#[divan::bench(args = SIZES)] +fn indexed_kernel_try_from_branchful(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_aligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + indexed_try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + if valid { + u32::try_from(v).ok() + } else { + Some(0_u32) + } + }) + .unwrap(); + }); +} + +// ----------------------------------------------------------------------------- +// Decoupled-design variant with CORRECT validity semantics: closure is `|v|` +// (no per-lane mask threading), but the mask filters out null-lane failures at +// the chunk boundary. A null row whose stored value would overflow does NOT +// cause Err — this matches the existing `try_map_with_mask` semantics while +// keeping the lighter inner loop. +// ----------------------------------------------------------------------------- + +#[divan::bench(args = SIZES)] +fn indexed_decoupled_kernel_try_map_with_mask(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + // SAFETY: every lane is written before any read inside the kernel. 
+ unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_aligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + indexed_try_map_validity_filtered(values.as_slice(), mask, out.as_mut_slice(), |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }) + .unwrap(); + }); +} + +#[divan::bench(args = SIZES)] +fn indexed_decoupled_kernel_try_from_branchful(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_aligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + indexed_try_map_validity_filtered(values.as_slice(), mask, out.as_mut_slice(), |v| { + u32::try_from(v).ok() + }) + .unwrap(); + }); +} + +/// Full checked-cast kernel using `chunks_exact(64)` + fixed-size array refs, with +/// the mask. If this matches the slice kernel, the cost is in the per-lane iterator +/// state, not the iter pattern in general. +#[divan::bench(args = SIZES)] +fn kernel_iter_chunks_64(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + unsafe { out.set_len(n) }; + (f.values.clone(), f.mask_aligned.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + let v = values.as_slice(); + let o = out.as_mut_slice(); + let len = v.len(); + assert_eq!(len, mask.len()); + assert_eq!(len, o.len()); + + let chunks = mask.chunks(); + let chunks_count = len / 64; + let full = chunks_count * 64; + let (v_full, _v_rem) = v.split_at(full); + let (o_full, _o_rem) = o.split_at_mut(full); + + for ((v_chunk, o_chunk), src_chunk) in v_full + .chunks_exact(64) + .zip(o_full.chunks_exact_mut(64)) + .zip(chunks.iter()) + { + let v_arr: &[u64; 64] = v_chunk.try_into().unwrap(); + let o_arr: &mut [MaybeUninit; 64] = o_chunk.try_into().unwrap(); + let mut fail_acc: u64 = 0; + for bit_idx in 0..64 { + let bit = (src_chunk >> bit_idx) & 1 == 1; + let scaled = v_arr[bit_idx] * bit as u64; + let 
opt = (scaled <= u32::MAX as u64).then_some(scaled as u32); + fail_acc |= opt.is_none() as u64; + o_arr[bit_idx].write(opt.unwrap_or_default()); + } + assert_eq!(fail_acc, 0); + } + // Ignore the tail — SIZES are all multiples of 64. + }); +} diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs new file mode 100644 index 00000000000..c83114d8bcd --- /dev/null +++ b/vortex-buffer/src/lane_ops_indexed.rs @@ -0,0 +1,1261 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Indexed-source variant of [`crate::lane_ops`]. +//! +//! Replaces `&[T]` with an [`IndexedSource`] trait: each lane read is +//! `unsafe fn get_unchecked(i) -> Item`, independent across iterations. For `&[T]` +//! this inlines to the same indexed load as the slice kernel; for `LaneZip(&[A], &[B])` +//! it gives two independent indexed reads per lane — both shapes the auto-vectorizer +//! handles. +//! +//! See `vortex-buffer/HISTORY.md` for the iterator-API investigation that motivated +//! this design. +//! +//! The output is always a caller-provided `&mut` slice — these kernels never allocate. +//! Both kernels handle a mask with a non-byte-aligned offset and with a logical `len` +//! shorter than the underlying byte buffer, via [`BitBuffer::chunks`]. + +use std::mem::MaybeUninit; + +use crate::BitBuffer; + +/// A length-known source supporting unchecked indexed reads. +/// +/// Implemented for `&[T]` and `&mut [T]` (with `T: Copy`) and for [`LaneZip`] over two +/// `IndexedSource`s. The kernels in this module require this trait instead of `Iterator` +/// so that lane reads carry no inter-iteration data dependency — the autovectorizer +/// treats each lane independently. +pub trait IndexedSource { + /// The per-lane item type. Must be `Copy` so the kernels can pass it through + /// the closure by value without extra moves. + type Item: Copy; + /// Logical lane count. 
+ fn len(&self) -> usize; + /// Returns true when there are no lanes. + fn is_empty(&self) -> bool { + self.len() == 0 + } + /// Read the lane at `i` without bounds checking. + /// + /// # Safety + /// + /// `i` must be strictly less than `self.len()`. + unsafe fn get_unchecked(&self, i: usize) -> Self::Item; +} + +impl IndexedSource for &[T] { + type Item = T; + #[inline] + fn len(&self) -> usize { + <[T]>::len(self) + } + #[inline] + unsafe fn get_unchecked(&self, i: usize) -> T { + // SAFETY: caller guarantees i < self.len(). + unsafe { *<[T]>::get_unchecked(self, i) } + } +} + +impl IndexedSource for &mut [T] { + type Item = T; + #[inline] + fn len(&self) -> usize { + <[T]>::len(self) + } + #[inline] + unsafe fn get_unchecked(&self, i: usize) -> T { + // SAFETY: caller guarantees i < self.len(). + unsafe { *<[T]>::get_unchecked(self, i) } + } +} + +/// An [`IndexedSource`] that also supports unchecked indexed writes — the binding +/// for in-place kernels. +/// +/// Implemented for `&mut [T]`; not implemented for [`LaneZip`] (you can't write a +/// `(A, B)` pair back to two separate sources via a single index). +pub trait IndexedSink: IndexedSource { + /// Write `value` into lane `i` without bounds checking. + /// + /// # Safety + /// + /// `i` must be strictly less than `self.len()`. + unsafe fn set_unchecked(&mut self, i: usize, value: Self::Item); +} + +impl IndexedSink for &mut [T] { + #[inline] + unsafe fn set_unchecked(&mut self, i: usize, value: T) { + // SAFETY: caller guarantees i < self.len(). + unsafe { *<[T]>::get_unchecked_mut(self, i) = value }; + } +} + +/// Pair of two [`IndexedSource`]s of equal length. Yields `(A::Item, B::Item)` per lane. +/// +/// Use this to drive a binary kernel from two columns. Build it with [`LaneZip::new`], +/// which checks length equality; the public tuple fields do not. +pub struct LaneZip(pub A, pub B); + +impl LaneZip { + /// Build a `LaneZip` from two equal-length sources. + /// + /// # Panics + /// + /// Panics if the two operands have different lengths. 
+ pub fn new(a: A, b: B) -> Self { + assert_eq!(a.len(), b.len(), "LaneZip operands must have the same length"); + Self(a, b) + } +} + +impl IndexedSource for LaneZip { + type Item = (A::Item, B::Item); + #[inline] + fn len(&self) -> usize { + debug_assert_eq!(self.0.len(), self.1.len()); + self.0.len() + } + #[inline] + unsafe fn get_unchecked(&self, i: usize) -> (A::Item, B::Item) { + // SAFETY: caller guarantees i < self.len(); `new` enforces matching lengths. + unsafe { + ( + self.0.get_unchecked(i), + self.1.get_unchecked(i), + ) + } + } +} + +/// Apply `f(value, valid)` lane-by-lane, writing `out[i] = f(values[i], mask[i])`. +/// +/// All three inputs must have the same length. The output type `R` may differ from the +/// input type `T` — this kernel is the building block for both same-type transforms +/// (fill_null) and cross-type ones (cast). The caller is responsible for marking `out` +/// initialized (e.g. by calling `BufferMut::set_len` after this returns). +/// +/// # Panics +/// +/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`. +#[inline] +pub fn map_with_mask( + values: S, + mask: &BitBuffer, + out: &mut [MaybeUninit], + mut f: F, +) where + S: IndexedSource, + F: FnMut(S::Item, bool) -> R, +{ + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + assert_eq!(out.len(), len, "out must have the same length as values"); + + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + let base = chunk_idx * 64; + // Inner loop is fixed-size 64 with independent per-lane reads — no iterator + // state, no cross-iteration dependency, so the auto-vectorizer can fuse + // 64 indexed loads into vector loads. + for bit_idx in 0..64 { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < chunks_count * 64 <= len. 
+ let v = unsafe { values.get_unchecked(i) }; + unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; + } + } + + if remainder != 0 { + let src_chunk = chunks.remainder_bits(); + let base = chunks_count * 64; + for bit_idx in 0..remainder { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < len. + let v = unsafe { values.get_unchecked(i) }; + unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; + } + } +} + +/// Fallible variant of [`map_with_mask`]. `f` returns `Option`; `None` indicates a +/// per-lane failure (e.g. range overflow on a narrowing cast). +/// +/// The kernel does not short-circuit on the first failure inside a chunk: it processes +/// whole 64-lane chunks with `is_none()` flags OR-reduced into a single accumulator, +/// then checks after each chunk. On failure, a cold scalar attribution pass replays the +/// closure over that chunk to identify the first failing lane. The hot loop stays +/// autovectorizable — the per-lane cost is one OR on top of the cast. +/// +/// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` write +/// `R::default()` into `out`, but the contents of `out` must not be relied upon when +/// this function returns `Err`. +/// +/// # Panics +/// +/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`. +#[inline] +pub fn try_map_with_mask( + values: S, + mask: &BitBuffer, + out: &mut [MaybeUninit], + mut f: F, +) -> Result<(), usize> +where + S: IndexedSource, + R: Copy + Default, + F: FnMut(S::Item, bool) -> Option, +{ + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + assert_eq!(out.len(), len, "out must have the same length as values"); + + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + let base = chunk_idx * 64; + // Per-chunk accumulator — does not escape the SIMD inner loop. 
+ let mut fail_acc: u64 = 0; + for bit_idx in 0..64 { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < chunks_count * 64 <= len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v, bit); + fail_acc |= opt.is_none() as u64; + let r = opt.unwrap_or_default(); + // SAFETY: i < len. + unsafe { out.get_unchecked_mut(i).write(r) }; + } + if fail_acc != 0 { + return Err(attribute_failure(&values, src_chunk, base, 64, &mut f)); + } + } + + if remainder != 0 { + let src_chunk = chunks.remainder_bits(); + let base = chunks_count * 64; + let mut fail_acc: u64 = 0; + for bit_idx in 0..remainder { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v, bit); + fail_acc |= opt.is_none() as u64; + let r = opt.unwrap_or_default(); + // SAFETY: i < len. + unsafe { out.get_unchecked_mut(i).write(r) }; + } + if fail_acc != 0 { + return Err(attribute_failure( + &values, src_chunk, base, remainder, &mut f, + )); + } + } + + Ok(()) +} + +/// Apply `f(value)` lane-by-lane with **no validity awareness at all** — every +/// closure invocation is treated as "happened", regardless of whether the lane +/// is null. Use this only when the input is known non-nullable. +/// +/// For nullable inputs where the closure is infallible (no overflow / no error +/// branch), prefer [`map_with_mask`]; for nullable inputs with a fallible +/// closure, prefer [`try_map_validity_filtered`] — both correctly suppress +/// null-lane logic. This kernel exists for the narrow "no validity exists" +/// case (non-nullable column, internal pipelines, etc.). +/// +/// # Panics +/// +/// Panics if `out.len() != values.len()`. 
+#[inline] +pub fn map_no_validity(values: S, out: &mut [MaybeUninit], mut f: F) +where + S: IndexedSource, + F: FnMut(S::Item) -> R, +{ + let len = values.len(); + assert_eq!(out.len(), len, "out must have the same length as values"); + + let chunks_count = len / 64; + let remainder = len % 64; + + for chunk_idx in 0..chunks_count { + let base = chunk_idx * 64; + for bit_idx in 0..64 { + let i = base + bit_idx; + // SAFETY: i < chunks_count * 64 <= len. + let v = unsafe { values.get_unchecked(i) }; + unsafe { out.get_unchecked_mut(i).write(f(v)) }; + } + } + + if remainder != 0 { + let base = chunks_count * 64; + for bit_idx in 0..remainder { + let i = base + bit_idx; + // SAFETY: i < len. + let v = unsafe { values.get_unchecked(i) }; + unsafe { out.get_unchecked_mut(i).write(f(v)) }; + } + } +} + +/// Fallible map with **no validity awareness at all** — every `None` returned +/// by the closure is treated as a failure, even at null lanes. +/// +/// # Use this only for non-nullable inputs. +/// +/// For nullable inputs with a fallible closure, use +/// [`try_map_validity_filtered`] — it has the same value-only closure shape +/// (and the same perf win) but **correctly suppresses null-lane failures** +/// via per-chunk `fail_bits & mask_chunk`. +/// +/// Using this kernel on a nullable input where a null lane's stored value +/// would cause `f` to return `None` will produce a spurious `Err`. This is a +/// correctness footgun on purpose — the name and this doc are how the API +/// signals "you must know your input has no nulls." +/// +/// On failure returns `Err(failing_lane_index)`. +/// +/// # Panics +/// +/// Panics if `out.len() != values.len()`. 
+#[inline] +pub fn try_map_no_validity( + values: S, + out: &mut [MaybeUninit], + mut f: F, +) -> Result<(), usize> +where + S: IndexedSource, + R: Copy + Default, + F: FnMut(S::Item) -> Option, +{ + let len = values.len(); + assert_eq!(out.len(), len, "out must have the same length as values"); + + let chunks_count = len / 64; + let remainder = len % 64; + + for chunk_idx in 0..chunks_count { + let base = chunk_idx * 64; + let mut fail_acc: u64 = 0; + for bit_idx in 0..64 { + let i = base + bit_idx; + // SAFETY: i < chunks_count * 64 <= len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v); + fail_acc |= opt.is_none() as u64; + let r = opt.unwrap_or_default(); + // SAFETY: i < len. + unsafe { out.get_unchecked_mut(i).write(r) }; + } + if fail_acc != 0 { + return Err(attribute_failure_no_mask(&values, base, 64, &mut f)); + } + } + + if remainder != 0 { + let base = chunks_count * 64; + let mut fail_acc: u64 = 0; + for bit_idx in 0..remainder { + let i = base + bit_idx; + // SAFETY: i < len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v); + fail_acc |= opt.is_none() as u64; + let r = opt.unwrap_or_default(); + // SAFETY: i < len. + unsafe { out.get_unchecked_mut(i).write(r) }; + } + if fail_acc != 0 { + return Err(attribute_failure_no_mask(&values, base, remainder, &mut f)); + } + } + + Ok(()) +} + +/// Fallible value-only map with **chunk-level validity filtering**: closure is +/// `|v| -> Option`, no validity threaded through the inner loop. After each +/// 64-lane chunk, per-lane failure bits are ANDed against the mask chunk, so +/// failures at null lanes do **not** propagate as `Err`. +/// +/// This is the correct shape for "checked cast that respects validity" — a null +/// row whose stored value would overflow does **not** cause `Err`. It also +/// preserves the perf win of the value-only closure: the hot loop has no per-lane +/// mask extract, no `valid`-dependent branch. 
+/// +/// ## Inner-loop trick +/// +/// Per-lane fails are packed into a `u64` via `fail_bits |= (is_none as u64) << bit_idx`. +/// The shift amount is loop-invariant after unrolling (since `bit_idx` is the +/// compile-time loop counter), so the autovectorizer can issue 64 sequential +/// value reads + closure applications + packed-bit ORs as a vector pipeline. +/// +/// ## Attribution +/// +/// On failure, `valid_failures = fail_bits & mask_chunk` is non-zero; the lowest +/// set bit is the first failing valid lane. `trailing_zeros()` reads it out +/// directly — no cold replay path, no second pass. +/// +/// # Panics +/// +/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`. +#[inline] +pub fn try_map_validity_filtered( + values: S, + mask: &BitBuffer, + out: &mut [MaybeUninit], + mut f: F, +) -> Result<(), usize> +where + S: IndexedSource, + R: Copy + Default, + F: FnMut(S::Item) -> Option, +{ + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + assert_eq!(out.len(), len, "out must have the same length as values"); + + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, mask_chunk) in chunks.iter().enumerate() { + let base = chunk_idx * 64; + let mut fail_bits: u64 = 0; + for bit_idx in 0..64 { + let i = base + bit_idx; + // SAFETY: i < chunks_count * 64 <= len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v); + // Pack failure bit at the lane's position. After unrolling, `bit_idx` + // is a compile-time constant per-iteration, so the shift is folded. + fail_bits |= (opt.is_none() as u64) << bit_idx; + let r = opt.unwrap_or_default(); + // SAFETY: i < len. + unsafe { out.get_unchecked_mut(i).write(r) }; + } + // Filter failures to those at VALID lanes only. Null-lane failures vanish. 
+ let valid_failures = fail_bits & mask_chunk; + if valid_failures != 0 { + return Err(base + valid_failures.trailing_zeros() as usize); + } + } + + if remainder != 0 { + let mask_chunk = chunks.remainder_bits(); + let base = chunks_count * 64; + let mut fail_bits: u64 = 0; + for bit_idx in 0..remainder { + let i = base + bit_idx; + // SAFETY: i < len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v); + fail_bits |= (opt.is_none() as u64) << bit_idx; + let r = opt.unwrap_or_default(); + // SAFETY: i < len. + unsafe { out.get_unchecked_mut(i).write(r) }; + } + let valid_failures = fail_bits & mask_chunk; + if valid_failures != 0 { + return Err(base + valid_failures.trailing_zeros() as usize); + } + } + + Ok(()) +} + +/// Cold attribution for the no-mask variant. +#[cold] +#[inline(never)] +fn attribute_failure_no_mask( + values: &S, + base: usize, + chunk_len: usize, + f: &mut F, +) -> usize +where + S: IndexedSource, + F: FnMut(S::Item) -> Option, +{ + for bit_idx in 0..chunk_len { + let i = base + bit_idx; + // SAFETY: caller guarantees i < values.len(). + let v = unsafe { values.get_unchecked(i) }; + if f(v).is_none() { + return i; + } + } + unreachable!("attribute_failure_no_mask called without a failing lane") +} + +/// Cold path: identify the first lane in a chunk where `f` returned `None`. +/// +/// Called only after the hot loop has detected that at least one lane failed. +/// Walks the chunk scalar-style; not autovectorized, but that's fine — it only +/// runs once per error and the error path is supposed to be exceptional. +#[cold] +#[inline(never)] +fn attribute_failure( + values: &S, + src_chunk: u64, + base: usize, + chunk_len: usize, + f: &mut F, +) -> usize +where + S: IndexedSource, + F: FnMut(S::Item, bool) -> Option, +{ + for bit_idx in 0..chunk_len { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees base + chunk_len <= values.len(). 
+ let v = unsafe { values.get_unchecked(i) }; + if f(v, bit).is_none() { + return i; + } + } + // Unreachable: hot loop's OR-reduction said at least one lane in [base, base+chunk_len) failed. + unreachable!("attribute_failure called without a failing lane") +} + +/// In-place variant of [`map_with_mask`]. Each lane is replaced with +/// `f(values[i], mask[i])`. The source `S` must be writable (an [`IndexedSink`]). +/// +/// # Panics +/// +/// Panics if `values.len() != mask.len()`. +#[inline] +pub fn map_with_mask_in_place(mut values: S, mask: &BitBuffer, mut f: F) +where + S: IndexedSink, + F: FnMut(S::Item, bool) -> S::Item, +{ + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + let base = chunk_idx * 64; + for bit_idx in 0..64 { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < chunks_count * 64 <= len. + let v = unsafe { values.get_unchecked(i) }; + let r = f(v, bit); + // SAFETY: i < len. + unsafe { values.set_unchecked(i, r) }; + } + } + + if remainder != 0 { + let src_chunk = chunks.remainder_bits(); + let base = chunks_count * 64; + for bit_idx in 0..remainder { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < len. + let v = unsafe { values.get_unchecked(i) }; + let r = f(v, bit); + // SAFETY: i < len. + unsafe { values.set_unchecked(i, r) }; + } + } +} + +/// In-place variant of [`try_map_with_mask`]. Each lane of `values` is replaced +/// with `f(values[i], mask[i])`, or `S::Item::default()` if `f` returned `None`. +/// On failure returns `Err(first_failing_lane)`; lanes before that point have been +/// written, and lanes within the failing chunk hold their unwrapped-or-default +/// result. The buffer state on `Err` is intentionally unspecified. 
+/// +/// ## Error attribution +/// +/// Per-lane `is_none()` flags are folded into `first_fail` via a branchless +/// `min` of `(if is_none { i as u32 } else { u32::MAX })`. After the 64-lane +/// loop, `first_fail` holds the smallest failing index in the chunk (or `MAX` +/// if no failure). Vectorizes to NEON `bsl.16b` + `umin.4s` on AArch64. The +/// cold replay scheme used by [`try_map_with_mask`] isn't viable here because +/// the original input values have already been overwritten by the time we +/// would attribute the failure. +/// +/// ## Why in-place is slower at cache-resident sizes +/// +/// At sizes that fit in L1/L2 the in-place kernel is ~1.5× slower than the +/// out-of-place kernel despite having half the memory traffic, because input +/// and output share memory and the compiler must be conservative reordering +/// loads/stores across iterations. At sizes that exceed L2 the in-place kernel +/// wins back the gap by avoiding the second buffer's DRAM read+write traffic. +/// +/// # Panics +/// +/// Panics if `values.len() != mask.len()`. +#[inline] +pub fn try_map_with_mask_in_place( + mut values: S, + mask: &BitBuffer, + mut f: F, +) -> Result<(), usize> +where + S: IndexedSink, + S::Item: Default, + F: FnMut(S::Item, bool) -> Option, +{ + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + let base = chunk_idx * 64; + let mut first_fail: u32 = u32::MAX; + for bit_idx in 0..64 { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < chunks_count * 64 <= len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v, bit); + let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; + first_fail = first_fail.min(candidate); + let r = opt.unwrap_or_default(); + // SAFETY: i < len. 
+ unsafe { values.set_unchecked(i, r) }; + } + if first_fail != u32::MAX { + return Err(first_fail as usize); + } + } + + if remainder != 0 { + let src_chunk = chunks.remainder_bits(); + let base = chunks_count * 64; + let mut first_fail: u32 = u32::MAX; + for bit_idx in 0..remainder { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v, bit); + let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; + first_fail = first_fail.min(candidate); + let r = opt.unwrap_or_default(); + // SAFETY: i < len. + unsafe { values.set_unchecked(i, r) }; + } + if first_fail != u32::MAX { + return Err(first_fail as usize); + } + } + + Ok(()) +} + +/// Apply `f(value) -> bool` lane-by-lane, packing into `out` as `u64` words. +/// +/// This is the validity-free sibling of [`map_with_mask_to_bits`]. Use it when the +/// predicate is a pure function of the value (e.g. compare-to-constant on a primitive +/// buffer) and combine the validity bitmap in a separate pass — splitting the work +/// this way lets the value-compare loop autovectorize cleanly. +/// +/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word +/// beyond `len % 64` are written as `0`. +/// +/// # Panics +/// +/// Panics if `out.len() != values.len().div_ceil(64)`. +#[inline] +pub fn map_to_bits(values: S, out: &mut [u64], mut f: F) +where + S: IndexedSource, + F: FnMut(S::Item) -> bool, +{ + let len = values.len(); + assert_eq!( + out.len(), + len.div_ceil(64), + "out must have len.div_ceil(64) words", + ); + + let chunks_count = len / 64; + let remainder = len % 64; + + for chunk_idx in 0..chunks_count { + let base = chunk_idx * 64; + let mut packed = 0u64; + for bit_idx in 0..64 { + // SAFETY: base + bit_idx < chunks_count * 64 <= len. 
+ let v = unsafe { values.get_unchecked(base + bit_idx) }; + packed |= (f(v) as u64) << bit_idx; + } + // SAFETY: chunk_idx < chunks_count <= out.len(). + unsafe { *out.get_unchecked_mut(chunk_idx) = packed }; + } + + if remainder != 0 { + let base = chunks_count * 64; + let mut packed = 0u64; + for bit_idx in 0..remainder { + // SAFETY: base + bit_idx < len. + let v = unsafe { values.get_unchecked(base + bit_idx) }; + packed |= (f(v) as u64) << bit_idx; + } + // SAFETY: chunks_count < out.len() because remainder != 0. + unsafe { *out.get_unchecked_mut(chunks_count) = packed }; + } +} + +/// Apply `f(value, valid) -> bool` lane-by-lane, packing into `out` as `u64` words. +/// +/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word +/// beyond `len % 64` are written as `0`. +/// +/// # Panics +/// +/// Panics if `values.len() != mask.len()` or `out.len() != values.len().div_ceil(64)`. +#[inline] +pub fn map_with_mask_to_bits(values: S, mask: &BitBuffer, out: &mut [u64], mut f: F) +where + S: IndexedSource, + F: FnMut(S::Item, bool) -> bool, +{ + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + assert_eq!( + out.len(), + len.div_ceil(64), + "out must have len.div_ceil(64) words", + ); + + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + let base = chunk_idx * 64; + let mut packed = 0u64; + for bit_idx in 0..64 { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < chunks_count * 64 <= len. + let v = unsafe { values.get_unchecked(i) }; + packed |= (f(v, bit) as u64) << bit_idx; + } + // SAFETY: chunk_idx < chunks_count <= out.len(). 
/// Test helper: converts a fully written `Vec<MaybeUninit<T>>` into a `Vec<T>`.
///
/// The conversion goes element by element through `assume_init` rather than
/// transmuting the `Vec` itself, because a transmute between two `Vec`
/// instantiations depends on `Vec`'s unspecified field layout.
///
/// Callers must have initialized every element of `out`.
fn write_t<T>(out: Vec<MaybeUninit<T>>) -> Vec<T> {
    out.into_iter()
        // SAFETY: tests always fully initialize the buffer before calling this.
        .map(|lane| unsafe { lane.assume_init() })
        .collect()
}
+ let big = BitBuffer::new_set(128); + let sliced = big.slice(5..70); // logical len = 65, offset = 5 + assert_eq!(sliced.len(), 65); + assert_eq!(sliced.offset(), 5); + + let values: Vec = (0..65).collect(); + let mut out = vec![MaybeUninit::::uninit(); 65]; + map_with_mask( + values.as_slice(), + &sliced, + &mut out, + |v, valid| if valid { v } else { u32::MAX }, + ); + let got = write_t(out); + assert_eq!(got, (0..65).collect::>()); + } + + #[test] + fn map_with_mask_offset_past_word() { + // Slicing past a full word still works. `BitBuffer::slice` normalizes the + // logical offset to `offset % 8` and bumps the underlying byte pointer, + // so `offset()` won't equal 70 here — what we exercise is that the kernel + // walks the chunked u64 view (which BitChunks handles internally). + let big = BitBuffer::new_set(256); + let sliced = big.slice(70..200); + assert_eq!(sliced.len(), 130); + + let values: Vec = (0..130).map(|i| i as i16).collect(); + let mut out = vec![MaybeUninit::::uninit(); 130]; + map_with_mask( + values.as_slice(), + &sliced, + &mut out, + |v, valid| if valid { v } else { -1 }, + ); + let got = write_t(out); + assert_eq!(got, (0..130).map(|i| i as i16).collect::>()); + } + + #[test] + fn map_with_mask_empty() { + let values: Vec = vec![]; + let mask = BitBuffer::new_unset(0); + let mut out: Vec> = vec![]; + map_with_mask(values.as_slice(), &mask, &mut out, |v, _| v); + } + + #[test] + fn map_with_mask_null_to_zero_branchless() { + // The trick from primitive/compute/cast.rs:147 — multiply by valid as T. 
+ let values: Vec = (1..=100).collect(); + let mask = { + let mut m = BitBufferMut::with_capacity(100); + for i in 0..100 { + m.append(i % 3 != 0); + } + m.freeze() + }; + let mut out = vec![MaybeUninit::::uninit(); 100]; + map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| v * (valid as i64)); + let got = write_t(out); + for (i, &x) in got.iter().enumerate() { + if i % 3 == 0 { + assert_eq!(x, 0); + } else { + assert_eq!(x, (i + 1) as i64); + } + } + } + + #[test] + fn map_with_mask_to_bits_aligned() { + let values: Vec = (0..128).collect(); + let mask = BitBuffer::new_set(128); + let mut out = vec![0u64; 2]; + map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| valid && v % 2 == 0); + // Even numbers in [0, 128) set, odd unset. + for word_idx in 0..2 { + let word = out[word_idx]; + for bit in 0..64 { + let i = word_idx * 64 + bit; + let expected = i % 2 == 0; + assert_eq!((word >> bit) & 1 == 1, expected, "lane {i}"); + } + } + } + + #[test] + fn map_with_mask_to_bits_partial_chunk() { + // 130 lanes — three u64 words, last word has only 2 valid bits. + let values: Vec = (0..130).collect(); + let mask = BitBuffer::new_set(130); + let mut out = vec![0u64; 130usize.div_ceil(64)]; + assert_eq!(out.len(), 3); + map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| valid && v >= 64); + // Bits 64..128 set in word 1; bits 128..130 set in word 2. 
+ assert_eq!(out[0], 0); + assert_eq!(out[1], u64::MAX); + assert_eq!(out[2], 0b11); + } + + #[test] + fn map_with_mask_to_bits_offset() { + let big = BitBuffer::new_set(256); + let sliced = big.slice(13..143); // offset=13, len=130 + assert_eq!(sliced.len(), 130); + let values: Vec = (0..130).map(|i| (i % 4) as u8).collect(); + let mut out = vec![0u64; 130usize.div_ceil(64)]; + map_with_mask_to_bits(values.as_slice(),&sliced, &mut out, |v, valid| valid && v == 0); + for i in 0..130 { + let word = out[i / 64]; + let bit = (word >> (i % 64)) & 1 == 1; + assert_eq!(bit, i % 4 == 0, "lane {i}"); + } + } + + #[test] + fn try_map_with_mask_all_ok() { + let values: Vec = (0..200).collect(); + let mask = BitBuffer::new_set(200); + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert!(res.is_ok()); + let got = write_t(out); + assert_eq!(got, (0..200u32).collect::>()); + } + + #[test] + fn try_map_with_mask_overflow_fails() { + // Put an overflowing value at lane 137 — the kernel must report Err(137). + let mut values: Vec = (0..200).collect(); + values[137] = (u32::MAX as u64) + 1; + let mask = BitBuffer::new_set(200); + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert_eq!(res, Err(137)); + } + + #[test] + fn try_map_with_mask_overflow_reports_first_failing_lane() { + // Multiple failing lanes — must report the lowest index. 
+ let mut values: Vec = (0..200).collect(); + values[50] = u64::MAX; + values[51] = u64::MAX; + values[137] = u64::MAX; + let mask = BitBuffer::new_set(200); + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert_eq!(res, Err(50)); + } + + #[test] + fn try_map_validity_filtered_null_lane_overflow_does_not_err() { + // Null lane with a value that would overflow MUST NOT cause Err. + // The closure is value-only — the mask filters the null-lane failure + // at the chunk boundary. + let mut values: Vec = (0..200).collect(); + values[5] = u64::MAX; // null lane with overflowing value + values[42] = u64::MAX; // null lane with overflowing value + let mask = { + let mut m = BitBufferMut::with_capacity(200); + for i in 0..200 { + m.append(i != 5 && i != 42); + } + m.freeze() + }; + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = try_map_validity_filtered( + values.as_slice(), + &mask, + &mut out, + |v| (v <= u32::MAX as u64).then_some(v as u32), + ); + assert!(res.is_ok(), "null-lane overflow should not propagate as Err"); + } + + #[test] + fn try_map_validity_filtered_valid_overflow_does_err_with_first_index() { + // Valid lane overflow must propagate — and the reported index must be + // the lowest VALID failing lane, even if earlier null lanes also "failed" + // their unconditional cast. 
+ let mut values: Vec = (0..200).collect(); + values[5] = u64::MAX; // null lane — filtered out + values[42] = u64::MAX; // null lane — filtered out + values[77] = u64::MAX; // VALID lane — should be reported + values[100] = u64::MAX; // VALID lane — higher index, ignored + let mask = { + let mut m = BitBufferMut::with_capacity(200); + for i in 0..200 { + m.append(i != 5 && i != 42); + } + m.freeze() + }; + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = try_map_validity_filtered( + values.as_slice(), + &mask, + &mut out, + |v| (v <= u32::MAX as u64).then_some(v as u32), + ); + assert_eq!(res, Err(77)); + } + + #[test] + fn try_map_with_mask_null_lane_bypasses_check() { + // Null lanes are neutralized by `valid as u64` before the range check, so an + // out-of-range value at a null lane must NOT trigger failure. + let mut values: Vec = (0..200).collect(); + values[5] = u64::MAX; + let mask = { + let mut m = BitBufferMut::with_capacity(200); + for i in 0..200 { + m.append(i != 5); + } + m.freeze() + }; + let mut out = vec![MaybeUninit::::uninit(); 200]; + let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert!(res.is_ok()); + let got = write_t(out); + assert_eq!(got[5], 0); // null-lane wrote default + assert_eq!(got[6], 6); + } + + #[test] + fn try_map_with_mask_branchful_matches_branchless() { + let mut values: Vec = (0..130).map(|i| i as u64 * 7).collect(); + values[2] = u64::MAX; + values[65] = u32::MAX as u64; + let mask = { + let mut m = BitBufferMut::with_capacity(130); + for i in 0..130 { + m.append(!matches!(i, 2 | 17 | 99)); + } + m.freeze() + }; + + let mut branchless = vec![MaybeUninit::::uninit(); 130]; + let mut branchful = vec![MaybeUninit::::uninit(); 130]; + try_map_with_mask(values.as_slice(), &mask, &mut branchless, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as 
u32) + }) + .unwrap(); + try_map_with_mask(values.as_slice(), &mask, &mut branchful, |v, valid| { + if valid { + u32::try_from(v).ok() + } else { + Some(0) + } + }) + .unwrap(); + + assert_eq!(write_t(branchful), write_t(branchless)); + } + + #[test] + fn try_map_with_mask_partial_chunk() { + let values: Vec = (0..130).collect(); + let mask = BitBuffer::new_set(130); + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert!(res.is_ok()); + let got = write_t(out); + assert_eq!(got.len(), 130); + assert_eq!(got[129], 129); + } + + #[test] + fn try_map_with_mask_sliced_mask_unaligned_offset() { + // The mask's first byte is not word-aligned: slice off 13 bits, so the + // underlying BitChunks iterator must shift across byte boundaries on every + // 64-bit chunk it yields. + let big = BitBuffer::new_set(256); + let mask = big.slice(13..143); // logical len = 130, bit offset = 13 % 8 = 5 + assert_eq!(mask.len(), 130); + + let values: Vec = (0..130).collect(); + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert!(res.is_ok()); + let got = write_t(out); + assert_eq!(got, (0..130u32).collect::>()); + } + + #[test] + fn try_map_with_mask_sliced_mask_with_overflow() { + // Sliced mask + overflowing value — the cold attribution path must report + // the correct lane index in the sliced (post-offset) coordinate space. 
+ let big = BitBuffer::new_set(256); + let mask = big.slice(13..143); + assert_eq!(mask.len(), 130); + + let mut values: Vec = (0..130).collect(); + values[77] = u64::MAX; + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert_eq!(res, Err(77)); + } + + #[test] + fn try_map_with_mask_sliced_mask_null_lanes() { + // Mix sliced offset with a non-trivial validity pattern. Null lanes must + // not contribute to fail_acc, even when their underlying value would overflow. + let mut m = BitBufferMut::with_capacity(256); + for i in 0..256 { + m.append(i % 3 != 0); + } + let big = m.freeze(); + let mask = big.slice(13..143); + assert_eq!(mask.len(), 130); + + // After the 13-lane slice, original index `13 + j` becomes lane `j`. + // Lane `j` is valid iff `(13 + j) % 3 != 0`. + let mut values: Vec = (0..130).collect(); + // Pick a lane that is INVALID in the sliced coords: 13+2 = 15, 15 % 3 == 0 → invalid. + // Stuff in an overflowing value; it must be neutralized by `* valid as u64`. + values[2] = u64::MAX; + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert!(res.is_ok(), "null lane should bypass the range check"); + } + + #[test] + fn try_map_with_mask_overflow_in_remainder() { + // Overflow in the trailing partial chunk (not aligned to 64). 
+ let mut values: Vec = (0..130).collect(); + values[129] = (u32::MAX as u64) + 1; + let mask = BitBuffer::new_set(130); + let mut out = vec![MaybeUninit::::uninit(); 130]; + let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); + assert_eq!(res, Err(129)); + } + + #[test] + fn map_to_bits_aligned() { + let values: Vec = (0..128).collect(); + let mut out = vec![0u64; 2]; + map_to_bits(values.as_slice(), &mut out, |v| v % 2 == 0); + for word_idx in 0..2 { + for bit in 0..64 { + let i = word_idx * 64 + bit; + let expected = i % 2 == 0; + assert_eq!((out[word_idx] >> bit) & 1 == 1, expected, "lane {i}"); + } + } + } + + #[test] + fn map_to_bits_partial_chunk() { + let values: Vec = (0..130).collect(); + let mut out = vec![0u64; 130usize.div_ceil(64)]; + assert_eq!(out.len(), 3); + map_to_bits(values.as_slice(), &mut out, |v| v >= 64); + assert_eq!(out[0], 0); + assert_eq!(out[1], u64::MAX); + assert_eq!(out[2], 0b11); + } + + #[test] + fn map_to_bits_empty() { + let values: Vec = vec![]; + let mut out: Vec = vec![]; + map_to_bits(values.as_slice(), &mut out, |v| v > 0); + } + + #[test] + fn map_to_bits_matches_fused_with_all_valid_mask() { + // map_to_bits + AND with an all-true mask must equal map_with_mask_to_bits. + let values: Vec = (0..200).map(|i| i % 7).collect(); + let mask = BitBuffer::new_set(200); + + let mut a = vec![0u64; 200usize.div_ceil(64)]; + map_with_mask_to_bits(values.as_slice(), &mask, &mut a, |v, valid| valid && v == 3); + + let mut b = vec![0u64; 200usize.div_ceil(64)]; + map_to_bits(values.as_slice(), &mut b, |v| v == 3); + + assert_eq!(a, b); + } + + #[test] + fn map_with_mask_to_bits_validity_kills_lane() { + // Even if predicate is true, null lanes should produce false. 
+ let values: Vec = vec![1; 70]; + let mask = { + let mut m = BitBufferMut::with_capacity(70); + for i in 0..70 { + m.append(i >= 32); // first 32 lanes are null + } + m.freeze() + }; + let mut out = vec![0u64; 70usize.div_ceil(64)]; + map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| valid && v == 1); + for i in 0..70 { + let bit = (out[i / 64] >> (i % 64)) & 1 == 1; + assert_eq!(bit, i >= 32, "lane {i}"); + } + } +} diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs index 592762d7a26..a4519ac62ec 100644 --- a/vortex-buffer/src/lib.rs +++ b/vortex-buffer/src/lib.rs @@ -64,6 +64,12 @@ mod bytes; mod r#const; mod debug; mod lane_ops; +/// Indexed-source variant of [`lane_ops`]: takes an `IndexedSource` trait whose +/// implementations expose `unsafe fn get_unchecked(i) -> Item`. `&[T]` impls inline +/// to the same indexed load as the slice kernel, but the trait also admits binary +/// inputs via `LaneZip`. See `HISTORY.md` for the iterator-API investigation that +/// led to this design. 
+pub mod lane_ops_indexed; mod macros; #[cfg(feature = "memmap2")] mod memmap2; From 5cf469ab06f192eca051bca820fe0247775cd9fe Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 10:58:56 +0100 Subject: [PATCH 03/21] wip Signed-off-by: Joe Isaacs --- Cargo.lock | 4 - vortex-buffer/Cargo.toml | 22 - vortex-buffer/benches/cast_to.rs | 323 ---------- vortex-buffer/benches/cast_to_indexed.rs | 427 +------------- vortex-buffer/src/lane_ops.rs | 713 ----------------------- vortex-buffer/src/lane_ops_indexed.rs | 229 ++++++-- vortex-buffer/src/lib.rs | 7 - 7 files changed, 188 insertions(+), 1537 deletions(-) delete mode 100644 vortex-buffer/benches/cast_to.rs delete mode 100644 vortex-buffer/src/lane_ops.rs diff --git a/Cargo.lock b/Cargo.lock index 9bb032d0d35..d29c91edf62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9355,11 +9355,7 @@ dependencies = [ name = "vortex-buffer" version = "0.1.0" dependencies = [ - "arrow-arith", - "arrow-array", "arrow-buffer", - "arrow-cast", - "arrow-schema", "bitvec", "bytes", "codspeed-divan-compat", diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml index 42c882004bd..6490516f846 100644 --- a/vortex-buffer/Cargo.toml +++ b/vortex-buffer/Cargo.toml @@ -37,12 +37,6 @@ vortex-error = { workspace = true } workspace = true [dev-dependencies] -# TEMP: arrow-* are only used by the cast_to / add_checked benches for cross-impl -# performance comparisons. Drop them when the benches are removed. 
-arrow-arith = { workspace = true } -arrow-array = { workspace = true } -arrow-cast = { workspace = true } -arrow-schema = { workspace = true } divan = { workspace = true } num-traits = { workspace = true } rand = { workspace = true } @@ -56,22 +50,6 @@ harness = false name = "vortex_bitbuffer" harness = false -[[bench]] -name = "cast_to" -harness = false - [[bench]] name = "cast_to_indexed" harness = false - -[[bench]] -name = "cast_iter_all" -harness = false - -[[bench]] -name = "cast_in_place" -harness = false - -[[bench]] -name = "add_checked" -harness = false diff --git a/vortex-buffer/benches/cast_to.rs b/vortex-buffer/benches/cast_to.rs deleted file mode 100644 index c070f65d3a0..00000000000 --- a/vortex-buffer/benches/cast_to.rs +++ /dev/null @@ -1,323 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Cast `u64 -> u32` over a nullable column, three ways: -//! -//! 1. `kernel_map_with_mask` — uses `map_with_mask`. Writes truncated values into a -//! pre-allocated `&mut [MaybeUninit]`. Null lanes write `0` via the branchless -//! `v * valid as u64` trick, mirroring `primitive/compute/cast.rs:147`. -//! 2. `iter_zip` — `values.iter().zip(mask.iter())` collected through -//! `BufferMut::from_trusted_len_iter`. This is the shape the current Vortex cast uses. -//! 3. `arrow_cast` — `arrow_cast::cast` against a `UInt64Array`, allocating a new -//! `UInt32Array`. -//! -//! Plus two fallible variants that error on overflow: -//! -//! 4. `kernel_try_map_with_mask` — `try_map_with_mask` with `|v, valid| (v <= MAX).then_some(...)`. -//! Unconditional cast + parallel range check OR-reduced into a u64 fail accumulator. -//! 5. `iter_zip_checked` — `BufferMut::try_from_trusted_len_iter` returning Err on overflow. -//! 6. `arrow_cast_checked` — `arrow_cast::cast` with `safe = false` (errors on overflow). -//! -//! Inputs are bounded to fit in `u32`, so the fallible variants always succeed and we -//! 
measure the cost of the range check on the success path. - -#![expect(clippy::unwrap_used)] - -use std::mem::MaybeUninit; - -use arrow_array::UInt64Array; -use arrow_buffer::NullBuffer; -use arrow_buffer::ScalarBuffer; -use arrow_cast::CastOptions; -use arrow_cast::cast_with_options; -use arrow_schema::DataType; -use divan::Bencher; -use rand::SeedableRng; -use rand::prelude::*; -use vortex_buffer::BitBuffer; -use vortex_buffer::BitBufferMut; -use vortex_buffer::Buffer; -use vortex_buffer::BufferMut; -use vortex_buffer::map_with_mask; -use vortex_buffer::try_map_with_mask; - -fn main() { - divan::main(); -} - -const SIZES: &[usize] = &[4_096, 65_536, 1_048_576]; -const VALID_RATE: f64 = 0.7; -const DATA_SEED: u64 = 0; -const VALID_SEED: u64 = 1; - -// Non-byte-aligned bit offset → forces BitChunks::iter() to shift across byte -// boundaries on every chunk it yields. -const SLICE_OFFSET: usize = 5; - -struct Fixture { - values: Buffer, - /// `offset() == 0`, underlying byte buffer starts on a byte boundary. - mask_aligned: BitBuffer, - /// Same validity bits but sliced so `offset() == SLICE_OFFSET`. - mask_unaligned: BitBuffer, - arrow_arr: UInt64Array, - /// Same as `arrow_arr` but its NullBuffer has a non-byte-aligned bit offset, - /// constructed by building an oversized array and slicing. 
- arrow_arr_unaligned: UInt64Array, -} - -fn fixture(n: usize) -> Fixture { - let mut data_rng = StdRng::seed_from_u64(DATA_SEED); - let mut valid_rng = StdRng::seed_from_u64(VALID_SEED); - let raw_values: Vec = (0..n) - .map(|_| data_rng.random_range(0..u32::MAX as u64)) - .collect(); - let raw_valid: Vec = (0..n).map(|_| valid_rng.random_bool(VALID_RATE)).collect(); - - let values: Buffer = raw_values.iter().copied().collect(); - - let mask_aligned = { - let mut m = BitBufferMut::with_capacity(n); - for &v in &raw_valid { - m.append(v); - } - m.freeze() - }; - - // Build n + SLICE_OFFSET bits then slice off the leading SLICE_OFFSET, so the - // remaining `n` lanes carry the SAME validity pattern as the aligned mask. - let mask_unaligned = { - let mut m = BitBufferMut::with_capacity(n + SLICE_OFFSET); - for _ in 0..SLICE_OFFSET { - m.append(false); // filler — sliced away - } - for &v in &raw_valid { - m.append(v); - } - m.freeze().slice(SLICE_OFFSET..SLICE_OFFSET + n) - }; - debug_assert_eq!(mask_unaligned.offset(), SLICE_OFFSET); - debug_assert_eq!(mask_unaligned.len(), n); - - let arrow_arr = UInt64Array::new( - ScalarBuffer::from(raw_values.clone()), - Some(NullBuffer::from(raw_valid.clone())), - ); - - // Oversized array → slice off SLICE_OFFSET lanes so the resulting array's - // NullBuffer has `offset() == SLICE_OFFSET`. The remaining `n` lanes hold the - // same validity pattern as `arrow_arr`. 
- let arrow_arr_unaligned = { - let mut padded_values: Vec = vec![0; SLICE_OFFSET]; - padded_values.extend_from_slice(&raw_values); - let mut padded_valid: Vec = vec![false; SLICE_OFFSET]; - padded_valid.extend_from_slice(&raw_valid); - let oversized = UInt64Array::new( - ScalarBuffer::from(padded_values), - Some(NullBuffer::from(padded_valid)), - ); - use arrow_array::Array; - let sliced = oversized.slice(SLICE_OFFSET, n); - debug_assert_eq!( - sliced.nulls().map(|n| n.offset()).unwrap_or(0) % 8, - SLICE_OFFSET - ); - sliced - }; - - Fixture { - values, - mask_aligned, - mask_unaligned, - arrow_arr, - arrow_arr_unaligned, - } -} - -const CAST_OPTS: CastOptions<'static> = CastOptions { - safe: true, - format_options: arrow_cast::display::FormatOptions::new(), -}; - -const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions { - safe: false, - format_options: arrow_cast::display::FormatOptions::new(), -}; - -#[divan::bench(args = SIZES)] -fn kernel_map_with_mask(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - // Owned uninit-slot vector, sized once outside the timed region. - let mut out: Vec> = Vec::with_capacity(n); - // SAFETY: every lane is written before any read inside the kernel. 
- unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_aligned.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { - (v * valid as u64) as u32 - }); - }); -} - -#[divan::bench(args = SIZES)] -fn arrow_cast(bencher: Bencher, n: usize) { - let _ = n; - let f = fixture(n); - bencher - .with_inputs(|| f.arrow_arr.clone()) - .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap()); -} - -#[divan::bench(args = SIZES)] -fn arrow_cast_unaligned(bencher: Bencher, n: usize) { - let _ = n; - let f = fixture(n); - bencher - .with_inputs(|| f.arrow_arr_unaligned.clone()) - .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap()); -} - -#[divan::bench(args = SIZES)] -fn kernel_try_map_with_mask(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - // SAFETY: every lane is written before any read inside the kernel. - unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_aligned.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }) - .unwrap(); - }); -} - -/// Same kernel, but the mask has `offset() == 5` so `BitChunks::iter()` must shift -/// across byte boundaries on every chunk. Quantifies the cost of unaligned mask access. 
-#[divan::bench(args = SIZES)] -fn kernel_try_map_with_mask_unaligned(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_unaligned.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }) - .unwrap(); - }); -} - -/// Aligned-mask counterpart for `map_with_mask` (infallible). Pair with the -/// `_unaligned` variant below to isolate the mask-iteration cost from the closure. -#[divan::bench(args = SIZES)] -fn kernel_map_with_mask_unaligned(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_unaligned.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { - (v * valid as u64) as u32 - }); - }); -} - -/// As above but with the branchful idiomatic form. Tests whether autovectorization -/// survives a per-lane `if valid { ... } else { ... }` shape. 
-#[divan::bench(args = SIZES)] -fn kernel_try_from_branchful(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_aligned.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { - if valid { - u32::try_from(v).ok() - } else { - Some(0_u32) - } - }) - .unwrap(); - }); -} - -#[divan::bench(args = SIZES)] -fn iter_zip_checked(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| (f.values.clone(), f.mask_aligned.clone())) - .bench_refs(|(values, mask)| { - let buf: Buffer = BufferMut::try_from_trusted_len_iter( - values.iter().zip(mask.iter()).map(|(&v, valid)| { - let scaled = v * valid as u64; - if scaled <= u32::MAX as u64 { - Ok(scaled as u32) - } else { - Err(()) - } - }), - ) - .unwrap() - .freeze(); - buf - }); -} - -#[divan::bench(args = SIZES)] -fn iter_zip_checked_unaligned(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| (f.values.clone(), f.mask_unaligned.clone())) - .bench_refs(|(values, mask)| { - let buf: Buffer = BufferMut::try_from_trusted_len_iter( - values.iter().zip(mask.iter()).map(|(&v, valid)| { - let scaled = v * valid as u64; - if scaled <= u32::MAX as u64 { - Ok(scaled as u32) - } else { - Err(()) - } - }), - ) - .unwrap() - .freeze(); - buf - }); -} - -#[divan::bench(args = SIZES)] -fn arrow_cast_checked(bencher: Bencher, n: usize) { - let _ = n; - let f = fixture(n); - bencher - .with_inputs(|| f.arrow_arr.clone()) - .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap()); -} - -#[divan::bench(args = SIZES)] -fn arrow_cast_checked_unaligned(bencher: Bencher, n: usize) { - let _ = n; - let f = fixture(n); - bencher - .with_inputs(|| f.arrow_arr_unaligned.clone()) - .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, 
&CAST_OPTS_CHECKED).unwrap()); -} diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs index b2abe29b890..d3baec7885c 100644 --- a/vortex-buffer/benches/cast_to_indexed.rs +++ b/vortex-buffer/benches/cast_to_indexed.rs @@ -1,37 +1,28 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Mirror of `cast_to.rs` driving the kernels through [`vortex_buffer::lane_ops_indexed`] -//! (the `IndexedSource` trait) plus isolation benches that decompose the cost of the -//! kernel structure vs. the cast vs. the mask access. +//! Focused bench for the **best fallible cast kernel** — what `cast.rs` actually uses +//! in `vortex-array/src/arrays/primitive/compute/cast.rs`. Single bench, no cross-impl +//! baselines: just a regression guard for the production cast hot path. //! -//! See `vortex-buffer/HISTORY.md` for the iterator-API investigation that motivated -//! this design: a stateful `ExactSizeIterator` variant of these kernels was ~+100% -//! slower because per-lane `next()` calls create a 64-deep dependency chain across -//! iterations that blocks vectorization. The `IndexedSource` trait uses -//! `unsafe fn get_unchecked(i)` reads — independent across iterations — and inlines -//! to the same indexed load as the slice kernel. +//! The kernel: [`vortex_buffer::lane_ops_indexed::try_map_with_mask`] called with a +//! lazy-validity `or_else` closure — for statically-infallible casts (widening) LLVM +//! proves `NumCast::from` is always `Some`, the `or_else` branch is dead, and the +//! validity path is DCE'd. For fallible casts (narrowing), validity is only consulted +//! on the cold failure branch. 
#![expect(clippy::unwrap_used)] use std::mem::MaybeUninit; -use arrow_array::UInt64Array; -use arrow_buffer::NullBuffer; -use arrow_buffer::ScalarBuffer; -use arrow_cast::CastOptions; -use arrow_cast::cast_with_options; -use arrow_schema::DataType; use divan::Bencher; +use num_traits::NumCast; use rand::SeedableRng; use rand::prelude::*; use vortex_buffer::BitBuffer; use vortex_buffer::BitBufferMut; use vortex_buffer::Buffer; -use vortex_buffer::BufferMut; -use vortex_buffer::lane_ops_indexed::map_with_mask as indexed_map_with_mask; -use vortex_buffer::lane_ops_indexed::try_map_validity_filtered as indexed_try_map_validity_filtered; -use vortex_buffer::lane_ops_indexed::try_map_with_mask as indexed_try_map_with_mask; +use vortex_buffer::lane_ops_indexed::try_map_with_mask; fn main() { divan::main(); @@ -42,20 +33,9 @@ const VALID_RATE: f64 = 0.7; const DATA_SEED: u64 = 0; const VALID_SEED: u64 = 1; -// Non-byte-aligned bit offset → forces BitChunks::iter() to shift across byte -// boundaries on every chunk it yields. -const SLICE_OFFSET: usize = 5; - struct Fixture { values: Buffer, - /// `offset() == 0`, underlying byte buffer starts on a byte boundary. - mask_aligned: BitBuffer, - /// Same validity bits but sliced so `offset() == SLICE_OFFSET`. - mask_unaligned: BitBuffer, - arrow_arr: UInt64Array, - /// Same as `arrow_arr` but its NullBuffer has a non-byte-aligned bit offset, - /// constructed by building an oversized array and slicing. 
- arrow_arr_unaligned: UInt64Array, + mask: BitBuffer, } fn fixture(n: usize) -> Fixture { @@ -67,8 +47,7 @@ fn fixture(n: usize) -> Fixture { let raw_valid: Vec = (0..n).map(|_| valid_rng.random_bool(VALID_RATE)).collect(); let values: Buffer = raw_values.iter().copied().collect(); - - let mask_aligned = { + let mask = { let mut m = BitBufferMut::with_capacity(n); for &v in &raw_valid { m.append(v); @@ -76,392 +55,26 @@ fn fixture(n: usize) -> Fixture { m.freeze() }; - let mask_unaligned = { - let mut m = BitBufferMut::with_capacity(n + SLICE_OFFSET); - for _ in 0..SLICE_OFFSET { - m.append(false); - } - for &v in &raw_valid { - m.append(v); - } - m.freeze().slice(SLICE_OFFSET..SLICE_OFFSET + n) - }; - debug_assert_eq!(mask_unaligned.offset(), SLICE_OFFSET); - debug_assert_eq!(mask_unaligned.len(), n); - - let arrow_arr = UInt64Array::new( - ScalarBuffer::from(raw_values.clone()), - Some(NullBuffer::from(raw_valid.clone())), - ); - - let arrow_arr_unaligned = { - let mut padded_values: Vec = vec![0; SLICE_OFFSET]; - padded_values.extend_from_slice(&raw_values); - let mut padded_valid: Vec = vec![false; SLICE_OFFSET]; - padded_valid.extend_from_slice(&raw_valid); - let oversized = UInt64Array::new( - ScalarBuffer::from(padded_values), - Some(NullBuffer::from(padded_valid)), - ); - use arrow_array::Array; - let sliced = oversized.slice(SLICE_OFFSET, n); - debug_assert_eq!( - sliced.nulls().map(|n| n.offset()).unwrap_or(0) % 8, - SLICE_OFFSET - ); - sliced - }; - - Fixture { - values, - mask_aligned, - mask_unaligned, - arrow_arr, - arrow_arr_unaligned, - } -} - -const CAST_OPTS: CastOptions<'static> = CastOptions { - safe: true, - format_options: arrow_cast::display::FormatOptions::new(), -}; - -const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions { - safe: false, - format_options: arrow_cast::display::FormatOptions::new(), -}; - -#[divan::bench(args = SIZES)] -fn arrow_cast(bencher: Bencher, n: usize) { - let _ = n; - let f = fixture(n); - bencher - 
.with_inputs(|| f.arrow_arr.clone()) - .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap()); -} - -#[divan::bench(args = SIZES)] -fn arrow_cast_unaligned(bencher: Bencher, n: usize) { - let _ = n; - let f = fixture(n); - bencher - .with_inputs(|| f.arrow_arr_unaligned.clone()) - .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap()); -} - -#[divan::bench(args = SIZES)] -fn iter_zip_checked(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| (f.values.clone(), f.mask_aligned.clone())) - .bench_refs(|(values, mask)| { - let buf: Buffer = BufferMut::try_from_trusted_len_iter( - values.iter().zip(mask.iter()).map(|(&v, valid)| { - let scaled = v * valid as u64; - if scaled <= u32::MAX as u64 { - Ok(scaled as u32) - } else { - Err(()) - } - }), - ) - .unwrap() - .freeze(); - buf - }); + Fixture { values, mask } } +/// The kernel `cast.rs` uses in production: `try_map_with_mask` with a lazy-validity +/// `or_else` closure. `NumCast::from(v)` is the cast; `or_else` only fires (and only +/// then reads `valid`) when the cast itself returned `None`. 
#[divan::bench(args = SIZES)] -fn iter_zip_checked_unaligned(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| (f.values.clone(), f.mask_unaligned.clone())) - .bench_refs(|(values, mask)| { - let buf: Buffer = BufferMut::try_from_trusted_len_iter( - values.iter().zip(mask.iter()).map(|(&v, valid)| { - let scaled = v * valid as u64; - if scaled <= u32::MAX as u64 { - Ok(scaled as u32) - } else { - Err(()) - } - }), - ) - .unwrap() - .freeze(); - buf - }); -} - -#[divan::bench(args = SIZES)] -fn arrow_cast_checked(bencher: Bencher, n: usize) { - let _ = n; - let f = fixture(n); - bencher - .with_inputs(|| f.arrow_arr.clone()) - .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap()); -} - -#[divan::bench(args = SIZES)] -fn arrow_cast_checked_unaligned(bencher: Bencher, n: usize) { - let _ = n; - let f = fixture(n); - bencher - .with_inputs(|| f.arrow_arr_unaligned.clone()) - .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap()); -} - -// ----------------------------------------------------------------------------- -// Isolation benches: drop the mask, isolate the cast u64 -> u32 to see whether -// the iterator cost is intrinsic or comes from the surrounding kernel structure. -// ----------------------------------------------------------------------------- - -/// Plain slice indexing, no mask. Upper bound on what the iter variants must beat. -#[divan::bench(args = SIZES)] -fn iso_slice_cast(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), out) - }) - .bench_refs(|(values, out)| { - let v = values.as_slice(); - let o = out.as_mut_slice(); - assert_eq!(v.len(), o.len()); - for i in 0..v.len() { - // SAFETY: bounds checked by the assert above. 
- unsafe { o.get_unchecked_mut(i).write(*v.get_unchecked(i) as u32) }; - } - }); -} - -/// Per-lane iterator zip, no mask. Tests whether `slice::Iter::next` autovectorizes -/// when nothing else is in the way. -#[divan::bench(args = SIZES)] -fn iso_iter_cast(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), out) - }) - .bench_refs(|(values, out)| { - for (slot, &v) in out.iter_mut().zip(values.iter()) { - slot.write(v as u32); - } - }); -} - -/// `chunks_exact(64)` + `try_into::<&[u64; 64]>` so the outer iter advances once per -/// 64 lanes and the inner loop indexes a fixed-size array. Tests whether moving the -/// iterator state from per-lane to per-chunk fixes vectorization. -#[divan::bench(args = SIZES)] -fn iso_iter_chunks_64(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), out) - }) - .bench_refs(|(values, out)| { - let v = values.as_slice(); - let o = out.as_mut_slice(); - assert_eq!(v.len(), o.len()); - for (v_chunk, o_chunk) in v.chunks_exact(64).zip(o.chunks_exact_mut(64)) { - let v_arr: &[u64; 64] = v_chunk.try_into().unwrap(); - let o_arr: &mut [MaybeUninit; 64] = o_chunk.try_into().unwrap(); - for bit_idx in 0..64 { - o_arr[bit_idx].write(v_arr[bit_idx] as u32); - } - } - // Ignore the tail — SIZES are all multiples of 64. - }); -} - -// ----------------------------------------------------------------------------- -// Indexed-source variant (lane_ops_indexed). The kernel takes an `IndexedSource` whose -// `&[T]` impl is `unsafe fn get_unchecked(i) -> T` — same indexed load as the slice -// kernel, but the trait also supports binary inputs via `LaneZip`. 
-// ----------------------------------------------------------------------------- - -#[divan::bench(args = SIZES)] -fn indexed_kernel_map_with_mask(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_aligned.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - indexed_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { - (v * valid as u64) as u32 - }); - }); -} - -#[divan::bench(args = SIZES)] -fn indexed_kernel_map_with_mask_unaligned(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_unaligned.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - indexed_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { - (v * valid as u64) as u32 - }); - }); -} - -#[divan::bench(args = SIZES)] -fn indexed_kernel_try_map_with_mask(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_aligned.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - indexed_try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }) - .unwrap(); - }); -} - -#[divan::bench(args = SIZES)] -fn indexed_kernel_try_map_with_mask_unaligned(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_unaligned.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - indexed_try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) 
- }) - .unwrap(); - }); -} - -#[divan::bench(args = SIZES)] -fn indexed_kernel_try_from_branchful(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_aligned.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - indexed_try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { - if valid { - u32::try_from(v).ok() - } else { - Some(0_u32) - } - }) - .unwrap(); - }); -} - -// ----------------------------------------------------------------------------- -// Decoupled-design variant with CORRECT validity semantics: closure is `|v|` -// (no per-lane mask threading), but the mask filters out null-lane failures at -// the chunk boundary. A null row whose stored value would overflow does NOT -// cause Err — this matches the existing `try_map_with_mask` semantics while -// keeping the lighter inner loop. -// ----------------------------------------------------------------------------- - -#[divan::bench(args = SIZES)] -fn indexed_decoupled_kernel_try_map_with_mask(bencher: Bencher, n: usize) { +fn cast_lazy_validity(bencher: Bencher, n: usize) { let f = fixture(n); bencher .with_inputs(|| { let mut out: Vec> = Vec::with_capacity(n); // SAFETY: every lane is written before any read inside the kernel. 
unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_aligned.clone(), out) + (f.values.clone(), f.mask.clone(), out) }) .bench_refs(|(values, mask, out)| { - indexed_try_map_validity_filtered(values.as_slice(), mask, out.as_mut_slice(), |v| { - (v <= u32::MAX as u64).then_some(v as u32) + try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + ::from(v).or_else(|| (!valid).then(u32::default)) }) .unwrap(); }); } - -#[divan::bench(args = SIZES)] -fn indexed_decoupled_kernel_try_from_branchful(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_aligned.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - indexed_try_map_validity_filtered(values.as_slice(), mask, out.as_mut_slice(), |v| { - u32::try_from(v).ok() - }) - .unwrap(); - }); -} - -/// Full checked-cast kernel using `chunks_exact(64)` + fixed-size array refs, with -/// the mask. If this matches the slice kernel, the cost is in the per-lane iterator -/// state, not the iter pattern in general. 
-#[divan::bench(args = SIZES)] -fn kernel_iter_chunks_64(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values.clone(), f.mask_aligned.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - let v = values.as_slice(); - let o = out.as_mut_slice(); - let len = v.len(); - assert_eq!(len, mask.len()); - assert_eq!(len, o.len()); - - let chunks = mask.chunks(); - let chunks_count = len / 64; - let full = chunks_count * 64; - let (v_full, _v_rem) = v.split_at(full); - let (o_full, _o_rem) = o.split_at_mut(full); - - for ((v_chunk, o_chunk), src_chunk) in v_full - .chunks_exact(64) - .zip(o_full.chunks_exact_mut(64)) - .zip(chunks.iter()) - { - let v_arr: &[u64; 64] = v_chunk.try_into().unwrap(); - let o_arr: &mut [MaybeUninit; 64] = o_chunk.try_into().unwrap(); - let mut fail_acc: u64 = 0; - for bit_idx in 0..64 { - let bit = (src_chunk >> bit_idx) & 1 == 1; - let scaled = v_arr[bit_idx] * bit as u64; - let opt = (scaled <= u32::MAX as u64).then_some(scaled as u32); - fail_acc |= opt.is_none() as u64; - o_arr[bit_idx].write(opt.unwrap_or_default()); - } - assert_eq!(fail_acc, 0); - } - // Ignore the tail — SIZES are all multiples of 64. - }); -} diff --git a/vortex-buffer/src/lane_ops.rs b/vortex-buffer/src/lane_ops.rs deleted file mode 100644 index b145633465b..00000000000 --- a/vortex-buffer/src/lane_ops.rs +++ /dev/null @@ -1,713 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Elementwise kernels that combine a `[T]` slice with a `BitBuffer` validity mask. -//! -//! The output is always a caller-provided `&mut` slice — these kernels never allocate. -//! Both kernels handle a mask with a non-byte-aligned offset and with a logical `len` -//! shorter than the underlying byte buffer, via [`BitBuffer::chunks`]. 
- -use std::mem::MaybeUninit; - -use crate::BitBuffer; - -/// Apply `f(value, valid)` lane-by-lane, writing `out[i] = f(values[i], mask[i])`. -/// -/// All three inputs must have the same length. The output type `R` may differ from the -/// input type `T` — this kernel is the building block for both same-type transforms -/// (fill_null) and cross-type ones (cast). The caller is responsible for marking `out` -/// initialized (e.g. by calling `BufferMut::set_len` after this returns). -/// -/// # Panics -/// -/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`. -#[inline] -pub fn map_with_mask(values: &[T], mask: &BitBuffer, out: &mut [MaybeUninit], mut f: F) -where - T: Copy, - F: FnMut(T, bool) -> R, -{ - let len = values.len(); - assert_eq!(len, mask.len(), "values and mask must have the same length"); - assert_eq!(out.len(), len, "out must have the same length as values"); - - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; - - for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - let base = chunk_idx * 64; - // Inner loop is fixed-size 64 so the compiler can autovectorize - // for branchless closures like `|v, valid| v * (valid as T)`. - for bit_idx in 0..64 { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: chunks.iter() yields chunks_count full words, so i < chunks_count * 64 <= len. - let v = unsafe { *values.get_unchecked(i) }; - unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; - } - } - - if remainder != 0 { - let src_chunk = chunks.remainder_bits(); - let base = chunks_count * 64; - for bit_idx in 0..remainder { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: i = chunks_count * 64 + bit_idx < chunks_count * 64 + remainder = len. - let v = unsafe { *values.get_unchecked(i) }; - unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; - } - } -} - -/// Fallible variant of [`map_with_mask`]. 
`f` returns `Option`; `None` indicates a -/// per-lane failure (e.g. range overflow on a narrowing cast). -/// -/// The kernel does not short-circuit on the first failure inside a chunk: it processes -/// whole 64-lane chunks with `is_none()` flags OR-reduced into a single accumulator, -/// then checks after each chunk. On failure, a cold scalar attribution pass replays the -/// closure over that chunk to identify the first failing lane. The hot loop stays -/// autovectorizable — the per-lane cost is one OR on top of the cast. -/// -/// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` write -/// `R::default()` into `out`, but the contents of `out` must not be relied upon when -/// this function returns `Err`. -/// -/// # Panics -/// -/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`. -#[inline] -pub fn try_map_with_mask( - values: &[T], - mask: &BitBuffer, - out: &mut [MaybeUninit], - mut f: F, -) -> Result<(), usize> -where - T: Copy, - R: Copy + Default, - F: FnMut(T, bool) -> Option, -{ - let len = values.len(); - assert_eq!(len, mask.len(), "values and mask must have the same length"); - assert_eq!(out.len(), len, "out must have the same length as values"); - - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; - - for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - let base = chunk_idx * 64; - // Per-chunk accumulator — does not escape the SIMD inner loop. - let mut fail_acc: u64 = 0; - for bit_idx in 0..64 { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: i < chunks_count * 64 <= len. - let v = unsafe { *values.get_unchecked(i) }; - let opt = f(v, bit); - fail_acc |= opt.is_none() as u64; - let r = opt.unwrap_or_default(); - // SAFETY: i < len. 
- unsafe { out.get_unchecked_mut(i).write(r) }; - } - if fail_acc != 0 { - return Err(attribute_failure(values, src_chunk, base, 64, &mut f)); - } - } - - if remainder != 0 { - let src_chunk = chunks.remainder_bits(); - let base = chunks_count * 64; - let mut fail_acc: u64 = 0; - for bit_idx in 0..remainder { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: i < len. - let v = unsafe { *values.get_unchecked(i) }; - let opt = f(v, bit); - fail_acc |= opt.is_none() as u64; - let r = opt.unwrap_or_default(); - // SAFETY: i < len. - unsafe { out.get_unchecked_mut(i).write(r) }; - } - if fail_acc != 0 { - return Err(attribute_failure( - values, src_chunk, base, remainder, &mut f, - )); - } - } - - Ok(()) -} - -/// Cold path: identify the first lane in a chunk where `f` returned `None`. -/// -/// Called only after the hot loop has detected that at least one lane failed. -/// Walks the chunk scalar-style; not autovectorized, but that's fine — it only -/// runs once per error and the error path is supposed to be exceptional. -#[cold] -#[inline(never)] -fn attribute_failure( - values: &[T], - src_chunk: u64, - base: usize, - chunk_len: usize, - f: &mut F, -) -> usize -where - T: Copy, - F: FnMut(T, bool) -> Option, -{ - for bit_idx in 0..chunk_len { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: caller guarantees i < values.len(). - let v = unsafe { *values.get_unchecked(i) }; - if f(v, bit).is_none() { - return i; - } - } - // Unreachable: hot loop's OR-reduction said at least one lane in [base, base+chunk_len) failed. - unreachable!("attribute_failure called without a failing lane") -} - -/// Apply `f(value) -> bool` lane-by-lane, packing into `out` as `u64` words. -/// -/// This is the validity-free sibling of [`map_with_mask_to_bits`]. Use it when the -/// predicate is a pure function of the value (e.g. 
compare-to-constant on a primitive -/// buffer) and combine the validity bitmap in a separate pass — splitting the work -/// this way lets the value-compare loop autovectorize cleanly. -/// -/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word -/// beyond `len % 64` are written as `0`. -/// -/// # Panics -/// -/// Panics if `out.len() != values.len().div_ceil(64)`. -#[inline] -pub fn map_to_bits(values: &[T], out: &mut [u64], mut f: F) -where - T: Copy, - F: FnMut(T) -> bool, -{ - let len = values.len(); - assert_eq!( - out.len(), - len.div_ceil(64), - "out must have len.div_ceil(64) words", - ); - - let chunks_count = len / 64; - let remainder = len % 64; - - for chunk_idx in 0..chunks_count { - let base = chunk_idx * 64; - let mut packed = 0u64; - for bit_idx in 0..64 { - // SAFETY: base + bit_idx < chunks_count * 64 <= len. - let v = unsafe { *values.get_unchecked(base + bit_idx) }; - packed |= (f(v) as u64) << bit_idx; - } - // SAFETY: chunk_idx < chunks_count <= out.len(). - unsafe { *out.get_unchecked_mut(chunk_idx) = packed }; - } - - if remainder != 0 { - let base = chunks_count * 64; - let mut packed = 0u64; - for bit_idx in 0..remainder { - // SAFETY: base + bit_idx < len. - let v = unsafe { *values.get_unchecked(base + bit_idx) }; - packed |= (f(v) as u64) << bit_idx; - } - // SAFETY: chunks_count < out.len() because remainder != 0. - unsafe { *out.get_unchecked_mut(chunks_count) = packed }; - } -} - -/// Apply `f(value, valid) -> bool` lane-by-lane, packing into `out` as `u64` words. -/// -/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word -/// beyond `len % 64` are written as `0`. -/// -/// # Panics -/// -/// Panics if `values.len() != mask.len()` or `out.len() != values.len().div_ceil(64)`. 
-#[inline] -pub fn map_with_mask_to_bits(values: &[T], mask: &BitBuffer, out: &mut [u64], mut f: F) -where - T: Copy, - F: FnMut(T, bool) -> bool, -{ - let len = values.len(); - assert_eq!(len, mask.len(), "values and mask must have the same length"); - assert_eq!( - out.len(), - len.div_ceil(64), - "out must have len.div_ceil(64) words", - ); - - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; - - for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - let base = chunk_idx * 64; - let mut packed = 0u64; - for bit_idx in 0..64 { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: i < chunks_count * 64 <= len. - let v = unsafe { *values.get_unchecked(i) }; - packed |= (f(v, bit) as u64) << bit_idx; - } - // SAFETY: chunk_idx < chunks_count <= out.len(). - unsafe { *out.get_unchecked_mut(chunk_idx) = packed }; - } - - if remainder != 0 { - let src_chunk = chunks.remainder_bits(); - let base = chunks_count * 64; - let mut packed = 0u64; - for bit_idx in 0..remainder { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: i < len. - let v = unsafe { *values.get_unchecked(i) }; - packed |= (f(v, bit) as u64) << bit_idx; - } - // SAFETY: chunks_count < out.len() because remainder != 0. - unsafe { *out.get_unchecked_mut(chunks_count) = packed }; - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::BitBufferMut; - - fn write_t(out: Vec>) -> Vec { - // SAFETY: tests always fully initialize the buffer. 
- unsafe { std::mem::transmute(out) } - } - - #[test] - fn map_with_mask_aligned() { - let values: Vec = (0..10).collect(); - let mask = { - let mut m = BitBufferMut::with_capacity(10); - for i in 0..10 { - m.append(i % 2 == 0); - } - m.freeze() - }; - let mut out = vec![MaybeUninit::::uninit(); 10]; - map_with_mask( - &values, - &mask, - &mut out, - |v, valid| if valid { v } else { -1 }, - ); - assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]); - } - - #[test] - fn map_with_mask_partial_chunk() { - // 130 lanes — two full u64 words + a 2-bit remainder. - let values: Vec = (0..130).collect(); - let mask = BitBuffer::new_set(130); - let mut out = vec![MaybeUninit::::uninit(); 130]; - map_with_mask( - &values, - &mask, - &mut out, - |v, valid| if valid { v + 1 } else { 0 }, - ); - let got = write_t(out); - assert_eq!(got.len(), 130); - assert_eq!(got[0], 1); - assert_eq!(got[63], 64); - assert_eq!(got[64], 65); - assert_eq!(got[129], 130); - } - - #[test] - fn map_with_mask_offset_mask() { - // Build a 128-bit all-true mask, then slice off the first 5 bits to force offset=5. - let big = BitBuffer::new_set(128); - let sliced = big.slice(5..70); // logical len = 65, offset = 5 - assert_eq!(sliced.len(), 65); - assert_eq!(sliced.offset(), 5); - - let values: Vec = (0..65).collect(); - let mut out = vec![MaybeUninit::::uninit(); 65]; - map_with_mask( - &values, - &sliced, - &mut out, - |v, valid| if valid { v } else { u32::MAX }, - ); - let got = write_t(out); - assert_eq!(got, (0..65).collect::>()); - } - - #[test] - fn map_with_mask_offset_past_word() { - // Slicing past a full word still works. `BitBuffer::slice` normalizes the - // logical offset to `offset % 8` and bumps the underlying byte pointer, - // so `offset()` won't equal 70 here — what we exercise is that the kernel - // walks the chunked u64 view (which BitChunks handles internally). 
- let big = BitBuffer::new_set(256); - let sliced = big.slice(70..200); - assert_eq!(sliced.len(), 130); - - let values: Vec = (0..130).map(|i| i as i16).collect(); - let mut out = vec![MaybeUninit::::uninit(); 130]; - map_with_mask( - &values, - &sliced, - &mut out, - |v, valid| if valid { v } else { -1 }, - ); - let got = write_t(out); - assert_eq!(got, (0..130).map(|i| i as i16).collect::>()); - } - - #[test] - fn map_with_mask_empty() { - let values: Vec = vec![]; - let mask = BitBuffer::new_unset(0); - let mut out: Vec> = vec![]; - map_with_mask(&values, &mask, &mut out, |v, _| v); - } - - #[test] - fn map_with_mask_null_to_zero_branchless() { - // The trick from primitive/compute/cast.rs:147 — multiply by valid as T. - let values: Vec = (1..=100).collect(); - let mask = { - let mut m = BitBufferMut::with_capacity(100); - for i in 0..100 { - m.append(i % 3 != 0); - } - m.freeze() - }; - let mut out = vec![MaybeUninit::::uninit(); 100]; - map_with_mask(&values, &mask, &mut out, |v, valid| v * (valid as i64)); - let got = write_t(out); - for (i, &x) in got.iter().enumerate() { - if i % 3 == 0 { - assert_eq!(x, 0); - } else { - assert_eq!(x, (i + 1) as i64); - } - } - } - - #[test] - fn map_with_mask_to_bits_aligned() { - let values: Vec = (0..128).collect(); - let mask = BitBuffer::new_set(128); - let mut out = vec![0u64; 2]; - map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v % 2 == 0); - // Even numbers in [0, 128) set, odd unset. - for word_idx in 0..2 { - let word = out[word_idx]; - for bit in 0..64 { - let i = word_idx * 64 + bit; - let expected = i % 2 == 0; - assert_eq!((word >> bit) & 1 == 1, expected, "lane {i}"); - } - } - } - - #[test] - fn map_with_mask_to_bits_partial_chunk() { - // 130 lanes — three u64 words, last word has only 2 valid bits. 
- let values: Vec = (0..130).collect(); - let mask = BitBuffer::new_set(130); - let mut out = vec![0u64; 130usize.div_ceil(64)]; - assert_eq!(out.len(), 3); - map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v >= 64); - // Bits 64..128 set in word 1; bits 128..130 set in word 2. - assert_eq!(out[0], 0); - assert_eq!(out[1], u64::MAX); - assert_eq!(out[2], 0b11); - } - - #[test] - fn map_with_mask_to_bits_offset() { - let big = BitBuffer::new_set(256); - let sliced = big.slice(13..143); // offset=13, len=130 - assert_eq!(sliced.len(), 130); - let values: Vec = (0..130).map(|i| (i % 4) as u8).collect(); - let mut out = vec![0u64; 130usize.div_ceil(64)]; - map_with_mask_to_bits(&values, &sliced, &mut out, |v, valid| valid && v == 0); - for i in 0..130 { - let word = out[i / 64]; - let bit = (word >> (i % 64)) & 1 == 1; - assert_eq!(bit, i % 4 == 0, "lane {i}"); - } - } - - #[test] - fn try_map_with_mask_all_ok() { - let values: Vec = (0..200).collect(); - let mask = BitBuffer::new_set(200); - let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); - assert!(res.is_ok()); - let got = write_t(out); - assert_eq!(got, (0..200u32).collect::>()); - } - - #[test] - fn try_map_with_mask_overflow_fails() { - // Put an overflowing value at lane 137 — the kernel must report Err(137). - let mut values: Vec = (0..200).collect(); - values[137] = (u32::MAX as u64) + 1; - let mask = BitBuffer::new_set(200); - let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); - assert_eq!(res, Err(137)); - } - - #[test] - fn try_map_with_mask_overflow_reports_first_failing_lane() { - // Multiple failing lanes — must report the lowest index. 
- let mut values: Vec = (0..200).collect(); - values[50] = u64::MAX; - values[51] = u64::MAX; - values[137] = u64::MAX; - let mask = BitBuffer::new_set(200); - let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); - assert_eq!(res, Err(50)); - } - - #[test] - fn try_map_with_mask_null_lane_bypasses_check() { - // Null lanes are neutralized by `valid as u64` before the range check, so an - // out-of-range value at a null lane must NOT trigger failure. - let mut values: Vec = (0..200).collect(); - values[5] = u64::MAX; - let mask = { - let mut m = BitBufferMut::with_capacity(200); - for i in 0..200 { - m.append(i != 5); - } - m.freeze() - }; - let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); - assert!(res.is_ok()); - let got = write_t(out); - assert_eq!(got[5], 0); // null-lane wrote default - assert_eq!(got[6], 6); - } - - #[test] - fn try_map_with_mask_branchful_matches_branchless() { - let mut values: Vec = (0..130).map(|i| i as u64 * 7).collect(); - values[2] = u64::MAX; - values[65] = u32::MAX as u64; - let mask = { - let mut m = BitBufferMut::with_capacity(130); - for i in 0..130 { - m.append(!matches!(i, 2 | 17 | 99)); - } - m.freeze() - }; - - let mut branchless = vec![MaybeUninit::::uninit(); 130]; - let mut branchful = vec![MaybeUninit::::uninit(); 130]; - try_map_with_mask(&values, &mask, &mut branchless, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }) - .unwrap(); - try_map_with_mask(&values, &mask, &mut branchful, |v, valid| { - if valid { - u32::try_from(v).ok() - } else { - Some(0) - } - }) - .unwrap(); - - assert_eq!(write_t(branchful), write_t(branchless)); - } - - #[test] - 
fn try_map_with_mask_partial_chunk() { - let values: Vec = (0..130).collect(); - let mask = BitBuffer::new_set(130); - let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); - assert!(res.is_ok()); - let got = write_t(out); - assert_eq!(got.len(), 130); - assert_eq!(got[129], 129); - } - - #[test] - fn try_map_with_mask_sliced_mask_unaligned_offset() { - // The mask's first byte is not word-aligned: slice off 13 bits, so the - // underlying BitChunks iterator must shift across byte boundaries on every - // 64-bit chunk it yields. - let big = BitBuffer::new_set(256); - let mask = big.slice(13..143); // logical len = 130, bit offset = 13 % 8 = 5 - assert_eq!(mask.len(), 130); - - let values: Vec = (0..130).collect(); - let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); - assert!(res.is_ok()); - let got = write_t(out); - assert_eq!(got, (0..130u32).collect::>()); - } - - #[test] - fn try_map_with_mask_sliced_mask_with_overflow() { - // Sliced mask + overflowing value — the cold attribution path must report - // the correct lane index in the sliced (post-offset) coordinate space. - let big = BitBuffer::new_set(256); - let mask = big.slice(13..143); - assert_eq!(mask.len(), 130); - - let mut values: Vec = (0..130).collect(); - values[77] = u64::MAX; - let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); - assert_eq!(res, Err(77)); - } - - #[test] - fn try_map_with_mask_sliced_mask_null_lanes() { - // Mix sliced offset with a non-trivial validity pattern. 
Null lanes must - // not contribute to fail_acc, even when their underlying value would overflow. - let mut m = BitBufferMut::with_capacity(256); - for i in 0..256 { - m.append(i % 3 != 0); - } - let big = m.freeze(); - let mask = big.slice(13..143); - assert_eq!(mask.len(), 130); - - // After the 13-lane slice, original index `13 + j` becomes lane `j`. - // Lane `j` is valid iff `(13 + j) % 3 != 0`. - let mut values: Vec = (0..130).collect(); - // Pick a lane that is INVALID in the sliced coords: 13+2 = 15, 15 % 3 == 0 → invalid. - // Stuff in an overflowing value; it must be neutralized by `* valid as u64`. - values[2] = u64::MAX; - let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); - assert!(res.is_ok(), "null lane should bypass the range check"); - } - - #[test] - fn try_map_with_mask_overflow_in_remainder() { - // Overflow in the trailing partial chunk (not aligned to 64). 
- let mut values: Vec = (0..130).collect(); - values[129] = (u32::MAX as u64) + 1; - let mask = BitBuffer::new_set(130); - let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); - assert_eq!(res, Err(129)); - } - - #[test] - fn map_to_bits_aligned() { - let values: Vec = (0..128).collect(); - let mut out = vec![0u64; 2]; - map_to_bits(&values, &mut out, |v| v % 2 == 0); - for word_idx in 0..2 { - for bit in 0..64 { - let i = word_idx * 64 + bit; - let expected = i % 2 == 0; - assert_eq!((out[word_idx] >> bit) & 1 == 1, expected, "lane {i}"); - } - } - } - - #[test] - fn map_to_bits_partial_chunk() { - let values: Vec = (0..130).collect(); - let mut out = vec![0u64; 130usize.div_ceil(64)]; - assert_eq!(out.len(), 3); - map_to_bits(&values, &mut out, |v| v >= 64); - assert_eq!(out[0], 0); - assert_eq!(out[1], u64::MAX); - assert_eq!(out[2], 0b11); - } - - #[test] - fn map_to_bits_empty() { - let values: Vec = vec![]; - let mut out: Vec = vec![]; - map_to_bits(&values, &mut out, |v| v > 0); - } - - #[test] - fn map_to_bits_matches_fused_with_all_valid_mask() { - // map_to_bits + AND with an all-true mask must equal map_with_mask_to_bits. - let values: Vec = (0..200).map(|i| i % 7).collect(); - let mask = BitBuffer::new_set(200); - - let mut a = vec![0u64; 200usize.div_ceil(64)]; - map_with_mask_to_bits(&values, &mask, &mut a, |v, valid| valid && v == 3); - - let mut b = vec![0u64; 200usize.div_ceil(64)]; - map_to_bits(&values, &mut b, |v| v == 3); - - assert_eq!(a, b); - } - - #[test] - fn map_with_mask_to_bits_validity_kills_lane() { - // Even if predicate is true, null lanes should produce false. 
- let values: Vec = vec![1; 70]; - let mask = { - let mut m = BitBufferMut::with_capacity(70); - for i in 0..70 { - m.append(i >= 32); // first 32 lanes are null - } - m.freeze() - }; - let mut out = vec![0u64; 70usize.div_ceil(64)]; - map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v == 1); - for i in 0..70 { - let bit = (out[i / 64] >> (i % 64)) & 1 == 1; - assert_eq!(bit, i >= 32, "lane {i}"); - } - } -} diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs index c83114d8bcd..dfd2c41fd4a 100644 --- a/vortex-buffer/src/lane_ops_indexed.rs +++ b/vortex-buffer/src/lane_ops_indexed.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Indexed-source variant of [`crate::lane_ops`]. +//! Elementwise lane kernels over indexed sources. //! //! Replaces `&[T]` with an [`IndexedSource`] trait: each lane read is //! `unsafe fn get_unchecked(i) -> Item`, independent across iterations. For `&[T]` @@ -16,6 +16,8 @@ //! Both kernels handle a mask with a non-byte-aligned offset and with a logical `len` //! shorter than the underlying byte buffer, via [`BitBuffer::chunks`]. +#![allow(clippy::many_single_char_names)] + use std::mem::MaybeUninit; use crate::BitBuffer; @@ -105,7 +107,11 @@ impl LaneZip { /// /// Panics if the two operands have different lengths. pub fn new(a: A, b: B) -> Self { - assert_eq!(a.len(), b.len(), "LaneZip operands must have the same length"); + assert_eq!( + a.len(), + b.len(), + "LaneZip operands must have the same length" + ); Self(a, b) } } @@ -120,12 +126,7 @@ impl IndexedSource for LaneZip { #[inline] unsafe fn get_unchecked(&self, i: usize) -> (A::Item, B::Item) { // SAFETY: caller guarantees i < self.len(); `new` enforces matching lengths. 
- unsafe { - ( - self.0.get_unchecked(i), - self.1.get_unchecked(i), - ) - } + unsafe { (self.0.get_unchecked(i), self.1.get_unchecked(i)) } } } @@ -140,12 +141,8 @@ impl IndexedSource for LaneZip { /// /// Panics if `values.len() != mask.len()` or `out.len() != values.len()`. #[inline] -pub fn map_with_mask( - values: S, - mask: &BitBuffer, - out: &mut [MaybeUninit], - mut f: F, -) where +pub fn map_with_mask(values: S, mask: &BitBuffer, out: &mut [MaybeUninit], mut f: F) +where S: IndexedSource, F: FnMut(S::Item, bool) -> R, { @@ -481,12 +478,7 @@ where /// Cold attribution for the no-mask variant. #[cold] #[inline(never)] -fn attribute_failure_no_mask( - values: &S, - base: usize, - chunk_len: usize, - f: &mut F, -) -> usize +fn attribute_failure_no_mask(values: &S, base: usize, chunk_len: usize, f: &mut F) -> usize where S: IndexedSource, F: FnMut(S::Item) -> Option, @@ -608,6 +600,7 @@ where /// /// Panics if `values.len() != mask.len()`. #[inline] +#[allow(clippy::cast_possible_truncation)] pub fn try_map_with_mask_in_place( mut values: S, mask: &BitBuffer, @@ -780,6 +773,7 @@ where } #[cfg(test)] +#[allow(clippy::cast_possible_truncation)] mod tests { use super::*; use crate::BitBufferMut; @@ -800,12 +794,9 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 10]; - map_with_mask( - values.as_slice(), - &mask, - &mut out, - |v, valid| if valid { v } else { -1 }, - ); + map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + if valid { v } else { -1 } + }); assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]); } @@ -815,12 +806,9 @@ mod tests { let values: Vec = (0..130).collect(); let mask = BitBuffer::new_set(130); let mut out = vec![MaybeUninit::::uninit(); 130]; - map_with_mask( - values.as_slice(), - &mask, - &mut out, - |v, valid| if valid { v + 1 } else { 0 }, - ); + map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + if valid { v + 1 } else { 0 } + }); let got = write_t(out); assert_eq!(got.len(), 
130); assert_eq!(got[0], 1); @@ -839,12 +827,9 @@ mod tests { let values: Vec = (0..65).collect(); let mut out = vec![MaybeUninit::::uninit(); 65]; - map_with_mask( - values.as_slice(), - &sliced, - &mut out, - |v, valid| if valid { v } else { u32::MAX }, - ); + map_with_mask(values.as_slice(), &sliced, &mut out, |v, valid| { + if valid { v } else { u32::MAX } + }); let got = write_t(out); assert_eq!(got, (0..65).collect::>()); } @@ -861,12 +846,9 @@ mod tests { let values: Vec = (0..130).map(|i| i as i16).collect(); let mut out = vec![MaybeUninit::::uninit(); 130]; - map_with_mask( - values.as_slice(), - &sliced, - &mut out, - |v, valid| if valid { v } else { -1 }, - ); + map_with_mask(values.as_slice(), &sliced, &mut out, |v, valid| { + if valid { v } else { -1 } + }); let got = write_t(out); assert_eq!(got, (0..130).map(|i| i as i16).collect::>()); } @@ -891,7 +873,9 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 100]; - map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| v * (valid as i64)); + map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + v * (valid as i64) + }); let got = write_t(out); for (i, &x) in got.iter().enumerate() { if i % 3 == 0 { @@ -907,7 +891,9 @@ mod tests { let values: Vec = (0..128).collect(); let mask = BitBuffer::new_set(128); let mut out = vec![0u64; 2]; - map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| valid && v % 2 == 0); + map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| { + valid && v % 2 == 0 + }); // Even numbers in [0, 128) set, odd unset. 
for word_idx in 0..2 { let word = out[word_idx]; @@ -926,7 +912,9 @@ mod tests { let mask = BitBuffer::new_set(130); let mut out = vec![0u64; 130usize.div_ceil(64)]; assert_eq!(out.len(), 3); - map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| valid && v >= 64); + map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| { + valid && v >= 64 + }); // Bits 64..128 set in word 1; bits 128..130 set in word 2. assert_eq!(out[0], 0); assert_eq!(out[1], u64::MAX); @@ -940,7 +928,9 @@ mod tests { assert_eq!(sliced.len(), 130); let values: Vec = (0..130).map(|i| (i % 4) as u8).collect(); let mut out = vec![0u64; 130usize.div_ceil(64)]; - map_with_mask_to_bits(values.as_slice(),&sliced, &mut out, |v, valid| valid && v == 0); + map_with_mask_to_bits(values.as_slice(), &sliced, &mut out, |v, valid| { + valid && v == 0 + }); for i in 0..130 { let word = out[i / 64]; let bit = (word >> (i % 64)) & 1 == 1; @@ -1008,13 +998,13 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_validity_filtered( - values.as_slice(), - &mask, - &mut out, - |v| (v <= u32::MAX as u64).then_some(v as u32), + let res = try_map_validity_filtered(values.as_slice(), &mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); + assert!( + res.is_ok(), + "null-lane overflow should not propagate as Err" ); - assert!(res.is_ok(), "null-lane overflow should not propagate as Err"); } #[test] @@ -1035,12 +1025,9 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_validity_filtered( - values.as_slice(), - &mask, - &mut out, - |v| (v <= u32::MAX as u64).then_some(v as u32), - ); + let res = try_map_validity_filtered(values.as_slice(), &mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert_eq!(res, Err(77)); } @@ -1193,6 +1180,124 @@ mod tests { assert_eq!(res, Err(129)); } + #[test] + fn map_with_mask_in_place_basic() { + let mut values: Vec = 
(0..130).collect(); + let mask = { + let mut m = BitBufferMut::with_capacity(130); + for i in 0..130 { + m.append(i % 2 == 0); + } + m.freeze() + }; + map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| { + v.wrapping_mul(valid as u32) + }); + let expected: Vec = (0..130u32) + .map(|v| if v % 2 == 0 { v } else { 0 }) + .collect(); + assert_eq!(values, expected); + } + + #[test] + fn try_map_with_mask_in_place_all_ok() { + let mut values: Vec = (0..200).collect(); + let mask = BitBuffer::new_set(200); + let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| { + let scaled = v.wrapping_mul(valid as u32); + scaled.checked_mul(2) + }); + assert!(res.is_ok()); + let expected: Vec = (0..200u32).map(|v| v * 2).collect(); + assert_eq!(values, expected); + } + + #[test] + fn try_map_with_mask_in_place_first_failing_chunk_wins() { + let mut values: Vec = (0..200).collect(); + values[83] = u32::MAX; + values[150] = u32::MAX; + let mask = BitBuffer::new_set(200); + let res = + try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2)); + assert_eq!(res, Err(83)); + } + + #[test] + fn try_map_with_mask_in_place_within_chunk_reports_lowest() { + let mut values: Vec = (0..200).collect(); + values[80] = u32::MAX; + values[100] = u32::MAX; + let mask = BitBuffer::new_set(200); + let res = + try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2)); + assert_eq!(res, Err(80)); + } + + #[test] + fn try_map_with_mask_in_place_single_failure_lane_exact() { + let mut values: Vec = (0..200).collect(); + values[42] = u32::MAX; + let mask = BitBuffer::new_set(200); + let res = + try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2)); + assert_eq!(res, Err(42)); + } + + #[test] + fn try_map_with_mask_in_place_null_bypass() { + let mut values: Vec = (0..200).collect(); + values[5] = u32::MAX; + let mask = { + let mut m = BitBufferMut::with_capacity(200); + for i in 
0..200 { + m.append(i != 5); + } + m.freeze() + }; + let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| { + v.wrapping_mul(valid as u32).checked_mul(2) + }); + assert!(res.is_ok()); + assert_eq!(values[5], 0); + assert_eq!(values[6], 12); + } + + #[test] + fn try_map_with_mask_in_place_remainder_overflow() { + let mut values: Vec = (0..130).collect(); + values[129] = u32::MAX; + let mask = BitBuffer::new_set(130); + let res = + try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2)); + assert_eq!(res, Err(129)); + } + + #[test] + fn try_map_with_mask_in_place_sliced_mask() { + let big = BitBuffer::new_set(256); + let mask = big.slice(13..143); + assert_eq!(mask.len(), 130); + + let mut values: Vec = (0..130).collect(); + values[77] = u32::MAX; + let res = + try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2)); + assert_eq!(res, Err(77)); + } + + #[test] + fn try_map_with_mask_in_place_partial_chunk_success() { + let mut values: Vec = (0..130).collect(); + let mask = BitBuffer::new_set(130); + let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| Some(v + 1)); + assert!(res.is_ok()); + assert_eq!(values[0], 1); + assert_eq!(values[63], 64); + assert_eq!(values[64], 65); + assert_eq!(values[129], 130); + } + #[test] fn map_to_bits_aligned() { let values: Vec = (0..128).collect(); @@ -1252,7 +1357,9 @@ mod tests { m.freeze() }; let mut out = vec![0u64; 70usize.div_ceil(64)]; - map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| valid && v == 1); + map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| { + valid && v == 1 + }); for i in 0..70 { let bit = (out[i / 64] >> (i % 64)) & 1 == 1; assert_eq!(bit, i >= 32, "lane {i}"); diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs index a4519ac62ec..5fe7a4cf40d 100644 --- a/vortex-buffer/src/lib.rs +++ b/vortex-buffer/src/lib.rs @@ -52,7 +52,6 @@ pub use 
buffer::*; pub use buffer_mut::*; pub use bytes::*; pub use r#const::*; -pub use lane_ops::*; pub use string::*; mod alignment; #[cfg(feature = "arrow")] @@ -63,12 +62,6 @@ mod buffer_mut; mod bytes; mod r#const; mod debug; -mod lane_ops; -/// Indexed-source variant of [`lane_ops`]: takes an `IndexedSource` trait whose -/// implementations expose `unsafe fn get_unchecked(i) -> Item`. `&[T]` impls inline -/// to the same indexed load as the slice kernel, but the trait also admits binary -/// inputs via `LaneZip`. See `HISTORY.md` for the iterator-API investigation that -/// led to this design. pub mod lane_ops_indexed; mod macros; #[cfg(feature = "memmap2")] From 502a2861f11b842c4b85927a307e4782adb94415 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 11:22:36 +0100 Subject: [PATCH 04/21] f Signed-off-by: Joe Isaacs --- .../src/arrays/primitive/compute/cast.rs | 23 +++++-- vortex-buffer/benches/cast_to_indexed.rs | 63 +++++++++++++++++-- 2 files changed, 76 insertions(+), 10 deletions(-) diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs index dd5abc2f164..edb5ced01b9 100644 --- a/vortex-array/src/arrays/primitive/compute/cast.rs +++ b/vortex-array/src/arrays/primitive/compute/cast.rs @@ -4,6 +4,7 @@ use num_traits::NumCast; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; +use vortex_buffer::lane_ops_indexed::map_no_validity; use vortex_buffer::lane_ops_indexed::try_map_no_validity; use vortex_buffer::lane_ops_indexed::try_map_with_mask; use vortex_error::VortexResult; @@ -125,7 +126,6 @@ where T: NativePType, { let values = array.as_slice::(); - let mask = array.validity()?.execute_mask(array.len(), ctx)?; let overflow = || { vortex_err!( Compute: "Cannot cast {} to {} — value exceeds target range", @@ -133,6 +133,22 @@ where ) }; + // If this cast doesn't fail use the unchecked casting variant + let target_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable); + if 
cached_values_fit_in(array, &target_dtype) == Some(true) { + let mut buffer = BufferMut::::with_capacity(values.len()); + map_no_validity( + values, + &mut buffer.spare_capacity_mut()[..values.len()], + v.as_(), // |v| ::from(v).unwrap_or_default(), + ); + // SAFETY: map_no_validity initializes every lane. + unsafe { buffer.set_len(values.len()) }; + return Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array()); + } + + let mask = array.validity()?.execute_mask(array.len(), ctx)?; + let buffer: Buffer = match &mask { Mask::AllTrue(_) => { let mut buffer = BufferMut::::with_capacity(values.len()); @@ -159,10 +175,7 @@ where // path entirely, giving the same codegen as the maskless kernel. // For narrowing, `valid` is only read at lanes that actually // overflowed (a cold check on top of the cast). - |v, valid| { - ::from(v) - .or_else(|| (!valid).then(T::zero)) - }, + |v, valid| ::from(v).or_else(|| (!valid).then(T::zero)), ) .map_err(|_| overflow())?; // SAFETY: try_map_with_mask returned Ok, so it initialized every lane. diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs index d3baec7885c..1dfba41f8fd 100644 --- a/vortex-buffer/benches/cast_to_indexed.rs +++ b/vortex-buffer/benches/cast_to_indexed.rs @@ -34,7 +34,10 @@ const DATA_SEED: u64 = 0; const VALID_SEED: u64 = 1; struct Fixture { - values: Buffer, + /// u64 source for the narrowing-cast bench (`cast_lazy_validity`). + values_u64: Buffer, + /// u16 source for the widening-cast benches that compare closure forms. 
+ values_u16: Buffer, mask: BitBuffer, } @@ -46,7 +49,9 @@ fn fixture(n: usize) -> Fixture { .collect(); let raw_valid: Vec = (0..n).map(|_| valid_rng.random_bool(VALID_RATE)).collect(); - let values: Buffer = raw_values.iter().copied().collect(); + let values_u64: Buffer = raw_values.iter().copied().collect(); + #[expect(clippy::cast_possible_truncation)] + let values_u16: Buffer = raw_values.iter().map(|&v| v as u16).collect(); let mask = { let mut m = BitBufferMut::with_capacity(n); for &v in &raw_valid { @@ -55,9 +60,12 @@ fn fixture(n: usize) -> Fixture { m.freeze() }; - Fixture { values, mask } + Fixture { + values_u64, + values_u16, + mask, + } } - /// The kernel `cast.rs` uses in production: `try_map_with_mask` with a lazy-validity /// `or_else` closure. `NumCast::from(v)` is the cast; `or_else` only fires (and only /// then reads `valid`) when the cast itself returned `None`. @@ -69,7 +77,33 @@ fn cast_lazy_validity(bencher: Bencher, n: usize) { let mut out: Vec> = Vec::with_capacity(n); // SAFETY: every lane is written before any read inside the kernel. unsafe { out.set_len(n) }; - (f.values.clone(), f.mask.clone(), out) + (f.values_u64.clone(), f.mask.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + ::from(v).or_else(|| (!valid).then(u32::default)) + }) + .unwrap(); + }); +} + +// ----------------------------------------------------------------------------- +// Widening benches (u16 → u32). Compare closure forms on a statically-infallible +// cast to confirm the asm finding empirically: the `or_else` and `_valid` +// (maskless) closures should produce identical timings, since LLVM aliases the +// `or_else` function symbol directly to the maskless one (proven via +// `cargo rustc --emit=asm` — see the `asm_u16_u32_*` helpers above). 
+// ----------------------------------------------------------------------------- + +/// Widening with the `or_else` closure — the cast.rs shape. +#[divan::bench(args = SIZES)] +fn widen_u16_u32_or_else(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + unsafe { out.set_len(n) }; + (f.values_u16.clone(), f.mask.clone(), out) }) .bench_refs(|(values, mask, out)| { try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { @@ -78,3 +112,22 @@ fn cast_lazy_validity(bencher: Bencher, n: usize) { .unwrap(); }); } + +/// Widening with `_valid` ignored — the upper bound. Should match `or_else` per the +/// asm aliasing finding. +#[divan::bench(args = SIZES)] +fn widen_u16_u32_maskless(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + let mut out: Vec> = Vec::with_capacity(n); + unsafe { out.set_len(n) }; + (f.values_u16.clone(), f.mask.clone(), out) + }) + .bench_refs(|(values, mask, out)| { + try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, _valid| { + ::from(v) + }) + .unwrap(); + }); +} From 2f6df638d80c05107d7b488849ba0a22455692c5 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 12:42:49 +0100 Subject: [PATCH 05/21] f Signed-off-by: Joe Isaacs --- Cargo.lock | 3 + .../src/arrays/primitive/compute/cast.rs | 25 +- vortex-buffer/Cargo.toml | 4 + vortex-buffer/benches/cast_to_indexed.rs | 337 ++++++++++++++---- vortex-buffer/src/lane_ops_indexed.rs | 325 ++++++++++------- 5 files changed, 499 insertions(+), 195 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d29c91edf62..11afc6996a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9355,7 +9355,10 @@ dependencies = [ name = "vortex-buffer" version = "0.1.0" dependencies = [ + "arrow-array", "arrow-buffer", + "arrow-cast", + "arrow-schema", "bitvec", "bytes", "codspeed-divan-compat", diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs 
b/vortex-array/src/arrays/primitive/compute/cast.rs index edb5ced01b9..8cdd27cb5c5 100644 --- a/vortex-array/src/arrays/primitive/compute/cast.rs +++ b/vortex-array/src/arrays/primitive/compute/cast.rs @@ -1,6 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use num_traits::AsPrimitive; use num_traits::NumCast; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; @@ -122,7 +123,7 @@ fn cast_values( ctx: &mut ExecutionCtx, ) -> VortexResult where - F: NativePType, + F: NativePType + AsPrimitive, T: NativePType, { let values = array.as_slice::(); @@ -133,14 +134,30 @@ where ) }; - // If this cast doesn't fail use the unchecked casting variant + // Returns `true` if every value of `from` is representable in `to` without loss. + // + // Equivalent to `from.least_supertype(to) == Some(to)`, i.e. the value domain of `from` + // is a subset of `to`'s. This is the static-only check — it does not consult any array + // statistics. Used to short-circuit checked casts when the conversion is infallible by + // type alone (widening uint→uint, signed→signed, u8→i16, i32→f64, etc.). + fn casts_losslessly_to(from: PType, to: PType) -> bool { + from.least_supertype(to) == Some(to) + } + + // Skip the fallible kernel when the conversion is infallible by type alone (widening) or + // when cached min/max prove every value fits in `T`. let target_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable); - if cached_values_fit_in(array, &target_dtype) == Some(true) { + if casts_losslessly_to(F::PTYPE, T::PTYPE) + || cached_values_fit_in(array, &target_dtype) == Some(true) + { let mut buffer = BufferMut::::with_capacity(values.len()); + // Truncating `as`-cast — safe here because stats prove every valid value fits. + // Null lanes' underlying garbage gets truncated/wrapped (harmless: the result + // validity bitmap masks them downstream). 
map_no_validity( values, &mut buffer.spare_capacity_mut()[..values.len()], - v.as_(), // |v| ::from(v).unwrap_or_default(), + |v| v.as_(), ); // SAFETY: map_no_validity initializes every lane. unsafe { buffer.set_len(values.len()) }; diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml index 6490516f846..385efa36dcf 100644 --- a/vortex-buffer/Cargo.toml +++ b/vortex-buffer/Cargo.toml @@ -37,6 +37,10 @@ vortex-error = { workspace = true } workspace = true [dev-dependencies] +# arrow-* are used by the cast_to_indexed bench to compare against arrow-rs. +arrow-array = { workspace = true } +arrow-cast = { workspace = true } +arrow-schema = { workspace = true } divan = { workspace = true } num-traits = { workspace = true } rand = { workspace = true } diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs index 1dfba41f8fd..8349b47eb26 100644 --- a/vortex-buffer/benches/cast_to_indexed.rs +++ b/vortex-buffer/benches/cast_to_indexed.rs @@ -1,133 +1,338 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Focused bench for the **best fallible cast kernel** — what `cast.rs` actually uses -//! in `vortex-array/src/arrays/primitive/compute/cast.rs`. Single bench, no cross-impl -//! baselines: just a regression guard for the production cast hot path. -//! -//! The kernel: [`vortex_buffer::lane_ops_indexed::try_map_with_mask`] called with a -//! lazy-validity `or_else` closure — for statically-infallible casts (widening) LLVM -//! proves `NumCast::from` is always `Some`, the `or_else` branch is dead, and the -//! validity path is DCE'd. For fallible casts (narrowing), validity is only consulted -//! on the cold failure branch. +//! Coverage benchmark for the indexed lane-op variants used by primitive casts +//! and bit-packing paths. 
#![expect(clippy::unwrap_used)] use std::mem::MaybeUninit; +use arrow_array::UInt16Array; +use arrow_array::UInt64Array; +use arrow_buffer::NullBuffer; +use arrow_buffer::ScalarBuffer; +use arrow_cast::CastOptions; +use arrow_cast::cast_with_options; +use arrow_schema::DataType; use divan::Bencher; use num_traits::NumCast; use rand::SeedableRng; use rand::prelude::*; +use rand::rngs::StdRng; use vortex_buffer::BitBuffer; use vortex_buffer::BitBufferMut; use vortex_buffer::Buffer; +use vortex_buffer::lane_ops_indexed::map_no_validity; +use vortex_buffer::lane_ops_indexed::map_to_bits; +use vortex_buffer::lane_ops_indexed::map_with_mask; +use vortex_buffer::lane_ops_indexed::map_with_mask_in_place; +use vortex_buffer::lane_ops_indexed::map_with_mask_to_bits; +use vortex_buffer::lane_ops_indexed::try_map_no_validity; +use vortex_buffer::lane_ops_indexed::try_map_validity_filtered; use vortex_buffer::lane_ops_indexed::try_map_with_mask; +use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place; fn main() { divan::main(); } const SIZES: &[usize] = &[4_096, 65_536, 1_048_576]; -const VALID_RATE: f64 = 0.7; -const DATA_SEED: u64 = 0; -const VALID_SEED: u64 = 1; +const U32_THRESHOLD: u32 = u32::MAX / 2; struct Fixture { - /// u64 source for the narrowing-cast bench (`cast_lazy_validity`). values_u64: Buffer, - /// u16 source for the widening-cast benches that compare closure forms. + values_u64_invalid_overflows: Buffer, + values_u32: Buffer, + values_u32_small: Buffer, values_u16: Buffer, mask: BitBuffer, + /// `UInt64Array` baseline for arrow casts. Same values + validity as `values_u64` / `mask`. + arrow_u64: UInt64Array, + /// `UInt16Array` baseline. Same as `values_u16` / `mask`. 
+ arrow_u16: UInt16Array, } fn fixture(n: usize) -> Fixture { - let mut data_rng = StdRng::seed_from_u64(DATA_SEED); - let mut valid_rng = StdRng::seed_from_u64(VALID_SEED); + let mut rng = StdRng::seed_from_u64(0xC457_1D3E); + let raw_values: Vec = (0..n) - .map(|_| data_rng.random_range(0..u32::MAX as u64)) + .map(|_| rng.random_range(0..(u32::MAX as u64))) .collect(); - let raw_valid: Vec = (0..n).map(|_| valid_rng.random_bool(VALID_RATE)).collect(); + let raw_valid: Vec = (0..n).map(|_| rng.random_bool(0.8)).collect(); + + #[expect(clippy::cast_possible_truncation)] + let values_u16 = raw_values + .iter() + .copied() + .map(|v| v as u16) + .collect::>(); + + #[expect(clippy::cast_possible_truncation)] + let values_u32 = raw_values + .iter() + .copied() + .map(|v| v as u32) + .collect::>(); + + #[expect(clippy::cast_possible_truncation)] + let values_u32_small = raw_values + .iter() + .copied() + .map(|v| (v % ((u32::MAX as u64) / 2)) as u32) + .collect::>(); - let values_u64: Buffer = raw_values.iter().copied().collect(); + let values_u64_invalid_overflows = raw_values + .iter() + .copied() + .zip(raw_valid.iter().copied()) + .map(|(v, valid)| if valid { v } else { u64::MAX }) + .collect::>(); + + let arrow_u64 = UInt64Array::new( + ScalarBuffer::from(raw_values.clone()), + Some(NullBuffer::from(raw_valid.clone())), + ); #[expect(clippy::cast_possible_truncation)] - let values_u16: Buffer = raw_values.iter().map(|&v| v as u16).collect(); - let mask = { - let mut m = BitBufferMut::with_capacity(n); - for &v in &raw_valid { - m.append(v); - } - m.freeze() - }; + let raw_u16: Vec = raw_values.iter().map(|&v| v as u16).collect(); + let arrow_u16 = UInt16Array::new( + ScalarBuffer::from(raw_u16), + Some(NullBuffer::from(raw_valid.clone())), + ); Fixture { - values_u64, + values_u64: raw_values.into(), + values_u64_invalid_overflows, + values_u32, + values_u32_small, values_u16, - mask, + mask: BitBufferMut::from_iter(raw_valid).freeze(), + arrow_u64, + arrow_u16, } 
} -/// The kernel `cast.rs` uses in production: `try_map_with_mask` with a lazy-validity -/// `or_else` closure. `NumCast::from(v)` is the cast; `or_else` only fires (and only -/// then reads `valid`) when the cast itself returned `None`. + +const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions { + safe: false, + format_options: arrow_cast::display::FormatOptions::new(), +}; + +fn uninit_out(n: usize) -> Vec> { + let mut out = Vec::with_capacity(n); + // SAFETY: A `MaybeUninit` does not require initialization. + unsafe { + out.set_len(n); + } + out +} + #[divan::bench(args = SIZES)] -fn cast_lazy_validity(bencher: Bencher, n: usize) { +fn map_no_validity_widen_u16_u32(bencher: Bencher, n: usize) { let f = fixture(n); + bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - // SAFETY: every lane is written before any read inside the kernel. - unsafe { out.set_len(n) }; - (f.values_u64.clone(), f.mask.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { - ::from(v).or_else(|| (!valid).then(u32::default)) + .with_inputs(|| (f.values_u16.clone(), uninit_out::(n))) + .bench_values(|(values, mut out)| { + map_no_validity( + values.as_slice(), + out.as_mut_slice(), + >::from, + ); + out + }); +} + +#[divan::bench(args = SIZES)] +fn map_with_mask_widen_u16_u32_zero_nulls(bencher: Bencher, n: usize) { + let f = fixture(n); + + bencher + .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) + .bench_values(|(values, mask, mut out)| { + map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| { + >::from(v) * valid as u32 + }); + out + }); +} + +#[divan::bench(args = SIZES)] +fn try_map_no_validity_narrow_u64_u32(bencher: Bencher, n: usize) { + let f = fixture(n); + + bencher + .with_inputs(|| (f.values_u64.clone(), uninit_out::(n))) + .bench_values(|(values, mut out)| { + try_map_no_validity(values.as_slice(), out.as_mut_slice(), |v| { 
+ ::from(v) }) .unwrap(); + out }); } -// ----------------------------------------------------------------------------- -// Widening benches (u16 → u32). Compare closure forms on a statically-infallible -// cast to confirm the asm finding empirically: the `or_else` and `_valid` -// (maskless) closures should produce identical timings, since LLVM aliases the -// `or_else` function symbol directly to the maskless one (proven via -// `cargo rustc --emit=asm` — see the `asm_u16_u32_*` helpers above). -// ----------------------------------------------------------------------------- +/// `try_map_with_mask` with a closure that **ignores `valid`**. Tests whether +/// LLVM DCEs the per-lane `(src_chunk >> bit_idx) & 1` mask extract. Uses +/// non-overflowing `values_u64` so the closure-ignores-valid spurious-failure +/// case never triggers (would otherwise err on null-lane overflow). +#[divan::bench(args = SIZES)] +fn try_map_with_mask_narrow_u64_u32_ignoring_valid(bencher: Bencher, n: usize) { + let f = fixture(n); + + bencher + .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::(n))) + .bench_values(|(values, mask, mut out)| { + try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| { + ::from(v) + }) + .unwrap(); + out + }); +} -/// Widening with the `or_else` closure — the cast.rs shape. 
#[divan::bench(args = SIZES)] -fn widen_u16_u32_or_else(bencher: Bencher, n: usize) { +fn try_map_with_mask_narrow_u64_u32_lazy_validity(bencher: Bencher, n: usize) { let f = fixture(n); + bencher - .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values_u16.clone(), f.mask.clone(), out) - }) - .bench_refs(|(values, mask, out)| { - try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| { + .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::(n))) + .bench_values(|(values, mask, mut out)| { + try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| { ::from(v).or_else(|| (!valid).then(u32::default)) }) .unwrap(); + out }); } -/// Widening with `_valid` ignored — the upper bound. Should match `or_else` per the -/// asm aliasing finding. #[divan::bench(args = SIZES)] -fn widen_u16_u32_maskless(bencher: Bencher, n: usize) { +fn try_map_validity_filtered_narrow_u64_u32(bencher: Bencher, n: usize) { let f = fixture(n); + bencher .with_inputs(|| { - let mut out: Vec> = Vec::with_capacity(n); - unsafe { out.set_len(n) }; - (f.values_u16.clone(), f.mask.clone(), out) + ( + f.values_u64_invalid_overflows.clone(), + f.mask.clone(), + uninit_out::(n), + ) }) - .bench_refs(|(values, mask, out)| { - try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, _valid| { + .bench_values(|(values, mask, mut out)| { + try_map_validity_filtered(values.as_slice(), &mask, out.as_mut_slice(), |v| { ::from(v) }) .unwrap(); + out + }); +} + +#[divan::bench(args = SIZES)] +fn try_map_with_mask_widen_u16_u32_or_else(bencher: Bencher, n: usize) { + let f = fixture(n); + + bencher + .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) + .bench_values(|(values, mask, mut out)| { + try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| { + Some(>::from(v)).or_else(|| (!valid).then(u32::default)) + }) + .unwrap(); + out + }); +} + +#[divan::bench(args 
= SIZES)] +fn try_map_with_mask_widen_u16_u32_maskless(bencher: Bencher, n: usize) { + let f = fixture(n); + + bencher + .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) + .bench_values(|(values, mask, mut out)| { + try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| { + Some(>::from(v)) + }) + .unwrap(); + out }); } + +#[divan::bench(args = SIZES)] +fn map_with_mask_in_place_u32_zero_nulls(bencher: Bencher, n: usize) { + let f = fixture(n); + + bencher + .with_inputs(|| (f.values_u32.as_slice().to_vec(), f.mask.clone())) + .bench_values(|(mut values, mask)| { + map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| v * valid as u32); + values + }); +} + +#[divan::bench(args = SIZES)] +fn try_map_with_mask_in_place_u32_checked_mul(bencher: Bencher, n: usize) { + let f = fixture(n); + + bencher + .with_inputs(|| (f.values_u32_small.as_slice().to_vec(), f.mask.clone())) + .bench_values(|(mut values, mask)| { + try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2)) + .unwrap(); + values + }); +} + +#[divan::bench(args = SIZES)] +fn map_to_bits_u32_threshold(bencher: Bencher, n: usize) { + let f = fixture(n); + + bencher + .with_inputs(|| (f.values_u32.clone(), vec![0; n.div_ceil(64)])) + .bench_values(|(values, mut out)| { + map_to_bits(values.as_slice(), out.as_mut_slice(), |v| { + v >= U32_THRESHOLD + }); + out + }); +} + +#[divan::bench(args = SIZES)] +fn map_with_mask_to_bits_u32_threshold(bencher: Bencher, n: usize) { + let f = fixture(n); + + bencher + .with_inputs(|| { + ( + f.values_u32.clone(), + f.mask.clone(), + vec![0; n.div_ceil(64)], + ) + }) + .bench_values(|(values, mask, mut out)| { + map_with_mask_to_bits(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| { + valid && v >= U32_THRESHOLD + }); + out + }); +} + +// ----------------------------------------------------------------------------- +// Arrow-rs baselines. 
Two: one widening (u16 → u32, always succeeds) and one +// narrowing (u64 → u32, can fail). Each pairs with the cast variants above of +// matching direction. +// ----------------------------------------------------------------------------- + +#[divan::bench(args = SIZES)] +fn arrow_cast_widen_u16_u32(bencher: Bencher, _n: usize) { + let f = fixture(_n); + bencher + .with_inputs(|| f.arrow_u16.clone()) + .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap()); +} + +#[divan::bench(args = SIZES)] +fn arrow_cast_narrow_u64_u32(bencher: Bencher, _n: usize) { + let f = fixture(_n); + bencher + .with_inputs(|| f.arrow_u64.clone()) + .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap()); +} diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs index dfd2c41fd4a..4f7c42e4603 100644 --- a/vortex-buffer/src/lane_ops_indexed.rs +++ b/vortex-buffer/src/lane_ops_indexed.rs @@ -22,6 +22,52 @@ use std::mem::MaybeUninit; use crate::BitBuffer; +macro_rules! for_full_lanes { + ($base:expr, | $bit_idx:ident, $i:ident | $body:block) => { + for $bit_idx in 0..64 { + let $i = $base + $bit_idx; + $body + } + }; +} + +macro_rules! for_remainder_lanes { + ($base:expr, $remainder:expr, | $bit_idx:ident, $i:ident | $body:block) => { + for $bit_idx in 0..$remainder { + let $i = $base + $bit_idx; + $body + } + }; +} + +macro_rules! for_full_mask_lanes { + ($src_chunk:expr, $base:expr, | $bit_idx:ident, $i:ident, $valid:ident | $body:block) => { + for $bit_idx in 0..64 { + let $i = $base + $bit_idx; + let $valid = ($src_chunk >> $bit_idx) & 1 == 1; + $body + } + }; +} + +macro_rules! 
for_remainder_mask_lanes { + ( + $src_chunk:expr, + $base:expr, + $remainder:expr, | + $bit_idx:ident, + $i:ident, + $valid:ident | + $body:block + ) => { + for $bit_idx in 0..$remainder { + let $i = $base + $bit_idx; + let $valid = ($src_chunk >> $bit_idx) & 1 == 1; + $body + } + }; +} + /// A length-known source supporting unchecked indexed reads. /// /// Implemented for `&[T]` (with `T: Copy`) and for [`LaneZip`] over two `IndexedSource`s. @@ -159,40 +205,51 @@ where // Inner loop is fixed-size 64 with independent per-lane reads — no iterator // state, no cross-iteration dependency, so the auto-vectorizer can fuse // 64 indexed loads into vector loads. - for bit_idx in 0..64 { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; + for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| { // SAFETY: i < chunks_count * 64 <= len. let v = unsafe { values.get_unchecked(i) }; unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; - } + }); } if remainder != 0 { let src_chunk = chunks.remainder_bits(); let base = chunks_count * 64; - for bit_idx in 0..remainder { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; + for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| { // SAFETY: i < len. let v = unsafe { values.get_unchecked(i) }; unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; - } + }); } } /// Fallible variant of [`map_with_mask`]. `f` returns `Option`; `None` indicates a /// per-lane failure (e.g. range overflow on a narrowing cast). /// -/// The kernel does not short-circuit on the first failure inside a chunk: it processes -/// whole 64-lane chunks with `is_none()` flags OR-reduced into a single accumulator, -/// then checks after each chunk. On failure, a cold scalar attribution pass replays the -/// closure over that chunk to identify the first failing lane. The hot loop stays -/// autovectorizable — the per-lane cost is one OR on top of the cast. 
+/// **Null-lane failures are filtered automatically.** If a null lane's stored value +/// causes `f(v, false)` to return `None`, the kernel does *not* propagate that as +/// `Err` — the cold attribution pass skips lanes where the mask bit is `0`. The +/// closure may also explicitly suppress null-lane failures by branching on `valid` +/// itself; both behaviors compose, with the kernel's filter as a safety net. +/// +/// ## Hot loop +/// +/// Per-lane `is_none()` flags are OR-reduced into a single `u64` (just bit 0). +/// When the closure ignores `valid`, LLVM DCEs the per-lane mask extract +/// `(src_chunk >> bit_idx) & 1` entirely — the inner loop becomes pure value +/// computation with no mask traffic. When the closure uses `valid`, the bit is +/// passed through and the closure threads validity normally. /// -/// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` write -/// `R::default()` into `out`, but the contents of `out` must not be relied upon when -/// this function returns `Err`. +/// ## Cold attribution +/// +/// On `fail_acc != 0`, [`cold_first_valid_failure`] walks the chunk filtering by +/// mask and returns either `Some(first_valid_failure_index)` or `None` (all +/// failures were at null lanes — the kernel continues). Not autovectorized; runs +/// at most once per failing chunk. +/// +/// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` +/// write `R::default()` into `out`, but the contents of `out` must not be relied +/// upon when this function returns `Err`. /// /// # Panics /// @@ -219,11 +276,11 @@ where for (chunk_idx, src_chunk) in chunks.iter().enumerate() { let base = chunk_idx * 64; - // Per-chunk accumulator — does not escape the SIMD inner loop. + // Per-chunk accumulator — just bit 0. When the closure ignores `valid`, + // the per-lane `(src_chunk >> bit_idx) & 1` is dead code and LLVM removes + // it, leaving a value-only SIMD body. 
let mut fail_acc: u64 = 0; - for bit_idx in 0..64 { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; + for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| { // SAFETY: i < chunks_count * 64 <= len. let v = unsafe { values.get_unchecked(i) }; let opt = f(v, bit); @@ -231,9 +288,14 @@ where let r = opt.unwrap_or_default(); // SAFETY: i < len. unsafe { out.get_unchecked_mut(i).write(r) }; - } + }); if fail_acc != 0 { - return Err(attribute_failure(&values, src_chunk, base, 64, &mut f)); + if let Some(idx) = + cold_first_valid_failure(&values, src_chunk, base, 64, &mut f) + { + return Err(idx); + } + // All failures were at null lanes — continue (rescue). } } @@ -241,9 +303,7 @@ where let src_chunk = chunks.remainder_bits(); let base = chunks_count * 64; let mut fail_acc: u64 = 0; - for bit_idx in 0..remainder { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; + for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| { // SAFETY: i < len. let v = unsafe { values.get_unchecked(i) }; let opt = f(v, bit); @@ -251,11 +311,13 @@ where let r = opt.unwrap_or_default(); // SAFETY: i < len. unsafe { out.get_unchecked_mut(i).write(r) }; - } + }); if fail_acc != 0 { - return Err(attribute_failure( - &values, src_chunk, base, remainder, &mut f, - )); + if let Some(idx) = + cold_first_valid_failure(&values, src_chunk, base, remainder, &mut f) + { + return Err(idx); + } } } @@ -289,22 +351,20 @@ where for chunk_idx in 0..chunks_count { let base = chunk_idx * 64; - for bit_idx in 0..64 { - let i = base + bit_idx; + for_full_lanes!(base, |bit_idx, i| { // SAFETY: i < chunks_count * 64 <= len. let v = unsafe { values.get_unchecked(i) }; unsafe { out.get_unchecked_mut(i).write(f(v)) }; - } + }); } if remainder != 0 { let base = chunks_count * 64; - for bit_idx in 0..remainder { - let i = base + bit_idx; + for_remainder_lanes!(base, remainder, |bit_idx, i| { // SAFETY: i < len. 
let v = unsafe { values.get_unchecked(i) }; unsafe { out.get_unchecked_mut(i).write(f(v)) }; - } + }); } } @@ -348,8 +408,7 @@ where for chunk_idx in 0..chunks_count { let base = chunk_idx * 64; let mut fail_acc: u64 = 0; - for bit_idx in 0..64 { - let i = base + bit_idx; + for_full_lanes!(base, |bit_idx, i| { // SAFETY: i < chunks_count * 64 <= len. let v = unsafe { values.get_unchecked(i) }; let opt = f(v); @@ -357,7 +416,7 @@ where let r = opt.unwrap_or_default(); // SAFETY: i < len. unsafe { out.get_unchecked_mut(i).write(r) }; - } + }); if fail_acc != 0 { return Err(attribute_failure_no_mask(&values, base, 64, &mut f)); } @@ -366,8 +425,7 @@ where if remainder != 0 { let base = chunks_count * 64; let mut fail_acc: u64 = 0; - for bit_idx in 0..remainder { - let i = base + bit_idx; + for_remainder_lanes!(base, remainder, |bit_idx, i| { // SAFETY: i < len. let v = unsafe { values.get_unchecked(i) }; let opt = f(v); @@ -375,7 +433,7 @@ where let r = opt.unwrap_or_default(); // SAFETY: i < len. unsafe { out.get_unchecked_mut(i).write(r) }; - } + }); if fail_acc != 0 { return Err(attribute_failure_no_mask(&values, base, remainder, &mut f)); } @@ -433,8 +491,7 @@ where for (chunk_idx, mask_chunk) in chunks.iter().enumerate() { let base = chunk_idx * 64; let mut fail_bits: u64 = 0; - for bit_idx in 0..64 { - let i = base + bit_idx; + for_full_lanes!(base, |bit_idx, i| { // SAFETY: i < chunks_count * 64 <= len. let v = unsafe { values.get_unchecked(i) }; let opt = f(v); @@ -444,7 +501,7 @@ where let r = opt.unwrap_or_default(); // SAFETY: i < len. unsafe { out.get_unchecked_mut(i).write(r) }; - } + }); // Filter failures to those at VALID lanes only. Null-lane failures vanish. 
let valid_failures = fail_bits & mask_chunk; if valid_failures != 0 { @@ -456,8 +513,7 @@ where let mask_chunk = chunks.remainder_bits(); let base = chunks_count * 64; let mut fail_bits: u64 = 0; - for bit_idx in 0..remainder { - let i = base + bit_idx; + for_remainder_lanes!(base, remainder, |bit_idx, i| { // SAFETY: i < len. let v = unsafe { values.get_unchecked(i) }; let opt = f(v); @@ -465,7 +521,7 @@ where let r = opt.unwrap_or_default(); // SAFETY: i < len. unsafe { out.get_unchecked_mut(i).write(r) }; - } + }); let valid_failures = fail_bits & mask_chunk; if valid_failures != 0 { return Err(base + valid_failures.trailing_zeros() as usize); @@ -475,32 +531,47 @@ where Ok(()) } -/// Cold attribution for the no-mask variant. +/// Shared cold scan: walks a chunk, returns the first lane index where +/// `lane_fails(bit_idx, value)` returns `true`. Used by both +/// [`attribute_failure`] and [`attribute_failure_no_mask`] via thin wrappers. +/// +/// Caller guarantees `base + chunk_len <= values.len()`. #[cold] #[inline(never)] -fn attribute_failure_no_mask(values: &S, base: usize, chunk_len: usize, f: &mut F) -> usize +fn cold_scan( + values: &S, + base: usize, + chunk_len: usize, + mut lane_fails: impl FnMut(usize /* bit_idx */, S::Item) -> bool, +) -> usize where S: IndexedSource, - F: FnMut(S::Item) -> Option, { for bit_idx in 0..chunk_len { let i = base + bit_idx; // SAFETY: caller guarantees i < values.len(). let v = unsafe { values.get_unchecked(i) }; - if f(v).is_none() { + if lane_fails(bit_idx, v) { return i; } } - unreachable!("attribute_failure_no_mask called without a failing lane") + unreachable!("cold_scan called without a failing lane") } -/// Cold path: identify the first lane in a chunk where `f` returned `None`. -/// -/// Called only after the hot loop has detected that at least one lane failed. 
-/// Walks the chunk scalar-style; not autovectorized, but that's fine — it only -/// runs once per error and the error path is supposed to be exceptional. -#[cold] -#[inline(never)] +/// Cold attribution for the no-mask variant. Replays `f` over the chunk to find +/// the first lane that returns `None`. +#[inline] +fn attribute_failure_no_mask(values: &S, base: usize, chunk_len: usize, f: &mut F) -> usize +where + S: IndexedSource, + F: FnMut(S::Item) -> Option, +{ + cold_scan(values, base, chunk_len, |_bit_idx, v| f(v).is_none()) +} + +/// Cold attribution for the mask variant. Replays `f` over the chunk, passing +/// each lane's validity bit, and returns the first lane where `f` returned `None`. +#[inline] fn attribute_failure( values: &S, src_chunk: u64, @@ -512,17 +583,9 @@ where S: IndexedSource, F: FnMut(S::Item, bool) -> Option, { - for bit_idx in 0..chunk_len { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: caller guarantees base + chunk_len <= values.len(). - let v = unsafe { values.get_unchecked(i) }; - if f(v, bit).is_none() { - return i; - } - } - // Unreachable: hot loop's OR-reduction said at least one lane in [base, base+chunk_len) failed. - unreachable!("attribute_failure called without a failing lane") + cold_scan(values, base, chunk_len, |bit_idx, v| { + f(v, (src_chunk >> bit_idx) & 1 == 1).is_none() + }) } /// In-place variant of [`map_with_mask`]. Each lane is replaced with @@ -546,29 +609,25 @@ where for (chunk_idx, src_chunk) in chunks.iter().enumerate() { let base = chunk_idx * 64; - for bit_idx in 0..64 { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; + for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| { // SAFETY: i < chunks_count * 64 <= len. let v = unsafe { values.get_unchecked(i) }; let r = f(v, bit); // SAFETY: i < len. 
unsafe { values.set_unchecked(i, r) }; - } + }); } if remainder != 0 { let src_chunk = chunks.remainder_bits(); let base = chunks_count * 64; - for bit_idx in 0..remainder { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; + for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| { // SAFETY: i < len. let v = unsafe { values.get_unchecked(i) }; let r = f(v, bit); // SAFETY: i < len. unsafe { values.set_unchecked(i, r) }; - } + }); } } @@ -619,49 +678,69 @@ where let remainder = len % 64; for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - let base = chunk_idx * 64; - let mut first_fail: u32 = u32::MAX; - for bit_idx in 0..64 { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: i < chunks_count * 64 <= len. - let v = unsafe { values.get_unchecked(i) }; - let opt = f(v, bit); - let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; - first_fail = first_fail.min(candidate); - let r = opt.unwrap_or_default(); - // SAFETY: i < len. - unsafe { values.set_unchecked(i, r) }; - } - if first_fail != u32::MAX { - return Err(first_fail as usize); + // `count = 64` is a literal; `#[inline(always)]` on the helper inlines its body + // into this loop and the compiler propagates 64 into the inner `0..count` bound, + // unrolling exactly as `for_full_mask_lanes!` would. + if let Some(failing) = + try_inplace_chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) + { + return Err(failing as usize); } } if remainder != 0 { - let src_chunk = chunks.remainder_bits(); - let base = chunks_count * 64; - let mut first_fail: u32 = u32::MAX; - for bit_idx in 0..remainder { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: i < len. - let v = unsafe { values.get_unchecked(i) }; - let opt = f(v, bit); - let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; - first_fail = first_fail.min(candidate); - let r = opt.unwrap_or_default(); - // SAFETY: i < len. 
- unsafe { values.set_unchecked(i, r) }; - } - if first_fail != u32::MAX { - return Err(first_fail as usize); + // Runtime `count = remainder` — same shape as the prior remainder loop. + if let Some(failing) = try_inplace_chunk( + &mut values, + chunks.remainder_bits(), + chunks_count * 64, + remainder, + &mut f, + ) { + return Err(failing as usize); } } Ok(()) } +/// Per-chunk worker for [`try_map_with_mask_in_place`]. Body written once; the kernel +/// calls this twice (with `count = 64` for full chunks, `count = remainder` for the +/// tail). `#[inline(always)]` so the const-64 unroll for the full-chunk callers is +/// preserved. +/// +/// Returns `Some(first_failing_lane_index_as_u32)` if any lane in `[base, base+count)` +/// failed (cast width-truncated since `i < 2^32` in any realistic batch), else `None`. +#[inline(always)] +#[allow(clippy::cast_possible_truncation)] +fn try_inplace_chunk( + values: &mut S, + src_chunk: u64, + base: usize, + count: usize, + f: &mut F, +) -> Option +where + S: IndexedSink, + S::Item: Default, + F: FnMut(S::Item, bool) -> Option, +{ + let mut first_fail: u32 = u32::MAX; + for bit_idx in 0..count { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees `base + count <= values.len()`. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v, bit); + let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; + first_fail = first_fail.min(candidate); + let r = opt.unwrap_or_default(); + // SAFETY: same as above. + unsafe { values.set_unchecked(i, r) }; + } + (first_fail != u32::MAX).then_some(first_fail) +} + /// Apply `f(value) -> bool` lane-by-lane, packing into `out` as `u64` words. /// /// This is the validity-free sibling of [`map_with_mask_to_bits`]. 
Use it when the @@ -694,11 +773,11 @@ where for chunk_idx in 0..chunks_count { let base = chunk_idx * 64; let mut packed = 0u64; - for bit_idx in 0..64 { + for_full_lanes!(base, |bit_idx, i| { // SAFETY: base + bit_idx < chunks_count * 64 <= len. - let v = unsafe { values.get_unchecked(base + bit_idx) }; + let v = unsafe { values.get_unchecked(i) }; packed |= (f(v) as u64) << bit_idx; - } + }); // SAFETY: chunk_idx < chunks_count <= out.len(). unsafe { *out.get_unchecked_mut(chunk_idx) = packed }; } @@ -706,11 +785,11 @@ where if remainder != 0 { let base = chunks_count * 64; let mut packed = 0u64; - for bit_idx in 0..remainder { + for_remainder_lanes!(base, remainder, |bit_idx, i| { // SAFETY: base + bit_idx < len. - let v = unsafe { values.get_unchecked(base + bit_idx) }; + let v = unsafe { values.get_unchecked(i) }; packed |= (f(v) as u64) << bit_idx; - } + }); // SAFETY: chunks_count < out.len() because remainder != 0. unsafe { *out.get_unchecked_mut(chunks_count) = packed }; } @@ -745,13 +824,11 @@ where for (chunk_idx, src_chunk) in chunks.iter().enumerate() { let base = chunk_idx * 64; let mut packed = 0u64; - for bit_idx in 0..64 { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; + for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| { // SAFETY: i < chunks_count * 64 <= len. let v = unsafe { values.get_unchecked(i) }; packed |= (f(v, bit) as u64) << bit_idx; - } + }); // SAFETY: chunk_idx < chunks_count <= out.len(). unsafe { *out.get_unchecked_mut(chunk_idx) = packed }; } @@ -760,13 +837,11 @@ where let src_chunk = chunks.remainder_bits(); let base = chunks_count * 64; let mut packed = 0u64; - for bit_idx in 0..remainder { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; + for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| { // SAFETY: i < len. 
let v = unsafe { values.get_unchecked(i) }; packed |= (f(v, bit) as u64) << bit_idx; - } + }); // SAFETY: chunks_count < out.len() because remainder != 0. unsafe { *out.get_unchecked_mut(chunks_count) = packed }; } From 769a2583e62ef29bfa1becd106e1b9c46c213bad Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 13:19:11 +0100 Subject: [PATCH 06/21] f Signed-off-by: Joe Isaacs --- Cargo.lock | 1 + vortex-buffer/Cargo.toml | 7 +- vortex-buffer/benches/cast_to_indexed.rs | 9 +- vortex-buffer/src/lane_ops_indexed.rs | 198 +++++------------------ 4 files changed, 56 insertions(+), 159 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 11afc6996a2..9bb032d0d35 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9355,6 +9355,7 @@ dependencies = [ name = "vortex-buffer" version = "0.1.0" dependencies = [ + "arrow-arith", "arrow-array", "arrow-buffer", "arrow-cast", diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml index 385efa36dcf..882de199818 100644 --- a/vortex-buffer/Cargo.toml +++ b/vortex-buffer/Cargo.toml @@ -37,7 +37,8 @@ vortex-error = { workspace = true } workspace = true [dev-dependencies] -# arrow-* are used by the cast_to_indexed bench to compare against arrow-rs. +# arrow-* are used by cast_to_indexed / add_checked benches to compare against arrow-rs. 
+arrow-arith = { workspace = true } arrow-array = { workspace = true } arrow-cast = { workspace = true } arrow-schema = { workspace = true } @@ -57,3 +58,7 @@ harness = false [[bench]] name = "cast_to_indexed" harness = false + +[[bench]] +name = "add_checked" +harness = false diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs index 8349b47eb26..848f50cd142 100644 --- a/vortex-buffer/benches/cast_to_indexed.rs +++ b/vortex-buffer/benches/cast_to_indexed.rs @@ -29,7 +29,6 @@ use vortex_buffer::lane_ops_indexed::map_with_mask; use vortex_buffer::lane_ops_indexed::map_with_mask_in_place; use vortex_buffer::lane_ops_indexed::map_with_mask_to_bits; use vortex_buffer::lane_ops_indexed::try_map_no_validity; -use vortex_buffer::lane_ops_indexed::try_map_validity_filtered; use vortex_buffer::lane_ops_indexed::try_map_with_mask; use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place; @@ -205,8 +204,12 @@ fn try_map_with_mask_narrow_u64_u32_lazy_validity(bencher: Bencher, n: usize) { }); } +/// Migrated from the old `try_map_validity_filtered` bench: same inputs (null +/// lanes contain overflowing values) and same correctness expectation (no Err), +/// but now driven through the merged `try_map_with_mask` with a `|v, _|` closure. +/// Hot loop is value-only via DCE; per-chunk `fail_bits & src_chunk` drops null-lane failures.
#[divan::bench(args = SIZES)] -fn try_map_validity_filtered_narrow_u64_u32(bencher: Bencher, n: usize) { +fn try_map_with_mask_narrow_u64_u32_value_only_filtered(bencher: Bencher, n: usize) { let f = fixture(n); bencher @@ -218,7 +221,7 @@ fn try_map_validity_filtered_narrow_u64_u32(bencher: Bencher, n: usize) { ) }) .bench_values(|(values, mask, mut out)| { - try_map_validity_filtered(values.as_slice(), &mask, out.as_mut_slice(), |v| { + try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| { ::from(v) }) .unwrap(); diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs index 4f7c42e4603..47887b92810 100644 --- a/vortex-buffer/src/lane_ops_indexed.rs +++ b/vortex-buffer/src/lane_ops_indexed.rs @@ -228,24 +228,25 @@ where /// /// **Null-lane failures are filtered automatically.** If a null lane's stored value /// causes `f(v, false)` to return `None`, the kernel does *not* propagate that as -/// `Err` — the cold attribution pass skips lanes where the mask bit is `0`. The -/// closure may also explicitly suppress null-lane failures by branching on `valid` -/// itself; both behaviors compose, with the kernel's filter as a safety net. +/// `Err`. The per-lane `is_none()` flags are bit-packed into a `u64` at the lane's +/// position, then ANDed with the chunk's validity bitmap — null-lane bits vanish. +/// The closure may also explicitly suppress null-lane failures by branching on +/// `valid` itself; both behaviors compose. /// /// ## Hot loop /// -/// Per-lane `is_none()` flags are OR-reduced into a single `u64` (just bit 0). -/// When the closure ignores `valid`, LLVM DCEs the per-lane mask extract -/// `(src_chunk >> bit_idx) & 1` entirely — the inner loop becomes pure value -/// computation with no mask traffic. When the closure uses `valid`, the bit is -/// passed through and the closure threads validity normally. +/// `fail_bits |= (opt.is_none() as u64) << bit_idx`. 
After unrolling, `bit_idx` is a +/// compile-time constant per-iteration, so the shift folds. The closure receives +/// `(value, valid)`; LLVM DCEs the per-lane `(src_chunk >> bit_idx) & 1` extract +/// when the closure ignores `valid`, leaving a value-only SIMD body. /// -/// ## Cold attribution +/// ## Attribution /// -/// On `fail_acc != 0`, [`cold_first_valid_failure`] walks the chunk filtering by -/// mask and returns either `Some(first_valid_failure_index)` or `None` (all -/// failures were at null lanes — the kernel continues). Not autovectorized; runs -/// at most once per failing chunk. +/// `valid_failures = fail_bits & src_chunk` — non-zero only when at least one +/// valid lane failed. `trailing_zeros()` gives the first failing valid lane. +/// **No cold replay**: failure detection and lane attribution happen entirely in +/// the hot loop. Worst-case bounded per chunk regardless of how many null lanes +/// returned `None`. /// /// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` /// write `R::default()` into `out`, but the contents of `out` must not be relied @@ -276,48 +277,45 @@ where for (chunk_idx, src_chunk) in chunks.iter().enumerate() { let base = chunk_idx * 64; - // Per-chunk accumulator — just bit 0. When the closure ignores `valid`, - // the per-lane `(src_chunk >> bit_idx) & 1` is dead code and LLVM removes - // it, leaving a value-only SIMD body. - let mut fail_acc: u64 = 0; + // Bit-pack per-lane fails into a u64 at lane-position. `bit_idx` is a + // compile-time constant after unrolling, so the shift folds. The + // `src_chunk` here is the validity bitmap for this chunk; the closure + // still gets `bit` per lane — LLVM DCEs the per-lane mask extract if + // the closure ignores it. + let mut fail_bits: u64 = 0; for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| { // SAFETY: i < chunks_count * 64 <= len. 
let v = unsafe { values.get_unchecked(i) }; let opt = f(v, bit); - fail_acc |= opt.is_none() as u64; + fail_bits |= (opt.is_none() as u64) << bit_idx; let r = opt.unwrap_or_default(); // SAFETY: i < len. unsafe { out.get_unchecked_mut(i).write(r) }; }); - if fail_acc != 0 { - if let Some(idx) = - cold_first_valid_failure(&values, src_chunk, base, 64, &mut f) - { - return Err(idx); - } - // All failures were at null lanes — continue (rescue). + // Drop null-lane failures: only failures at lanes the mask marks as + // valid count. Direct attribution via trailing_zeros — no cold replay. + let valid_failures = fail_bits & src_chunk; + if valid_failures != 0 { + return Err(base + valid_failures.trailing_zeros() as usize); } } if remainder != 0 { let src_chunk = chunks.remainder_bits(); let base = chunks_count * 64; - let mut fail_acc: u64 = 0; + let mut fail_bits: u64 = 0; for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| { // SAFETY: i < len. let v = unsafe { values.get_unchecked(i) }; let opt = f(v, bit); - fail_acc |= opt.is_none() as u64; + fail_bits |= (opt.is_none() as u64) << bit_idx; let r = opt.unwrap_or_default(); // SAFETY: i < len. unsafe { out.get_unchecked_mut(i).write(r) }; }); - if fail_acc != 0 { - if let Some(idx) = - cold_first_valid_failure(&values, src_chunk, base, remainder, &mut f) - { - return Err(idx); - } + let valid_failures = fail_bits & src_chunk; + if valid_failures != 0 { + return Err(base + valid_failures.trailing_zeros() as usize); } } @@ -330,7 +328,7 @@ where /// /// For nullable inputs where the closure is infallible (no overflow / no error /// branch), prefer [`map_with_mask`]; for nullable inputs with a fallible -/// closure, prefer [`try_map_validity_filtered`] — both correctly suppress +/// closure, prefer [`try_map_with_mask`] — both correctly suppress /// null-lane logic. This kernel exists for the narrow "no validity exists" /// case (non-nullable column, internal pipelines, etc.). 
/// @@ -374,7 +372,7 @@ where /// # Use this only for non-nullable inputs. /// /// For nullable inputs with a fallible closure, use -/// [`try_map_validity_filtered`] — it has the same value-only closure shape +/// [`try_map_with_mask`] — a `|v, _|` closure keeps the value-only hot loop /// (and the same perf win) but **correctly suppresses null-lane failures** /// via per-chunk `fail_bits & mask_chunk`. /// @@ -442,98 +440,9 @@ where Ok(()) } -/// Fallible value-only map with **chunk-level validity filtering**: closure is -/// `|v| -> Option`, no validity threaded through the inner loop. After each -/// 64-lane chunk, per-lane failure bits are ANDed against the mask chunk, so -/// failures at null lanes do **not** propagate as `Err`. -/// -/// This is the correct shape for "checked cast that respects validity" — a null -/// row whose stored value would overflow does **not** cause `Err`. It also -/// preserves the perf win of the value-only closure: the hot loop has no per-lane -/// mask extract, no `valid`-dependent branch. -/// -/// ## Inner-loop trick -/// -/// Per-lane fails are packed into a `u64` via `fail_bits |= (is_none as u64) << bit_idx`. -/// The shift amount is loop-invariant after unrolling (since `bit_idx` is the -/// compile-time loop counter), so the autovectorizer can issue 64 sequential -/// value reads + closure applications + packed-bit ORs as a vector pipeline. -/// -/// ## Attribution -/// -/// On failure, `valid_failures = fail_bits & mask_chunk` is non-zero; the lowest -/// set bit is the first failing valid lane. `trailing_zeros()` reads it out -/// directly — no cold replay path, no second pass. -/// -/// # Panics -/// -/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
-#[inline] -pub fn try_map_validity_filtered( - values: S, - mask: &BitBuffer, - out: &mut [MaybeUninit], - mut f: F, -) -> Result<(), usize> -where - S: IndexedSource, - R: Copy + Default, - F: FnMut(S::Item) -> Option, -{ - let len = values.len(); - assert_eq!(len, mask.len(), "values and mask must have the same length"); - assert_eq!(out.len(), len, "out must have the same length as values"); - - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; - - for (chunk_idx, mask_chunk) in chunks.iter().enumerate() { - let base = chunk_idx * 64; - let mut fail_bits: u64 = 0; - for_full_lanes!(base, |bit_idx, i| { - // SAFETY: i < chunks_count * 64 <= len. - let v = unsafe { values.get_unchecked(i) }; - let opt = f(v); - // Pack failure bit at the lane's position. After unrolling, `bit_idx` - // is a compile-time constant per-iteration, so the shift is folded. - fail_bits |= (opt.is_none() as u64) << bit_idx; - let r = opt.unwrap_or_default(); - // SAFETY: i < len. - unsafe { out.get_unchecked_mut(i).write(r) }; - }); - // Filter failures to those at VALID lanes only. Null-lane failures vanish. - let valid_failures = fail_bits & mask_chunk; - if valid_failures != 0 { - return Err(base + valid_failures.trailing_zeros() as usize); - } - } - - if remainder != 0 { - let mask_chunk = chunks.remainder_bits(); - let base = chunks_count * 64; - let mut fail_bits: u64 = 0; - for_remainder_lanes!(base, remainder, |bit_idx, i| { - // SAFETY: i < len. - let v = unsafe { values.get_unchecked(i) }; - let opt = f(v); - fail_bits |= (opt.is_none() as u64) << bit_idx; - let r = opt.unwrap_or_default(); - // SAFETY: i < len. 
- unsafe { out.get_unchecked_mut(i).write(r) }; - }); - let valid_failures = fail_bits & mask_chunk; - if valid_failures != 0 { - return Err(base + valid_failures.trailing_zeros() as usize); - } - } - - Ok(()) -} - /// Shared cold scan: walks a chunk, returns the first lane index where -/// `lane_fails(bit_idx, value)` returns `true`. Used by both -/// [`attribute_failure`] and [`attribute_failure_no_mask`] via thin wrappers. +/// `lane_fails(bit_idx, value)` returns `true`. Used by +/// [`attribute_failure_no_mask`]. /// /// Caller guarantees `base + chunk_len <= values.len()`. #[cold] @@ -569,25 +478,6 @@ where cold_scan(values, base, chunk_len, |_bit_idx, v| f(v).is_none()) } -/// Cold attribution for the mask variant. Replays `f` over the chunk, passing -/// each lane's validity bit, and returns the first lane where `f` returned `None`. -#[inline] -fn attribute_failure( - values: &S, - src_chunk: u64, - base: usize, - chunk_len: usize, - f: &mut F, -) -> usize -where - S: IndexedSource, - F: FnMut(S::Item, bool) -> Option, -{ - cold_scan(values, base, chunk_len, |bit_idx, v| { - f(v, (src_chunk >> bit_idx) & 1 == 1).is_none() - }) -} - /// In-place variant of [`map_with_mask`]. Each lane is replaced with /// `f(values[i], mask[i])`. The source `S` must be writable (an [`IndexedSink`]). /// @@ -681,8 +571,7 @@ where // `count = 64` is a literal; `#[inline(always)]` on the helper inlines its body // into this loop and the compiler propagates 64 into the inner `0..count` bound, // unrolling exactly as `for_full_mask_lanes!` would. - if let Some(failing) = - try_inplace_chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) + if let Some(failing) = try_inplace_chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) { return Err(failing as usize); } @@ -1058,10 +947,9 @@ mod tests { } #[test] - fn try_map_validity_filtered_null_lane_overflow_does_not_err() { - // Null lane with a value that would overflow MUST NOT cause Err. 
- // The closure is value-only — the mask filters the null-lane failure - // at the chunk boundary. + // `|v, _|` closure that ignores validity. A null lane with an overflowing + // value MUST NOT cause Err — the kernel's `fail_bits & src_chunk` filter rescues us. let mut values: Vec = (0..200).collect(); values[5] = u64::MAX; // null lane with overflowing value values[42] = u64::MAX; // null lane with overflowing value @@ -1073,17 +961,17 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_validity_filtered(values.as_slice(), &mask, &mut out, |v| { + let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, _valid| { (v <= u32::MAX as u64).then_some(v as u32) }); assert!( res.is_ok(), - "null-lane overflow should not propagate as Err" + "null-lane overflow should be filtered by the cold path" ); } #[test] - fn try_map_validity_filtered_valid_overflow_does_err_with_first_index() { + fn try_map_with_mask_value_only_closure_reports_first_valid_failure() { // Valid lane overflow must propagate — and the reported index must be // the lowest VALID failing lane, even if earlier null lanes also "failed" // their unconditional cast.
@@ -1100,7 +988,7 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_validity_filtered(values.as_slice(), &mask, &mut out, |v| { + let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, _valid| { (v <= u32::MAX as u64).then_some(v as u32) }); assert_eq!(res, Err(77)); From 3a30290f33b31bf54ae2eb92b97536df5f61abd1 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 14:05:45 +0100 Subject: [PATCH 07/21] f Signed-off-by: Joe Isaacs --- vortex-buffer/Cargo.toml | 4 + vortex-buffer/benches/add_checked.rs | 676 ++++++++++++++++++++++++ vortex-buffer/benches/pack_vs_unpack.rs | 389 ++++++++++++++ vortex-buffer/src/lane_ops_indexed.rs | 464 ++++++++-------- 4 files changed, 1297 insertions(+), 236 deletions(-) create mode 100644 vortex-buffer/benches/add_checked.rs create mode 100644 vortex-buffer/benches/pack_vs_unpack.rs diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml index 882de199818..048d2612364 100644 --- a/vortex-buffer/Cargo.toml +++ b/vortex-buffer/Cargo.toml @@ -62,3 +62,7 @@ harness = false [[bench]] name = "add_checked" harness = false + +[[bench]] +name = "pack_vs_unpack" +harness = false diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs new file mode 100644 index 00000000000..df857922d6f --- /dev/null +++ b/vortex-buffer/benches/add_checked.rs @@ -0,0 +1,676 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Checked `u32 + u32 -> u32` over two nullable columns — exhaustive variant +//! comparison. +//! +//! Variants differ along three axes: +//! +//! 1. **Closure suppression strategy** — how the closure (if any) handles null lanes +//! - `value_only`: `|(a,b), _|` ignores validity +//! - `if_else`: `|(a,b), valid| if valid { ... } else { Some(default) }` +//! - `or_else`: `|(a,b), valid| ....or_else(|| (!valid).then(...))` +//! 
- `mul_trick`: `(a * valid as u32).checked_add(b * valid as u32)` +//! +//! 2. **Fail tracking scheme** +//! - bit-pack: `fail_bits |= (is_none << bit_idx)`; chunk-AND with mask +//! - boolean: `fail_acc |= is_none as u64`; cold replay attribution +//! +//! 3. **Validity application** +//! - in closure: closure consumes `valid` +//! - post-mask: kernel ANDs fail bitmap with `src_chunk` +//! - pre-mask: kernel zeros null-lane values via bit-broadcast before SIMD add +//! - none: ignore validity (ceiling only — not correct for real inputs) +//! +//! All correctness-preserving variants are verified via [`assert_overflow_parity`] +//! and [`assert_null_overflow_suppressed`] at startup. The `pure_simd_no_validity` +//! variant is benched as a ceiling only — it does not respect nullability. + +#![expect(clippy::unwrap_used)] + +use std::mem::MaybeUninit; +use std::sync::Arc; + +use arrow_array::Datum; +use arrow_array::UInt32Array; +use arrow_buffer::NullBuffer; +use arrow_buffer::ScalarBuffer; +use divan::Bencher; +use rand::SeedableRng; +use rand::prelude::*; +use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; +use vortex_buffer::Buffer; +use vortex_buffer::lane_ops_indexed::LaneZip; +use vortex_buffer::lane_ops_indexed::try_map_with_mask; + +fn main() { + assert_overflow_parity(); + assert_null_overflow_suppressed(); + assert_pure_simd_errs_on_realistic_data(); + divan::main(); +} + +const SIZES: &[usize] = &[4_096, 65_536, 1_048_576, 2_097_152, 4_194_304]; +const LHS_VALID_RATE: f64 = 0.7; +const RHS_VALID_RATE: f64 = 0.8; + +struct Fixture { + /// **Realistic** lhs: valid lanes bounded, null lanes `u32::MAX`. + /// A kernel that ignores validity will see overflow at null lanes. + lhs: Buffer, + rhs: Buffer, + /// **Sanitized** lhs: valid lanes bounded, null lanes pre-zeroed. + /// Used by `pure_simd_no_validity_sanitized` only — its precondition is + /// "someone already zeroed the nulls." 
+ lhs_sanitized: Buffer, + rhs_sanitized: Buffer, + lhs_mask: BitBuffer, + rhs_mask: BitBuffer, + lhs_arrow: Arc, + rhs_arrow: Arc, +} + +fn fixture(n: usize) -> Fixture { + let mut lhs_rng = StdRng::seed_from_u64(0); + let mut rhs_rng = StdRng::seed_from_u64(1); + let mut lvr = StdRng::seed_from_u64(2); + let mut rvr = StdRng::seed_from_u64(3); + + let lhs_valid: Vec = (0..n).map(|_| lvr.random_bool(LHS_VALID_RATE)).collect(); + let rhs_valid: Vec = (0..n).map(|_| rvr.random_bool(RHS_VALID_RATE)).collect(); + + // **Realistic null storage**: null lanes contain u32::MAX. Adding two such + // values overflows — a kernel that ignores validity will spuriously Err. + // Valid lanes carry bounded values so the success path is measured at lanes + // where overflow shouldn't fire. + let raw_lhs: Vec = (0..n) + .map(|i| { + if lhs_valid[i] { + lhs_rng.random_range(0..u16::MAX as u32) + } else { + u32::MAX + } + }) + .collect(); + let raw_rhs: Vec = (0..n) + .map(|i| { + if rhs_valid[i] { + rhs_rng.random_range(0..u16::MAX as u32) + } else { + u32::MAX + } + }) + .collect(); + + let lhs: Buffer = raw_lhs.iter().copied().collect(); + let rhs: Buffer = raw_rhs.iter().copied().collect(); + + let lhs_sanitized: Buffer = (0..n) + .map(|i| if lhs_valid[i] { raw_lhs[i] } else { 0 }) + .collect(); + let rhs_sanitized: Buffer = (0..n) + .map(|i| if rhs_valid[i] { raw_rhs[i] } else { 0 }) + .collect(); + + let lhs_mask = { + let mut m = BitBufferMut::with_capacity(n); + for &v in &lhs_valid { + m.append(v); + } + m.freeze() + }; + let rhs_mask = { + let mut m = BitBufferMut::with_capacity(n); + for &v in &rhs_valid { + m.append(v); + } + m.freeze() + }; + + let lhs_arrow = Arc::new(UInt32Array::new( + ScalarBuffer::from(raw_lhs), + Some(NullBuffer::from(lhs_valid)), + )); + let rhs_arrow = Arc::new(UInt32Array::new( + ScalarBuffer::from(raw_rhs), + Some(NullBuffer::from(rhs_valid)), + )); + + Fixture { + lhs, + rhs, + lhs_sanitized, + rhs_sanitized, + lhs_mask, + rhs_mask, + 
lhs_arrow, + rhs_arrow, + } +} + +fn alloc_out(n: usize) -> Vec> { + let mut out = Vec::with_capacity(n); + // SAFETY: every lane is written before any read inside the kernel. + unsafe { out.set_len(n) }; + out +} + +// --------------------------------------------------------------------------- +// Variant 0: arrow_arith::numeric::add — baseline +// --------------------------------------------------------------------------- + +#[divan::bench(args = SIZES)] +fn arrow_add(bencher: Bencher, n: usize) { + let _ = n; + let f = fixture(n); + bencher + .with_inputs(|| (f.lhs_arrow.clone(), f.rhs_arrow.clone())) + .bench_refs(|(lhs, rhs)| { + arrow_arith::numeric::add(lhs.as_ref() as &dyn Datum, rhs.as_ref() as &dyn Datum) + .unwrap() + }); +} + +// --------------------------------------------------------------------------- +// Variant 1: try_map_with_mask + closure `|(a, b), _|` (value-only) +// Fail tracking: bit-pack via the kernel. +// LLVM DCEs per-lane mask extract. +// --------------------------------------------------------------------------- + +#[divan::bench(args = SIZES)] +fn bitpack_value_only(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + ( + f.lhs.clone(), + f.rhs.clone(), + f.lhs_mask.clone(), + f.rhs_mask.clone(), + ) + }) + .bench_refs(|(lhs, rhs, lm, rm)| { + let combined = lm as &BitBuffer & rm as &BitBuffer; + let mut out = alloc_out(n); + try_map_with_mask( + LaneZip::new(lhs.as_slice(), rhs.as_slice()), + &combined, + out.as_mut_slice(), + |(a, b), _valid| a.checked_add(b), + ) + .unwrap(); + (combined, out) + }); +} + +// --------------------------------------------------------------------------- +// Variant 2: try_map_with_mask + closure `|(a, b), valid|` with if-else +// Fail tracking: bit-pack via the kernel. +// Closure explicitly suppresses null-lane fails (redundant with bit-pack filter). 
+// --------------------------------------------------------------------------- + +#[divan::bench(args = SIZES)] +fn bitpack_closure_suppresses_if_else(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + ( + f.lhs.clone(), + f.rhs.clone(), + f.lhs_mask.clone(), + f.rhs_mask.clone(), + ) + }) + .bench_refs(|(lhs, rhs, lm, rm)| { + let combined = lm as &BitBuffer & rm as &BitBuffer; + let mut out = alloc_out(n); + try_map_with_mask( + LaneZip::new(lhs.as_slice(), rhs.as_slice()), + &combined, + out.as_mut_slice(), + |(a, b), valid| { + if valid { a.checked_add(b) } else { Some(0) } + }, + ) + .unwrap(); + (combined, out) + }); +} + +// --------------------------------------------------------------------------- +// Variant 3: try_map_with_mask + closure `.or_else(|| (!valid).then(...))` +// Fail tracking: bit-pack via the kernel. +// Lazy suppression: closure only consults `valid` when overflow actually fires. +// --------------------------------------------------------------------------- + +#[divan::bench(args = SIZES)] +fn bitpack_closure_suppresses_or_else(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + ( + f.lhs.clone(), + f.rhs.clone(), + f.lhs_mask.clone(), + f.rhs_mask.clone(), + ) + }) + .bench_refs(|(lhs, rhs, lm, rm)| { + let combined = lm as &BitBuffer & rm as &BitBuffer; + let mut out = alloc_out(n); + try_map_with_mask( + LaneZip::new(lhs.as_slice(), rhs.as_slice()), + &combined, + out.as_mut_slice(), + |(a, b), valid| a.checked_add(b).or_else(|| (!valid).then_some(0)), + ) + .unwrap(); + (combined, out) + }); +} + +// --------------------------------------------------------------------------- +// Variant 4: try_map_with_mask + closure with `(a * valid).checked_add(b * valid)` +// Fail tracking: bit-pack via the kernel. +// The multiply-by-valid trick zeroes null-lane operands so they can't overflow. 
+// --------------------------------------------------------------------------- + +#[divan::bench(args = SIZES)] +fn bitpack_closure_mul_trick(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + ( + f.lhs.clone(), + f.rhs.clone(), + f.lhs_mask.clone(), + f.rhs_mask.clone(), + ) + }) + .bench_refs(|(lhs, rhs, lm, rm)| { + let combined = lm as &BitBuffer & rm as &BitBuffer; + let mut out = alloc_out(n); + try_map_with_mask( + LaneZip::new(lhs.as_slice(), rhs.as_slice()), + &combined, + out.as_mut_slice(), + |(a, b), valid| { + let m = valid as u32; + (a * m).checked_add(b * m) + }, + ) + .unwrap(); + (combined, out) + }); +} + +// --------------------------------------------------------------------------- +// Variant 5: hand-rolled, boolean fail_acc, closure suppresses nulls, cold replay +// --------------------------------------------------------------------------- + +/// Hand-rolled kernel: boolean `fail_acc`, cold replay attribution. +/// Closure is expected to suppress null-lane fails by returning `Some(...)`; +/// `fail_acc` only fires for real valid-lane overflows. +#[inline] +fn handrolled_boolean( + lhs: &[u32], + rhs: &[u32], + mask: &BitBuffer, + out: &mut [MaybeUninit], + mut f: F, +) -> Result<(), usize> +where + F: FnMut(u32, u32, bool) -> Option, +{ + let len = lhs.len(); + assert_eq!(len, rhs.len()); + assert_eq!(len, mask.len()); + assert_eq!(len, out.len()); + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + let base = chunk_idx * 64; + let mut fail_acc: u64 = 0; + for bit_idx in 0..64 { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: i < len. 
+ let a = unsafe { *lhs.get_unchecked(i) }; + let b = unsafe { *rhs.get_unchecked(i) }; + let opt = f(a, b, bit); + fail_acc |= opt.is_none() as u64; + unsafe { out.get_unchecked_mut(i).write(opt.unwrap_or_default()) }; + } + if fail_acc != 0 { + // Cold: find first failing lane (closure already suppressed nulls). + for bit_idx in 0..64 { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + let a = unsafe { *lhs.get_unchecked(i) }; + let b = unsafe { *rhs.get_unchecked(i) }; + if f(a, b, bit).is_none() { + return Err(i); + } + } + } + } + + if remainder != 0 { + let src_chunk = chunks.remainder_bits(); + let base = chunks_count * 64; + let mut fail_acc: u64 = 0; + for bit_idx in 0..remainder { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + let a = unsafe { *lhs.get_unchecked(i) }; + let b = unsafe { *rhs.get_unchecked(i) }; + let opt = f(a, b, bit); + fail_acc |= opt.is_none() as u64; + unsafe { out.get_unchecked_mut(i).write(opt.unwrap_or_default()) }; + } + if fail_acc != 0 { + for bit_idx in 0..remainder { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + let a = unsafe { *lhs.get_unchecked(i) }; + let b = unsafe { *rhs.get_unchecked(i) }; + if f(a, b, bit).is_none() { + return Err(i); + } + } + } + } + Ok(()) +} + +#[divan::bench(args = SIZES)] +fn boolean_closure_suppresses(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + ( + f.lhs.clone(), + f.rhs.clone(), + f.lhs_mask.clone(), + f.rhs_mask.clone(), + ) + }) + .bench_refs(|(lhs, rhs, lm, rm)| { + let combined = lm as &BitBuffer & rm as &BitBuffer; + let mut out = alloc_out(n); + handrolled_boolean( + lhs.as_slice(), + rhs.as_slice(), + &combined, + out.as_mut_slice(), + |a, b, valid| { + if valid { a.checked_add(b) } else { Some(0) } + }, + ) + .unwrap(); + (combined, out) + }); +} + +// --------------------------------------------------------------------------- +// Variant 6: hand-rolled pre-mask. 
Kernel zeros null-lane values via bit +// broadcast, then unconditional add + overflow detect. Boolean fail_acc. +// --------------------------------------------------------------------------- + +#[inline] +fn handrolled_premask( + lhs: &[u32], + rhs: &[u32], + mask: &BitBuffer, + out: &mut [MaybeUninit], +) -> Result<(), usize> { + let len = lhs.len(); + assert_eq!(len, rhs.len()); + assert_eq!(len, mask.len()); + assert_eq!(len, out.len()); + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; + + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + let base = chunk_idx * 64; + let mut fail_acc: u64 = 0; + for bit_idx in 0..64 { + // bit-broadcast: 0 → 0x00000000, 1 → 0xFFFFFFFF + let lane_mask = (((src_chunk >> bit_idx) & 1) as u32).wrapping_neg(); + let i = base + bit_idx; + // SAFETY: i < len. + let a = unsafe { *lhs.get_unchecked(i) } & lane_mask; + let b = unsafe { *rhs.get_unchecked(i) } & lane_mask; + let (sum, overflow) = a.overflowing_add(b); + fail_acc |= overflow as u64; + unsafe { out.get_unchecked_mut(i).write(sum) }; + } + if fail_acc != 0 { + // Cold: walk chunk to find first valid lane that actually overflows on + // the unmasked inputs. Null lanes were premasked to 0+0, can't overflow. 
+ for bit_idx in 0..64 { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + if !bit { + continue; + } + let a = unsafe { *lhs.get_unchecked(i) }; + let b = unsafe { *rhs.get_unchecked(i) }; + if a.checked_add(b).is_none() { + return Err(i); + } + } + } + } + + if remainder != 0 { + let src_chunk = chunks.remainder_bits(); + let base = chunks_count * 64; + let mut fail_acc: u64 = 0; + for bit_idx in 0..remainder { + let lane_mask = (((src_chunk >> bit_idx) & 1) as u32).wrapping_neg(); + let i = base + bit_idx; + let a = unsafe { *lhs.get_unchecked(i) } & lane_mask; + let b = unsafe { *rhs.get_unchecked(i) } & lane_mask; + let (sum, overflow) = a.overflowing_add(b); + fail_acc |= overflow as u64; + unsafe { out.get_unchecked_mut(i).write(sum) }; + } + if fail_acc != 0 { + for bit_idx in 0..remainder { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + if !bit { + continue; + } + let a = unsafe { *lhs.get_unchecked(i) }; + let b = unsafe { *rhs.get_unchecked(i) }; + if a.checked_add(b).is_none() { + return Err(i); + } + } + } + } + Ok(()) +} + +#[divan::bench(args = SIZES)] +fn premask_then_simd(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + ( + f.lhs.clone(), + f.rhs.clone(), + f.lhs_mask.clone(), + f.rhs_mask.clone(), + ) + }) + .bench_refs(|(lhs, rhs, lm, rm)| { + let combined = lm as &BitBuffer & rm as &BitBuffer; + let mut out = alloc_out(n); + handrolled_premask(lhs.as_slice(), rhs.as_slice(), &combined, out.as_mut_slice()) + .unwrap(); + (combined, out) + }); +} + +// --------------------------------------------------------------------------- +// Variant 7: pure SIMD, no mask awareness — CEILING REFERENCE ONLY. +// Incorrect for arrays where null lanes might overflow; benchmarked just to +// show the theoretical floor for nullable add. 
+// --------------------------------------------------------------------------- + +#[inline] +fn handrolled_no_validity( + lhs: &[u32], + rhs: &[u32], + out: &mut [MaybeUninit<u32>], +) -> Result<(), usize> { + assert_eq!(lhs.len(), rhs.len()); + assert_eq!(lhs.len(), out.len()); + let mut fail = false; + for i in 0..lhs.len() { + let a = unsafe { *lhs.get_unchecked(i) }; + let b = unsafe { *rhs.get_unchecked(i) }; + let (sum, overflow) = a.overflowing_add(b); + fail |= overflow; + unsafe { out.get_unchecked_mut(i).write(sum) }; + } + if fail { Err(0) } else { Ok(()) } +} + +/// Pure-SIMD ceiling on **pre-sanitized** input (null lanes pre-zeroed in the +/// fixture, outside the timed region). Cannot run on the realistic +/// `(lhs, rhs)` arrays because their null lanes hold `u32::MAX` and would +/// Err — proven by [`assert_pure_simd_errs_on_realistic_data`]. +/// +/// This shows the SIMD-only arithmetic floor — what an ideal nullable-add would +/// look like if validity could be free. +#[divan::bench(args = SIZES)] +fn pure_simd_no_validity_sanitized(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| (f.lhs_sanitized.clone(), f.rhs_sanitized.clone())) + .bench_refs(|(lhs, rhs)| { + let mut out = alloc_out(n); + handrolled_no_validity(lhs.as_slice(), rhs.as_slice(), out.as_mut_slice()).unwrap(); + out + }); +} + +// --------------------------------------------------------------------------- +// Parity assertions — must pass before divan runs benches. +// --------------------------------------------------------------------------- + +/// Both arrow and our kernel must Err on overflow at a valid lane.
+fn assert_overflow_parity() { + let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4]; + let rhs: Vec<u32> = vec![10, 20, 1, 40]; + let valid = vec![true; 4]; + + let lhs_arrow = UInt32Array::new( + ScalarBuffer::from(lhs.clone()), + Some(NullBuffer::from(valid.clone())), + ); + let rhs_arrow = UInt32Array::new( + ScalarBuffer::from(rhs.clone()), + Some(NullBuffer::from(valid.clone())), + ); + let arrow_result = + arrow_arith::numeric::add(&lhs_arrow as &dyn Datum, &rhs_arrow as &dyn Datum); + assert!(arrow_result.is_err(), "arrow should Err on overflow"); + + let mask = { + let mut m = BitBufferMut::with_capacity(4); + for &v in &valid { + m.append(v); + } + m.freeze() + }; + let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect(); + let ours = try_map_with_mask( + LaneZip::new(lhs.as_slice(), rhs.as_slice()), + &mask, + out.as_mut_slice(), + |(a, b), _| a.checked_add(b), + ); + assert!(ours.is_err(), "bitpack should Err on overflow"); + + let mut out2: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect(); + let boolean = handrolled_boolean(&lhs, &rhs, &mask, &mut out2, |a, b, valid| { + if valid { a.checked_add(b) } else { Some(0) } + }); + assert!(boolean.is_err(), "boolean should Err on overflow"); + + let mut out3: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect(); + let prem = handrolled_premask(&lhs, &rhs, &mask, &mut out3); + assert!(prem.is_err(), "premask should Err on overflow"); +} + +/// All correctness-preserving variants must NOT Err when only null lanes +/// would overflow. (Pure-SIMD variant is excluded — it doesn't see validity.) +fn assert_null_overflow_suppressed() { + // Lane 2 is null and contains overflowing values; valid lanes are safe.
+ let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4]; + let rhs: Vec<u32> = vec![10, 20, 1, 40]; + let valid = vec![true, true, false, true]; + + let mask = { + let mut m = BitBufferMut::with_capacity(4); + for &v in &valid { + m.append(v); + } + m.freeze() + }; + + // Bit-pack with value-only closure — kernel filters null-lane fails. + let mut out = alloc_out(4); + let r = try_map_with_mask( + LaneZip::new(lhs.as_slice(), rhs.as_slice()), + &mask, + out.as_mut_slice(), + |(a, b), _| a.checked_add(b), + ); + assert!(r.is_ok(), "bitpack_value_only: null-lane overflow leaked"); + + // Boolean with closure that suppresses nulls. + let mut out = alloc_out(4); + let r = handrolled_boolean(&lhs, &rhs, &mask, &mut out, |a, b, valid| { + if valid { a.checked_add(b) } else { Some(0) } + }); + assert!(r.is_ok(), "boolean_closure_suppresses: null-lane leaked"); + + // Pre-mask: kernel zeroes null-lane values. + let mut out = alloc_out(4); + let r = handrolled_premask(&lhs, &rhs, &mask, &mut out); + assert!(r.is_ok(), "premask_then_simd: null-lane overflow leaked"); +} + +/// Demonstrates that `handrolled_no_validity` is **incorrect** on realistic +/// fixture inputs — i.e., when null lanes contain values that overflow on add. +/// This is what justifies excluding pure_simd from the realistic bench and +/// running it only on the sanitized inputs. Without this, the "ignore the +/// mask" approach would look too fast because the test data lets it cheat. +fn assert_pure_simd_errs_on_realistic_data() { + // Lane 2 is a "null lane" in arrow-style storage: bitmap says null, but + // the data buffer still holds an overflowing value. The realistic + // `fixture` does exactly this. + let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4]; + let rhs: Vec<u32> = vec![10, 20, 1, 40]; + let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect(); + + let r = handrolled_no_validity(&lhs, &rhs, &mut out); + assert!( + r.is_err(), + "pure_simd_no_validity should Err on realistic data (null lane has \ + u32::MAX).
If this passes, the bench fixture isn't exercising the \ + unsafe-null-storage case and the pure_simd ceiling number is \ + misleading — it's running on data the kernel happens to handle even \ + without a mask." + ); +} diff --git a/vortex-buffer/benches/pack_vs_unpack.rs b/vortex-buffer/benches/pack_vs_unpack.rs new file mode 100644 index 00000000000..0ae41fb5573 --- /dev/null +++ b/vortex-buffer/benches/pack_vs_unpack.rs @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compare two strategies for handling validity in `try_map_with_mask`: +//! +//! 1. **Unpack the mask** — closure consults `valid` per-lane. Null lanes are +//! short-circuited inside the closure (return `Some(default)` immediately), +//! so the checked operation never runs with garbage. The kernel still does +//! its `fail_bits & src_chunk` post-filter, but it's a no-op because the +//! closure already produced `Some` at null lanes. +//! +//! 2. **Pack and filter** — closure ignores `_valid`. The checked operation +//! runs at every lane, including null lanes (where it may produce `None` +//! on garbage). The kernel's post-loop `fail_bits & src_chunk` filter +//! drops those null-lane fails. LLVM DCEs the per-lane mask extract since +//! the closure doesn't consult `valid`. +//! +//! Two ops × two strategies = four vortex benches, plus arrow baselines. +//! +//! - `widen_u16_u32_*` — statically-infallible widening cast. `NumCast::from` +//! always returns `Some`; LLVM proves it and strips fail-tracking entirely. +//! - `checked_add_u32_*` — genuinely fallible: `u32 + u32` can overflow. 
+ +#![expect(clippy::unwrap_used)] + +use std::mem::MaybeUninit; +use std::sync::Arc; + +use arrow_arith::numeric::add; +use arrow_array::Datum; +use arrow_array::UInt16Array; +use arrow_array::UInt32Array; +use arrow_buffer::NullBuffer; +use arrow_buffer::ScalarBuffer; +use arrow_cast::CastOptions; +use arrow_cast::cast_with_options; +use arrow_schema::DataType; +use divan::Bencher; +use num_traits::NumCast; +use rand::SeedableRng; +use rand::prelude::*; +use rand::rngs::StdRng; +use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; +use vortex_buffer::Buffer; +use vortex_buffer::lane_ops_indexed::LaneZip; +use vortex_buffer::lane_ops_indexed::try_map_with_mask; + +fn main() { + divan::main(); +} + +const SIZES: &[usize] = &[4_096, 65_536, 1_048_576]; + +struct Fixture { + values_u16: Buffer, + lhs_u32: Buffer, + rhs_u32: Buffer, + mask: BitBuffer, + arrow_u16: UInt16Array, + arrow_lhs: Arc, + arrow_rhs: Arc, +} + +fn fixture(n: usize) -> Fixture { + let mut rng = StdRng::seed_from_u64(0xC0DE_BEEF); + // Bounded so `u16 + u16` (as u32) and `u32 + u32` never overflow u32. + // Both strategies succeed; we measure success-path perf. 
+ let raw_lhs: Vec = (0..n) + .map(|_| rng.random_range(0..(u32::MAX / 2))) + .collect(); + let raw_rhs: Vec = (0..n) + .map(|_| rng.random_range(0..(u32::MAX / 2))) + .collect(); + let raw_valid: Vec = (0..n).map(|_| rng.random_bool(0.8)).collect(); + + #[expect(clippy::cast_possible_truncation)] + let values_u16: Buffer = raw_lhs.iter().map(|&v| v as u16).collect(); + let lhs_u32: Buffer = raw_lhs.iter().copied().collect(); + let rhs_u32: Buffer = raw_rhs.iter().copied().collect(); + + let mask = { + let mut m = BitBufferMut::with_capacity(n); + for &v in &raw_valid { + m.append(v); + } + m.freeze() + }; + + #[expect(clippy::cast_possible_truncation)] + let arrow_u16 = UInt16Array::new( + ScalarBuffer::from(raw_lhs.iter().map(|&v| v as u16).collect::>()), + Some(NullBuffer::from(raw_valid.clone())), + ); + let arrow_lhs = Arc::new(UInt32Array::new( + ScalarBuffer::from(raw_lhs), + Some(NullBuffer::from(raw_valid.clone())), + )); + let arrow_rhs = Arc::new(UInt32Array::new( + ScalarBuffer::from(raw_rhs), + Some(NullBuffer::from(raw_valid)), + )); + + Fixture { + values_u16, + lhs_u32, + rhs_u32, + mask, + arrow_u16, + arrow_lhs, + arrow_rhs, + } +} + +fn uninit_out(n: usize) -> Vec> { + let mut out = Vec::with_capacity(n); + // SAFETY: a `MaybeUninit` does not require initialization. + unsafe { out.set_len(n) }; + out +} + +const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions { + safe: false, + format_options: arrow_cast::display::FormatOptions::new(), +}; + +// ----------------------------------------------------------------------------- +// Widening cast u16 → u32 (statically infallible). NumCast::from never returns +// None for widening, so the failure path is dead in both strategies. +// ----------------------------------------------------------------------------- + +/// Strategy 1 (unpack mask): closure consults `valid`, short-circuits at null +/// lanes. For widening the short-circuit is dead anyway (no failure possible). 
+#[divan::bench(args = SIZES)] +fn widen_u16_u32_unpack_mask(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) + .bench_values(|(values, mask, mut out)| { + try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| { + if !valid { + return Some(0u32); + } + ::from(v) + }) + .unwrap(); + out + }); +} + +/// Strategy 2 (pack and filter): closure ignores `_valid`. LLVM DCEs the +/// per-lane mask extract; post-loop `& src_chunk` would filter null-lane fails +/// (none happen for widening). +#[divan::bench(args = SIZES)] +fn widen_u16_u32_pack_and_filter(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) + .bench_values(|(values, mask, mut out)| { + try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| { + ::from(v) + }) + .unwrap(); + out + }); +} + +#[divan::bench(args = SIZES)] +fn widen_u16_u32_arrow(bencher: Bencher, _n: usize) { + let f = fixture(_n); + bencher + .with_inputs(|| f.arrow_u16.clone()) + .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap()); +} + +// ----------------------------------------------------------------------------- +// Checked add u32 + u32 → u32 (genuinely fallible). LaneZip(lhs, rhs) drives +// two-input lanewise. +// ----------------------------------------------------------------------------- + +/// Strategy 1 (unpack mask): closure short-circuits null lanes; `checked_add` +/// only runs at valid lanes. 
+#[divan::bench(args = SIZES)] +fn checked_add_u32_unpack_mask(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + ( + f.lhs_u32.clone(), + f.rhs_u32.clone(), + f.mask.clone(), + uninit_out::(n), + ) + }) + .bench_values(|(lhs, rhs, mask, mut out)| { + try_map_with_mask( + LaneZip::new(lhs.as_slice(), rhs.as_slice()), + &mask, + out.as_mut_slice(), + |(a, b), valid| { + if !valid { + return Some(0u32); + } + a.checked_add(b) + }, + ) + .unwrap(); + out + }); +} + +/// Strategy 2 (pack and filter): `checked_add` runs at every lane (including +/// null lanes with garbage values); kernel's `fail_bits & src_chunk` post-filter +/// drops any null-lane fails. +#[divan::bench(args = SIZES)] +fn checked_add_u32_pack_and_filter(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + ( + f.lhs_u32.clone(), + f.rhs_u32.clone(), + f.mask.clone(), + uninit_out::(n), + ) + }) + .bench_values(|(lhs, rhs, mask, mut out)| { + try_map_with_mask( + LaneZip::new(lhs.as_slice(), rhs.as_slice()), + &mask, + out.as_mut_slice(), + |(a, b), _valid| a.checked_add(b), + ) + .unwrap(); + out + }); +} + +// Asm-extraction helpers: `#[unsafe(no_mangle)] #[inline(never)]` so a single +// `cargo rustc --emit=asm` produces clearly-labeled symbols to diff. + +#[unsafe(no_mangle)] +#[inline(never)] +pub fn asm_add_unpack_branchy( + lhs: &[u32], + rhs: &[u32], + mask: &BitBuffer, + out: &mut [MaybeUninit], +) -> Result<(), usize> { + try_map_with_mask( + LaneZip::new(lhs, rhs), + mask, + out, + |(a, b), valid| { + if !valid { + return Some(0u32); + } + a.checked_add(b) + }, + ) +} + +#[unsafe(no_mangle)] +#[inline(never)] +pub fn asm_add_unpack_branchless( + lhs: &[u32], + rhs: &[u32], + mask: &BitBuffer, + out: &mut [MaybeUninit], +) -> Result<(), usize> { + try_map_with_mask( + LaneZip::new(lhs, rhs), + mask, + out, + |(a, b), valid| { + // Compute first, then select. No early-return; LLVM may if-convert. 
+ let r = a.checked_add(b); + if valid { r } else { Some(0u32) } + }, + ) +} + +#[unsafe(no_mangle)] +#[inline(never)] +pub fn asm_add_unpack_multiply( + lhs: &[u32], + rhs: &[u32], + mask: &BitBuffer, + out: &mut [MaybeUninit], +) -> Result<(), usize> { + try_map_with_mask( + LaneZip::new(lhs, rhs), + mask, + out, + |(a, b), valid| { + // Neutralize null lanes via multiply (BIC); checked_add runs unconditionally. + let m = valid as u32; + (a * m).checked_add(b * m) + }, + ) +} + +#[unsafe(no_mangle)] +#[inline(never)] +pub fn asm_add_pack_filter( + lhs: &[u32], + rhs: &[u32], + mask: &BitBuffer, + out: &mut [MaybeUninit], +) -> Result<(), usize> { + try_map_with_mask( + LaneZip::new(lhs, rhs), + mask, + out, + |(a, b), _valid| a.checked_add(b), + ) +} + +/// Branchless-multiply variant of unpack_mask: scale lhs/rhs by `valid as u32` so +/// the checked op runs at every lane (with zeros at null lanes — never overflows) +/// and the kernel's post-loop `& src_chunk` filter still applies. +#[divan::bench(args = SIZES)] +fn checked_add_u32_unpack_multiply(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + ( + f.lhs_u32.clone(), + f.rhs_u32.clone(), + f.mask.clone(), + uninit_out::(n), + ) + }) + .bench_values(|(lhs, rhs, mask, mut out)| { + try_map_with_mask( + LaneZip::new(lhs.as_slice(), rhs.as_slice()), + &mask, + out.as_mut_slice(), + |(a, b), valid| { + let m = valid as u32; + (a * m).checked_add(b * m) + }, + ) + .unwrap(); + out + }); +} + +/// Compute-first-then-select variant of unpack_mask: removes the early `return`, +/// keeps the `valid` consult per-lane. Tests whether LLVM if-converts when both +/// branches are pure expressions. 
+#[divan::bench(args = SIZES)] +fn checked_add_u32_unpack_branchless(bencher: Bencher, n: usize) { + let f = fixture(n); + bencher + .with_inputs(|| { + ( + f.lhs_u32.clone(), + f.rhs_u32.clone(), + f.mask.clone(), + uninit_out::(n), + ) + }) + .bench_values(|(lhs, rhs, mask, mut out)| { + try_map_with_mask( + LaneZip::new(lhs.as_slice(), rhs.as_slice()), + &mask, + out.as_mut_slice(), + |(a, b), valid| { + let r = a.checked_add(b); + if valid { r } else { Some(0u32) } + }, + ) + .unwrap(); + out + }); +} + +#[divan::bench(args = SIZES)] +fn checked_add_u32_arrow(bencher: Bencher, _n: usize) { + let f = fixture(_n); + bencher + .with_inputs(|| (f.arrow_lhs.clone(), f.arrow_rhs.clone())) + .bench_refs(|(lhs, rhs)| { + let lhs_datum: &dyn Datum = lhs.as_ref(); + let rhs_datum: &dyn Datum = rhs.as_ref(); + add(lhs_datum, rhs_datum).unwrap() + }); +} diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs index 47887b92810..144f83a429a 100644 --- a/vortex-buffer/src/lane_ops_indexed.rs +++ b/vortex-buffer/src/lane_ops_indexed.rs @@ -22,52 +22,6 @@ use std::mem::MaybeUninit; use crate::BitBuffer; -macro_rules! for_full_lanes { - ($base:expr, | $bit_idx:ident, $i:ident | $body:block) => { - for $bit_idx in 0..64 { - let $i = $base + $bit_idx; - $body - } - }; -} - -macro_rules! for_remainder_lanes { - ($base:expr, $remainder:expr, | $bit_idx:ident, $i:ident | $body:block) => { - for $bit_idx in 0..$remainder { - let $i = $base + $bit_idx; - $body - } - }; -} - -macro_rules! for_full_mask_lanes { - ($src_chunk:expr, $base:expr, | $bit_idx:ident, $i:ident, $valid:ident | $body:block) => { - for $bit_idx in 0..64 { - let $i = $base + $bit_idx; - let $valid = ($src_chunk >> $bit_idx) & 1 == 1; - $body - } - }; -} - -macro_rules! 
for_remainder_mask_lanes { - ( - $src_chunk:expr, - $base:expr, - $remainder:expr, | - $bit_idx:ident, - $i:ident, - $valid:ident | - $body:block - ) => { - for $bit_idx in 0..$remainder { - let $i = $base + $bit_idx; - let $valid = ($src_chunk >> $bit_idx) & 1 == 1; - $body - } - }; -} - /// A length-known source supporting unchecked indexed reads. /// /// Implemented for `&[T]` (with `T: Copy`) and for [`LaneZip`] over two `IndexedSource`s. @@ -192,6 +146,30 @@ where S: IndexedSource, F: FnMut(S::Item, bool) -> R, { + /// Per-chunk worker. Called twice (literal `64` for full chunks, `remainder` + /// for the tail). `#[inline(always)]` preserves the const-64 unroll at the + /// full-chunk call site via constant propagation through inlining. + #[inline(always)] + fn chunk( + values: &S, + out: &mut [MaybeUninit], + f: &mut F, + src_chunk: u64, + base: usize, + count: usize, + ) where + S: IndexedSource, + F: FnMut(S::Item, bool) -> R, + { + for bit_idx in 0..count { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees base + count <= len. + let v = unsafe { values.get_unchecked(i) }; + unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; + } + } + let len = values.len(); assert_eq!(len, mask.len(), "values and mask must have the same length"); assert_eq!(out.len(), len, "out must have the same length as values"); @@ -201,25 +179,13 @@ where let remainder = len % 64; for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - let base = chunk_idx * 64; - // Inner loop is fixed-size 64 with independent per-lane reads — no iterator - // state, no cross-iteration dependency, so the auto-vectorizer can fuse - // 64 indexed loads into vector loads. - for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| { - // SAFETY: i < chunks_count * 64 <= len. 
- let v = unsafe { values.get_unchecked(i) }; - unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; - }); + chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64); } - if remainder != 0 { - let src_chunk = chunks.remainder_bits(); - let base = chunks_count * 64; - for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| { - // SAFETY: i < len. - let v = unsafe { values.get_unchecked(i) }; - unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; - }); + chunk( + &values, out, &mut f, + chunks.remainder_bits(), chunks_count * 64, remainder, + ); } } @@ -267,6 +233,38 @@ where R: Copy + Default, F: FnMut(S::Item, bool) -> Option, { + /// Bit-packs `is_none()` into `fail_bits` at lane position; the post-loop + /// `& src_chunk` filter drops null-lane fails. Returns `Some(failing_idx)` if + /// any *valid* lane failed in `[base, base+count)`. + #[inline(always)] + fn chunk( + values: &S, + out: &mut [MaybeUninit], + f: &mut F, + src_chunk: u64, + base: usize, + count: usize, + ) -> Option + where + S: IndexedSource, + R: Copy + Default, + F: FnMut(S::Item, bool) -> Option, + { + let mut fail_bits: u64 = 0; + for bit_idx in 0..count { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees base + count <= len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v, bit); + fail_bits |= (opt.is_none() as u64) << bit_idx; + let r = opt.unwrap_or_default(); + unsafe { out.get_unchecked_mut(i).write(r) }; + } + let valid_failures = fail_bits & src_chunk; + (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize) + } + let len = values.len(); assert_eq!(len, mask.len(), "values and mask must have the same length"); assert_eq!(out.len(), len, "out must have the same length as values"); @@ -276,49 +274,18 @@ where let remainder = len % 64; for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - let base = chunk_idx * 64; - // Bit-pack per-lane fails into a u64 at lane-position. 
`bit_idx` is a - // compile-time constant after unrolling, so the shift folds. The - // `src_chunk` here is the validity bitmap for this chunk; the closure - // still gets `bit` per lane — LLVM DCEs the per-lane mask extract if - // the closure ignores it. - let mut fail_bits: u64 = 0; - for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| { - // SAFETY: i < chunks_count * 64 <= len. - let v = unsafe { values.get_unchecked(i) }; - let opt = f(v, bit); - fail_bits |= (opt.is_none() as u64) << bit_idx; - let r = opt.unwrap_or_default(); - // SAFETY: i < len. - unsafe { out.get_unchecked_mut(i).write(r) }; - }); - // Drop null-lane failures: only failures at lanes the mask marks as - // valid count. Direct attribution via trailing_zeros — no cold replay. - let valid_failures = fail_bits & src_chunk; - if valid_failures != 0 { - return Err(base + valid_failures.trailing_zeros() as usize); + if let Some(idx) = chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64) { + return Err(idx); } } - if remainder != 0 { - let src_chunk = chunks.remainder_bits(); - let base = chunks_count * 64; - let mut fail_bits: u64 = 0; - for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| { - // SAFETY: i < len. - let v = unsafe { values.get_unchecked(i) }; - let opt = f(v, bit); - fail_bits |= (opt.is_none() as u64) << bit_idx; - let r = opt.unwrap_or_default(); - // SAFETY: i < len. 
- unsafe { out.get_unchecked_mut(i).write(r) }; - }); - let valid_failures = fail_bits & src_chunk; - if valid_failures != 0 { - return Err(base + valid_failures.trailing_zeros() as usize); + if let Some(idx) = chunk( + &values, out, &mut f, + chunks.remainder_bits(), chunks_count * 64, remainder, + ) { + return Err(idx); } } - Ok(()) } @@ -341,6 +308,25 @@ where S: IndexedSource, F: FnMut(S::Item) -> R, { + #[inline(always)] + fn chunk( + values: &S, + out: &mut [MaybeUninit], + f: &mut F, + base: usize, + count: usize, + ) where + S: IndexedSource, + F: FnMut(S::Item) -> R, + { + for bit_idx in 0..count { + let i = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. + let v = unsafe { values.get_unchecked(i) }; + unsafe { out.get_unchecked_mut(i).write(f(v)) }; + } + } + let len = values.len(); assert_eq!(out.len(), len, "out must have the same length as values"); @@ -348,21 +334,10 @@ where let remainder = len % 64; for chunk_idx in 0..chunks_count { - let base = chunk_idx * 64; - for_full_lanes!(base, |bit_idx, i| { - // SAFETY: i < chunks_count * 64 <= len. - let v = unsafe { values.get_unchecked(i) }; - unsafe { out.get_unchecked_mut(i).write(f(v)) }; - }); + chunk(&values, out, &mut f, chunk_idx * 64, 64); } - if remainder != 0 { - let base = chunks_count * 64; - for_remainder_lanes!(base, remainder, |bit_idx, i| { - // SAFETY: i < len. - let v = unsafe { values.get_unchecked(i) }; - unsafe { out.get_unchecked_mut(i).write(f(v)) }; - }); + chunk(&values, out, &mut f, chunks_count * 64, remainder); } } @@ -397,6 +372,35 @@ where R: Copy + Default, F: FnMut(S::Item) -> Option, { + /// Returns `true` if any lane in `[base, base+count)` failed (OR-reduced); + /// the cold attribution path is called at the kernel level so it can be + /// inlined separately for full vs remainder. 
+ #[inline(always)] + fn chunk( + values: &S, + out: &mut [MaybeUninit], + f: &mut F, + base: usize, + count: usize, + ) -> bool + where + S: IndexedSource, + R: Copy + Default, + F: FnMut(S::Item) -> Option, + { + let mut fail_acc: u64 = 0; + for bit_idx in 0..count { + let i = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v); + fail_acc |= opt.is_none() as u64; + let r = opt.unwrap_or_default(); + unsafe { out.get_unchecked_mut(i).write(r) }; + } + fail_acc != 0 + } + let len = values.len(); assert_eq!(out.len(), len, "out must have the same length as values"); @@ -405,38 +409,16 @@ where for chunk_idx in 0..chunks_count { let base = chunk_idx * 64; - let mut fail_acc: u64 = 0; - for_full_lanes!(base, |bit_idx, i| { - // SAFETY: i < chunks_count * 64 <= len. - let v = unsafe { values.get_unchecked(i) }; - let opt = f(v); - fail_acc |= opt.is_none() as u64; - let r = opt.unwrap_or_default(); - // SAFETY: i < len. - unsafe { out.get_unchecked_mut(i).write(r) }; - }); - if fail_acc != 0 { + if chunk(&values, out, &mut f, base, 64) { return Err(attribute_failure_no_mask(&values, base, 64, &mut f)); } } - if remainder != 0 { let base = chunks_count * 64; - let mut fail_acc: u64 = 0; - for_remainder_lanes!(base, remainder, |bit_idx, i| { - // SAFETY: i < len. - let v = unsafe { values.get_unchecked(i) }; - let opt = f(v); - fail_acc |= opt.is_none() as u64; - let r = opt.unwrap_or_default(); - // SAFETY: i < len. 
- unsafe { out.get_unchecked_mut(i).write(r) }; - }); - if fail_acc != 0 { + if chunk(&values, out, &mut f, base, remainder) { return Err(attribute_failure_no_mask(&values, base, remainder, &mut f)); } } - Ok(()) } @@ -490,6 +472,27 @@ where S: IndexedSink, F: FnMut(S::Item, bool) -> S::Item, { + #[inline(always)] + fn chunk( + values: &mut S, + f: &mut F, + src_chunk: u64, + base: usize, + count: usize, + ) where + S: IndexedSink, + F: FnMut(S::Item, bool) -> S::Item, + { + for bit_idx in 0..count { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees base + count <= len. + let v = unsafe { values.get_unchecked(i) }; + let r = f(v, bit); + unsafe { values.set_unchecked(i, r) }; + } + } + let len = values.len(); assert_eq!(len, mask.len(), "values and mask must have the same length"); @@ -498,26 +501,13 @@ where let remainder = len % 64; for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - let base = chunk_idx * 64; - for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| { - // SAFETY: i < chunks_count * 64 <= len. - let v = unsafe { values.get_unchecked(i) }; - let r = f(v, bit); - // SAFETY: i < len. - unsafe { values.set_unchecked(i, r) }; - }); + chunk(&mut values, &mut f, src_chunk, chunk_idx * 64, 64); } - if remainder != 0 { - let src_chunk = chunks.remainder_bits(); - let base = chunks_count * 64; - for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| { - // SAFETY: i < len. - let v = unsafe { values.get_unchecked(i) }; - let r = f(v, bit); - // SAFETY: i < len. - unsafe { values.set_unchecked(i, r) }; - }); + chunk( + &mut values, &mut f, + chunks.remainder_bits(), chunks_count * 64, remainder, + ); } } @@ -560,6 +550,39 @@ where S::Item: Default, F: FnMut(S::Item, bool) -> Option, { + /// Returns `Some(first_failing_lane_index_as_u32)` if any lane in + /// `[base, base+count)` failed (cast width-truncated since `i < 2^32` in any + /// realistic batch), else `None`. 
`#[inline(always)]` so the literal `64` at the + /// full-chunk call site enables const-propagation through inlining. + #[inline(always)] + #[allow(clippy::cast_possible_truncation)] + fn chunk( + values: &mut S, + src_chunk: u64, + base: usize, + count: usize, + f: &mut F, + ) -> Option + where + S: IndexedSink, + S::Item: Default, + F: FnMut(S::Item, bool) -> Option, + { + let mut first_fail: u32 = u32::MAX; + for bit_idx in 0..count { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees `base + count <= values.len()`. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v, bit); + let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; + first_fail = first_fail.min(candidate); + let r = opt.unwrap_or_default(); + unsafe { values.set_unchecked(i, r) }; + } + (first_fail != u32::MAX).then_some(first_fail) + } + let len = values.len(); assert_eq!(len, mask.len(), "values and mask must have the same length"); @@ -568,68 +591,22 @@ where let remainder = len % 64; for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - // `count = 64` is a literal; `#[inline(always)]` on the helper inlines its body - // into this loop and the compiler propagates 64 into the inner `0..count` bound, - // unrolling exactly as `for_full_mask_lanes!` would. - if let Some(failing) = try_inplace_chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) - { + if let Some(failing) = chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) { return Err(failing as usize); } } - if remainder != 0 { - // Runtime `count = remainder` — same shape as the prior remainder loop. - if let Some(failing) = try_inplace_chunk( + if let Some(failing) = chunk( &mut values, - chunks.remainder_bits(), - chunks_count * 64, - remainder, + chunks.remainder_bits(), chunks_count * 64, remainder, &mut f, ) { return Err(failing as usize); } } - Ok(()) } -/// Per-chunk worker for [`try_map_with_mask_in_place`]. 
Body written once; the kernel -/// calls this twice (with `count = 64` for full chunks, `count = remainder` for the -/// tail). `#[inline(always)]` so the const-64 unroll for the full-chunk callers is -/// preserved. -/// -/// Returns `Some(first_failing_lane_index_as_u32)` if any lane in `[base, base+count)` -/// failed (cast width-truncated since `i < 2^32` in any realistic batch), else `None`. -#[inline(always)] -#[allow(clippy::cast_possible_truncation)] -fn try_inplace_chunk( - values: &mut S, - src_chunk: u64, - base: usize, - count: usize, - f: &mut F, -) -> Option -where - S: IndexedSink, - S::Item: Default, - F: FnMut(S::Item, bool) -> Option, -{ - let mut first_fail: u32 = u32::MAX; - for bit_idx in 0..count { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: caller guarantees `base + count <= values.len()`. - let v = unsafe { values.get_unchecked(i) }; - let opt = f(v, bit); - let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; - first_fail = first_fail.min(candidate); - let r = opt.unwrap_or_default(); - // SAFETY: same as above. - unsafe { values.set_unchecked(i, r) }; - } - (first_fail != u32::MAX).then_some(first_fail) -} - /// Apply `f(value) -> bool` lane-by-lane, packing into `out` as `u64` words. /// /// This is the validity-free sibling of [`map_with_mask_to_bits`]. Use it when the @@ -649,6 +626,22 @@ where S: IndexedSource, F: FnMut(S::Item) -> bool, { + #[inline(always)] + fn chunk(values: &S, f: &mut F, base: usize, count: usize) -> u64 + where + S: IndexedSource, + F: FnMut(S::Item) -> bool, + { + let mut packed = 0u64; + for bit_idx in 0..count { + let i = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. 
+ let v = unsafe { values.get_unchecked(i) }; + packed |= (f(v) as u64) << bit_idx; + } + packed + } + let len = values.len(); assert_eq!( out.len(), @@ -660,25 +653,12 @@ where let remainder = len % 64; for chunk_idx in 0..chunks_count { - let base = chunk_idx * 64; - let mut packed = 0u64; - for_full_lanes!(base, |bit_idx, i| { - // SAFETY: base + bit_idx < chunks_count * 64 <= len. - let v = unsafe { values.get_unchecked(i) }; - packed |= (f(v) as u64) << bit_idx; - }); + let packed = chunk(&values, &mut f, chunk_idx * 64, 64); // SAFETY: chunk_idx < chunks_count <= out.len(). unsafe { *out.get_unchecked_mut(chunk_idx) = packed }; } - if remainder != 0 { - let base = chunks_count * 64; - let mut packed = 0u64; - for_remainder_lanes!(base, remainder, |bit_idx, i| { - // SAFETY: base + bit_idx < len. - let v = unsafe { values.get_unchecked(i) }; - packed |= (f(v) as u64) << bit_idx; - }); + let packed = chunk(&values, &mut f, chunks_count * 64, remainder); // SAFETY: chunks_count < out.len() because remainder != 0. unsafe { *out.get_unchecked_mut(chunks_count) = packed }; } @@ -698,6 +678,29 @@ where S: IndexedSource, F: FnMut(S::Item, bool) -> bool, { + #[inline(always)] + fn chunk( + values: &S, + f: &mut F, + src_chunk: u64, + base: usize, + count: usize, + ) -> u64 + where + S: IndexedSource, + F: FnMut(S::Item, bool) -> bool, + { + let mut packed = 0u64; + for bit_idx in 0..count { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees base + count <= len. 
+ let v = unsafe { values.get_unchecked(i) }; + packed |= (f(v, bit) as u64) << bit_idx; + } + packed + } + let len = values.len(); assert_eq!(len, mask.len(), "values and mask must have the same length"); assert_eq!( @@ -711,26 +714,15 @@ where let remainder = len % 64; for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - let base = chunk_idx * 64; - let mut packed = 0u64; - for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| { - // SAFETY: i < chunks_count * 64 <= len. - let v = unsafe { values.get_unchecked(i) }; - packed |= (f(v, bit) as u64) << bit_idx; - }); + let packed = chunk(&values, &mut f, src_chunk, chunk_idx * 64, 64); // SAFETY: chunk_idx < chunks_count <= out.len(). unsafe { *out.get_unchecked_mut(chunk_idx) = packed }; } - if remainder != 0 { - let src_chunk = chunks.remainder_bits(); - let base = chunks_count * 64; - let mut packed = 0u64; - for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| { - // SAFETY: i < len. - let v = unsafe { values.get_unchecked(i) }; - packed |= (f(v, bit) as u64) << bit_idx; - }); + let packed = chunk( + &values, &mut f, + chunks.remainder_bits(), chunks_count * 64, remainder, + ); // SAFETY: chunks_count < out.len() because remainder != 0. 
unsafe { *out.get_unchecked_mut(chunks_count) = packed }; } From d2bca9357ab16ee786778e8f765eafc3f27d0b49 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 14:58:56 +0100 Subject: [PATCH 08/21] f Signed-off-by: Joe Isaacs --- .../src/arrays/primitive/compute/cast.rs | 88 ++- vortex-buffer/src/lane_ops_indexed.rs | 555 +++++++++--------- 2 files changed, 362 insertions(+), 281 deletions(-) diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs index 8cdd27cb5c5..ad0d1c8e399 100644 --- a/vortex-array/src/arrays/primitive/compute/cast.rs +++ b/vortex-array/src/arrays/primitive/compute/cast.rs @@ -1,13 +1,19 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::mem::align_of; +use std::mem::size_of; + use num_traits::AsPrimitive; use num_traits::NumCast; +use vortex_buffer::BitBuffer; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; +use vortex_buffer::lane_ops_indexed::ReinterpretSink; use vortex_buffer::lane_ops_indexed::map_no_validity; use vortex_buffer::lane_ops_indexed::try_map_no_validity; use vortex_buffer::lane_ops_indexed::try_map_with_mask; +use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; @@ -147,9 +153,21 @@ where // Skip the fallible kernel when the conversion is infallible by type alone (widening) or // when cached min/max prove every value fits in `T`. let target_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable); - if casts_losslessly_to(F::PTYPE, T::PTYPE) - || cached_values_fit_in(array, &target_dtype) == Some(true) + let infallible = casts_losslessly_to(F::PTYPE, T::PTYPE) + || cached_values_fit_in(array, &target_dtype) == Some(true); + + // Same-bit-width in-place fast path: when F and T have the same byte width and the + // buffer is uniquely owned, mutate in place and transmute the wrapper. 
Saves the + // output allocation. Falls through to the out-of-place path when the buffer is shared + // (the common case under the current borrow-based kernel API). + let same_bit_width = F::PTYPE.byte_width() == T::PTYPE.byte_width(); + if same_bit_width + && let Ok(buffer_mut) = array.into_owned().try_into_buffer_mut::() { + return cast_buffer_in_place::(buffer_mut, array, new_validity, ctx, infallible); + } + + if infallible { let mut buffer = BufferMut::::with_capacity(values.len()); // Truncating `as`-cast — safe here because stats prove every valid value fits. // Null lanes' underlying garbage gets truncated/wrapped (harmless: the result @@ -204,6 +222,72 @@ where Ok(PrimitiveArray::new(buffer, new_validity).into_array()) } +/// In-place cast of an owned `BufferMut` to `BufferMut` when `F` and `T` have the +/// same byte width. Each slot is read as `F`, converted, and written back as `T`-bits +/// using `BufferMut`'s transmute family. Avoids allocating a second output buffer. +/// +/// The caller has already verified `F::PTYPE.byte_width() == T::PTYPE.byte_width()`. +fn cast_buffer_in_place( + buffer: BufferMut, + array: ArrayView<'_, Primitive>, + new_validity: Validity, + ctx: &mut ExecutionCtx, + infallible: bool, +) -> VortexResult +where + F: NativePType + AsPrimitive, + T: NativePType, +{ + debug_assert_eq!(size_of::(), size_of::()); + debug_assert_eq!(align_of::(), align_of::()); + + if infallible { + // `map_each_in_place` does the BufferMut → BufferMut transmute internally + // (same size + alignment for primitives of equal byte width) and walks each slot + // with the closure. 
+ let result: BufferMut = buffer.map_each_in_place(|v: F| v.as_()); + return Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array()); + } + + let mask = array.validity()?.execute_mask(array.len(), ctx)?; + let overflow = || { + vortex_err!( + Compute: "Cannot cast {} to {} — value exceeds target range", + F::PTYPE, T::PTYPE, + ) + }; + + // All-null short-circuit: zero out the buffer and skip the conversion loop entirely. + if matches!(mask, Mask::AllFalse(_)) { + // SAFETY: same size + alignment by NativePType same-byte-width invariant. + let mut t_buf: BufferMut = unsafe { buffer.transmute::() }; + t_buf.as_mut_slice().fill(T::zero()); + return Ok(PrimitiveArray::new(t_buf.freeze(), new_validity).into_array()); + } + + let bit_buffer = match &mask { + Mask::AllTrue(n) => BitBuffer::new_set(*n), + Mask::AllFalse(_) => unreachable!("handled above"), + Mask::Values(m) => m.bit_buffer().clone(), + }; + + let mut buffer = buffer; + try_map_with_mask_in_place( + ReinterpretSink::::new(buffer.as_mut_slice()), + &bit_buffer, + |f_val: F, valid| -> Option { + ::from(f_val).or_else(|| (!valid).then(T::zero)) + }, + ) + .map_err(|_| overflow())?; + + // SAFETY: same size + alignment for NativePType same-byte-width pairs. Every F-slot + // now holds a valid T-bit pattern because `ReinterpretSink::set_unchecked` wrote a + // real `T` at every visited lane. 
+ let result: BufferMut = unsafe { buffer.transmute::() }; + Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array()) +} + fn reinterpret( array: ArrayView<'_, Primitive>, new_ptype: PType, diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs index 144f83a429a..f8d028eb7b7 100644 --- a/vortex-buffer/src/lane_ops_indexed.rs +++ b/vortex-buffer/src/lane_ops_indexed.rs @@ -18,7 +18,10 @@ #![allow(clippy::many_single_char_names)] +use std::marker::PhantomData; use std::mem::MaybeUninit; +use std::mem::align_of; +use std::mem::size_of; use crate::BitBuffer; @@ -75,18 +78,29 @@ impl IndexedSource for &mut [T] { /// An [`IndexedSource`] that also supports unchecked indexed writes — the binding /// for in-place kernels. /// +/// `Write` is the type written by `set_unchecked` and may differ from +/// `IndexedSource::Item` (the read type). For the canonical `&mut [T]` impl +/// both are `T`. The decoupling is what makes [`ReinterpretSink`] possible — +/// a wrapper that reads `F` and writes `T` over the same backing memory when +/// the two have identical size and alignment. +/// /// Implemented for `&mut [T]`; not implemented for [`LaneZip`] (you can't write a /// `(A, B)` pair back to two separate sources via a single index). pub trait IndexedSink: IndexedSource { + /// The per-lane write type. Equal to `::Item` for + /// `&mut [T]`; different for [`ReinterpretSink`]. + type Write: Copy; + /// Write `value` into lane `i` without bounds checking. /// /// # Safety /// /// `i` must be strictly less than `self.len()`. - unsafe fn set_unchecked(&mut self, i: usize, value: Self::Item); + unsafe fn set_unchecked(&mut self, i: usize, value: Self::Write); } impl IndexedSink for &mut [T] { + type Write = T; #[inline] unsafe fn set_unchecked(&mut self, i: usize, value: T) { // SAFETY: caller guarantees i < self.len(). 
@@ -94,6 +108,76 @@ impl IndexedSink for &mut [T] { } } +/// A sink that reads `F`-values and writes `T`-values over the same backing +/// slice of `F`, reinterpreting each `T` as `F`-bits on write. +/// +/// Requires `size_of::() == size_of::()` and `align_of::() == align_of::()`. +/// Both hold for any pair of `NativePType` primitives with equal byte width +/// (e.g. `u32` ↔ `f32`, `u64` ↔ `i64`, `f64` ↔ `u64`). +/// +/// Use this when an in-place kernel needs to convert lanes between two +/// types of identical width without allocating a second buffer. After the +/// kernel completes every slot holds a valid `T`-bit pattern; the caller +/// can recover a typed view via `BufferMut::transmute::()`. +pub struct ReinterpretSink<'a, F, T> { + slice: &'a mut [F], + _phantom: PhantomData, +} + +impl<'a, F, T> ReinterpretSink<'a, F, T> { + /// Construct a `ReinterpretSink` from `&mut [F]`. + /// + /// # Panics + /// + /// Panics if `size_of::() != size_of::()` or + /// `align_of::() != align_of::()`. + pub fn new(slice: &'a mut [F]) -> Self { + assert_eq!( + size_of::(), + size_of::(), + "ReinterpretSink requires F and T to have the same size", + ); + assert_eq!( + align_of::(), + align_of::(), + "ReinterpretSink requires F and T to have the same alignment", + ); + Self { + slice, + _phantom: PhantomData, + } + } +} + +impl IndexedSource for ReinterpretSink<'_, F, T> { + type Item = F; + #[inline] + fn len(&self) -> usize { + self.slice.len() + } + #[inline] + unsafe fn get_unchecked(&self, i: usize) -> F { + // SAFETY: caller guarantees i < self.slice.len(). Pointer arithmetic + // avoids method-resolution ambiguity between `<[F]>::get_unchecked` and + // `IndexedSource::get_unchecked`. 
+ unsafe { *self.slice.as_ptr().add(i) } + } +} + +impl IndexedSink for ReinterpretSink<'_, F, T> { + type Write = T; + #[inline] + unsafe fn set_unchecked(&mut self, i: usize, value: T) { + // SAFETY: caller guarantees i < self.slice.len(); `new` enforces + // size_of::() == size_of::() and align_of::() == align_of::(), + // so the F-slot can hold a `T` without overflow or misalignment. + unsafe { + let ptr = self.slice.as_mut_ptr().add(i) as *mut T; + ptr.write(value); + } + } +} + /// Pair of two [`IndexedSource`]s of equal length. Yields `(A::Item, B::Item)` per lane. /// /// Use this to drive a binary kernel from two columns. Length equality is enforced @@ -183,8 +267,12 @@ where } if remainder != 0 { chunk( - &values, out, &mut f, - chunks.remainder_bits(), chunks_count * 64, remainder, + &values, + out, + &mut f, + chunks.remainder_bits(), + chunks_count * 64, + remainder, ); } } @@ -278,13 +366,17 @@ where return Err(idx); } } - if remainder != 0 { - if let Some(idx) = chunk( - &values, out, &mut f, - chunks.remainder_bits(), chunks_count * 64, remainder, - ) { - return Err(idx); - } + if remainder != 0 + && let Some(idx) = chunk( + &values, + out, + &mut f, + chunks.remainder_bits(), + chunks_count * 64, + remainder, + ) + { + return Err(idx); } Ok(()) } @@ -309,13 +401,8 @@ where F: FnMut(S::Item) -> R, { #[inline(always)] - fn chunk( - values: &S, - out: &mut [MaybeUninit], - f: &mut F, - base: usize, - count: usize, - ) where + fn chunk(values: &S, out: &mut [MaybeUninit], f: &mut F, base: usize, count: usize) + where S: IndexedSource, F: FnMut(S::Item) -> R, { @@ -460,9 +547,120 @@ where cold_scan(values, base, chunk_len, |_bit_idx, v| f(v).is_none()) } +/// In-place variant of [`map_no_validity`]. Each lane is replaced with `f(values[i])`. +/// The source `S` must be writable (an [`IndexedSink`]). +/// +/// The closure reads `S::Item` and returns `S::Write`. 
For the common case +/// `S = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and write +/// types can differ (e.g. read `f32`, write `u32`) over the same backing memory +/// when sizes and alignments match. +/// +/// As with [`map_no_validity`], use this only when the input is known +/// non-nullable. +#[inline] +pub fn map_no_validity_in_place(mut values: S, mut f: F) +where + S: IndexedSink, + F: FnMut(S::Item) -> S::Write, +{ + #[inline(always)] + fn chunk(values: &mut S, f: &mut F, base: usize, count: usize) + where + S: IndexedSink, + F: FnMut(S::Item) -> S::Write, + { + for bit_idx in 0..count { + let i = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. + let v = unsafe { values.get_unchecked(i) }; + let r = f(v); + // SAFETY: caller guarantees base + count <= len. + unsafe { values.set_unchecked(i, r) }; + } + } + + let len = values.len(); + let chunks_count = len / 64; + let remainder = len % 64; + + for chunk_idx in 0..chunks_count { + chunk(&mut values, &mut f, chunk_idx * 64, 64); + } + if remainder != 0 { + chunk(&mut values, &mut f, chunks_count * 64, remainder); + } +} + +/// In-place variant of [`try_map_no_validity`]. Each lane is replaced with +/// `f(values[i])`, or `S::Write::default()` when `f` returns `None`. On failure +/// returns `Err(first_failing_lane)`; the buffer state on `Err` is unspecified. +/// +/// As with [`try_map_no_validity`], use this only when the input is known +/// non-nullable — a `None` from `f` is treated as a failure regardless of any +/// upstream validity bitmap. +/// +/// ## Error attribution +/// +/// Per-lane `is_none()` flags are folded into `first_fail` via the same +/// branchless `min` scheme as [`try_map_with_mask_in_place`]. Cold replay +/// isn't viable here because the original input values have already been +/// overwritten by the time we'd attribute the failure. 
+#[inline] +#[allow(clippy::cast_possible_truncation)] +pub fn try_map_no_validity_in_place(mut values: S, mut f: F) -> Result<(), usize> +where + S: IndexedSink, + S::Write: Default, + F: FnMut(S::Item) -> Option, +{ + #[inline(always)] + #[allow(clippy::cast_possible_truncation)] + fn chunk(values: &mut S, base: usize, count: usize, f: &mut F) -> Option + where + S: IndexedSink, + S::Write: Default, + F: FnMut(S::Item) -> Option, + { + let mut first_fail: u32 = u32::MAX; + for bit_idx in 0..count { + let i = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v); + let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; + first_fail = first_fail.min(candidate); + let r = opt.unwrap_or_default(); + // SAFETY: caller guarantees base + count <= len. + unsafe { values.set_unchecked(i, r) }; + } + (first_fail != u32::MAX).then_some(first_fail) + } + + let len = values.len(); + let chunks_count = len / 64; + let remainder = len % 64; + + for chunk_idx in 0..chunks_count { + if let Some(failing) = chunk(&mut values, chunk_idx * 64, 64, &mut f) { + return Err(failing as usize); + } + } + if remainder != 0 + && let Some(failing) = chunk(&mut values, chunks_count * 64, remainder, &mut f) + { + return Err(failing as usize); + } + Ok(()) +} + /// In-place variant of [`map_with_mask`]. Each lane is replaced with /// `f(values[i], mask[i])`. The source `S` must be writable (an [`IndexedSink`]). /// +/// The closure reads `S::Item` and returns `S::Write`. For the common case +/// `S = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and write +/// types can differ (e.g. read `f32`, write `u32`) over the same backing +/// memory when sizes and alignments match. +/// /// # Panics /// /// Panics if `values.len() != mask.len()`. 
@@ -470,18 +668,13 @@ where pub fn map_with_mask_in_place(mut values: S, mask: &BitBuffer, mut f: F) where S: IndexedSink, - F: FnMut(S::Item, bool) -> S::Item, + F: FnMut(S::Item, bool) -> S::Write, { #[inline(always)] - fn chunk( - values: &mut S, - f: &mut F, - src_chunk: u64, - base: usize, - count: usize, - ) where + fn chunk(values: &mut S, f: &mut F, src_chunk: u64, base: usize, count: usize) + where S: IndexedSink, - F: FnMut(S::Item, bool) -> S::Item, + F: FnMut(S::Item, bool) -> S::Write, { for bit_idx in 0..count { let i = base + bit_idx; @@ -505,8 +698,11 @@ where } if remainder != 0 { chunk( - &mut values, &mut f, - chunks.remainder_bits(), chunks_count * 64, remainder, + &mut values, + &mut f, + chunks.remainder_bits(), + chunks_count * 64, + remainder, ); } } @@ -547,8 +743,8 @@ pub fn try_map_with_mask_in_place( ) -> Result<(), usize> where S: IndexedSink, - S::Item: Default, - F: FnMut(S::Item, bool) -> Option, + S::Write: Default, + F: FnMut(S::Item, bool) -> Option, { /// Returns `Some(first_failing_lane_index_as_u32)` if any lane in /// `[base, base+count)` failed (cast width-truncated since `i < 2^32` in any @@ -565,8 +761,8 @@ where ) -> Option where S: IndexedSink, - S::Item: Default, - F: FnMut(S::Item, bool) -> Option, + S::Write: Default, + F: FnMut(S::Item, bool) -> Option, { let mut first_fail: u32 = u32::MAX; for bit_idx in 0..count { @@ -595,137 +791,18 @@ where return Err(failing as usize); } } - if remainder != 0 { - if let Some(failing) = chunk( + if remainder != 0 + && let Some(failing) = chunk( &mut values, - chunks.remainder_bits(), chunks_count * 64, remainder, + chunks.remainder_bits(), + chunks_count * 64, + remainder, &mut f, - ) { - return Err(failing as usize); - } - } - Ok(()) -} - -/// Apply `f(value) -> bool` lane-by-lane, packing into `out` as `u64` words. -/// -/// This is the validity-free sibling of [`map_with_mask_to_bits`]. Use it when the -/// predicate is a pure function of the value (e.g. 
compare-to-constant on a primitive -/// buffer) and combine the validity bitmap in a separate pass — splitting the work -/// this way lets the value-compare loop autovectorize cleanly. -/// -/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word -/// beyond `len % 64` are written as `0`. -/// -/// # Panics -/// -/// Panics if `out.len() != values.len().div_ceil(64)`. -#[inline] -pub fn map_to_bits(values: S, out: &mut [u64], mut f: F) -where - S: IndexedSource, - F: FnMut(S::Item) -> bool, -{ - #[inline(always)] - fn chunk(values: &S, f: &mut F, base: usize, count: usize) -> u64 - where - S: IndexedSource, - F: FnMut(S::Item) -> bool, + ) { - let mut packed = 0u64; - for bit_idx in 0..count { - let i = base + bit_idx; - // SAFETY: caller guarantees base + count <= len. - let v = unsafe { values.get_unchecked(i) }; - packed |= (f(v) as u64) << bit_idx; - } - packed - } - - let len = values.len(); - assert_eq!( - out.len(), - len.div_ceil(64), - "out must have len.div_ceil(64) words", - ); - - let chunks_count = len / 64; - let remainder = len % 64; - - for chunk_idx in 0..chunks_count { - let packed = chunk(&values, &mut f, chunk_idx * 64, 64); - // SAFETY: chunk_idx < chunks_count <= out.len(). - unsafe { *out.get_unchecked_mut(chunk_idx) = packed }; - } - if remainder != 0 { - let packed = chunk(&values, &mut f, chunks_count * 64, remainder); - // SAFETY: chunks_count < out.len() because remainder != 0. - unsafe { *out.get_unchecked_mut(chunks_count) = packed }; - } -} - -/// Apply `f(value, valid) -> bool` lane-by-lane, packing into `out` as `u64` words. -/// -/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word -/// beyond `len % 64` are written as `0`. -/// -/// # Panics -/// -/// Panics if `values.len() != mask.len()` or `out.len() != values.len().div_ceil(64)`. 
-#[inline] -pub fn map_with_mask_to_bits(values: S, mask: &BitBuffer, out: &mut [u64], mut f: F) -where - S: IndexedSource, - F: FnMut(S::Item, bool) -> bool, -{ - #[inline(always)] - fn chunk( - values: &S, - f: &mut F, - src_chunk: u64, - base: usize, - count: usize, - ) -> u64 - where - S: IndexedSource, - F: FnMut(S::Item, bool) -> bool, - { - let mut packed = 0u64; - for bit_idx in 0..count { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: caller guarantees base + count <= len. - let v = unsafe { values.get_unchecked(i) }; - packed |= (f(v, bit) as u64) << bit_idx; - } - packed - } - - let len = values.len(); - assert_eq!(len, mask.len(), "values and mask must have the same length"); - assert_eq!( - out.len(), - len.div_ceil(64), - "out must have len.div_ceil(64) words", - ); - - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; - - for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - let packed = chunk(&values, &mut f, src_chunk, chunk_idx * 64, 64); - // SAFETY: chunk_idx < chunks_count <= out.len(). - unsafe { *out.get_unchecked_mut(chunk_idx) = packed }; - } - if remainder != 0 { - let packed = chunk( - &values, &mut f, - chunks.remainder_bits(), chunks_count * 64, remainder, - ); - // SAFETY: chunks_count < out.len() because remainder != 0. - unsafe { *out.get_unchecked_mut(chunks_count) = packed }; + return Err(failing as usize); } + Ok(()) } #[cfg(test)] @@ -842,58 +919,6 @@ mod tests { } } - #[test] - fn map_with_mask_to_bits_aligned() { - let values: Vec = (0..128).collect(); - let mask = BitBuffer::new_set(128); - let mut out = vec![0u64; 2]; - map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| { - valid && v % 2 == 0 - }); - // Even numbers in [0, 128) set, odd unset. 
- for word_idx in 0..2 { - let word = out[word_idx]; - for bit in 0..64 { - let i = word_idx * 64 + bit; - let expected = i % 2 == 0; - assert_eq!((word >> bit) & 1 == 1, expected, "lane {i}"); - } - } - } - - #[test] - fn map_with_mask_to_bits_partial_chunk() { - // 130 lanes — three u64 words, last word has only 2 valid bits. - let values: Vec = (0..130).collect(); - let mask = BitBuffer::new_set(130); - let mut out = vec![0u64; 130usize.div_ceil(64)]; - assert_eq!(out.len(), 3); - map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| { - valid && v >= 64 - }); - // Bits 64..128 set in word 1; bits 128..130 set in word 2. - assert_eq!(out[0], 0); - assert_eq!(out[1], u64::MAX); - assert_eq!(out[2], 0b11); - } - - #[test] - fn map_with_mask_to_bits_offset() { - let big = BitBuffer::new_set(256); - let sliced = big.slice(13..143); // offset=13, len=130 - assert_eq!(sliced.len(), 130); - let values: Vec = (0..130).map(|i| (i % 4) as u8).collect(); - let mut out = vec![0u64; 130usize.div_ceil(64)]; - map_with_mask_to_bits(values.as_slice(), &sliced, &mut out, |v, valid| { - valid && v == 0 - }); - for i in 0..130 { - let word = out[i / 64]; - let bit = (word >> (i % 64)) & 1 == 1; - assert_eq!(bit, i % 4 == 0, "lane {i}"); - } - } - #[test] fn try_map_with_mask_all_ok() { let values: Vec = (0..200).collect(); @@ -1242,82 +1267,54 @@ mod tests { } #[test] - fn try_map_with_mask_in_place_partial_chunk_success() { - let mut values: Vec = (0..130).collect(); + fn reinterpret_sink_same_width_f32_u32() { + // Read f32, write u32-bits in place. After transmuting the slice back to u32 we + // should see exactly the bit patterns the closure produced. 
+ let mut buf: Vec = (0..130).map(|i| i as f32).collect(); let mask = BitBuffer::new_set(130); - let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| Some(v + 1)); - assert!(res.is_ok()); - assert_eq!(values[0], 1); - assert_eq!(values[63], 64); - assert_eq!(values[64], 65); - assert_eq!(values[129], 130); - } - - #[test] - fn map_to_bits_aligned() { - let values: Vec = (0..128).collect(); - let mut out = vec![0u64; 2]; - map_to_bits(values.as_slice(), &mut out, |v| v % 2 == 0); - for word_idx in 0..2 { - for bit in 0..64 { - let i = word_idx * 64 + bit; - let expected = i % 2 == 0; - assert_eq!((out[word_idx] >> bit) & 1 == 1, expected, "lane {i}"); - } + try_map_with_mask_in_place( + ReinterpretSink::::new(buf.as_mut_slice()), + &mask, + |f, _valid| Some(f.to_bits().wrapping_add(1)), + ) + .unwrap(); + // SAFETY: same size + alignment for f32 and u32; every slot now holds a u32 written by + // the closure. + let as_u32: &[u32] = + unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const u32, buf.len()) }; + for (i, &got) in as_u32.iter().enumerate() { + assert_eq!(got, (i as f32).to_bits().wrapping_add(1), "lane {i}"); } } #[test] - fn map_to_bits_partial_chunk() { - let values: Vec = (0..130).collect(); - let mut out = vec![0u64; 130usize.div_ceil(64)]; - assert_eq!(out.len(), 3); - map_to_bits(values.as_slice(), &mut out, |v| v >= 64); - assert_eq!(out[0], 0); - assert_eq!(out[1], u64::MAX); - assert_eq!(out[2], 0b11); - } - - #[test] - fn map_to_bits_empty() { - let values: Vec = vec![]; - let mut out: Vec = vec![]; - map_to_bits(values.as_slice(), &mut out, |v| v > 0); - } - - #[test] - fn map_to_bits_matches_fused_with_all_valid_mask() { - // map_to_bits + AND with an all-true mask must equal map_with_mask_to_bits. - let values: Vec = (0..200).map(|i| i % 7).collect(); + fn reinterpret_sink_failure_reports_lane() { + // Closure fails at a specific lane; the kernel must report that lane index. 
+ let mut buf: Vec = (0..200).map(|i| i as f32).collect(); let mask = BitBuffer::new_set(200); - - let mut a = vec![0u64; 200usize.div_ceil(64)]; - map_with_mask_to_bits(values.as_slice(), &mask, &mut a, |v, valid| valid && v == 3); - - let mut b = vec![0u64; 200usize.div_ceil(64)]; - map_to_bits(values.as_slice(), &mut b, |v| v == 3); - - assert_eq!(a, b); + let res = try_map_with_mask_in_place( + ReinterpretSink::::new(buf.as_mut_slice()), + &mask, + |f, _valid| { + if f as u32 == 137 { + None + } else { + Some(f as u32) + } + }, + ); + assert_eq!(res, Err(137)); } #[test] - fn map_with_mask_to_bits_validity_kills_lane() { - // Even if predicate is true, null lanes should produce false. - let values: Vec = vec![1; 70]; - let mask = { - let mut m = BitBufferMut::with_capacity(70); - for i in 0..70 { - m.append(i >= 32); // first 32 lanes are null - } - m.freeze() - }; - let mut out = vec![0u64; 70usize.div_ceil(64)]; - map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| { - valid && v == 1 - }); - for i in 0..70 { - let bit = (out[i / 64] >> (i % 64)) & 1 == 1; - assert_eq!(bit, i >= 32, "lane {i}"); - } + fn try_map_with_mask_in_place_partial_chunk_success() { + let mut values: Vec = (0..130).collect(); + let mask = BitBuffer::new_set(130); + let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| Some(v + 1)); + assert!(res.is_ok()); + assert_eq!(values[0], 1); + assert_eq!(values[63], 64); + assert_eq!(values[64], 65); + assert_eq!(values[129], 130); } } From 6fd7fc1212305994fe4879d7063c8e7f24e6b0c2 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 14:59:01 +0100 Subject: [PATCH 09/21] f Signed-off-by: Joe Isaacs --- vortex-buffer/benches/cast_to_indexed.rs | 37 ------------------------ 1 file changed, 37 deletions(-) diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs index 848f50cd142..2751cdc8418 100644 --- a/vortex-buffer/benches/cast_to_indexed.rs +++ 
b/vortex-buffer/benches/cast_to_indexed.rs @@ -24,10 +24,8 @@ use vortex_buffer::BitBuffer; use vortex_buffer::BitBufferMut; use vortex_buffer::Buffer; use vortex_buffer::lane_ops_indexed::map_no_validity; -use vortex_buffer::lane_ops_indexed::map_to_bits; use vortex_buffer::lane_ops_indexed::map_with_mask; use vortex_buffer::lane_ops_indexed::map_with_mask_in_place; -use vortex_buffer::lane_ops_indexed::map_with_mask_to_bits; use vortex_buffer::lane_ops_indexed::try_map_no_validity; use vortex_buffer::lane_ops_indexed::try_map_with_mask; use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place; @@ -37,7 +35,6 @@ fn main() { } const SIZES: &[usize] = &[4_096, 65_536, 1_048_576]; -const U32_THRESHOLD: u32 = u32::MAX / 2; struct Fixture { values_u64: Buffer, @@ -284,40 +281,6 @@ fn try_map_with_mask_in_place_u32_checked_mul(bencher: Bencher, n: usize) { }); } -#[divan::bench(args = SIZES)] -fn map_to_bits_u32_threshold(bencher: Bencher, n: usize) { - let f = fixture(n); - - bencher - .with_inputs(|| (f.values_u32.clone(), vec![0; n.div_ceil(64)])) - .bench_values(|(values, mut out)| { - map_to_bits(values.as_slice(), out.as_mut_slice(), |v| { - v >= U32_THRESHOLD - }); - out - }); -} - -#[divan::bench(args = SIZES)] -fn map_with_mask_to_bits_u32_threshold(bencher: Bencher, n: usize) { - let f = fixture(n); - - bencher - .with_inputs(|| { - ( - f.values_u32.clone(), - f.mask.clone(), - vec![0; n.div_ceil(64)], - ) - }) - .bench_values(|(values, mask, mut out)| { - map_with_mask_to_bits(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| { - valid && v >= U32_THRESHOLD - }); - out - }); -} - // ----------------------------------------------------------------------------- // Arrow-rs baselines. Two: one widening (u16 → u32, always succeeds) and one // narrowing (u64 → u32, can fail). 
Each pairs with the cast variants above of From 72bca8b91ec4ad931fbf5d05755d9bf35435a146 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 15:16:53 +0100 Subject: [PATCH 10/21] f Signed-off-by: Joe Isaacs --- .../src/arrays/primitive/compute/cast.rs | 195 ++++----- vortex-buffer/benches/add_checked.rs | 9 +- vortex-buffer/benches/pack_vs_unpack.rs | 389 ------------------ 3 files changed, 90 insertions(+), 503 deletions(-) delete mode 100644 vortex-buffer/benches/pack_vs_unpack.rs diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs index ad0d1c8e399..8242b5845bd 100644 --- a/vortex-array/src/arrays/primitive/compute/cast.rs +++ b/vortex-array/src/arrays/primitive/compute/cast.rs @@ -1,17 +1,15 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::mem::align_of; -use std::mem::size_of; - use num_traits::AsPrimitive; use num_traits::NumCast; -use vortex_buffer::BitBuffer; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; use vortex_buffer::lane_ops_indexed::ReinterpretSink; use vortex_buffer::lane_ops_indexed::map_no_validity; +use vortex_buffer::lane_ops_indexed::map_no_validity_in_place; use vortex_buffer::lane_ops_indexed::try_map_no_validity; +use vortex_buffer::lane_ops_indexed::try_map_no_validity_in_place; use vortex_buffer::lane_ops_indexed::try_map_with_mask; use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place; use vortex_error::VortexResult; @@ -132,7 +130,6 @@ where F: NativePType + AsPrimitive, T: NativePType, { - let values = array.as_slice::(); let overflow = || { vortex_err!( Compute: "Cannot cast {} to {} — value exceeds target range", @@ -156,65 +153,105 @@ where let infallible = casts_losslessly_to(F::PTYPE, T::PTYPE) || cached_values_fit_in(array, &target_dtype) == Some(true); - // Same-bit-width in-place fast path: when F and T have the same byte width and the - // buffer is uniquely owned, 
mutate in place and transmute the wrapper. Saves the - // output allocation. Falls through to the out-of-place path when the buffer is shared - // (the common case under the current borrow-based kernel API). + let len = array.len(); + + // Same-bit-width in-place fast path: when F and T have the same byte width, try to take + // unique ownership of the buffer. If successful, each kernel call site below mutates in + // place via `ReinterpretSink` and transmutes the wrapper at the end, saving the output + // allocation. Falls back to the out-of-place path (borrowed slice + fresh buffer) when + // the buffer is shared — the common case under the current borrow-based kernel API. let same_bit_width = F::PTYPE.byte_width() == T::PTYPE.byte_width(); - if same_bit_width - && let Ok(buffer_mut) = array.into_owned().try_into_buffer_mut::() - { - return cast_buffer_in_place::(buffer_mut, array, new_validity, ctx, infallible); - } + let owned: Option> = if same_bit_width { + array.into_owned().try_into_buffer_mut::().ok() + } else { + None + }; + let values: &[F] = array.as_slice::(); if infallible { - let mut buffer = BufferMut::::with_capacity(values.len()); - // Truncating `as`-cast — safe here because stats prove every valid value fits. - // Null lanes' underlying garbage gets truncated/wrapped (harmless: the result - // validity bitmap masks them downstream). - map_no_validity( - values, - &mut buffer.spare_capacity_mut()[..values.len()], - |v| v.as_(), - ); - // SAFETY: map_no_validity initializes every lane. - unsafe { buffer.set_len(values.len()) }; - return Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array()); + // Truncating `as`-cast — safe here because static type analysis or cached stats prove + // every valid value fits. Null lanes' underlying garbage gets truncated/wrapped + // (harmless: the result validity bitmap masks them downstream). 
+ return match owned { + Some(mut buf) => { + map_no_validity_in_place( + ReinterpretSink::::new(buf.as_mut_slice()), + |v: F| v.as_(), + ); + // SAFETY: same size + alignment for NativePType same-byte-width pairs; + // every F-slot was overwritten with a real `T` bit pattern. + let result: BufferMut = unsafe { buf.transmute::() }; + Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array()) + } + None => { + let mut buffer = BufferMut::::with_capacity(len); + map_no_validity(values, &mut buffer.spare_capacity_mut()[..len], |v| v.as_()); + // SAFETY: map_no_validity initializes every lane. + unsafe { buffer.set_len(len) }; + Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array()) + } + }; } - let mask = array.validity()?.execute_mask(array.len(), ctx)?; + let mask = array.validity()?.execute_mask(len, ctx)?; - let buffer: Buffer = match &mask { - Mask::AllTrue(_) => { - let mut buffer = BufferMut::::with_capacity(values.len()); - try_map_no_validity( - values, - &mut buffer.spare_capacity_mut()[..values.len()], - |v| ::from(v), + let buffer: Buffer = match (&mask, owned) { + (Mask::AllTrue(_), Some(mut buf)) => { + try_map_no_validity_in_place( + ReinterpretSink::::new(buf.as_mut_slice()), + |v: F| ::from(v), ) .map_err(|_| overflow())?; + // SAFETY: same size + alignment for NativePType same-byte-width pairs; + // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`. + let result: BufferMut = unsafe { buf.transmute::() }; + result.freeze() + } + (Mask::AllTrue(_), None) => { + let mut buffer = BufferMut::::with_capacity(len); + try_map_no_validity(values, &mut buffer.spare_capacity_mut()[..len], |v| { + ::from(v) + }) + .map_err(|_| overflow())?; // SAFETY: try_map_no_validity returned Ok, so it initialized every lane. 
- unsafe { buffer.set_len(values.len()) }; + unsafe { buffer.set_len(len) }; buffer.freeze() } - Mask::AllFalse(_) => BufferMut::::zeroed(values.len()).freeze(), - Mask::Values(m) => { - let mut buffer = BufferMut::::with_capacity(values.len()); + (Mask::AllFalse(_), Some(buf)) => { + // SAFETY: same size + alignment by NativePType same-byte-width invariant. + let mut t_buf: BufferMut = unsafe { buf.transmute::() }; + t_buf.as_mut_slice().fill(T::zero()); + t_buf.freeze() + } + (Mask::AllFalse(_), None) => BufferMut::::zeroed(len).freeze(), + (Mask::Values(m), Some(mut buf)) => { + try_map_with_mask_in_place( + ReinterpretSink::::new(buf.as_mut_slice()), + m.bit_buffer(), + |v: F, valid| ::from(v).or_else(|| (!valid).then(T::zero)), + ) + .map_err(|_| overflow())?; + // SAFETY: same size + alignment for NativePType same-byte-width pairs; + // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`. + let result: BufferMut = unsafe { buf.transmute::() }; + result.freeze() + } + (Mask::Values(m), None) => { + let mut buffer = BufferMut::::with_capacity(len); try_map_with_mask( values, m.bit_buffer(), - &mut buffer.spare_capacity_mut()[..values.len()], - // Lazy validity: only consult `valid` on the failure branch. For - // widening / statically-infallible casts, `NumCast::from` is always - // `Some` so the `or_else` is provably dead — LLVM DCEs the validity - // path entirely, giving the same codegen as the maskless kernel. - // For narrowing, `valid` is only read at lanes that actually - // overflowed (a cold check on top of the cast). + &mut buffer.spare_capacity_mut()[..len], + // Lazy validity: only consult `valid` on the failure branch. For widening / + // statically-infallible casts, `NumCast::from` is always `Some` so the + // `or_else` is provably dead — LLVM DCEs the validity path entirely, giving + // the same codegen as the maskless kernel. 
For narrowing, `valid` is only + // read at lanes that actually overflowed (a cold check on top of the cast). |v, valid| ::from(v).or_else(|| (!valid).then(T::zero)), ) .map_err(|_| overflow())?; // SAFETY: try_map_with_mask returned Ok, so it initialized every lane. - unsafe { buffer.set_len(values.len()) }; + unsafe { buffer.set_len(len) }; buffer.freeze() } }; @@ -222,72 +259,6 @@ where Ok(PrimitiveArray::new(buffer, new_validity).into_array()) } -/// In-place cast of an owned `BufferMut` to `BufferMut` when `F` and `T` have the -/// same byte width. Each slot is read as `F`, converted, and written back as `T`-bits -/// using `BufferMut`'s transmute family. Avoids allocating a second output buffer. -/// -/// The caller has already verified `F::PTYPE.byte_width() == T::PTYPE.byte_width()`. -fn cast_buffer_in_place( - buffer: BufferMut, - array: ArrayView<'_, Primitive>, - new_validity: Validity, - ctx: &mut ExecutionCtx, - infallible: bool, -) -> VortexResult -where - F: NativePType + AsPrimitive, - T: NativePType, -{ - debug_assert_eq!(size_of::(), size_of::()); - debug_assert_eq!(align_of::(), align_of::()); - - if infallible { - // `map_each_in_place` does the BufferMut → BufferMut transmute internally - // (same size + alignment for primitives of equal byte width) and walks each slot - // with the closure. - let result: BufferMut = buffer.map_each_in_place(|v: F| v.as_()); - return Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array()); - } - - let mask = array.validity()?.execute_mask(array.len(), ctx)?; - let overflow = || { - vortex_err!( - Compute: "Cannot cast {} to {} — value exceeds target range", - F::PTYPE, T::PTYPE, - ) - }; - - // All-null short-circuit: zero out the buffer and skip the conversion loop entirely. - if matches!(mask, Mask::AllFalse(_)) { - // SAFETY: same size + alignment by NativePType same-byte-width invariant. 
- let mut t_buf: BufferMut = unsafe { buffer.transmute::() }; - t_buf.as_mut_slice().fill(T::zero()); - return Ok(PrimitiveArray::new(t_buf.freeze(), new_validity).into_array()); - } - - let bit_buffer = match &mask { - Mask::AllTrue(n) => BitBuffer::new_set(*n), - Mask::AllFalse(_) => unreachable!("handled above"), - Mask::Values(m) => m.bit_buffer().clone(), - }; - - let mut buffer = buffer; - try_map_with_mask_in_place( - ReinterpretSink::::new(buffer.as_mut_slice()), - &bit_buffer, - |f_val: F, valid| -> Option { - ::from(f_val).or_else(|| (!valid).then(T::zero)) - }, - ) - .map_err(|_| overflow())?; - - // SAFETY: same size + alignment for NativePType same-byte-width pairs. Every F-slot - // now holds a valid T-bit pattern because `ReinterpretSink::set_unchecked` wrote a - // real `T` at every visited lane. - let result: BufferMut = unsafe { buffer.transmute::() }; - Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array()) -} - fn reinterpret( array: ArrayView<'_, Primitive>, new_ptype: PType, diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs index df857922d6f..ff4f5f64e9a 100644 --- a/vortex-buffer/benches/add_checked.rs +++ b/vortex-buffer/benches/add_checked.rs @@ -513,8 +513,13 @@ fn premask_then_simd(bencher: Bencher, n: usize) { .bench_refs(|(lhs, rhs, lm, rm)| { let combined = lm as &BitBuffer & rm as &BitBuffer; let mut out = alloc_out(n); - handrolled_premask(lhs.as_slice(), rhs.as_slice(), &combined, out.as_mut_slice()) - .unwrap(); + handrolled_premask( + lhs.as_slice(), + rhs.as_slice(), + &combined, + out.as_mut_slice(), + ) + .unwrap(); (combined, out) }); } diff --git a/vortex-buffer/benches/pack_vs_unpack.rs b/vortex-buffer/benches/pack_vs_unpack.rs deleted file mode 100644 index 0ae41fb5573..00000000000 --- a/vortex-buffer/benches/pack_vs_unpack.rs +++ /dev/null @@ -1,389 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! 
Compare two strategies for handling validity in `try_map_with_mask`: -//! -//! 1. **Unpack the mask** — closure consults `valid` per-lane. Null lanes are -//! short-circuited inside the closure (return `Some(default)` immediately), -//! so the checked operation never runs with garbage. The kernel still does -//! its `fail_bits & src_chunk` post-filter, but it's a no-op because the -//! closure already produced `Some` at null lanes. -//! -//! 2. **Pack and filter** — closure ignores `_valid`. The checked operation -//! runs at every lane, including null lanes (where it may produce `None` -//! on garbage). The kernel's post-loop `fail_bits & src_chunk` filter -//! drops those null-lane fails. LLVM DCEs the per-lane mask extract since -//! the closure doesn't consult `valid`. -//! -//! Two ops × two strategies = four vortex benches, plus arrow baselines. -//! -//! - `widen_u16_u32_*` — statically-infallible widening cast. `NumCast::from` -//! always returns `Some`; LLVM proves it and strips fail-tracking entirely. -//! - `checked_add_u32_*` — genuinely fallible: `u32 + u32` can overflow. 
- -#![expect(clippy::unwrap_used)] - -use std::mem::MaybeUninit; -use std::sync::Arc; - -use arrow_arith::numeric::add; -use arrow_array::Datum; -use arrow_array::UInt16Array; -use arrow_array::UInt32Array; -use arrow_buffer::NullBuffer; -use arrow_buffer::ScalarBuffer; -use arrow_cast::CastOptions; -use arrow_cast::cast_with_options; -use arrow_schema::DataType; -use divan::Bencher; -use num_traits::NumCast; -use rand::SeedableRng; -use rand::prelude::*; -use rand::rngs::StdRng; -use vortex_buffer::BitBuffer; -use vortex_buffer::BitBufferMut; -use vortex_buffer::Buffer; -use vortex_buffer::lane_ops_indexed::LaneZip; -use vortex_buffer::lane_ops_indexed::try_map_with_mask; - -fn main() { - divan::main(); -} - -const SIZES: &[usize] = &[4_096, 65_536, 1_048_576]; - -struct Fixture { - values_u16: Buffer, - lhs_u32: Buffer, - rhs_u32: Buffer, - mask: BitBuffer, - arrow_u16: UInt16Array, - arrow_lhs: Arc, - arrow_rhs: Arc, -} - -fn fixture(n: usize) -> Fixture { - let mut rng = StdRng::seed_from_u64(0xC0DE_BEEF); - // Bounded so `u16 + u16` (as u32) and `u32 + u32` never overflow u32. - // Both strategies succeed; we measure success-path perf. 
- let raw_lhs: Vec = (0..n) - .map(|_| rng.random_range(0..(u32::MAX / 2))) - .collect(); - let raw_rhs: Vec = (0..n) - .map(|_| rng.random_range(0..(u32::MAX / 2))) - .collect(); - let raw_valid: Vec = (0..n).map(|_| rng.random_bool(0.8)).collect(); - - #[expect(clippy::cast_possible_truncation)] - let values_u16: Buffer = raw_lhs.iter().map(|&v| v as u16).collect(); - let lhs_u32: Buffer = raw_lhs.iter().copied().collect(); - let rhs_u32: Buffer = raw_rhs.iter().copied().collect(); - - let mask = { - let mut m = BitBufferMut::with_capacity(n); - for &v in &raw_valid { - m.append(v); - } - m.freeze() - }; - - #[expect(clippy::cast_possible_truncation)] - let arrow_u16 = UInt16Array::new( - ScalarBuffer::from(raw_lhs.iter().map(|&v| v as u16).collect::>()), - Some(NullBuffer::from(raw_valid.clone())), - ); - let arrow_lhs = Arc::new(UInt32Array::new( - ScalarBuffer::from(raw_lhs), - Some(NullBuffer::from(raw_valid.clone())), - )); - let arrow_rhs = Arc::new(UInt32Array::new( - ScalarBuffer::from(raw_rhs), - Some(NullBuffer::from(raw_valid)), - )); - - Fixture { - values_u16, - lhs_u32, - rhs_u32, - mask, - arrow_u16, - arrow_lhs, - arrow_rhs, - } -} - -fn uninit_out(n: usize) -> Vec> { - let mut out = Vec::with_capacity(n); - // SAFETY: a `MaybeUninit` does not require initialization. - unsafe { out.set_len(n) }; - out -} - -const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions { - safe: false, - format_options: arrow_cast::display::FormatOptions::new(), -}; - -// ----------------------------------------------------------------------------- -// Widening cast u16 → u32 (statically infallible). NumCast::from never returns -// None for widening, so the failure path is dead in both strategies. -// ----------------------------------------------------------------------------- - -/// Strategy 1 (unpack mask): closure consults `valid`, short-circuits at null -/// lanes. For widening the short-circuit is dead anyway (no failure possible). 
-#[divan::bench(args = SIZES)] -fn widen_u16_u32_unpack_mask(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) - .bench_values(|(values, mask, mut out)| { - try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| { - if !valid { - return Some(0u32); - } - ::from(v) - }) - .unwrap(); - out - }); -} - -/// Strategy 2 (pack and filter): closure ignores `_valid`. LLVM DCEs the -/// per-lane mask extract; post-loop `& src_chunk` would filter null-lane fails -/// (none happen for widening). -#[divan::bench(args = SIZES)] -fn widen_u16_u32_pack_and_filter(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) - .bench_values(|(values, mask, mut out)| { - try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| { - ::from(v) - }) - .unwrap(); - out - }); -} - -#[divan::bench(args = SIZES)] -fn widen_u16_u32_arrow(bencher: Bencher, _n: usize) { - let f = fixture(_n); - bencher - .with_inputs(|| f.arrow_u16.clone()) - .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap()); -} - -// ----------------------------------------------------------------------------- -// Checked add u32 + u32 → u32 (genuinely fallible). LaneZip(lhs, rhs) drives -// two-input lanewise. -// ----------------------------------------------------------------------------- - -/// Strategy 1 (unpack mask): closure short-circuits null lanes; `checked_add` -/// only runs at valid lanes. 
-#[divan::bench(args = SIZES)] -fn checked_add_u32_unpack_mask(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - ( - f.lhs_u32.clone(), - f.rhs_u32.clone(), - f.mask.clone(), - uninit_out::(n), - ) - }) - .bench_values(|(lhs, rhs, mask, mut out)| { - try_map_with_mask( - LaneZip::new(lhs.as_slice(), rhs.as_slice()), - &mask, - out.as_mut_slice(), - |(a, b), valid| { - if !valid { - return Some(0u32); - } - a.checked_add(b) - }, - ) - .unwrap(); - out - }); -} - -/// Strategy 2 (pack and filter): `checked_add` runs at every lane (including -/// null lanes with garbage values); kernel's `fail_bits & src_chunk` post-filter -/// drops any null-lane fails. -#[divan::bench(args = SIZES)] -fn checked_add_u32_pack_and_filter(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - ( - f.lhs_u32.clone(), - f.rhs_u32.clone(), - f.mask.clone(), - uninit_out::(n), - ) - }) - .bench_values(|(lhs, rhs, mask, mut out)| { - try_map_with_mask( - LaneZip::new(lhs.as_slice(), rhs.as_slice()), - &mask, - out.as_mut_slice(), - |(a, b), _valid| a.checked_add(b), - ) - .unwrap(); - out - }); -} - -// Asm-extraction helpers: `#[unsafe(no_mangle)] #[inline(never)]` so a single -// `cargo rustc --emit=asm` produces clearly-labeled symbols to diff. - -#[unsafe(no_mangle)] -#[inline(never)] -pub fn asm_add_unpack_branchy( - lhs: &[u32], - rhs: &[u32], - mask: &BitBuffer, - out: &mut [MaybeUninit], -) -> Result<(), usize> { - try_map_with_mask( - LaneZip::new(lhs, rhs), - mask, - out, - |(a, b), valid| { - if !valid { - return Some(0u32); - } - a.checked_add(b) - }, - ) -} - -#[unsafe(no_mangle)] -#[inline(never)] -pub fn asm_add_unpack_branchless( - lhs: &[u32], - rhs: &[u32], - mask: &BitBuffer, - out: &mut [MaybeUninit], -) -> Result<(), usize> { - try_map_with_mask( - LaneZip::new(lhs, rhs), - mask, - out, - |(a, b), valid| { - // Compute first, then select. No early-return; LLVM may if-convert. 
- let r = a.checked_add(b); - if valid { r } else { Some(0u32) } - }, - ) -} - -#[unsafe(no_mangle)] -#[inline(never)] -pub fn asm_add_unpack_multiply( - lhs: &[u32], - rhs: &[u32], - mask: &BitBuffer, - out: &mut [MaybeUninit], -) -> Result<(), usize> { - try_map_with_mask( - LaneZip::new(lhs, rhs), - mask, - out, - |(a, b), valid| { - // Neutralize null lanes via multiply (BIC); checked_add runs unconditionally. - let m = valid as u32; - (a * m).checked_add(b * m) - }, - ) -} - -#[unsafe(no_mangle)] -#[inline(never)] -pub fn asm_add_pack_filter( - lhs: &[u32], - rhs: &[u32], - mask: &BitBuffer, - out: &mut [MaybeUninit], -) -> Result<(), usize> { - try_map_with_mask( - LaneZip::new(lhs, rhs), - mask, - out, - |(a, b), _valid| a.checked_add(b), - ) -} - -/// Branchless-multiply variant of unpack_mask: scale lhs/rhs by `valid as u32` so -/// the checked op runs at every lane (with zeros at null lanes — never overflows) -/// and the kernel's post-loop `& src_chunk` filter still applies. -#[divan::bench(args = SIZES)] -fn checked_add_u32_unpack_multiply(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - ( - f.lhs_u32.clone(), - f.rhs_u32.clone(), - f.mask.clone(), - uninit_out::(n), - ) - }) - .bench_values(|(lhs, rhs, mask, mut out)| { - try_map_with_mask( - LaneZip::new(lhs.as_slice(), rhs.as_slice()), - &mask, - out.as_mut_slice(), - |(a, b), valid| { - let m = valid as u32; - (a * m).checked_add(b * m) - }, - ) - .unwrap(); - out - }); -} - -/// Compute-first-then-select variant of unpack_mask: removes the early `return`, -/// keeps the `valid` consult per-lane. Tests whether LLVM if-converts when both -/// branches are pure expressions. 
-#[divan::bench(args = SIZES)] -fn checked_add_u32_unpack_branchless(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - ( - f.lhs_u32.clone(), - f.rhs_u32.clone(), - f.mask.clone(), - uninit_out::(n), - ) - }) - .bench_values(|(lhs, rhs, mask, mut out)| { - try_map_with_mask( - LaneZip::new(lhs.as_slice(), rhs.as_slice()), - &mask, - out.as_mut_slice(), - |(a, b), valid| { - let r = a.checked_add(b); - if valid { r } else { Some(0u32) } - }, - ) - .unwrap(); - out - }); -} - -#[divan::bench(args = SIZES)] -fn checked_add_u32_arrow(bencher: Bencher, _n: usize) { - let f = fixture(_n); - bencher - .with_inputs(|| (f.arrow_lhs.clone(), f.arrow_rhs.clone())) - .bench_refs(|(lhs, rhs)| { - let lhs_datum: &dyn Datum = lhs.as_ref(); - let rhs_datum: &dyn Datum = rhs.as_ref(); - add(lhs_datum, rhs_datum).unwrap() - }); -} From fe34ccbe7ffd4021d550907bd6bf85755970567f Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 15:44:41 +0100 Subject: [PATCH 11/21] f Signed-off-by: Joe Isaacs --- vortex-buffer/Cargo.toml | 4 - vortex-buffer/benches/add_checked.rs | 521 ++++----------------------- 2 files changed, 76 insertions(+), 449 deletions(-) diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml index 048d2612364..882de199818 100644 --- a/vortex-buffer/Cargo.toml +++ b/vortex-buffer/Cargo.toml @@ -62,7 +62,3 @@ harness = false [[bench]] name = "add_checked" harness = false - -[[bench]] -name = "pack_vs_unpack" -harness = false diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs index ff4f5f64e9a..2d4db4959e7 100644 --- a/vortex-buffer/benches/add_checked.rs +++ b/vortex-buffer/benches/add_checked.rs @@ -1,40 +1,28 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Checked `u32 + u32 -> u32` over two nullable columns — exhaustive variant -//! comparison. +//! Checked `u32 + u32 -> u32` over two nullable columns. //! -//! 
Variants differ along three axes: +//! Two implementations: //! -//! 1. **Closure suppression strategy** — how the closure (if any) handles null lanes -//! - `value_only`: `|(a,b), _|` ignores validity -//! - `if_else`: `|(a,b), valid| if valid { ... } else { Some(default) }` -//! - `or_else`: `|(a,b), valid| ....or_else(|| (!valid).then(...))` -//! - `mul_trick`: `(a * valid as u32).checked_add(b * valid as u32)` +//! - [`bitpack_value_only`] — production path via [`try_map_with_mask`] with a +//! value-only closure. Per-lane `is_none()` flags are bit-packed and AND-ed +//! with the chunk validity word so null-lane overflow is filtered without +//! the closure ever inspecting `valid`. +//! - [`premask_then_simd`] — hand-rolled ceiling. Bit-broadcasts each mask bit +//! to `0x00000000`/`0xFFFFFFFF`, ANDs into both operands (null lanes become +//! `0+0`), then unconditional `overflowing_add` with a per-chunk OR-reduced +//! `fail_acc` and cold scalar attribution. Same pattern that beat arrow on +//! the primitive cast bench (37 µs vs 55 µs). //! -//! 2. **Fail tracking scheme** -//! - bit-pack: `fail_bits |= (is_none << bit_idx)`; chunk-AND with mask -//! - boolean: `fail_acc |= is_none as u64`; cold replay attribution -//! -//! 3. **Validity application** -//! - in closure: closure consumes `valid` -//! - post-mask: kernel ANDs fail bitmap with `src_chunk` -//! - pre-mask: kernel zeros null-lane values via bit-broadcast before SIMD add -//! - none: ignore validity (ceiling only — not correct for real inputs) -//! -//! All correctness-preserving variants are verified via [`assert_overflow_parity`] -//! and [`assert_null_overflow_suppressed`] at startup. The `pure_simd_no_validity` -//! variant is benched as a ceiling only — it does not respect nullability. +//! Both are verified at startup via [`assert_overflow_parity`] (valid-lane +//! overflow propagates as `Err`) and [`assert_null_overflow_suppressed`] +//! (null-lane overflow does not). 
#![expect(clippy::unwrap_used)] use std::mem::MaybeUninit; -use std::sync::Arc; -use arrow_array::Datum; -use arrow_array::UInt32Array; -use arrow_buffer::NullBuffer; -use arrow_buffer::ScalarBuffer; use divan::Bencher; use rand::SeedableRng; use rand::prelude::*; @@ -47,7 +35,6 @@ use vortex_buffer::lane_ops_indexed::try_map_with_mask; fn main() { assert_overflow_parity(); assert_null_overflow_suppressed(); - assert_pure_simd_errs_on_realistic_data(); divan::main(); } @@ -56,19 +43,13 @@ const LHS_VALID_RATE: f64 = 0.7; const RHS_VALID_RATE: f64 = 0.8; struct Fixture { - /// **Realistic** lhs: valid lanes bounded, null lanes `u32::MAX`. - /// A kernel that ignores validity will see overflow at null lanes. + /// Valid lanes carry bounded values; null lanes hold `u32::MAX` so a kernel + /// that ignores validity would Err on them. Both implementations under test + /// must suppress that. lhs: Buffer, rhs: Buffer, - /// **Sanitized** lhs: valid lanes bounded, null lanes pre-zeroed. - /// Used by `pure_simd_no_validity_sanitized` only — its precondition is - /// "someone already zeroed the nulls." - lhs_sanitized: Buffer, - rhs_sanitized: Buffer, lhs_mask: BitBuffer, rhs_mask: BitBuffer, - lhs_arrow: Arc, - rhs_arrow: Arc, } fn fixture(n: usize) -> Fixture { @@ -80,11 +61,7 @@ fn fixture(n: usize) -> Fixture { let lhs_valid: Vec = (0..n).map(|_| lvr.random_bool(LHS_VALID_RATE)).collect(); let rhs_valid: Vec = (0..n).map(|_| rvr.random_bool(RHS_VALID_RATE)).collect(); - // **Realistic null storage**: null lanes contain u32::MAX. Adding two such - // values overflows — a kernel that ignores validity will spuriously Err. - // Valid lanes carry bounded values so the success path is measured at lanes - // where overflow shouldn't fire. 
- let raw_lhs: Vec = (0..n) + let lhs: Buffer = (0..n) .map(|i| { if lhs_valid[i] { lhs_rng.random_range(0..u16::MAX as u32) @@ -93,7 +70,7 @@ fn fixture(n: usize) -> Fixture { } }) .collect(); - let raw_rhs: Vec = (0..n) + let rhs: Buffer = (0..n) .map(|i| { if rhs_valid[i] { rhs_rng.random_range(0..u16::MAX as u32) @@ -103,16 +80,6 @@ fn fixture(n: usize) -> Fixture { }) .collect(); - let lhs: Buffer = raw_lhs.iter().copied().collect(); - let rhs: Buffer = raw_rhs.iter().copied().collect(); - - let lhs_sanitized: Buffer = (0..n) - .map(|i| if lhs_valid[i] { raw_lhs[i] } else { 0 }) - .collect(); - let rhs_sanitized: Buffer = (0..n) - .map(|i| if rhs_valid[i] { raw_rhs[i] } else { 0 }) - .collect(); - let lhs_mask = { let mut m = BitBufferMut::with_capacity(n); for &v in &lhs_valid { @@ -128,24 +95,11 @@ fn fixture(n: usize) -> Fixture { m.freeze() }; - let lhs_arrow = Arc::new(UInt32Array::new( - ScalarBuffer::from(raw_lhs), - Some(NullBuffer::from(lhs_valid)), - )); - let rhs_arrow = Arc::new(UInt32Array::new( - ScalarBuffer::from(raw_rhs), - Some(NullBuffer::from(rhs_valid)), - )); - Fixture { lhs, rhs, - lhs_sanitized, - rhs_sanitized, lhs_mask, rhs_mask, - lhs_arrow, - rhs_arrow, } } @@ -157,25 +111,7 @@ fn alloc_out(n: usize) -> Vec> { } // --------------------------------------------------------------------------- -// Variant 0: arrow_arith::numeric::add — baseline -// --------------------------------------------------------------------------- - -#[divan::bench(args = SIZES)] -fn arrow_add(bencher: Bencher, n: usize) { - let _ = n; - let f = fixture(n); - bencher - .with_inputs(|| (f.lhs_arrow.clone(), f.rhs_arrow.clone())) - .bench_refs(|(lhs, rhs)| { - arrow_arith::numeric::add(lhs.as_ref() as &dyn Datum, rhs.as_ref() as &dyn Datum) - .unwrap() - }); -} - -// --------------------------------------------------------------------------- -// Variant 1: try_map_with_mask + closure `|(a, b), _|` (value-only) -// Fail tracking: bit-pack via the kernel. 
-// LLVM DCEs per-lane mask extract. +// bitpack_value_only — production path via try_map_with_mask. // --------------------------------------------------------------------------- #[divan::bench(args = SIZES)] @@ -205,228 +141,65 @@ fn bitpack_value_only(bencher: Bencher, n: usize) { } // --------------------------------------------------------------------------- -// Variant 2: try_map_with_mask + closure `|(a, b), valid|` with if-else -// Fail tracking: bit-pack via the kernel. -// Closure explicitly suppresses null-lane fails (redundant with bit-pack filter). -// --------------------------------------------------------------------------- - -#[divan::bench(args = SIZES)] -fn bitpack_closure_suppresses_if_else(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - ( - f.lhs.clone(), - f.rhs.clone(), - f.lhs_mask.clone(), - f.rhs_mask.clone(), - ) - }) - .bench_refs(|(lhs, rhs, lm, rm)| { - let combined = lm as &BitBuffer & rm as &BitBuffer; - let mut out = alloc_out(n); - try_map_with_mask( - LaneZip::new(lhs.as_slice(), rhs.as_slice()), - &combined, - out.as_mut_slice(), - |(a, b), valid| { - if valid { a.checked_add(b) } else { Some(0) } - }, - ) - .unwrap(); - (combined, out) - }); -} - -// --------------------------------------------------------------------------- -// Variant 3: try_map_with_mask + closure `.or_else(|| (!valid).then(...))` -// Fail tracking: bit-pack via the kernel. -// Lazy suppression: closure only consults `valid` when overflow actually fires. 
-// --------------------------------------------------------------------------- - -#[divan::bench(args = SIZES)] -fn bitpack_closure_suppresses_or_else(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - ( - f.lhs.clone(), - f.rhs.clone(), - f.lhs_mask.clone(), - f.rhs_mask.clone(), - ) - }) - .bench_refs(|(lhs, rhs, lm, rm)| { - let combined = lm as &BitBuffer & rm as &BitBuffer; - let mut out = alloc_out(n); - try_map_with_mask( - LaneZip::new(lhs.as_slice(), rhs.as_slice()), - &combined, - out.as_mut_slice(), - |(a, b), valid| a.checked_add(b).or_else(|| (!valid).then_some(0)), - ) - .unwrap(); - (combined, out) - }); -} - -// --------------------------------------------------------------------------- -// Variant 4: try_map_with_mask + closure with `(a * valid).checked_add(b * valid)` -// Fail tracking: bit-pack via the kernel. -// The multiply-by-valid trick zeroes null-lane operands so they can't overflow. -// --------------------------------------------------------------------------- - -#[divan::bench(args = SIZES)] -fn bitpack_closure_mul_trick(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - ( - f.lhs.clone(), - f.rhs.clone(), - f.lhs_mask.clone(), - f.rhs_mask.clone(), - ) - }) - .bench_refs(|(lhs, rhs, lm, rm)| { - let combined = lm as &BitBuffer & rm as &BitBuffer; - let mut out = alloc_out(n); - try_map_with_mask( - LaneZip::new(lhs.as_slice(), rhs.as_slice()), - &combined, - out.as_mut_slice(), - |(a, b), valid| { - let m = valid as u32; - (a * m).checked_add(b * m) - }, - ) - .unwrap(); - (combined, out) - }); -} - -// --------------------------------------------------------------------------- -// Variant 5: hand-rolled, boolean fail_acc, closure suppresses nulls, cold replay +// premask_then_simd — hand-rolled ceiling. // --------------------------------------------------------------------------- -/// Hand-rolled kernel: boolean `fail_acc`, cold replay attribution. 
-/// Closure is expected to suppress null-lane fails by returning `Some(...)`; -/// `fail_acc` only fires for real valid-lane overflows. #[inline] -fn handrolled_boolean( +fn handrolled_premask( lhs: &[u32], rhs: &[u32], mask: &BitBuffer, out: &mut [MaybeUninit], - mut f: F, -) -> Result<(), usize> -where - F: FnMut(u32, u32, bool) -> Option, -{ - let len = lhs.len(); - assert_eq!(len, rhs.len()); - assert_eq!(len, mask.len()); - assert_eq!(len, out.len()); - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; - - for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - let base = chunk_idx * 64; +) -> Result<(), usize> { + /// Per-chunk hot loop. Bit-broadcasts each validity bit to 0x00 / 0xFF, + /// ANDs both operands, then `overflowing_add`. Returns true if any lane in + /// `[base, base+count)` overflowed. `#[inline(always)]` keeps the literal + /// `64` at the full-chunk call site for const propagation. + #[inline(always)] + fn chunk( + lhs: &[u32], + rhs: &[u32], + out: &mut [MaybeUninit], + src_chunk: u64, + base: usize, + count: usize, + ) -> bool { let mut fail_acc: u64 = 0; - for bit_idx in 0..64 { + for bit_idx in 0..count { let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: i < len. - let a = unsafe { *lhs.get_unchecked(i) }; - let b = unsafe { *rhs.get_unchecked(i) }; - let opt = f(a, b, bit); - fail_acc |= opt.is_none() as u64; - unsafe { out.get_unchecked_mut(i).write(opt.unwrap_or_default()) }; - } - if fail_acc != 0 { - // Cold: find first failing lane (closure already suppressed nulls). - for bit_idx in 0..64 { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - let a = unsafe { *lhs.get_unchecked(i) }; - let b = unsafe { *rhs.get_unchecked(i) }; - if f(a, b, bit).is_none() { - return Err(i); - } - } + let lane_mask = (((src_chunk >> bit_idx) & 1) as u32).wrapping_neg(); + // SAFETY: caller guarantees base + count <= len. 
+ let a = unsafe { *lhs.get_unchecked(i) } & lane_mask; + let b = unsafe { *rhs.get_unchecked(i) } & lane_mask; + let (sum, overflow) = a.overflowing_add(b); + fail_acc |= overflow as u64; + // SAFETY: caller guarantees base + count <= len. + unsafe { out.get_unchecked_mut(i).write(sum) }; } + fail_acc != 0 } - if remainder != 0 { - let src_chunk = chunks.remainder_bits(); - let base = chunks_count * 64; - let mut fail_acc: u64 = 0; - for bit_idx in 0..remainder { + /// Cold attribution. Walks the chunk on raw (unmasked) operands and reports + /// the first valid lane that overflows. Null lanes were premasked to `0+0` + /// in the hot loop so they cannot contribute here. + #[cold] + #[inline(never)] + fn attribute(lhs: &[u32], rhs: &[u32], src_chunk: u64, base: usize, count: usize) -> usize { + for bit_idx in 0..count { + if (src_chunk >> bit_idx) & 1 == 0 { + continue; + } let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees base + count <= len. 
let a = unsafe { *lhs.get_unchecked(i) }; let b = unsafe { *rhs.get_unchecked(i) }; - let opt = f(a, b, bit); - fail_acc |= opt.is_none() as u64; - unsafe { out.get_unchecked_mut(i).write(opt.unwrap_or_default()) }; - } - if fail_acc != 0 { - for bit_idx in 0..remainder { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - let a = unsafe { *lhs.get_unchecked(i) }; - let b = unsafe { *rhs.get_unchecked(i) }; - if f(a, b, bit).is_none() { - return Err(i); - } + if a.checked_add(b).is_none() { + return i; } } + unreachable!("attribute called without a failing valid lane") } - Ok(()) -} - -#[divan::bench(args = SIZES)] -fn boolean_closure_suppresses(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - ( - f.lhs.clone(), - f.rhs.clone(), - f.lhs_mask.clone(), - f.rhs_mask.clone(), - ) - }) - .bench_refs(|(lhs, rhs, lm, rm)| { - let combined = lm as &BitBuffer & rm as &BitBuffer; - let mut out = alloc_out(n); - handrolled_boolean( - lhs.as_slice(), - rhs.as_slice(), - &combined, - out.as_mut_slice(), - |a, b, valid| { - if valid { a.checked_add(b) } else { Some(0) } - }, - ) - .unwrap(); - (combined, out) - }); -} -// --------------------------------------------------------------------------- -// Variant 6: hand-rolled pre-mask. Kernel zeros null-lane values via bit -// broadcast, then unconditional add + overflow detect. Boolean fail_acc. 
-// --------------------------------------------------------------------------- - -#[inline] -fn handrolled_premask( - lhs: &[u32], - rhs: &[u32], - mask: &BitBuffer, - out: &mut [MaybeUninit], -) -> Result<(), usize> { let len = lhs.len(); assert_eq!(len, rhs.len()); assert_eq!(len, mask.len()); @@ -437,62 +210,15 @@ fn handrolled_premask( for (chunk_idx, src_chunk) in chunks.iter().enumerate() { let base = chunk_idx * 64; - let mut fail_acc: u64 = 0; - for bit_idx in 0..64 { - // bit-broadcast: 0 → 0x00000000, 1 → 0xFFFFFFFF - let lane_mask = (((src_chunk >> bit_idx) & 1) as u32).wrapping_neg(); - let i = base + bit_idx; - // SAFETY: i < len. - let a = unsafe { *lhs.get_unchecked(i) } & lane_mask; - let b = unsafe { *rhs.get_unchecked(i) } & lane_mask; - let (sum, overflow) = a.overflowing_add(b); - fail_acc |= overflow as u64; - unsafe { out.get_unchecked_mut(i).write(sum) }; - } - if fail_acc != 0 { - // Cold: walk chunk to find first valid lane that actually overflows on - // the unmasked inputs. Null lanes were premasked to 0+0, can't overflow. 
- for bit_idx in 0..64 { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - if !bit { - continue; - } - let a = unsafe { *lhs.get_unchecked(i) }; - let b = unsafe { *rhs.get_unchecked(i) }; - if a.checked_add(b).is_none() { - return Err(i); - } - } + if chunk(lhs, rhs, out, src_chunk, base, 64) { + return Err(attribute(lhs, rhs, src_chunk, base, 64)); } } - if remainder != 0 { let src_chunk = chunks.remainder_bits(); let base = chunks_count * 64; - let mut fail_acc: u64 = 0; - for bit_idx in 0..remainder { - let lane_mask = (((src_chunk >> bit_idx) & 1) as u32).wrapping_neg(); - let i = base + bit_idx; - let a = unsafe { *lhs.get_unchecked(i) } & lane_mask; - let b = unsafe { *rhs.get_unchecked(i) } & lane_mask; - let (sum, overflow) = a.overflowing_add(b); - fail_acc |= overflow as u64; - unsafe { out.get_unchecked_mut(i).write(sum) }; - } - if fail_acc != 0 { - for bit_idx in 0..remainder { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - if !bit { - continue; - } - let a = unsafe { *lhs.get_unchecked(i) }; - let b = unsafe { *rhs.get_unchecked(i) }; - if a.checked_add(b).is_none() { - return Err(i); - } - } + if chunk(lhs, rhs, out, src_chunk, base, remainder) { + return Err(attribute(lhs, rhs, src_chunk, base, remainder)); } } Ok(()) @@ -524,72 +250,16 @@ fn premask_then_simd(bencher: Bencher, n: usize) { }); } -// --------------------------------------------------------------------------- -// Variant 7: pure SIMD, no mask awareness — CEILING REFERENCE ONLY. -// Incorrect for arrays where null lanes might overflow; benchmarked just to -// show the theoretical floor for nullable add. 
-// --------------------------------------------------------------------------- - -#[inline] -fn handrolled_no_validity( - lhs: &[u32], - rhs: &[u32], - out: &mut [MaybeUninit], -) -> Result<(), usize> { - assert_eq!(lhs.len(), rhs.len()); - assert_eq!(lhs.len(), out.len()); - let mut fail = false; - for i in 0..lhs.len() { - let a = unsafe { *lhs.get_unchecked(i) }; - let b = unsafe { *rhs.get_unchecked(i) }; - let (sum, overflow) = a.overflowing_add(b); - fail |= overflow; - unsafe { out.get_unchecked_mut(i).write(sum) }; - } - if fail { Err(0) } else { Ok(()) } -} - -/// Pure-SIMD ceiling on **pre-sanitized** input (null lanes pre-zeroed in the -/// fixture, outside the timed region). Cannot run on the realistic -/// `(lhs, rhs)` arrays because their null lanes hold `u32::MAX` and would -/// Err — proven by [`assert_pure_simd_errs_on_realistic_data`]. -/// -/// Showing the SIMD-only arithmetic floor — what an ideal nullable-add would -/// look like if validity could be free. -#[divan::bench(args = SIZES)] -fn pure_simd_no_validity_sanitized(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| (f.lhs_sanitized.clone(), f.rhs_sanitized.clone())) - .bench_refs(|(lhs, rhs)| { - let mut out = alloc_out(n); - handrolled_no_validity(lhs.as_slice(), rhs.as_slice(), out.as_mut_slice()).unwrap(); - out - }); -} - // --------------------------------------------------------------------------- // Parity assertions — must pass before divan runs benches. // --------------------------------------------------------------------------- -/// Both arrow and our kernel must Err on overflow at a valid lane. +/// Both implementations must Err on overflow at a valid lane. 
fn assert_overflow_parity() { let lhs: Vec = vec![1, 2, u32::MAX, 4]; let rhs: Vec = vec![10, 20, 1, 40]; let valid = vec![true; 4]; - let lhs_arrow = UInt32Array::new( - ScalarBuffer::from(lhs.clone()), - Some(NullBuffer::from(valid.clone())), - ); - let rhs_arrow = UInt32Array::new( - ScalarBuffer::from(rhs.clone()), - Some(NullBuffer::from(valid.clone())), - ); - let arrow_result = - arrow_arith::numeric::add(&lhs_arrow as &dyn Datum, &rhs_arrow as &dyn Datum); - assert!(arrow_result.is_err(), "arrow should Err on overflow"); - let mask = { let mut m = BitBufferMut::with_capacity(4); for &v in &valid { @@ -597,30 +267,24 @@ fn assert_overflow_parity() { } m.freeze() }; + let mut out: Vec> = (0..4).map(|_| MaybeUninit::uninit()).collect(); - let ours = try_map_with_mask( + let bitpack = try_map_with_mask( LaneZip::new(lhs.as_slice(), rhs.as_slice()), &mask, out.as_mut_slice(), |(a, b), _| a.checked_add(b), ); - assert!(ours.is_err(), "bitpack should Err on overflow"); + assert!(bitpack.is_err(), "bitpack should Err on overflow"); - let mut out2: Vec> = (0..4).map(|_| MaybeUninit::uninit()).collect(); - let boolean = handrolled_boolean(&lhs, &rhs, &mask, &mut out2, |a, b, valid| { - if valid { a.checked_add(b) } else { Some(0) } - }); - assert!(boolean.is_err(), "boolean should Err on overflow"); - - let mut out3: Vec> = (0..4).map(|_| MaybeUninit::uninit()).collect(); - let prem = handrolled_premask(&lhs, &rhs, &mask, &mut out3); + let mut out: Vec> = (0..4).map(|_| MaybeUninit::uninit()).collect(); + let prem = handrolled_premask(&lhs, &rhs, &mask, &mut out); assert!(prem.is_err(), "premask should Err on overflow"); } -/// All correctness-preserving variants must NOT Err when only null lanes -/// would overflow. (Pure-SIMD variant is excluded — it doesn't see validity.) +/// Both implementations must NOT Err when only null lanes would overflow. fn assert_null_overflow_suppressed() { - // Lane 2 is null and contains overflowing values; valid lanes are safe. 
+ // Lane 2 is null and holds an overflowing value; valid lanes are safe. let lhs: Vec = vec![1, 2, u32::MAX, 4]; let rhs: Vec = vec![10, 20, 1, 40]; let valid = vec![true, true, false, true]; @@ -633,49 +297,16 @@ fn assert_null_overflow_suppressed() { m.freeze() }; - // Bit-pack with value-only closure — kernel filters null-lane fails. let mut out = alloc_out(4); - let r = try_map_with_mask( + let bitpack = try_map_with_mask( LaneZip::new(lhs.as_slice(), rhs.as_slice()), &mask, out.as_mut_slice(), |(a, b), _| a.checked_add(b), ); - assert!(r.is_ok(), "bitpack_value_only: null-lane overflow leaked"); + assert!(bitpack.is_ok(), "bitpack: null-lane overflow leaked"); - // Boolean with closure that suppresses nulls. let mut out = alloc_out(4); - let r = handrolled_boolean(&lhs, &rhs, &mask, &mut out, |a, b, valid| { - if valid { a.checked_add(b) } else { Some(0) } - }); - assert!(r.is_ok(), "boolean_closure_suppresses: null-lane leaked"); - - // Pre-mask: kernel zeroes null-lane values. - let mut out = alloc_out(4); - let r = handrolled_premask(&lhs, &rhs, &mask, &mut out); - assert!(r.is_ok(), "premask_then_simd: null-lane overflow leaked"); -} - -/// Demonstrates that `pure_simd_no_validity` is **incorrect** on realistic -/// fixture inputs — i.e., when null lanes contain values that overflow on add. -/// This is what justifies excluding pure_simd from the realistic bench and -/// running it only on the sanitized inputs. Without this, the "ignore the -/// mask" approach would look too fast because the test data lets it cheat. -fn assert_pure_simd_errs_on_realistic_data() { - // Lane 2 is a "null lane" in arrow-style storage: bitmap says null, but - // the data buffer still holds an overflowing value. The realistic - // `fixture` does exactly this. 
- let lhs: Vec = vec![1, 2, u32::MAX, 4]; - let rhs: Vec = vec![10, 20, 1, 40]; - let mut out: Vec> = (0..4).map(|_| MaybeUninit::uninit()).collect(); - - let r = handrolled_no_validity(&lhs, &rhs, &mut out); - assert!( - r.is_err(), - "pure_simd_no_validity should Err on realistic data (null lane has \ - u32::MAX). If this passes, the bench fixture isn't exercising the \ - unsafe-null-storage case and the pure_simd ceiling number is \ - misleading — it's running on data the kernel happens to handle even \ - without a mask." - ); + let prem = handrolled_premask(&lhs, &rhs, &mask, &mut out); + assert!(prem.is_ok(), "premask: null-lane overflow leaked"); } From 4299cf0e8391dbbd1461a9ad7fd11904a7ae890a Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 15:59:20 +0100 Subject: [PATCH 12/21] f Signed-off-by: Joe Isaacs --- vortex-buffer/benches/add_checked.rs | 159 +++------------------------ 1 file changed, 14 insertions(+), 145 deletions(-) diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs index 2d4db4959e7..5814e14262e 100644 --- a/vortex-buffer/benches/add_checked.rs +++ b/vortex-buffer/benches/add_checked.rs @@ -1,23 +1,14 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Checked `u32 + u32 -> u32` over two nullable columns. +//! Checked `u32 + u32 -> u32` over two nullable columns via [`try_map_with_mask`] +//! with a value-only closure. Per-lane `is_none()` flags are bit-packed and +//! AND-ed with the chunk validity word so null-lane overflow is filtered +//! without the closure ever inspecting `valid`. //! -//! Two implementations: -//! -//! - [`bitpack_value_only`] — production path via [`try_map_with_mask`] with a -//! value-only closure. Per-lane `is_none()` flags are bit-packed and AND-ed -//! with the chunk validity word so null-lane overflow is filtered without -//! the closure ever inspecting `valid`. -//! 
- [`premask_then_simd`] — hand-rolled ceiling. Bit-broadcasts each mask bit -//! to `0x00000000`/`0xFFFFFFFF`, ANDs into both operands (null lanes become -//! `0+0`), then unconditional `overflowing_add` with a per-chunk OR-reduced -//! `fail_acc` and cold scalar attribution. Same pattern that beat arrow on -//! the primitive cast bench (37 µs vs 55 µs). -//! -//! Both are verified at startup via [`assert_overflow_parity`] (valid-lane -//! overflow propagates as `Err`) and [`assert_null_overflow_suppressed`] -//! (null-lane overflow does not). +//! Verified at startup via [`assert_overflow_parity`] (valid-lane overflow +//! propagates as `Err`) and [`assert_null_overflow_suppressed`] (null-lane +//! overflow does not). #![expect(clippy::unwrap_used)] @@ -44,7 +35,7 @@ const RHS_VALID_RATE: f64 = 0.8; struct Fixture { /// Valid lanes carry bounded values; null lanes hold `u32::MAX` so a kernel - /// that ignores validity would Err on them. Both implementations under test + /// that ignores validity would Err on them. The implementation under test /// must suppress that. lhs: Buffer, rhs: Buffer, @@ -110,10 +101,6 @@ fn alloc_out(n: usize) -> Vec> { out } -// --------------------------------------------------------------------------- -// bitpack_value_only — production path via try_map_with_mask. -// --------------------------------------------------------------------------- - #[divan::bench(args = SIZES)] fn bitpack_value_only(bencher: Bencher, n: usize) { let f = fixture(n); @@ -140,121 +127,11 @@ fn bitpack_value_only(bencher: Bencher, n: usize) { }); } -// --------------------------------------------------------------------------- -// premask_then_simd — hand-rolled ceiling. -// --------------------------------------------------------------------------- - -#[inline] -fn handrolled_premask( - lhs: &[u32], - rhs: &[u32], - mask: &BitBuffer, - out: &mut [MaybeUninit], -) -> Result<(), usize> { - /// Per-chunk hot loop. 
Bit-broadcasts each validity bit to 0x00 / 0xFF, - /// ANDs both operands, then `overflowing_add`. Returns true if any lane in - /// `[base, base+count)` overflowed. `#[inline(always)]` keeps the literal - /// `64` at the full-chunk call site for const propagation. - #[inline(always)] - fn chunk( - lhs: &[u32], - rhs: &[u32], - out: &mut [MaybeUninit], - src_chunk: u64, - base: usize, - count: usize, - ) -> bool { - let mut fail_acc: u64 = 0; - for bit_idx in 0..count { - let i = base + bit_idx; - let lane_mask = (((src_chunk >> bit_idx) & 1) as u32).wrapping_neg(); - // SAFETY: caller guarantees base + count <= len. - let a = unsafe { *lhs.get_unchecked(i) } & lane_mask; - let b = unsafe { *rhs.get_unchecked(i) } & lane_mask; - let (sum, overflow) = a.overflowing_add(b); - fail_acc |= overflow as u64; - // SAFETY: caller guarantees base + count <= len. - unsafe { out.get_unchecked_mut(i).write(sum) }; - } - fail_acc != 0 - } - - /// Cold attribution. Walks the chunk on raw (unmasked) operands and reports - /// the first valid lane that overflows. Null lanes were premasked to `0+0` - /// in the hot loop so they cannot contribute here. - #[cold] - #[inline(never)] - fn attribute(lhs: &[u32], rhs: &[u32], src_chunk: u64, base: usize, count: usize) -> usize { - for bit_idx in 0..count { - if (src_chunk >> bit_idx) & 1 == 0 { - continue; - } - let i = base + bit_idx; - // SAFETY: caller guarantees base + count <= len. 
- let a = unsafe { *lhs.get_unchecked(i) }; - let b = unsafe { *rhs.get_unchecked(i) }; - if a.checked_add(b).is_none() { - return i; - } - } - unreachable!("attribute called without a failing valid lane") - } - - let len = lhs.len(); - assert_eq!(len, rhs.len()); - assert_eq!(len, mask.len()); - assert_eq!(len, out.len()); - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; - - for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - let base = chunk_idx * 64; - if chunk(lhs, rhs, out, src_chunk, base, 64) { - return Err(attribute(lhs, rhs, src_chunk, base, 64)); - } - } - if remainder != 0 { - let src_chunk = chunks.remainder_bits(); - let base = chunks_count * 64; - if chunk(lhs, rhs, out, src_chunk, base, remainder) { - return Err(attribute(lhs, rhs, src_chunk, base, remainder)); - } - } - Ok(()) -} - -#[divan::bench(args = SIZES)] -fn premask_then_simd(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - ( - f.lhs.clone(), - f.rhs.clone(), - f.lhs_mask.clone(), - f.rhs_mask.clone(), - ) - }) - .bench_refs(|(lhs, rhs, lm, rm)| { - let combined = lm as &BitBuffer & rm as &BitBuffer; - let mut out = alloc_out(n); - handrolled_premask( - lhs.as_slice(), - rhs.as_slice(), - &combined, - out.as_mut_slice(), - ) - .unwrap(); - (combined, out) - }); -} - // --------------------------------------------------------------------------- // Parity assertions — must pass before divan runs benches. // --------------------------------------------------------------------------- -/// Both implementations must Err on overflow at a valid lane. +/// Overflow at a valid lane must propagate as `Err`. 
fn assert_overflow_parity() { let lhs: Vec = vec![1, 2, u32::MAX, 4]; let rhs: Vec = vec![10, 20, 1, 40]; @@ -269,20 +146,16 @@ fn assert_overflow_parity() { }; let mut out: Vec> = (0..4).map(|_| MaybeUninit::uninit()).collect(); - let bitpack = try_map_with_mask( + let r = try_map_with_mask( LaneZip::new(lhs.as_slice(), rhs.as_slice()), &mask, out.as_mut_slice(), |(a, b), _| a.checked_add(b), ); - assert!(bitpack.is_err(), "bitpack should Err on overflow"); - - let mut out: Vec> = (0..4).map(|_| MaybeUninit::uninit()).collect(); - let prem = handrolled_premask(&lhs, &rhs, &mask, &mut out); - assert!(prem.is_err(), "premask should Err on overflow"); + assert!(r.is_err(), "bitpack should Err on overflow"); } -/// Both implementations must NOT Err when only null lanes would overflow. +/// Overflow at a null lane must NOT propagate. fn assert_null_overflow_suppressed() { // Lane 2 is null and holds an overflowing value; valid lanes are safe. let lhs: Vec = vec![1, 2, u32::MAX, 4]; @@ -298,15 +171,11 @@ fn assert_null_overflow_suppressed() { }; let mut out = alloc_out(4); - let bitpack = try_map_with_mask( + let r = try_map_with_mask( LaneZip::new(lhs.as_slice(), rhs.as_slice()), &mask, out.as_mut_slice(), |(a, b), _| a.checked_add(b), ); - assert!(bitpack.is_ok(), "bitpack: null-lane overflow leaked"); - - let mut out = alloc_out(4); - let prem = handrolled_premask(&lhs, &rhs, &mask, &mut out); - assert!(prem.is_ok(), "premask: null-lane overflow leaked"); + assert!(r.is_ok(), "bitpack: null-lane overflow leaked"); } From 8e5945f5818ec904914a4dd632413ebd697bd782 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 16:10:23 +0100 Subject: [PATCH 13/21] f Signed-off-by: Joe Isaacs --- vortex-buffer/src/lane_ops_indexed.rs | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs index f8d028eb7b7..f3ade8eeda6 100644 --- a/vortex-buffer/src/lane_ops_indexed.rs +++ 
b/vortex-buffer/src/lane_ops_indexed.rs @@ -287,21 +287,6 @@ where /// The closure may also explicitly suppress null-lane failures by branching on /// `valid` itself; both behaviors compose. /// -/// ## Hot loop -/// -/// `fail_bits |= (opt.is_none() as u64) << bit_idx`. After unrolling, `bit_idx` is a -/// compile-time constant per-iteration, so the shift folds. The closure receives -/// `(value, valid)`; LLVM DCEs the per-lane `(src_chunk >> bit_idx) & 1` extract -/// when the closure ignores `valid`, leaving a value-only SIMD body. -/// -/// ## Attribution -/// -/// `valid_failures = fail_bits & src_chunk` — non-zero only when at least one -/// valid lane failed. `trailing_zeros()` gives the first failing valid lane. -/// **No cold replay**: failure detection and lane attribution happen entirely in -/// the hot loop. Worst-case bounded per chunk regardless of how many null lanes -/// returned `None`. -/// /// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` /// write `R::default()` into `out`, but the contents of `out` must not be relied /// upon when this function returns `Err`. @@ -321,9 +306,6 @@ where R: Copy + Default, F: FnMut(S::Item, bool) -> Option, { - /// Bit-packs `is_none()` into `fail_bits` at lane position; the post-loop - /// `& src_chunk` filter drops null-lane fails. Returns `Some(failing_idx)` if - /// any *valid* lane failed in `[base, base+count)`. #[inline(always)] fn chunk( values: &S, @@ -385,12 +367,6 @@ where /// closure invocation is treated as "happened", regardless of whether the lane /// is null. Use this only when the input is known non-nullable. /// -/// For nullable inputs where the closure is infallible (no overflow / no error -/// branch), prefer [`map_with_mask`]; for nullable inputs with a fallible -/// closure, prefer [`try_map_with_mask`] — both correctly suppress -/// null-lane logic. 
This kernel exists for the narrow "no validity exists" -/// case (non-nullable column, internal pipelines, etc.). -/// /// # Panics /// /// Panics if `out.len() != values.len()`. From e9aac1d057c55ac263a686a916f22a0a00a1571d Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 16:58:03 +0100 Subject: [PATCH 14/21] f Signed-off-by: Joe Isaacs --- .../src/arrays/primitive/compute/cast.rs | 69 +- vortex-buffer/benches/add_checked.rs | 20 +- vortex-buffer/benches/cast_to_indexed.rs | 34 +- vortex-buffer/src/lane_ops_indexed.rs | 1019 +++++++++-------- 4 files changed, 577 insertions(+), 565 deletions(-) diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs index 8242b5845bd..9aef97e6c9d 100644 --- a/vortex-array/src/arrays/primitive/compute/cast.rs +++ b/vortex-array/src/arrays/primitive/compute/cast.rs @@ -5,13 +5,9 @@ use num_traits::AsPrimitive; use num_traits::NumCast; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; +use vortex_buffer::lane_ops_indexed::IndexedSinkExt; +use vortex_buffer::lane_ops_indexed::IndexedSourceExt; use vortex_buffer::lane_ops_indexed::ReinterpretSink; -use vortex_buffer::lane_ops_indexed::map_no_validity; -use vortex_buffer::lane_ops_indexed::map_no_validity_in_place; -use vortex_buffer::lane_ops_indexed::try_map_no_validity; -use vortex_buffer::lane_ops_indexed::try_map_no_validity_in_place; -use vortex_buffer::lane_ops_indexed::try_map_with_mask; -use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; @@ -174,10 +170,8 @@ where // (harmless: the result validity bitmap masks them downstream). 
return match owned { Some(mut buf) => { - map_no_validity_in_place( - ReinterpretSink::::new(buf.as_mut_slice()), - |v: F| v.as_(), - ); + ReinterpretSink::::new(buf.as_mut_slice()) + .map_no_validity_in_place(|v: F| v.as_()); // SAFETY: same size + alignment for NativePType same-byte-width pairs; // every F-slot was overwritten with a real `T` bit pattern. let result: BufferMut = unsafe { buf.transmute::() }; @@ -185,7 +179,7 @@ where } None => { let mut buffer = BufferMut::::with_capacity(len); - map_no_validity(values, &mut buffer.spare_capacity_mut()[..len], |v| v.as_()); + values.map_no_validity(&mut buffer.spare_capacity_mut()[..len], |v| v.as_()); // SAFETY: map_no_validity initializes every lane. unsafe { buffer.set_len(len) }; Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array()) @@ -197,11 +191,9 @@ where let buffer: Buffer = match (&mask, owned) { (Mask::AllTrue(_), Some(mut buf)) => { - try_map_no_validity_in_place( - ReinterpretSink::::new(buf.as_mut_slice()), - |v: F| ::from(v), - ) - .map_err(|_| overflow())?; + ReinterpretSink::::new(buf.as_mut_slice()) + .try_map_no_validity_in_place(|v: F| ::from(v)) + .map_err(|_| overflow())?; // SAFETY: same size + alignment for NativePType same-byte-width pairs; // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`. let result: BufferMut = unsafe { buf.transmute::() }; @@ -209,10 +201,11 @@ where } (Mask::AllTrue(_), None) => { let mut buffer = BufferMut::::with_capacity(len); - try_map_no_validity(values, &mut buffer.spare_capacity_mut()[..len], |v| { - ::from(v) - }) - .map_err(|_| overflow())?; + values + .try_map_no_validity(&mut buffer.spare_capacity_mut()[..len], |v| { + ::from(v) + }) + .map_err(|_| overflow())?; // SAFETY: try_map_no_validity returned Ok, so it initialized every lane. 
unsafe { buffer.set_len(len) }; buffer.freeze() @@ -225,12 +218,11 @@ where } (Mask::AllFalse(_), None) => BufferMut::::zeroed(len).freeze(), (Mask::Values(m), Some(mut buf)) => { - try_map_with_mask_in_place( - ReinterpretSink::::new(buf.as_mut_slice()), - m.bit_buffer(), - |v: F, valid| ::from(v).or_else(|| (!valid).then(T::zero)), - ) - .map_err(|_| overflow())?; + ReinterpretSink::::new(buf.as_mut_slice()) + .try_map_with_mask_in_place(m.bit_buffer(), |v: F, valid| { + ::from(v).or_else(|| (!valid).then(T::zero)) + }) + .map_err(|_| overflow())?; // SAFETY: same size + alignment for NativePType same-byte-width pairs; // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`. let result: BufferMut = unsafe { buf.transmute::() }; @@ -238,18 +230,19 @@ where } (Mask::Values(m), None) => { let mut buffer = BufferMut::::with_capacity(len); - try_map_with_mask( - values, - m.bit_buffer(), - &mut buffer.spare_capacity_mut()[..len], - // Lazy validity: only consult `valid` on the failure branch. For widening / - // statically-infallible casts, `NumCast::from` is always `Some` so the - // `or_else` is provably dead — LLVM DCEs the validity path entirely, giving - // the same codegen as the maskless kernel. For narrowing, `valid` is only - // read at lanes that actually overflowed (a cold check on top of the cast). - |v, valid| ::from(v).or_else(|| (!valid).then(T::zero)), - ) - .map_err(|_| overflow())?; + values + .try_map_with_mask( + m.bit_buffer(), + &mut buffer.spare_capacity_mut()[..len], + // Lazy validity: only consult `valid` on the failure branch. For widening / + // statically-infallible casts, `NumCast::from` is always `Some` so the + // `or_else` is provably dead — LLVM DCEs the validity path entirely, + // giving the same codegen as the maskless kernel. For narrowing, `valid` + // is only read at lanes that actually overflowed (a cold check on top of + // the cast). 
+ |v, valid| ::from(v).or_else(|| (!valid).then(T::zero)), + ) + .map_err(|_| overflow())?; // SAFETY: try_map_with_mask returned Ok, so it initialized every lane. unsafe { buffer.set_len(len) }; buffer.freeze() diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs index 5814e14262e..5c838479a13 100644 --- a/vortex-buffer/benches/add_checked.rs +++ b/vortex-buffer/benches/add_checked.rs @@ -20,8 +20,8 @@ use rand::prelude::*; use vortex_buffer::BitBuffer; use vortex_buffer::BitBufferMut; use vortex_buffer::Buffer; +use vortex_buffer::lane_ops_indexed::IndexedSourceExt; use vortex_buffer::lane_ops_indexed::LaneZip; -use vortex_buffer::lane_ops_indexed::try_map_with_mask; fn main() { assert_overflow_parity(); @@ -116,13 +116,11 @@ fn bitpack_value_only(bencher: Bencher, n: usize) { .bench_refs(|(lhs, rhs, lm, rm)| { let combined = lm as &BitBuffer & rm as &BitBuffer; let mut out = alloc_out(n); - try_map_with_mask( - LaneZip::new(lhs.as_slice(), rhs.as_slice()), - &combined, - out.as_mut_slice(), - |(a, b), _valid| a.checked_add(b), - ) - .unwrap(); + LaneZip::new(lhs.as_slice(), rhs.as_slice()) + .try_map_with_mask(&combined, out.as_mut_slice(), |(a, b), _valid| { + a.checked_add(b) + }) + .unwrap(); (combined, out) }); } @@ -146,8 +144,7 @@ fn assert_overflow_parity() { }; let mut out: Vec> = (0..4).map(|_| MaybeUninit::uninit()).collect(); - let r = try_map_with_mask( - LaneZip::new(lhs.as_slice(), rhs.as_slice()), + let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_with_mask( &mask, out.as_mut_slice(), |(a, b), _| a.checked_add(b), @@ -171,8 +168,7 @@ fn assert_null_overflow_suppressed() { }; let mut out = alloc_out(4); - let r = try_map_with_mask( - LaneZip::new(lhs.as_slice(), rhs.as_slice()), + let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_with_mask( &mask, out.as_mut_slice(), |(a, b), _| a.checked_add(b), diff --git a/vortex-buffer/benches/cast_to_indexed.rs 
b/vortex-buffer/benches/cast_to_indexed.rs index 2751cdc8418..bcc30669ccb 100644 --- a/vortex-buffer/benches/cast_to_indexed.rs +++ b/vortex-buffer/benches/cast_to_indexed.rs @@ -23,12 +23,8 @@ use rand::rngs::StdRng; use vortex_buffer::BitBuffer; use vortex_buffer::BitBufferMut; use vortex_buffer::Buffer; -use vortex_buffer::lane_ops_indexed::map_no_validity; -use vortex_buffer::lane_ops_indexed::map_with_mask; -use vortex_buffer::lane_ops_indexed::map_with_mask_in_place; -use vortex_buffer::lane_ops_indexed::try_map_no_validity; -use vortex_buffer::lane_ops_indexed::try_map_with_mask; -use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place; +use vortex_buffer::lane_ops_indexed::IndexedSinkExt; +use vortex_buffer::lane_ops_indexed::IndexedSourceExt; fn main() { divan::main(); @@ -129,11 +125,9 @@ fn map_no_validity_widen_u16_u32(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u16.clone(), uninit_out::(n))) .bench_values(|(values, mut out)| { - map_no_validity( - values.as_slice(), - out.as_mut_slice(), - >::from, - ); + values + .as_slice() + .map_no_validity(out.as_mut_slice(), >::from); out }); } @@ -145,7 +139,7 @@ fn map_with_mask_widen_u16_u32_zero_nulls(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) .bench_values(|(values, mask, mut out)| { - map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| { + values.as_slice().map_with_mask(&mask, out.as_mut_slice(), |v, valid| { >::from(v) * valid as u32 }); out @@ -159,7 +153,7 @@ fn try_map_no_validity_narrow_u64_u32(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u64.clone(), uninit_out::(n))) .bench_values(|(values, mut out)| { - try_map_no_validity(values.as_slice(), out.as_mut_slice(), |v| { + values.as_slice().try_map_no_validity(out.as_mut_slice(), |v| { ::from(v) }) .unwrap(); @@ -178,7 +172,7 @@ fn try_map_with_mask_narrow_u64_u32_ignoring_valid(bencher: Bencher, n: usize) { bencher 
.with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::(n))) .bench_values(|(values, mask, mut out)| { - try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| { + values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| { ::from(v) }) .unwrap(); @@ -193,7 +187,7 @@ fn try_map_with_mask_narrow_u64_u32_lazy_validity(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::(n))) .bench_values(|(values, mask, mut out)| { - try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| { + values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| { ::from(v).or_else(|| (!valid).then(u32::default)) }) .unwrap(); @@ -218,7 +212,7 @@ fn try_map_with_mask_narrow_u64_u32_value_only_filtered(bencher: Bencher, n: usi ) }) .bench_values(|(values, mask, mut out)| { - try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| { + values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| { ::from(v) }) .unwrap(); @@ -233,7 +227,7 @@ fn try_map_with_mask_widen_u16_u32_or_else(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) .bench_values(|(values, mask, mut out)| { - try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| { + values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| { Some(>::from(v)).or_else(|| (!valid).then(u32::default)) }) .unwrap(); @@ -248,7 +242,7 @@ fn try_map_with_mask_widen_u16_u32_maskless(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) .bench_values(|(values, mask, mut out)| { - try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| { + values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| { Some(>::from(v)) }) .unwrap(); @@ -263,7 +257,7 @@ fn map_with_mask_in_place_u32_zero_nulls(bencher: 
Bencher, n: usize) { bencher .with_inputs(|| (f.values_u32.as_slice().to_vec(), f.mask.clone())) .bench_values(|(mut values, mask)| { - map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| v * valid as u32); + values.as_mut_slice().map_with_mask_in_place(&mask, |v, valid| v * valid as u32); values }); } @@ -275,7 +269,7 @@ fn try_map_with_mask_in_place_u32_checked_mul(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u32_small.as_slice().to_vec(), f.mask.clone())) .bench_values(|(mut values, mask)| { - try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2)) + values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)) .unwrap(); values }); diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs index f3ade8eeda6..683a03c5539 100644 --- a/vortex-buffer/src/lane_ops_indexed.rs +++ b/vortex-buffer/src/lane_ops_indexed.rs @@ -214,277 +214,296 @@ impl IndexedSource for LaneZip { } } -/// Apply `f(value, valid)` lane-by-lane, writing `out[i] = f(values[i], mask[i])`. +/// Extension trait providing lane-kernel methods on any [`IndexedSource`]. /// -/// All three inputs must have the same length. The output type `R` may differ from the -/// input type `T` — this kernel is the building block for both same-type transforms -/// (fill_null) and cross-type ones (cast). The caller is responsible for marking `out` -/// initialized (e.g. by calling `BufferMut::set_len` after this returns). -/// -/// # Panics -/// -/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`. -#[inline] -pub fn map_with_mask(values: S, mask: &BitBuffer, out: &mut [MaybeUninit], mut f: F) -where - S: IndexedSource, - F: FnMut(S::Item, bool) -> R, -{ - /// Per-chunk worker. Called twice (literal `64` for full chunks, `remainder` - /// for the tail). 
`#[inline(always)]` preserves the const-64 unroll at the - /// full-chunk call site via constant propagation through inlining. - #[inline(always)] - fn chunk( - values: &S, - out: &mut [MaybeUninit], - f: &mut F, - src_chunk: u64, - base: usize, - count: usize, - ) where - S: IndexedSource, - F: FnMut(S::Item, bool) -> R, +/// All methods have default implementations and are inherited via the blanket +/// `impl IndexedSourceExt for S` below. Bring the trait into +/// scope (`use vortex_buffer::lane_ops_indexed::IndexedSourceExt;`) to call +/// them with method syntax: `values.try_map_with_mask(&mask, &mut out, f)`. +pub trait IndexedSourceExt: IndexedSource + Sized { + /// Apply `f(value, valid)` lane-by-lane, writing `out[i] = f(self[i], mask[i])`. + /// + /// All three inputs must have the same length. The output type `R` may differ from + /// the input type — this kernel is the building block for both same-type transforms + /// (fill_null) and cross-type ones (cast). The caller is responsible for marking + /// `out` initialized (e.g. by calling `BufferMut::set_len` after this returns). + /// + /// # Panics + /// + /// Panics if `self.len() != mask.len()` or `out.len() != self.len()`. + #[inline] + fn map_with_mask(self, mask: &BitBuffer, out: &mut [MaybeUninit], mut f: F) + where + F: FnMut(Self::Item, bool) -> R, { - for bit_idx in 0..count { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: caller guarantees base + count <= len. - let v = unsafe { values.get_unchecked(i) }; - unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; + /// Per-chunk worker. Called twice (literal `64` for full chunks, `remainder` + /// for the tail). `#[inline(always)]` preserves the const-64 unroll at the + /// full-chunk call site via constant propagation through inlining. 
+ #[inline(always)] + fn chunk( + values: &S, + out: &mut [MaybeUninit], + f: &mut F, + src_chunk: u64, + base: usize, + count: usize, + ) where + S: IndexedSource, + F: FnMut(S::Item, bool) -> R, + { + for bit_idx in 0..count { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees base + count <= len. + let v = unsafe { values.get_unchecked(i) }; + unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; + } } - } - let len = values.len(); - assert_eq!(len, mask.len(), "values and mask must have the same length"); - assert_eq!(out.len(), len, "out must have the same length as values"); + let values = self; + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + assert_eq!(out.len(), len, "out must have the same length as values"); - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; - for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64); - } - if remainder != 0 { - chunk( - &values, - out, - &mut f, - chunks.remainder_bits(), - chunks_count * 64, - remainder, - ); + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64); + } + if remainder != 0 { + chunk( + &values, + out, + &mut f, + chunks.remainder_bits(), + chunks_count * 64, + remainder, + ); + } } -} -/// Fallible variant of [`map_with_mask`]. `f` returns `Option`; `None` indicates a -/// per-lane failure (e.g. range overflow on a narrowing cast). -/// -/// **Null-lane failures are filtered automatically.** If a null lane's stored value -/// causes `f(v, false)` to return `None`, the kernel does *not* propagate that as -/// `Err`. 
The per-lane `is_none()` flags are bit-packed into a `u64` at the lane's -/// position, then ANDed with the chunk's validity bitmap — null-lane bits vanish. -/// The closure may also explicitly suppress null-lane failures by branching on -/// `valid` itself; both behaviors compose. -/// -/// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` -/// write `R::default()` into `out`, but the contents of `out` must not be relied -/// upon when this function returns `Err`. -/// -/// # Panics -/// -/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`. -#[inline] -pub fn try_map_with_mask( - values: S, - mask: &BitBuffer, - out: &mut [MaybeUninit], - mut f: F, -) -> Result<(), usize> -where - S: IndexedSource, - R: Copy + Default, - F: FnMut(S::Item, bool) -> Option, -{ - #[inline(always)] - fn chunk( - values: &S, + /// Fallible variant of [`map_with_mask`]. `f` returns `Option`; `None` + /// indicates a per-lane failure (e.g. range overflow on a narrowing cast). + /// + /// **Null-lane failures are filtered automatically.** If a null lane's stored + /// value causes `f(v, false)` to return `None`, the kernel does *not* propagate + /// that as `Err`. The per-lane `is_none()` flags are bit-packed into a `u64` at + /// the lane's position, then ANDed with the chunk's validity bitmap — null-lane + /// bits vanish. The closure may also explicitly suppress null-lane failures by + /// branching on `valid` itself; both behaviors compose. + /// + /// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` + /// write `R::default()` into `out`, but the contents of `out` must not be relied + /// upon when this function returns `Err`. + /// + /// [`map_with_mask`]: IndexedSourceExt::map_with_mask + /// + /// # Panics + /// + /// Panics if `self.len() != mask.len()` or `out.len() != self.len()`. 
+ #[inline] + fn try_map_with_mask( + self, + mask: &BitBuffer, out: &mut [MaybeUninit], - f: &mut F, - src_chunk: u64, - base: usize, - count: usize, - ) -> Option + mut f: F, + ) -> Result<(), usize> where - S: IndexedSource, R: Copy + Default, - F: FnMut(S::Item, bool) -> Option, + F: FnMut(Self::Item, bool) -> Option, { - let mut fail_bits: u64 = 0; - for bit_idx in 0..count { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: caller guarantees base + count <= len. - let v = unsafe { values.get_unchecked(i) }; - let opt = f(v, bit); - fail_bits |= (opt.is_none() as u64) << bit_idx; - let r = opt.unwrap_or_default(); - unsafe { out.get_unchecked_mut(i).write(r) }; + #[inline(always)] + fn chunk( + values: &S, + out: &mut [MaybeUninit], + f: &mut F, + src_chunk: u64, + base: usize, + count: usize, + ) -> Option + where + S: IndexedSource, + R: Copy + Default, + F: FnMut(S::Item, bool) -> Option, + { + let mut fail_bits: u64 = 0; + for bit_idx in 0..count { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees base + count <= len. 
+ let v = unsafe { values.get_unchecked(i) }; + let opt = f(v, bit); + fail_bits |= (opt.is_none() as u64) << bit_idx; + let r = opt.unwrap_or_default(); + unsafe { out.get_unchecked_mut(i).write(r) }; + } + let valid_failures = fail_bits & src_chunk; + (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize) } - let valid_failures = fail_bits & src_chunk; - (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize) - } - let len = values.len(); - assert_eq!(len, mask.len(), "values and mask must have the same length"); - assert_eq!(out.len(), len, "out must have the same length as values"); + let values = self; + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); + assert_eq!(out.len(), len, "out must have the same length as values"); - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; - for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - if let Some(idx) = chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64) { + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + if let Some(idx) = chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64) { + return Err(idx); + } + } + if remainder != 0 + && let Some(idx) = chunk( + &values, + out, + &mut f, + chunks.remainder_bits(), + chunks_count * 64, + remainder, + ) + { return Err(idx); } + Ok(()) } - if remainder != 0 - && let Some(idx) = chunk( - &values, - out, - &mut f, - chunks.remainder_bits(), - chunks_count * 64, - remainder, - ) - { - return Err(idx); - } - Ok(()) -} -/// Apply `f(value)` lane-by-lane with **no validity awareness at all** — every -/// closure invocation is treated as "happened", regardless of whether the lane -/// is null. Use this only when the input is known non-nullable. -/// -/// # Panics -/// -/// Panics if `out.len() != values.len()`. 
-#[inline] -pub fn map_no_validity(values: S, out: &mut [MaybeUninit], mut f: F) -where - S: IndexedSource, - F: FnMut(S::Item) -> R, -{ - #[inline(always)] - fn chunk(values: &S, out: &mut [MaybeUninit], f: &mut F, base: usize, count: usize) + /// Apply `f(value)` lane-by-lane with **no validity awareness at all** — every + /// closure invocation is treated as "happened", regardless of whether the lane + /// is null. Use this only when the input is known non-nullable. + /// + /// # Panics + /// + /// Panics if `out.len() != self.len()`. + #[inline] + fn map_no_validity(self, out: &mut [MaybeUninit], mut f: F) where - S: IndexedSource, - F: FnMut(S::Item) -> R, + F: FnMut(Self::Item) -> R, { - for bit_idx in 0..count { - let i = base + bit_idx; - // SAFETY: caller guarantees base + count <= len. - let v = unsafe { values.get_unchecked(i) }; - unsafe { out.get_unchecked_mut(i).write(f(v)) }; + #[inline(always)] + fn chunk( + values: &S, + out: &mut [MaybeUninit], + f: &mut F, + base: usize, + count: usize, + ) where + S: IndexedSource, + F: FnMut(S::Item) -> R, + { + for bit_idx in 0..count { + let i = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. 
+ let v = unsafe { values.get_unchecked(i) }; + unsafe { out.get_unchecked_mut(i).write(f(v)) }; + } } - } - let len = values.len(); - assert_eq!(out.len(), len, "out must have the same length as values"); + let values = self; + let len = values.len(); + assert_eq!(out.len(), len, "out must have the same length as values"); - let chunks_count = len / 64; - let remainder = len % 64; + let chunks_count = len / 64; + let remainder = len % 64; - for chunk_idx in 0..chunks_count { - chunk(&values, out, &mut f, chunk_idx * 64, 64); - } - if remainder != 0 { - chunk(&values, out, &mut f, chunks_count * 64, remainder); + for chunk_idx in 0..chunks_count { + chunk(&values, out, &mut f, chunk_idx * 64, 64); + } + if remainder != 0 { + chunk(&values, out, &mut f, chunks_count * 64, remainder); + } } -} -/// Fallible map with **no validity awareness at all** — every `None` returned -/// by the closure is treated as a failure, even at null lanes. -/// -/// # Use this only for non-nullable inputs. -/// -/// For nullable inputs with a fallible closure, use -/// [`try_map_with_mask`] — it has the same value-only closure shape -/// (and the same perf win) but **correctly suppresses null-lane failures** -/// via per-chunk `fail_bits & mask_chunk`. -/// -/// Using this kernel on a nullable input where a null lane's stored value -/// would cause `f` to return `None` will produce a spurious `Err`. This is a -/// correctness footgun on purpose — the name and this doc are how the API -/// signals "you must know your input has no nulls." -/// -/// On failure returns `Err(failing_lane_index)`. -/// -/// # Panics -/// -/// Panics if `out.len() != values.len()`. 
-#[inline] -pub fn try_map_no_validity( - values: S, - out: &mut [MaybeUninit], - mut f: F, -) -> Result<(), usize> -where - S: IndexedSource, - R: Copy + Default, - F: FnMut(S::Item) -> Option, -{ - /// Returns `true` if any lane in `[base, base+count)` failed (OR-reduced); - /// the cold attribution path is called at the kernel level so it can be - /// inlined separately for full vs remainder. - #[inline(always)] - fn chunk( - values: &S, + /// Fallible map with **no validity awareness at all** — every `None` returned + /// by the closure is treated as a failure, even at null lanes. + /// + /// # Use this only for non-nullable inputs. + /// + /// For nullable inputs with a fallible closure, use [`try_map_with_mask`] — + /// it has the same value-only closure shape (and the same perf win) but + /// **correctly suppresses null-lane failures** via per-chunk + /// `fail_bits & mask_chunk`. + /// + /// Using this kernel on a nullable input where a null lane's stored value + /// would cause `f` to return `None` will produce a spurious `Err`. This is a + /// correctness footgun on purpose — the name and this doc are how the API + /// signals "you must know your input has no nulls." + /// + /// On failure returns `Err(failing_lane_index)`. + /// + /// [`try_map_with_mask`]: IndexedSourceExt::try_map_with_mask + /// + /// # Panics + /// + /// Panics if `out.len() != self.len()`. + #[inline] + fn try_map_no_validity( + self, out: &mut [MaybeUninit], - f: &mut F, - base: usize, - count: usize, - ) -> bool + mut f: F, + ) -> Result<(), usize> where - S: IndexedSource, R: Copy + Default, - F: FnMut(S::Item) -> Option, + F: FnMut(Self::Item) -> Option, { - let mut fail_acc: u64 = 0; - for bit_idx in 0..count { - let i = base + bit_idx; - // SAFETY: caller guarantees base + count <= len. 
- let v = unsafe { values.get_unchecked(i) }; - let opt = f(v); - fail_acc |= opt.is_none() as u64; - let r = opt.unwrap_or_default(); - unsafe { out.get_unchecked_mut(i).write(r) }; + /// Returns `true` if any lane in `[base, base+count)` failed (OR-reduced); + /// the cold attribution path is called at the kernel level so it can be + /// inlined separately for full vs remainder. + #[inline(always)] + fn chunk( + values: &S, + out: &mut [MaybeUninit], + f: &mut F, + base: usize, + count: usize, + ) -> bool + where + S: IndexedSource, + R: Copy + Default, + F: FnMut(S::Item) -> Option, + { + let mut fail_acc: u64 = 0; + for bit_idx in 0..count { + let i = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v); + fail_acc |= opt.is_none() as u64; + let r = opt.unwrap_or_default(); + unsafe { out.get_unchecked_mut(i).write(r) }; + } + fail_acc != 0 } - fail_acc != 0 - } - let len = values.len(); - assert_eq!(out.len(), len, "out must have the same length as values"); + let values = self; + let len = values.len(); + assert_eq!(out.len(), len, "out must have the same length as values"); - let chunks_count = len / 64; - let remainder = len % 64; + let chunks_count = len / 64; + let remainder = len % 64; - for chunk_idx in 0..chunks_count { - let base = chunk_idx * 64; - if chunk(&values, out, &mut f, base, 64) { - return Err(attribute_failure_no_mask(&values, base, 64, &mut f)); + for chunk_idx in 0..chunks_count { + let base = chunk_idx * 64; + if chunk(&values, out, &mut f, base, 64) { + return Err(attribute_failure_no_mask(&values, base, 64, &mut f)); + } } - } - if remainder != 0 { - let base = chunks_count * 64; - if chunk(&values, out, &mut f, base, remainder) { - return Err(attribute_failure_no_mask(&values, base, remainder, &mut f)); + if remainder != 0 { + let base = chunks_count * 64; + if chunk(&values, out, &mut f, base, remainder) { + return Err(attribute_failure_no_mask(&values, 
base, remainder, &mut f)); + } } + Ok(()) } - Ok(()) } +impl IndexedSourceExt for S {} + /// Shared cold scan: walks a chunk, returns the first lane index where /// `lane_fails(bit_idx, value)` returns `true`. Used by /// [`attribute_failure_no_mask`]. @@ -523,264 +542,277 @@ where cold_scan(values, base, chunk_len, |_bit_idx, v| f(v).is_none()) } -/// In-place variant of [`map_no_validity`]. Each lane is replaced with `f(values[i])`. -/// The source `S` must be writable (an [`IndexedSink`]). -/// -/// The closure reads `S::Item` and returns `S::Write`. For the common case -/// `S = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and write -/// types can differ (e.g. read `f32`, write `u32`) over the same backing memory -/// when sizes and alignments match. +/// Extension trait providing in-place lane-kernel methods on any [`IndexedSink`]. /// -/// As with [`map_no_validity`], use this only when the input is known -/// non-nullable. -#[inline] -pub fn map_no_validity_in_place(mut values: S, mut f: F) -where - S: IndexedSink, - F: FnMut(S::Item) -> S::Write, -{ - #[inline(always)] - fn chunk(values: &mut S, f: &mut F, base: usize, count: usize) +/// All methods have default implementations and are inherited via the blanket +/// `impl IndexedSinkExt for S` below. Bring the trait into scope +/// (`use vortex_buffer::lane_ops_indexed::IndexedSinkExt;`) to call them with +/// method syntax. +pub trait IndexedSinkExt: IndexedSink + Sized { + /// In-place counterpart of [`IndexedSourceExt::map_no_validity`]. Each lane + /// is replaced with `f(self[i])`. + /// + /// The closure reads `Self::Item` and returns `Self::Write`. For the common + /// case `Self = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and + /// write types can differ (e.g. read `f32`, write `u32`) over the same + /// backing memory when sizes and alignments match. + /// + /// As with [`IndexedSourceExt::map_no_validity`], use this only when the + /// input is known non-nullable. 
+ #[inline] + fn map_no_validity_in_place(self, mut f: F) where - S: IndexedSink, - F: FnMut(S::Item) -> S::Write, + F: FnMut(Self::Item) -> Self::Write, { - for bit_idx in 0..count { - let i = base + bit_idx; - // SAFETY: caller guarantees base + count <= len. - let v = unsafe { values.get_unchecked(i) }; - let r = f(v); - // SAFETY: caller guarantees base + count <= len. - unsafe { values.set_unchecked(i, r) }; + #[inline(always)] + fn chunk(values: &mut S, f: &mut F, base: usize, count: usize) + where + S: IndexedSink, + F: FnMut(S::Item) -> S::Write, + { + for bit_idx in 0..count { + let i = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. + let v = unsafe { values.get_unchecked(i) }; + let r = f(v); + // SAFETY: caller guarantees base + count <= len. + unsafe { values.set_unchecked(i, r) }; + } } - } - let len = values.len(); - let chunks_count = len / 64; - let remainder = len % 64; + let mut values = self; + let len = values.len(); + let chunks_count = len / 64; + let remainder = len % 64; - for chunk_idx in 0..chunks_count { - chunk(&mut values, &mut f, chunk_idx * 64, 64); - } - if remainder != 0 { - chunk(&mut values, &mut f, chunks_count * 64, remainder); + for chunk_idx in 0..chunks_count { + chunk(&mut values, &mut f, chunk_idx * 64, 64); + } + if remainder != 0 { + chunk(&mut values, &mut f, chunks_count * 64, remainder); + } } -} -/// In-place variant of [`try_map_no_validity`]. Each lane is replaced with -/// `f(values[i])`, or `S::Write::default()` when `f` returns `None`. On failure -/// returns `Err(first_failing_lane)`; the buffer state on `Err` is unspecified. -/// -/// As with [`try_map_no_validity`], use this only when the input is known -/// non-nullable — a `None` from `f` is treated as a failure regardless of any -/// upstream validity bitmap. -/// -/// ## Error attribution -/// -/// Per-lane `is_none()` flags are folded into `first_fail` via the same -/// branchless `min` scheme as [`try_map_with_mask_in_place`]. 
Cold replay -/// isn't viable here because the original input values have already been -/// overwritten by the time we'd attribute the failure. -#[inline] -#[allow(clippy::cast_possible_truncation)] -pub fn try_map_no_validity_in_place(mut values: S, mut f: F) -> Result<(), usize> -where - S: IndexedSink, - S::Write: Default, - F: FnMut(S::Item) -> Option, -{ - #[inline(always)] + /// In-place counterpart of [`IndexedSourceExt::try_map_no_validity`]. Each + /// lane is replaced with `f(self[i])`, or `Self::Write::default()` when `f` + /// returns `None`. On failure returns `Err(first_failing_lane)`; the buffer + /// state on `Err` is unspecified. + /// + /// As with [`IndexedSourceExt::try_map_no_validity`], use this only when the + /// input is known non-nullable — a `None` from `f` is treated as a failure + /// regardless of any upstream validity bitmap. + /// + /// ## Error attribution + /// + /// Per-lane `is_none()` flags are folded into `first_fail` via the same + /// branchless `min` scheme as [`try_map_with_mask_in_place`]. Cold replay + /// isn't viable here because the original input values have already been + /// overwritten by the time we'd attribute the failure. + /// + /// [`try_map_with_mask_in_place`]: IndexedSinkExt::try_map_with_mask_in_place + #[inline] #[allow(clippy::cast_possible_truncation)] - fn chunk(values: &mut S, base: usize, count: usize, f: &mut F) -> Option + fn try_map_no_validity_in_place(self, mut f: F) -> Result<(), usize> where - S: IndexedSink, - S::Write: Default, - F: FnMut(S::Item) -> Option, + Self::Write: Default, + F: FnMut(Self::Item) -> Option, { - let mut first_fail: u32 = u32::MAX; - for bit_idx in 0..count { - let i = base + bit_idx; - // SAFETY: caller guarantees base + count <= len. 
- let v = unsafe { values.get_unchecked(i) }; - let opt = f(v); - let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; - first_fail = first_fail.min(candidate); - let r = opt.unwrap_or_default(); - // SAFETY: caller guarantees base + count <= len. - unsafe { values.set_unchecked(i, r) }; + #[inline(always)] + #[allow(clippy::cast_possible_truncation)] + fn chunk(values: &mut S, base: usize, count: usize, f: &mut F) -> Option + where + S: IndexedSink, + S::Write: Default, + F: FnMut(S::Item) -> Option, + { + let mut first_fail: u32 = u32::MAX; + for bit_idx in 0..count { + let i = base + bit_idx; + // SAFETY: caller guarantees base + count <= len. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v); + let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; + first_fail = first_fail.min(candidate); + let r = opt.unwrap_or_default(); + // SAFETY: caller guarantees base + count <= len. + unsafe { values.set_unchecked(i, r) }; + } + (first_fail != u32::MAX).then_some(first_fail) } - (first_fail != u32::MAX).then_some(first_fail) - } - let len = values.len(); - let chunks_count = len / 64; - let remainder = len % 64; + let mut values = self; + let len = values.len(); + let chunks_count = len / 64; + let remainder = len % 64; - for chunk_idx in 0..chunks_count { - if let Some(failing) = chunk(&mut values, chunk_idx * 64, 64, &mut f) { + for chunk_idx in 0..chunks_count { + if let Some(failing) = chunk(&mut values, chunk_idx * 64, 64, &mut f) { + return Err(failing as usize); + } + } + if remainder != 0 + && let Some(failing) = chunk(&mut values, chunks_count * 64, remainder, &mut f) + { return Err(failing as usize); } + Ok(()) } - if remainder != 0 - && let Some(failing) = chunk(&mut values, chunks_count * 64, remainder, &mut f) - { - return Err(failing as usize); - } - Ok(()) -} -/// In-place variant of [`map_with_mask`]. Each lane is replaced with -/// `f(values[i], mask[i])`. The source `S` must be writable (an [`IndexedSink`]). 
-/// -/// The closure reads `S::Item` and returns `S::Write`. For the common case -/// `S = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and write -/// types can differ (e.g. read `f32`, write `u32`) over the same backing -/// memory when sizes and alignments match. -/// -/// # Panics -/// -/// Panics if `values.len() != mask.len()`. -#[inline] -pub fn map_with_mask_in_place(mut values: S, mask: &BitBuffer, mut f: F) -where - S: IndexedSink, - F: FnMut(S::Item, bool) -> S::Write, -{ - #[inline(always)] - fn chunk(values: &mut S, f: &mut F, src_chunk: u64, base: usize, count: usize) + /// In-place counterpart of [`IndexedSourceExt::map_with_mask`]. Each lane + /// is replaced with `f(self[i], mask[i])`. + /// + /// The closure reads `Self::Item` and returns `Self::Write`. For the common + /// case `Self = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and + /// write types can differ (e.g. read `f32`, write `u32`) over the same + /// backing memory when sizes and alignments match. + /// + /// # Panics + /// + /// Panics if `self.len() != mask.len()`. + #[inline] + fn map_with_mask_in_place(self, mask: &BitBuffer, mut f: F) where - S: IndexedSink, - F: FnMut(S::Item, bool) -> S::Write, + F: FnMut(Self::Item, bool) -> Self::Write, { - for bit_idx in 0..count { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: caller guarantees base + count <= len. - let v = unsafe { values.get_unchecked(i) }; - let r = f(v, bit); - unsafe { values.set_unchecked(i, r) }; + #[inline(always)] + fn chunk(values: &mut S, f: &mut F, src_chunk: u64, base: usize, count: usize) + where + S: IndexedSink, + F: FnMut(S::Item, bool) -> S::Write, + { + for bit_idx in 0..count { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees base + count <= len. 
+ let v = unsafe { values.get_unchecked(i) }; + let r = f(v, bit); + unsafe { values.set_unchecked(i, r) }; + } } - } - let len = values.len(); - assert_eq!(len, mask.len(), "values and mask must have the same length"); + let mut values = self; + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; - for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - chunk(&mut values, &mut f, src_chunk, chunk_idx * 64, 64); - } - if remainder != 0 { - chunk( - &mut values, - &mut f, - chunks.remainder_bits(), - chunks_count * 64, - remainder, - ); + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + chunk(&mut values, &mut f, src_chunk, chunk_idx * 64, 64); + } + if remainder != 0 { + chunk( + &mut values, + &mut f, + chunks.remainder_bits(), + chunks_count * 64, + remainder, + ); + } } -} -/// In-place variant of [`try_map_with_mask`]. Each lane of `values` is replaced -/// with `f(values[i], mask[i])`, or `S::Item::default()` if `f` returned `None`. -/// On failure returns `Err(first_failing_lane)`; lanes before that point have been -/// written, and lanes within the failing chunk hold their unwrapped-or-default -/// result. The buffer state on `Err` is intentionally unspecified. -/// -/// ## Error attribution -/// -/// Per-lane `is_none()` flags are folded into `first_fail` via a branchless -/// `min` of `(if is_none { i as u32 } else { u32::MAX })`. After the 64-lane -/// loop, `first_fail` holds the smallest failing index in the chunk (or `MAX` -/// if no failure). Vectorizes to NEON `bsl.16b` + `umin.4s` on AArch64. The -/// cold replay scheme used by [`try_map_with_mask`] isn't viable here because -/// the original input values have already been overwritten by the time we -/// would attribute the failure. 
-/// -/// ## Why in-place is slower at cache-resident sizes -/// -/// At sizes that fit in L1/L2 the in-place kernel is ~1.5× slower than the -/// out-of-place kernel despite having half the memory traffic, because input -/// and output share memory and the compiler must be conservative reordering -/// loads/stores across iterations. At sizes that exceed L2 the in-place kernel -/// wins back the gap by avoiding the second buffer's DRAM read+write traffic. -/// -/// # Panics -/// -/// Panics if `values.len() != mask.len()`. -#[inline] -#[allow(clippy::cast_possible_truncation)] -pub fn try_map_with_mask_in_place( - mut values: S, - mask: &BitBuffer, - mut f: F, -) -> Result<(), usize> -where - S: IndexedSink, - S::Write: Default, - F: FnMut(S::Item, bool) -> Option, -{ - /// Returns `Some(first_failing_lane_index_as_u32)` if any lane in - /// `[base, base+count)` failed (cast width-truncated since `i < 2^32` in any - /// realistic batch), else `None`. `#[inline(always)]` so the literal `64` at the - /// full-chunk call site enables const-propagation through inlining. - #[inline(always)] + /// In-place counterpart of [`IndexedSourceExt::try_map_with_mask`]. Each + /// lane of `self` is replaced with `f(self[i], mask[i])`, or + /// `Self::Write::default()` if `f` returned `None`. On failure returns + /// `Err(first_failing_lane)`; lanes before that point have been written, + /// and lanes within the failing chunk hold their unwrapped-or-default + /// result. The buffer state on `Err` is intentionally unspecified. + /// + /// ## Error attribution + /// + /// Per-lane `is_none()` flags are folded into `first_fail` via a branchless + /// `min` of `(if is_none { i as u32 } else { u32::MAX })`. After the 64-lane + /// loop, `first_fail` holds the smallest failing index in the chunk (or + /// `MAX` if no failure). Vectorizes to NEON `bsl.16b` + `umin.4s` on + /// AArch64. 
The cold replay scheme used by [`try_map_with_mask`] isn't + /// viable here because the original input values have already been + /// overwritten by the time we would attribute the failure. + /// + /// ## Why in-place is slower at cache-resident sizes + /// + /// At sizes that fit in L1/L2 the in-place kernel is ~1.5× slower than the + /// out-of-place kernel despite having half the memory traffic, because + /// input and output share memory and the compiler must be conservative + /// reordering loads/stores across iterations. At sizes that exceed L2 the + /// in-place kernel wins back the gap by avoiding the second buffer's DRAM + /// read+write traffic. + /// + /// [`try_map_with_mask`]: IndexedSourceExt::try_map_with_mask + /// + /// # Panics + /// + /// Panics if `self.len() != mask.len()`. + #[inline] #[allow(clippy::cast_possible_truncation)] - fn chunk( - values: &mut S, - src_chunk: u64, - base: usize, - count: usize, - f: &mut F, - ) -> Option + fn try_map_with_mask_in_place( + self, + mask: &BitBuffer, + mut f: F, + ) -> Result<(), usize> where - S: IndexedSink, - S::Write: Default, - F: FnMut(S::Item, bool) -> Option, + Self::Write: Default, + F: FnMut(Self::Item, bool) -> Option, { - let mut first_fail: u32 = u32::MAX; - for bit_idx in 0..count { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: caller guarantees `base + count <= values.len()`. 
- let v = unsafe { values.get_unchecked(i) }; - let opt = f(v, bit); - let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; - first_fail = first_fail.min(candidate); - let r = opt.unwrap_or_default(); - unsafe { values.set_unchecked(i, r) }; + #[inline(always)] + #[allow(clippy::cast_possible_truncation)] + fn chunk( + values: &mut S, + src_chunk: u64, + base: usize, + count: usize, + f: &mut F, + ) -> Option + where + S: IndexedSink, + S::Write: Default, + F: FnMut(S::Item, bool) -> Option, + { + let mut first_fail: u32 = u32::MAX; + for bit_idx in 0..count { + let i = base + bit_idx; + let bit = (src_chunk >> bit_idx) & 1 == 1; + // SAFETY: caller guarantees `base + count <= values.len()`. + let v = unsafe { values.get_unchecked(i) }; + let opt = f(v, bit); + let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; + first_fail = first_fail.min(candidate); + let r = opt.unwrap_or_default(); + unsafe { values.set_unchecked(i, r) }; + } + (first_fail != u32::MAX).then_some(first_fail) } - (first_fail != u32::MAX).then_some(first_fail) - } - let len = values.len(); - assert_eq!(len, mask.len(), "values and mask must have the same length"); + let mut values = self; + let len = values.len(); + assert_eq!(len, mask.len(), "values and mask must have the same length"); - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; + let chunks = mask.chunks(); + let chunks_count = len / 64; + let remainder = len % 64; - for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - if let Some(failing) = chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) { + for (chunk_idx, src_chunk) in chunks.iter().enumerate() { + if let Some(failing) = chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) { + return Err(failing as usize); + } + } + if remainder != 0 + && let Some(failing) = chunk( + &mut values, + chunks.remainder_bits(), + chunks_count * 64, + remainder, + &mut f, + ) + { return Err(failing as usize); } + Ok(()) 
} - if remainder != 0 - && let Some(failing) = chunk( - &mut values, - chunks.remainder_bits(), - chunks_count * 64, - remainder, - &mut f, - ) - { - return Err(failing as usize); - } - Ok(()) } +impl IndexedSinkExt for S {} + #[cfg(test)] #[allow(clippy::cast_possible_truncation)] mod tests { @@ -803,7 +835,7 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 10]; - map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + values.as_slice().map_with_mask(&mask, &mut out, |v, valid| { if valid { v } else { -1 } }); assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]); @@ -815,7 +847,7 @@ mod tests { let values: Vec = (0..130).collect(); let mask = BitBuffer::new_set(130); let mut out = vec![MaybeUninit::::uninit(); 130]; - map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + values.as_slice().map_with_mask(&mask, &mut out, |v, valid| { if valid { v + 1 } else { 0 } }); let got = write_t(out); @@ -836,7 +868,7 @@ mod tests { let values: Vec = (0..65).collect(); let mut out = vec![MaybeUninit::::uninit(); 65]; - map_with_mask(values.as_slice(), &sliced, &mut out, |v, valid| { + values.as_slice().map_with_mask(&sliced, &mut out, |v, valid| { if valid { v } else { u32::MAX } }); let got = write_t(out); @@ -855,7 +887,7 @@ mod tests { let values: Vec = (0..130).map(|i| i as i16).collect(); let mut out = vec![MaybeUninit::::uninit(); 130]; - map_with_mask(values.as_slice(), &sliced, &mut out, |v, valid| { + values.as_slice().map_with_mask(&sliced, &mut out, |v, valid| { if valid { v } else { -1 } }); let got = write_t(out); @@ -867,7 +899,7 @@ mod tests { let values: Vec = vec![]; let mask = BitBuffer::new_unset(0); let mut out: Vec> = vec![]; - map_with_mask(values.as_slice(), &mask, &mut out, |v, _| v); + values.as_slice().map_with_mask(&mask, &mut out, |v, _| v); } #[test] @@ -882,7 +914,7 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 100]; - map_with_mask(values.as_slice(), &mask, &mut 
out, |v, valid| { + values.as_slice().map_with_mask(&mask, &mut out, |v, valid| { v * (valid as i64) }); let got = write_t(out); @@ -900,7 +932,7 @@ mod tests { let values: Vec = (0..200).collect(); let mask = BitBuffer::new_set(200); let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { let scaled = v * valid as u64; (scaled <= u32::MAX as u64).then_some(scaled as u32) }); @@ -916,7 +948,7 @@ mod tests { values[137] = (u32::MAX as u64) + 1; let mask = BitBuffer::new_set(200); let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { let scaled = v * valid as u64; (scaled <= u32::MAX as u64).then_some(scaled as u32) }); @@ -932,7 +964,7 @@ mod tests { values[137] = u64::MAX; let mask = BitBuffer::new_set(200); let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { let scaled = v * valid as u64; (scaled <= u32::MAX as u64).then_some(scaled as u32) }); @@ -954,7 +986,7 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, _valid| { + let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, _valid| { (v <= u32::MAX as u64).then_some(v as u32) }); assert!( @@ -981,7 +1013,7 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, _valid| { + let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, _valid| { (v <= u32::MAX as u64).then_some(v as u32) }); assert_eq!(res, Err(77)); @@ -1001,7 +1033,7 @@ mod tests { 
m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { let scaled = v * valid as u64; (scaled <= u32::MAX as u64).then_some(scaled as u32) }); @@ -1026,12 +1058,12 @@ mod tests { let mut branchless = vec![MaybeUninit::::uninit(); 130]; let mut branchful = vec![MaybeUninit::::uninit(); 130]; - try_map_with_mask(values.as_slice(), &mask, &mut branchless, |v, valid| { + values.as_slice().try_map_with_mask(&mask, &mut branchless, |v, valid| { let scaled = v * valid as u64; (scaled <= u32::MAX as u64).then_some(scaled as u32) }) .unwrap(); - try_map_with_mask(values.as_slice(), &mask, &mut branchful, |v, valid| { + values.as_slice().try_map_with_mask(&mask, &mut branchful, |v, valid| { if valid { u32::try_from(v).ok() } else { @@ -1048,7 +1080,7 @@ mod tests { let values: Vec = (0..130).collect(); let mask = BitBuffer::new_set(130); let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { let scaled = v * valid as u64; (scaled <= u32::MAX as u64).then_some(scaled as u32) }); @@ -1069,7 +1101,7 @@ mod tests { let values: Vec = (0..130).collect(); let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { let scaled = v * valid as u64; (scaled <= u32::MAX as u64).then_some(scaled as u32) }); @@ -1089,7 +1121,7 @@ mod tests { let mut values: Vec = (0..130).collect(); values[77] = u64::MAX; let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { let scaled = v * valid as 
u64; (scaled <= u32::MAX as u64).then_some(scaled as u32) }); @@ -1115,7 +1147,7 @@ mod tests { // Stuff in an overflowing value; it must be neutralized by `* valid as u64`. values[2] = u64::MAX; let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { let scaled = v * valid as u64; (scaled <= u32::MAX as u64).then_some(scaled as u32) }); @@ -1129,7 +1161,7 @@ mod tests { values[129] = (u32::MAX as u64) + 1; let mask = BitBuffer::new_set(130); let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| { + let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { let scaled = v * valid as u64; (scaled <= u32::MAX as u64).then_some(scaled as u32) }); @@ -1146,7 +1178,7 @@ mod tests { } m.freeze() }; - map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| { + values.as_mut_slice().map_with_mask_in_place(&mask, |v, valid| { v.wrapping_mul(valid as u32) }); let expected: Vec = (0..130u32) @@ -1159,7 +1191,7 @@ mod tests { fn try_map_with_mask_in_place_all_ok() { let mut values: Vec = (0..200).collect(); let mask = BitBuffer::new_set(200); - let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| { + let res = values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, valid| { let scaled = v.wrapping_mul(valid as u32); scaled.checked_mul(2) }); @@ -1175,7 +1207,7 @@ mod tests { values[150] = u32::MAX; let mask = BitBuffer::new_set(200); let res = - try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2)); + values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); assert_eq!(res, Err(83)); } @@ -1186,7 +1218,7 @@ mod tests { values[100] = u32::MAX; let mask = BitBuffer::new_set(200); let res = - try_map_with_mask_in_place(values.as_mut_slice(), 
&mask, |v, _valid| v.checked_mul(2)); + values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); assert_eq!(res, Err(80)); } @@ -1196,7 +1228,7 @@ mod tests { values[42] = u32::MAX; let mask = BitBuffer::new_set(200); let res = - try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2)); + values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); assert_eq!(res, Err(42)); } @@ -1211,7 +1243,7 @@ mod tests { } m.freeze() }; - let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| { + let res = values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, valid| { v.wrapping_mul(valid as u32).checked_mul(2) }); assert!(res.is_ok()); @@ -1225,7 +1257,7 @@ mod tests { values[129] = u32::MAX; let mask = BitBuffer::new_set(130); let res = - try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2)); + values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); assert_eq!(res, Err(129)); } @@ -1238,7 +1270,7 @@ mod tests { let mut values: Vec = (0..130).collect(); values[77] = u32::MAX; let res = - try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2)); + values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); assert_eq!(res, Err(77)); } @@ -1248,12 +1280,9 @@ mod tests { // should see exactly the bit patterns the closure produced. let mut buf: Vec = (0..130).map(|i| i as f32).collect(); let mask = BitBuffer::new_set(130); - try_map_with_mask_in_place( - ReinterpretSink::::new(buf.as_mut_slice()), - &mask, - |f, _valid| Some(f.to_bits().wrapping_add(1)), - ) - .unwrap(); + ReinterpretSink::::new(buf.as_mut_slice()) + .try_map_with_mask_in_place(&mask, |f, _valid| Some(f.to_bits().wrapping_add(1))) + .unwrap(); // SAFETY: same size + alignment for f32 and u32; every slot now holds a u32 written by // the closure. 
let as_u32: &[u32] = @@ -1268,17 +1297,17 @@ mod tests { // Closure fails at a specific lane; the kernel must report that lane index. let mut buf: Vec = (0..200).map(|i| i as f32).collect(); let mask = BitBuffer::new_set(200); - let res = try_map_with_mask_in_place( - ReinterpretSink::::new(buf.as_mut_slice()), - &mask, - |f, _valid| { - if f as u32 == 137 { - None - } else { - Some(f as u32) - } - }, - ); + let res = + ReinterpretSink::::new(buf.as_mut_slice()).try_map_with_mask_in_place( + &mask, + |f, _valid| { + if f as u32 == 137 { + None + } else { + Some(f as u32) + } + }, + ); assert_eq!(res, Err(137)); } @@ -1286,7 +1315,7 @@ mod tests { fn try_map_with_mask_in_place_partial_chunk_success() { let mut values: Vec = (0..130).collect(); let mask = BitBuffer::new_set(130); - let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| Some(v + 1)); + let res = values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| Some(v + 1)); assert!(res.is_ok()); assert_eq!(values[0], 1); assert_eq!(values[63], 64); From d8d5463edfa54ed37032867917ec381a347368dc Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 17:06:39 +0100 Subject: [PATCH 15/21] f Signed-off-by: Joe Isaacs --- vortex-array/benches/cast_primitive.rs | 18 +- vortex-buffer/benches/cast_to_indexed.rs | 74 ++++--- vortex-buffer/src/lane_ops_indexed.rs | 256 +++++++++++++---------- 3 files changed, 194 insertions(+), 154 deletions(-) diff --git a/vortex-array/benches/cast_primitive.rs b/vortex-array/benches/cast_primitive.rs index 0b67571e93d..d4279993068 100644 --- a/vortex-array/benches/cast_primitive.rs +++ b/vortex-array/benches/cast_primitive.rs @@ -56,13 +56,9 @@ fn cast_u16_to_u32(bencher: Bencher) { #[divan::bench(args = SIZES)] fn cast_u32_to_u8(bencher: Bencher, n: usize) { let mut rng = StdRng::seed_from_u64(42); - #[expect(clippy::cast_possible_truncation)] let arr = PrimitiveArray::from_option_iter((0..n).map(|_| { - if rng.random_bool(0.7) { - 
Some(rng.random_range(0..u8::MAX) as u32) - } else { - None - } + rng.random_bool(0.7) + .then(|| rng.random_range(0..u8::MAX) as u32) })) .into_array(); bencher.with_inputs(|| arr.clone()).bench_refs(|a| { @@ -78,13 +74,9 @@ fn cast_u32_to_u8(bencher: Bencher, n: usize) { #[divan::bench(args = SIZES)] fn cast_i32_to_u32(bencher: Bencher, n: usize) { let mut rng = StdRng::seed_from_u64(42); - let arr = PrimitiveArray::from_option_iter((0..n).map(|_| { - if rng.random_bool(0.7) { - Some(rng.random_range(0..i32::MAX)) - } else { - None - } - })) + let arr = PrimitiveArray::from_option_iter( + (0..n).map(|_| rng.random_bool(0.7).then(|| rng.random_range(0..i32::MAX))), + ) .into_array(); bencher.with_inputs(|| arr.clone()).bench_refs(|a| { #[expect(clippy::unwrap_used)] diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs index bcc30669ccb..dedddc1733a 100644 --- a/vortex-buffer/benches/cast_to_indexed.rs +++ b/vortex-buffer/benches/cast_to_indexed.rs @@ -139,9 +139,11 @@ fn map_with_mask_widen_u16_u32_zero_nulls(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) .bench_values(|(values, mask, mut out)| { - values.as_slice().map_with_mask(&mask, out.as_mut_slice(), |v, valid| { - >::from(v) * valid as u32 - }); + values + .as_slice() + .map_with_mask(&mask, out.as_mut_slice(), |v, valid| { + >::from(v) * valid as u32 + }); out }); } @@ -153,10 +155,10 @@ fn try_map_no_validity_narrow_u64_u32(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u64.clone(), uninit_out::(n))) .bench_values(|(values, mut out)| { - values.as_slice().try_map_no_validity(out.as_mut_slice(), |v| { - ::from(v) - }) - .unwrap(); + values + .as_slice() + .try_map_no_validity(out.as_mut_slice(), ::from) + .unwrap(); out }); } @@ -172,10 +174,12 @@ fn try_map_with_mask_narrow_u64_u32_ignoring_valid(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u64.clone(), 
f.mask.clone(), uninit_out::(n))) .bench_values(|(values, mask, mut out)| { - values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| { - ::from(v) - }) - .unwrap(); + values + .as_slice() + .try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| { + ::from(v) + }) + .unwrap(); out }); } @@ -187,10 +191,12 @@ fn try_map_with_mask_narrow_u64_u32_lazy_validity(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::(n))) .bench_values(|(values, mask, mut out)| { - values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| { - ::from(v).or_else(|| (!valid).then(u32::default)) - }) - .unwrap(); + values + .as_slice() + .try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| { + ::from(v).or_else(|| (!valid).then(u32::default)) + }) + .unwrap(); out }); } @@ -212,10 +218,12 @@ fn try_map_with_mask_narrow_u64_u32_value_only_filtered(bencher: Bencher, n: usi ) }) .bench_values(|(values, mask, mut out)| { - values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| { - ::from(v) - }) - .unwrap(); + values + .as_slice() + .try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| { + ::from(v) + }) + .unwrap(); out }); } @@ -227,10 +235,12 @@ fn try_map_with_mask_widen_u16_u32_or_else(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) .bench_values(|(values, mask, mut out)| { - values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| { - Some(>::from(v)).or_else(|| (!valid).then(u32::default)) - }) - .unwrap(); + values + .as_slice() + .try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| { + Some(>::from(v)).or_else(|| (!valid).then(u32::default)) + }) + .unwrap(); out }); } @@ -242,10 +252,12 @@ fn try_map_with_mask_widen_u16_u32_maskless(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) .bench_values(|(values, mask, mut out)| { - 
values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| { - Some(>::from(v)) - }) - .unwrap(); + values + .as_slice() + .try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| { + Some(>::from(v)) + }) + .unwrap(); out }); } @@ -257,7 +269,9 @@ fn map_with_mask_in_place_u32_zero_nulls(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u32.as_slice().to_vec(), f.mask.clone())) .bench_values(|(mut values, mask)| { - values.as_mut_slice().map_with_mask_in_place(&mask, |v, valid| v * valid as u32); + values + .as_mut_slice() + .map_with_mask_in_place(&mask, |v, valid| v * valid as u32); values }); } @@ -269,7 +283,9 @@ fn try_map_with_mask_in_place_u32_checked_mul(bencher: Bencher, n: usize) { bencher .with_inputs(|| (f.values_u32_small.as_slice().to_vec(), f.mask.clone())) .bench_values(|(mut values, mask)| { - values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)) + values + .as_mut_slice() + .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)) .unwrap(); values }); diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs index 683a03c5539..98d3edf5473 100644 --- a/vortex-buffer/src/lane_ops_indexed.rs +++ b/vortex-buffer/src/lane_ops_indexed.rs @@ -441,11 +441,7 @@ pub trait IndexedSourceExt: IndexedSource + Sized { /// /// Panics if `out.len() != self.len()`. #[inline] - fn try_map_no_validity( - self, - out: &mut [MaybeUninit], - mut f: F, - ) -> Result<(), usize> + fn try_map_no_validity(self, out: &mut [MaybeUninit], mut f: F) -> Result<(), usize> where R: Copy + Default, F: FnMut(Self::Item) -> Option, @@ -745,11 +741,7 @@ pub trait IndexedSinkExt: IndexedSink + Sized { /// Panics if `self.len() != mask.len()`. 
#[inline] #[allow(clippy::cast_possible_truncation)] - fn try_map_with_mask_in_place( - self, - mask: &BitBuffer, - mut f: F, - ) -> Result<(), usize> + fn try_map_with_mask_in_place(self, mask: &BitBuffer, mut f: F) -> Result<(), usize> where Self::Write: Default, F: FnMut(Self::Item, bool) -> Option, @@ -835,9 +827,9 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 10]; - values.as_slice().map_with_mask(&mask, &mut out, |v, valid| { - if valid { v } else { -1 } - }); + values + .as_slice() + .map_with_mask(&mask, &mut out, |v, valid| if valid { v } else { -1 }); assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]); } @@ -847,9 +839,9 @@ mod tests { let values: Vec = (0..130).collect(); let mask = BitBuffer::new_set(130); let mut out = vec![MaybeUninit::::uninit(); 130]; - values.as_slice().map_with_mask(&mask, &mut out, |v, valid| { - if valid { v + 1 } else { 0 } - }); + values + .as_slice() + .map_with_mask(&mask, &mut out, |v, valid| if valid { v + 1 } else { 0 }); let got = write_t(out); assert_eq!(got.len(), 130); assert_eq!(got[0], 1); @@ -868,9 +860,13 @@ mod tests { let values: Vec = (0..65).collect(); let mut out = vec![MaybeUninit::::uninit(); 65]; - values.as_slice().map_with_mask(&sliced, &mut out, |v, valid| { - if valid { v } else { u32::MAX } - }); + values.as_slice().map_with_mask( + &sliced, + &mut out, + |v, valid| { + if valid { v } else { u32::MAX } + }, + ); let got = write_t(out); assert_eq!(got, (0..65).collect::>()); } @@ -887,9 +883,9 @@ mod tests { let values: Vec = (0..130).map(|i| i as i16).collect(); let mut out = vec![MaybeUninit::::uninit(); 130]; - values.as_slice().map_with_mask(&sliced, &mut out, |v, valid| { - if valid { v } else { -1 } - }); + values + .as_slice() + .map_with_mask(&sliced, &mut out, |v, valid| if valid { v } else { -1 }); let got = write_t(out); assert_eq!(got, (0..130).map(|i| i as i16).collect::>()); } @@ -914,9 +910,9 @@ mod tests { m.freeze() }; let mut out = 
vec![MaybeUninit::::uninit(); 100]; - values.as_slice().map_with_mask(&mask, &mut out, |v, valid| { - v * (valid as i64) - }); + values + .as_slice() + .map_with_mask(&mask, &mut out, |v, valid| v * (valid as i64)); let got = write_t(out); for (i, &x) in got.iter().enumerate() { if i % 3 == 0 { @@ -932,10 +928,12 @@ mod tests { let values: Vec = (0..200).collect(); let mask = BitBuffer::new_set(200); let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values + .as_slice() + .try_map_with_mask(&mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); assert!(res.is_ok()); let got = write_t(out); assert_eq!(got, (0..200u32).collect::>()); @@ -948,10 +946,12 @@ mod tests { values[137] = (u32::MAX as u64) + 1; let mask = BitBuffer::new_set(200); let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values + .as_slice() + .try_map_with_mask(&mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); assert_eq!(res, Err(137)); } @@ -964,10 +964,12 @@ mod tests { values[137] = u64::MAX; let mask = BitBuffer::new_set(200); let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values + .as_slice() + .try_map_with_mask(&mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); assert_eq!(res, Err(50)); } @@ -986,9 +988,11 @@ mod tests { m.freeze() }; 
let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, _valid| { - (v <= u32::MAX as u64).then_some(v as u32) - }); + let res = values + .as_slice() + .try_map_with_mask(&mask, &mut out, |v, _valid| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert!( res.is_ok(), "null-lane overflow should be filtered by the cold path" @@ -1013,9 +1017,11 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, _valid| { - (v <= u32::MAX as u64).then_some(v as u32) - }); + let res = values + .as_slice() + .try_map_with_mask(&mask, &mut out, |v, _valid| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert_eq!(res, Err(77)); } @@ -1033,10 +1039,12 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values + .as_slice() + .try_map_with_mask(&mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); assert!(res.is_ok()); let got = write_t(out); assert_eq!(got[5], 0); // null-lane wrote default @@ -1058,19 +1066,23 @@ mod tests { let mut branchless = vec![MaybeUninit::::uninit(); 130]; let mut branchful = vec![MaybeUninit::::uninit(); 130]; - values.as_slice().try_map_with_mask(&mask, &mut branchless, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }) - .unwrap(); - values.as_slice().try_map_with_mask(&mask, &mut branchful, |v, valid| { - if valid { - u32::try_from(v).ok() - } else { - Some(0) - } - }) - .unwrap(); + values + .as_slice() + .try_map_with_mask(&mask, &mut branchless, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }) + 
.unwrap(); + values + .as_slice() + .try_map_with_mask(&mask, &mut branchful, |v, valid| { + if valid { + u32::try_from(v).ok() + } else { + Some(0) + } + }) + .unwrap(); assert_eq!(write_t(branchful), write_t(branchless)); } @@ -1080,10 +1092,12 @@ mod tests { let values: Vec = (0..130).collect(); let mask = BitBuffer::new_set(130); let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values + .as_slice() + .try_map_with_mask(&mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); assert!(res.is_ok()); let got = write_t(out); assert_eq!(got.len(), 130); @@ -1101,10 +1115,12 @@ mod tests { let values: Vec = (0..130).collect(); let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values + .as_slice() + .try_map_with_mask(&mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); assert!(res.is_ok()); let got = write_t(out); assert_eq!(got, (0..130u32).collect::>()); @@ -1121,10 +1137,12 @@ mod tests { let mut values: Vec = (0..130).collect(); values[77] = u64::MAX; let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values + .as_slice() + .try_map_with_mask(&mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); assert_eq!(res, Err(77)); } @@ -1147,10 +1165,12 @@ mod tests { // Stuff in an overflowing value; it must be neutralized by 
`* valid as u64`. values[2] = u64::MAX; let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values + .as_slice() + .try_map_with_mask(&mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); assert!(res.is_ok(), "null lane should bypass the range check"); } @@ -1161,10 +1181,12 @@ mod tests { values[129] = (u32::MAX as u64) + 1; let mask = BitBuffer::new_set(130); let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values + .as_slice() + .try_map_with_mask(&mask, &mut out, |v, valid| { + let scaled = v * valid as u64; + (scaled <= u32::MAX as u64).then_some(scaled as u32) + }); assert_eq!(res, Err(129)); } @@ -1178,9 +1200,9 @@ mod tests { } m.freeze() }; - values.as_mut_slice().map_with_mask_in_place(&mask, |v, valid| { - v.wrapping_mul(valid as u32) - }); + values + .as_mut_slice() + .map_with_mask_in_place(&mask, |v, valid| v.wrapping_mul(valid as u32)); let expected: Vec = (0..130u32) .map(|v| if v % 2 == 0 { v } else { 0 }) .collect(); @@ -1191,10 +1213,12 @@ mod tests { fn try_map_with_mask_in_place_all_ok() { let mut values: Vec = (0..200).collect(); let mask = BitBuffer::new_set(200); - let res = values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, valid| { - let scaled = v.wrapping_mul(valid as u32); - scaled.checked_mul(2) - }); + let res = values + .as_mut_slice() + .try_map_with_mask_in_place(&mask, |v, valid| { + let scaled = v.wrapping_mul(valid as u32); + scaled.checked_mul(2) + }); assert!(res.is_ok()); let expected: Vec = (0..200u32).map(|v| v * 2).collect(); assert_eq!(values, expected); @@ -1206,8 +1230,9 @@ mod 
tests { values[83] = u32::MAX; values[150] = u32::MAX; let mask = BitBuffer::new_set(200); - let res = - values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); + let res = values + .as_mut_slice() + .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); assert_eq!(res, Err(83)); } @@ -1217,8 +1242,9 @@ mod tests { values[80] = u32::MAX; values[100] = u32::MAX; let mask = BitBuffer::new_set(200); - let res = - values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); + let res = values + .as_mut_slice() + .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); assert_eq!(res, Err(80)); } @@ -1227,8 +1253,9 @@ mod tests { let mut values: Vec = (0..200).collect(); values[42] = u32::MAX; let mask = BitBuffer::new_set(200); - let res = - values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); + let res = values + .as_mut_slice() + .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); assert_eq!(res, Err(42)); } @@ -1243,9 +1270,11 @@ mod tests { } m.freeze() }; - let res = values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, valid| { - v.wrapping_mul(valid as u32).checked_mul(2) - }); + let res = values + .as_mut_slice() + .try_map_with_mask_in_place(&mask, |v, valid| { + v.wrapping_mul(valid as u32).checked_mul(2) + }); assert!(res.is_ok()); assert_eq!(values[5], 0); assert_eq!(values[6], 12); @@ -1256,8 +1285,9 @@ mod tests { let mut values: Vec = (0..130).collect(); values[129] = u32::MAX; let mask = BitBuffer::new_set(130); - let res = - values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); + let res = values + .as_mut_slice() + .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); assert_eq!(res, Err(129)); } @@ -1269,8 +1299,9 @@ mod tests { let mut values: Vec = (0..130).collect(); values[77] = u32::MAX; - let res = - values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| 
v.checked_mul(2)); + let res = values + .as_mut_slice() + .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); assert_eq!(res, Err(77)); } @@ -1297,17 +1328,16 @@ mod tests { // Closure fails at a specific lane; the kernel must report that lane index. let mut buf: Vec = (0..200).map(|i| i as f32).collect(); let mask = BitBuffer::new_set(200); - let res = - ReinterpretSink::::new(buf.as_mut_slice()).try_map_with_mask_in_place( - &mask, - |f, _valid| { - if f as u32 == 137 { - None - } else { - Some(f as u32) - } - }, - ); + let res = ReinterpretSink::::new(buf.as_mut_slice()).try_map_with_mask_in_place( + &mask, + |f, _valid| { + if f as u32 == 137 { + None + } else { + Some(f as u32) + } + }, + ); assert_eq!(res, Err(137)); } @@ -1315,7 +1345,9 @@ mod tests { fn try_map_with_mask_in_place_partial_chunk_success() { let mut values: Vec = (0..130).collect(); let mask = BitBuffer::new_set(130); - let res = values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| Some(v + 1)); + let res = values + .as_mut_slice() + .try_map_with_mask_in_place(&mask, |v, _valid| Some(v + 1)); assert!(res.is_ok()); assert_eq!(values[0], 1); assert_eq!(values[63], 64); From 2556d5331a961abe734430d23efbc78cab1e131f Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 17:20:47 +0100 Subject: [PATCH 16/21] f Signed-off-by: Joe Isaacs --- vortex-array/Cargo.toml | 1 - vortex-buffer/src/lane_ops_indexed.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index e5233ce7cc6..666a23c02c4 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -218,4 +218,3 @@ harness = false [[bench]] name = "to_arrow" harness = false - diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs index 98d3edf5473..1244ef3d0b7 100644 --- a/vortex-buffer/src/lane_ops_indexed.rs +++ b/vortex-buffer/src/lane_ops_indexed.rs @@ -290,7 +290,7 @@ pub trait IndexedSourceExt: 
IndexedSource + Sized { /// **Null-lane failures are filtered automatically.** If a null lane's stored /// value causes `f(v, false)` to return `None`, the kernel does *not* propagate /// that as `Err`. The per-lane `is_none()` flags are bit-packed into a `u64` at - /// the lane's position, then ANDed with the chunk's validity bitmap — null-lane + /// the lane's position, then AND-combined with the chunk's validity bitmap — null-lane /// bits vanish. The closure may also explicitly suppress null-lane failures by /// branching on `valid` itself; both behaviors compose. /// From aa8a6d181601704a14011204960bf1a7c8676e14 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 17:22:29 +0100 Subject: [PATCH 17/21] f Signed-off-by: Joe Isaacs --- vortex-array/benches/cast_primitive.rs | 16 +++++----------- vortex-buffer/benches/add_checked.rs | 2 +- vortex-buffer/benches/cast_to_indexed.rs | 2 +- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/vortex-array/benches/cast_primitive.rs b/vortex-array/benches/cast_primitive.rs index d4279993068..63eeb350e7a 100644 --- a/vortex-array/benches/cast_primitive.rs +++ b/vortex-array/benches/cast_primitive.rs @@ -18,22 +18,16 @@ fn main() { divan::main(); } -const N: usize = 100_000; - // Sizes used for the fallible-path benches below. Kept small enough to fit in L2 so // the kernel cost shows up clearly rather than being hidden by DRAM bandwidth. 
const SIZES: &[usize] = &[65_536]; -#[divan::bench] -fn cast_u16_to_u32(bencher: Bencher) { +#[divan::bench(args = SIZES)] +fn cast_u16_to_u32(bencher: Bencher, n: usize) { let mut rng = StdRng::seed_from_u64(42); - #[expect(clippy::cast_possible_truncation)] - let arr = PrimitiveArray::from_option_iter((0..N).map(|i| { - if rng.random_bool(0.5) { - None - } else { - Some(i as u16) - } + let arr = PrimitiveArray::from_option_iter((0..n).map(|i| { + #[expect(clippy::cast_possible_truncation)] + rng.random_bool(0.5).then(|| i as u16) })) .into_array(); // Pre-compute min/max so values_fit_in is a cache hit during the benchmark. diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs index 5c838479a13..c568cffaa23 100644 --- a/vortex-buffer/benches/add_checked.rs +++ b/vortex-buffer/benches/add_checked.rs @@ -29,7 +29,7 @@ fn main() { divan::main(); } -const SIZES: &[usize] = &[4_096, 65_536, 1_048_576, 2_097_152, 4_194_304]; +const SIZES: &[usize] = &[65_536]; const LHS_VALID_RATE: f64 = 0.7; const RHS_VALID_RATE: f64 = 0.8; diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs index dedddc1733a..524985baa6b 100644 --- a/vortex-buffer/benches/cast_to_indexed.rs +++ b/vortex-buffer/benches/cast_to_indexed.rs @@ -30,7 +30,7 @@ fn main() { divan::main(); } -const SIZES: &[usize] = &[4_096, 65_536, 1_048_576]; +const SIZES: &[usize] = &[65_536]; struct Fixture { values_u64: Buffer, From ca2ad88e52ffb6e657ce800f98a6214ed262697f Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 19:16:43 +0100 Subject: [PATCH 18/21] f Signed-off-by: Joe Isaacs --- .../src/arrays/primitive/compute/cast.rs | 38 +- vortex-buffer/benches/add_checked.rs | 14 +- vortex-buffer/benches/cast_to_indexed.rs | 195 ++---- vortex-buffer/src/lane_ops_indexed.rs | 556 +++++------------- 4 files changed, 207 insertions(+), 596 deletions(-) diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs 
b/vortex-array/src/arrays/primitive/compute/cast.rs index 9aef97e6c9d..34bc6ba3445 100644 --- a/vortex-array/src/arrays/primitive/compute/cast.rs +++ b/vortex-array/src/arrays/primitive/compute/cast.rs @@ -108,9 +108,9 @@ impl CastKernel for Primitive { /// Cast values from `F` to `T`. Always routes through the fallible lane-op kernels with /// `NumCast::from`. The kernel branches once on the mask shape: /// -/// - `Mask::AllTrue` → [`try_map_no_validity`] — no per-lane validity work. +/// - `Mask::AllTrue` → [`try_map_into`] — no per-lane validity work. /// - `Mask::AllFalse` → bulk zero — the closure is never invoked. -/// - `Mask::Values` → [`try_map_with_mask`] — the closure neutralizes null lanes +/// - `Mask::Values` → [`try_map_masked_into`] — the closure neutralizes null lanes /// via the `* valid as F` multiply trick so out-of-range null-lane values don't /// trigger spurious errors. /// @@ -170,8 +170,7 @@ where // (harmless: the result validity bitmap masks them downstream). return match owned { Some(mut buf) => { - ReinterpretSink::::new(buf.as_mut_slice()) - .map_no_validity_in_place(|v: F| v.as_()); + ReinterpretSink::::new(buf.as_mut_slice()).map_into_in_place(|v: F| v.as_()); // SAFETY: same size + alignment for NativePType same-byte-width pairs; // every F-slot was overwritten with a real `T` bit pattern. let result: BufferMut = unsafe { buf.transmute::() }; @@ -179,8 +178,8 @@ where } None => { let mut buffer = BufferMut::::with_capacity(len); - values.map_no_validity(&mut buffer.spare_capacity_mut()[..len], |v| v.as_()); - // SAFETY: map_no_validity initializes every lane. + values.map_into(&mut buffer.spare_capacity_mut()[..len], |v| v.as_()); + // SAFETY: map_into initializes every lane. 
unsafe { buffer.set_len(len) }; Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array()) } @@ -192,7 +191,7 @@ where let buffer: Buffer = match (&mask, owned) { (Mask::AllTrue(_), Some(mut buf)) => { ReinterpretSink::::new(buf.as_mut_slice()) - .try_map_no_validity_in_place(|v: F| ::from(v)) + .try_map_in_place(|v: F| ::from(v)) .map_err(|_| overflow())?; // SAFETY: same size + alignment for NativePType same-byte-width pairs; // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`. @@ -202,11 +201,11 @@ where (Mask::AllTrue(_), None) => { let mut buffer = BufferMut::::with_capacity(len); values - .try_map_no_validity(&mut buffer.spare_capacity_mut()[..len], |v| { + .try_map_into(&mut buffer.spare_capacity_mut()[..len], |v| { ::from(v) }) .map_err(|_| overflow())?; - // SAFETY: try_map_no_validity returned Ok, so it initialized every lane. + // SAFETY: try_map_into returned Ok, so it initialized every lane. unsafe { buffer.set_len(len) }; buffer.freeze() } @@ -219,9 +218,7 @@ where (Mask::AllFalse(_), None) => BufferMut::::zeroed(len).freeze(), (Mask::Values(m), Some(mut buf)) => { ReinterpretSink::::new(buf.as_mut_slice()) - .try_map_with_mask_in_place(m.bit_buffer(), |v: F, valid| { - ::from(v).or_else(|| (!valid).then(T::zero)) - }) + .try_map_masked_in_place(m.bit_buffer(), |v: F| ::from(v)) .map_err(|_| overflow())?; // SAFETY: same size + alignment for NativePType same-byte-width pairs; // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`. @@ -230,20 +227,19 @@ where } (Mask::Values(m), None) => { let mut buffer = BufferMut::::with_capacity(len); + // Null-lane failures (where the underlying garbage value can't be represented in + // `T`) are filtered automatically by `try_map_masked_into`'s post-loop + // `fail_bits & src_chunk` AND. The closure is value-only — LLVM proves it's + // statically infallible for widening casts and DCEs the fail-tracking, giving the + // same codegen as the maskless kernel. 
values - .try_map_with_mask( + .try_map_masked_into( m.bit_buffer(), &mut buffer.spare_capacity_mut()[..len], - // Lazy validity: only consult `valid` on the failure branch. For widening / - // statically-infallible casts, `NumCast::from` is always `Some` so the - // `or_else` is provably dead — LLVM DCEs the validity path entirely, - // giving the same codegen as the maskless kernel. For narrowing, `valid` - // is only read at lanes that actually overflowed (a cold check on top of - // the cast). - |v, valid| ::from(v).or_else(|| (!valid).then(T::zero)), + |v| ::from(v), ) .map_err(|_| overflow())?; - // SAFETY: try_map_with_mask returned Ok, so it initialized every lane. + // SAFETY: try_map_masked_into returned Ok, so it initialized every lane. unsafe { buffer.set_len(len) }; buffer.freeze() } diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs index c568cffaa23..4f71f085847 100644 --- a/vortex-buffer/benches/add_checked.rs +++ b/vortex-buffer/benches/add_checked.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Checked `u32 + u32 -> u32` over two nullable columns via [`try_map_with_mask`] +//! Checked `u32 + u32 -> u32` over two nullable columns via [`try_map_masked_into`] //! with a value-only closure. Per-lane `is_none()` flags are bit-packed and //! AND-ed with the chunk validity word so null-lane overflow is filtered //! without the closure ever inspecting `valid`. 
@@ -117,9 +117,7 @@ fn bitpack_value_only(bencher: Bencher, n: usize) { let combined = lm as &BitBuffer & rm as &BitBuffer; let mut out = alloc_out(n); LaneZip::new(lhs.as_slice(), rhs.as_slice()) - .try_map_with_mask(&combined, out.as_mut_slice(), |(a, b), _valid| { - a.checked_add(b) - }) + .try_map_masked_into(&combined, out.as_mut_slice(), |(a, b)| a.checked_add(b)) .unwrap(); (combined, out) }); @@ -144,10 +142,10 @@ fn assert_overflow_parity() { }; let mut out: Vec> = (0..4).map(|_| MaybeUninit::uninit()).collect(); - let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_with_mask( + let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into( &mask, out.as_mut_slice(), - |(a, b), _| a.checked_add(b), + |(a, b)| a.checked_add(b), ); assert!(r.is_err(), "bitpack should Err on overflow"); } @@ -168,10 +166,10 @@ fn assert_null_overflow_suppressed() { }; let mut out = alloc_out(4); - let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_with_mask( + let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into( &mask, out.as_mut_slice(), - |(a, b), _| a.checked_add(b), + |(a, b)| a.checked_add(b), ); assert!(r.is_ok(), "bitpack: null-lane overflow leaked"); } diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs index 524985baa6b..5ab1041f5cc 100644 --- a/vortex-buffer/benches/cast_to_indexed.rs +++ b/vortex-buffer/benches/cast_to_indexed.rs @@ -13,9 +13,8 @@ use arrow_array::UInt64Array; use arrow_buffer::NullBuffer; use arrow_buffer::ScalarBuffer; use arrow_cast::CastOptions; -use arrow_cast::cast_with_options; -use arrow_schema::DataType; use divan::Bencher; +use num_traits::AsPrimitive; use num_traits::NumCast; use rand::SeedableRng; use rand::prelude::*; @@ -25,6 +24,7 @@ use vortex_buffer::BitBufferMut; use vortex_buffer::Buffer; use vortex_buffer::lane_ops_indexed::IndexedSinkExt; use vortex_buffer::lane_ops_indexed::IndexedSourceExt; +use 
vortex_buffer::lane_ops_indexed::ReinterpretSink; fn main() { divan::main(); @@ -34,15 +34,11 @@ const SIZES: &[usize] = &[65_536]; struct Fixture { values_u64: Buffer, - values_u64_invalid_overflows: Buffer, - values_u32: Buffer, - values_u32_small: Buffer, values_u16: Buffer, + /// Positive `i32` values (always representable as `u32`). Used by the + /// in-place-vs-out-of-place cast bench. + values_i32: Buffer, mask: BitBuffer, - /// `UInt64Array` baseline for arrow casts. Same values + validity as `values_u64` / `mask`. - arrow_u64: UInt64Array, - /// `UInt16Array` baseline. Same as `values_u16` / `mask`. - arrow_u16: UInt16Array, } fn fixture(n: usize) -> Fixture { @@ -60,6 +56,14 @@ fn fixture(n: usize) -> Fixture { .map(|v| v as u16) .collect::>(); + // Positive i32 values (top bit cleared) — every value fits in u32. + #[expect(clippy::cast_possible_truncation, clippy::cast_possible_wrap)] + let values_i32 = raw_values + .iter() + .copied() + .map(|v| (v as i32) & i32::MAX) + .collect::>(); + #[expect(clippy::cast_possible_truncation)] let values_u32 = raw_values .iter() @@ -94,21 +98,12 @@ fn fixture(n: usize) -> Fixture { Fixture { values_u64: raw_values.into(), - values_u64_invalid_overflows, - values_u32, - values_u32_small, values_u16, + values_i32, mask: BitBufferMut::from_iter(raw_valid).freeze(), - arrow_u64, - arrow_u16, } } -const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions { - safe: false, - format_options: arrow_cast::display::FormatOptions::new(), -}; - fn uninit_out(n: usize) -> Vec> { let mut out = Vec::with_capacity(n); // SAFETY: A `MaybeUninit` does not require initialization. 
@@ -119,37 +114,7 @@ fn uninit_out(n: usize) -> Vec> { } #[divan::bench(args = SIZES)] -fn map_no_validity_widen_u16_u32(bencher: Bencher, n: usize) { - let f = fixture(n); - - bencher - .with_inputs(|| (f.values_u16.clone(), uninit_out::(n))) - .bench_values(|(values, mut out)| { - values - .as_slice() - .map_no_validity(out.as_mut_slice(), >::from); - out - }); -} - -#[divan::bench(args = SIZES)] -fn map_with_mask_widen_u16_u32_zero_nulls(bencher: Bencher, n: usize) { - let f = fixture(n); - - bencher - .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) - .bench_values(|(values, mask, mut out)| { - values - .as_slice() - .map_with_mask(&mask, out.as_mut_slice(), |v, valid| { - >::from(v) * valid as u32 - }); - out - }); -} - -#[divan::bench(args = SIZES)] -fn try_map_no_validity_narrow_u64_u32(bencher: Bencher, n: usize) { +fn try_map_into_narrow_u64_u32(bencher: Bencher, n: usize) { let f = fixture(n); bencher @@ -157,158 +122,88 @@ fn try_map_no_validity_narrow_u64_u32(bencher: Bencher, n: usize) { .bench_values(|(values, mut out)| { values .as_slice() - .try_map_no_validity(out.as_mut_slice(), ::from) + .try_map_into(out.as_mut_slice(), ::from) .unwrap(); out }); } -/// `try_map_with_mask` with a closure that **ignores `valid`**. Tests whether -/// LLVM DCEs the per-lane `(src_chunk >> bit_idx) & 1` mask extract. Uses -/// non-overflowing `values_u64` so the closure-ignores-valid spurious-failure -/// case never triggers (would otherwise err on null-lane overflow). 
#[divan::bench(args = SIZES)] -fn try_map_with_mask_narrow_u64_u32_ignoring_valid(bencher: Bencher, n: usize) { +fn map_with_mask_narrow_u64_u32(bencher: Bencher, n: usize) { let f = fixture(n); bencher - .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::(n))) - .bench_values(|(values, mask, mut out)| { - values - .as_slice() - .try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| { - ::from(v) - }) - .unwrap(); + .with_inputs(|| (f.values_u64.clone(), uninit_out::(n))) + .bench_values(|(values, mut out)| { + values.as_slice().map_into(&mut out, |v| v.as_()); out }); } +/// `try_map_masked_into_widen_u16_u32` and `map_with_mask_widen_u16_u32` have the same runtime +/// and showing for always true map operations `try_map_masked_into` is sufficient. #[divan::bench(args = SIZES)] -fn try_map_with_mask_narrow_u64_u32_lazy_validity(bencher: Bencher, n: usize) { +fn try_map_masked_into_widen_u16_u32(bencher: Bencher, n: usize) { let f = fixture(n); bencher - .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::(n))) + .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) .bench_values(|(values, mask, mut out)| { values .as_slice() - .try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| { - ::from(v).or_else(|| (!valid).then(u32::default)) - }) + .try_map_masked_into(&mask, out.as_mut_slice(), |v| ::from(v)) .unwrap(); out }); } -/// Migrated from the old `try_map_validity_filtered` bench: same inputs (null -/// lanes contain overflowing values) and same correctness expectation (no Err), -/// but now driven through the merged `try_map_with_mask` with a `|v, _|` closure. -/// The hot loop is value-only via DCE; the cold path filters null-lane failures. 
#[divan::bench(args = SIZES)] -fn try_map_with_mask_narrow_u64_u32_value_only_filtered(bencher: Bencher, n: usize) { +fn map_with_mask_widen_u16_u32(bencher: Bencher, n: usize) { let f = fixture(n); bencher - .with_inputs(|| { - ( - f.values_u64_invalid_overflows.clone(), - f.mask.clone(), - uninit_out::(n), - ) - }) - .bench_values(|(values, mask, mut out)| { - values - .as_slice() - .try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| { - ::from(v) - }) - .unwrap(); + .with_inputs(|| (f.values_u16.clone(), uninit_out::(n))) + .bench_values(|(values, mut out)| { + values.as_slice().map_into(out.as_mut_slice(), |v| v.as_()); out }); } -#[divan::bench(args = SIZES)] -fn try_map_with_mask_widen_u16_u32_or_else(bencher: Bencher, n: usize) { - let f = fixture(n); - - bencher - .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) - .bench_values(|(values, mask, mut out)| { - values - .as_slice() - .try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| { - Some(>::from(v)).or_else(|| (!valid).then(u32::default)) - }) - .unwrap(); - out - }); -} +// ----------------------------------------------------------------------------- +// In-place vs out-of-place fallible cast i32 → u32 (same byte width). +// +// `try_map_masked_into_in_place` mutates the input via `ReinterpretSink` and +// transmutes the wrapper — no output allocation. `try_map_masked_into` allocates +// a fresh `BufferMut` and writes through it. Input values are all positive +// `i32` so every lane succeeds; the two kernels do the same arithmetic, so any +// delta is pure allocation + memory-traffic overhead. 
+// ----------------------------------------------------------------------------- #[divan::bench(args = SIZES)] -fn try_map_with_mask_widen_u16_u32_maskless(bencher: Bencher, n: usize) { +fn try_map_masked_into_narrow_i32_u32(bencher: Bencher, n: usize) { let f = fixture(n); bencher - .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) + .with_inputs(|| (f.values_i32.clone(), f.mask.clone(), uninit_out::(n))) .bench_values(|(values, mask, mut out)| { values .as_slice() - .try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| { - Some(>::from(v)) - }) + .try_map_masked_into(&mask, out.as_mut_slice(), |v| ::from(v)) .unwrap(); out }); } #[divan::bench(args = SIZES)] -fn map_with_mask_in_place_u32_zero_nulls(bencher: Bencher, n: usize) { +fn try_map_masked_into_in_place_narrow_i32_u32(bencher: Bencher, n: usize) { let f = fixture(n); bencher - .with_inputs(|| (f.values_u32.as_slice().to_vec(), f.mask.clone())) + .with_inputs(|| (f.values_i32.as_slice().to_vec(), f.mask.clone())) .bench_values(|(mut values, mask)| { - values - .as_mut_slice() - .map_with_mask_in_place(&mask, |v, valid| v * valid as u32); - values - }); -} - -#[divan::bench(args = SIZES)] -fn try_map_with_mask_in_place_u32_checked_mul(bencher: Bencher, n: usize) { - let f = fixture(n); - - bencher - .with_inputs(|| (f.values_u32_small.as_slice().to_vec(), f.mask.clone())) - .bench_values(|(mut values, mask)| { - values - .as_mut_slice() - .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)) + ReinterpretSink::::new(values.as_mut_slice()) + .try_map_masked_in_place(&mask, |v| ::from(v)) .unwrap(); values }); } - -// ----------------------------------------------------------------------------- -// Arrow-rs baselines. Two: one widening (u16 → u32, always succeeds) and one -// narrowing (u64 → u32, can fail). Each pairs with the cast variants above of -// matching direction. 
-// ----------------------------------------------------------------------------- - -#[divan::bench(args = SIZES)] -fn arrow_cast_widen_u16_u32(bencher: Bencher, _n: usize) { - let f = fixture(_n); - bencher - .with_inputs(|| f.arrow_u16.clone()) - .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap()); -} - -#[divan::bench(args = SIZES)] -fn arrow_cast_narrow_u64_u32(bencher: Bencher, _n: usize) { - let f = fixture(_n); - bencher - .with_inputs(|| f.arrow_u64.clone()) - .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap()); -} diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs index 1244ef3d0b7..810d76c0900 100644 --- a/vortex-buffer/src/lane_ops_indexed.rs +++ b/vortex-buffer/src/lane_ops_indexed.rs @@ -219,92 +219,35 @@ impl IndexedSource for LaneZip { /// All methods have default implementations and are inherited via the blanket /// `impl IndexedSourceExt for S` below. Bring the trait into /// scope (`use vortex_buffer::lane_ops_indexed::IndexedSourceExt;`) to call -/// them with method syntax: `values.try_map_with_mask(&mask, &mut out, f)`. +/// them with method syntax: `values.try_map_masked_into(&mask, &mut out, f)`. pub trait IndexedSourceExt: IndexedSource + Sized { - /// Apply `f(value, valid)` lane-by-lane, writing `out[i] = f(self[i], mask[i])`. - /// - /// All three inputs must have the same length. The output type `R` may differ from - /// the input type — this kernel is the building block for both same-type transforms - /// (fill_null) and cross-type ones (cast). The caller is responsible for marking - /// `out` initialized (e.g. by calling `BufferMut::set_len` after this returns). - /// - /// # Panics - /// - /// Panics if `self.len() != mask.len()` or `out.len() != self.len()`. - #[inline] - fn map_with_mask(self, mask: &BitBuffer, out: &mut [MaybeUninit], mut f: F) - where - F: FnMut(Self::Item, bool) -> R, - { - /// Per-chunk worker. 
Called twice (literal `64` for full chunks, `remainder` - /// for the tail). `#[inline(always)]` preserves the const-64 unroll at the - /// full-chunk call site via constant propagation through inlining. - #[inline(always)] - fn chunk( - values: &S, - out: &mut [MaybeUninit], - f: &mut F, - src_chunk: u64, - base: usize, - count: usize, - ) where - S: IndexedSource, - F: FnMut(S::Item, bool) -> R, - { - for bit_idx in 0..count { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: caller guarantees base + count <= len. - let v = unsafe { values.get_unchecked(i) }; - unsafe { out.get_unchecked_mut(i).write(f(v, bit)) }; - } - } - - let values = self; - let len = values.len(); - assert_eq!(len, mask.len(), "values and mask must have the same length"); - assert_eq!(out.len(), len, "out must have the same length as values"); - - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; - - for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64); - } - if remainder != 0 { - chunk( - &values, - out, - &mut f, - chunks.remainder_bits(), - chunks_count * 64, - remainder, - ); - } - } - /// Fallible variant of [`map_with_mask`]. `f` returns `Option`; `None` /// indicates a per-lane failure (e.g. range overflow on a narrowing cast). /// - /// **Null-lane failures are filtered automatically.** If a null lane's stored - /// value causes `f(v, false)` to return `None`, the kernel does *not* propagate - /// that as `Err`. The per-lane `is_none()` flags are bit-packed into a `u64` at - /// the lane's position, then AND-combined with the chunk's validity bitmap — null-lane - /// bits vanish. The closure may also explicitly suppress null-lane failures by - /// branching on `valid` itself; both behaviors compose. 
+ /// **Null-lane failures are filtered automatically.** The closure is called on + /// every lane regardless of validity; if a null lane's stored value causes `f(v)` + /// to return `None`, the kernel does *not* propagate that as `Err`. The per-lane + /// `is_none()` flags are bit-packed into a `u64` at the lane's position, then + /// AND-combined with the chunk's validity bitmap — null-lane bits vanish. + /// + /// The closure shape is the same as [`try_map_into`] (`FnMut(Item) -> Option`); + /// the mask parameter is what makes this kernel mask-aware. Callers that need to + /// distinguish null lanes inside the closure (e.g. to short-circuit an expensive + /// computation) should construct their own per-lane validity check externally; for + /// the common case, the kernel's automatic filter is sufficient. /// /// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` /// write `R::default()` into `out`, but the contents of `out` must not be relied /// upon when this function returns `Err`. /// /// [`map_with_mask`]: IndexedSourceExt::map_with_mask + /// [`try_map_into`]: IndexedSourceExt::try_map_into /// /// # Panics /// /// Panics if `self.len() != mask.len()` or `out.len() != self.len()`. #[inline] - fn try_map_with_mask( + fn try_map_masked_into( self, mask: &BitBuffer, out: &mut [MaybeUninit], @@ -312,7 +255,7 @@ pub trait IndexedSourceExt: IndexedSource + Sized { ) -> Result<(), usize> where R: Copy + Default, - F: FnMut(Self::Item, bool) -> Option, + F: FnMut(Self::Item) -> Option, { #[inline(always)] fn chunk( @@ -326,15 +269,14 @@ pub trait IndexedSourceExt: IndexedSource + Sized { where S: IndexedSource, R: Copy + Default, - F: FnMut(S::Item, bool) -> Option, + F: FnMut(S::Item) -> Option, { let mut fail_bits: u64 = 0; for bit_idx in 0..count { let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; // SAFETY: caller guarantees base + count <= len. 
let v = unsafe { values.get_unchecked(i) }; - let opt = f(v, bit); + let opt = f(v); fail_bits |= (opt.is_none() as u64) << bit_idx; let r = opt.unwrap_or_default(); unsafe { out.get_unchecked_mut(i).write(r) }; @@ -380,7 +322,7 @@ pub trait IndexedSourceExt: IndexedSource + Sized { /// /// Panics if `out.len() != self.len()`. #[inline] - fn map_no_validity(self, out: &mut [MaybeUninit], mut f: F) + fn map_into(self, out: &mut [MaybeUninit], mut f: F) where F: FnMut(Self::Item) -> R, { @@ -423,7 +365,7 @@ pub trait IndexedSourceExt: IndexedSource + Sized { /// /// # Use this only for non-nullable inputs. /// - /// For nullable inputs with a fallible closure, use [`try_map_with_mask`] — + /// For nullable inputs with a fallible closure, use [`try_map_masked_into`] — /// it has the same value-only closure shape (and the same perf win) but /// **correctly suppresses null-lane failures** via per-chunk /// `fail_bits & mask_chunk`. @@ -435,13 +377,13 @@ pub trait IndexedSourceExt: IndexedSource + Sized { /// /// On failure returns `Err(failing_lane_index)`. /// - /// [`try_map_with_mask`]: IndexedSourceExt::try_map_with_mask + /// [`try_map_masked_into`]: IndexedSourceExt::try_map_masked_into /// /// # Panics /// /// Panics if `out.len() != self.len()`. #[inline] - fn try_map_no_validity(self, out: &mut [MaybeUninit], mut f: F) -> Result<(), usize> + fn try_map_into(self, out: &mut [MaybeUninit], mut f: F) -> Result<(), usize> where R: Copy + Default, F: FnMut(Self::Item) -> Option, @@ -545,7 +487,7 @@ where /// (`use vortex_buffer::lane_ops_indexed::IndexedSinkExt;`) to call them with /// method syntax. pub trait IndexedSinkExt: IndexedSink + Sized { - /// In-place counterpart of [`IndexedSourceExt::map_no_validity`]. Each lane + /// In-place counterpart of [`IndexedSourceExt::map_into`]. Each lane /// is replaced with `f(self[i])`. /// /// The closure reads `Self::Item` and returns `Self::Write`. 
For the common @@ -553,10 +495,10 @@ pub trait IndexedSinkExt: IndexedSink + Sized { /// write types can differ (e.g. read `f32`, write `u32`) over the same /// backing memory when sizes and alignments match. /// - /// As with [`IndexedSourceExt::map_no_validity`], use this only when the + /// As with [`IndexedSourceExt::map_into`], use this only when the /// input is known non-nullable. #[inline] - fn map_no_validity_in_place(self, mut f: F) + fn map_into_in_place(self, mut f: F) where F: FnMut(Self::Item) -> Self::Write, { @@ -589,51 +531,46 @@ pub trait IndexedSinkExt: IndexedSink + Sized { } } - /// In-place counterpart of [`IndexedSourceExt::try_map_no_validity`]. Each + /// In-place counterpart of [`IndexedSourceExt::try_map_into`]. Each /// lane is replaced with `f(self[i])`, or `Self::Write::default()` when `f` /// returns `None`. On failure returns `Err(first_failing_lane)`; the buffer /// state on `Err` is unspecified. /// - /// As with [`IndexedSourceExt::try_map_no_validity`], use this only when the - /// input is known non-nullable — a `None` from `f` is treated as a failure - /// regardless of any upstream validity bitmap. - /// /// ## Error attribution /// - /// Per-lane `is_none()` flags are folded into `first_fail` via the same - /// branchless `min` scheme as [`try_map_with_mask_in_place`]. Cold replay - /// isn't viable here because the original input values have already been - /// overwritten by the time we'd attribute the failure. + /// Per-lane `is_none()` flags are bit-packed into a `u64` at the lane's + /// position — `fail_bits |= (opt.is_none() as u64) << bit_idx`. After the + /// 64-lane loop, `trailing_zeros()` of `fail_bits` recovers the first + /// failing lane index. `OR + shift` per lane is friendlier to the + /// autovectorizer than `min`/`csel` — see [`try_map_masked_in_place`] for + /// the same scheme over a masked variant. 
/// - /// [`try_map_with_mask_in_place`]: IndexedSinkExt::try_map_with_mask_in_place + /// [`try_map_masked_in_place`]: IndexedSinkExt::try_map_masked_in_place #[inline] - #[allow(clippy::cast_possible_truncation)] - fn try_map_no_validity_in_place(self, mut f: F) -> Result<(), usize> + fn try_map_in_place(self, mut f: F) -> Result<(), usize> where Self::Write: Default, F: FnMut(Self::Item) -> Option, { #[inline(always)] - #[allow(clippy::cast_possible_truncation)] - fn chunk(values: &mut S, base: usize, count: usize, f: &mut F) -> Option + fn chunk(values: &mut S, base: usize, count: usize, f: &mut F) -> Option where S: IndexedSink, S::Write: Default, F: FnMut(S::Item) -> Option, { - let mut first_fail: u32 = u32::MAX; + let mut fail_bits: u64 = 0; for bit_idx in 0..count { let i = base + bit_idx; // SAFETY: caller guarantees base + count <= len. let v = unsafe { values.get_unchecked(i) }; let opt = f(v); - let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; - first_fail = first_fail.min(candidate); + fail_bits |= (opt.is_none() as u64) << bit_idx; let r = opt.unwrap_or_default(); // SAFETY: caller guarantees base + count <= len. unsafe { values.set_unchecked(i, r) }; } - (first_fail != u32::MAX).then_some(first_fail) + (fail_bits != 0).then_some(base + fail_bits.trailing_zeros() as usize) } let mut values = self; @@ -643,85 +580,36 @@ pub trait IndexedSinkExt: IndexedSink + Sized { for chunk_idx in 0..chunks_count { if let Some(failing) = chunk(&mut values, chunk_idx * 64, 64, &mut f) { - return Err(failing as usize); + return Err(failing); } } if remainder != 0 && let Some(failing) = chunk(&mut values, chunks_count * 64, remainder, &mut f) { - return Err(failing as usize); + return Err(failing); } Ok(()) } - /// In-place counterpart of [`IndexedSourceExt::map_with_mask`]. Each lane - /// is replaced with `f(self[i], mask[i])`. - /// - /// The closure reads `Self::Item` and returns `Self::Write`. 
For the common - /// case `Self = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and - /// write types can differ (e.g. read `f32`, write `u32`) over the same - /// backing memory when sizes and alignments match. - /// - /// # Panics + /// In-place counterpart of [`IndexedSourceExt::try_map_masked_into`]. Each + /// lane of `self` is replaced with `f(self[i])`, or `Self::Write::default()` + /// if `f` returned `None`. On failure returns `Err(first_failing_lane)`; + /// lanes before that point have been written, and lanes within the failing + /// chunk hold their unwrapped-or-default result. The buffer state on `Err` + /// is intentionally unspecified. /// - /// Panics if `self.len() != mask.len()`. - #[inline] - fn map_with_mask_in_place(self, mask: &BitBuffer, mut f: F) - where - F: FnMut(Self::Item, bool) -> Self::Write, - { - #[inline(always)] - fn chunk(values: &mut S, f: &mut F, src_chunk: u64, base: usize, count: usize) - where - S: IndexedSink, - F: FnMut(S::Item, bool) -> S::Write, - { - for bit_idx in 0..count { - let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; - // SAFETY: caller guarantees base + count <= len. - let v = unsafe { values.get_unchecked(i) }; - let r = f(v, bit); - unsafe { values.set_unchecked(i, r) }; - } - } - - let mut values = self; - let len = values.len(); - assert_eq!(len, mask.len(), "values and mask must have the same length"); - - let chunks = mask.chunks(); - let chunks_count = len / 64; - let remainder = len % 64; - - for (chunk_idx, src_chunk) in chunks.iter().enumerate() { - chunk(&mut values, &mut f, src_chunk, chunk_idx * 64, 64); - } - if remainder != 0 { - chunk( - &mut values, - &mut f, - chunks.remainder_bits(), - chunks_count * 64, - remainder, - ); - } - } - - /// In-place counterpart of [`IndexedSourceExt::try_map_with_mask`]. Each - /// lane of `self` is replaced with `f(self[i], mask[i])`, or - /// `Self::Write::default()` if `f` returned `None`. 
On failure returns - /// `Err(first_failing_lane)`; lanes before that point have been written, - /// and lanes within the failing chunk hold their unwrapped-or-default - /// result. The buffer state on `Err` is intentionally unspecified. + /// **Null-lane failures are filtered automatically** — same semantics as + /// [`try_map_masked_into`]. The closure has no `valid` parameter; the kernel + /// AND-combines `is_none()` with the chunk's validity bitmap before folding + /// it into the attribution accumulator. /// /// ## Error attribution /// - /// Per-lane `is_none()` flags are folded into `first_fail` via a branchless - /// `min` of `(if is_none { i as u32 } else { u32::MAX })`. After the 64-lane - /// loop, `first_fail` holds the smallest failing index in the chunk (or - /// `MAX` if no failure). Vectorizes to NEON `bsl.16b` + `umin.4s` on - /// AArch64. The cold replay scheme used by [`try_map_with_mask`] isn't + /// Per-lane `(is_none && valid)` flags are folded into `first_fail` via a + /// branchless `min` of `(if is_none && valid { i as u32 } else { u32::MAX })`. + /// After the 64-lane loop, `first_fail` holds the smallest valid failing index + /// in the chunk (or `MAX` if none). Vectorizes to NEON `bsl.16b` + `umin.4s` + /// on AArch64. The cold replay scheme used by [`try_map_masked_into`] isn't /// viable here because the original input values have already been /// overwritten by the time we would attribute the failure. /// @@ -734,45 +622,47 @@ pub trait IndexedSinkExt: IndexedSink + Sized { /// in-place kernel wins back the gap by avoiding the second buffer's DRAM /// read+write traffic. /// - /// [`try_map_with_mask`]: IndexedSourceExt::try_map_with_mask + /// [`try_map_masked_into`]: IndexedSourceExt::try_map_masked_into /// /// # Panics /// /// Panics if `self.len() != mask.len()`. 
#[inline] #[allow(clippy::cast_possible_truncation)] - fn try_map_with_mask_in_place(self, mask: &BitBuffer, mut f: F) -> Result<(), usize> + fn try_map_masked_in_place(self, mask: &BitBuffer, mut f: F) -> Result<(), usize> where Self::Write: Default, - F: FnMut(Self::Item, bool) -> Option, + F: FnMut(Self::Item) -> Option, { + /// Bit-pack `is_none()` flags per lane, then AND with `src_chunk` post-loop to + /// drop null-lane failures — identical scheme to [`try_map_masked_into`]. The + /// per-lane attribution work is `OR + shift` (no `min`/`csel`), giving LLVM more + /// freedom to vectorize the value pipeline. #[inline(always)] - #[allow(clippy::cast_possible_truncation)] fn chunk( values: &mut S, src_chunk: u64, base: usize, count: usize, f: &mut F, - ) -> Option + ) -> Option where S: IndexedSink, S::Write: Default, - F: FnMut(S::Item, bool) -> Option, + F: FnMut(S::Item) -> Option, { - let mut first_fail: u32 = u32::MAX; + let mut fail_bits: u64 = 0; for bit_idx in 0..count { let i = base + bit_idx; - let bit = (src_chunk >> bit_idx) & 1 == 1; // SAFETY: caller guarantees `base + count <= values.len()`. 
let v = unsafe { values.get_unchecked(i) }; - let opt = f(v, bit); - let candidate = if opt.is_none() { i as u32 } else { u32::MAX }; - first_fail = first_fail.min(candidate); + let opt = f(v); + fail_bits |= (opt.is_none() as u64) << bit_idx; let r = opt.unwrap_or_default(); unsafe { values.set_unchecked(i, r) }; } - (first_fail != u32::MAX).then_some(first_fail) + let valid_failures = fail_bits & src_chunk; + (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize) } let mut values = self; @@ -785,7 +675,7 @@ pub trait IndexedSinkExt: IndexedSink + Sized { for (chunk_idx, src_chunk) in chunks.iter().enumerate() { if let Some(failing) = chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) { - return Err(failing as usize); + return Err(failing); } } if remainder != 0 @@ -797,7 +687,7 @@ pub trait IndexedSinkExt: IndexedSink + Sized { &mut f, ) { - return Err(failing as usize); + return Err(failing); } Ok(()) } @@ -817,146 +707,33 @@ mod tests { } #[test] - fn map_with_mask_aligned() { - let values: Vec = (0..10).collect(); - let mask = { - let mut m = BitBufferMut::with_capacity(10); - for i in 0..10 { - m.append(i % 2 == 0); - } - m.freeze() - }; - let mut out = vec![MaybeUninit::::uninit(); 10]; - values - .as_slice() - .map_with_mask(&mask, &mut out, |v, valid| if valid { v } else { -1 }); - assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]); - } - - #[test] - fn map_with_mask_partial_chunk() { - // 130 lanes — two full u64 words + a 2-bit remainder. 
- let values: Vec = (0..130).collect(); - let mask = BitBuffer::new_set(130); - let mut out = vec![MaybeUninit::::uninit(); 130]; - values - .as_slice() - .map_with_mask(&mask, &mut out, |v, valid| if valid { v + 1 } else { 0 }); - let got = write_t(out); - assert_eq!(got.len(), 130); - assert_eq!(got[0], 1); - assert_eq!(got[63], 64); - assert_eq!(got[64], 65); - assert_eq!(got[129], 130); - } - - #[test] - fn map_with_mask_offset_mask() { - // Build a 128-bit all-true mask, then slice off the first 5 bits to force offset=5. - let big = BitBuffer::new_set(128); - let sliced = big.slice(5..70); // logical len = 65, offset = 5 - assert_eq!(sliced.len(), 65); - assert_eq!(sliced.offset(), 5); - - let values: Vec = (0..65).collect(); - let mut out = vec![MaybeUninit::::uninit(); 65]; - values.as_slice().map_with_mask( - &sliced, - &mut out, - |v, valid| { - if valid { v } else { u32::MAX } - }, - ); - let got = write_t(out); - assert_eq!(got, (0..65).collect::>()); - } - - #[test] - fn map_with_mask_offset_past_word() { - // Slicing past a full word still works. `BitBuffer::slice` normalizes the - // logical offset to `offset % 8` and bumps the underlying byte pointer, - // so `offset()` won't equal 70 here — what we exercise is that the kernel - // walks the chunked u64 view (which BitChunks handles internally). 
- let big = BitBuffer::new_set(256); - let sliced = big.slice(70..200); - assert_eq!(sliced.len(), 130); - - let values: Vec = (0..130).map(|i| i as i16).collect(); - let mut out = vec![MaybeUninit::::uninit(); 130]; - values - .as_slice() - .map_with_mask(&sliced, &mut out, |v, valid| if valid { v } else { -1 }); - let got = write_t(out); - assert_eq!(got, (0..130).map(|i| i as i16).collect::>()); - } - - #[test] - fn map_with_mask_empty() { - let values: Vec = vec![]; - let mask = BitBuffer::new_unset(0); - let mut out: Vec> = vec![]; - values.as_slice().map_with_mask(&mask, &mut out, |v, _| v); - } - - #[test] - fn map_with_mask_null_to_zero_branchless() { - // The trick from primitive/compute/cast.rs:147 — multiply by valid as T. - let values: Vec = (1..=100).collect(); - let mask = { - let mut m = BitBufferMut::with_capacity(100); - for i in 0..100 { - m.append(i % 3 != 0); - } - m.freeze() - }; - let mut out = vec![MaybeUninit::::uninit(); 100]; - values - .as_slice() - .map_with_mask(&mask, &mut out, |v, valid| v * (valid as i64)); - let got = write_t(out); - for (i, &x) in got.iter().enumerate() { - if i % 3 == 0 { - assert_eq!(x, 0); - } else { - assert_eq!(x, (i + 1) as i64); - } - } - } - - #[test] - fn try_map_with_mask_all_ok() { + fn try_map_masked_into_all_ok() { let values: Vec = (0..200).collect(); let mask = BitBuffer::new_set(200); let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = values - .as_slice() - .try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert!(res.is_ok()); let got = write_t(out); assert_eq!(got, (0..200u32).collect::>()); } #[test] - fn try_map_with_mask_overflow_fails() { + fn try_map_masked_into_overflow_fails() { // Put an overflowing value at lane 137 — the kernel must report Err(137). 
let mut values: Vec = (0..200).collect(); values[137] = (u32::MAX as u64) + 1; let mask = BitBuffer::new_set(200); let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = values - .as_slice() - .try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert_eq!(res, Err(137)); } #[test] - fn try_map_with_mask_overflow_reports_first_failing_lane() { + fn try_map_masked_into_overflow_reports_first_failing_lane() { // Multiple failing lanes — must report the lowest index. let mut values: Vec = (0..200).collect(); values[50] = u64::MAX; @@ -964,17 +741,14 @@ mod tests { values[137] = u64::MAX; let mask = BitBuffer::new_set(200); let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = values - .as_slice() - .try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert_eq!(res, Err(50)); } #[test] - fn try_map_with_mask_value_only_closure_filters_null_overflow() { + fn try_map_masked_into_value_only_closure_filters_null_overflow() { // `|v, _|` closure that ignores validity. A null lane with an overflowing // value MUST NOT cause Err — the kernel's cold-path mask filter rescues us. 
let mut values: Vec = (0..200).collect(); @@ -988,11 +762,9 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = values - .as_slice() - .try_map_with_mask(&mask, &mut out, |v, _valid| { - (v <= u32::MAX as u64).then_some(v as u32) - }); + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert!( res.is_ok(), "null-lane overflow should be filtered by the cold path" @@ -1000,7 +772,7 @@ mod tests { } #[test] - fn try_map_with_mask_value_only_closure_reports_first_valid_failure() { + fn try_map_masked_into_value_only_closure_reports_first_valid_failure() { // Valid lane overflow must propagate — and the reported index must be // the lowest VALID failing lane, even if earlier null lanes also "failed" // their unconditional cast. @@ -1017,16 +789,14 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = values - .as_slice() - .try_map_with_mask(&mask, &mut out, |v, _valid| { - (v <= u32::MAX as u64).then_some(v as u32) - }); + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert_eq!(res, Err(77)); } #[test] - fn try_map_with_mask_null_lane_bypasses_check() { + fn try_map_masked_into_null_lane_bypasses_check() { // Null lanes are neutralized by `valid as u64` before the range check, so an // out-of-range value at a null lane must NOT trigger failure. 
let mut values: Vec = (0..200).collect(); @@ -1039,12 +809,9 @@ mod tests { m.freeze() }; let mut out = vec![MaybeUninit::::uninit(); 200]; - let res = values - .as_slice() - .try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert!(res.is_ok()); let got = write_t(out); assert_eq!(got[5], 0); // null-lane wrote default @@ -1052,7 +819,7 @@ mod tests { } #[test] - fn try_map_with_mask_branchful_matches_branchless() { + fn try_map_masked_into_branchful_matches_branchless() { let mut values: Vec = (0..130).map(|i| i as u64 * 7).collect(); values[2] = u64::MAX; values[65] = u32::MAX as u64; @@ -1068,36 +835,26 @@ mod tests { let mut branchful = vec![MaybeUninit::::uninit(); 130]; values .as_slice() - .try_map_with_mask(&mask, &mut branchless, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) + .try_map_masked_into(&mask, &mut branchless, |v| { + (v <= u32::MAX as u64).then_some(v as u32) }) .unwrap(); values .as_slice() - .try_map_with_mask(&mask, &mut branchful, |v, valid| { - if valid { - u32::try_from(v).ok() - } else { - Some(0) - } - }) + .try_map_masked_into(&mask, &mut branchful, |v| u32::try_from(v).ok()) .unwrap(); assert_eq!(write_t(branchful), write_t(branchless)); } #[test] - fn try_map_with_mask_partial_chunk() { + fn try_map_masked_into_partial_chunk() { let values: Vec = (0..130).collect(); let mask = BitBuffer::new_set(130); let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = values - .as_slice() - .try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); 
assert!(res.is_ok()); let got = write_t(out); assert_eq!(got.len(), 130); @@ -1105,7 +862,7 @@ mod tests { } #[test] - fn try_map_with_mask_sliced_mask_unaligned_offset() { + fn try_map_masked_into_sliced_mask_unaligned_offset() { // The mask's first byte is not word-aligned: slice off 13 bits, so the // underlying BitChunks iterator must shift across byte boundaries on every // 64-bit chunk it yields. @@ -1115,19 +872,16 @@ mod tests { let values: Vec = (0..130).collect(); let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = values - .as_slice() - .try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert!(res.is_ok()); let got = write_t(out); assert_eq!(got, (0..130u32).collect::>()); } #[test] - fn try_map_with_mask_sliced_mask_with_overflow() { + fn try_map_masked_into_sliced_mask_with_overflow() { // Sliced mask + overflowing value — the cold attribution path must report // the correct lane index in the sliced (post-offset) coordinate space. let big = BitBuffer::new_set(256); @@ -1137,17 +891,14 @@ mod tests { let mut values: Vec = (0..130).collect(); values[77] = u64::MAX; let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = values - .as_slice() - .try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert_eq!(res, Err(77)); } #[test] - fn try_map_with_mask_sliced_mask_null_lanes() { + fn try_map_masked_into_sliced_mask_null_lanes() { // Mix sliced offset with a non-trivial validity pattern. Null lanes must // not contribute to fail_acc, even when their underlying value would overflow. 
let mut m = BitBufferMut::with_capacity(256); @@ -1165,102 +916,74 @@ mod tests { // Stuff in an overflowing value; it must be neutralized by `* valid as u64`. values[2] = u64::MAX; let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = values - .as_slice() - .try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert!(res.is_ok(), "null lane should bypass the range check"); } #[test] - fn try_map_with_mask_overflow_in_remainder() { + fn try_map_masked_into_overflow_in_remainder() { // Overflow in the trailing partial chunk (not aligned to 64). let mut values: Vec = (0..130).collect(); values[129] = (u32::MAX as u64) + 1; let mask = BitBuffer::new_set(130); let mut out = vec![MaybeUninit::::uninit(); 130]; - let res = values - .as_slice() - .try_map_with_mask(&mask, &mut out, |v, valid| { - let scaled = v * valid as u64; - (scaled <= u32::MAX as u64).then_some(scaled as u32) - }); + let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| { + (v <= u32::MAX as u64).then_some(v as u32) + }); assert_eq!(res, Err(129)); } #[test] - fn map_with_mask_in_place_basic() { - let mut values: Vec = (0..130).collect(); - let mask = { - let mut m = BitBufferMut::with_capacity(130); - for i in 0..130 { - m.append(i % 2 == 0); - } - m.freeze() - }; - values - .as_mut_slice() - .map_with_mask_in_place(&mask, |v, valid| v.wrapping_mul(valid as u32)); - let expected: Vec = (0..130u32) - .map(|v| if v % 2 == 0 { v } else { 0 }) - .collect(); - assert_eq!(values, expected); - } - - #[test] - fn try_map_with_mask_in_place_all_ok() { + fn try_map_masked_in_place_all_ok() { let mut values: Vec = (0..200).collect(); let mask = BitBuffer::new_set(200); let res = values .as_mut_slice() - .try_map_with_mask_in_place(&mask, |v, valid| { - let scaled = 
v.wrapping_mul(valid as u32); - scaled.checked_mul(2) - }); + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); assert!(res.is_ok()); let expected: Vec = (0..200u32).map(|v| v * 2).collect(); assert_eq!(values, expected); } #[test] - fn try_map_with_mask_in_place_first_failing_chunk_wins() { + fn try_map_masked_in_place_first_failing_chunk_wins() { let mut values: Vec = (0..200).collect(); values[83] = u32::MAX; values[150] = u32::MAX; let mask = BitBuffer::new_set(200); let res = values .as_mut_slice() - .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); assert_eq!(res, Err(83)); } #[test] - fn try_map_with_mask_in_place_within_chunk_reports_lowest() { + fn try_map_masked_in_place_within_chunk_reports_lowest() { let mut values: Vec = (0..200).collect(); values[80] = u32::MAX; values[100] = u32::MAX; let mask = BitBuffer::new_set(200); let res = values .as_mut_slice() - .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); assert_eq!(res, Err(80)); } #[test] - fn try_map_with_mask_in_place_single_failure_lane_exact() { + fn try_map_masked_in_place_single_failure_lane_exact() { let mut values: Vec = (0..200).collect(); values[42] = u32::MAX; let mask = BitBuffer::new_set(200); let res = values .as_mut_slice() - .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); assert_eq!(res, Err(42)); } #[test] - fn try_map_with_mask_in_place_null_bypass() { + fn try_map_masked_in_place_null_bypass() { let mut values: Vec = (0..200).collect(); values[5] = u32::MAX; let mask = { @@ -1272,27 +995,26 @@ mod tests { }; let res = values .as_mut_slice() - .try_map_with_mask_in_place(&mask, |v, valid| { - v.wrapping_mul(valid as u32).checked_mul(2) - }); - assert!(res.is_ok()); + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); + assert!(res.is_ok(), "null-lane 
overflow should be filtered"); + // Null lane was overwritten with default (0). assert_eq!(values[5], 0); assert_eq!(values[6], 12); } #[test] - fn try_map_with_mask_in_place_remainder_overflow() { + fn try_map_masked_in_place_remainder_overflow() { let mut values: Vec = (0..130).collect(); values[129] = u32::MAX; let mask = BitBuffer::new_set(130); let res = values .as_mut_slice() - .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); assert_eq!(res, Err(129)); } #[test] - fn try_map_with_mask_in_place_sliced_mask() { + fn try_map_masked_in_place_sliced_mask() { let big = BitBuffer::new_set(256); let mask = big.slice(13..143); assert_eq!(mask.len(), 130); @@ -1301,7 +1023,7 @@ mod tests { values[77] = u32::MAX; let res = values .as_mut_slice() - .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2)); + .try_map_masked_in_place(&mask, |v| v.checked_mul(2)); assert_eq!(res, Err(77)); } @@ -1312,7 +1034,7 @@ mod tests { let mut buf: Vec = (0..130).map(|i| i as f32).collect(); let mask = BitBuffer::new_set(130); ReinterpretSink::::new(buf.as_mut_slice()) - .try_map_with_mask_in_place(&mask, |f, _valid| Some(f.to_bits().wrapping_add(1))) + .try_map_masked_in_place(&mask, |f| Some(f.to_bits().wrapping_add(1))) .unwrap(); // SAFETY: same size + alignment for f32 and u32; every slot now holds a u32 written by // the closure. @@ -1328,9 +1050,9 @@ mod tests { // Closure fails at a specific lane; the kernel must report that lane index. 
let mut buf: Vec = (0..200).map(|i| i as f32).collect(); let mask = BitBuffer::new_set(200); - let res = ReinterpretSink::::new(buf.as_mut_slice()).try_map_with_mask_in_place( + let res = ReinterpretSink::::new(buf.as_mut_slice()).try_map_masked_in_place( &mask, - |f, _valid| { + |f| { if f as u32 == 137 { None } else { @@ -1342,12 +1064,12 @@ mod tests { } #[test] - fn try_map_with_mask_in_place_partial_chunk_success() { + fn try_map_masked_in_place_partial_chunk_success() { let mut values: Vec = (0..130).collect(); let mask = BitBuffer::new_set(130); let res = values .as_mut_slice() - .try_map_with_mask_in_place(&mask, |v, _valid| Some(v + 1)); + .try_map_masked_in_place(&mask, |v| Some(v + 1)); assert!(res.is_ok()); assert_eq!(values[0], 1); assert_eq!(values[63], 64); From 608111c8f9fe3d7369de372773df6e59a6330296 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 19:26:19 +0100 Subject: [PATCH 19/21] f Signed-off-by: Joe Isaacs --- Cargo.lock | 4 - .../src/arrays/primitive/compute/cast.rs | 6 +- vortex-buffer/Cargo.toml | 11 +- vortex-buffer/benches/add_checked.rs | 175 ---------- vortex-buffer/benches/cast_to_indexed.rs | 209 ------------ vortex-buffer/benches/lane_kernels.rs | 313 ++++++++++++++++++ .../{lane_ops_indexed.rs => lane_kernels.rs} | 0 vortex-buffer/src/lib.rs | 2 +- 8 files changed, 318 insertions(+), 402 deletions(-) delete mode 100644 vortex-buffer/benches/add_checked.rs delete mode 100644 vortex-buffer/benches/cast_to_indexed.rs create mode 100644 vortex-buffer/benches/lane_kernels.rs rename vortex-buffer/src/{lane_ops_indexed.rs => lane_kernels.rs} (100%) diff --git a/Cargo.lock b/Cargo.lock index 9bb032d0d35..d29c91edf62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9355,11 +9355,7 @@ dependencies = [ name = "vortex-buffer" version = "0.1.0" dependencies = [ - "arrow-arith", - "arrow-array", "arrow-buffer", - "arrow-cast", - "arrow-schema", "bitvec", "bytes", "codspeed-divan-compat", diff --git 
a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs index 34bc6ba3445..112173b269f 100644 --- a/vortex-array/src/arrays/primitive/compute/cast.rs +++ b/vortex-array/src/arrays/primitive/compute/cast.rs @@ -5,9 +5,9 @@ use num_traits::AsPrimitive; use num_traits::NumCast; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; -use vortex_buffer::lane_ops_indexed::IndexedSinkExt; -use vortex_buffer::lane_ops_indexed::IndexedSourceExt; -use vortex_buffer::lane_ops_indexed::ReinterpretSink; +use vortex_buffer::lane_kernels::IndexedSinkExt; +use vortex_buffer::lane_kernels::IndexedSourceExt; +use vortex_buffer::lane_kernels::ReinterpretSink; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml index 882de199818..31b9d1c8570 100644 --- a/vortex-buffer/Cargo.toml +++ b/vortex-buffer/Cargo.toml @@ -37,11 +37,6 @@ vortex-error = { workspace = true } workspace = true [dev-dependencies] -# arrow-* are used by cast_to_indexed / add_checked benches to compare against arrow-rs. -arrow-arith = { workspace = true } -arrow-array = { workspace = true } -arrow-cast = { workspace = true } -arrow-schema = { workspace = true } divan = { workspace = true } num-traits = { workspace = true } rand = { workspace = true } @@ -56,9 +51,5 @@ name = "vortex_bitbuffer" harness = false [[bench]] -name = "cast_to_indexed" -harness = false - -[[bench]] -name = "add_checked" +name = "lane_kernels" harness = false diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs deleted file mode 100644 index 4f71f085847..00000000000 --- a/vortex-buffer/benches/add_checked.rs +++ /dev/null @@ -1,175 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Checked `u32 + u32 -> u32` over two nullable columns via [`try_map_masked_into`] -//! 
with a value-only closure. Per-lane `is_none()` flags are bit-packed and -//! AND-ed with the chunk validity word so null-lane overflow is filtered -//! without the closure ever inspecting `valid`. -//! -//! Verified at startup via [`assert_overflow_parity`] (valid-lane overflow -//! propagates as `Err`) and [`assert_null_overflow_suppressed`] (null-lane -//! overflow does not). - -#![expect(clippy::unwrap_used)] - -use std::mem::MaybeUninit; - -use divan::Bencher; -use rand::SeedableRng; -use rand::prelude::*; -use vortex_buffer::BitBuffer; -use vortex_buffer::BitBufferMut; -use vortex_buffer::Buffer; -use vortex_buffer::lane_ops_indexed::IndexedSourceExt; -use vortex_buffer::lane_ops_indexed::LaneZip; - -fn main() { - assert_overflow_parity(); - assert_null_overflow_suppressed(); - divan::main(); -} - -const SIZES: &[usize] = &[65_536]; -const LHS_VALID_RATE: f64 = 0.7; -const RHS_VALID_RATE: f64 = 0.8; - -struct Fixture { - /// Valid lanes carry bounded values; null lanes hold `u32::MAX` so a kernel - /// that ignores validity would Err on them. The implementation under test - /// must suppress that. 
- lhs: Buffer, - rhs: Buffer, - lhs_mask: BitBuffer, - rhs_mask: BitBuffer, -} - -fn fixture(n: usize) -> Fixture { - let mut lhs_rng = StdRng::seed_from_u64(0); - let mut rhs_rng = StdRng::seed_from_u64(1); - let mut lvr = StdRng::seed_from_u64(2); - let mut rvr = StdRng::seed_from_u64(3); - - let lhs_valid: Vec = (0..n).map(|_| lvr.random_bool(LHS_VALID_RATE)).collect(); - let rhs_valid: Vec = (0..n).map(|_| rvr.random_bool(RHS_VALID_RATE)).collect(); - - let lhs: Buffer = (0..n) - .map(|i| { - if lhs_valid[i] { - lhs_rng.random_range(0..u16::MAX as u32) - } else { - u32::MAX - } - }) - .collect(); - let rhs: Buffer = (0..n) - .map(|i| { - if rhs_valid[i] { - rhs_rng.random_range(0..u16::MAX as u32) - } else { - u32::MAX - } - }) - .collect(); - - let lhs_mask = { - let mut m = BitBufferMut::with_capacity(n); - for &v in &lhs_valid { - m.append(v); - } - m.freeze() - }; - let rhs_mask = { - let mut m = BitBufferMut::with_capacity(n); - for &v in &rhs_valid { - m.append(v); - } - m.freeze() - }; - - Fixture { - lhs, - rhs, - lhs_mask, - rhs_mask, - } -} - -fn alloc_out(n: usize) -> Vec> { - let mut out = Vec::with_capacity(n); - // SAFETY: every lane is written before any read inside the kernel. - unsafe { out.set_len(n) }; - out -} - -#[divan::bench(args = SIZES)] -fn bitpack_value_only(bencher: Bencher, n: usize) { - let f = fixture(n); - bencher - .with_inputs(|| { - ( - f.lhs.clone(), - f.rhs.clone(), - f.lhs_mask.clone(), - f.rhs_mask.clone(), - ) - }) - .bench_refs(|(lhs, rhs, lm, rm)| { - let combined = lm as &BitBuffer & rm as &BitBuffer; - let mut out = alloc_out(n); - LaneZip::new(lhs.as_slice(), rhs.as_slice()) - .try_map_masked_into(&combined, out.as_mut_slice(), |(a, b)| a.checked_add(b)) - .unwrap(); - (combined, out) - }); -} - -// --------------------------------------------------------------------------- -// Parity assertions — must pass before divan runs benches. 
-// --------------------------------------------------------------------------- - -/// Overflow at a valid lane must propagate as `Err`. -fn assert_overflow_parity() { - let lhs: Vec = vec![1, 2, u32::MAX, 4]; - let rhs: Vec = vec![10, 20, 1, 40]; - let valid = vec![true; 4]; - - let mask = { - let mut m = BitBufferMut::with_capacity(4); - for &v in &valid { - m.append(v); - } - m.freeze() - }; - - let mut out: Vec> = (0..4).map(|_| MaybeUninit::uninit()).collect(); - let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into( - &mask, - out.as_mut_slice(), - |(a, b)| a.checked_add(b), - ); - assert!(r.is_err(), "bitpack should Err on overflow"); -} - -/// Overflow at a null lane must NOT propagate. -fn assert_null_overflow_suppressed() { - // Lane 2 is null and holds an overflowing value; valid lanes are safe. - let lhs: Vec = vec![1, 2, u32::MAX, 4]; - let rhs: Vec = vec![10, 20, 1, 40]; - let valid = vec![true, true, false, true]; - - let mask = { - let mut m = BitBufferMut::with_capacity(4); - for &v in &valid { - m.append(v); - } - m.freeze() - }; - - let mut out = alloc_out(4); - let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into( - &mask, - out.as_mut_slice(), - |(a, b)| a.checked_add(b), - ); - assert!(r.is_ok(), "bitpack: null-lane overflow leaked"); -} diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs deleted file mode 100644 index 5ab1041f5cc..00000000000 --- a/vortex-buffer/benches/cast_to_indexed.rs +++ /dev/null @@ -1,209 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Coverage benchmark for the indexed lane-op variants used by primitive casts -//! and bit-packing paths. 
- -#![expect(clippy::unwrap_used)] - -use std::mem::MaybeUninit; - -use arrow_array::UInt16Array; -use arrow_array::UInt64Array; -use arrow_buffer::NullBuffer; -use arrow_buffer::ScalarBuffer; -use arrow_cast::CastOptions; -use divan::Bencher; -use num_traits::AsPrimitive; -use num_traits::NumCast; -use rand::SeedableRng; -use rand::prelude::*; -use rand::rngs::StdRng; -use vortex_buffer::BitBuffer; -use vortex_buffer::BitBufferMut; -use vortex_buffer::Buffer; -use vortex_buffer::lane_ops_indexed::IndexedSinkExt; -use vortex_buffer::lane_ops_indexed::IndexedSourceExt; -use vortex_buffer::lane_ops_indexed::ReinterpretSink; - -fn main() { - divan::main(); -} - -const SIZES: &[usize] = &[65_536]; - -struct Fixture { - values_u64: Buffer, - values_u16: Buffer, - /// Positive `i32` values (always representable as `u32`). Used by the - /// in-place-vs-out-of-place cast bench. - values_i32: Buffer, - mask: BitBuffer, -} - -fn fixture(n: usize) -> Fixture { - let mut rng = StdRng::seed_from_u64(0xC457_1D3E); - - let raw_values: Vec = (0..n) - .map(|_| rng.random_range(0..(u32::MAX as u64))) - .collect(); - let raw_valid: Vec = (0..n).map(|_| rng.random_bool(0.8)).collect(); - - #[expect(clippy::cast_possible_truncation)] - let values_u16 = raw_values - .iter() - .copied() - .map(|v| v as u16) - .collect::>(); - - // Positive i32 values (top bit cleared) — every value fits in u32. 
- #[expect(clippy::cast_possible_truncation, clippy::cast_possible_wrap)] - let values_i32 = raw_values - .iter() - .copied() - .map(|v| (v as i32) & i32::MAX) - .collect::>(); - - #[expect(clippy::cast_possible_truncation)] - let values_u32 = raw_values - .iter() - .copied() - .map(|v| v as u32) - .collect::>(); - - #[expect(clippy::cast_possible_truncation)] - let values_u32_small = raw_values - .iter() - .copied() - .map(|v| (v % ((u32::MAX as u64) / 2)) as u32) - .collect::>(); - - let values_u64_invalid_overflows = raw_values - .iter() - .copied() - .zip(raw_valid.iter().copied()) - .map(|(v, valid)| if valid { v } else { u64::MAX }) - .collect::>(); - - let arrow_u64 = UInt64Array::new( - ScalarBuffer::from(raw_values.clone()), - Some(NullBuffer::from(raw_valid.clone())), - ); - #[expect(clippy::cast_possible_truncation)] - let raw_u16: Vec = raw_values.iter().map(|&v| v as u16).collect(); - let arrow_u16 = UInt16Array::new( - ScalarBuffer::from(raw_u16), - Some(NullBuffer::from(raw_valid.clone())), - ); - - Fixture { - values_u64: raw_values.into(), - values_u16, - values_i32, - mask: BitBufferMut::from_iter(raw_valid).freeze(), - } -} - -fn uninit_out(n: usize) -> Vec> { - let mut out = Vec::with_capacity(n); - // SAFETY: A `MaybeUninit` does not require initialization. 
- unsafe { - out.set_len(n); - } - out -} - -#[divan::bench(args = SIZES)] -fn try_map_into_narrow_u64_u32(bencher: Bencher, n: usize) { - let f = fixture(n); - - bencher - .with_inputs(|| (f.values_u64.clone(), uninit_out::(n))) - .bench_values(|(values, mut out)| { - values - .as_slice() - .try_map_into(out.as_mut_slice(), ::from) - .unwrap(); - out - }); -} - -#[divan::bench(args = SIZES)] -fn map_with_mask_narrow_u64_u32(bencher: Bencher, n: usize) { - let f = fixture(n); - - bencher - .with_inputs(|| (f.values_u64.clone(), uninit_out::(n))) - .bench_values(|(values, mut out)| { - values.as_slice().map_into(&mut out, |v| v.as_()); - out - }); -} - -/// `try_map_masked_into_widen_u16_u32` and `map_with_mask_widen_u16_u32` have the same runtime -/// and showing for always true map operations `try_map_masked_into` is sufficient. -#[divan::bench(args = SIZES)] -fn try_map_masked_into_widen_u16_u32(bencher: Bencher, n: usize) { - let f = fixture(n); - - bencher - .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) - .bench_values(|(values, mask, mut out)| { - values - .as_slice() - .try_map_masked_into(&mask, out.as_mut_slice(), |v| ::from(v)) - .unwrap(); - out - }); -} - -#[divan::bench(args = SIZES)] -fn map_with_mask_widen_u16_u32(bencher: Bencher, n: usize) { - let f = fixture(n); - - bencher - .with_inputs(|| (f.values_u16.clone(), uninit_out::(n))) - .bench_values(|(values, mut out)| { - values.as_slice().map_into(out.as_mut_slice(), |v| v.as_()); - out - }); -} - -// ----------------------------------------------------------------------------- -// In-place vs out-of-place fallible cast i32 → u32 (same byte width). -// -// `try_map_masked_into_in_place` mutates the input via `ReinterpretSink` and -// transmutes the wrapper — no output allocation. `try_map_masked_into` allocates -// a fresh `BufferMut` and writes through it. 
Input values are all positive -// `i32` so every lane succeeds; the two kernels do the same arithmetic, so any -// delta is pure allocation + memory-traffic overhead. -// ----------------------------------------------------------------------------- - -#[divan::bench(args = SIZES)] -fn try_map_masked_into_narrow_i32_u32(bencher: Bencher, n: usize) { - let f = fixture(n); - - bencher - .with_inputs(|| (f.values_i32.clone(), f.mask.clone(), uninit_out::(n))) - .bench_values(|(values, mask, mut out)| { - values - .as_slice() - .try_map_masked_into(&mask, out.as_mut_slice(), |v| ::from(v)) - .unwrap(); - out - }); -} - -#[divan::bench(args = SIZES)] -fn try_map_masked_into_in_place_narrow_i32_u32(bencher: Bencher, n: usize) { - let f = fixture(n); - - bencher - .with_inputs(|| (f.values_i32.as_slice().to_vec(), f.mask.clone())) - .bench_values(|(mut values, mask)| { - ReinterpretSink::::new(values.as_mut_slice()) - .try_map_masked_in_place(&mask, |v| ::from(v)) - .unwrap(); - values - }); -} diff --git a/vortex-buffer/benches/lane_kernels.rs b/vortex-buffer/benches/lane_kernels.rs new file mode 100644 index 00000000000..60ab967e1ed --- /dev/null +++ b/vortex-buffer/benches/lane_kernels.rs @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Coverage benchmark for the lane-kernel variants used by primitive casts, +//! bit-packing paths, and `LaneZip` binary kernels. +//! +//! `add_checked` parity assertions (run at startup) verify that the bit-packed +//! fail-tracking scheme: +//! - propagates valid-lane overflow as `Err`, and +//! - suppresses null-lane overflow without the closure ever inspecting `valid`. 
+ +#![expect(clippy::unwrap_used)] + +use std::mem::MaybeUninit; + +use divan::Bencher; +use num_traits::AsPrimitive; +use num_traits::NumCast; +use rand::SeedableRng; +use rand::prelude::*; +use rand::rngs::StdRng; +use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; +use vortex_buffer::Buffer; +use vortex_buffer::lane_kernels::IndexedSinkExt; +use vortex_buffer::lane_kernels::IndexedSourceExt; +use vortex_buffer::lane_kernels::LaneZip; +use vortex_buffer::lane_kernels::ReinterpretSink; + +fn main() { + assert_overflow_parity(); + assert_null_overflow_suppressed(); + divan::main(); +} + +const SIZES: &[usize] = &[65_536]; + +// ----------------------------------------------------------------------------- +// Cast fixture (u64/u16/i32 lanes + a single validity mask). +// ----------------------------------------------------------------------------- + +struct CastFixture { + values_u64: Buffer, + values_u16: Buffer, + /// Positive `i32` values (always representable as `u32`). Used by the + /// in-place-vs-out-of-place cast bench. + values_i32: Buffer, + mask: BitBuffer, +} + +fn cast_fixture(n: usize) -> CastFixture { + let mut rng = StdRng::seed_from_u64(0xC457_1D3E); + + let raw_values: Vec = (0..n) + .map(|_| rng.random_range(0..(u32::MAX as u64))) + .collect(); + let raw_valid: Vec = (0..n).map(|_| rng.random_bool(0.8)).collect(); + + #[expect(clippy::cast_possible_truncation)] + let values_u16 = raw_values + .iter() + .copied() + .map(|v| v as u16) + .collect::>(); + + // Positive i32 values (top bit cleared) — every value fits in u32. 
+ #[expect(clippy::cast_possible_truncation)] + let values_i32 = raw_values + .iter() + .copied() + .map(|v| (v as i32) & i32::MAX) + .collect::>(); + + CastFixture { + values_u64: raw_values.into(), + values_u16, + values_i32, + mask: BitBufferMut::from_iter(raw_valid).freeze(), + } +} + +fn uninit_out(n: usize) -> Vec> { + let mut out = Vec::with_capacity(n); + // SAFETY: A `MaybeUninit` does not require initialization. + unsafe { + out.set_len(n); + } + out +} + +// ----------------------------------------------------------------------------- +// Cast benches (single-input, source -> output). +// ----------------------------------------------------------------------------- + +#[divan::bench(args = SIZES)] +fn try_map_into_narrow_u64_u32(bencher: Bencher, n: usize) { + let f = cast_fixture(n); + + bencher + .with_inputs(|| (f.values_u64.clone(), uninit_out::(n))) + .bench_values(|(values, mut out)| { + values + .as_slice() + .try_map_into(out.as_mut_slice(), ::from) + .unwrap(); + out + }); +} + +#[divan::bench(args = SIZES)] +fn map_with_mask_narrow_u64_u32(bencher: Bencher, n: usize) { + let f = cast_fixture(n); + + bencher + .with_inputs(|| (f.values_u64.clone(), uninit_out::(n))) + .bench_values(|(values, mut out)| { + values.as_slice().map_into(&mut out, |v| v.as_()); + out + }); +} + +/// `try_map_masked_into_widen_u16_u32` and `map_with_mask_widen_u16_u32` have the +/// same runtime — for always-true map operations `try_map_masked_into` is +/// sufficient. 
+#[divan::bench(args = SIZES)] +fn try_map_masked_into_widen_u16_u32(bencher: Bencher, n: usize) { + let f = cast_fixture(n); + + bencher + .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::(n))) + .bench_values(|(values, mask, mut out)| { + values + .as_slice() + .try_map_masked_into(&mask, out.as_mut_slice(), ::from) + .unwrap(); + out + }); +} + +#[divan::bench(args = SIZES)] +fn map_with_mask_widen_u16_u32(bencher: Bencher, n: usize) { + let f = cast_fixture(n); + + bencher + .with_inputs(|| (f.values_u16.clone(), uninit_out::(n))) + .bench_values(|(values, mut out)| { + values.as_slice().map_into(out.as_mut_slice(), |v| v.as_()); + out + }); +} + +// ----------------------------------------------------------------------------- +// In-place vs out-of-place fallible cast i32 → u32 (same byte width). +// +// `try_map_masked_in_place` mutates the input via `ReinterpretSink` and +// transmutes the wrapper — no output allocation. `try_map_masked_into` allocates +// a fresh `BufferMut` and writes through it. Input values are all positive +// `i32` so every lane succeeds; the two kernels do the same arithmetic, so any +// delta is allocation + memory-traffic overhead. 
+// ----------------------------------------------------------------------------- + +#[divan::bench(args = SIZES)] +fn try_map_masked_into_narrow_i32_u32(bencher: Bencher, n: usize) { + let f = cast_fixture(n); + + bencher + .with_inputs(|| (f.values_i32.clone(), f.mask.clone(), uninit_out::(n))) + .bench_values(|(values, mask, mut out)| { + values + .as_slice() + .try_map_masked_into(&mask, out.as_mut_slice(), ::from) + .unwrap(); + out + }); +} + +#[divan::bench(args = SIZES)] +fn try_map_masked_in_place_narrow_i32_u32(bencher: Bencher, n: usize) { + let f = cast_fixture(n); + + bencher + .with_inputs(|| (f.values_i32.as_slice().to_vec(), f.mask.clone())) + .bench_values(|(mut values, mask)| { + ReinterpretSink::::new(values.as_mut_slice()) + .try_map_masked_in_place(&mask, ::from) + .unwrap(); + values + }); +} + +// ----------------------------------------------------------------------------- +// LaneZip binary kernel: checked `u32 + u32 -> u32` over two nullable columns. +// +// Per-lane `is_none()` flags are bit-packed and AND-ed with the chunk validity +// word, so null-lane overflow is filtered without the closure inspecting `valid`. +// Verified at startup via parity assertions (`assert_overflow_parity` and +// `assert_null_overflow_suppressed`). +// ----------------------------------------------------------------------------- + +const ADD_LHS_VALID_RATE: f64 = 0.7; +const ADD_RHS_VALID_RATE: f64 = 0.8; + +struct AddFixture { + /// Valid lanes carry bounded values; null lanes hold `u32::MAX` so a kernel + /// that ignores validity would `Err` on them. The implementation under test + /// must suppress that. 
+ lhs: Buffer, + rhs: Buffer, + lhs_mask: BitBuffer, + rhs_mask: BitBuffer, +} + +fn add_fixture(n: usize) -> AddFixture { + let mut lhs_rng = StdRng::seed_from_u64(0); + let mut rhs_rng = StdRng::seed_from_u64(1); + let mut lvr = StdRng::seed_from_u64(2); + let mut rvr = StdRng::seed_from_u64(3); + + let lhs_valid: Vec = (0..n) + .map(|_| lvr.random_bool(ADD_LHS_VALID_RATE)) + .collect(); + let rhs_valid: Vec = (0..n) + .map(|_| rvr.random_bool(ADD_RHS_VALID_RATE)) + .collect(); + + let lhs: Buffer = (0..n) + .map(|i| { + if lhs_valid[i] { + lhs_rng.random_range(0..u16::MAX as u32) + } else { + u32::MAX + } + }) + .collect(); + let rhs: Buffer = (0..n) + .map(|i| { + if rhs_valid[i] { + rhs_rng.random_range(0..u16::MAX as u32) + } else { + u32::MAX + } + }) + .collect(); + + let lhs_mask = BitBufferMut::from_iter(lhs_valid).freeze(); + let rhs_mask = BitBufferMut::from_iter(rhs_valid).freeze(); + + AddFixture { + lhs, + rhs, + lhs_mask, + rhs_mask, + } +} + +#[divan::bench(args = SIZES)] +fn lanezip_checked_add_u32(bencher: Bencher, n: usize) { + let f = add_fixture(n); + bencher + .with_inputs(|| { + ( + f.lhs.clone(), + f.rhs.clone(), + f.lhs_mask.clone(), + f.rhs_mask.clone(), + ) + }) + .bench_refs(|(lhs, rhs, lm, rm)| { + let combined = lm as &BitBuffer & rm as &BitBuffer; + let mut out = uninit_out::(n); + LaneZip::new(lhs.as_slice(), rhs.as_slice()) + .try_map_masked_into(&combined, out.as_mut_slice(), |(a, b)| a.checked_add(b)) + .unwrap(); + (combined, out) + }); +} + +// ----------------------------------------------------------------------------- +// Parity assertions — must pass before divan runs benches. +// ----------------------------------------------------------------------------- + +/// Overflow at a valid lane must propagate as `Err`. 
+fn assert_overflow_parity() { + let lhs: Vec = vec![1, 2, u32::MAX, 4]; + let rhs: Vec = vec![10, 20, 1, 40]; + let valid = vec![true; 4]; + + let mask = BitBufferMut::from_iter(valid).freeze(); + let mut out: Vec> = (0..4).map(|_| MaybeUninit::uninit()).collect(); + let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into( + &mask, + out.as_mut_slice(), + |(a, b)| a.checked_add(b), + ); + assert!(r.is_err(), "bitpack should Err on overflow"); +} + +/// Overflow at a null lane must NOT propagate. +fn assert_null_overflow_suppressed() { + // Lane 2 is null and holds an overflowing value; valid lanes are safe. + let lhs: Vec = vec![1, 2, u32::MAX, 4]; + let rhs: Vec = vec![10, 20, 1, 40]; + let valid = vec![true, true, false, true]; + + let mask = BitBufferMut::from_iter(valid).freeze(); + let mut out = uninit_out::(4); + let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into( + &mask, + out.as_mut_slice(), + |(a, b)| a.checked_add(b), + ); + assert!(r.is_ok(), "bitpack: null-lane overflow leaked"); +} diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_kernels.rs similarity index 100% rename from vortex-buffer/src/lane_ops_indexed.rs rename to vortex-buffer/src/lane_kernels.rs diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs index 5fe7a4cf40d..667a1f11a9d 100644 --- a/vortex-buffer/src/lib.rs +++ b/vortex-buffer/src/lib.rs @@ -62,7 +62,7 @@ mod buffer_mut; mod bytes; mod r#const; mod debug; -pub mod lane_ops_indexed; +pub mod lane_kernels; mod macros; #[cfg(feature = "memmap2")] mod memmap2; From d0a7806df88f19093eafe4fe8e0e39c5b00dbd0a Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 19:35:36 +0100 Subject: [PATCH 20/21] f Signed-off-by: Joe Isaacs --- vortex-buffer/src/lane_kernels.rs | 69 +++++++++++++++---------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/vortex-buffer/src/lane_kernels.rs b/vortex-buffer/src/lane_kernels.rs index 
810d76c0900..d1e0d9b5f2e 100644 --- a/vortex-buffer/src/lane_kernels.rs +++ b/vortex-buffer/src/lane_kernels.rs @@ -16,8 +16,6 @@ //! Both kernels handle a mask with a non-byte-aligned offset and with a logical `len` //! shorter than the underlying byte buffer, via [`BitBuffer::chunks`]. -#![allow(clippy::many_single_char_names)] - use std::marker::PhantomData; use std::mem::MaybeUninit; use std::mem::align_of; @@ -273,13 +271,13 @@ pub trait IndexedSourceExt: IndexedSource + Sized { { let mut fail_bits: u64 = 0; for bit_idx in 0..count { - let i = base + bit_idx; + let idx = base + bit_idx; // SAFETY: caller guarantees base + count <= len. - let v = unsafe { values.get_unchecked(i) }; - let opt = f(v); + let val = unsafe { values.get_unchecked(idx) }; + let opt = f(val); fail_bits |= (opt.is_none() as u64) << bit_idx; - let r = opt.unwrap_or_default(); - unsafe { out.get_unchecked_mut(i).write(r) }; + let result = opt.unwrap_or_default(); + unsafe { out.get_unchecked_mut(idx).write(result) }; } let valid_failures = fail_bits & src_chunk; (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize) @@ -338,10 +336,10 @@ pub trait IndexedSourceExt: IndexedSource + Sized { F: FnMut(S::Item) -> R, { for bit_idx in 0..count { - let i = base + bit_idx; + let idx = base + bit_idx; // SAFETY: caller guarantees base + count <= len. - let v = unsafe { values.get_unchecked(i) }; - unsafe { out.get_unchecked_mut(i).write(f(v)) }; + let val = unsafe { values.get_unchecked(idx) }; + unsafe { out.get_unchecked_mut(idx).write(f(val)) }; } } @@ -406,13 +404,13 @@ pub trait IndexedSourceExt: IndexedSource + Sized { { let mut fail_acc: u64 = 0; for bit_idx in 0..count { - let i = base + bit_idx; + let idx = base + bit_idx; // SAFETY: caller guarantees base + count <= len. 
- let v = unsafe { values.get_unchecked(i) }; - let opt = f(v); + let val = unsafe { values.get_unchecked(idx) }; + let opt = f(val); fail_acc |= opt.is_none() as u64; - let r = opt.unwrap_or_default(); - unsafe { out.get_unchecked_mut(i).write(r) }; + let result = opt.unwrap_or_default(); + unsafe { out.get_unchecked_mut(idx).write(result) }; } fail_acc != 0 } @@ -459,11 +457,11 @@ where S: IndexedSource, { for bit_idx in 0..chunk_len { - let i = base + bit_idx; - // SAFETY: caller guarantees i < values.len(). - let v = unsafe { values.get_unchecked(i) }; - if lane_fails(bit_idx, v) { - return i; + let idx = base + bit_idx; + // SAFETY: caller guarantees idx < values.len(). + let val = unsafe { values.get_unchecked(idx) }; + if lane_fails(bit_idx, val) { + return idx; } } unreachable!("cold_scan called without a failing lane") @@ -477,7 +475,7 @@ where S: IndexedSource, F: FnMut(S::Item) -> Option, { - cold_scan(values, base, chunk_len, |_bit_idx, v| f(v).is_none()) + cold_scan(values, base, chunk_len, |_bit_idx, val| f(val).is_none()) } /// Extension trait providing in-place lane-kernel methods on any [`IndexedSink`]. @@ -509,12 +507,12 @@ pub trait IndexedSinkExt: IndexedSink + Sized { F: FnMut(S::Item) -> S::Write, { for bit_idx in 0..count { - let i = base + bit_idx; + let idx = base + bit_idx; // SAFETY: caller guarantees base + count <= len. - let v = unsafe { values.get_unchecked(i) }; - let r = f(v); + let val = unsafe { values.get_unchecked(idx) }; + let result = f(val); // SAFETY: caller guarantees base + count <= len. - unsafe { values.set_unchecked(i, r) }; + unsafe { values.set_unchecked(idx, result) }; } } @@ -561,14 +559,14 @@ pub trait IndexedSinkExt: IndexedSink + Sized { { let mut fail_bits: u64 = 0; for bit_idx in 0..count { - let i = base + bit_idx; + let idx = base + bit_idx; // SAFETY: caller guarantees base + count <= len. 
- let v = unsafe { values.get_unchecked(i) }; - let opt = f(v); + let val = unsafe { values.get_unchecked(idx) }; + let opt = f(val); fail_bits |= (opt.is_none() as u64) << bit_idx; - let r = opt.unwrap_or_default(); + let result = opt.unwrap_or_default(); // SAFETY: caller guarantees base + count <= len. - unsafe { values.set_unchecked(i, r) }; + unsafe { values.set_unchecked(idx, result) }; } (fail_bits != 0).then_some(base + fail_bits.trailing_zeros() as usize) } @@ -628,7 +626,6 @@ pub trait IndexedSinkExt: IndexedSink + Sized { /// /// Panics if `self.len() != mask.len()`. #[inline] - #[allow(clippy::cast_possible_truncation)] fn try_map_masked_in_place(self, mask: &BitBuffer, mut f: F) -> Result<(), usize> where Self::Write: Default, @@ -653,13 +650,13 @@ pub trait IndexedSinkExt: IndexedSink + Sized { { let mut fail_bits: u64 = 0; for bit_idx in 0..count { - let i = base + bit_idx; + let idx = base + bit_idx; // SAFETY: caller guarantees `base + count <= values.len()`. - let v = unsafe { values.get_unchecked(i) }; - let opt = f(v); + let val = unsafe { values.get_unchecked(idx) }; + let opt = f(val); fail_bits |= (opt.is_none() as u64) << bit_idx; - let r = opt.unwrap_or_default(); - unsafe { values.set_unchecked(i, r) }; + let result = opt.unwrap_or_default(); + unsafe { values.set_unchecked(idx, result) }; } let valid_failures = fail_bits & src_chunk; (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize) From fc9b5e857434054953cc74bcd0e1fb49005fc00f Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 27 May 2026 19:50:10 +0100 Subject: [PATCH 21/21] f Signed-off-by: Joe Isaacs --- .../src/arrays/primitive/compute/cast.rs | 56 +++---------------- 1 file changed, 9 insertions(+), 47 deletions(-) diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs index 112173b269f..82bbb1c0d23 100644 --- a/vortex-array/src/arrays/primitive/compute/cast.rs +++ 
b/vortex-array/src/arrays/primitive/compute/cast.rs @@ -105,18 +105,7 @@ impl CastKernel for Primitive { } } -/// Cast values from `F` to `T`. Always routes through the fallible lane-op kernels with -/// `NumCast::from`. The kernel branches once on the mask shape: -/// -/// - `Mask::AllTrue` → [`try_map_into`] — no per-lane validity work. -/// - `Mask::AllFalse` → bulk zero — the closure is never invoked. -/// - `Mask::Values` → [`try_map_masked_into`] — the closure neutralizes null lanes -/// via the `* valid as F` multiply trick so out-of-range null-lane values don't -/// trigger spurious errors. -/// -/// For statically-infallible casts (e.g. widening) LLVM proves `NumCast::from` always -/// returns `Some` and strips the fail-tracking machinery, generating the same bare -/// `ushll` widen loop the old hand-written `as_()` fast path produced. +/// Cast Primitive values from `F` to `T`. fn cast_values( array: ArrayView<'_, Primitive>, new_validity: Validity, @@ -134,28 +123,18 @@ where }; // Returns `true` if every value of `from` is representable in `to` without loss. - // - // Equivalent to `from.least_supertype(to) == Some(to)`, i.e. the value domain of `from` - // is a subset of `to`'s. This is the static-only check — it does not consult any array - // statistics. Used to short-circuit checked casts when the conversion is infallible by - // type alone (widening uint→uint, signed→signed, u8→i16, i32→f64, etc.). fn casts_losslessly_to(from: PType, to: PType) -> bool { from.least_supertype(to) == Some(to) } - // Skip the fallible kernel when the conversion is infallible by type alone (widening) or - // when cached min/max prove every value fits in `T`. + // Skip the fallible kernel when type widening or (cached) min/max prove every value fits. 
let target_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable); let infallible = casts_losslessly_to(F::PTYPE, T::PTYPE) || cached_values_fit_in(array, &target_dtype) == Some(true); let len = array.len(); - // Same-bit-width in-place fast path: when F and T have the same byte width, try to take - // unique ownership of the buffer. If successful, each kernel call site below mutates in - // place via `ReinterpretSink` and transmutes the wrapper at the end, saving the output - // allocation. Falls back to the out-of-place path (borrowed slice + fresh buffer) when - // the buffer is shared — the common case under the current borrow-based kernel API. + // If F and T have the same byte width, try to take unique ownership of the buffer. let same_bit_width = F::PTYPE.byte_width() == T::PTYPE.byte_width(); let owned: Option> = if same_bit_width { array.into_owned().try_into_buffer_mut::().ok() @@ -165,14 +144,10 @@ where let values: &[F] = array.as_slice::(); if infallible { - // Truncating `as`-cast — safe here because static type analysis or cached stats prove - // every valid value fits. Null lanes' underlying garbage gets truncated/wrapped - // (harmless: the result validity bitmap masks them downstream). return match owned { Some(mut buf) => { ReinterpretSink::::new(buf.as_mut_slice()).map_into_in_place(|v: F| v.as_()); - // SAFETY: same size + alignment for NativePType same-byte-width pairs; - // every F-slot was overwritten with a real `T` bit pattern. + // SAFETY: same size + alignment for NativePType let result: BufferMut = unsafe { buf.transmute::() }; Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array()) } @@ -193,8 +168,7 @@ where ReinterpretSink::::new(buf.as_mut_slice()) .try_map_in_place(|v: F| ::from(v)) .map_err(|_| overflow())?; - // SAFETY: same size + alignment for NativePType same-byte-width pairs; - // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`. 
+ // SAFETY: same size + alignment for NativePType let result: BufferMut = unsafe { buf.transmute::() }; result.freeze() } @@ -205,33 +179,21 @@ where ::from(v) }) .map_err(|_| overflow())?; - // SAFETY: try_map_into returned Ok, so it initialized every lane. + // SAFETY: initialized every lane. unsafe { buffer.set_len(len) }; buffer.freeze() } - (Mask::AllFalse(_), Some(buf)) => { - // SAFETY: same size + alignment by NativePType same-byte-width invariant. - let mut t_buf: BufferMut = unsafe { buf.transmute::() }; - t_buf.as_mut_slice().fill(T::zero()); - t_buf.freeze() - } - (Mask::AllFalse(_), None) => BufferMut::::zeroed(len).freeze(), + (Mask::AllFalse(_), _) => BufferMut::::zeroed(len).freeze(), (Mask::Values(m), Some(mut buf)) => { ReinterpretSink::::new(buf.as_mut_slice()) .try_map_masked_in_place(m.bit_buffer(), |v: F| ::from(v)) .map_err(|_| overflow())?; - // SAFETY: same size + alignment for NativePType same-byte-width pairs; - // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`. + // SAFETY: same size + alignment for NativePType let result: BufferMut = unsafe { buf.transmute::() }; result.freeze() } (Mask::Values(m), None) => { let mut buffer = BufferMut::::with_capacity(len); - // Null-lane failures (where the underlying garbage value can't be represented in - // `T`) are filtered automatically by `try_map_masked_into`'s post-loop - // `fail_bits & src_chunk` AND. The closure is value-only — LLVM proves it's - // statically infallible for widening casts and DCEs the fail-tracking, giving the - // same codegen as the maskless kernel. values .try_map_masked_into( m.bit_buffer(), @@ -239,7 +201,7 @@ where |v| ::from(v), ) .map_err(|_| overflow())?; - // SAFETY: try_map_masked_into returned Ok, so it initialized every lane. + // SAFETY: initialized every lane. unsafe { buffer.set_len(len) }; buffer.freeze() }