From 7b5828f82478c88ae555685ffb3babe34a6932fc Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Tue, 26 May 2026 20:16:13 +0100
Subject: [PATCH 01/21] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock                                    |   4 +
 vortex-array/Cargo.toml                       |   1 +
 .../src/arrays/primitive/compute/cast.rs      |  24 +-
 vortex-buffer/Cargo.toml                      |  10 +
 vortex-buffer/benches/cast_to.rs              | 323 ++++++++
 vortex-buffer/src/lane_ops.rs                 | 713 ++++++++++++++++++
 vortex-buffer/src/lib.rs                      |   2 +
 7 files changed, 1070 insertions(+), 7 deletions(-)
 create mode 100644 vortex-buffer/benches/cast_to.rs
 create mode 100644 vortex-buffer/src/lane_ops.rs
diff --git a/Cargo.lock b/Cargo.lock
index 045c72176fd..11afc6996a2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9355,13 +9355,17 @@ dependencies = [
 name = "vortex-buffer"
 version = "0.1.0"
 dependencies = [
+ "arrow-array",
  "arrow-buffer",
+ "arrow-cast",
+ "arrow-schema",
  "bitvec",
  "bytes",
  "codspeed-divan-compat",
  "itertools 0.14.0",
  "memmap2",
  "num-traits",
+ "rand 0.10.1",
  "rstest",
  "serde",
  "simdutf8",
diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml
index 666a23c02c4..e5233ce7cc6 100644
--- a/vortex-array/Cargo.toml
+++ b/vortex-array/Cargo.toml
@@ -218,3 +218,4 @@ harness = false
 [[bench]]
 name = "to_arrow"
 harness = false
+
diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs
index 10c0b8d6eba..bbe2f89322d 100644
--- a/vortex-array/src/arrays/primitive/compute/cast.rs
+++ b/vortex-array/src/arrays/primitive/compute/cast.rs
@@ -5,6 +5,7 @@ use num_traits::AsPrimitive;
 use num_traits::NumCast;
 use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
+use vortex_buffer::try_map_with_mask;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
 use vortex_error::vortex_err;
@@ -143,13 +144,22 @@ where
         )?
         .freeze(),
         Mask::AllFalse(_) => BufferMut::<T>::zeroed(values.len()).freeze(),
-        Mask::Values(m) => BufferMut::try_from_trusted_len_iter(
-            values.iter().zip(m.bit_buffer().iter()).map(|(&v, valid)| {
-                let factor = if valid { F::one() } else { F::zero() };
-                <T as NumCast>::from(v * factor).ok_or_else(overflow)
-            }),
-        )?
-        .freeze(),
+        Mask::Values(m) => {
+            let mut buffer = BufferMut::<T>::with_capacity(values.len());
+            try_map_with_mask(
+                values,
+                m.bit_buffer(),
+                &mut buffer.spare_capacity_mut()[..values.len()],
+                |v, valid| {
+                    let factor = if valid { F::one() } else { F::zero() };
+                    <T as NumCast>::from(v * factor)
+                },
+            )
+            .map_err(|_| overflow())?;
+            // SAFETY: try_map_with_mask returned Ok, so it initialized every lane.
+            unsafe { buffer.set_len(values.len()) };
+            buffer.freeze()
+        }
     };
 
     Ok(PrimitiveArray::new(buffer, new_validity).into_array())
diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml
index ae9d7e6cc05..850aec4ec19 100644
--- a/vortex-buffer/Cargo.toml
+++ b/vortex-buffer/Cargo.toml
@@ -37,8 +37,14 @@ vortex-error = { workspace = true }
 workspace = true
 
 [dev-dependencies]
+# TEMP: arrow-{array,cast,schema} are only used by the cast_to bench for cross-impl
+# performance comparisons. Drop them when the bench is removed.
+arrow-array = { workspace = true }
+arrow-cast = { workspace = true }
+arrow-schema = { workspace = true }
 divan = { workspace = true }
 num-traits = { workspace = true }
+rand = { workspace = true }
 rstest = { workspace = true }
 
 [[bench]]
@@ -48,3 +54,7 @@ harness = false
 [[bench]]
 name = "vortex_bitbuffer"
 harness = false
+
+[[bench]]
+name = "cast_to"
+harness = false
diff --git a/vortex-buffer/benches/cast_to.rs b/vortex-buffer/benches/cast_to.rs
new file mode 100644
index 00000000000..c070f65d3a0
--- /dev/null
+++ b/vortex-buffer/benches/cast_to.rs
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Cast `u64 -> u32` over a nullable column, three ways:
+//!
+//! 1. `kernel_map_with_mask` — uses `map_with_mask`. Writes truncated values into a
+//!    pre-allocated `&mut [MaybeUninit<u32>]`. Null lanes write `0` via the branchless
+//!    `v * valid as u64` trick, mirroring the `Mask::Values` arm in `primitive/compute/cast.rs`.
+//! 2. `iter_zip` — `values.iter().zip(mask.iter())` collected through
+//!    `BufferMut::from_trusted_len_iter`. This is the shape the current Vortex cast uses.
+//! 3. `arrow_cast` — `arrow_cast::cast` against a `UInt64Array`, allocating a new
+//!    `UInt32Array`.
+//!
+//! Plus three fallible variants that error on overflow:
+//!
+//! 4. `kernel_try_map_with_mask` — `try_map_with_mask` with `|v, valid| (v <= MAX).then_some(...)`.
+//!    Unconditional cast + parallel range check OR-reduced into a u64 fail accumulator.
+//! 5. `iter_zip_checked` — `BufferMut::try_from_trusted_len_iter` returning Err on overflow.
+//! 6. `arrow_cast_checked` — `arrow_cast::cast` with `safe = false` (errors on overflow).
+//!
+//! Inputs are bounded to fit in `u32`, so the fallible variants always succeed and we
+//! measure the cost of the range check on the success path.
+
+#![expect(clippy::unwrap_used)]
+
+use std::mem::MaybeUninit;
+
+use arrow_array::UInt64Array;
+use arrow_buffer::NullBuffer;
+use arrow_buffer::ScalarBuffer;
+use arrow_cast::CastOptions;
+use arrow_cast::cast_with_options;
+use arrow_schema::DataType;
+use divan::Bencher;
+use rand::SeedableRng;
+use rand::prelude::*;
+use vortex_buffer::BitBuffer;
+use vortex_buffer::BitBufferMut;
+use vortex_buffer::Buffer;
+use vortex_buffer::BufferMut;
+use vortex_buffer::map_with_mask;
+use vortex_buffer::try_map_with_mask;
+
+fn main() {
+    divan::main();
+}
+
+const SIZES: &[usize] = &[4_096, 65_536, 1_048_576];
+const VALID_RATE: f64 = 0.7;
+const DATA_SEED: u64 = 0;
+const VALID_SEED: u64 = 1;
+
+// Non-byte-aligned bit offset → forces BitChunks::iter() to shift across byte
+// boundaries on every chunk it yields.
+const SLICE_OFFSET: usize = 5;
+
+struct Fixture {
+    values: Buffer<u64>,
+    /// `offset() == 0`, underlying byte buffer starts on a byte boundary.
+    mask_aligned: BitBuffer,
+    /// Same validity bits but sliced so `offset() == SLICE_OFFSET`.
+    mask_unaligned: BitBuffer,
+    arrow_arr: UInt64Array,
+    /// Same as `arrow_arr` but its NullBuffer has a non-byte-aligned bit offset,
+    /// constructed by building an oversized array and slicing.
+    arrow_arr_unaligned: UInt64Array,
+}
+
+fn fixture(n: usize) -> Fixture {
+    let mut data_rng = StdRng::seed_from_u64(DATA_SEED);
+    let mut valid_rng = StdRng::seed_from_u64(VALID_SEED);
+    let raw_values: Vec<u64> = (0..n)
+        .map(|_| data_rng.random_range(0..u32::MAX as u64))
+        .collect();
+    let raw_valid: Vec<bool> = (0..n).map(|_| valid_rng.random_bool(VALID_RATE)).collect();
+
+    let values: Buffer<u64> = raw_values.iter().copied().collect();
+
+    let mask_aligned = {
+        let mut m = BitBufferMut::with_capacity(n);
+        for &v in &raw_valid {
+            m.append(v);
+        }
+        m.freeze()
+    };
+
+    // Build n + SLICE_OFFSET bits then slice off the leading SLICE_OFFSET, so the
+    // remaining `n` lanes carry the SAME validity pattern as the aligned mask.
+    let mask_unaligned = {
+        let mut m = BitBufferMut::with_capacity(n + SLICE_OFFSET);
+        for _ in 0..SLICE_OFFSET {
+            m.append(false); // filler — sliced away
+        }
+        for &v in &raw_valid {
+            m.append(v);
+        }
+        m.freeze().slice(SLICE_OFFSET..SLICE_OFFSET + n)
+    };
+    debug_assert_eq!(mask_unaligned.offset(), SLICE_OFFSET);
+    debug_assert_eq!(mask_unaligned.len(), n);
+
+    let arrow_arr = UInt64Array::new(
+        ScalarBuffer::from(raw_values.clone()),
+        Some(NullBuffer::from(raw_valid.clone())),
+    );
+
+    // Oversized array → slice off SLICE_OFFSET lanes so the resulting array's
+    // NullBuffer has `offset() == SLICE_OFFSET`. The remaining `n` lanes hold the
+    // same validity pattern as `arrow_arr`.
+    let arrow_arr_unaligned = {
+        let mut padded_values: Vec<u64> = vec![0; SLICE_OFFSET];
+        padded_values.extend_from_slice(&raw_values);
+        let mut padded_valid: Vec<bool> = vec![false; SLICE_OFFSET];
+        padded_valid.extend_from_slice(&raw_valid);
+        let oversized = UInt64Array::new(
+            ScalarBuffer::from(padded_values),
+            Some(NullBuffer::from(padded_valid)),
+        );
+        use arrow_array::Array;
+        let sliced = oversized.slice(SLICE_OFFSET, n);
+        debug_assert_eq!(
+            sliced.nulls().map(|n| n.offset()).unwrap_or(0) % 8,
+            SLICE_OFFSET
+        );
+        sliced
+    };
+
+    Fixture {
+        values,
+        mask_aligned,
+        mask_unaligned,
+        arrow_arr,
+        arrow_arr_unaligned,
+    }
+}
+
+const CAST_OPTS: CastOptions<'static> = CastOptions {
+    safe: true,
+    format_options: arrow_cast::display::FormatOptions::new(),
+};
+
+const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions {
+    safe: false,
+    format_options: arrow_cast::display::FormatOptions::new(),
+};
+
+/// Infallible `map_with_mask` kernel over the byte-aligned mask (`mask_aligned`).
+#[divan::bench(args = SIZES)]
+fn kernel_map_with_mask(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            // Safe `vec!` works because `MaybeUninit<u32>` is `Clone`; no `set_len` needed.
+            // Built outside the timed region; the kernel writes every lane before any read.
+            let out = vec![MaybeUninit::<u32>::uninit(); n];
+            (f.values.clone(), f.mask_aligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+                (v * valid as u64) as u32
+            });
+        });
+}
+
+/// Arrow baseline: `cast_with_options` to `UInt32` with `safe = true` (`CAST_OPTS`).
+#[divan::bench(args = SIZES)]
+fn arrow_cast(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| f.arrow_arr.clone())
+        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap());
+}
+
+/// As `arrow_cast`, but on `arrow_arr_unaligned` (null buffer at a non-byte-aligned offset).
+#[divan::bench(args = SIZES)]
+fn arrow_cast_unaligned(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| f.arrow_arr_unaligned.clone())
+        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap());
+}
+
+/// Fallible `try_map_with_mask` kernel (per-lane range check) over the aligned mask.
+#[divan::bench(args = SIZES)]
+fn kernel_try_map_with_mask(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            // Safe `vec!`: `MaybeUninit<u32>` is `Clone`, so no `unsafe { set_len }` is needed.
+            let out = vec![MaybeUninit::<u32>::uninit(); n];
+            (f.values.clone(), f.mask_aligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            })
+            .unwrap();
+        });
+}
+
+/// Same kernel, but the mask has `offset() == 5` so `BitChunks::iter()` must shift
+/// across byte boundaries on every chunk. Quantifies the cost of unaligned mask access.
+#[divan::bench(args = SIZES)]
+fn kernel_try_map_with_mask_unaligned(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            // Safe `vec!`: `MaybeUninit<u32>` is `Clone`, so no `unsafe { set_len }` is needed.
+            let out = vec![MaybeUninit::<u32>::uninit(); n];
+            (f.values.clone(), f.mask_unaligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            })
+            .unwrap();
+        });
+}
+
+/// Unaligned-mask counterpart of `kernel_map_with_mask` (infallible). Pair with that
+/// aligned variant to isolate the mask-iteration cost from the closure.
+#[divan::bench(args = SIZES)]
+fn kernel_map_with_mask_unaligned(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            // Safe `vec!`: `MaybeUninit<u32>` is `Clone`, so no `unsafe { set_len }` is needed.
+            let out = vec![MaybeUninit::<u32>::uninit(); n];
+            (f.values.clone(), f.mask_unaligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+                (v * valid as u64) as u32
+            });
+        });
+}
+
+/// Same as `kernel_try_map_with_mask` but with the branchful idiomatic form. Tests whether
+/// autovectorization survives a per-lane `if valid { ... } else { ... }` shape.
+#[divan::bench(args = SIZES)]
+fn kernel_try_from_branchful(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            // Safe `vec!`: `MaybeUninit<u32>` is `Clone`, so no `unsafe { set_len }` is needed.
+            let out = vec![MaybeUninit::<u32>::uninit(); n];
+            (f.values.clone(), f.mask_aligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+                if valid {
+                    u32::try_from(v).ok()
+                } else {
+                    Some(0_u32)
+                }
+            })
+            .unwrap();
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn iter_zip_checked(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| (f.values.clone(), f.mask_aligned.clone()))
+        .bench_refs(|(values, mask)| {
+            let buf: Buffer<u32> = BufferMut::try_from_trusted_len_iter(
+                values.iter().zip(mask.iter()).map(|(&v, valid)| {
+                    let scaled = v * valid as u64;
+                    if scaled <= u32::MAX as u64 {
+                        Ok(scaled as u32)
+                    } else {
+                        Err(())
+                    }
+                }),
+            )
+            .unwrap()
+            .freeze();
+            buf
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn iter_zip_checked_unaligned(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| (f.values.clone(), f.mask_unaligned.clone()))
+        .bench_refs(|(values, mask)| {
+            let buf: Buffer<u32> = BufferMut::try_from_trusted_len_iter(
+                values.iter().zip(mask.iter()).map(|(&v, valid)| {
+                    let scaled = v * valid as u64;
+                    if scaled <= u32::MAX as u64 {
+                        Ok(scaled as u32)
+                    } else {
+                        Err(())
+                    }
+                }),
+            )
+            .unwrap()
+            .freeze();
+            buf
+        });
+}
+
+/// Arrow baseline with `safe = false` (`CAST_OPTS_CHECKED`), which errors on overflow.
+#[divan::bench(args = SIZES)]
+fn arrow_cast_checked(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| f.arrow_arr.clone())
+        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
+}
+
+/// As `arrow_cast_checked`, but on `arrow_arr_unaligned` (non-byte-aligned null offset).
+#[divan::bench(args = SIZES)]
+fn arrow_cast_checked_unaligned(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| f.arrow_arr_unaligned.clone())
+        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
+}
diff --git a/vortex-buffer/src/lane_ops.rs b/vortex-buffer/src/lane_ops.rs
new file mode 100644
index 00000000000..b145633465b
--- /dev/null
+++ b/vortex-buffer/src/lane_ops.rs
@@ -0,0 +1,713 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Elementwise kernels that combine a `[T]` slice with a `BitBuffer` validity mask.
+//!
+//! The output is always a caller-provided `&mut` slice — these kernels never allocate.
+//! Both kernels handle a mask with a non-byte-aligned offset and with a logical `len`
+//! shorter than the underlying byte buffer, via [`BitBuffer::chunks`].
+
+use std::mem::MaybeUninit;
+
+use crate::BitBuffer;
+
+/// Apply `f(value, valid)` lane-by-lane, writing `out[i] = f(values[i], mask[i])`.
+///
+/// All three inputs must have the same length. The output type `R` may differ from the
+/// input type `T` — this kernel is the building block for both same-type transforms
+/// (fill_null) and cross-type ones (cast). The caller is responsible for marking `out`
+/// initialized (e.g. by calling `BufferMut::set_len` after this returns).
+///
+/// # Panics
+///
+/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
+#[inline]
+pub fn map_with_mask<T, R, F>(values: &[T], mask: &BitBuffer, out: &mut [MaybeUninit<R>], mut f: F)
+where
+    T: Copy,
+    F: FnMut(T, bool) -> R,
+{
+    let len = values.len();
+    assert_eq!(len, mask.len(), "values and mask must have the same length");
+    assert_eq!(out.len(), len, "out must have the same length as values");
+
+    let chunks = mask.chunks();
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+        let base = chunk_idx * 64;
+        // Inner loop is fixed-size 64 so the compiler can autovectorize
+        // for branchless closures like `|v, valid| v * (valid as T)`.
+        for bit_idx in 0..64 {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: chunks.iter() yields chunks_count full words, so i < chunks_count * 64 <= len.
+            let v = unsafe { *values.get_unchecked(i) };
+            unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
+        }
+    }
+
+    if remainder != 0 {
+        let src_chunk = chunks.remainder_bits();
+        let base = chunks_count * 64;
+        for bit_idx in 0..remainder {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i = chunks_count * 64 + bit_idx < chunks_count * 64 + remainder = len.
+            let v = unsafe { *values.get_unchecked(i) };
+            unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
+        }
+    }
+}
+
+/// Fallible variant of [`map_with_mask`]. `f` returns `Option<R>`; `None` indicates a
+/// per-lane failure (e.g. range overflow on a narrowing cast).
+///
+/// The kernel does not short-circuit on the first failure inside a chunk: it processes
+/// whole 64-lane chunks with `is_none()` flags OR-reduced into a single accumulator,
+/// then checks after each chunk. On failure, a cold scalar attribution pass replays the
+/// closure over that chunk to identify the first failing lane. The hot loop stays
+/// autovectorizable — the per-lane cost is one OR on top of the cast.
+///
+/// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` write
+/// `R::default()` into `out`, but the contents of `out` must not be relied upon when
+/// this function returns `Err`.
+///
+/// # Panics
+///
+/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
+#[inline]
+pub fn try_map_with_mask<T, R, F>(
+    values: &[T],
+    mask: &BitBuffer,
+    out: &mut [MaybeUninit<R>],
+    mut f: F,
+) -> Result<(), usize>
+where
+    T: Copy,
+    R: Copy + Default,
+    F: FnMut(T, bool) -> Option<R>,
+{
+    let len = values.len();
+    assert_eq!(len, mask.len(), "values and mask must have the same length");
+    assert_eq!(out.len(), len, "out must have the same length as values");
+
+    let chunks = mask.chunks();
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+        let base = chunk_idx * 64;
+        // Per-chunk accumulator — does not escape the SIMD inner loop.
+        let mut fail_acc: u64 = 0;
+        for bit_idx in 0..64 {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < chunks_count * 64 <= len.
+            let v = unsafe { *values.get_unchecked(i) };
+            let opt = f(v, bit);
+            fail_acc |= opt.is_none() as u64;
+            let r = opt.unwrap_or_default();
+            // SAFETY: i < len.
+            unsafe { out.get_unchecked_mut(i).write(r) };
+        }
+        if fail_acc != 0 {
+            return Err(attribute_failure(values, src_chunk, base, 64, &mut f));
+        }
+    }
+
+    if remainder != 0 {
+        let src_chunk = chunks.remainder_bits();
+        let base = chunks_count * 64;
+        let mut fail_acc: u64 = 0;
+        for bit_idx in 0..remainder {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < len.
+            let v = unsafe { *values.get_unchecked(i) };
+            let opt = f(v, bit);
+            fail_acc |= opt.is_none() as u64;
+            let r = opt.unwrap_or_default();
+            // SAFETY: i < len.
+            unsafe { out.get_unchecked_mut(i).write(r) };
+        }
+        if fail_acc != 0 {
+            return Err(attribute_failure(
+                values, src_chunk, base, remainder, &mut f,
+            ));
+        }
+    }
+
+    Ok(())
+}
+
+/// Cold path: identify the first lane in a chunk where `f` returned `None`.
+///
+/// Called only after the hot loop has detected that at least one lane failed.
+/// Walks the chunk scalar-style; not autovectorized, but that's fine — it only
+/// runs once per error and the error path is supposed to be exceptional.
+#[cold]
+#[inline(never)]
+fn attribute_failure<T, R, F>(
+    values: &[T],
+    src_chunk: u64,
+    base: usize,
+    chunk_len: usize,
+    f: &mut F,
+) -> usize
+where
+    T: Copy,
+    F: FnMut(T, bool) -> Option<R>,
+{
+    for bit_idx in 0..chunk_len {
+        let i = base + bit_idx;
+        let bit = (src_chunk >> bit_idx) & 1 == 1;
+        // SAFETY: caller guarantees i < values.len().
+        let v = unsafe { *values.get_unchecked(i) };
+        if f(v, bit).is_none() {
+            return i;
+        }
+    }
+    // Unreachable: hot loop's OR-reduction said at least one lane in [base, base+chunk_len) failed.
+    unreachable!("attribute_failure called without a failing lane")
+}
+
+/// Apply `f(value) -> bool` lane-by-lane, packing into `out` as `u64` words.
+///
+/// This is the validity-free sibling of [`map_with_mask_to_bits`]. Use it when the
+/// predicate is a pure function of the value (e.g. compare-to-constant on a primitive
+/// buffer) and combine the validity bitmap in a separate pass — splitting the work
+/// this way lets the value-compare loop autovectorize cleanly.
+///
+/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word
+/// beyond `len % 64` are written as `0`.
+///
+/// # Panics
+///
+/// Panics if `out.len() != values.len().div_ceil(64)`.
+#[inline]
+pub fn map_to_bits<T, F>(values: &[T], out: &mut [u64], mut f: F)
+where
+    T: Copy,
+    F: FnMut(T) -> bool,
+{
+    let len = values.len();
+    assert_eq!(
+        out.len(),
+        len.div_ceil(64),
+        "out must have len.div_ceil(64) words",
+    );
+
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for chunk_idx in 0..chunks_count {
+        let base = chunk_idx * 64;
+        let mut packed = 0u64;
+        for bit_idx in 0..64 {
+            // SAFETY: base + bit_idx < chunks_count * 64 <= len.
+            let v = unsafe { *values.get_unchecked(base + bit_idx) };
+            packed |= (f(v) as u64) << bit_idx;
+        }
+        // SAFETY: chunk_idx < chunks_count <= out.len().
+        unsafe { *out.get_unchecked_mut(chunk_idx) = packed };
+    }
+
+    if remainder != 0 {
+        let base = chunks_count * 64;
+        let mut packed = 0u64;
+        for bit_idx in 0..remainder {
+            // SAFETY: base + bit_idx < len.
+            let v = unsafe { *values.get_unchecked(base + bit_idx) };
+            packed |= (f(v) as u64) << bit_idx;
+        }
+        // SAFETY: chunks_count < out.len() because remainder != 0.
+        unsafe { *out.get_unchecked_mut(chunks_count) = packed };
+    }
+}
+
+/// Apply `f(value, valid) -> bool` lane-by-lane, packing into `out` as `u64` words.
+///
+/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word
+/// beyond `len % 64` are written as `0`.
+///
+/// # Panics
+///
+/// Panics if `values.len() != mask.len()` or `out.len() != values.len().div_ceil(64)`.
+#[inline]
+pub fn map_with_mask_to_bits<T, F>(values: &[T], mask: &BitBuffer, out: &mut [u64], mut f: F)
+where
+    T: Copy,
+    F: FnMut(T, bool) -> bool,
+{
+    let len = values.len();
+    assert_eq!(len, mask.len(), "values and mask must have the same length");
+    assert_eq!(
+        out.len(),
+        len.div_ceil(64),
+        "out must have len.div_ceil(64) words",
+    );
+
+    let chunks = mask.chunks();
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+        let base = chunk_idx * 64;
+        let mut packed = 0u64;
+        for bit_idx in 0..64 {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < chunks_count * 64 <= len.
+            let v = unsafe { *values.get_unchecked(i) };
+            packed |= (f(v, bit) as u64) << bit_idx;
+        }
+        // SAFETY: chunk_idx < chunks_count <= out.len().
+        unsafe { *out.get_unchecked_mut(chunk_idx) = packed };
+    }
+
+    if remainder != 0 {
+        let src_chunk = chunks.remainder_bits();
+        let base = chunks_count * 64;
+        let mut packed = 0u64;
+        for bit_idx in 0..remainder {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < len.
+            let v = unsafe { *values.get_unchecked(i) };
+            packed |= (f(v, bit) as u64) << bit_idx;
+        }
+        // SAFETY: chunks_count < out.len() because remainder != 0.
+        unsafe { *out.get_unchecked_mut(chunks_count) = packed };
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::BitBufferMut;
+
+    fn write_t<T: Copy>(out: Vec<MaybeUninit<T>>) -> Vec<T> {
+        // SAFETY: tests initialize every lane; `Vec` layout is unspecified, so no `transmute`.
+        out.into_iter().map(|lane| unsafe { lane.assume_init() }).collect()
+    }
+
+    #[test]
+    fn map_with_mask_aligned() {
+        let values: Vec<i32> = (0..10).collect();
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(10);
+            for i in 0..10 {
+                m.append(i % 2 == 0);
+            }
+            m.freeze()
+        };
+        let mut out = vec![MaybeUninit::<i32>::uninit(); 10];
+        map_with_mask(
+            &values,
+            &mask,
+            &mut out,
+            |v, valid| if valid { v } else { -1 },
+        );
+        assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]);
+    }
+
+    #[test]
+    fn map_with_mask_partial_chunk() {
+        // 130 lanes — two full u64 words + a 2-bit remainder.
+        let values: Vec<i32> = (0..130).collect();
+        let mask = BitBuffer::new_set(130);
+        let mut out = vec![MaybeUninit::<i32>::uninit(); 130];
+        map_with_mask(
+            &values,
+            &mask,
+            &mut out,
+            |v, valid| if valid { v + 1 } else { 0 },
+        );
+        let got = write_t(out);
+        assert_eq!(got.len(), 130);
+        assert_eq!(got[0], 1);
+        assert_eq!(got[63], 64);
+        assert_eq!(got[64], 65);
+        assert_eq!(got[129], 130);
+    }
+
+    #[test]
+    fn map_with_mask_offset_mask() {
+        // Build a 128-bit all-true mask, then slice off the first 5 bits to force offset=5.
+        let big = BitBuffer::new_set(128);
+        let sliced = big.slice(5..70); // logical len = 65, offset = 5
+        assert_eq!(sliced.len(), 65);
+        assert_eq!(sliced.offset(), 5);
+
+        let values: Vec<u32> = (0..65).collect();
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 65];
+        map_with_mask(
+            &values,
+            &sliced,
+            &mut out,
+            |v, valid| if valid { v } else { u32::MAX },
+        );
+        let got = write_t(out);
+        assert_eq!(got, (0..65).collect::<Vec<u32>>());
+    }
+
+    #[test]
+    fn map_with_mask_offset_past_word() {
+        // Slicing past a full word still works. `BitBuffer::slice` normalizes the
+        // logical offset to `offset % 8` and bumps the underlying byte pointer,
+        // so `offset()` won't equal 70 here — what we exercise is that the kernel
+        // walks the chunked u64 view (which BitChunks handles internally).
+        let big = BitBuffer::new_set(256);
+        let sliced = big.slice(70..200);
+        assert_eq!(sliced.len(), 130);
+
+        let values: Vec<i16> = (0..130).map(|i| i as i16).collect();
+        let mut out = vec![MaybeUninit::<i16>::uninit(); 130];
+        map_with_mask(
+            &values,
+            &sliced,
+            &mut out,
+            |v, valid| if valid { v } else { -1 },
+        );
+        let got = write_t(out);
+        assert_eq!(got, (0..130).map(|i| i as i16).collect::<Vec<_>>());
+    }
+
+    #[test]
+    fn map_with_mask_empty() {
+        let values: Vec<i32> = vec![];
+        let mask = BitBuffer::new_unset(0);
+        let mut out: Vec<MaybeUninit<i32>> = vec![];
+        map_with_mask(&values, &mask, &mut out, |v, _| v);
+    }
+
+    #[test]
+    fn map_with_mask_null_to_zero_branchless() {
+        // Trick from the `Mask::Values` arm of primitive/compute/cast.rs — multiply by valid as T.
+        let values: Vec<i64> = (1..=100).collect();
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(100);
+            for i in 0..100 {
+                m.append(i % 3 != 0);
+            }
+            m.freeze()
+        };
+        let mut out = vec![MaybeUninit::<i64>::uninit(); 100];
+        map_with_mask(&values, &mask, &mut out, |v, valid| v * (valid as i64));
+        let got = write_t(out);
+        for (i, &x) in got.iter().enumerate() {
+            if i % 3 == 0 {
+                assert_eq!(x, 0);
+            } else {
+                assert_eq!(x, (i + 1) as i64);
+            }
+        }
+    }
+
+    #[test]
+    fn map_with_mask_to_bits_aligned() {
+        let values: Vec<i32> = (0..128).collect();
+        let mask = BitBuffer::new_set(128);
+        let mut out = vec![0u64; 2];
+        map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v % 2 == 0);
+        // Even numbers in [0, 128) set, odd unset.
+        for word_idx in 0..2 {
+            let word = out[word_idx];
+            for bit in 0..64 {
+                let i = word_idx * 64 + bit;
+                let expected = i % 2 == 0;
+                assert_eq!((word >> bit) & 1 == 1, expected, "lane {i}");
+            }
+        }
+    }
+
+    #[test]
+    fn map_with_mask_to_bits_partial_chunk() {
+        // 130 lanes span three u64 words (64 + 64 + 2); the last word has only 2 lanes.
+        let values: Vec<i32> = (0..130).collect();
+        let mask = BitBuffer::new_set(130);
+        let mut out = vec![0u64; 130usize.div_ceil(64)];
+        assert_eq!(out.len(), 3);
+        map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v >= 64);
+        // Word 0 (lanes 0..64) is clear, word 1 (64..128) is full, word 2 holds lanes 128..130.
+        assert_eq!(out[0], 0);
+        assert_eq!(out[1], u64::MAX);
+        assert_eq!(out[2], 0b11); // bits past lane 129 must be clear
+    }
+
+    #[test]
+    fn map_with_mask_to_bits_offset() {
+        // An all-set mask sliced at bit 13, so it starts mid-byte (130 lanes long).
+        let mask = BitBuffer::new_set(256).slice(13..143);
+        assert_eq!(mask.len(), 130);
+        let values: Vec<u8> = (0..130).map(|i| (i % 4) as u8).collect();
+        let mut out = vec![0u64; 130usize.div_ceil(64)];
+        map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v == 0);
+        // Every fourth lane holds 0, so exactly those lanes must be set.
+        for lane in 0..130 {
+            let is_set = (out[lane / 64] >> (lane % 64)) & 1 == 1;
+            assert_eq!(is_set, lane % 4 == 0, "lane {lane}");
+        }
+    }
+
+    #[test]
+    fn try_map_with_mask_all_ok() {
+        let values: Vec<u64> = (0..200).collect(); // every value fits in u32
+        let mask = BitBuffer::new_set(200); // all lanes valid
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64; // null lanes become 0 before the range check
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert!(res.is_ok());
+        let got = write_t(out);
+        assert_eq!(got, (0..200u32).collect::<Vec<_>>());
+    }
+
+    #[test]
+    fn try_map_with_mask_overflow_fails() {
+        // Lane 137 holds an overflowing value — the kernel must report Err(137).
+        let mut values: Vec<u64> = (0..200).collect();
+        values[137] = (u32::MAX as u64) + 1; // smallest value that no longer fits in u32
+        let mask = BitBuffer::new_set(200); // all lanes valid, so lane 137 is checked
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert_eq!(res, Err(137));
+    }
+
+    #[test]
+    fn try_map_with_mask_overflow_reports_first_failing_lane() {
+        // Lanes 50, 51 and 137 all overflow — the error must carry the lowest index.
+        let mut values: Vec<u64> = (0..200).collect();
+        values[50] = u64::MAX;
+        values[51] = u64::MAX; // adjacent to the first failing lane
+        values[137] = u64::MAX; // a later failure that must not win
+        let mask = BitBuffer::new_set(200);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert_eq!(res, Err(50));
+    }
+
+    #[test]
+    fn try_map_with_mask_null_lane_bypasses_check() {
+        // The closure multiplies by `valid as u64` before the range check, so an
+        // out-of-range value at a null lane must NOT trigger failure.
+        let mut values: Vec<u64> = (0..200).collect();
+        values[5] = u64::MAX;
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(200);
+            for i in 0..200 {
+                m.append(i != 5); // only lane 5 is null
+            }
+            m.freeze()
+        };
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert!(res.is_ok());
+        let got = write_t(out);
+        assert_eq!(got[5], 0); // the null lane was written as 0 (u64::MAX * 0)
+        assert_eq!(got[6], 6); // its valid neighbour is untouched
+    }
+
+    #[test]
+    fn try_map_with_mask_branchful_matches_branchless() {
+        let mut values: Vec<u64> = (0..130).map(|i| i as u64 * 7).collect();
+        values[2] = u64::MAX; // overflows u32, but lane 2 is null in the mask below
+        values[65] = u32::MAX as u64; // largest value that still fits, on a valid lane
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(130);
+            for i in 0..130 {
+                m.append(!matches!(i, 2 | 17 | 99)); // lanes 2, 17 and 99 are null
+            }
+            m.freeze()
+        };
+
+        let mut branchless = vec![MaybeUninit::<u32>::uninit(); 130];
+        let mut branchful = vec![MaybeUninit::<u32>::uninit(); 130];
+        try_map_with_mask(&values, &mask, &mut branchless, |v, valid| {
+            let scaled = v * valid as u64; // branchless: zero the value on null lanes
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        })
+        .unwrap();
+        try_map_with_mask(&values, &mask, &mut branchful, |v, valid| {
+            if valid {
+                u32::try_from(v).ok()
+            } else {
+                Some(0) // branchful: null lanes yield 0 without looking at `v`
+            }
+        })
+        .unwrap();
+
+        assert_eq!(write_t(branchful), write_t(branchless));
+    }
+
+    #[test]
+    fn try_map_with_mask_partial_chunk() {
+        let values: Vec<u64> = (0..130).collect(); // 130 = 2 * 64 + 2 lanes
+        let mask = BitBuffer::new_set(130);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert!(res.is_ok());
+        let got = write_t(out);
+        assert_eq!(got.len(), 130);
+        assert_eq!(got[129], 129); // the very last lane was written
+    }
+
+    #[test]
+    fn try_map_with_mask_sliced_mask_unaligned_offset() {
+        // The mask does not start on a byte boundary: slicing off 13 bits leaves a bit
+        // offset of 5 within the first byte, so the underlying BitChunks iterator must
+        // shift across byte boundaries on every 64-bit chunk it yields.
+        let big = BitBuffer::new_set(256);
+        let mask = big.slice(13..143); // logical len = 130, bit offset = 13 % 8 = 5
+        assert_eq!(mask.len(), 130);
+
+        let values: Vec<u64> = (0..130).collect();
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert!(res.is_ok());
+        let got = write_t(out);
+        assert_eq!(got, (0..130u32).collect::<Vec<_>>());
+    }
+
+    #[test]
+    fn try_map_with_mask_sliced_mask_with_overflow() {
+        // Sliced mask + overflowing value — the reported lane index must be in the
+        // sliced (post-offset) coordinate space, not the unsliced buffer's.
+        let big = BitBuffer::new_set(256);
+        let mask = big.slice(13..143);
+        assert_eq!(mask.len(), 130);
+
+        let mut values: Vec<u64> = (0..130).collect();
+        values[77] = u64::MAX; // lane 77 of the slice, i.e. bit 13 + 77 = 90 of `big`
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert_eq!(res, Err(77));
+    }
+
+    #[test]
+    fn try_map_with_mask_sliced_mask_null_lanes() {
+        // Mix sliced offset with a non-trivial validity pattern. Null lanes must
+        // not cause a failure, even when their underlying value would overflow.
+        let mut m = BitBufferMut::with_capacity(256);
+        for i in 0..256 {
+            m.append(i % 3 != 0); // every third bit of the unsliced buffer is null
+        }
+        let big = m.freeze();
+        let mask = big.slice(13..143);
+        assert_eq!(mask.len(), 130);
+
+        // After the 13-lane slice, original index `13 + j` becomes lane `j`.
+        // Lane `j` is valid iff `(13 + j) % 3 != 0`.
+        let mut values: Vec<u64> = (0..130).collect();
+        // Pick a lane that is INVALID in the sliced coords: 13+2 = 15, 15 % 3 == 0 → invalid.
+        // Stuff in an overflowing value; it must be neutralized by `* valid as u64`.
+        values[2] = u64::MAX;
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert!(res.is_ok(), "null lane should bypass the range check");
+    }
+
+    #[test]
+    fn try_map_with_mask_overflow_in_remainder() {
+        // Overflow in the trailing partial chunk (lanes 128..130, not aligned to 64).
+        let mut values: Vec<u64> = (0..130).collect();
+        values[129] = (u32::MAX as u64) + 1; // last lane, just past the u32 range
+        let mask = BitBuffer::new_set(130);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert_eq!(res, Err(129));
+    }
+
+    #[test]
+    fn map_to_bits_aligned() {
+        // 128 lanes fill exactly two output words, one bit per lane.
+        let values: Vec<i32> = (0..128).collect();
+        let mut out = vec![0u64; 2];
+        map_to_bits(&values, &mut out, |v| v % 2 == 0);
+        for (word_idx, &word) in out.iter().enumerate() {
+            for bit in 0..64 {
+                let lane = word_idx * 64 + bit;
+                assert_eq!((word >> bit) & 1 == 1, lane % 2 == 0, "lane {lane}");
+            }
+        }
+    }
+
+    #[test]
+    fn map_to_bits_partial_chunk() {
+        let values: Vec<i32> = (0..130).collect(); // 130 = 2 * 64 + 2 lanes
+        let mut out = vec![0u64; 130usize.div_ceil(64)];
+        assert_eq!(out.len(), 3);
+        map_to_bits(&values, &mut out, |v| v >= 64);
+        assert_eq!(out[0], 0); // lanes 0..64 fail the predicate
+        assert_eq!(out[1], u64::MAX); // lanes 64..128 all pass
+        assert_eq!(out[2], 0b11); // lanes 128 and 129 pass; higher bits must be clear
+    }
+
+    #[test]
+    fn map_to_bits_empty() {
+        let values: Vec<i32> = vec![];
+        let mut out: Vec<u64> = vec![];
+        map_to_bits(&values, &mut out, |v| v > 0); // passes as long as this does not panic
+    }
+
+    #[test]
+    fn map_to_bits_matches_fused_with_all_valid_mask() {
+        // With an all-true mask `valid && p(v)` reduces to `p(v)`, so both kernels must agree.
+        let values: Vec<i64> = (0..200).map(|i| i % 7).collect();
+        let mask = BitBuffer::new_set(200);
+
+        let mut a = vec![0u64; 200usize.div_ceil(64)]; // fused kernel output
+        map_with_mask_to_bits(&values, &mask, &mut a, |v, valid| valid && v == 3);
+
+        let mut b = vec![0u64; 200usize.div_ceil(64)]; // maskless kernel output
+        map_to_bits(&values, &mut b, |v| v == 3);
+
+        assert_eq!(a, b);
+    }
+
+    #[test]
+    fn map_with_mask_to_bits_validity_kills_lane() {
+        // The predicate holds on every lane, so only validity decides the output:
+        // the 32 leading null lanes must come out false, the valid rest true.
+        let values: Vec<i32> = vec![1; 70];
+        let mut builder = BitBufferMut::with_capacity(70);
+        for lane in 0..70 {
+            builder.append(lane >= 32);
+        }
+        let mask = builder.freeze();
+        let mut out = vec![0u64; 70usize.div_ceil(64)];
+        map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v == 1);
+        // 70 lanes straddle two words, so index by word and by bit within it.
+        for lane in 0..70 {
+            let is_set = (out[lane / 64] >> (lane % 64)) & 1 == 1;
+            assert_eq!(is_set, lane >= 32, "lane {lane}");
+        }
+    }
+}
diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs
index 8319fffa387..592762d7a26 100644
--- a/vortex-buffer/src/lib.rs
+++ b/vortex-buffer/src/lib.rs
@@ -52,6 +52,7 @@ pub use buffer::*;
 pub use buffer_mut::*;
 pub use bytes::*;
 pub use r#const::*;
+pub use lane_ops::*;
 pub use string::*;
 mod alignment;
 #[cfg(feature = "arrow")]
@@ -62,6 +63,7 @@ mod buffer_mut;
 mod bytes;
 mod r#const;
 mod debug;
+mod lane_ops;
 mod macros;
 #[cfg(feature = "memmap2")]
 mod memmap2;

From 85ef2f8893f0479e971ac340366735aebcc7b709 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 10:44:19 +0100
Subject: [PATCH 02/21] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock                                    |    1 +
 vortex-array/benches/cast_primitive.rs        |   47 +
 .../src/arrays/primitive/compute/cast.rs      |   86 +-
 vortex-buffer/Cargo.toml                      |   21 +-
 vortex-buffer/benches/cast_to_indexed.rs      |  467 ++++++
 vortex-buffer/src/lane_ops_indexed.rs         | 1261 +++++++++++++++++
 vortex-buffer/src/lib.rs                      |    6 +
 7 files changed, 1837 insertions(+), 52 deletions(-)
 create mode 100644 vortex-buffer/benches/cast_to_indexed.rs
 create mode 100644 vortex-buffer/src/lane_ops_indexed.rs

diff --git a/Cargo.lock b/Cargo.lock
index 11afc6996a2..9bb032d0d35 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9355,6 +9355,7 @@ dependencies = [
 name = "vortex-buffer"
 version = "0.1.0"
 dependencies = [
+ "arrow-arith",
  "arrow-array",
  "arrow-buffer",
  "arrow-cast",
diff --git a/vortex-array/benches/cast_primitive.rs b/vortex-array/benches/cast_primitive.rs
index 86895fb2ce7..0b67571e93d 100644
--- a/vortex-array/benches/cast_primitive.rs
+++ b/vortex-array/benches/cast_primitive.rs
@@ -20,6 +20,10 @@ fn main() {
 
 const N: usize = 100_000;
 
+// Sizes used for the fallible-path benches below. Kept small enough to fit in L2 so
+// the kernel cost shows up clearly rather than being hidden by DRAM bandwidth.
+const SIZES: &[usize] = &[65_536];
+
 #[divan::bench]
 fn cast_u16_to_u32(bencher: Bencher) {
     let mut rng = StdRng::seed_from_u64(42);
@@ -46,3 +50,46 @@ fn cast_u16_to_u32(bencher: Bencher) {
             .execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())
     });
 }
+
+/// Narrowing fallible cast u32 → u8 over a nullable array (about 70% valid lanes).
+/// Inputs are sampled below `u8::MAX`, so every value fits and the cast never fails.
+#[divan::bench(args = SIZES)]
+fn cast_u32_to_u8(bencher: Bencher, n: usize) {
+    let mut rng = StdRng::seed_from_u64(42);
+    // `u32::from` widens the sampled `u8` losslessly, so no cast lint needs expecting.
+    let arr = PrimitiveArray::from_option_iter((0..n).map(|_| {
+        if rng.random_bool(0.7) {
+            Some(u32::from(rng.random_range(0..u8::MAX)))
+        } else {
+            None
+        }
+    }))
+    .into_array();
+    bencher.with_inputs(|| arr.clone()).bench_refs(|a| {
+        #[expect(clippy::unwrap_used)]
+        a.cast(DType::Primitive(PType::U8, Nullability::Nullable))
+            .unwrap()
+            .execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())
+    });
+}
+
+/// Sign-change cast i32 → u32. Values are non-negative so the kernel succeeds
+/// but still pays the per-lane checked `NumCast::from` conversion.
+#[divan::bench(args = SIZES)]
+fn cast_i32_to_u32(bencher: Bencher, n: usize) {
+    let mut rng = StdRng::seed_from_u64(42);
+    let arr = PrimitiveArray::from_option_iter((0..n).map(|_| {
+        if rng.random_bool(0.7) {
+            Some(rng.random_range(0..i32::MAX))
+        } else {
+            None
+        }
+    }))
+    .into_array();
+    bencher.with_inputs(|| arr.clone()).bench_refs(|a| {
+        #[expect(clippy::unwrap_used)]
+        a.cast(DType::Primitive(PType::U32, Nullability::Nullable))
+            .unwrap()
+            .execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())
+    });
+}
diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs
index bbe2f89322d..dd5abc2f164 100644
--- a/vortex-array/src/arrays/primitive/compute/cast.rs
+++ b/vortex-array/src/arrays/primitive/compute/cast.rs
@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-use num_traits::AsPrimitive;
 use num_traits::NumCast;
 use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
-use vortex_buffer::try_map_with_mask;
+use vortex_buffer::lane_ops_indexed::try_map_no_validity;
+use vortex_buffer::lane_ops_indexed::try_map_with_mask;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
 use vortex_error::vortex_err;
@@ -103,32 +103,28 @@ impl CastKernel for Primitive {
     }
 }
 
-/// Cast values from `F` to `T`. For infallible casts this is a pure pass; for fallible casts
-/// each valid value goes through a checked `NumCast::from` and the kernel bails if any of them
-/// overflow `T`. Invalid positions use the wrapping `as` cast since their values are masked out.
+/// Cast values from `F` to `T`. Always routes through the fallible lane-op kernels with
+/// `NumCast::from`. The kernel branches once on the mask shape:
+///
+/// - `Mask::AllTrue`  → [`try_map_no_validity`] — no per-lane validity work.
+/// - `Mask::AllFalse` → bulk zero — the closure is never invoked.
+/// - `Mask::Values`   → [`try_map_with_mask`] — the closure only consults `valid`
+///   when `NumCast::from` fails, mapping a failing null lane to zero so
+///   out-of-range null-lane values don't trigger spurious errors.
+///
+/// For statically-infallible casts (e.g. widening) LLVM proves `NumCast::from` always
+/// returns `Some` and strips the fail-tracking machinery, generating the same bare
+/// `ushll` widen loop the old hand-written `as_()` fast path produced.
 fn cast_values<F, T>(
     array: ArrayView<'_, Primitive>,
     new_validity: Validity,
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<ArrayRef>
 where
-    F: NativePType + AsPrimitive<T>,
+    F: NativePType,
     T: NativePType,
 {
     let values = array.as_slice::<F>();
-
-    // Fast path: statically infallible, or cached min/max prove every valid value fits in `T`.
-    // The cached check never triggers a stats computation — if the bounds aren't already known
-    // we fall through to the per-lane loop below.
-    if values_always_fit(F::PTYPE, T::PTYPE) || values_fit_in(array, T::PTYPE, ctx, false) {
-        return Ok(PrimitiveArray::new(cast::<F, T>(values), new_validity).into_array());
-    }
-
-    // TODO(joe): if the values source and target have the same bit-width we can
-    // mutate in place.
-
-    // Fallible: invalid lanes are pre-multiplied to zero so the checked cast always succeeds for
-    // them; valid lanes go through `NumCast::from` and the whole cast bails on the first overflow.
     let mask = array.validity()?.execute_mask(array.len(), ctx)?;
     let overflow = || {
         vortex_err!(
@@ -136,13 +132,20 @@ where
             F::PTYPE, T::PTYPE,
         )
     };
+
     let buffer: Buffer<T> = match &mask {
-        Mask::AllTrue(_) => BufferMut::try_from_trusted_len_iter(
-            values
-                .iter()
-                .map(|&v| <T as NumCast>::from(v).ok_or_else(overflow)),
-        )?
-        .freeze(),
+        Mask::AllTrue(_) => {
+            let mut buffer = BufferMut::<T>::with_capacity(values.len());
+            try_map_no_validity(
+                values,
+                &mut buffer.spare_capacity_mut()[..values.len()],
+                |v| <T as NumCast>::from(v),
+            )
+            .map_err(|_| overflow())?;
+            // SAFETY: try_map_no_validity returned Ok, so it initialized every lane.
+            unsafe { buffer.set_len(values.len()) };
+            buffer.freeze()
+        }
         Mask::AllFalse(_) => BufferMut::<T>::zeroed(values.len()).freeze(),
         Mask::Values(m) => {
             let mut buffer = BufferMut::<T>::with_capacity(values.len());
@@ -150,9 +153,15 @@ where
                 values,
                 m.bit_buffer(),
                 &mut buffer.spare_capacity_mut()[..values.len()],
+                // Lazy validity: only consult `valid` on the failure branch. For
+                // widening / statically-infallible casts, `NumCast::from` is always
+                // `Some` so the `or_else` is provably dead — LLVM DCEs the validity
+                // path entirely, giving the same codegen as the maskless kernel.
+                // For narrowing, `valid` is only read at lanes that actually
+                // overflowed (a cold check on top of the cast).
                 |v, valid| {
-                    let factor = if valid { F::one() } else { F::zero() };
-                    <T as NumCast>::from(v * factor)
+                    <T as NumCast>::from(v)
+                        .or_else(|| (!valid).then(T::zero))
                 },
             )
             .map_err(|_| overflow())?;
@@ -165,12 +174,6 @@ where
     Ok(PrimitiveArray::new(buffer, new_validity).into_array())
 }
 
-/// Out-of-range values at invalid positions are truncated/wrapped by `as`, which is fine because
-/// they are masked out by validity.
-fn cast<F: NativePType + AsPrimitive<T>, T: NativePType>(array: &[F]) -> Buffer<T> {
-    BufferMut::from_trusted_len_iter(array.iter().map(|&src| src.as_())).freeze()
-}
-
 fn reinterpret(
     array: ArrayView<'_, Primitive>,
     new_ptype: PType,
@@ -188,23 +191,6 @@ fn reinterpret(
     .into_array()
 }
 
-/// Returns `true` if every value of `src` is guaranteed representable in `target` without
-/// overflow. Precision may be lost (e.g. large integers cast to `f32`), but the cast can never
-/// produce an out-of-range result.
-fn values_always_fit(src: PType, target: PType) -> bool {
-    if src == target {
-        return true;
-    }
-    if src.is_int() && target.is_int() {
-        return target.byte_width() > src.byte_width()
-            && (src.is_unsigned_int() || target.is_signed_int());
-    }
-    if src.is_float() && target.is_float() {
-        return target.byte_width() > src.byte_width();
-    }
-    src.is_int() && matches!(target, PType::F32 | PType::F64)
-}
-
 /// Returns `true` if all valid values in `array` are representable as `target_ptype`.
 ///
 /// Cached min/max statistics are consulted first. If either bound is missing, the function either
diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml
index 850aec4ec19..42c882004bd 100644
--- a/vortex-buffer/Cargo.toml
+++ b/vortex-buffer/Cargo.toml
@@ -37,8 +37,9 @@ vortex-error = { workspace = true }
 workspace = true
 
 [dev-dependencies]
-# TEMP: arrow-{array,cast,schema} are only used by the cast_to bench for cross-impl
-# performance comparisons. Drop them when the bench is removed.
+# TEMP: arrow-* are only used by the cast_to / cast_to_indexed / add_checked benches
+# for cross-impl performance comparisons. Drop them when the benches are removed.
+arrow-arith = { workspace = true }
 arrow-array = { workspace = true }
 arrow-cast = { workspace = true }
 arrow-schema = { workspace = true }
@@ -58,3 +59,19 @@ harness = false
 [[bench]]
 name = "cast_to"
 harness = false
+
+[[bench]]
+name = "cast_to_indexed"
+harness = false
+
+[[bench]]
+name = "cast_iter_all"
+harness = false
+
+[[bench]]
+name = "cast_in_place"
+harness = false
+
+[[bench]]
+name = "add_checked"
+harness = false
diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs
new file mode 100644
index 00000000000..b2abe29b890
--- /dev/null
+++ b/vortex-buffer/benches/cast_to_indexed.rs
@@ -0,0 +1,467 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Mirror of `cast_to.rs` driving the kernels through [`vortex_buffer::lane_ops_indexed`]
+//! (the `IndexedSource` trait) plus isolation benches that decompose the cost of the
+//! kernel structure vs. the cast vs. the mask access.
+//!
+//! See `vortex-buffer/HISTORY.md` for the iterator-API investigation that motivated
+//! this design: a stateful `ExactSizeIterator` variant of these kernels was ~+100%
+//! slower because per-lane `next()` calls create a 64-deep dependency chain across
+//! iterations that blocks vectorization. The `IndexedSource` trait uses
+//! `unsafe fn get_unchecked(i)` reads — independent across iterations — and inlines
+//! to the same indexed load as the slice kernel.
+
+#![expect(clippy::unwrap_used)]
+
+use std::mem::MaybeUninit;
+
+use arrow_array::UInt64Array;
+use arrow_buffer::NullBuffer;
+use arrow_buffer::ScalarBuffer;
+use arrow_cast::CastOptions;
+use arrow_cast::cast_with_options;
+use arrow_schema::DataType;
+use divan::Bencher;
+use rand::SeedableRng;
+use rand::prelude::*;
+use vortex_buffer::BitBuffer;
+use vortex_buffer::BitBufferMut;
+use vortex_buffer::Buffer;
+use vortex_buffer::BufferMut;
+use vortex_buffer::lane_ops_indexed::map_with_mask as indexed_map_with_mask;
+use vortex_buffer::lane_ops_indexed::try_map_validity_filtered as indexed_try_map_validity_filtered;
+use vortex_buffer::lane_ops_indexed::try_map_with_mask as indexed_try_map_with_mask;
+
+fn main() {
+    divan::main(); // run the `#[divan::bench]` functions registered in this bench binary
+}
+
+const SIZES: &[usize] = &[4_096, 65_536, 1_048_576];
+const VALID_RATE: f64 = 0.7;
+const DATA_SEED: u64 = 0;
+const VALID_SEED: u64 = 1;
+
+// Non-byte-aligned bit offset → forces BitChunks::iter() to shift across byte
+// boundaries on every chunk it yields.
+const SLICE_OFFSET: usize = 5;
+
+struct Fixture {
+    values: Buffer<u64>, // input values shared by every bench variant
+    /// `offset() == 0`, underlying byte buffer starts on a byte boundary.
+    mask_aligned: BitBuffer,
+    /// Same validity bits but sliced so `offset() == SLICE_OFFSET`.
+    mask_unaligned: BitBuffer,
+    arrow_arr: UInt64Array, // Arrow array used by the `arrow_cast*` benches
+    /// Same as `arrow_arr` but its NullBuffer has a non-byte-aligned bit offset,
+    /// constructed by building an oversized array and slicing.
+    arrow_arr_unaligned: UInt64Array,
+}
+
+fn fixture(n: usize) -> Fixture { // builds fixed-seed inputs of length `n`
+    let mut data_rng = StdRng::seed_from_u64(DATA_SEED);
+    let mut valid_rng = StdRng::seed_from_u64(VALID_SEED);
+    let raw_values: Vec<u64> = (0..n)
+        .map(|_| data_rng.random_range(0..u32::MAX as u64)) // every value fits in u32
+        .collect();
+    let raw_valid: Vec<bool> = (0..n).map(|_| valid_rng.random_bool(VALID_RATE)).collect();
+
+    let values: Buffer<u64> = raw_values.iter().copied().collect();
+
+    let mask_aligned = {
+        let mut m = BitBufferMut::with_capacity(n);
+        for &v in &raw_valid {
+            m.append(v);
+        }
+        m.freeze()
+    };
+
+    let mask_unaligned = {
+        let mut m = BitBufferMut::with_capacity(n + SLICE_OFFSET);
+        for _ in 0..SLICE_OFFSET {
+            m.append(false); // padding bits, sliced away below
+        }
+        for &v in &raw_valid {
+            m.append(v);
+        }
+        m.freeze().slice(SLICE_OFFSET..SLICE_OFFSET + n)
+    };
+    debug_assert_eq!(mask_unaligned.offset(), SLICE_OFFSET);
+    debug_assert_eq!(mask_unaligned.len(), n); // only checked when debug assertions are on
+
+    let arrow_arr = UInt64Array::new(
+        ScalarBuffer::from(raw_values.clone()),
+        Some(NullBuffer::from(raw_valid.clone())),
+    );
+
+    let arrow_arr_unaligned = {
+        let mut padded_values: Vec<u64> = vec![0; SLICE_OFFSET]; // leading padding lanes
+        padded_values.extend_from_slice(&raw_values);
+        let mut padded_valid: Vec<bool> = vec![false; SLICE_OFFSET];
+        padded_valid.extend_from_slice(&raw_valid);
+        let oversized = UInt64Array::new(
+            ScalarBuffer::from(padded_values),
+            Some(NullBuffer::from(padded_valid)),
+        );
+        use arrow_array::Array; // trait import for the `Array` methods used below
+        let sliced = oversized.slice(SLICE_OFFSET, n); // drop the padding again
+        debug_assert_eq!(
+            sliced.nulls().map(|n| n.offset()).unwrap_or(0) % 8,
+            SLICE_OFFSET
+        );
+        sliced
+    };
+
+    Fixture {
+        values,
+        mask_aligned,
+        mask_unaligned,
+        arrow_arr,
+        arrow_arr_unaligned,
+    }
+}
+
+const CAST_OPTS: CastOptions<'static> = CastOptions {
+    safe: true,
+    format_options: arrow_cast::display::FormatOptions::new(),
+};
+
+const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions {
+    safe: false,
+    format_options: arrow_cast::display::FormatOptions::new(),
+};
+
+#[divan::bench(args = SIZES)]
+fn arrow_cast(bencher: Bencher, n: usize) {
+    // Arrow baseline using `CAST_OPTS` (`safe: true`); the fixture is built before timing.
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| f.arrow_arr.clone())
+        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap());
+}
+
+#[divan::bench(args = SIZES)]
+fn arrow_cast_unaligned(bencher: Bencher, n: usize) {
+    // Arrow baseline (`safe: true`) on the array whose null buffer has an unaligned offset.
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| f.arrow_arr_unaligned.clone())
+        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap());
+}
+
+#[divan::bench(args = SIZES)]
+fn iter_zip_checked(bencher: Bencher, n: usize) {
+    let f = fixture(n); // built once, before the timed closure
+    bencher
+        .with_inputs(|| (f.values.clone(), f.mask_aligned.clone()))
+        .bench_refs(|(values, mask)| {
+            let buf: Buffer<u32> = BufferMut::try_from_trusted_len_iter(
+                values.iter().zip(mask.iter()).map(|(&v, valid)| {
+                    let scaled = v * valid as u64; // zero out null lanes before the check
+                    if scaled <= u32::MAX as u64 {
+                        Ok(scaled as u32)
+                    } else {
+                        Err(())
+                    }
+                }),
+            )
+            .unwrap()
+            .freeze();
+            buf
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn iter_zip_checked_unaligned(bencher: Bencher, n: usize) {
+    let f = fixture(n); // built once, before the timed closure
+    bencher
+        .with_inputs(|| (f.values.clone(), f.mask_unaligned.clone())) // sliced-offset mask
+        .bench_refs(|(values, mask)| {
+            let buf: Buffer<u32> = BufferMut::try_from_trusted_len_iter(
+                values.iter().zip(mask.iter()).map(|(&v, valid)| {
+                    let scaled = v * valid as u64; // zero out null lanes before the check
+                    if scaled <= u32::MAX as u64 {
+                        Ok(scaled as u32)
+                    } else {
+                        Err(())
+                    }
+                }),
+            )
+            .unwrap()
+            .freeze();
+            buf
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn arrow_cast_checked(bencher: Bencher, n: usize) {
+    // Arrow baseline using `CAST_OPTS_CHECKED` (`safe: false`); fixture built before timing.
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| f.arrow_arr.clone())
+        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
+}
+
+#[divan::bench(args = SIZES)]
+fn arrow_cast_checked_unaligned(bencher: Bencher, n: usize) {
+    // Checked Arrow cast (`safe: false`) on the array with an unaligned null-buffer offset.
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| f.arrow_arr_unaligned.clone())
+        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
+}
+
+// -----------------------------------------------------------------------------
+// Isolation benches: drop the mask, isolate the cast u64 -> u32 to see whether
+// the iterator cost is intrinsic or comes from the surrounding kernel structure.
+// -----------------------------------------------------------------------------
+
+/// Plain slice indexing, no mask. Baseline that the iterator variants are compared against.
+#[divan::bench(args = SIZES)]
+fn iso_slice_cast(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            unsafe { out.set_len(n) }; // SAFETY: capacity is n; MaybeUninit needs no init.
+            (f.values.clone(), out)
+        })
+        .bench_refs(|(values, out)| {
+            let v = values.as_slice();
+            let o = out.as_mut_slice();
+            assert_eq!(v.len(), o.len());
+            for i in 0..v.len() {
+                // SAFETY: `i < v.len()`, and the assert above guarantees `o.len() == v.len()`.
+                unsafe { o.get_unchecked_mut(i).write(*v.get_unchecked(i) as u32) };
+            }
+        });
+}
+
+/// Per-lane iterator zip, no mask. Tests whether `slice::Iter::next` autovectorizes
+/// when nothing else is in the way.
+#[divan::bench(args = SIZES)]
+fn iso_iter_cast(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            unsafe { out.set_len(n) }; // SAFETY: capacity is n; MaybeUninit needs no init.
+            (f.values.clone(), out)
+        })
+        .bench_refs(|(values, out)| {
+            for (slot, &v) in out.iter_mut().zip(values.iter()) {
+                slot.write(v as u32); // truncating `as` cast: this bench does no range check
+            }
+        });
+}
+
+/// `chunks_exact(64)` + `try_into::<&[u64; 64]>` so the outer iter advances once per
+/// 64 lanes and the inner loop indexes a fixed-size array. Tests whether moving the
+/// iterator state from per-lane to per-chunk fixes vectorization.
+#[divan::bench(args = SIZES)]
+fn iso_iter_chunks_64(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            unsafe { out.set_len(n) }; // SAFETY: capacity is n; MaybeUninit needs no init.
+            (f.values.clone(), out)
+        })
+        .bench_refs(|(values, out)| {
+            let v = values.as_slice();
+            let o = out.as_mut_slice();
+            assert_eq!(v.len(), o.len());
+            for (v_chunk, o_chunk) in v.chunks_exact(64).zip(o.chunks_exact_mut(64)) {
+                let v_arr: &[u64; 64] = v_chunk.try_into().unwrap(); // exact chunks are 64 long
+                let o_arr: &mut [MaybeUninit<u32>; 64] = o_chunk.try_into().unwrap();
+                for bit_idx in 0..64 {
+                    o_arr[bit_idx].write(v_arr[bit_idx] as u32);
+                }
+            }
+            // Ignore the tail — SIZES are all multiples of 64.
+        });
+}
+
+// -----------------------------------------------------------------------------
+// Indexed-source variant (lane_ops_indexed). The kernel takes an `IndexedSource` whose
+// `&[T]` impl is `unsafe fn get_unchecked(i) -> T` — same indexed load as the slice
+// kernel, but the trait also supports binary inputs via `LaneZip`.
+// -----------------------------------------------------------------------------
+
+/// Infallible masked map through `indexed_map_with_mask`, using the fixture's
+/// `mask_aligned` mask.
+///
+/// The closure multiplies each value by its validity bit (so invalid lanes become
+/// `0`) and then narrows the product with an `as u32` cast.
+#[divan::bench(args = SIZES)]
+fn indexed_kernel_map_with_mask(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            // SAFETY: capacity is at least `n`, and `MaybeUninit<u32>` elements need no
+            // initialization to be valid.
+            unsafe { out.set_len(n) };
+            (f.values.clone(), f.mask_aligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            indexed_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+                (v * valid as u64) as u32
+            });
+        });
+}
+
+/// Same measurement as `indexed_kernel_map_with_mask`, but fed the fixture's
+/// `mask_unaligned` mask (presumably one whose bit offset is not byte-aligned —
+/// confirm against `fixture`).
+///
+/// The closure multiplies each value by its validity bit and narrows the product
+/// with an `as u32` cast.
+#[divan::bench(args = SIZES)]
+fn indexed_kernel_map_with_mask_unaligned(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            // SAFETY: capacity is at least `n`, and `MaybeUninit<u32>` elements need no
+            // initialization to be valid.
+            unsafe { out.set_len(n) };
+            (f.values.clone(), f.mask_unaligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            indexed_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+                (v * valid as u64) as u32
+            });
+        });
+}
+
+/// Fallible masked map through `indexed_try_map_with_mask`, using the fixture's
+/// `mask_aligned` mask.
+///
+/// The closure zeroes invalid lanes by multiplying with the validity bit, then
+/// returns `None` when the result does not fit in `u32`. The trailing `unwrap`
+/// panics if the kernel reports a failing lane.
+#[divan::bench(args = SIZES)]
+fn indexed_kernel_try_map_with_mask(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            // SAFETY: capacity is at least `n`, and `MaybeUninit<u32>` elements need no
+            // initialization to be valid.
+            unsafe { out.set_len(n) };
+            (f.values.clone(), f.mask_aligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            indexed_try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+                // Branch-free range check: `scaled` is `v` for valid lanes, `0` otherwise.
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            })
+            .unwrap();
+        });
+}
+
+/// Same measurement as `indexed_kernel_try_map_with_mask`, but fed the fixture's
+/// `mask_unaligned` mask (presumably one whose bit offset is not byte-aligned —
+/// confirm against `fixture`).
+///
+/// The closure zeroes invalid lanes by multiplying with the validity bit, then
+/// returns `None` when the result does not fit in `u32`. The trailing `unwrap`
+/// panics if the kernel reports a failing lane.
+#[divan::bench(args = SIZES)]
+fn indexed_kernel_try_map_with_mask_unaligned(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            // SAFETY: capacity is at least `n`, and `MaybeUninit<u32>` elements need no
+            // initialization to be valid.
+            unsafe { out.set_len(n) };
+            (f.values.clone(), f.mask_unaligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            indexed_try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+                // Branch-free range check: `scaled` is `v` for valid lanes, `0` otherwise.
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            })
+            .unwrap();
+        });
+}
+
+/// Fallible masked map through `indexed_try_map_with_mask` with a branching closure.
+///
+/// Valid lanes go through `u32::try_from`; invalid lanes short-circuit to `Some(0)`.
+/// This is the `if valid { .. } else { .. }` counterpart of the multiply-by-validity
+/// closure used in `indexed_kernel_try_map_with_mask`. The trailing `unwrap` panics
+/// if the kernel reports a failing lane.
+#[divan::bench(args = SIZES)]
+fn indexed_kernel_try_from_branchful(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            // SAFETY: capacity is at least `n`, and `MaybeUninit<u32>` elements need no
+            // initialization to be valid.
+            unsafe { out.set_len(n) };
+            (f.values.clone(), f.mask_aligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            indexed_try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+                if valid {
+                    u32::try_from(v).ok()
+                } else {
+                    Some(0_u32)
+                }
+            })
+            .unwrap();
+        });
+}
+
+// -----------------------------------------------------------------------------
+// Decoupled-design variant with CORRECT validity semantics: closure is `|v|`
+// (no per-lane mask threading), but the mask filters out null-lane failures at
+// the chunk boundary. A null row whose stored value would overflow does NOT
+// cause Err — this matches the existing `try_map_with_mask` semantics while
+// keeping the lighter inner loop.
+// -----------------------------------------------------------------------------
+
+/// Value-only fallible map through `indexed_try_map_validity_filtered`, using the
+/// fixture's `mask_aligned` mask.
+///
+/// The closure never sees the validity bit: it returns `None` whenever the value
+/// does not fit in `u32`. The mask is handed to the kernel separately. The trailing
+/// `unwrap` panics if the kernel reports a failing lane.
+#[divan::bench(args = SIZES)]
+fn indexed_decoupled_kernel_try_map_with_mask(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            // SAFETY: capacity is at least `n` and `MaybeUninit<u32>` needs no
+            // initialization; every lane is written before any read inside the kernel.
+            unsafe { out.set_len(n) };
+            (f.values.clone(), f.mask_aligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            indexed_try_map_validity_filtered(values.as_slice(), mask, out.as_mut_slice(), |v| {
+                (v <= u32::MAX as u64).then_some(v as u32)
+            })
+            .unwrap();
+        });
+}
+
+/// Value-only fallible map through `indexed_try_map_validity_filtered`, with
+/// `u32::try_from` as the per-lane conversion.
+///
+/// Identical setup to `indexed_decoupled_kernel_try_map_with_mask`; only the closure
+/// body differs (`try_from(..).ok()` instead of an explicit range compare). The
+/// trailing `unwrap` panics if the kernel reports a failing lane.
+#[divan::bench(args = SIZES)]
+fn indexed_decoupled_kernel_try_from_branchful(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            // SAFETY: capacity is at least `n`, and `MaybeUninit<u32>` elements need no
+            // initialization to be valid.
+            unsafe { out.set_len(n) };
+            (f.values.clone(), f.mask_aligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            indexed_try_map_validity_filtered(values.as_slice(), mask, out.as_mut_slice(), |v| {
+                u32::try_from(v).ok()
+            })
+            .unwrap();
+        });
+}
+
+/// Full checked-cast kernel using `chunks_exact(64)` + fixed-size array refs, with
+/// the mask. If this matches the slice kernel, the cost is in the per-lane iterator
+/// state, not the iter pattern in general.
+///
+/// Each 64-lane value chunk is paired with one 64-bit word from `mask.chunks()`.
+/// Invalid lanes are zeroed by multiplying with the validity bit, the result is
+/// range-checked against `u32::MAX`, and failures are OR-ed into `fail_acc`, which
+/// is asserted to be zero after every chunk.
+#[divan::bench(args = SIZES)]
+fn kernel_iter_chunks_64(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            // SAFETY: capacity is at least `n`, and `MaybeUninit<u32>` elements need no
+            // initialization to be valid.
+            unsafe { out.set_len(n) };
+            (f.values.clone(), f.mask_aligned.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            let v = values.as_slice();
+            let o = out.as_mut_slice();
+            let len = v.len();
+            assert_eq!(len, mask.len());
+            assert_eq!(len, o.len());
+
+            let chunks = mask.chunks();
+            let chunks_count = len / 64;
+            let full = chunks_count * 64;
+            // Only the whole-chunk prefix is processed; the remainders are deliberately unused.
+            let (v_full, _v_rem) = v.split_at(full);
+            let (o_full, _o_rem) = o.split_at_mut(full);
+
+            for ((v_chunk, o_chunk), src_chunk) in v_full
+                .chunks_exact(64)
+                .zip(o_full.chunks_exact_mut(64))
+                .zip(chunks.iter())
+            {
+                // `chunks_exact(64)` only yields slices of exactly 64 elements, so the
+                // conversions to fixed-size array references cannot fail.
+                let v_arr: &[u64; 64] = v_chunk.try_into().unwrap();
+                let o_arr: &mut [MaybeUninit<u32>; 64] = o_chunk.try_into().unwrap();
+                let mut fail_acc: u64 = 0;
+                for bit_idx in 0..64 {
+                    // Validity of this lane: bit `bit_idx` of the mask word.
+                    let bit = (src_chunk >> bit_idx) & 1 == 1;
+                    let scaled = v_arr[bit_idx] * bit as u64;
+                    let opt = (scaled <= u32::MAX as u64).then_some(scaled as u32);
+                    fail_acc |= opt.is_none() as u64;
+                    o_arr[bit_idx].write(opt.unwrap_or_default());
+                }
+                assert_eq!(fail_acc, 0);
+            }
+            // Ignore the tail — SIZES are all multiples of 64.
+        });
+}
diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs
new file mode 100644
index 00000000000..c83114d8bcd
--- /dev/null
+++ b/vortex-buffer/src/lane_ops_indexed.rs
@@ -0,0 +1,1261 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Indexed-source variant of [`crate::lane_ops`].
+//!
+//! Replaces `&[T]` with an [`IndexedSource`] trait: each lane read is
+//! `unsafe fn get_unchecked(i) -> Item`, independent across iterations. For `&[T]`
+//! this inlines to the same indexed load as the slice kernel; for `LaneZip(&[A], &[B])`
+//! it gives two independent indexed reads per lane — both shapes the auto-vectorizer
+//! handles.
+//!
+//! See `vortex-buffer/HISTORY.md` for the iterator-API investigation that motivated
+//! this design.
+//!
+//! The output is always a caller-provided `&mut` slice — these kernels never allocate.
+//! Every kernel that takes a mask handles one with a non-byte-aligned offset and with a
+//! logical `len` shorter than the underlying byte buffer, via [`BitBuffer::chunks`].
+
+use std::mem::MaybeUninit;
+
+use crate::BitBuffer;
+
+/// A length-known source supporting unchecked indexed reads.
+///
+/// Implemented for `&[T]` and `&mut [T]` (with `T: Copy`) and for [`LaneZip`] over two
+/// `IndexedSource`s. The kernels in this module require this trait instead of `Iterator`
+/// so that lane reads carry no inter-iteration data dependency — the autovectorizer
+/// treats each lane independently.
+///
+/// Implementors: the kernels in this module use [`len`](IndexedSource::len) as the only
+/// bound for their [`get_unchecked`](IndexedSource::get_unchecked) calls, so `len` must
+/// never report more lanes than can actually be read.
+pub trait IndexedSource {
+    /// The per-lane item type. Must be `Copy` so the kernels can pass it through
+    /// the closure by value without extra moves.
+    type Item: Copy;
+    /// Logical lane count.
+    fn len(&self) -> usize;
+    /// Returns true when there are no lanes.
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+    /// Read the lane at `i` without bounds checking.
+    ///
+    /// # Safety
+    ///
+    /// `i` must be strictly less than `self.len()`.
+    unsafe fn get_unchecked(&self, i: usize) -> Self::Item;
+}
+
+impl<T: Copy> IndexedSource for &[T] {
+    type Item = T;
+
+    /// Number of elements in the borrowed slice.
+    #[inline]
+    fn len(&self) -> usize {
+        // Reborrow as a plain slice so the inherent `len` is called, not this trait method.
+        let slice: &[T] = self;
+        slice.len()
+    }
+
+    /// Copies element `i` out of the slice without a bounds check.
+    #[inline]
+    unsafe fn get_unchecked(&self, i: usize) -> T {
+        let slice: &[T] = self;
+        // SAFETY: the trait contract requires `i < self.len()`, which is the slice length.
+        let element = unsafe { slice.get_unchecked(i) };
+        *element
+    }
+}
+
+impl<T: Copy> IndexedSource for &mut [T] {
+    type Item = T;
+
+    /// Number of elements in the mutably borrowed slice.
+    #[inline]
+    fn len(&self) -> usize {
+        // Reborrow as a shared slice so the inherent `len` is called, not this trait method.
+        let slice: &[T] = self;
+        slice.len()
+    }
+
+    /// Copies element `i` out of the slice without a bounds check.
+    #[inline]
+    unsafe fn get_unchecked(&self, i: usize) -> T {
+        let slice: &[T] = self;
+        // SAFETY: the trait contract requires `i < self.len()`, which is the slice length.
+        let element = unsafe { slice.get_unchecked(i) };
+        *element
+    }
+}
+
+/// An [`IndexedSource`] that also supports unchecked indexed writes — the binding
+/// for in-place kernels.
+///
+/// The `*_in_place` kernels in this module read lane `i` through
+/// [`IndexedSource::get_unchecked`] and store the closure's result back through
+/// [`IndexedSink::set_unchecked`] at the same index.
+///
+/// Implemented for `&mut [T]`; not implemented for [`LaneZip`] (you can't write a
+/// `(A, B)` pair back to two separate sources via a single index).
+pub trait IndexedSink: IndexedSource {
+    /// Write `value` into lane `i` without bounds checking.
+    ///
+    /// # Safety
+    ///
+    /// `i` must be strictly less than `self.len()`.
+    unsafe fn set_unchecked(&mut self, i: usize, value: Self::Item);
+}
+
+impl<T: Copy> IndexedSink for &mut [T] {
+    /// Overwrites element `i` of the slice without a bounds check.
+    #[inline]
+    unsafe fn set_unchecked(&mut self, i: usize, value: T) {
+        let slice: &mut [T] = self;
+        // SAFETY: the trait contract requires `i < self.len()`, which is the slice length.
+        let slot = unsafe { slice.get_unchecked_mut(i) };
+        *slot = value;
+    }
+}
+
+/// Pair of two [`IndexedSource`]s of equal length. Yields `(A::Item, B::Item)` per lane.
+///
+/// Use this to drive a binary kernel from two columns. Length equality is checked by
+/// [`LaneZip::new`]. Both fields are public, so building the tuple struct directly
+/// (`LaneZip(a, b)`) skips that check — prefer `new`.
+pub struct LaneZip<A, B>(pub A, pub B);
+
+impl<A: IndexedSource, B: IndexedSource> LaneZip<A, B> {
+    /// Pairs up two sources after verifying they expose the same number of lanes.
+    ///
+    /// # Panics
+    ///
+    /// Panics when `a.len()` and `b.len()` differ.
+    pub fn new(a: A, b: B) -> Self {
+        let (left_len, right_len) = (a.len(), b.len());
+        assert_eq!(left_len, right_len, "LaneZip operands must have the same length");
+        LaneZip(a, b)
+    }
+}
+
+impl<A: IndexedSource, B: IndexedSource> IndexedSource for LaneZip<A, B> {
+    type Item = (A::Item, B::Item);
+
+    /// Lane count shared by both operands.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the two operands report different lengths. The fields of `LaneZip`
+    /// are public, so a pair can be built without going through `LaneZip::new`; this
+    /// check runs in release builds too because callers use the returned length as
+    /// the bound for `get_unchecked` on *both* operands.
+    #[inline]
+    fn len(&self) -> usize {
+        let len = self.0.len();
+        assert_eq!(len, self.1.len(), "LaneZip operands must have the same length");
+        len
+    }
+
+    /// Reads lane `i` from both operands without bounds checking.
+    #[inline]
+    unsafe fn get_unchecked(&self, i: usize) -> (A::Item, B::Item) {
+        // SAFETY: caller guarantees `i < self.len()`, and `len` only returns a value
+        // when both operands have that same length.
+        unsafe { (self.0.get_unchecked(i), self.1.get_unchecked(i)) }
+    }
+}
+
+/// Apply `f(value, valid)` lane-by-lane, writing `out[i] = f(values[i], mask[i])`.
+///
+/// `f` is invoked for every lane, valid or not; the validity bit is only passed
+/// through as the second argument.
+///
+/// All three inputs must have the same length. The output type `R` may differ from the
+/// input type `T` — this kernel is the building block for both same-type transforms
+/// (fill_null) and cross-type ones (cast). The caller is responsible for marking `out`
+/// initialized (e.g. by calling `BufferMut::set_len` after this returns).
+///
+/// # Panics
+///
+/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
+#[inline]
+pub fn map_with_mask<S, R, F>(
+    values: S,
+    mask: &BitBuffer,
+    out: &mut [MaybeUninit<R>],
+    mut f: F,
+) where
+    S: IndexedSource,
+    F: FnMut(S::Item, bool) -> R,
+{
+    let len = values.len();
+    assert_eq!(len, mask.len(), "values and mask must have the same length");
+    assert_eq!(out.len(), len, "out must have the same length as values");
+
+    let chunks = mask.chunks();
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+        let base = chunk_idx * 64;
+        // Inner loop is fixed-size 64 with independent per-lane reads — no iterator
+        // state, no cross-iteration dependency, so the auto-vectorizer can fuse
+        // 64 indexed loads into vector loads.
+        for bit_idx in 0..64 {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < chunks_count * 64 <= len.
+            // NOTE(review): this bound assumes `chunks.iter()` yields at most `len / 64`
+            // words; that is a property of `BitBuffer::chunks`, not checked here.
+            let v = unsafe { values.get_unchecked(i) };
+            // SAFETY: same bound on `i`, and `out.len() == len` by the assert above.
+            unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
+        }
+    }
+
+    if remainder != 0 {
+        // Trailing `len % 64` lanes: their validity bits come from the remainder word.
+        let src_chunk = chunks.remainder_bits();
+        let base = chunks_count * 64;
+        for bit_idx in 0..remainder {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < len.
+            let v = unsafe { values.get_unchecked(i) };
+            // SAFETY: i < len == out.len().
+            unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
+        }
+    }
+}
+
+/// Fallible variant of [`map_with_mask`]. `f` returns `Option<R>`; `None` indicates a
+/// per-lane failure (e.g. range overflow on a narrowing cast).
+///
+/// The kernel does not short-circuit on the first failure inside a chunk: it processes
+/// whole 64-lane chunks with `is_none()` flags OR-reduced into a single accumulator,
+/// then checks after each chunk. On failure, a cold scalar attribution pass replays the
+/// closure over that chunk to identify the first failing lane. The hot loop stays
+/// autovectorizable — the per-lane cost is one OR on top of the cast.
+///
+/// Because of the replay, `f` is called a second time on the lanes of the failing
+/// chunk; the attribution pass panics (`unreachable!`) if `f` does not return `None`
+/// again for any of them.
+///
+/// # Errors
+///
+/// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` write
+/// `R::default()` into `out`, but the contents of `out` must not be relied upon when
+/// this function returns `Err`.
+///
+/// # Panics
+///
+/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
+#[inline]
+pub fn try_map_with_mask<S, R, F>(
+    values: S,
+    mask: &BitBuffer,
+    out: &mut [MaybeUninit<R>],
+    mut f: F,
+) -> Result<(), usize>
+where
+    S: IndexedSource,
+    R: Copy + Default,
+    F: FnMut(S::Item, bool) -> Option<R>,
+{
+    let len = values.len();
+    assert_eq!(len, mask.len(), "values and mask must have the same length");
+    assert_eq!(out.len(), len, "out must have the same length as values");
+
+    let chunks = mask.chunks();
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+        let base = chunk_idx * 64;
+        // Per-chunk accumulator — does not escape the SIMD inner loop.
+        let mut fail_acc: u64 = 0;
+        for bit_idx in 0..64 {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < chunks_count * 64 <= len.
+            // NOTE(review): this bound assumes `chunks.iter()` yields at most `len / 64`
+            // words; that is a property of `BitBuffer::chunks`, not checked here.
+            let v = unsafe { values.get_unchecked(i) };
+            let opt = f(v, bit);
+            fail_acc |= opt.is_none() as u64;
+            let r = opt.unwrap_or_default();
+            // SAFETY: i < len.
+            unsafe { out.get_unchecked_mut(i).write(r) };
+        }
+        if fail_acc != 0 {
+            // Cold path: replay this chunk lane by lane to find the first failure.
+            return Err(attribute_failure(&values, src_chunk, base, 64, &mut f));
+        }
+    }
+
+    if remainder != 0 {
+        // Trailing `len % 64` lanes: their validity bits come from the remainder word.
+        let src_chunk = chunks.remainder_bits();
+        let base = chunks_count * 64;
+        let mut fail_acc: u64 = 0;
+        for bit_idx in 0..remainder {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < len.
+            let v = unsafe { values.get_unchecked(i) };
+            let opt = f(v, bit);
+            fail_acc |= opt.is_none() as u64;
+            let r = opt.unwrap_or_default();
+            // SAFETY: i < len.
+            unsafe { out.get_unchecked_mut(i).write(r) };
+        }
+        if fail_acc != 0 {
+            return Err(attribute_failure(
+                &values, src_chunk, base, remainder, &mut f,
+            ));
+        }
+    }
+
+    Ok(())
+}
+
+/// Apply `f(value)` lane-by-lane with **no validity awareness at all** — every
+/// closure invocation is treated as "happened", regardless of whether the lane
+/// is null. Use this only when the input is known non-nullable.
+///
+/// For nullable inputs where the closure is infallible (no overflow / no error
+/// branch), prefer [`map_with_mask`]; for nullable inputs with a fallible
+/// closure, prefer [`try_map_validity_filtered`] — both correctly suppress
+/// null-lane logic. This kernel exists for the narrow "no validity exists"
+/// case (non-nullable column, internal pipelines, etc.).
+///
+/// Every element of `out` is written exactly once; marking `out` as initialized
+/// afterwards is left to the caller.
+///
+/// # Panics
+///
+/// Panics if `out.len() != values.len()`.
+#[inline]
+pub fn map_no_validity<S, R, F>(values: S, out: &mut [MaybeUninit<R>], mut f: F)
+where
+    S: IndexedSource,
+    F: FnMut(S::Item) -> R,
+{
+    let len = values.len();
+    assert_eq!(out.len(), len, "out must have the same length as values");
+
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    // Full 64-lane chunks: constant trip count in the inner loop.
+    for chunk_idx in 0..chunks_count {
+        let base = chunk_idx * 64;
+        for bit_idx in 0..64 {
+            let i = base + bit_idx;
+            // SAFETY: i < chunks_count * 64 <= len.
+            let v = unsafe { values.get_unchecked(i) };
+            // SAFETY: same bound on `i`, and `out.len() == len` by the assert above.
+            unsafe { out.get_unchecked_mut(i).write(f(v)) };
+        }
+    }
+
+    // Trailing `len % 64` lanes.
+    if remainder != 0 {
+        let base = chunks_count * 64;
+        for bit_idx in 0..remainder {
+            let i = base + bit_idx;
+            // SAFETY: i < len.
+            let v = unsafe { values.get_unchecked(i) };
+            // SAFETY: i < len == out.len().
+            unsafe { out.get_unchecked_mut(i).write(f(v)) };
+        }
+    }
+}
+
+/// Fallible map with **no validity awareness at all** — every `None` returned
+/// by the closure is treated as a failure, even at null lanes.
+///
+/// # Use this only for non-nullable inputs.
+///
+/// For nullable inputs with a fallible closure, use
+/// [`try_map_validity_filtered`] — it has the same value-only closure shape
+/// (and the same perf win) but **correctly suppresses null-lane failures**
+/// via per-chunk `fail_bits & mask_chunk`.
+///
+/// Using this kernel on a nullable input where a null lane's stored value
+/// would cause `f` to return `None` will produce a spurious `Err`. This is a
+/// correctness footgun on purpose — the name and this doc are how the API
+/// signals "you must know your input has no nulls."
+///
+/// # Errors
+///
+/// On failure returns `Err(failing_lane_index)`. The failing chunk is processed in
+/// full before the error is reported, and lanes where `f` returned `None` hold
+/// `R::default()`. The failing lane is located by replaying `f` over that chunk,
+/// so `f` runs twice on those lanes.
+///
+/// # Panics
+///
+/// Panics if `out.len() != values.len()`.
+#[inline]
+pub fn try_map_no_validity<S, R, F>(
+    values: S,
+    out: &mut [MaybeUninit<R>],
+    mut f: F,
+) -> Result<(), usize>
+where
+    S: IndexedSource,
+    R: Copy + Default,
+    F: FnMut(S::Item) -> Option<R>,
+{
+    let len = values.len();
+    assert_eq!(out.len(), len, "out must have the same length as values");
+
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for chunk_idx in 0..chunks_count {
+        let base = chunk_idx * 64;
+        // OR of all per-lane failure flags in this chunk; checked once per chunk.
+        let mut fail_acc: u64 = 0;
+        for bit_idx in 0..64 {
+            let i = base + bit_idx;
+            // SAFETY: i < chunks_count * 64 <= len.
+            let v = unsafe { values.get_unchecked(i) };
+            let opt = f(v);
+            fail_acc |= opt.is_none() as u64;
+            let r = opt.unwrap_or_default();
+            // SAFETY: i < len.
+            unsafe { out.get_unchecked_mut(i).write(r) };
+        }
+        if fail_acc != 0 {
+            // Cold path: replay this chunk lane by lane to find the first failure.
+            return Err(attribute_failure_no_mask(&values, base, 64, &mut f));
+        }
+    }
+
+    if remainder != 0 {
+        let base = chunks_count * 64;
+        let mut fail_acc: u64 = 0;
+        for bit_idx in 0..remainder {
+            let i = base + bit_idx;
+            // SAFETY: i < len.
+            let v = unsafe { values.get_unchecked(i) };
+            let opt = f(v);
+            fail_acc |= opt.is_none() as u64;
+            let r = opt.unwrap_or_default();
+            // SAFETY: i < len.
+            unsafe { out.get_unchecked_mut(i).write(r) };
+        }
+        if fail_acc != 0 {
+            return Err(attribute_failure_no_mask(&values, base, remainder, &mut f));
+        }
+    }
+
+    Ok(())
+}
+
+/// Fallible value-only map with **chunk-level validity filtering**: closure is
+/// `|v| -> Option<R>`, no validity threaded through the inner loop. After each
+/// 64-lane chunk, per-lane failure bits are ANDed against the mask chunk, so
+/// failures at null lanes do **not** propagate as `Err`.
+///
+/// This is the correct shape for "checked cast that respects validity" — a null
+/// row whose stored value would overflow does **not** cause `Err`. It also
+/// preserves the perf win of the value-only closure: the hot loop has no per-lane
+/// mask extract, no `valid`-dependent branch.
+///
+/// `f` still runs on every lane, including null ones; any lane where it returns
+/// `None` (null or not) is written as `R::default()`.
+///
+/// ## Inner-loop trick
+///
+/// Per-lane fails are packed into a `u64` via `fail_bits |= (is_none as u64) << bit_idx`.
+/// The shift amount is loop-invariant after unrolling (since `bit_idx` is the
+/// compile-time loop counter), so the autovectorizer can issue 64 sequential
+/// value reads + closure applications + packed-bit ORs as a vector pipeline.
+///
+/// ## Attribution
+///
+/// On failure, `valid_failures = fail_bits & mask_chunk` is non-zero; the lowest
+/// set bit is the first failing valid lane. `trailing_zeros()` reads it out
+/// directly — no cold replay path, no second pass.
+///
+/// # Errors
+///
+/// Returns `Err(lane)` with the index of the first valid lane for which `f`
+/// returned `None`. The contents of `out` should not be relied upon in that case.
+///
+/// # Panics
+///
+/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
+#[inline]
+pub fn try_map_validity_filtered<S, R, F>(
+    values: S,
+    mask: &BitBuffer,
+    out: &mut [MaybeUninit<R>],
+    mut f: F,
+) -> Result<(), usize>
+where
+    S: IndexedSource,
+    R: Copy + Default,
+    F: FnMut(S::Item) -> Option<R>,
+{
+    let len = values.len();
+    assert_eq!(len, mask.len(), "values and mask must have the same length");
+    assert_eq!(out.len(), len, "out must have the same length as values");
+
+    let chunks = mask.chunks();
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for (chunk_idx, mask_chunk) in chunks.iter().enumerate() {
+        let base = chunk_idx * 64;
+        let mut fail_bits: u64 = 0;
+        for bit_idx in 0..64 {
+            let i = base + bit_idx;
+            // SAFETY: i < chunks_count * 64 <= len.
+            // NOTE(review): this bound assumes `chunks.iter()` yields at most `len / 64`
+            // words; that is a property of `BitBuffer::chunks`, not checked here.
+            let v = unsafe { values.get_unchecked(i) };
+            let opt = f(v);
+            // Pack failure bit at the lane's position. After unrolling, `bit_idx`
+            // is a compile-time constant per-iteration, so the shift is folded.
+            fail_bits |= (opt.is_none() as u64) << bit_idx;
+            let r = opt.unwrap_or_default();
+            // SAFETY: i < len.
+            unsafe { out.get_unchecked_mut(i).write(r) };
+        }
+        // Filter failures to those at VALID lanes only. Null-lane failures vanish.
+        let valid_failures = fail_bits & mask_chunk;
+        if valid_failures != 0 {
+            return Err(base + valid_failures.trailing_zeros() as usize);
+        }
+    }
+
+    if remainder != 0 {
+        // Trailing `len % 64` lanes, filtered against the remainder mask word.
+        let mask_chunk = chunks.remainder_bits();
+        let base = chunks_count * 64;
+        let mut fail_bits: u64 = 0;
+        for bit_idx in 0..remainder {
+            let i = base + bit_idx;
+            // SAFETY: i < len.
+            let v = unsafe { values.get_unchecked(i) };
+            let opt = f(v);
+            fail_bits |= (opt.is_none() as u64) << bit_idx;
+            let r = opt.unwrap_or_default();
+            // SAFETY: i < len.
+            unsafe { out.get_unchecked_mut(i).write(r) };
+        }
+        let valid_failures = fail_bits & mask_chunk;
+        if valid_failures != 0 {
+            return Err(base + valid_failures.trailing_zeros() as usize);
+        }
+    }
+
+    Ok(())
+}
+
+/// Scalar replay used by the unmasked fallible kernel once a chunk reports a failure.
+///
+/// Re-applies `f` to lanes `base..base + chunk_len` in ascending order and returns
+/// the index of the first lane for which it yields `None`.
+#[cold]
+#[inline(never)]
+fn attribute_failure_no_mask<S, R, F>(
+    values: &S,
+    base: usize,
+    chunk_len: usize,
+    f: &mut F,
+) -> usize
+where
+    S: IndexedSource,
+    F: FnMut(S::Item) -> Option<R>,
+{
+    let first_failing = (base..base + chunk_len).find(|&lane| {
+        // SAFETY: caller guarantees every lane in `base..base + chunk_len` is below
+        // `values.len()`.
+        let item = unsafe { values.get_unchecked(lane) };
+        f(item).is_none()
+    });
+    match first_failing {
+        Some(lane) => lane,
+        None => unreachable!("attribute_failure_no_mask called without a failing lane"),
+    }
+}
+
+/// Scalar replay for the masked fallible kernel: finds the first lane of a chunk
+/// where `f` yields `None`.
+///
+/// Only invoked after the vectorized loop has already seen at least one failure in
+/// `base..base + chunk_len`. Each lane is re-evaluated in ascending order with its
+/// validity bit taken from `src_chunk`; this is deliberately simple scalar code
+/// because it runs at most once per error.
+#[cold]
+#[inline(never)]
+fn attribute_failure<S, R, F>(
+    values: &S,
+    src_chunk: u64,
+    base: usize,
+    chunk_len: usize,
+    f: &mut F,
+) -> usize
+where
+    S: IndexedSource,
+    F: FnMut(S::Item, bool) -> Option<R>,
+{
+    let first_failing = (0..chunk_len).find(|&offset| {
+        let valid = (src_chunk >> offset) & 1 != 0;
+        // SAFETY: caller guarantees base + chunk_len <= values.len().
+        let item = unsafe { values.get_unchecked(base + offset) };
+        f(item, valid).is_none()
+    });
+    match first_failing {
+        Some(offset) => base + offset,
+        // The hot loop's OR-reduction reported a failure somewhere in this chunk.
+        None => unreachable!("attribute_failure called without a failing lane"),
+    }
+}
+
+/// In-place variant of [`map_with_mask`]. Each lane is replaced with
+/// `f(values[i], mask[i])`. The source `S` must be writable (an [`IndexedSink`]).
+///
+/// `f` is invoked for every lane, valid or not, and its result always overwrites
+/// the lane it was computed from.
+///
+/// # Panics
+///
+/// Panics if `values.len() != mask.len()`.
+#[inline]
+pub fn map_with_mask_in_place<S, F>(mut values: S, mask: &BitBuffer, mut f: F)
+where
+    S: IndexedSink,
+    F: FnMut(S::Item, bool) -> S::Item,
+{
+    let len = values.len();
+    assert_eq!(len, mask.len(), "values and mask must have the same length");
+
+    let chunks = mask.chunks();
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+        let base = chunk_idx * 64;
+        for bit_idx in 0..64 {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < chunks_count * 64 <= len.
+            // NOTE(review): this bound assumes `chunks.iter()` yields at most `len / 64`
+            // words; that is a property of `BitBuffer::chunks`, not checked here.
+            let v = unsafe { values.get_unchecked(i) };
+            let r = f(v, bit);
+            // SAFETY: i < len.
+            unsafe { values.set_unchecked(i, r) };
+        }
+    }
+
+    if remainder != 0 {
+        // Trailing `len % 64` lanes: their validity bits come from the remainder word.
+        let src_chunk = chunks.remainder_bits();
+        let base = chunks_count * 64;
+        for bit_idx in 0..remainder {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < len.
+            let v = unsafe { values.get_unchecked(i) };
+            let r = f(v, bit);
+            // SAFETY: i < len.
+            unsafe { values.set_unchecked(i, r) };
+        }
+    }
+}
+
+/// In-place variant of [`try_map_with_mask`]. Each lane of `values` is replaced
+/// with `f(values[i], mask[i])`, or `S::Item::default()` if `f` returned `None`.
+/// On failure returns `Err(first_failing_lane)`; lanes before that point have been
+/// written, and lanes within the failing chunk hold their unwrapped-or-default
+/// result. The buffer state on `Err` is intentionally unspecified.
+///
+/// ## Error attribution
+///
+/// Per-lane `is_none()` flags are folded into `first_fail` via a branchless
+/// `min` of `(if is_none { bit_idx as u32 } else { u32::MAX })`, where `bit_idx`
+/// is the lane's offset inside the current chunk (always below 64). After the
+/// loop, `first_fail` holds the smallest failing offset in the chunk (or `MAX`
+/// if no failure) and the reported lane is `base + first_fail`. Tracking the
+/// in-chunk offset rather than the absolute index keeps the accumulator 32 bits
+/// wide without truncating lane indices at or above `u32::MAX`, and guarantees
+/// the `MAX` sentinel can never collide with a real lane. The select + `min`
+/// shape is meant to stay branchless so the loop can vectorize (previously
+/// observed as NEON `bsl.16b` + `umin.4s` on AArch64 — NOTE(review): re-confirm
+/// codegen after any change here). The cold replay scheme used by
+/// [`try_map_with_mask`] isn't viable here because the original input values
+/// have already been overwritten by the time we would attribute the failure.
+///
+/// ## Why in-place is slower at cache-resident sizes
+///
+/// At sizes that fit in L1/L2 the in-place kernel is ~1.5× slower than the
+/// out-of-place kernel despite having half the memory traffic, because input
+/// and output share memory and the compiler must be conservative reordering
+/// loads/stores across iterations. At sizes that exceed L2 the in-place kernel
+/// wins back the gap by avoiding the second buffer's DRAM read+write traffic.
+///
+/// # Errors
+///
+/// Returns `Err(lane)` with the index of the first lane for which `f` returned
+/// `None`.
+///
+/// # Panics
+///
+/// Panics if `values.len() != mask.len()`.
+#[inline]
+pub fn try_map_with_mask_in_place<S, F>(
+    mut values: S,
+    mask: &BitBuffer,
+    mut f: F,
+) -> Result<(), usize>
+where
+    S: IndexedSink,
+    S::Item: Default,
+    F: FnMut(S::Item, bool) -> Option<S::Item>,
+{
+    let len = values.len();
+    assert_eq!(len, mask.len(), "values and mask must have the same length");
+
+    let chunks = mask.chunks();
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+        let base = chunk_idx * 64;
+        // Smallest failing in-chunk offset seen so far; `u32::MAX` means "none".
+        let mut first_fail: u32 = u32::MAX;
+        for bit_idx in 0..64 {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < chunks_count * 64 <= len.
+            let v = unsafe { values.get_unchecked(i) };
+            let opt = f(v, bit);
+            // `bit_idx < 64`, so the cast is lossless and never equals the sentinel.
+            let candidate = if opt.is_none() { bit_idx as u32 } else { u32::MAX };
+            first_fail = first_fail.min(candidate);
+            let r = opt.unwrap_or_default();
+            // SAFETY: i < len.
+            unsafe { values.set_unchecked(i, r) };
+        }
+        if first_fail != u32::MAX {
+            return Err(base + first_fail as usize);
+        }
+    }
+
+    if remainder != 0 {
+        // Trailing `len % 64` lanes: their validity bits come from the remainder word.
+        let src_chunk = chunks.remainder_bits();
+        let base = chunks_count * 64;
+        let mut first_fail: u32 = u32::MAX;
+        for bit_idx in 0..remainder {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < len.
+            let v = unsafe { values.get_unchecked(i) };
+            let opt = f(v, bit);
+            // `bit_idx < remainder < 64`, so the cast is lossless and never equals the sentinel.
+            let candidate = if opt.is_none() { bit_idx as u32 } else { u32::MAX };
+            first_fail = first_fail.min(candidate);
+            let r = opt.unwrap_or_default();
+            // SAFETY: i < len.
+            unsafe { values.set_unchecked(i, r) };
+        }
+        if first_fail != u32::MAX {
+            return Err(base + first_fail as usize);
+        }
+    }
+
+    Ok(())
+}
+
+/// Apply `f(value) -> bool` lane-by-lane, packing into `out` as `u64` words.
+///
+/// This is the validity-free sibling of [`map_with_mask_to_bits`]. Use it when the
+/// predicate is a pure function of the value (e.g. compare-to-constant on a primitive
+/// buffer) and combine the validity bitmap in a separate pass — splitting the work
+/// this way lets the value-compare loop autovectorize cleanly.
+///
+/// `out.len()` must equal `values.len().div_ceil(64)`; lane `i` is written to bit `i % 64` of
+/// `out[i / 64]`, and bits of the final word beyond `len % 64` are written as `0`.
+///
+/// # Panics
+///
+/// Panics if `out.len() != values.len().div_ceil(64)`.
+#[inline]
+pub fn map_to_bits<S, F>(values: S, out: &mut [u64], mut f: F)
+where
+    S: IndexedSource,
+    F: FnMut(S::Item) -> bool,
+{
+    let len = values.len();
+    assert_eq!(
+        out.len(),
+        len.div_ceil(64),
+        "out must have len.div_ceil(64) words",
+    );
+
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for chunk_idx in 0..chunks_count {
+        let base = chunk_idx * 64;
+        let mut packed = 0u64;
+        for bit_idx in 0..64 {
+            // SAFETY: base + bit_idx < chunks_count * 64 <= len.
+            let v = unsafe { values.get_unchecked(base + bit_idx) };
+            packed |= (f(v) as u64) << bit_idx;
+        }
+        // SAFETY: chunk_idx < chunks_count <= len.div_ceil(64) == out.len() (asserted above).
+        unsafe { *out.get_unchecked_mut(chunk_idx) = packed };
+    }
+
+    if remainder != 0 {
+        let base = chunks_count * 64;
+        let mut packed = 0u64;
+        for bit_idx in 0..remainder {
+            // SAFETY: base + bit_idx < chunks_count * 64 + remainder == len.
+            let v = unsafe { values.get_unchecked(base + bit_idx) };
+            packed |= (f(v) as u64) << bit_idx;
+        }
+        // SAFETY: remainder != 0, so out.len() == chunks_count + 1 and this index is in bounds.
+        unsafe { *out.get_unchecked_mut(chunks_count) = packed };
+    }
+}
+
+/// Apply `f(value, valid) -> bool` lane-by-lane, packing into `out` as `u64` words.
+///
+/// `valid` is the `mask` bit for the lane. `out.len()` must equal `values.len().div_ceil(64)`;
+/// lane `i` goes to bit `i % 64` of `out[i / 64]`, and unused high bits of the last word are `0`.
+///
+/// # Panics
+///
+/// Panics if `values.len() != mask.len()` or `out.len() != values.len().div_ceil(64)`.
+#[inline]
+pub fn map_with_mask_to_bits<S, F>(values: S, mask: &BitBuffer, out: &mut [u64], mut f: F)
+where
+    S: IndexedSource,
+    F: FnMut(S::Item, bool) -> bool,
+{
+    let len = values.len();
+    assert_eq!(len, mask.len(), "values and mask must have the same length");
+    assert_eq!(
+        out.len(),
+        len.div_ceil(64),
+        "out must have len.div_ceil(64) words",
+    );
+
+    let chunks = mask.chunks();
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+        let base = chunk_idx * 64;
+        let mut packed = 0u64;
+        for bit_idx in 0..64 {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < len, provided `chunks.iter()` yields exactly `len / 64` full words.
+            let v = unsafe { values.get_unchecked(i) };
+            packed |= (f(v, bit) as u64) << bit_idx;
+        }
+        // SAFETY: chunk_idx < chunks_count <= out.len(), under the same chunk-count assumption.
+        unsafe { *out.get_unchecked_mut(chunk_idx) = packed };
+    }
+
+    if remainder != 0 {
+        let src_chunk = chunks.remainder_bits();
+        let base = chunks_count * 64;
+        let mut packed = 0u64;
+        for bit_idx in 0..remainder {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i = chunks_count * 64 + bit_idx < chunks_count * 64 + remainder == len.
+            let v = unsafe { values.get_unchecked(i) };
+            packed |= (f(v, bit) as u64) << bit_idx;
+        }
+        // SAFETY: remainder != 0, so out.len() == chunks_count + 1 and this index is in bounds.
+        unsafe { *out.get_unchecked_mut(chunks_count) = packed };
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::BitBufferMut;
+
+    fn write_t<T: Copy>(out: Vec<MaybeUninit<T>>) -> Vec<T> {
+        // SAFETY: tests always fully initialize the buffer; no reliance on `Vec` layout.
+        out.into_iter().map(|slot| unsafe { slot.assume_init() }).collect()
+    }
+
+    #[test]
+    fn map_with_mask_aligned() {
+        let values: Vec<i32> = (0..10).collect();
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(10);
+            for i in 0..10 {
+                m.append(i % 2 == 0);
+            }
+            m.freeze()
+        };
+        let mut out = vec![MaybeUninit::<i32>::uninit(); 10];
+        map_with_mask(
+            values.as_slice(),
+            &mask,
+            &mut out,
+            |v, valid| if valid { v } else { -1 },
+        );
+        assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]);
+    }
+
+    #[test]
+    fn map_with_mask_partial_chunk() {
+        // 130 lanes — two full u64 words + a 2-bit remainder.
+        let values: Vec<i32> = (0..130).collect();
+        let mask = BitBuffer::new_set(130);
+        let mut out = vec![MaybeUninit::<i32>::uninit(); 130];
+        map_with_mask(
+            values.as_slice(),
+            &mask,
+            &mut out,
+            |v, valid| if valid { v + 1 } else { 0 },
+        );
+        let got = write_t(out);
+        assert_eq!(got.len(), 130);
+        assert_eq!(got[0], 1);
+        assert_eq!(got[63], 64);
+        assert_eq!(got[64], 65);
+        assert_eq!(got[129], 130);
+    }
+
+    #[test]
+    fn map_with_mask_offset_mask() {
+        // Build a 128-bit all-true mask, then slice off the first 5 bits to force offset=5.
+        let big = BitBuffer::new_set(128);
+        let sliced = big.slice(5..70); // logical len = 65, offset = 5
+        assert_eq!(sliced.len(), 65);
+        assert_eq!(sliced.offset(), 5);
+
+        let values: Vec<u32> = (0..65).collect();
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 65];
+        map_with_mask(
+            values.as_slice(),
+            &sliced,
+            &mut out,
+            |v, valid| if valid { v } else { u32::MAX },
+        );
+        let got = write_t(out);
+        assert_eq!(got, (0..65).collect::<Vec<u32>>());
+    }
+
+    #[test]
+    fn map_with_mask_offset_past_word() {
+        // Slicing past a full word still works. `BitBuffer::slice` normalizes the
+        // logical offset to `offset % 8` and bumps the underlying byte pointer,
+        // so `offset()` won't equal 70 here — what we exercise is that the kernel
+        // walks the chunked u64 view (which BitChunks handles internally).
+        let big = BitBuffer::new_set(256);
+        let sliced = big.slice(70..200);
+        assert_eq!(sliced.len(), 130);
+
+        let values: Vec<i16> = (0..130).map(|i| i as i16).collect();
+        let mut out = vec![MaybeUninit::<i16>::uninit(); 130];
+        map_with_mask(
+            values.as_slice(),
+            &sliced,
+            &mut out,
+            |v, valid| if valid { v } else { -1 },
+        );
+        let got = write_t(out);
+        assert_eq!(got, (0..130).map(|i| i as i16).collect::<Vec<_>>());
+    }
+
+    #[test]
+    fn map_with_mask_empty() {
+        let values: Vec<i32> = vec![];
+        let mask = BitBuffer::new_unset(0);
+        let mut out: Vec<MaybeUninit<i32>> = vec![];
+        map_with_mask(values.as_slice(), &mask, &mut out, |v, _| v);
+    }
+
+    #[test]
+    fn map_with_mask_null_to_zero_branchless() {
+        // The trick from primitive/compute/cast.rs:147 — multiply by valid as T.
+        let values: Vec<i64> = (1..=100).collect();
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(100);
+            for i in 0..100 {
+                m.append(i % 3 != 0);
+            }
+            m.freeze()
+        };
+        let mut out = vec![MaybeUninit::<i64>::uninit(); 100];
+        map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| v * (valid as i64));
+        let got = write_t(out);
+        for (i, &x) in got.iter().enumerate() {
+            if i % 3 == 0 {
+                assert_eq!(x, 0);
+            } else {
+                assert_eq!(x, (i + 1) as i64);
+            }
+        }
+    }
+
+    #[test]
+    fn map_with_mask_to_bits_aligned() {
+        let values: Vec<i32> = (0..128).collect();
+        let mask = BitBuffer::new_set(128);
+        let mut out = vec![0u64; 2];
+        map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| valid && v % 2 == 0);
+        // Even numbers in [0, 128) set, odd unset.
+        for word_idx in 0..2 {
+            let word = out[word_idx];
+            for bit in 0..64 {
+                let i = word_idx * 64 + bit;
+                let expected = i % 2 == 0;
+                assert_eq!((word >> bit) & 1 == 1, expected, "lane {i}");
+            }
+        }
+    }
+
+    #[test]
+    fn map_with_mask_to_bits_partial_chunk() {
+        // 130 lanes — three u64 words, last word has only 2 valid bits.
+        let values: Vec<i32> = (0..130).collect();
+        let mask = BitBuffer::new_set(130);
+        let mut out = vec![0u64; 130usize.div_ceil(64)];
+        assert_eq!(out.len(), 3);
+        map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| valid && v >= 64);
+        // Bits 64..128 set in word 1; bits 128..130 set in word 2.
+        assert_eq!(out[0], 0);
+        assert_eq!(out[1], u64::MAX);
+        assert_eq!(out[2], 0b11);
+    }
+
+    #[test]
+    fn map_with_mask_to_bits_offset() {
+        let big = BitBuffer::new_set(256);
+        let sliced = big.slice(13..143); // len=130; offset() is presumably 13 % 8 = 5, not 13
+        assert_eq!(sliced.len(), 130);
+        let values: Vec<u8> = (0..130).map(|i| (i % 4) as u8).collect();
+        let mut out = vec![0u64; 130usize.div_ceil(64)];
+        map_with_mask_to_bits(values.as_slice(),&sliced, &mut out, |v, valid| valid && v == 0);
+        for i in 0..130 {
+            let word = out[i / 64];
+            let bit = (word >> (i % 64)) & 1 == 1;
+            assert_eq!(bit, i % 4 == 0, "lane {i}");
+        }
+    }
+
+    #[test]
+    fn try_map_with_mask_all_ok() {
+        let values: Vec<u64> = (0..200).collect();
+        let mask = BitBuffer::new_set(200);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert!(res.is_ok());
+        let got = write_t(out);
+        assert_eq!(got, (0..200u32).collect::<Vec<_>>());
+    }
+
+    #[test]
+    fn try_map_with_mask_overflow_fails() {
+        // Put an overflowing value at lane 137 — the kernel must report Err(137).
+        let mut values: Vec<u64> = (0..200).collect();
+        values[137] = (u32::MAX as u64) + 1;
+        let mask = BitBuffer::new_set(200);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert_eq!(res, Err(137));
+    }
+
+    #[test]
+    fn try_map_with_mask_overflow_reports_first_failing_lane() {
+        // Multiple failing lanes — must report the lowest index.
+        let mut values: Vec<u64> = (0..200).collect();
+        values[50] = u64::MAX;
+        values[51] = u64::MAX;
+        values[137] = u64::MAX;
+        let mask = BitBuffer::new_set(200);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert_eq!(res, Err(50));
+    }
+
+    #[test]
+    fn try_map_validity_filtered_null_lane_overflow_does_not_err() {
+        // Null lane with a value that would overflow MUST NOT cause Err.
+        // The closure is value-only — the mask filters the null-lane failure
+        // at the chunk boundary.
+        let mut values: Vec<u64> = (0..200).collect();
+        values[5] = u64::MAX; // null lane with overflowing value
+        values[42] = u64::MAX; // null lane with overflowing value
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(200);
+            for i in 0..200 {
+                m.append(i != 5 && i != 42);
+            }
+            m.freeze()
+        };
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = try_map_validity_filtered(
+            values.as_slice(),
+            &mask,
+            &mut out,
+            |v| (v <= u32::MAX as u64).then_some(v as u32),
+        );
+        assert!(res.is_ok(), "null-lane overflow should not propagate as Err");
+    }
+
+    #[test]
+    fn try_map_validity_filtered_valid_overflow_does_err_with_first_index() {
+        // Valid lane overflow must propagate — and the reported index must be
+        // the lowest VALID failing lane, even if earlier null lanes also "failed"
+        // their unconditional cast.
+        let mut values: Vec<u64> = (0..200).collect();
+        values[5] = u64::MAX; // null lane — filtered out
+        values[42] = u64::MAX; // null lane — filtered out
+        values[77] = u64::MAX; // VALID lane — should be reported
+        values[100] = u64::MAX; // VALID lane — higher index, ignored
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(200);
+            for i in 0..200 {
+                m.append(i != 5 && i != 42);
+            }
+            m.freeze()
+        };
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = try_map_validity_filtered(
+            values.as_slice(),
+            &mask,
+            &mut out,
+            |v| (v <= u32::MAX as u64).then_some(v as u32),
+        );
+        assert_eq!(res, Err(77));
+    }
+
+    #[test]
+    fn try_map_with_mask_null_lane_bypasses_check() {
+        // Null lanes are neutralized by `valid as u64` before the range check, so an
+        // out-of-range value at a null lane must NOT trigger failure.
+        let mut values: Vec<u64> = (0..200).collect();
+        values[5] = u64::MAX;
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(200);
+            for i in 0..200 {
+                m.append(i != 5);
+            }
+            m.freeze()
+        };
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert!(res.is_ok());
+        let got = write_t(out);
+        assert_eq!(got[5], 0); // null-lane wrote default
+        assert_eq!(got[6], 6);
+    }
+
+    #[test]
+    fn try_map_with_mask_branchful_matches_branchless() {
+        let mut values: Vec<u64> = (0..130).map(|i| i as u64 * 7).collect();
+        values[2] = u64::MAX;
+        values[65] = u32::MAX as u64;
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(130);
+            for i in 0..130 {
+                m.append(!matches!(i, 2 | 17 | 99));
+            }
+            m.freeze()
+        };
+
+        let mut branchless = vec![MaybeUninit::<u32>::uninit(); 130];
+        let mut branchful = vec![MaybeUninit::<u32>::uninit(); 130];
+        try_map_with_mask(values.as_slice(), &mask, &mut branchless, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        })
+        .unwrap();
+        try_map_with_mask(values.as_slice(), &mask, &mut branchful, |v, valid| {
+            if valid {
+                u32::try_from(v).ok()
+            } else {
+                Some(0)
+            }
+        })
+        .unwrap();
+
+        assert_eq!(write_t(branchful), write_t(branchless));
+    }
+
+    #[test]
+    fn try_map_with_mask_partial_chunk() {
+        let values: Vec<u64> = (0..130).collect();
+        let mask = BitBuffer::new_set(130);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert!(res.is_ok());
+        let got = write_t(out);
+        assert_eq!(got.len(), 130);
+        assert_eq!(got[129], 129);
+    }
+
+    #[test]
+    fn try_map_with_mask_sliced_mask_unaligned_offset() {
+        // The mask's first byte is not word-aligned: slice off 13 bits, so the
+        // underlying BitChunks iterator must shift across byte boundaries on every
+        // 64-bit chunk it yields.
+        let big = BitBuffer::new_set(256);
+        let mask = big.slice(13..143); // logical len = 130, bit offset = 13 % 8 = 5
+        assert_eq!(mask.len(), 130);
+
+        let values: Vec<u64> = (0..130).collect();
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert!(res.is_ok());
+        let got = write_t(out);
+        assert_eq!(got, (0..130u32).collect::<Vec<_>>());
+    }
+
+    #[test]
+    fn try_map_with_mask_sliced_mask_with_overflow() {
+        // Sliced mask + overflowing value — the cold attribution path must report
+        // the correct lane index in the sliced (post-offset) coordinate space.
+        let big = BitBuffer::new_set(256);
+        let mask = big.slice(13..143);
+        assert_eq!(mask.len(), 130);
+
+        let mut values: Vec<u64> = (0..130).collect();
+        values[77] = u64::MAX;
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert_eq!(res, Err(77));
+    }
+
+    #[test]
+    fn try_map_with_mask_sliced_mask_null_lanes() {
+        // Mix sliced offset with a non-trivial validity pattern. Null lanes must
+        // not contribute to fail_acc, even when their underlying value would overflow.
+        let mut m = BitBufferMut::with_capacity(256);
+        for i in 0..256 {
+            m.append(i % 3 != 0);
+        }
+        let big = m.freeze();
+        let mask = big.slice(13..143);
+        assert_eq!(mask.len(), 130);
+
+        // After the 13-lane slice, original index `13 + j` becomes lane `j`.
+        // Lane `j` is valid iff `(13 + j) % 3 != 0`.
+        let mut values: Vec<u64> = (0..130).collect();
+        // Pick a lane that is INVALID in the sliced coords: 13+2 = 15, 15 % 3 == 0 → invalid.
+        // Stuff in an overflowing value; it must be neutralized by `* valid as u64`.
+        values[2] = u64::MAX;
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert!(res.is_ok(), "null lane should bypass the range check");
+    }
+
+    #[test]
+    fn try_map_with_mask_overflow_in_remainder() {
+        // Overflow in the trailing partial chunk (not aligned to 64).
+        let mut values: Vec<u64> = (0..130).collect();
+        values[129] = (u32::MAX as u64) + 1;
+        let mask = BitBuffer::new_set(130);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+            let scaled = v * valid as u64;
+            (scaled <= u32::MAX as u64).then_some(scaled as u32)
+        });
+        assert_eq!(res, Err(129));
+    }
+
+    #[test]
+    fn map_to_bits_aligned() {
+        let values: Vec<i32> = (0..128).collect();
+        let mut out = vec![0u64; 2];
+        map_to_bits(values.as_slice(), &mut out, |v| v % 2 == 0);
+        for word_idx in 0..2 {
+            for bit in 0..64 {
+                let i = word_idx * 64 + bit;
+                let expected = i % 2 == 0;
+                assert_eq!((out[word_idx] >> bit) & 1 == 1, expected, "lane {i}");
+            }
+        }
+    }
+
+    #[test]
+    fn map_to_bits_partial_chunk() {
+        let values: Vec<i32> = (0..130).collect();
+        let mut out = vec![0u64; 130usize.div_ceil(64)];
+        assert_eq!(out.len(), 3);
+        map_to_bits(values.as_slice(), &mut out, |v| v >= 64);
+        assert_eq!(out[0], 0);
+        assert_eq!(out[1], u64::MAX);
+        assert_eq!(out[2], 0b11);
+    }
+
+    #[test]
+    fn map_to_bits_empty() {
+        let values: Vec<i32> = vec![];
+        let mut out: Vec<u64> = vec![];
+        map_to_bits(values.as_slice(), &mut out, |v| v > 0);
+    }
+
+    #[test]
+    fn map_to_bits_matches_fused_with_all_valid_mask() {
+        // map_to_bits + AND with an all-true mask must equal map_with_mask_to_bits.
+        let values: Vec<i64> = (0..200).map(|i| i % 7).collect();
+        let mask = BitBuffer::new_set(200);
+
+        let mut a = vec![0u64; 200usize.div_ceil(64)];
+        map_with_mask_to_bits(values.as_slice(), &mask, &mut a, |v, valid| valid && v == 3);
+
+        let mut b = vec![0u64; 200usize.div_ceil(64)];
+        map_to_bits(values.as_slice(), &mut b, |v| v == 3);
+
+        assert_eq!(a, b);
+    }
+
+    #[test]
+    fn map_with_mask_to_bits_validity_kills_lane() {
+        // Even if predicate is true, null lanes should produce false.
+        let values: Vec<i32> = vec![1; 70];
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(70);
+            for i in 0..70 {
+                m.append(i >= 32); // first 32 lanes are null
+            }
+            m.freeze()
+        };
+        let mut out = vec![0u64; 70usize.div_ceil(64)];
+        map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| valid && v == 1);
+        for i in 0..70 {
+            let bit = (out[i / 64] >> (i % 64)) & 1 == 1;
+            assert_eq!(bit, i >= 32, "lane {i}");
+        }
+    }
+}
diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs
index 592762d7a26..a4519ac62ec 100644
--- a/vortex-buffer/src/lib.rs
+++ b/vortex-buffer/src/lib.rs
@@ -64,6 +64,12 @@ mod bytes;
 mod r#const;
 mod debug;
 mod lane_ops;
+/// Indexed-source variant of [`lane_ops`]: takes an `IndexedSource` trait whose
+/// implementations expose `unsafe fn get_unchecked(i) -> Item`. `&[T]` impls inline
+/// to the same indexed load as the slice kernel, but the trait also admits binary
+/// inputs via `LaneZip`. See `HISTORY.md` for the iterator-API investigation that
+/// led to this design.
+pub mod lane_ops_indexed;
 mod macros;
 #[cfg(feature = "memmap2")]
 mod memmap2;

From 5cf469ab06f192eca051bca820fe0247775cd9fe Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 10:58:56 +0100
Subject: [PATCH 03/21] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock                               |   4 -
 vortex-buffer/Cargo.toml                 |  22 -
 vortex-buffer/benches/cast_to.rs         | 323 ----------
 vortex-buffer/benches/cast_to_indexed.rs | 427 +-------------
 vortex-buffer/src/lane_ops.rs            | 713 -----------------------
 vortex-buffer/src/lane_ops_indexed.rs    | 229 ++++++--
 vortex-buffer/src/lib.rs                 |   7 -
 7 files changed, 188 insertions(+), 1537 deletions(-)
 delete mode 100644 vortex-buffer/benches/cast_to.rs
 delete mode 100644 vortex-buffer/src/lane_ops.rs

diff --git a/Cargo.lock b/Cargo.lock
index 9bb032d0d35..d29c91edf62 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9355,11 +9355,7 @@ dependencies = [
 name = "vortex-buffer"
 version = "0.1.0"
 dependencies = [
- "arrow-arith",
- "arrow-array",
  "arrow-buffer",
- "arrow-cast",
- "arrow-schema",
  "bitvec",
  "bytes",
  "codspeed-divan-compat",
diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml
index 42c882004bd..6490516f846 100644
--- a/vortex-buffer/Cargo.toml
+++ b/vortex-buffer/Cargo.toml
@@ -37,12 +37,6 @@ vortex-error = { workspace = true }
 workspace = true
 
 [dev-dependencies]
-# TEMP: arrow-* are only used by the cast_to / add_checked benches for cross-impl
-# performance comparisons. Drop them when the benches are removed.
-arrow-arith = { workspace = true }
-arrow-array = { workspace = true }
-arrow-cast = { workspace = true }
-arrow-schema = { workspace = true }
 divan = { workspace = true }
 num-traits = { workspace = true }
 rand = { workspace = true }
@@ -56,22 +50,6 @@ harness = false
 name = "vortex_bitbuffer"
 harness = false
 
-[[bench]]
-name = "cast_to"
-harness = false
-
 [[bench]]
 name = "cast_to_indexed"
 harness = false
-
-[[bench]]
-name = "cast_iter_all"
-harness = false
-
-[[bench]]
-name = "cast_in_place"
-harness = false
-
-[[bench]]
-name = "add_checked"
-harness = false
diff --git a/vortex-buffer/benches/cast_to.rs b/vortex-buffer/benches/cast_to.rs
deleted file mode 100644
index c070f65d3a0..00000000000
--- a/vortex-buffer/benches/cast_to.rs
+++ /dev/null
@@ -1,323 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-//! Cast `u64 -> u32` over a nullable column, three ways:
-//!
-//! 1. `kernel_map_with_mask` — uses `map_with_mask`. Writes truncated values into a
-//!    pre-allocated `&mut [MaybeUninit<u32>]`. Null lanes write `0` via the branchless
-//!    `v * valid as u64` trick, mirroring `primitive/compute/cast.rs:147`.
-//! 2. `iter_zip` — `values.iter().zip(mask.iter())` collected through
-//!    `BufferMut::from_trusted_len_iter`. This is the shape the current Vortex cast uses.
-//! 3. `arrow_cast` — `arrow_cast::cast` against a `UInt64Array`, allocating a new
-//!    `UInt32Array`.
-//!
-//! Plus two fallible variants that error on overflow:
-//!
-//! 4. `kernel_try_map_with_mask` — `try_map_with_mask` with `|v, valid| (v <= MAX).then_some(...)`.
-//!    Unconditional cast + parallel range check OR-reduced into a u64 fail accumulator.
-//! 5. `iter_zip_checked` — `BufferMut::try_from_trusted_len_iter` returning Err on overflow.
-//! 6. `arrow_cast_checked` — `arrow_cast::cast` with `safe = false` (errors on overflow).
-//!
-//! Inputs are bounded to fit in `u32`, so the fallible variants always succeed and we
-//! measure the cost of the range check on the success path.
-
-#![expect(clippy::unwrap_used)]
-
-use std::mem::MaybeUninit;
-
-use arrow_array::UInt64Array;
-use arrow_buffer::NullBuffer;
-use arrow_buffer::ScalarBuffer;
-use arrow_cast::CastOptions;
-use arrow_cast::cast_with_options;
-use arrow_schema::DataType;
-use divan::Bencher;
-use rand::SeedableRng;
-use rand::prelude::*;
-use vortex_buffer::BitBuffer;
-use vortex_buffer::BitBufferMut;
-use vortex_buffer::Buffer;
-use vortex_buffer::BufferMut;
-use vortex_buffer::map_with_mask;
-use vortex_buffer::try_map_with_mask;
-
-fn main() {
-    divan::main();
-}
-
-const SIZES: &[usize] = &[4_096, 65_536, 1_048_576];
-const VALID_RATE: f64 = 0.7;
-const DATA_SEED: u64 = 0;
-const VALID_SEED: u64 = 1;
-
-// Non-byte-aligned bit offset → forces BitChunks::iter() to shift across byte
-// boundaries on every chunk it yields.
-const SLICE_OFFSET: usize = 5;
-
-struct Fixture {
-    values: Buffer<u64>,
-    /// `offset() == 0`, underlying byte buffer starts on a byte boundary.
-    mask_aligned: BitBuffer,
-    /// Same validity bits but sliced so `offset() == SLICE_OFFSET`.
-    mask_unaligned: BitBuffer,
-    arrow_arr: UInt64Array,
-    /// Same as `arrow_arr` but its NullBuffer has a non-byte-aligned bit offset,
-    /// constructed by building an oversized array and slicing.
-    arrow_arr_unaligned: UInt64Array,
-}
-
-fn fixture(n: usize) -> Fixture {
-    let mut data_rng = StdRng::seed_from_u64(DATA_SEED);
-    let mut valid_rng = StdRng::seed_from_u64(VALID_SEED);
-    let raw_values: Vec<u64> = (0..n)
-        .map(|_| data_rng.random_range(0..u32::MAX as u64))
-        .collect();
-    let raw_valid: Vec<bool> = (0..n).map(|_| valid_rng.random_bool(VALID_RATE)).collect();
-
-    let values: Buffer<u64> = raw_values.iter().copied().collect();
-
-    let mask_aligned = {
-        let mut m = BitBufferMut::with_capacity(n);
-        for &v in &raw_valid {
-            m.append(v);
-        }
-        m.freeze()
-    };
-
-    // Build n + SLICE_OFFSET bits then slice off the leading SLICE_OFFSET, so the
-    // remaining `n` lanes carry the SAME validity pattern as the aligned mask.
-    let mask_unaligned = {
-        let mut m = BitBufferMut::with_capacity(n + SLICE_OFFSET);
-        for _ in 0..SLICE_OFFSET {
-            m.append(false); // filler — sliced away
-        }
-        for &v in &raw_valid {
-            m.append(v);
-        }
-        m.freeze().slice(SLICE_OFFSET..SLICE_OFFSET + n)
-    };
-    debug_assert_eq!(mask_unaligned.offset(), SLICE_OFFSET);
-    debug_assert_eq!(mask_unaligned.len(), n);
-
-    let arrow_arr = UInt64Array::new(
-        ScalarBuffer::from(raw_values.clone()),
-        Some(NullBuffer::from(raw_valid.clone())),
-    );
-
-    // Oversized array → slice off SLICE_OFFSET lanes so the resulting array's
-    // NullBuffer has `offset() == SLICE_OFFSET`. The remaining `n` lanes hold the
-    // same validity pattern as `arrow_arr`.
-    let arrow_arr_unaligned = {
-        let mut padded_values: Vec<u64> = vec![0; SLICE_OFFSET];
-        padded_values.extend_from_slice(&raw_values);
-        let mut padded_valid: Vec<bool> = vec![false; SLICE_OFFSET];
-        padded_valid.extend_from_slice(&raw_valid);
-        let oversized = UInt64Array::new(
-            ScalarBuffer::from(padded_values),
-            Some(NullBuffer::from(padded_valid)),
-        );
-        use arrow_array::Array;
-        let sliced = oversized.slice(SLICE_OFFSET, n);
-        debug_assert_eq!(
-            sliced.nulls().map(|n| n.offset()).unwrap_or(0) % 8,
-            SLICE_OFFSET
-        );
-        sliced
-    };
-
-    Fixture {
-        values,
-        mask_aligned,
-        mask_unaligned,
-        arrow_arr,
-        arrow_arr_unaligned,
-    }
-}
-
-const CAST_OPTS: CastOptions<'static> = CastOptions {
-    safe: true,
-    format_options: arrow_cast::display::FormatOptions::new(),
-};
-
-const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions {
-    safe: false,
-    format_options: arrow_cast::display::FormatOptions::new(),
-};
-
-#[divan::bench(args = SIZES)]
-fn kernel_map_with_mask(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            // Owned uninit-slot vector, sized once outside the timed region.
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            // SAFETY: every lane is written before any read inside the kernel.
-            unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_aligned.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
-                (v * valid as u64) as u32
-            });
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn arrow_cast(bencher: Bencher, n: usize) {
-    let _ = n;
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| f.arrow_arr.clone())
-        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap());
-}
-
-#[divan::bench(args = SIZES)]
-fn arrow_cast_unaligned(bencher: Bencher, n: usize) {
-    let _ = n;
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| f.arrow_arr_unaligned.clone())
-        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap());
-}
-
-#[divan::bench(args = SIZES)]
-fn kernel_try_map_with_mask(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            // SAFETY: every lane is written before any read inside the kernel.
-            unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_aligned.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            })
-            .unwrap();
-        });
-}
-
-/// Same kernel, but the mask has `offset() == 5` so `BitChunks::iter()` must shift
-/// across byte boundaries on every chunk. Quantifies the cost of unaligned mask access.
-#[divan::bench(args = SIZES)]
-fn kernel_try_map_with_mask_unaligned(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_unaligned.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            })
-            .unwrap();
-        });
-}
-
-/// Aligned-mask counterpart for `map_with_mask` (infallible). Pair with the
-/// `_unaligned` variant below to isolate the mask-iteration cost from the closure.
-#[divan::bench(args = SIZES)]
-fn kernel_map_with_mask_unaligned(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_unaligned.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
-                (v * valid as u64) as u32
-            });
-        });
-}
-
-/// As above but with the branchful idiomatic form. Tests whether autovectorization
-/// survives a per-lane `if valid { ... } else { ... }` shape.
-#[divan::bench(args = SIZES)]
-fn kernel_try_from_branchful(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_aligned.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
-                if valid {
-                    u32::try_from(v).ok()
-                } else {
-                    Some(0_u32)
-                }
-            })
-            .unwrap();
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn iter_zip_checked(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| (f.values.clone(), f.mask_aligned.clone()))
-        .bench_refs(|(values, mask)| {
-            let buf: Buffer<u32> = BufferMut::try_from_trusted_len_iter(
-                values.iter().zip(mask.iter()).map(|(&v, valid)| {
-                    let scaled = v * valid as u64;
-                    if scaled <= u32::MAX as u64 {
-                        Ok(scaled as u32)
-                    } else {
-                        Err(())
-                    }
-                }),
-            )
-            .unwrap()
-            .freeze();
-            buf
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn iter_zip_checked_unaligned(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| (f.values.clone(), f.mask_unaligned.clone()))
-        .bench_refs(|(values, mask)| {
-            let buf: Buffer<u32> = BufferMut::try_from_trusted_len_iter(
-                values.iter().zip(mask.iter()).map(|(&v, valid)| {
-                    let scaled = v * valid as u64;
-                    if scaled <= u32::MAX as u64 {
-                        Ok(scaled as u32)
-                    } else {
-                        Err(())
-                    }
-                }),
-            )
-            .unwrap()
-            .freeze();
-            buf
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn arrow_cast_checked(bencher: Bencher, n: usize) {
-    let _ = n;
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| f.arrow_arr.clone())
-        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
-}
-
-#[divan::bench(args = SIZES)]
-fn arrow_cast_checked_unaligned(bencher: Bencher, n: usize) {
-    let _ = n;
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| f.arrow_arr_unaligned.clone())
-        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
-}
diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs
index b2abe29b890..d3baec7885c 100644
--- a/vortex-buffer/benches/cast_to_indexed.rs
+++ b/vortex-buffer/benches/cast_to_indexed.rs
@@ -1,37 +1,28 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-//! Mirror of `cast_to.rs` driving the kernels through [`vortex_buffer::lane_ops_indexed`]
-//! (the `IndexedSource` trait) plus isolation benches that decompose the cost of the
-//! kernel structure vs. the cast vs. the mask access.
+//! Focused bench for the **best fallible cast kernel** — what `cast.rs` actually uses
+//! in `vortex-array/src/arrays/primitive/compute/cast.rs`. Single bench, no cross-impl
+//! baselines: just a regression guard for the production cast hot path.
 //!
-//! See `vortex-buffer/HISTORY.md` for the iterator-API investigation that motivated
-//! this design: a stateful `ExactSizeIterator` variant of these kernels was ~+100%
-//! slower because per-lane `next()` calls create a 64-deep dependency chain across
-//! iterations that blocks vectorization. The `IndexedSource` trait uses
-//! `unsafe fn get_unchecked(i)` reads — independent across iterations — and inlines
-//! to the same indexed load as the slice kernel.
+//! The kernel: [`vortex_buffer::lane_ops_indexed::try_map_with_mask`] called with a
+//! lazy-validity `or_else` closure — for statically-infallible casts (widening) LLVM
+//! proves `NumCast::from` is always `Some`, the `or_else` branch is dead, and the
+//! validity path is DCE'd. For fallible casts (narrowing), validity is only consulted
+//! on the cold failure branch.
 
 #![expect(clippy::unwrap_used)]
 
 use std::mem::MaybeUninit;
 
-use arrow_array::UInt64Array;
-use arrow_buffer::NullBuffer;
-use arrow_buffer::ScalarBuffer;
-use arrow_cast::CastOptions;
-use arrow_cast::cast_with_options;
-use arrow_schema::DataType;
 use divan::Bencher;
+use num_traits::NumCast;
 use rand::SeedableRng;
 use rand::prelude::*;
 use vortex_buffer::BitBuffer;
 use vortex_buffer::BitBufferMut;
 use vortex_buffer::Buffer;
-use vortex_buffer::BufferMut;
-use vortex_buffer::lane_ops_indexed::map_with_mask as indexed_map_with_mask;
-use vortex_buffer::lane_ops_indexed::try_map_validity_filtered as indexed_try_map_validity_filtered;
-use vortex_buffer::lane_ops_indexed::try_map_with_mask as indexed_try_map_with_mask;
+use vortex_buffer::lane_ops_indexed::try_map_with_mask;
 
 fn main() {
     divan::main();
@@ -42,20 +33,9 @@ const VALID_RATE: f64 = 0.7;
 const DATA_SEED: u64 = 0;
 const VALID_SEED: u64 = 1;
 
-// Non-byte-aligned bit offset → forces BitChunks::iter() to shift across byte
-// boundaries on every chunk it yields.
-const SLICE_OFFSET: usize = 5;
-
 struct Fixture {
     values: Buffer<u64>,
-    /// `offset() == 0`, underlying byte buffer starts on a byte boundary.
-    mask_aligned: BitBuffer,
-    /// Same validity bits but sliced so `offset() == SLICE_OFFSET`.
-    mask_unaligned: BitBuffer,
-    arrow_arr: UInt64Array,
-    /// Same as `arrow_arr` but its NullBuffer has a non-byte-aligned bit offset,
-    /// constructed by building an oversized array and slicing.
-    arrow_arr_unaligned: UInt64Array,
+    mask: BitBuffer,
 }
 
 fn fixture(n: usize) -> Fixture {
@@ -67,8 +47,7 @@ fn fixture(n: usize) -> Fixture {
     let raw_valid: Vec<bool> = (0..n).map(|_| valid_rng.random_bool(VALID_RATE)).collect();
 
     let values: Buffer<u64> = raw_values.iter().copied().collect();
-
-    let mask_aligned = {
+    let mask = {
         let mut m = BitBufferMut::with_capacity(n);
         for &v in &raw_valid {
             m.append(v);
@@ -76,392 +55,26 @@ fn fixture(n: usize) -> Fixture {
         m.freeze()
     };
 
-    let mask_unaligned = {
-        let mut m = BitBufferMut::with_capacity(n + SLICE_OFFSET);
-        for _ in 0..SLICE_OFFSET {
-            m.append(false);
-        }
-        for &v in &raw_valid {
-            m.append(v);
-        }
-        m.freeze().slice(SLICE_OFFSET..SLICE_OFFSET + n)
-    };
-    debug_assert_eq!(mask_unaligned.offset(), SLICE_OFFSET);
-    debug_assert_eq!(mask_unaligned.len(), n);
-
-    let arrow_arr = UInt64Array::new(
-        ScalarBuffer::from(raw_values.clone()),
-        Some(NullBuffer::from(raw_valid.clone())),
-    );
-
-    let arrow_arr_unaligned = {
-        let mut padded_values: Vec<u64> = vec![0; SLICE_OFFSET];
-        padded_values.extend_from_slice(&raw_values);
-        let mut padded_valid: Vec<bool> = vec![false; SLICE_OFFSET];
-        padded_valid.extend_from_slice(&raw_valid);
-        let oversized = UInt64Array::new(
-            ScalarBuffer::from(padded_values),
-            Some(NullBuffer::from(padded_valid)),
-        );
-        use arrow_array::Array;
-        let sliced = oversized.slice(SLICE_OFFSET, n);
-        debug_assert_eq!(
-            sliced.nulls().map(|n| n.offset()).unwrap_or(0) % 8,
-            SLICE_OFFSET
-        );
-        sliced
-    };
-
-    Fixture {
-        values,
-        mask_aligned,
-        mask_unaligned,
-        arrow_arr,
-        arrow_arr_unaligned,
-    }
-}
-
-const CAST_OPTS: CastOptions<'static> = CastOptions {
-    safe: true,
-    format_options: arrow_cast::display::FormatOptions::new(),
-};
-
-const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions {
-    safe: false,
-    format_options: arrow_cast::display::FormatOptions::new(),
-};
-
-#[divan::bench(args = SIZES)]
-fn arrow_cast(bencher: Bencher, n: usize) {
-    let _ = n;
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| f.arrow_arr.clone())
-        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap());
-}
-
-#[divan::bench(args = SIZES)]
-fn arrow_cast_unaligned(bencher: Bencher, n: usize) {
-    let _ = n;
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| f.arrow_arr_unaligned.clone())
-        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS).unwrap());
-}
-
-#[divan::bench(args = SIZES)]
-fn iter_zip_checked(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| (f.values.clone(), f.mask_aligned.clone()))
-        .bench_refs(|(values, mask)| {
-            let buf: Buffer<u32> = BufferMut::try_from_trusted_len_iter(
-                values.iter().zip(mask.iter()).map(|(&v, valid)| {
-                    let scaled = v * valid as u64;
-                    if scaled <= u32::MAX as u64 {
-                        Ok(scaled as u32)
-                    } else {
-                        Err(())
-                    }
-                }),
-            )
-            .unwrap()
-            .freeze();
-            buf
-        });
+    Fixture { values, mask }
 }
 
+/// The kernel `cast.rs` uses in production: `try_map_with_mask` with a lazy-validity
+/// `or_else` closure. `NumCast::from(v)` is the cast; `or_else` only fires (and only
+/// then reads `valid`) when the cast itself returned `None`.
 #[divan::bench(args = SIZES)]
-fn iter_zip_checked_unaligned(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| (f.values.clone(), f.mask_unaligned.clone()))
-        .bench_refs(|(values, mask)| {
-            let buf: Buffer<u32> = BufferMut::try_from_trusted_len_iter(
-                values.iter().zip(mask.iter()).map(|(&v, valid)| {
-                    let scaled = v * valid as u64;
-                    if scaled <= u32::MAX as u64 {
-                        Ok(scaled as u32)
-                    } else {
-                        Err(())
-                    }
-                }),
-            )
-            .unwrap()
-            .freeze();
-            buf
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn arrow_cast_checked(bencher: Bencher, n: usize) {
-    let _ = n;
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| f.arrow_arr.clone())
-        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
-}
-
-#[divan::bench(args = SIZES)]
-fn arrow_cast_checked_unaligned(bencher: Bencher, n: usize) {
-    let _ = n;
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| f.arrow_arr_unaligned.clone())
-        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
-}
-
-// -----------------------------------------------------------------------------
-// Isolation benches: drop the mask, isolate the cast u64 -> u32 to see whether
-// the iterator cost is intrinsic or comes from the surrounding kernel structure.
-// -----------------------------------------------------------------------------
-
-/// Plain slice indexing, no mask. Upper bound on what the iter variants must beat.
-#[divan::bench(args = SIZES)]
-fn iso_slice_cast(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), out)
-        })
-        .bench_refs(|(values, out)| {
-            let v = values.as_slice();
-            let o = out.as_mut_slice();
-            assert_eq!(v.len(), o.len());
-            for i in 0..v.len() {
-                // SAFETY: bounds checked by the assert above.
-                unsafe { o.get_unchecked_mut(i).write(*v.get_unchecked(i) as u32) };
-            }
-        });
-}
-
-/// Per-lane iterator zip, no mask. Tests whether `slice::Iter::next` autovectorizes
-/// when nothing else is in the way.
-#[divan::bench(args = SIZES)]
-fn iso_iter_cast(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), out)
-        })
-        .bench_refs(|(values, out)| {
-            for (slot, &v) in out.iter_mut().zip(values.iter()) {
-                slot.write(v as u32);
-            }
-        });
-}
-
-/// `chunks_exact(64)` + `try_into::<&[u64; 64]>` so the outer iter advances once per
-/// 64 lanes and the inner loop indexes a fixed-size array. Tests whether moving the
-/// iterator state from per-lane to per-chunk fixes vectorization.
-#[divan::bench(args = SIZES)]
-fn iso_iter_chunks_64(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), out)
-        })
-        .bench_refs(|(values, out)| {
-            let v = values.as_slice();
-            let o = out.as_mut_slice();
-            assert_eq!(v.len(), o.len());
-            for (v_chunk, o_chunk) in v.chunks_exact(64).zip(o.chunks_exact_mut(64)) {
-                let v_arr: &[u64; 64] = v_chunk.try_into().unwrap();
-                let o_arr: &mut [MaybeUninit<u32>; 64] = o_chunk.try_into().unwrap();
-                for bit_idx in 0..64 {
-                    o_arr[bit_idx].write(v_arr[bit_idx] as u32);
-                }
-            }
-            // Ignore the tail — SIZES are all multiples of 64.
-        });
-}
-
-// -----------------------------------------------------------------------------
-// Indexed-source variant (lane_ops_indexed). The kernel takes an `IndexedSource` whose
-// `&[T]` impl is `unsafe fn get_unchecked(i) -> T` — same indexed load as the slice
-// kernel, but the trait also supports binary inputs via `LaneZip`.
-// -----------------------------------------------------------------------------
-
-#[divan::bench(args = SIZES)]
-fn indexed_kernel_map_with_mask(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_aligned.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            indexed_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
-                (v * valid as u64) as u32
-            });
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn indexed_kernel_map_with_mask_unaligned(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_unaligned.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            indexed_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
-                (v * valid as u64) as u32
-            });
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn indexed_kernel_try_map_with_mask(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_aligned.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            indexed_try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            })
-            .unwrap();
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn indexed_kernel_try_map_with_mask_unaligned(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_unaligned.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            indexed_try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            })
-            .unwrap();
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn indexed_kernel_try_from_branchful(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_aligned.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            indexed_try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
-                if valid {
-                    u32::try_from(v).ok()
-                } else {
-                    Some(0_u32)
-                }
-            })
-            .unwrap();
-        });
-}
-
-// -----------------------------------------------------------------------------
-// Decoupled-design variant with CORRECT validity semantics: closure is `|v|`
-// (no per-lane mask threading), but the mask filters out null-lane failures at
-// the chunk boundary. A null row whose stored value would overflow does NOT
-// cause Err — this matches the existing `try_map_with_mask` semantics while
-// keeping the lighter inner loop.
-// -----------------------------------------------------------------------------
-
-#[divan::bench(args = SIZES)]
-fn indexed_decoupled_kernel_try_map_with_mask(bencher: Bencher, n: usize) {
+fn cast_lazy_validity(bencher: Bencher, n: usize) {
     let f = fixture(n);
     bencher
         .with_inputs(|| {
             let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
             // SAFETY: every lane is written before any read inside the kernel.
             unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_aligned.clone(), out)
+            (f.values.clone(), f.mask.clone(), out)
         })
         .bench_refs(|(values, mask, out)| {
-            indexed_try_map_validity_filtered(values.as_slice(), mask, out.as_mut_slice(), |v| {
-                (v <= u32::MAX as u64).then_some(v as u32)
+            try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+                <u32 as NumCast>::from(v).or_else(|| (!valid).then(u32::default))
             })
             .unwrap();
         });
 }
-
-#[divan::bench(args = SIZES)]
-fn indexed_decoupled_kernel_try_from_branchful(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_aligned.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            indexed_try_map_validity_filtered(values.as_slice(), mask, out.as_mut_slice(), |v| {
-                u32::try_from(v).ok()
-            })
-            .unwrap();
-        });
-}
-
-/// Full checked-cast kernel using `chunks_exact(64)` + fixed-size array refs, with
-/// the mask. If this matches the slice kernel, the cost is in the per-lane iterator
-/// state, not the iter pattern in general.
-#[divan::bench(args = SIZES)]
-fn kernel_iter_chunks_64(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask_aligned.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            let v = values.as_slice();
-            let o = out.as_mut_slice();
-            let len = v.len();
-            assert_eq!(len, mask.len());
-            assert_eq!(len, o.len());
-
-            let chunks = mask.chunks();
-            let chunks_count = len / 64;
-            let full = chunks_count * 64;
-            let (v_full, _v_rem) = v.split_at(full);
-            let (o_full, _o_rem) = o.split_at_mut(full);
-
-            for ((v_chunk, o_chunk), src_chunk) in v_full
-                .chunks_exact(64)
-                .zip(o_full.chunks_exact_mut(64))
-                .zip(chunks.iter())
-            {
-                let v_arr: &[u64; 64] = v_chunk.try_into().unwrap();
-                let o_arr: &mut [MaybeUninit<u32>; 64] = o_chunk.try_into().unwrap();
-                let mut fail_acc: u64 = 0;
-                for bit_idx in 0..64 {
-                    let bit = (src_chunk >> bit_idx) & 1 == 1;
-                    let scaled = v_arr[bit_idx] * bit as u64;
-                    let opt = (scaled <= u32::MAX as u64).then_some(scaled as u32);
-                    fail_acc |= opt.is_none() as u64;
-                    o_arr[bit_idx].write(opt.unwrap_or_default());
-                }
-                assert_eq!(fail_acc, 0);
-            }
-            // Ignore the tail — SIZES are all multiples of 64.
-        });
-}
diff --git a/vortex-buffer/src/lane_ops.rs b/vortex-buffer/src/lane_ops.rs
deleted file mode 100644
index b145633465b..00000000000
--- a/vortex-buffer/src/lane_ops.rs
+++ /dev/null
@@ -1,713 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-//! Elementwise kernels that combine a `[T]` slice with a `BitBuffer` validity mask.
-//!
-//! The output is always a caller-provided `&mut` slice — these kernels never allocate.
-//! Both kernels handle a mask with a non-byte-aligned offset and with a logical `len`
-//! shorter than the underlying byte buffer, via [`BitBuffer::chunks`].
-
-use std::mem::MaybeUninit;
-
-use crate::BitBuffer;
-
-/// Apply `f(value, valid)` lane-by-lane, writing `out[i] = f(values[i], mask[i])`.
-///
-/// All three inputs must have the same length. The output type `R` may differ from the
-/// input type `T` — this kernel is the building block for both same-type transforms
-/// (fill_null) and cross-type ones (cast). The caller is responsible for marking `out`
-/// initialized (e.g. by calling `BufferMut::set_len` after this returns).
-///
-/// # Panics
-///
-/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
-#[inline]
-pub fn map_with_mask<T, R, F>(values: &[T], mask: &BitBuffer, out: &mut [MaybeUninit<R>], mut f: F)
-where
-    T: Copy,
-    F: FnMut(T, bool) -> R,
-{
-    let len = values.len();
-    assert_eq!(len, mask.len(), "values and mask must have the same length");
-    assert_eq!(out.len(), len, "out must have the same length as values");
-
-    let chunks = mask.chunks();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
-
-    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        let base = chunk_idx * 64;
-        // Inner loop is fixed-size 64 so the compiler can autovectorize
-        // for branchless closures like `|v, valid| v * (valid as T)`.
-        for bit_idx in 0..64 {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: chunks.iter() yields chunks_count full words, so i < chunks_count * 64 <= len.
-            let v = unsafe { *values.get_unchecked(i) };
-            unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
-        }
-    }
-
-    if remainder != 0 {
-        let src_chunk = chunks.remainder_bits();
-        let base = chunks_count * 64;
-        for bit_idx in 0..remainder {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: i = chunks_count * 64 + bit_idx < chunks_count * 64 + remainder = len.
-            let v = unsafe { *values.get_unchecked(i) };
-            unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
-        }
-    }
-}
-
-/// Fallible variant of [`map_with_mask`]. `f` returns `Option<R>`; `None` indicates a
-/// per-lane failure (e.g. range overflow on a narrowing cast).
-///
-/// The kernel does not short-circuit on the first failure inside a chunk: it processes
-/// whole 64-lane chunks with `is_none()` flags OR-reduced into a single accumulator,
-/// then checks after each chunk. On failure, a cold scalar attribution pass replays the
-/// closure over that chunk to identify the first failing lane. The hot loop stays
-/// autovectorizable — the per-lane cost is one OR on top of the cast.
-///
-/// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` write
-/// `R::default()` into `out`, but the contents of `out` must not be relied upon when
-/// this function returns `Err`.
-///
-/// # Panics
-///
-/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
-#[inline]
-pub fn try_map_with_mask<T, R, F>(
-    values: &[T],
-    mask: &BitBuffer,
-    out: &mut [MaybeUninit<R>],
-    mut f: F,
-) -> Result<(), usize>
-where
-    T: Copy,
-    R: Copy + Default,
-    F: FnMut(T, bool) -> Option<R>,
-{
-    let len = values.len();
-    assert_eq!(len, mask.len(), "values and mask must have the same length");
-    assert_eq!(out.len(), len, "out must have the same length as values");
-
-    let chunks = mask.chunks();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
-
-    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        let base = chunk_idx * 64;
-        // Per-chunk accumulator — does not escape the SIMD inner loop.
-        let mut fail_acc: u64 = 0;
-        for bit_idx in 0..64 {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: i < chunks_count * 64 <= len.
-            let v = unsafe { *values.get_unchecked(i) };
-            let opt = f(v, bit);
-            fail_acc |= opt.is_none() as u64;
-            let r = opt.unwrap_or_default();
-            // SAFETY: i < len.
-            unsafe { out.get_unchecked_mut(i).write(r) };
-        }
-        if fail_acc != 0 {
-            return Err(attribute_failure(values, src_chunk, base, 64, &mut f));
-        }
-    }
-
-    if remainder != 0 {
-        let src_chunk = chunks.remainder_bits();
-        let base = chunks_count * 64;
-        let mut fail_acc: u64 = 0;
-        for bit_idx in 0..remainder {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: i < len.
-            let v = unsafe { *values.get_unchecked(i) };
-            let opt = f(v, bit);
-            fail_acc |= opt.is_none() as u64;
-            let r = opt.unwrap_or_default();
-            // SAFETY: i < len.
-            unsafe { out.get_unchecked_mut(i).write(r) };
-        }
-        if fail_acc != 0 {
-            return Err(attribute_failure(
-                values, src_chunk, base, remainder, &mut f,
-            ));
-        }
-    }
-
-    Ok(())
-}
-
-/// Cold path: identify the first lane in a chunk where `f` returned `None`.
-///
-/// Called only after the hot loop has detected that at least one lane failed.
-/// Walks the chunk scalar-style; not autovectorized, but that's fine — it only
-/// runs once per error and the error path is supposed to be exceptional.
-#[cold]
-#[inline(never)]
-fn attribute_failure<T, R, F>(
-    values: &[T],
-    src_chunk: u64,
-    base: usize,
-    chunk_len: usize,
-    f: &mut F,
-) -> usize
-where
-    T: Copy,
-    F: FnMut(T, bool) -> Option<R>,
-{
-    for bit_idx in 0..chunk_len {
-        let i = base + bit_idx;
-        let bit = (src_chunk >> bit_idx) & 1 == 1;
-        // SAFETY: caller guarantees i < values.len().
-        let v = unsafe { *values.get_unchecked(i) };
-        if f(v, bit).is_none() {
-            return i;
-        }
-    }
-    // Unreachable: hot loop's OR-reduction said at least one lane in [base, base+chunk_len) failed.
-    unreachable!("attribute_failure called without a failing lane")
-}
-
-/// Apply `f(value) -> bool` lane-by-lane, packing into `out` as `u64` words.
-///
-/// This is the validity-free sibling of [`map_with_mask_to_bits`]. Use it when the
-/// predicate is a pure function of the value (e.g. compare-to-constant on a primitive
-/// buffer) and combine the validity bitmap in a separate pass — splitting the work
-/// this way lets the value-compare loop autovectorize cleanly.
-///
-/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word
-/// beyond `len % 64` are written as `0`.
-///
-/// # Panics
-///
-/// Panics if `out.len() != values.len().div_ceil(64)`.
-#[inline]
-pub fn map_to_bits<T, F>(values: &[T], out: &mut [u64], mut f: F)
-where
-    T: Copy,
-    F: FnMut(T) -> bool,
-{
-    let len = values.len();
-    assert_eq!(
-        out.len(),
-        len.div_ceil(64),
-        "out must have len.div_ceil(64) words",
-    );
-
-    let chunks_count = len / 64;
-    let remainder = len % 64;
-
-    for chunk_idx in 0..chunks_count {
-        let base = chunk_idx * 64;
-        let mut packed = 0u64;
-        for bit_idx in 0..64 {
-            // SAFETY: base + bit_idx < chunks_count * 64 <= len.
-            let v = unsafe { *values.get_unchecked(base + bit_idx) };
-            packed |= (f(v) as u64) << bit_idx;
-        }
-        // SAFETY: chunk_idx < chunks_count <= out.len().
-        unsafe { *out.get_unchecked_mut(chunk_idx) = packed };
-    }
-
-    if remainder != 0 {
-        let base = chunks_count * 64;
-        let mut packed = 0u64;
-        for bit_idx in 0..remainder {
-            // SAFETY: base + bit_idx < len.
-            let v = unsafe { *values.get_unchecked(base + bit_idx) };
-            packed |= (f(v) as u64) << bit_idx;
-        }
-        // SAFETY: chunks_count < out.len() because remainder != 0.
-        unsafe { *out.get_unchecked_mut(chunks_count) = packed };
-    }
-}
-
-/// Apply `f(value, valid) -> bool` lane-by-lane, packing into `out` as `u64` words.
-///
-/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word
-/// beyond `len % 64` are written as `0`.
-///
-/// # Panics
-///
-/// Panics if `values.len() != mask.len()` or `out.len() != values.len().div_ceil(64)`.
-#[inline]
-pub fn map_with_mask_to_bits<T, F>(values: &[T], mask: &BitBuffer, out: &mut [u64], mut f: F)
-where
-    T: Copy,
-    F: FnMut(T, bool) -> bool,
-{
-    let len = values.len();
-    assert_eq!(len, mask.len(), "values and mask must have the same length");
-    assert_eq!(
-        out.len(),
-        len.div_ceil(64),
-        "out must have len.div_ceil(64) words",
-    );
-
-    let chunks = mask.chunks();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
-
-    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        let base = chunk_idx * 64;
-        let mut packed = 0u64;
-        for bit_idx in 0..64 {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: i < chunks_count * 64 <= len.
-            let v = unsafe { *values.get_unchecked(i) };
-            packed |= (f(v, bit) as u64) << bit_idx;
-        }
-        // SAFETY: chunk_idx < chunks_count <= out.len().
-        unsafe { *out.get_unchecked_mut(chunk_idx) = packed };
-    }
-
-    if remainder != 0 {
-        let src_chunk = chunks.remainder_bits();
-        let base = chunks_count * 64;
-        let mut packed = 0u64;
-        for bit_idx in 0..remainder {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: i < len.
-            let v = unsafe { *values.get_unchecked(i) };
-            packed |= (f(v, bit) as u64) << bit_idx;
-        }
-        // SAFETY: chunks_count < out.len() because remainder != 0.
-        unsafe { *out.get_unchecked_mut(chunks_count) = packed };
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::BitBufferMut;
-
-    fn write_t<T: Copy>(out: Vec<MaybeUninit<T>>) -> Vec<T> {
-        // SAFETY: tests always fully initialize the buffer.
-        unsafe { std::mem::transmute(out) }
-    }
-
-    #[test]
-    fn map_with_mask_aligned() {
-        let values: Vec<i32> = (0..10).collect();
-        let mask = {
-            let mut m = BitBufferMut::with_capacity(10);
-            for i in 0..10 {
-                m.append(i % 2 == 0);
-            }
-            m.freeze()
-        };
-        let mut out = vec![MaybeUninit::<i32>::uninit(); 10];
-        map_with_mask(
-            &values,
-            &mask,
-            &mut out,
-            |v, valid| if valid { v } else { -1 },
-        );
-        assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]);
-    }
-
-    #[test]
-    fn map_with_mask_partial_chunk() {
-        // 130 lanes — two full u64 words + a 2-bit remainder.
-        let values: Vec<i32> = (0..130).collect();
-        let mask = BitBuffer::new_set(130);
-        let mut out = vec![MaybeUninit::<i32>::uninit(); 130];
-        map_with_mask(
-            &values,
-            &mask,
-            &mut out,
-            |v, valid| if valid { v + 1 } else { 0 },
-        );
-        let got = write_t(out);
-        assert_eq!(got.len(), 130);
-        assert_eq!(got[0], 1);
-        assert_eq!(got[63], 64);
-        assert_eq!(got[64], 65);
-        assert_eq!(got[129], 130);
-    }
-
-    #[test]
-    fn map_with_mask_offset_mask() {
-        // Build a 128-bit all-true mask, then slice off the first 5 bits to force offset=5.
-        let big = BitBuffer::new_set(128);
-        let sliced = big.slice(5..70); // logical len = 65, offset = 5
-        assert_eq!(sliced.len(), 65);
-        assert_eq!(sliced.offset(), 5);
-
-        let values: Vec<u32> = (0..65).collect();
-        let mut out = vec![MaybeUninit::<u32>::uninit(); 65];
-        map_with_mask(
-            &values,
-            &sliced,
-            &mut out,
-            |v, valid| if valid { v } else { u32::MAX },
-        );
-        let got = write_t(out);
-        assert_eq!(got, (0..65).collect::<Vec<u32>>());
-    }
-
-    #[test]
-    fn map_with_mask_offset_past_word() {
-        // Slicing past a full word still works. `BitBuffer::slice` normalizes the
-        // logical offset to `offset % 8` and bumps the underlying byte pointer,
-        // so `offset()` won't equal 70 here — what we exercise is that the kernel
-        // walks the chunked u64 view (which BitChunks handles internally).
-        let big = BitBuffer::new_set(256);
-        let sliced = big.slice(70..200);
-        assert_eq!(sliced.len(), 130);
-
-        let values: Vec<i16> = (0..130).map(|i| i as i16).collect();
-        let mut out = vec![MaybeUninit::<i16>::uninit(); 130];
-        map_with_mask(
-            &values,
-            &sliced,
-            &mut out,
-            |v, valid| if valid { v } else { -1 },
-        );
-        let got = write_t(out);
-        assert_eq!(got, (0..130).map(|i| i as i16).collect::<Vec<_>>());
-    }
-
-    #[test]
-    fn map_with_mask_empty() {
-        let values: Vec<i32> = vec![];
-        let mask = BitBuffer::new_unset(0);
-        let mut out: Vec<MaybeUninit<i32>> = vec![];
-        map_with_mask(&values, &mask, &mut out, |v, _| v);
-    }
-
-    #[test]
-    fn map_with_mask_null_to_zero_branchless() {
-        // The trick from primitive/compute/cast.rs:147 — multiply by valid as T.
-        let values: Vec<i64> = (1..=100).collect();
-        let mask = {
-            let mut m = BitBufferMut::with_capacity(100);
-            for i in 0..100 {
-                m.append(i % 3 != 0);
-            }
-            m.freeze()
-        };
-        let mut out = vec![MaybeUninit::<i64>::uninit(); 100];
-        map_with_mask(&values, &mask, &mut out, |v, valid| v * (valid as i64));
-        let got = write_t(out);
-        for (i, &x) in got.iter().enumerate() {
-            if i % 3 == 0 {
-                assert_eq!(x, 0);
-            } else {
-                assert_eq!(x, (i + 1) as i64);
-            }
-        }
-    }
-
-    #[test]
-    fn map_with_mask_to_bits_aligned() {
-        let values: Vec<i32> = (0..128).collect();
-        let mask = BitBuffer::new_set(128);
-        let mut out = vec![0u64; 2];
-        map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v % 2 == 0);
-        // Even numbers in [0, 128) set, odd unset.
-        for word_idx in 0..2 {
-            let word = out[word_idx];
-            for bit in 0..64 {
-                let i = word_idx * 64 + bit;
-                let expected = i % 2 == 0;
-                assert_eq!((word >> bit) & 1 == 1, expected, "lane {i}");
-            }
-        }
-    }
-
-    #[test]
-    fn map_with_mask_to_bits_partial_chunk() {
-        // 130 lanes — three u64 words, last word has only 2 valid bits.
-        let values: Vec<i32> = (0..130).collect();
-        let mask = BitBuffer::new_set(130);
-        let mut out = vec![0u64; 130usize.div_ceil(64)];
-        assert_eq!(out.len(), 3);
-        map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v >= 64);
-        // Bits 64..128 set in word 1; bits 128..130 set in word 2.
-        assert_eq!(out[0], 0);
-        assert_eq!(out[1], u64::MAX);
-        assert_eq!(out[2], 0b11);
-    }
-
-    #[test]
-    fn map_with_mask_to_bits_offset() {
-        let big = BitBuffer::new_set(256);
-        let sliced = big.slice(13..143); // offset=13, len=130
-        assert_eq!(sliced.len(), 130);
-        let values: Vec<u8> = (0..130).map(|i| (i % 4) as u8).collect();
-        let mut out = vec![0u64; 130usize.div_ceil(64)];
-        map_with_mask_to_bits(&values, &sliced, &mut out, |v, valid| valid && v == 0);
-        for i in 0..130 {
-            let word = out[i / 64];
-            let bit = (word >> (i % 64)) & 1 == 1;
-            assert_eq!(bit, i % 4 == 0, "lane {i}");
-        }
-    }
-
-    #[test]
-    fn try_map_with_mask_all_ok() {
-        let values: Vec<u64> = (0..200).collect();
-        let mask = BitBuffer::new_set(200);
-        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
-        assert!(res.is_ok());
-        let got = write_t(out);
-        assert_eq!(got, (0..200u32).collect::<Vec<_>>());
-    }
-
-    #[test]
-    fn try_map_with_mask_overflow_fails() {
-        // Put an overflowing value at lane 137 — the kernel must report Err(137).
-        let mut values: Vec<u64> = (0..200).collect();
-        values[137] = (u32::MAX as u64) + 1;
-        let mask = BitBuffer::new_set(200);
-        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
-        assert_eq!(res, Err(137));
-    }
-
-    #[test]
-    fn try_map_with_mask_overflow_reports_first_failing_lane() {
-        // Multiple failing lanes — must report the lowest index.
-        let mut values: Vec<u64> = (0..200).collect();
-        values[50] = u64::MAX;
-        values[51] = u64::MAX;
-        values[137] = u64::MAX;
-        let mask = BitBuffer::new_set(200);
-        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
-        assert_eq!(res, Err(50));
-    }
-
-    #[test]
-    fn try_map_with_mask_null_lane_bypasses_check() {
-        // Null lanes are neutralized by `valid as u64` before the range check, so an
-        // out-of-range value at a null lane must NOT trigger failure.
-        let mut values: Vec<u64> = (0..200).collect();
-        values[5] = u64::MAX;
-        let mask = {
-            let mut m = BitBufferMut::with_capacity(200);
-            for i in 0..200 {
-                m.append(i != 5);
-            }
-            m.freeze()
-        };
-        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
-        assert!(res.is_ok());
-        let got = write_t(out);
-        assert_eq!(got[5], 0); // null-lane wrote default
-        assert_eq!(got[6], 6);
-    }
-
-    #[test]
-    fn try_map_with_mask_branchful_matches_branchless() {
-        let mut values: Vec<u64> = (0..130).map(|i| i as u64 * 7).collect();
-        values[2] = u64::MAX;
-        values[65] = u32::MAX as u64;
-        let mask = {
-            let mut m = BitBufferMut::with_capacity(130);
-            for i in 0..130 {
-                m.append(!matches!(i, 2 | 17 | 99));
-            }
-            m.freeze()
-        };
-
-        let mut branchless = vec![MaybeUninit::<u32>::uninit(); 130];
-        let mut branchful = vec![MaybeUninit::<u32>::uninit(); 130];
-        try_map_with_mask(&values, &mask, &mut branchless, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        })
-        .unwrap();
-        try_map_with_mask(&values, &mask, &mut branchful, |v, valid| {
-            if valid {
-                u32::try_from(v).ok()
-            } else {
-                Some(0)
-            }
-        })
-        .unwrap();
-
-        assert_eq!(write_t(branchful), write_t(branchless));
-    }
-
-    #[test]
-    fn try_map_with_mask_partial_chunk() {
-        let values: Vec<u64> = (0..130).collect();
-        let mask = BitBuffer::new_set(130);
-        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
-        assert!(res.is_ok());
-        let got = write_t(out);
-        assert_eq!(got.len(), 130);
-        assert_eq!(got[129], 129);
-    }
-
-    #[test]
-    fn try_map_with_mask_sliced_mask_unaligned_offset() {
-        // The mask's first byte is not word-aligned: slice off 13 bits, so the
-        // underlying BitChunks iterator must shift across byte boundaries on every
-        // 64-bit chunk it yields.
-        let big = BitBuffer::new_set(256);
-        let mask = big.slice(13..143); // logical len = 130, bit offset = 13 % 8 = 5
-        assert_eq!(mask.len(), 130);
-
-        let values: Vec<u64> = (0..130).collect();
-        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
-        assert!(res.is_ok());
-        let got = write_t(out);
-        assert_eq!(got, (0..130u32).collect::<Vec<_>>());
-    }
-
-    #[test]
-    fn try_map_with_mask_sliced_mask_with_overflow() {
-        // Sliced mask + overflowing value — the cold attribution path must report
-        // the correct lane index in the sliced (post-offset) coordinate space.
-        let big = BitBuffer::new_set(256);
-        let mask = big.slice(13..143);
-        assert_eq!(mask.len(), 130);
-
-        let mut values: Vec<u64> = (0..130).collect();
-        values[77] = u64::MAX;
-        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
-        assert_eq!(res, Err(77));
-    }
-
-    #[test]
-    fn try_map_with_mask_sliced_mask_null_lanes() {
-        // Mix sliced offset with a non-trivial validity pattern. Null lanes must
-        // not contribute to fail_acc, even when their underlying value would overflow.
-        let mut m = BitBufferMut::with_capacity(256);
-        for i in 0..256 {
-            m.append(i % 3 != 0);
-        }
-        let big = m.freeze();
-        let mask = big.slice(13..143);
-        assert_eq!(mask.len(), 130);
-
-        // After the 13-lane slice, original index `13 + j` becomes lane `j`.
-        // Lane `j` is valid iff `(13 + j) % 3 != 0`.
-        let mut values: Vec<u64> = (0..130).collect();
-        // Pick a lane that is INVALID in the sliced coords: 13+2 = 15, 15 % 3 == 0 → invalid.
-        // Stuff in an overflowing value; it must be neutralized by `* valid as u64`.
-        values[2] = u64::MAX;
-        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
-        assert!(res.is_ok(), "null lane should bypass the range check");
-    }
-
-    #[test]
-    fn try_map_with_mask_overflow_in_remainder() {
-        // Overflow in the trailing partial chunk (not aligned to 64).
-        let mut values: Vec<u64> = (0..130).collect();
-        values[129] = (u32::MAX as u64) + 1;
-        let mask = BitBuffer::new_set(130);
-        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = try_map_with_mask(&values, &mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
-        assert_eq!(res, Err(129));
-    }
-
-    #[test]
-    fn map_to_bits_aligned() {
-        let values: Vec<i32> = (0..128).collect();
-        let mut out = vec![0u64; 2];
-        map_to_bits(&values, &mut out, |v| v % 2 == 0);
-        for word_idx in 0..2 {
-            for bit in 0..64 {
-                let i = word_idx * 64 + bit;
-                let expected = i % 2 == 0;
-                assert_eq!((out[word_idx] >> bit) & 1 == 1, expected, "lane {i}");
-            }
-        }
-    }
-
-    #[test]
-    fn map_to_bits_partial_chunk() {
-        let values: Vec<i32> = (0..130).collect();
-        let mut out = vec![0u64; 130usize.div_ceil(64)];
-        assert_eq!(out.len(), 3);
-        map_to_bits(&values, &mut out, |v| v >= 64);
-        assert_eq!(out[0], 0);
-        assert_eq!(out[1], u64::MAX);
-        assert_eq!(out[2], 0b11);
-    }
-
-    #[test]
-    fn map_to_bits_empty() {
-        let values: Vec<i32> = vec![];
-        let mut out: Vec<u64> = vec![];
-        map_to_bits(&values, &mut out, |v| v > 0);
-    }
-
-    #[test]
-    fn map_to_bits_matches_fused_with_all_valid_mask() {
-        // map_to_bits + AND with an all-true mask must equal map_with_mask_to_bits.
-        let values: Vec<i64> = (0..200).map(|i| i % 7).collect();
-        let mask = BitBuffer::new_set(200);
-
-        let mut a = vec![0u64; 200usize.div_ceil(64)];
-        map_with_mask_to_bits(&values, &mask, &mut a, |v, valid| valid && v == 3);
-
-        let mut b = vec![0u64; 200usize.div_ceil(64)];
-        map_to_bits(&values, &mut b, |v| v == 3);
-
-        assert_eq!(a, b);
-    }
-
-    #[test]
-    fn map_with_mask_to_bits_validity_kills_lane() {
-        // Even if predicate is true, null lanes should produce false.
-        let values: Vec<i32> = vec![1; 70];
-        let mask = {
-            let mut m = BitBufferMut::with_capacity(70);
-            for i in 0..70 {
-                m.append(i >= 32); // first 32 lanes are null
-            }
-            m.freeze()
-        };
-        let mut out = vec![0u64; 70usize.div_ceil(64)];
-        map_with_mask_to_bits(&values, &mask, &mut out, |v, valid| valid && v == 1);
-        for i in 0..70 {
-            let bit = (out[i / 64] >> (i % 64)) & 1 == 1;
-            assert_eq!(bit, i >= 32, "lane {i}");
-        }
-    }
-}
diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs
index c83114d8bcd..dfd2c41fd4a 100644
--- a/vortex-buffer/src/lane_ops_indexed.rs
+++ b/vortex-buffer/src/lane_ops_indexed.rs
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-//! Indexed-source variant of [`crate::lane_ops`].
+//! Elementwise lane kernels over indexed sources.
 //!
 //! Replaces `&[T]` with an [`IndexedSource`] trait: each lane read is
 //! `unsafe fn get_unchecked(i) -> Item`, independent across iterations. For `&[T]`
@@ -16,6 +16,8 @@
 //! Both kernels handle a mask with a non-byte-aligned offset and with a logical `len`
 //! shorter than the underlying byte buffer, via [`BitBuffer::chunks`].
 
+#![allow(clippy::many_single_char_names)]
+
 use std::mem::MaybeUninit;
 
 use crate::BitBuffer;
@@ -105,7 +107,11 @@ impl<A: IndexedSource, B: IndexedSource> LaneZip<A, B> {
     ///
     /// Panics if the two operands have different lengths.
     pub fn new(a: A, b: B) -> Self {
-        assert_eq!(a.len(), b.len(), "LaneZip operands must have the same length");
+        assert_eq!(
+            a.len(),
+            b.len(),
+            "LaneZip operands must have the same length"
+        );
         Self(a, b)
     }
 }
@@ -120,12 +126,7 @@ impl<A: IndexedSource, B: IndexedSource> IndexedSource for LaneZip<A, B> {
     #[inline]
     unsafe fn get_unchecked(&self, i: usize) -> (A::Item, B::Item) {
         // SAFETY: caller guarantees i < self.len(); `new` enforces matching lengths.
-        unsafe {
-            (
-                self.0.get_unchecked(i),
-                self.1.get_unchecked(i),
-            )
-        }
+        unsafe { (self.0.get_unchecked(i), self.1.get_unchecked(i)) }
     }
 }
 
@@ -140,12 +141,8 @@ impl<A: IndexedSource, B: IndexedSource> IndexedSource for LaneZip<A, B> {
 ///
 /// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
 #[inline]
-pub fn map_with_mask<S, R, F>(
-    values: S,
-    mask: &BitBuffer,
-    out: &mut [MaybeUninit<R>],
-    mut f: F,
-) where
+pub fn map_with_mask<S, R, F>(values: S, mask: &BitBuffer, out: &mut [MaybeUninit<R>], mut f: F)
+where
     S: IndexedSource,
     F: FnMut(S::Item, bool) -> R,
 {
@@ -481,12 +478,7 @@ where
 /// Cold attribution for the no-mask variant.
 #[cold]
 #[inline(never)]
-fn attribute_failure_no_mask<S, R, F>(
-    values: &S,
-    base: usize,
-    chunk_len: usize,
-    f: &mut F,
-) -> usize
+fn attribute_failure_no_mask<S, R, F>(values: &S, base: usize, chunk_len: usize, f: &mut F) -> usize
 where
     S: IndexedSource,
     F: FnMut(S::Item) -> Option<R>,
@@ -608,6 +600,7 @@ where
 ///
 /// Panics if `values.len() != mask.len()`.
 #[inline]
+#[allow(clippy::cast_possible_truncation)]
 pub fn try_map_with_mask_in_place<S, F>(
     mut values: S,
     mask: &BitBuffer,
@@ -780,6 +773,7 @@ where
 }
 
 #[cfg(test)]
+#[allow(clippy::cast_possible_truncation)]
 mod tests {
     use super::*;
     use crate::BitBufferMut;
@@ -800,12 +794,9 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<i32>::uninit(); 10];
-        map_with_mask(
-            values.as_slice(),
-            &mask,
-            &mut out,
-            |v, valid| if valid { v } else { -1 },
-        );
+        map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+            if valid { v } else { -1 }
+        });
         assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]);
     }
 
@@ -815,12 +806,9 @@ mod tests {
         let values: Vec<i32> = (0..130).collect();
         let mask = BitBuffer::new_set(130);
         let mut out = vec![MaybeUninit::<i32>::uninit(); 130];
-        map_with_mask(
-            values.as_slice(),
-            &mask,
-            &mut out,
-            |v, valid| if valid { v + 1 } else { 0 },
-        );
+        map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+            if valid { v + 1 } else { 0 }
+        });
         let got = write_t(out);
         assert_eq!(got.len(), 130);
         assert_eq!(got[0], 1);
@@ -839,12 +827,9 @@ mod tests {
 
         let values: Vec<u32> = (0..65).collect();
         let mut out = vec![MaybeUninit::<u32>::uninit(); 65];
-        map_with_mask(
-            values.as_slice(),
-            &sliced,
-            &mut out,
-            |v, valid| if valid { v } else { u32::MAX },
-        );
+        map_with_mask(values.as_slice(), &sliced, &mut out, |v, valid| {
+            if valid { v } else { u32::MAX }
+        });
         let got = write_t(out);
         assert_eq!(got, (0..65).collect::<Vec<u32>>());
     }
@@ -861,12 +846,9 @@ mod tests {
 
         let values: Vec<i16> = (0..130).map(|i| i as i16).collect();
         let mut out = vec![MaybeUninit::<i16>::uninit(); 130];
-        map_with_mask(
-            values.as_slice(),
-            &sliced,
-            &mut out,
-            |v, valid| if valid { v } else { -1 },
-        );
+        map_with_mask(values.as_slice(), &sliced, &mut out, |v, valid| {
+            if valid { v } else { -1 }
+        });
         let got = write_t(out);
         assert_eq!(got, (0..130).map(|i| i as i16).collect::<Vec<_>>());
     }
@@ -891,7 +873,9 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<i64>::uninit(); 100];
-        map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| v * (valid as i64));
+        map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+            v * (valid as i64)
+        });
         let got = write_t(out);
         for (i, &x) in got.iter().enumerate() {
             if i % 3 == 0 {
@@ -907,7 +891,9 @@ mod tests {
         let values: Vec<i32> = (0..128).collect();
         let mask = BitBuffer::new_set(128);
         let mut out = vec![0u64; 2];
-        map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| valid && v % 2 == 0);
+        map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| {
+            valid && v % 2 == 0
+        });
         // Even numbers in [0, 128) set, odd unset.
         for word_idx in 0..2 {
             let word = out[word_idx];
@@ -926,7 +912,9 @@ mod tests {
         let mask = BitBuffer::new_set(130);
         let mut out = vec![0u64; 130usize.div_ceil(64)];
         assert_eq!(out.len(), 3);
-        map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| valid && v >= 64);
+        map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| {
+            valid && v >= 64
+        });
         // Bits 64..128 set in word 1; bits 128..130 set in word 2.
         assert_eq!(out[0], 0);
         assert_eq!(out[1], u64::MAX);
@@ -940,7 +928,9 @@ mod tests {
         assert_eq!(sliced.len(), 130);
         let values: Vec<u8> = (0..130).map(|i| (i % 4) as u8).collect();
         let mut out = vec![0u64; 130usize.div_ceil(64)];
-        map_with_mask_to_bits(values.as_slice(),&sliced, &mut out, |v, valid| valid && v == 0);
+        map_with_mask_to_bits(values.as_slice(), &sliced, &mut out, |v, valid| {
+            valid && v == 0
+        });
         for i in 0..130 {
             let word = out[i / 64];
             let bit = (word >> (i % 64)) & 1 == 1;
@@ -1008,13 +998,13 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_validity_filtered(
-            values.as_slice(),
-            &mask,
-            &mut out,
-            |v| (v <= u32::MAX as u64).then_some(v as u32),
+        let res = try_map_validity_filtered(values.as_slice(), &mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
+        assert!(
+            res.is_ok(),
+            "null-lane overflow should not propagate as Err"
         );
-        assert!(res.is_ok(), "null-lane overflow should not propagate as Err");
     }
 
     #[test]
@@ -1035,12 +1025,9 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_validity_filtered(
-            values.as_slice(),
-            &mask,
-            &mut out,
-            |v| (v <= u32::MAX as u64).then_some(v as u32),
-        );
+        let res = try_map_validity_filtered(values.as_slice(), &mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
         assert_eq!(res, Err(77));
     }
 
@@ -1193,6 +1180,124 @@ mod tests {
         assert_eq!(res, Err(129));
     }
 
+    #[test]
+    fn map_with_mask_in_place_basic() {
+        let mut values: Vec<u32> = (0..130).collect(); // 130 = 2 * 64 + 2 lanes
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(130);
+            for i in 0..130 {
+                m.append(i % 2 == 0); // even lanes valid, odd lanes invalid
+            }
+            m.freeze()
+        };
+        map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| {
+            v.wrapping_mul(valid as u32) // `valid as u32` is 0 or 1: invalid lanes become 0
+        });
+        let expected: Vec<u32> = (0..130u32)
+            .map(|v| if v % 2 == 0 { v } else { 0 })
+            .collect();
+        assert_eq!(values, expected); // results are read back from `values` itself
+    }
+
+    #[test]
+    fn try_map_with_mask_in_place_all_ok() {
+        let mut values: Vec<u32> = (0..200).collect();
+        let mask = BitBuffer::new_set(200); // all 200 lanes valid
+        let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| {
+            let scaled = v.wrapping_mul(valid as u32); // no-op here: every lane is valid
+            scaled.checked_mul(2) // largest input is 199, so doubling never overflows
+        });
+        assert!(res.is_ok());
+        let expected: Vec<u32> = (0..200u32).map(|v| v * 2).collect();
+        assert_eq!(values, expected); // every lane was doubled in place
+    }
+
+    #[test]
+    fn try_map_with_mask_in_place_first_failing_chunk_wins() {
+        let mut values: Vec<u32> = (0..200).collect();
+        values[83] = u32::MAX; // `u32::MAX.checked_mul(2)` is `None`: first failing lane
+        values[150] = u32::MAX; // later failure (a later chunk, assuming 64-lane chunks)
+        let mask = BitBuffer::new_set(200);
+        let res =
+            try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2));
+        assert_eq!(res, Err(83)); // `Err` carries the lowest failing lane index
+    }
+
+    #[test]
+    fn try_map_with_mask_in_place_within_chunk_reports_lowest() {
+        let mut values: Vec<u32> = (0..200).collect();
+        values[80] = u32::MAX; // both failing lanes sit in 64..128
+        values[100] = u32::MAX; // (one chunk, assuming 64-lane chunks — TODO confirm)
+        let mask = BitBuffer::new_set(200);
+        let res =
+            try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2));
+        assert_eq!(res, Err(80)); // the lower of the two failing lanes
+    }
+
+    #[test]
+    fn try_map_with_mask_in_place_single_failure_lane_exact() {
+        let mut values: Vec<u32> = (0..200).collect();
+        values[42] = u32::MAX; // the only lane for which `checked_mul(2)` returns `None`
+        let mask = BitBuffer::new_set(200);
+        let res =
+            try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2));
+        assert_eq!(res, Err(42)); // the error is the exact lane index
+    }
+
+    #[test]
+    fn try_map_with_mask_in_place_null_bypass() {
+        let mut values: Vec<u32> = (0..200).collect();
+        values[5] = u32::MAX; // would overflow when doubled, but lane 5 is masked out below
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(200);
+            for i in 0..200 {
+                m.append(i != 5); // lane 5 is the only invalid lane
+            }
+            m.freeze()
+        };
+        let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| {
+            v.wrapping_mul(valid as u32).checked_mul(2) // zeroes invalid lanes first
+        });
+        assert!(res.is_ok());
+        assert_eq!(values[5], 0); // invalid lane: 0 * 2
+        assert_eq!(values[6], 12); // valid lane: 6 * 2
+    }
+
+    #[test]
+    fn try_map_with_mask_in_place_remainder_overflow() {
+        let mut values: Vec<u32> = (0..130).collect(); // 130 = 2 * 64 + 2 lanes
+        values[129] = u32::MAX; // failure in the final lane, past the last multiple of 64
+        let mask = BitBuffer::new_set(130);
+        let res =
+            try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2));
+        assert_eq!(res, Err(129)); // the failing lane index is still reported exactly
+    }
+
+    #[test]
+    fn try_map_with_mask_in_place_sliced_mask() {
+        let big = BitBuffer::new_set(256);
+        let mask = big.slice(13..143); // 130 bits starting at bit 13 (13 % 8 != 0)
+        assert_eq!(mask.len(), 130);
+
+        let mut values: Vec<u32> = (0..130).collect();
+        values[77] = u32::MAX; // index into `values`, i.e. relative to the sliced mask
+        let res =
+            try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2));
+        assert_eq!(res, Err(77)); // reported in the same sliced coordinates
+    }
+
+    #[test]
+    fn try_map_with_mask_in_place_partial_chunk_success() {
+        let mut values: Vec<u32> = (0..130).collect(); // 130 = 2 * 64 + 2 lanes
+        let mask = BitBuffer::new_set(130);
+        let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| Some(v + 1));
+        assert!(res.is_ok());
+        assert_eq!(values[0], 1);
+        assert_eq!(values[63], 64); // lane just below the first multiple of 64
+        assert_eq!(values[64], 65); // lane at the first multiple of 64
+        assert_eq!(values[129], 130); // final lane, beyond the last multiple of 64
+    }
+
     #[test]
     fn map_to_bits_aligned() {
         let values: Vec<i32> = (0..128).collect();
@@ -1252,7 +1357,9 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![0u64; 70usize.div_ceil(64)];
-        map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| valid && v == 1);
+        map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| {
+            valid && v == 1
+        });
         for i in 0..70 {
             let bit = (out[i / 64] >> (i % 64)) & 1 == 1;
             assert_eq!(bit, i >= 32, "lane {i}");
diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs
index a4519ac62ec..5fe7a4cf40d 100644
--- a/vortex-buffer/src/lib.rs
+++ b/vortex-buffer/src/lib.rs
@@ -52,7 +52,6 @@ pub use buffer::*;
 pub use buffer_mut::*;
 pub use bytes::*;
 pub use r#const::*;
-pub use lane_ops::*;
 pub use string::*;
 mod alignment;
 #[cfg(feature = "arrow")]
@@ -63,12 +62,6 @@ mod buffer_mut;
 mod bytes;
 mod r#const;
 mod debug;
-mod lane_ops;
-/// Indexed-source variant of [`lane_ops`]: takes an `IndexedSource` trait whose
-/// implementations expose `unsafe fn get_unchecked(i) -> Item`. `&[T]` impls inline
-/// to the same indexed load as the slice kernel, but the trait also admits binary
-/// inputs via `LaneZip`. See `HISTORY.md` for the iterator-API investigation that
-/// led to this design.
 pub mod lane_ops_indexed;
 mod macros;
 #[cfg(feature = "memmap2")]

From 502a2861f11b842c4b85927a307e4782adb94415 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 11:22:36 +0100
Subject: [PATCH 04/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/arrays/primitive/compute/cast.rs      | 23 +++++--
 vortex-buffer/benches/cast_to_indexed.rs      | 63 +++++++++++++++++--
 2 files changed, 76 insertions(+), 10 deletions(-)

diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs
index dd5abc2f164..edb5ced01b9 100644
--- a/vortex-array/src/arrays/primitive/compute/cast.rs
+++ b/vortex-array/src/arrays/primitive/compute/cast.rs
@@ -4,6 +4,7 @@
 use num_traits::NumCast;
 use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
+use vortex_buffer::lane_ops_indexed::map_no_validity;
 use vortex_buffer::lane_ops_indexed::try_map_no_validity;
 use vortex_buffer::lane_ops_indexed::try_map_with_mask;
 use vortex_error::VortexResult;
@@ -125,7 +126,6 @@ where
     T: NativePType,
 {
     let values = array.as_slice::<F>();
-    let mask = array.validity()?.execute_mask(array.len(), ctx)?;
     let overflow = || {
         vortex_err!(
             Compute: "Cannot cast {} to {} — value exceeds target range",
@@ -133,6 +133,22 @@ where
         )
     };
 
+    // If this cast doesn't fail use the unchecked casting variant
+    let target_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable);
+    if cached_values_fit_in(array, &target_dtype) == Some(true) {
+        let mut buffer = BufferMut::<T>::with_capacity(values.len());
+        map_no_validity(
+            values,
+            &mut buffer.spare_capacity_mut()[..values.len()],
+            v.as_(), // |v| <T as NumCast>::from(v).unwrap_or_default(),
+        );
+        // SAFETY: map_no_validity initializes every lane.
+        unsafe { buffer.set_len(values.len()) };
+        return Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array());
+    }
+
+    let mask = array.validity()?.execute_mask(array.len(), ctx)?;
+
     let buffer: Buffer<T> = match &mask {
         Mask::AllTrue(_) => {
             let mut buffer = BufferMut::<T>::with_capacity(values.len());
@@ -159,10 +175,7 @@ where
                 // path entirely, giving the same codegen as the maskless kernel.
                 // For narrowing, `valid` is only read at lanes that actually
                 // overflowed (a cold check on top of the cast).
-                |v, valid| {
-                    <T as NumCast>::from(v)
-                        .or_else(|| (!valid).then(T::zero))
-                },
+                |v, valid| <T as NumCast>::from(v).or_else(|| (!valid).then(T::zero)),
             )
             .map_err(|_| overflow())?;
             // SAFETY: try_map_with_mask returned Ok, so it initialized every lane.
diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs
index d3baec7885c..1dfba41f8fd 100644
--- a/vortex-buffer/benches/cast_to_indexed.rs
+++ b/vortex-buffer/benches/cast_to_indexed.rs
@@ -34,7 +34,10 @@ const DATA_SEED: u64 = 0;
 const VALID_SEED: u64 = 1;
 
 struct Fixture {
-    values: Buffer<u64>,
+    /// u64 source for the narrowing-cast bench (`cast_lazy_validity`).
+    values_u64: Buffer<u64>,
+    /// u16 source for the widening-cast benches that compare closure forms.
+    values_u16: Buffer<u16>,
     mask: BitBuffer,
 }
 
@@ -46,7 +49,9 @@ fn fixture(n: usize) -> Fixture {
         .collect();
     let raw_valid: Vec<bool> = (0..n).map(|_| valid_rng.random_bool(VALID_RATE)).collect();
 
-    let values: Buffer<u64> = raw_values.iter().copied().collect();
+    let values_u64: Buffer<u64> = raw_values.iter().copied().collect();
+    #[expect(clippy::cast_possible_truncation)]
+    let values_u16: Buffer<u16> = raw_values.iter().map(|&v| v as u16).collect();
     let mask = {
         let mut m = BitBufferMut::with_capacity(n);
         for &v in &raw_valid {
@@ -55,9 +60,12 @@ fn fixture(n: usize) -> Fixture {
         m.freeze()
     };
 
-    Fixture { values, mask }
+    Fixture {
+        values_u64,
+        values_u16,
+        mask,
+    }
 }
-
 /// The kernel `cast.rs` uses in production: `try_map_with_mask` with a lazy-validity
 /// `or_else` closure. `NumCast::from(v)` is the cast; `or_else` only fires (and only
 /// then reads `valid`) when the cast itself returned `None`.
@@ -69,7 +77,33 @@ fn cast_lazy_validity(bencher: Bencher, n: usize) {
             let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
             // SAFETY: every lane is written before any read inside the kernel.
             unsafe { out.set_len(n) };
-            (f.values.clone(), f.mask.clone(), out)
+            (f.values_u64.clone(), f.mask.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+                <u32 as NumCast>::from(v).or_else(|| (!valid).then(u32::default))
+            })
+            .unwrap();
+        });
+}
+
+// -----------------------------------------------------------------------------
+// Widening benches (u16 → u32). Compare closure forms on a statically-infallible
+// cast to confirm the asm finding empirically: the `or_else` and `_valid`
+// (maskless) closures should produce identical timings, since LLVM aliases the
+// `or_else` function symbol directly to the maskless one (proven via
+// `cargo rustc --emit=asm` — see the `asm_u16_u32_*` helpers above).
+// -----------------------------------------------------------------------------
+
+/// Widening with the `or_else` closure — the cast.rs shape.
+#[divan::bench(args = SIZES)]
+fn widen_u16_u32_or_else(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            unsafe { out.set_len(n) };
+            (f.values_u16.clone(), f.mask.clone(), out)
         })
         .bench_refs(|(values, mask, out)| {
             try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
@@ -78,3 +112,22 @@ fn cast_lazy_validity(bencher: Bencher, n: usize) {
             .unwrap();
         });
 }
+
+/// Widening with `_valid` ignored — the upper bound. Should match `or_else` per the
+/// asm aliasing finding.
+#[divan::bench(args = SIZES)]
+fn widen_u16_u32_maskless(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
+            unsafe { out.set_len(n) };
+            (f.values_u16.clone(), f.mask.clone(), out)
+        })
+        .bench_refs(|(values, mask, out)| {
+            try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, _valid| {
+                <u32 as NumCast>::from(v)
+            })
+            .unwrap();
+        });
+}

From 2f6df638d80c05107d7b488849ba0a22455692c5 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 12:42:49 +0100
Subject: [PATCH 05/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock                                    |   3 +
 .../src/arrays/primitive/compute/cast.rs      |  25 +-
 vortex-buffer/Cargo.toml                      |   4 +
 vortex-buffer/benches/cast_to_indexed.rs      | 337 ++++++++++++++----
 vortex-buffer/src/lane_ops_indexed.rs         | 325 ++++++++++-------
 5 files changed, 499 insertions(+), 195 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index d29c91edf62..11afc6996a2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9355,7 +9355,10 @@ dependencies = [
 name = "vortex-buffer"
 version = "0.1.0"
 dependencies = [
+ "arrow-array",
  "arrow-buffer",
+ "arrow-cast",
+ "arrow-schema",
  "bitvec",
  "bytes",
  "codspeed-divan-compat",
diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs
index edb5ced01b9..8cdd27cb5c5 100644
--- a/vortex-array/src/arrays/primitive/compute/cast.rs
+++ b/vortex-array/src/arrays/primitive/compute/cast.rs
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use num_traits::AsPrimitive;
 use num_traits::NumCast;
 use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
@@ -122,7 +123,7 @@ fn cast_values<F, T>(
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<ArrayRef>
 where
-    F: NativePType,
+    F: NativePType + AsPrimitive<T>,
     T: NativePType,
 {
     let values = array.as_slice::<F>();
@@ -133,14 +134,30 @@ where
         )
     };
 
-    // If this cast doesn't fail use the unchecked casting variant
+    // Returns `true` if every value of `from` is representable in `to` without loss.
+    //
+    // Equivalent to `from.least_supertype(to) == Some(to)`, i.e. the value domain of `from`
+    // is a subset of `to`'s. This is the static-only check — it does not consult any array
+    // statistics. Used to short-circuit checked casts when the conversion is infallible by
+    // type alone (widening uint→uint, signed→signed, u8→i16, i32→f64, etc.).
+    fn casts_losslessly_to(from: PType, to: PType) -> bool {
+        from.least_supertype(to) == Some(to)
+    }
+
+    // Skip the fallible kernel when the conversion is infallible by type alone (widening) or
+    // when cached min/max prove every value fits in `T`.
     let target_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable);
-    if cached_values_fit_in(array, &target_dtype) == Some(true) {
+    if casts_losslessly_to(F::PTYPE, T::PTYPE)
+        || cached_values_fit_in(array, &target_dtype) == Some(true)
+    {
         let mut buffer = BufferMut::<T>::with_capacity(values.len());
+        // Plain `as`-cast — lossless for valid lanes: either the conversion is infallible by
+        // type, or cached stats prove every valid value fits. Null lanes' underlying garbage
+        // may get truncated/wrapped (harmless: the result validity bitmap masks them downstream).
         map_no_validity(
             values,
             &mut buffer.spare_capacity_mut()[..values.len()],
-            v.as_(), // |v| <T as NumCast>::from(v).unwrap_or_default(),
+            |v| v.as_(),
         );
         // SAFETY: map_no_validity initializes every lane.
         unsafe { buffer.set_len(values.len()) };
diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml
index 6490516f846..385efa36dcf 100644
--- a/vortex-buffer/Cargo.toml
+++ b/vortex-buffer/Cargo.toml
@@ -37,6 +37,10 @@ vortex-error = { workspace = true }
 workspace = true
 
 [dev-dependencies]
+# arrow-* are used by the cast_to_indexed bench to compare against arrow-rs.
+arrow-array = { workspace = true }
+arrow-cast = { workspace = true }
+arrow-schema = { workspace = true }
 divan = { workspace = true }
 num-traits = { workspace = true }
 rand = { workspace = true }
diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs
index 1dfba41f8fd..8349b47eb26 100644
--- a/vortex-buffer/benches/cast_to_indexed.rs
+++ b/vortex-buffer/benches/cast_to_indexed.rs
@@ -1,133 +1,338 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-//! Focused bench for the **best fallible cast kernel** — what `cast.rs` actually uses
-//! in `vortex-array/src/arrays/primitive/compute/cast.rs`. Single bench, no cross-impl
-//! baselines: just a regression guard for the production cast hot path.
-//!
-//! The kernel: [`vortex_buffer::lane_ops_indexed::try_map_with_mask`] called with a
-//! lazy-validity `or_else` closure — for statically-infallible casts (widening) LLVM
-//! proves `NumCast::from` is always `Some`, the `or_else` branch is dead, and the
-//! validity path is DCE'd. For fallible casts (narrowing), validity is only consulted
-//! on the cold failure branch.
+//! Coverage benchmark for the indexed lane-op variants used by primitive casts
+//! and bit-packing paths.
 
 #![expect(clippy::unwrap_used)]
 
 use std::mem::MaybeUninit;
 
+use arrow_array::UInt16Array;
+use arrow_array::UInt64Array;
+use arrow_buffer::NullBuffer;
+use arrow_buffer::ScalarBuffer;
+use arrow_cast::CastOptions;
+use arrow_cast::cast_with_options;
+use arrow_schema::DataType;
 use divan::Bencher;
 use num_traits::NumCast;
 use rand::SeedableRng;
 use rand::prelude::*;
+use rand::rngs::StdRng;
 use vortex_buffer::BitBuffer;
 use vortex_buffer::BitBufferMut;
 use vortex_buffer::Buffer;
+use vortex_buffer::lane_ops_indexed::map_no_validity;
+use vortex_buffer::lane_ops_indexed::map_to_bits;
+use vortex_buffer::lane_ops_indexed::map_with_mask;
+use vortex_buffer::lane_ops_indexed::map_with_mask_in_place;
+use vortex_buffer::lane_ops_indexed::map_with_mask_to_bits;
+use vortex_buffer::lane_ops_indexed::try_map_no_validity;
+use vortex_buffer::lane_ops_indexed::try_map_validity_filtered;
 use vortex_buffer::lane_ops_indexed::try_map_with_mask;
+use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place;
 
 fn main() {
     divan::main();
 }
 
 const SIZES: &[usize] = &[4_096, 65_536, 1_048_576];
-const VALID_RATE: f64 = 0.7;
-const DATA_SEED: u64 = 0;
-const VALID_SEED: u64 = 1;
+const U32_THRESHOLD: u32 = u32::MAX / 2;
 
 struct Fixture {
-    /// u64 source for the narrowing-cast bench (`cast_lazy_validity`).
     values_u64: Buffer<u64>,
-    /// u16 source for the widening-cast benches that compare closure forms.
+    values_u64_invalid_overflows: Buffer<u64>,
+    values_u32: Buffer<u32>,
+    values_u32_small: Buffer<u32>,
     values_u16: Buffer<u16>,
     mask: BitBuffer,
+    /// `UInt64Array` baseline for arrow casts. Same values + validity as `values_u64` / `mask`.
+    arrow_u64: UInt64Array,
+    /// `UInt16Array` baseline. Same as `values_u16` / `mask`.
+    arrow_u16: UInt16Array,
 }
 
 fn fixture(n: usize) -> Fixture {
-    let mut data_rng = StdRng::seed_from_u64(DATA_SEED);
-    let mut valid_rng = StdRng::seed_from_u64(VALID_SEED);
+    let mut rng = StdRng::seed_from_u64(0xC457_1D3E);
+
     let raw_values: Vec<u64> = (0..n)
-        .map(|_| data_rng.random_range(0..u32::MAX as u64))
+        .map(|_| rng.random_range(0..(u32::MAX as u64)))
         .collect();
-    let raw_valid: Vec<bool> = (0..n).map(|_| valid_rng.random_bool(VALID_RATE)).collect();
+    let raw_valid: Vec<bool> = (0..n).map(|_| rng.random_bool(0.8)).collect();
+
+    #[expect(clippy::cast_possible_truncation)]
+    let values_u16 = raw_values
+        .iter()
+        .copied()
+        .map(|v| v as u16)
+        .collect::<Buffer<u16>>();
+
+    #[expect(clippy::cast_possible_truncation)]
+    let values_u32 = raw_values
+        .iter()
+        .copied()
+        .map(|v| v as u32)
+        .collect::<Buffer<u32>>();
+
+    #[expect(clippy::cast_possible_truncation)]
+    let values_u32_small = raw_values
+        .iter()
+        .copied()
+        .map(|v| (v % ((u32::MAX as u64) / 2)) as u32)
+        .collect::<Buffer<u32>>();
 
-    let values_u64: Buffer<u64> = raw_values.iter().copied().collect();
+    let values_u64_invalid_overflows = raw_values
+        .iter()
+        .copied()
+        .zip(raw_valid.iter().copied())
+        .map(|(v, valid)| if valid { v } else { u64::MAX })
+        .collect::<Buffer<u64>>();
+
+    let arrow_u64 = UInt64Array::new(
+        ScalarBuffer::from(raw_values.clone()),
+        Some(NullBuffer::from(raw_valid.clone())),
+    );
     #[expect(clippy::cast_possible_truncation)]
-    let values_u16: Buffer<u16> = raw_values.iter().map(|&v| v as u16).collect();
-    let mask = {
-        let mut m = BitBufferMut::with_capacity(n);
-        for &v in &raw_valid {
-            m.append(v);
-        }
-        m.freeze()
-    };
+    let raw_u16: Vec<u16> = raw_values.iter().map(|&v| v as u16).collect();
+    let arrow_u16 = UInt16Array::new(
+        ScalarBuffer::from(raw_u16),
+        Some(NullBuffer::from(raw_valid.clone())),
+    );
 
     Fixture {
-        values_u64,
+        values_u64: raw_values.into(),
+        values_u64_invalid_overflows,
+        values_u32,
+        values_u32_small,
         values_u16,
-        mask,
+        mask: BitBufferMut::from_iter(raw_valid).freeze(),
+        arrow_u64,
+        arrow_u16,
     }
 }
-/// The kernel `cast.rs` uses in production: `try_map_with_mask` with a lazy-validity
-/// `or_else` closure. `NumCast::from(v)` is the cast; `or_else` only fires (and only
-/// then reads `valid`) when the cast itself returned `None`.
+
+const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions {
+    safe: false,
+    format_options: arrow_cast::display::FormatOptions::new(),
+};
+
+fn uninit_out<T>(n: usize) -> Vec<MaybeUninit<T>> {
+    let mut out = Vec::with_capacity(n);
+    // SAFETY: `with_capacity(n)` reserved `n` slots, and `MaybeUninit<T>` needs no initialization.
+    unsafe {
+        out.set_len(n);
+    }
+    out
+}
+
 #[divan::bench(args = SIZES)]
-fn cast_lazy_validity(bencher: Bencher, n: usize) {
+fn map_no_validity_widen_u16_u32(bencher: Bencher, n: usize) {
     let f = fixture(n);
+
     bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            // SAFETY: every lane is written before any read inside the kernel.
-            unsafe { out.set_len(n) };
-            (f.values_u64.clone(), f.mask.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
-                <u32 as NumCast>::from(v).or_else(|| (!valid).then(u32::default))
+        .with_inputs(|| (f.values_u16.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mut out)| {
+            map_no_validity(
+                values.as_slice(),
+                out.as_mut_slice(),
+                <u32 as From<u16>>::from,
+            );
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn map_with_mask_widen_u16_u32_zero_nulls(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mask, mut out)| {
+            map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| {
+                <u32 as From<u16>>::from(v) * valid as u32
+            });
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn try_map_no_validity_narrow_u64_u32(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u64.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mut out)| {
+            try_map_no_validity(values.as_slice(), out.as_mut_slice(), |v| {
+                <u32 as NumCast>::from(v)
             })
             .unwrap();
+            out
         });
 }
 
-// -----------------------------------------------------------------------------
-// Widening benches (u16 → u32). Compare closure forms on a statically-infallible
-// cast to confirm the asm finding empirically: the `or_else` and `_valid`
-// (maskless) closures should produce identical timings, since LLVM aliases the
-// `or_else` function symbol directly to the maskless one (proven via
-// `cargo rustc --emit=asm` — see the `asm_u16_u32_*` helpers above).
-// -----------------------------------------------------------------------------
+/// `try_map_with_mask` with a closure that **ignores `valid`**. Tests whether
+/// LLVM DCEs the per-lane `(src_chunk >> bit_idx) & 1` mask extract. Uses
+/// non-overflowing `values_u64` so no lane fails and the kernel's cold
+/// attribution pass (which filters null-lane failures) is never entered.
+#[divan::bench(args = SIZES)]
+fn try_map_with_mask_narrow_u64_u32_ignoring_valid(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mask, mut out)| {
+            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| {
+                <u32 as NumCast>::from(v)
+            })
+            .unwrap();
+            out
+        });
+}
 
-/// Widening with the `or_else` closure — the cast.rs shape.
 #[divan::bench(args = SIZES)]
-fn widen_u16_u32_or_else(bencher: Bencher, n: usize) {
+fn try_map_with_mask_narrow_u64_u32_lazy_validity(bencher: Bencher, n: usize) {
     let f = fixture(n);
+
     bencher
-        .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values_u16.clone(), f.mask.clone(), out)
-        })
-        .bench_refs(|(values, mask, out)| {
-            try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, valid| {
+        .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mask, mut out)| {
+            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| {
                 <u32 as NumCast>::from(v).or_else(|| (!valid).then(u32::default))
             })
             .unwrap();
+            out
         });
 }
 
-/// Widening with `_valid` ignored — the upper bound. Should match `or_else` per the
-/// asm aliasing finding.
 #[divan::bench(args = SIZES)]
-fn widen_u16_u32_maskless(bencher: Bencher, n: usize) {
+fn try_map_validity_filtered_narrow_u64_u32(bencher: Bencher, n: usize) {
     let f = fixture(n);
+
     bencher
         .with_inputs(|| {
-            let mut out: Vec<MaybeUninit<u32>> = Vec::with_capacity(n);
-            unsafe { out.set_len(n) };
-            (f.values_u16.clone(), f.mask.clone(), out)
+            (
+                f.values_u64_invalid_overflows.clone(),
+                f.mask.clone(),
+                uninit_out::<u32>(n),
+            )
         })
-        .bench_refs(|(values, mask, out)| {
-            try_map_with_mask(values.as_slice(), mask, out.as_mut_slice(), |v, _valid| {
+        .bench_values(|(values, mask, mut out)| {
+            try_map_validity_filtered(values.as_slice(), &mask, out.as_mut_slice(), |v| {
                 <u32 as NumCast>::from(v)
             })
             .unwrap();
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn try_map_with_mask_widen_u16_u32_or_else(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mask, mut out)| {
+            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| {
+                Some(<u32 as From<u16>>::from(v)).or_else(|| (!valid).then(u32::default))
+            })
+            .unwrap();
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn try_map_with_mask_widen_u16_u32_maskless(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mask, mut out)| {
+            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| {
+                Some(<u32 as From<u16>>::from(v))
+            })
+            .unwrap();
+            out
         });
 }
+
+#[divan::bench(args = SIZES)]
+fn map_with_mask_in_place_u32_zero_nulls(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u32.as_slice().to_vec(), f.mask.clone()))
+        .bench_values(|(mut values, mask)| {
+            map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| v * valid as u32);
+            values
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn try_map_with_mask_in_place_u32_checked_mul(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u32_small.as_slice().to_vec(), f.mask.clone()))
+        .bench_values(|(mut values, mask)| {
+            try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2))
+                .unwrap();
+            values
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn map_to_bits_u32_threshold(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u32.clone(), vec![0; n.div_ceil(64)]))
+        .bench_values(|(values, mut out)| {
+            map_to_bits(values.as_slice(), out.as_mut_slice(), |v| {
+                v >= U32_THRESHOLD
+            });
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn map_with_mask_to_bits_u32_threshold(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+
+    bencher
+        .with_inputs(|| {
+            (
+                f.values_u32.clone(),
+                f.mask.clone(),
+                vec![0; n.div_ceil(64)],
+            )
+        })
+        .bench_values(|(values, mask, mut out)| {
+            map_with_mask_to_bits(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| {
+                valid && v >= U32_THRESHOLD
+            });
+            out
+        });
+}
+
+// -----------------------------------------------------------------------------
+// Arrow-rs baselines. Two: one widening (u16 → u32, always succeeds) and one
+// narrowing (u64 → u32, can fail). Each pairs with the cast variants above of
+// matching direction.
+// -----------------------------------------------------------------------------
+
+#[divan::bench(args = SIZES)]
+fn arrow_cast_widen_u16_u32(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| f.arrow_u16.clone())
+        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
+}
+
+#[divan::bench(args = SIZES)]
+fn arrow_cast_narrow_u64_u32(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| f.arrow_u64.clone())
+        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
+}
diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs
index dfd2c41fd4a..4f7c42e4603 100644
--- a/vortex-buffer/src/lane_ops_indexed.rs
+++ b/vortex-buffer/src/lane_ops_indexed.rs
@@ -22,6 +22,52 @@ use std::mem::MaybeUninit;
 
 use crate::BitBuffer;
 
+macro_rules! for_full_lanes {
+    ($base:expr, | $bit_idx:ident, $i:ident | $body:block) => {
+        for $bit_idx in 0..64 {
+            let $i = $base + $bit_idx;
+            $body
+        }
+    };
+}
+
+macro_rules! for_remainder_lanes {
+    ($base:expr, $remainder:expr, | $bit_idx:ident, $i:ident | $body:block) => {
+        for $bit_idx in 0..$remainder {
+            let $i = $base + $bit_idx;
+            $body
+        }
+    };
+}
+
+macro_rules! for_full_mask_lanes {
+    ($src_chunk:expr, $base:expr, | $bit_idx:ident, $i:ident, $valid:ident | $body:block) => {
+        for $bit_idx in 0..64 {
+            let $i = $base + $bit_idx;
+            let $valid = ($src_chunk >> $bit_idx) & 1 == 1;
+            $body
+        }
+    };
+}
+
+macro_rules! for_remainder_mask_lanes {
+    (
+        $src_chunk:expr,
+        $base:expr,
+        $remainder:expr, |
+        $bit_idx:ident,
+        $i:ident,
+        $valid:ident |
+        $body:block
+    ) => {
+        for $bit_idx in 0..$remainder {
+            let $i = $base + $bit_idx;
+            let $valid = ($src_chunk >> $bit_idx) & 1 == 1;
+            $body
+        }
+    };
+}
+
 /// A length-known source supporting unchecked indexed reads.
 ///
 /// Implemented for `&[T]` (with `T: Copy`) and for [`LaneZip`] over two `IndexedSource`s.
@@ -159,40 +205,51 @@ where
         // Inner loop is fixed-size 64 with independent per-lane reads — no iterator
         // state, no cross-iteration dependency, so the auto-vectorizer can fuse
         // 64 indexed loads into vector loads.
-        for bit_idx in 0..64 {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
+        for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| {
             // SAFETY: i < chunks_count * 64 <= len.
             let v = unsafe { values.get_unchecked(i) };
             unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
-        }
+        });
     }
 
     if remainder != 0 {
         let src_chunk = chunks.remainder_bits();
         let base = chunks_count * 64;
-        for bit_idx in 0..remainder {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
+        for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| {
             // SAFETY: i < len.
             let v = unsafe { values.get_unchecked(i) };
             unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
-        }
+        });
     }
 }
 
 /// Fallible variant of [`map_with_mask`]. `f` returns `Option<R>`; `None` indicates a
 /// per-lane failure (e.g. range overflow on a narrowing cast).
 ///
-/// The kernel does not short-circuit on the first failure inside a chunk: it processes
-/// whole 64-lane chunks with `is_none()` flags OR-reduced into a single accumulator,
-/// then checks after each chunk. On failure, a cold scalar attribution pass replays the
-/// closure over that chunk to identify the first failing lane. The hot loop stays
-/// autovectorizable — the per-lane cost is one OR on top of the cast.
+/// **Null-lane failures are filtered automatically.** If a null lane's stored value
+/// causes `f(v, false)` to return `None`, the kernel does *not* propagate that as
+/// `Err` — the cold attribution pass skips lanes where the mask bit is `0`. The
+/// closure may also explicitly suppress null-lane failures by branching on `valid`
+/// itself; both behaviors compose, with the kernel's filter as a safety net.
+///
+/// ## Hot loop
+///
+/// Per-lane `is_none()` flags are OR-reduced into a single `u64` (just bit 0).
+/// When the closure ignores `valid`, LLVM DCEs the per-lane mask extract
+/// `(src_chunk >> bit_idx) & 1` entirely — the inner loop becomes pure value
+/// computation with no mask traffic. When the closure uses `valid`, the bit is
+/// passed through and the closure threads validity normally.
 ///
-/// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None` write
-/// `R::default()` into `out`, but the contents of `out` must not be relied upon when
-/// this function returns `Err`.
+/// ## Cold attribution
+///
+/// On `fail_acc != 0`, [`cold_first_valid_failure`] walks the chunk filtering by
+/// mask and returns either `Some(first_valid_failure_index)` or `None` (all
+/// failures were at null lanes — the kernel continues). Not autovectorized; runs
+/// at most once per failing chunk.
+///
+/// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None`
+/// write `R::default()` into `out`, but the contents of `out` must not be relied
+/// upon when this function returns `Err`.
 ///
 /// # Panics
 ///
@@ -219,11 +276,11 @@ where
 
     for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
         let base = chunk_idx * 64;
-        // Per-chunk accumulator — does not escape the SIMD inner loop.
+        // Per-chunk accumulator — just bit 0. When the closure ignores `valid`,
+        // the per-lane `(src_chunk >> bit_idx) & 1` is dead code and LLVM removes
+        // it, leaving a value-only SIMD body.
         let mut fail_acc: u64 = 0;
-        for bit_idx in 0..64 {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
+        for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| {
             // SAFETY: i < chunks_count * 64 <= len.
             let v = unsafe { values.get_unchecked(i) };
             let opt = f(v, bit);
@@ -231,9 +288,14 @@ where
             let r = opt.unwrap_or_default();
             // SAFETY: i < len.
             unsafe { out.get_unchecked_mut(i).write(r) };
-        }
+        });
         if fail_acc != 0 {
-            return Err(attribute_failure(&values, src_chunk, base, 64, &mut f));
+            if let Some(idx) =
+                cold_first_valid_failure(&values, src_chunk, base, 64, &mut f)
+            {
+                return Err(idx);
+            }
+            // All failures were at null lanes — continue (rescue).
         }
     }
 
@@ -241,9 +303,7 @@ where
         let src_chunk = chunks.remainder_bits();
         let base = chunks_count * 64;
         let mut fail_acc: u64 = 0;
-        for bit_idx in 0..remainder {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
+        for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| {
             // SAFETY: i < len.
             let v = unsafe { values.get_unchecked(i) };
             let opt = f(v, bit);
@@ -251,11 +311,13 @@ where
             let r = opt.unwrap_or_default();
             // SAFETY: i < len.
             unsafe { out.get_unchecked_mut(i).write(r) };
-        }
+        });
         if fail_acc != 0 {
-            return Err(attribute_failure(
-                &values, src_chunk, base, remainder, &mut f,
-            ));
+            if let Some(idx) =
+                cold_first_valid_failure(&values, src_chunk, base, remainder, &mut f)
+            {
+                return Err(idx);
+            }
         }
     }
 
@@ -289,22 +351,20 @@ where
 
     for chunk_idx in 0..chunks_count {
         let base = chunk_idx * 64;
-        for bit_idx in 0..64 {
-            let i = base + bit_idx;
+        for_full_lanes!(base, |bit_idx, i| {
             // SAFETY: i < chunks_count * 64 <= len.
             let v = unsafe { values.get_unchecked(i) };
             unsafe { out.get_unchecked_mut(i).write(f(v)) };
-        }
+        });
     }
 
     if remainder != 0 {
         let base = chunks_count * 64;
-        for bit_idx in 0..remainder {
-            let i = base + bit_idx;
+        for_remainder_lanes!(base, remainder, |bit_idx, i| {
             // SAFETY: i < len.
             let v = unsafe { values.get_unchecked(i) };
             unsafe { out.get_unchecked_mut(i).write(f(v)) };
-        }
+        });
     }
 }
 
@@ -348,8 +408,7 @@ where
     for chunk_idx in 0..chunks_count {
         let base = chunk_idx * 64;
         let mut fail_acc: u64 = 0;
-        for bit_idx in 0..64 {
-            let i = base + bit_idx;
+        for_full_lanes!(base, |bit_idx, i| {
             // SAFETY: i < chunks_count * 64 <= len.
             let v = unsafe { values.get_unchecked(i) };
             let opt = f(v);
@@ -357,7 +416,7 @@ where
             let r = opt.unwrap_or_default();
             // SAFETY: i < len.
             unsafe { out.get_unchecked_mut(i).write(r) };
-        }
+        });
         if fail_acc != 0 {
             return Err(attribute_failure_no_mask(&values, base, 64, &mut f));
         }
@@ -366,8 +425,7 @@ where
     if remainder != 0 {
         let base = chunks_count * 64;
         let mut fail_acc: u64 = 0;
-        for bit_idx in 0..remainder {
-            let i = base + bit_idx;
+        for_remainder_lanes!(base, remainder, |bit_idx, i| {
             // SAFETY: i < len.
             let v = unsafe { values.get_unchecked(i) };
             let opt = f(v);
@@ -375,7 +433,7 @@ where
             let r = opt.unwrap_or_default();
             // SAFETY: i < len.
             unsafe { out.get_unchecked_mut(i).write(r) };
-        }
+        });
         if fail_acc != 0 {
             return Err(attribute_failure_no_mask(&values, base, remainder, &mut f));
         }
@@ -433,8 +491,7 @@ where
     for (chunk_idx, mask_chunk) in chunks.iter().enumerate() {
         let base = chunk_idx * 64;
         let mut fail_bits: u64 = 0;
-        for bit_idx in 0..64 {
-            let i = base + bit_idx;
+        for_full_lanes!(base, |bit_idx, i| {
             // SAFETY: i < chunks_count * 64 <= len.
             let v = unsafe { values.get_unchecked(i) };
             let opt = f(v);
@@ -444,7 +501,7 @@ where
             let r = opt.unwrap_or_default();
             // SAFETY: i < len.
             unsafe { out.get_unchecked_mut(i).write(r) };
-        }
+        });
         // Filter failures to those at VALID lanes only. Null-lane failures vanish.
         let valid_failures = fail_bits & mask_chunk;
         if valid_failures != 0 {
@@ -456,8 +513,7 @@ where
         let mask_chunk = chunks.remainder_bits();
         let base = chunks_count * 64;
         let mut fail_bits: u64 = 0;
-        for bit_idx in 0..remainder {
-            let i = base + bit_idx;
+        for_remainder_lanes!(base, remainder, |bit_idx, i| {
             // SAFETY: i < len.
             let v = unsafe { values.get_unchecked(i) };
             let opt = f(v);
@@ -465,7 +521,7 @@ where
             let r = opt.unwrap_or_default();
             // SAFETY: i < len.
             unsafe { out.get_unchecked_mut(i).write(r) };
-        }
+        });
         let valid_failures = fail_bits & mask_chunk;
         if valid_failures != 0 {
             return Err(base + valid_failures.trailing_zeros() as usize);
@@ -475,32 +531,47 @@ where
     Ok(())
 }
 
-/// Cold attribution for the no-mask variant.
+/// Shared cold scan: walks a chunk, returns the first lane index where
+/// `lane_fails(bit_idx, value)` returns `true`. Used by both
+/// [`attribute_failure`] and [`attribute_failure_no_mask`] via thin wrappers.
+///
+/// Caller guarantees `base + chunk_len <= values.len()`.
 #[cold]
 #[inline(never)]
-fn attribute_failure_no_mask<S, R, F>(values: &S, base: usize, chunk_len: usize, f: &mut F) -> usize
+fn cold_scan<S>(
+    values: &S,
+    base: usize,
+    chunk_len: usize,
+    mut lane_fails: impl FnMut(usize /* bit_idx */, S::Item) -> bool,
+) -> usize
 where
     S: IndexedSource,
-    F: FnMut(S::Item) -> Option<R>,
 {
     for bit_idx in 0..chunk_len {
         let i = base + bit_idx;
         // SAFETY: caller guarantees i < values.len().
         let v = unsafe { values.get_unchecked(i) };
-        if f(v).is_none() {
+        if lane_fails(bit_idx, v) {
             return i;
         }
     }
-    unreachable!("attribute_failure_no_mask called without a failing lane")
+    unreachable!("cold_scan called without a failing lane")
 }
 
-/// Cold path: identify the first lane in a chunk where `f` returned `None`.
-///
-/// Called only after the hot loop has detected that at least one lane failed.
-/// Walks the chunk scalar-style; not autovectorized, but that's fine — it only
-/// runs once per error and the error path is supposed to be exceptional.
-#[cold]
-#[inline(never)]
+/// Cold attribution for the no-mask variant. Replays `f` over the chunk to find
+/// the first lane that returns `None`.
+#[inline]
+fn attribute_failure_no_mask<S, R, F>(values: &S, base: usize, chunk_len: usize, f: &mut F) -> usize
+where
+    S: IndexedSource,
+    F: FnMut(S::Item) -> Option<R>,
+{
+    cold_scan(values, base, chunk_len, |_bit_idx, v| f(v).is_none())
+}
+
+/// Cold attribution for the mask variant. Replays `f` over the chunk, passing
+/// each lane's validity bit, and returns the first lane where `f` returned `None`.
+#[inline]
 fn attribute_failure<S, R, F>(
     values: &S,
     src_chunk: u64,
@@ -512,17 +583,9 @@ where
     S: IndexedSource,
     F: FnMut(S::Item, bool) -> Option<R>,
 {
-    for bit_idx in 0..chunk_len {
-        let i = base + bit_idx;
-        let bit = (src_chunk >> bit_idx) & 1 == 1;
-        // SAFETY: caller guarantees base + chunk_len <= values.len().
-        let v = unsafe { values.get_unchecked(i) };
-        if f(v, bit).is_none() {
-            return i;
-        }
-    }
-    // Unreachable: hot loop's OR-reduction said at least one lane in [base, base+chunk_len) failed.
-    unreachable!("attribute_failure called without a failing lane")
+    cold_scan(values, base, chunk_len, |bit_idx, v| {
+        f(v, (src_chunk >> bit_idx) & 1 == 1).is_none()
+    })
 }
 
 /// In-place variant of [`map_with_mask`]. Each lane is replaced with
@@ -546,29 +609,25 @@ where
 
     for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
         let base = chunk_idx * 64;
-        for bit_idx in 0..64 {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
+        for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| {
             // SAFETY: i < chunks_count * 64 <= len.
             let v = unsafe { values.get_unchecked(i) };
             let r = f(v, bit);
             // SAFETY: i < len.
             unsafe { values.set_unchecked(i, r) };
-        }
+        });
     }
 
     if remainder != 0 {
         let src_chunk = chunks.remainder_bits();
         let base = chunks_count * 64;
-        for bit_idx in 0..remainder {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
+        for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| {
             // SAFETY: i < len.
             let v = unsafe { values.get_unchecked(i) };
             let r = f(v, bit);
             // SAFETY: i < len.
             unsafe { values.set_unchecked(i, r) };
-        }
+        });
     }
 }
 
@@ -619,49 +678,69 @@ where
     let remainder = len % 64;
 
     for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        let base = chunk_idx * 64;
-        let mut first_fail: u32 = u32::MAX;
-        for bit_idx in 0..64 {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: i < chunks_count * 64 <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            let opt = f(v, bit);
-            let candidate = if opt.is_none() { i as u32 } else { u32::MAX };
-            first_fail = first_fail.min(candidate);
-            let r = opt.unwrap_or_default();
-            // SAFETY: i < len.
-            unsafe { values.set_unchecked(i, r) };
-        }
-        if first_fail != u32::MAX {
-            return Err(first_fail as usize);
+        // `count = 64` is a literal; `#[inline(always)]` on the helper inlines its body
+        // into this loop and the compiler propagates 64 into the inner `0..count` bound,
+        // unrolling exactly as `for_full_mask_lanes!` would.
+        if let Some(failing) =
+            try_inplace_chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f)
+        {
+            return Err(failing as usize);
         }
     }
 
     if remainder != 0 {
-        let src_chunk = chunks.remainder_bits();
-        let base = chunks_count * 64;
-        let mut first_fail: u32 = u32::MAX;
-        for bit_idx in 0..remainder {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: i < len.
-            let v = unsafe { values.get_unchecked(i) };
-            let opt = f(v, bit);
-            let candidate = if opt.is_none() { i as u32 } else { u32::MAX };
-            first_fail = first_fail.min(candidate);
-            let r = opt.unwrap_or_default();
-            // SAFETY: i < len.
-            unsafe { values.set_unchecked(i, r) };
-        }
-        if first_fail != u32::MAX {
-            return Err(first_fail as usize);
+        // Runtime `count = remainder` — same shape as the prior remainder loop.
+        if let Some(failing) = try_inplace_chunk(
+            &mut values,
+            chunks.remainder_bits(),
+            chunks_count * 64,
+            remainder,
+            &mut f,
+        ) {
+            return Err(failing as usize);
         }
     }
 
     Ok(())
 }
 
+/// Per-chunk worker for [`try_map_with_mask_in_place`]. Body written once; the kernel
+/// calls this twice (with `count = 64` for full chunks, `count = remainder` for the
+/// tail). `#[inline(always)]` so the const-64 unroll for the full-chunk callers is
+/// preserved.
+///
+/// Returns `Some(first_failing_lane_index_as_u32)` if any lane in `[base, base+count)`
+/// failed (index narrowed to `u32`; assumes `i < u32::MAX`, the no-failure sentinel), else `None`.
+#[inline(always)]
+#[allow(clippy::cast_possible_truncation)]
+fn try_inplace_chunk<S, F>(
+    values: &mut S,
+    src_chunk: u64,
+    base: usize,
+    count: usize,
+    f: &mut F,
+) -> Option<u32>
+where
+    S: IndexedSink,
+    S::Item: Default,
+    F: FnMut(S::Item, bool) -> Option<S::Item>,
+{
+    let mut first_fail: u32 = u32::MAX;
+    for bit_idx in 0..count {
+        let i = base + bit_idx;
+        let bit = (src_chunk >> bit_idx) & 1 == 1;
+        // SAFETY: caller guarantees `base + count <= values.len()`.
+        let v = unsafe { values.get_unchecked(i) };
+        let opt = f(v, bit);
+        let candidate = if opt.is_none() { i as u32 } else { u32::MAX };
+        first_fail = first_fail.min(candidate);
+        let r = opt.unwrap_or_default();
+        // SAFETY: same as above.
+        unsafe { values.set_unchecked(i, r) };
+    }
+    (first_fail != u32::MAX).then_some(first_fail)
+}
+
 /// Apply `f(value) -> bool` lane-by-lane, packing into `out` as `u64` words.
 ///
 /// This is the validity-free sibling of [`map_with_mask_to_bits`]. Use it when the
@@ -694,11 +773,11 @@ where
     for chunk_idx in 0..chunks_count {
         let base = chunk_idx * 64;
         let mut packed = 0u64;
-        for bit_idx in 0..64 {
+        for_full_lanes!(base, |bit_idx, i| {
             // SAFETY: base + bit_idx < chunks_count * 64 <= len.
-            let v = unsafe { values.get_unchecked(base + bit_idx) };
+            let v = unsafe { values.get_unchecked(i) };
             packed |= (f(v) as u64) << bit_idx;
-        }
+        });
         // SAFETY: chunk_idx < chunks_count <= out.len().
         unsafe { *out.get_unchecked_mut(chunk_idx) = packed };
     }
@@ -706,11 +785,11 @@ where
     if remainder != 0 {
         let base = chunks_count * 64;
         let mut packed = 0u64;
-        for bit_idx in 0..remainder {
+        for_remainder_lanes!(base, remainder, |bit_idx, i| {
             // SAFETY: base + bit_idx < len.
-            let v = unsafe { values.get_unchecked(base + bit_idx) };
+            let v = unsafe { values.get_unchecked(i) };
             packed |= (f(v) as u64) << bit_idx;
-        }
+        });
         // SAFETY: chunks_count < out.len() because remainder != 0.
         unsafe { *out.get_unchecked_mut(chunks_count) = packed };
     }
@@ -745,13 +824,11 @@ where
     for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
         let base = chunk_idx * 64;
         let mut packed = 0u64;
-        for bit_idx in 0..64 {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
+        for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| {
             // SAFETY: i < chunks_count * 64 <= len.
             let v = unsafe { values.get_unchecked(i) };
             packed |= (f(v, bit) as u64) << bit_idx;
-        }
+        });
         // SAFETY: chunk_idx < chunks_count <= out.len().
         unsafe { *out.get_unchecked_mut(chunk_idx) = packed };
     }
@@ -760,13 +837,11 @@ where
         let src_chunk = chunks.remainder_bits();
         let base = chunks_count * 64;
         let mut packed = 0u64;
-        for bit_idx in 0..remainder {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
+        for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| {
             // SAFETY: i < len.
             let v = unsafe { values.get_unchecked(i) };
             packed |= (f(v, bit) as u64) << bit_idx;
-        }
+        });
         // SAFETY: chunks_count < out.len() because remainder != 0.
         unsafe { *out.get_unchecked_mut(chunks_count) = packed };
     }

From 769a2583e62ef29bfa1becd106e1b9c46c213bad Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 13:19:11 +0100
Subject: [PATCH 06/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock                               |   1 +
 vortex-buffer/Cargo.toml                 |   7 +-
 vortex-buffer/benches/cast_to_indexed.rs |   9 +-
 vortex-buffer/src/lane_ops_indexed.rs    | 198 +++++------------------
 4 files changed, 56 insertions(+), 159 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 11afc6996a2..9bb032d0d35 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9355,6 +9355,7 @@ dependencies = [
 name = "vortex-buffer"
 version = "0.1.0"
 dependencies = [
+ "arrow-arith",
  "arrow-array",
  "arrow-buffer",
  "arrow-cast",
diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml
index 385efa36dcf..882de199818 100644
--- a/vortex-buffer/Cargo.toml
+++ b/vortex-buffer/Cargo.toml
@@ -37,7 +37,8 @@ vortex-error = { workspace = true }
 workspace = true
 
 [dev-dependencies]
-# arrow-* are used by the cast_to_indexed bench to compare against arrow-rs.
+# arrow-* are used by cast_to_indexed / add_checked benches to compare against arrow-rs.
+arrow-arith = { workspace = true }
 arrow-array = { workspace = true }
 arrow-cast = { workspace = true }
 arrow-schema = { workspace = true }
@@ -57,3 +58,7 @@ harness = false
 [[bench]]
 name = "cast_to_indexed"
 harness = false
+
+[[bench]]
+name = "add_checked"
+harness = false
diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs
index 8349b47eb26..848f50cd142 100644
--- a/vortex-buffer/benches/cast_to_indexed.rs
+++ b/vortex-buffer/benches/cast_to_indexed.rs
@@ -29,7 +29,6 @@ use vortex_buffer::lane_ops_indexed::map_with_mask;
 use vortex_buffer::lane_ops_indexed::map_with_mask_in_place;
 use vortex_buffer::lane_ops_indexed::map_with_mask_to_bits;
 use vortex_buffer::lane_ops_indexed::try_map_no_validity;
-use vortex_buffer::lane_ops_indexed::try_map_validity_filtered;
 use vortex_buffer::lane_ops_indexed::try_map_with_mask;
 use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place;
 
@@ -205,8 +204,12 @@ fn try_map_with_mask_narrow_u64_u32_lazy_validity(bencher: Bencher, n: usize) {
         });
 }
 
+/// Migrated from the old `try_map_validity_filtered` bench: same inputs (null
+/// lanes contain overflowing values) and same correctness expectation (no Err),
+/// but now driven through the merged `try_map_with_mask` with a `|v, _|` closure.
+/// The hot loop is value-only via DCE; `fail_bits & src_chunk` drops null-lane failures.
 #[divan::bench(args = SIZES)]
-fn try_map_validity_filtered_narrow_u64_u32(bencher: Bencher, n: usize) {
+fn try_map_with_mask_narrow_u64_u32_value_only_filtered(bencher: Bencher, n: usize) {
     let f = fixture(n);
 
     bencher
@@ -218,7 +221,7 @@ fn try_map_validity_filtered_narrow_u64_u32(bencher: Bencher, n: usize) {
             )
         })
         .bench_values(|(values, mask, mut out)| {
-            try_map_validity_filtered(values.as_slice(), &mask, out.as_mut_slice(), |v| {
+            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| {
                 <u32 as NumCast>::from(v)
             })
             .unwrap();
diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs
index 4f7c42e4603..47887b92810 100644
--- a/vortex-buffer/src/lane_ops_indexed.rs
+++ b/vortex-buffer/src/lane_ops_indexed.rs
@@ -228,24 +228,25 @@ where
 ///
 /// **Null-lane failures are filtered automatically.** If a null lane's stored value
 /// causes `f(v, false)` to return `None`, the kernel does *not* propagate that as
-/// `Err` — the cold attribution pass skips lanes where the mask bit is `0`. The
-/// closure may also explicitly suppress null-lane failures by branching on `valid`
-/// itself; both behaviors compose, with the kernel's filter as a safety net.
+/// `Err`. The per-lane `is_none()` flags are bit-packed into a `u64` at the lane's
+/// position, then ANDed with the chunk's validity bitmap — null-lane bits vanish.
+/// The closure may also explicitly suppress null-lane failures by branching on
+/// `valid` itself; both behaviors compose.
 ///
 /// ## Hot loop
 ///
-/// Per-lane `is_none()` flags are OR-reduced into a single `u64` (just bit 0).
-/// When the closure ignores `valid`, LLVM DCEs the per-lane mask extract
-/// `(src_chunk >> bit_idx) & 1` entirely — the inner loop becomes pure value
-/// computation with no mask traffic. When the closure uses `valid`, the bit is
-/// passed through and the closure threads validity normally.
+/// `fail_bits |= (opt.is_none() as u64) << bit_idx`. After unrolling, `bit_idx` is a
+/// compile-time constant per-iteration, so the shift folds. The closure receives
+/// `(value, valid)`; LLVM DCEs the per-lane `(src_chunk >> bit_idx) & 1` extract
+/// when the closure ignores `valid`, leaving a value-only SIMD body.
 ///
-/// ## Cold attribution
+/// ## Attribution
 ///
-/// On `fail_acc != 0`, [`cold_first_valid_failure`] walks the chunk filtering by
-/// mask and returns either `Some(first_valid_failure_index)` or `None` (all
-/// failures were at null lanes — the kernel continues). Not autovectorized; runs
-/// at most once per failing chunk.
+/// `valid_failures = fail_bits & src_chunk` — non-zero only when at least one
+/// valid lane failed. `trailing_zeros()` gives the first failing valid lane.
+/// **No cold replay**: failure detection and lane attribution happen entirely in
+/// the hot loop. Worst-case bounded per chunk regardless of how many null lanes
+/// returned `None`.
 ///
 /// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None`
 /// write `R::default()` into `out`, but the contents of `out` must not be relied
@@ -276,48 +277,45 @@ where
 
     for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
         let base = chunk_idx * 64;
-        // Per-chunk accumulator — just bit 0. When the closure ignores `valid`,
-        // the per-lane `(src_chunk >> bit_idx) & 1` is dead code and LLVM removes
-        // it, leaving a value-only SIMD body.
-        let mut fail_acc: u64 = 0;
+        // Bit-pack per-lane fails into a u64 at lane-position. `bit_idx` is a
+        // compile-time constant after unrolling, so the shift folds. The
+        // `src_chunk` here is the validity bitmap for this chunk; the closure
+        // still gets `bit` per lane — LLVM DCEs the per-lane mask extract if
+        // the closure ignores it.
+        let mut fail_bits: u64 = 0;
         for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| {
             // SAFETY: i < chunks_count * 64 <= len.
             let v = unsafe { values.get_unchecked(i) };
             let opt = f(v, bit);
-            fail_acc |= opt.is_none() as u64;
+            fail_bits |= (opt.is_none() as u64) << bit_idx;
             let r = opt.unwrap_or_default();
             // SAFETY: i < len.
             unsafe { out.get_unchecked_mut(i).write(r) };
         });
-        if fail_acc != 0 {
-            if let Some(idx) =
-                cold_first_valid_failure(&values, src_chunk, base, 64, &mut f)
-            {
-                return Err(idx);
-            }
-            // All failures were at null lanes — continue (rescue).
+        // Drop null-lane failures: only failures at lanes the mask marks as
+        // valid count. Direct attribution via trailing_zeros — no cold replay.
+        let valid_failures = fail_bits & src_chunk;
+        if valid_failures != 0 {
+            return Err(base + valid_failures.trailing_zeros() as usize);
         }
     }
 
     if remainder != 0 {
         let src_chunk = chunks.remainder_bits();
         let base = chunks_count * 64;
-        let mut fail_acc: u64 = 0;
+        let mut fail_bits: u64 = 0;
         for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| {
             // SAFETY: i < len.
             let v = unsafe { values.get_unchecked(i) };
             let opt = f(v, bit);
-            fail_acc |= opt.is_none() as u64;
+            fail_bits |= (opt.is_none() as u64) << bit_idx;
             let r = opt.unwrap_or_default();
             // SAFETY: i < len.
             unsafe { out.get_unchecked_mut(i).write(r) };
         });
-        if fail_acc != 0 {
-            if let Some(idx) =
-                cold_first_valid_failure(&values, src_chunk, base, remainder, &mut f)
-            {
-                return Err(idx);
-            }
+        let valid_failures = fail_bits & src_chunk;
+        if valid_failures != 0 {
+            return Err(base + valid_failures.trailing_zeros() as usize);
         }
     }
 
@@ -330,7 +328,7 @@ where
 ///
 /// For nullable inputs where the closure is infallible (no overflow / no error
 /// branch), prefer [`map_with_mask`]; for nullable inputs with a fallible
-/// closure, prefer [`try_map_validity_filtered`] — both correctly suppress
+/// closure, prefer [`try_map_with_mask`] — both correctly suppress
 /// null-lane logic. This kernel exists for the narrow "no validity exists"
 /// case (non-nullable column, internal pipelines, etc.).
 ///
@@ -374,7 +372,7 @@ where
 /// # Use this only for non-nullable inputs.
 ///
 /// For nullable inputs with a fallible closure, use
-/// [`try_map_validity_filtered`] — it has the same value-only closure shape
+/// [`try_map_with_mask`] with a `|v, _valid|` closure — it keeps the value-only hot loop
 /// (and the same perf win) but **correctly suppresses null-lane failures**
 /// via per-chunk `fail_bits & mask_chunk`.
 ///
@@ -442,98 +440,9 @@ where
     Ok(())
 }
 
-/// Fallible value-only map with **chunk-level validity filtering**: closure is
-/// `|v| -> Option<R>`, no validity threaded through the inner loop. After each
-/// 64-lane chunk, per-lane failure bits are ANDed against the mask chunk, so
-/// failures at null lanes do **not** propagate as `Err`.
-///
-/// This is the correct shape for "checked cast that respects validity" — a null
-/// row whose stored value would overflow does **not** cause `Err`. It also
-/// preserves the perf win of the value-only closure: the hot loop has no per-lane
-/// mask extract, no `valid`-dependent branch.
-///
-/// ## Inner-loop trick
-///
-/// Per-lane fails are packed into a `u64` via `fail_bits |= (is_none as u64) << bit_idx`.
-/// The shift amount is loop-invariant after unrolling (since `bit_idx` is the
-/// compile-time loop counter), so the autovectorizer can issue 64 sequential
-/// value reads + closure applications + packed-bit ORs as a vector pipeline.
-///
-/// ## Attribution
-///
-/// On failure, `valid_failures = fail_bits & mask_chunk` is non-zero; the lowest
-/// set bit is the first failing valid lane. `trailing_zeros()` reads it out
-/// directly — no cold replay path, no second pass.
-///
-/// # Panics
-///
-/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
-#[inline]
-pub fn try_map_validity_filtered<S, R, F>(
-    values: S,
-    mask: &BitBuffer,
-    out: &mut [MaybeUninit<R>],
-    mut f: F,
-) -> Result<(), usize>
-where
-    S: IndexedSource,
-    R: Copy + Default,
-    F: FnMut(S::Item) -> Option<R>,
-{
-    let len = values.len();
-    assert_eq!(len, mask.len(), "values and mask must have the same length");
-    assert_eq!(out.len(), len, "out must have the same length as values");
-
-    let chunks = mask.chunks();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
-
-    for (chunk_idx, mask_chunk) in chunks.iter().enumerate() {
-        let base = chunk_idx * 64;
-        let mut fail_bits: u64 = 0;
-        for_full_lanes!(base, |bit_idx, i| {
-            // SAFETY: i < chunks_count * 64 <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            let opt = f(v);
-            // Pack failure bit at the lane's position. After unrolling, `bit_idx`
-            // is a compile-time constant per-iteration, so the shift is folded.
-            fail_bits |= (opt.is_none() as u64) << bit_idx;
-            let r = opt.unwrap_or_default();
-            // SAFETY: i < len.
-            unsafe { out.get_unchecked_mut(i).write(r) };
-        });
-        // Filter failures to those at VALID lanes only. Null-lane failures vanish.
-        let valid_failures = fail_bits & mask_chunk;
-        if valid_failures != 0 {
-            return Err(base + valid_failures.trailing_zeros() as usize);
-        }
-    }
-
-    if remainder != 0 {
-        let mask_chunk = chunks.remainder_bits();
-        let base = chunks_count * 64;
-        let mut fail_bits: u64 = 0;
-        for_remainder_lanes!(base, remainder, |bit_idx, i| {
-            // SAFETY: i < len.
-            let v = unsafe { values.get_unchecked(i) };
-            let opt = f(v);
-            fail_bits |= (opt.is_none() as u64) << bit_idx;
-            let r = opt.unwrap_or_default();
-            // SAFETY: i < len.
-            unsafe { out.get_unchecked_mut(i).write(r) };
-        });
-        let valid_failures = fail_bits & mask_chunk;
-        if valid_failures != 0 {
-            return Err(base + valid_failures.trailing_zeros() as usize);
-        }
-    }
-
-    Ok(())
-}
-
 /// Shared cold scan: walks a chunk, returns the first lane index where
-/// `lane_fails(bit_idx, value)` returns `true`. Used by both
-/// [`attribute_failure`] and [`attribute_failure_no_mask`] via thin wrappers.
+/// `lane_fails(bit_idx, value)` returns `true`. Used by
+/// [`attribute_failure_no_mask`].
 ///
 /// Caller guarantees `base + chunk_len <= values.len()`.
 #[cold]
@@ -569,25 +478,6 @@ where
     cold_scan(values, base, chunk_len, |_bit_idx, v| f(v).is_none())
 }
 
-/// Cold attribution for the mask variant. Replays `f` over the chunk, passing
-/// each lane's validity bit, and returns the first lane where `f` returned `None`.
-#[inline]
-fn attribute_failure<S, R, F>(
-    values: &S,
-    src_chunk: u64,
-    base: usize,
-    chunk_len: usize,
-    f: &mut F,
-) -> usize
-where
-    S: IndexedSource,
-    F: FnMut(S::Item, bool) -> Option<R>,
-{
-    cold_scan(values, base, chunk_len, |bit_idx, v| {
-        f(v, (src_chunk >> bit_idx) & 1 == 1).is_none()
-    })
-}
-
 /// In-place variant of [`map_with_mask`]. Each lane is replaced with
 /// `f(values[i], mask[i])`. The source `S` must be writable (an [`IndexedSink`]).
 ///
@@ -681,8 +571,7 @@ where
         // `count = 64` is a literal; `#[inline(always)]` on the helper inlines its body
         // into this loop and the compiler propagates 64 into the inner `0..count` bound,
         // unrolling exactly as `for_full_mask_lanes!` would.
-        if let Some(failing) =
-            try_inplace_chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f)
+        if let Some(failing) = try_inplace_chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f)
         {
             return Err(failing as usize);
         }
@@ -1058,10 +947,9 @@ mod tests {
     }
 
     #[test]
-    fn try_map_validity_filtered_null_lane_overflow_does_not_err() {
-        // Null lane with a value that would overflow MUST NOT cause Err.
-        // The closure is value-only — the mask filters the null-lane failure
-        // at the chunk boundary.
+    fn try_map_with_mask_value_only_closure_filters_null_overflow() {
+        // `|v, _|` closure that ignores validity. A null lane with an overflowing
+        // value MUST NOT cause Err — the kernel masks failure bits with the validity chunk.
         let mut values: Vec<u64> = (0..200).collect();
         values[5] = u64::MAX; // null lane with overflowing value
         values[42] = u64::MAX; // null lane with overflowing value
@@ -1073,17 +961,17 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_validity_filtered(values.as_slice(), &mask, &mut out, |v| {
+        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, _valid| {
             (v <= u32::MAX as u64).then_some(v as u32)
         });
         assert!(
             res.is_ok(),
-            "null-lane overflow should not propagate as Err"
+            "null-lane overflow should be filtered by the cold path"
         );
     }
 
     #[test]
-    fn try_map_validity_filtered_valid_overflow_does_err_with_first_index() {
+    fn try_map_with_mask_value_only_closure_reports_first_valid_failure() {
         // Valid lane overflow must propagate — and the reported index must be
         // the lowest VALID failing lane, even if earlier null lanes also "failed"
         // their unconditional cast.
@@ -1100,7 +988,7 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_validity_filtered(values.as_slice(), &mask, &mut out, |v| {
+        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, _valid| {
             (v <= u32::MAX as u64).then_some(v as u32)
         });
         assert_eq!(res, Err(77));

From 3a30290f33b31bf54ae2eb92b97536df5f61abd1 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 14:05:45 +0100
Subject: [PATCH 07/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-buffer/Cargo.toml                |   4 +
 vortex-buffer/benches/add_checked.rs    | 676 ++++++++++++++++++++++++
 vortex-buffer/benches/pack_vs_unpack.rs | 389 ++++++++++++++
 vortex-buffer/src/lane_ops_indexed.rs   | 464 ++++++++--------
 4 files changed, 1297 insertions(+), 236 deletions(-)
 create mode 100644 vortex-buffer/benches/add_checked.rs
 create mode 100644 vortex-buffer/benches/pack_vs_unpack.rs

diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml
index 882de199818..048d2612364 100644
--- a/vortex-buffer/Cargo.toml
+++ b/vortex-buffer/Cargo.toml
@@ -62,3 +62,7 @@ harness = false
 [[bench]]
 name = "add_checked"
 harness = false
+
+[[bench]]
+name = "pack_vs_unpack"
+harness = false
diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs
new file mode 100644
index 00000000000..df857922d6f
--- /dev/null
+++ b/vortex-buffer/benches/add_checked.rs
@@ -0,0 +1,676 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Checked `u32 + u32 -> u32` over two nullable columns — exhaustive variant
+//! comparison.
+//!
+//! Variants differ along three axes:
+//!
+//! 1. **Closure suppression strategy** — how the closure (if any) handles null lanes
+//!    - `value_only`: `|(a,b), _|` ignores validity
+//!    - `if_else`: `|(a,b), valid| if valid { ... } else { Some(default) }`
+//!    - `or_else`: `|(a,b), valid| ....or_else(|| (!valid).then(...))`
+//!    - `mul_trick`: `(a * valid as u32).checked_add(b * valid as u32)`
+//!
+//! 2. **Fail tracking scheme**
+//!    - bit-pack: `fail_bits |= (is_none << bit_idx)`; chunk-AND with mask
+//!    - boolean: `fail_acc |= is_none as u64`; cold replay attribution
+//!
+//! 3. **Validity application**
+//!    - in closure: closure consumes `valid`
+//!    - post-mask: kernel ANDs fail bitmap with `src_chunk`
+//!    - pre-mask: kernel zeros null-lane values via bit-broadcast before SIMD add
+//!    - none: ignore validity (ceiling only — not correct for real inputs)
+//!
+//! Representative variants (bit-pack value-only, boolean, pre-mask) are checked by
+//! [`assert_overflow_parity`] and [`assert_null_overflow_suppressed`] at startup. The
+//! `pure_simd_no_validity_sanitized` bench is a ceiling only — it ignores nullability.
+
+#![expect(clippy::unwrap_used)]
+
+use std::mem::MaybeUninit;
+use std::sync::Arc;
+
+use arrow_array::Datum;
+use arrow_array::UInt32Array;
+use arrow_buffer::NullBuffer;
+use arrow_buffer::ScalarBuffer;
+use divan::Bencher;
+use rand::SeedableRng;
+use rand::prelude::*;
+use vortex_buffer::BitBuffer;
+use vortex_buffer::BitBufferMut;
+use vortex_buffer::Buffer;
+use vortex_buffer::lane_ops_indexed::LaneZip;
+use vortex_buffer::lane_ops_indexed::try_map_with_mask;
+
+fn main() {
+    assert_overflow_parity();
+    assert_null_overflow_suppressed();
+    assert_pure_simd_errs_on_realistic_data();
+    divan::main();
+}
+
+const SIZES: &[usize] = &[4_096, 65_536, 1_048_576, 2_097_152, 4_194_304];
+const LHS_VALID_RATE: f64 = 0.7;
+const RHS_VALID_RATE: f64 = 0.8;
+
+struct Fixture {
+    /// **Realistic** lhs: valid lanes bounded, null lanes `u32::MAX`.
+    /// A kernel that ignores validity will see overflow at null lanes.
+    lhs: Buffer<u32>,
+    rhs: Buffer<u32>,
+    /// **Sanitized** lhs: valid lanes bounded, null lanes pre-zeroed.
+    /// Used by `pure_simd_no_validity_sanitized` only — its precondition is
+    /// "someone already zeroed the nulls."
+    lhs_sanitized: Buffer<u32>,
+    rhs_sanitized: Buffer<u32>,
+    lhs_mask: BitBuffer,
+    rhs_mask: BitBuffer,
+    lhs_arrow: Arc<UInt32Array>,
+    rhs_arrow: Arc<UInt32Array>,
+}
+
+fn fixture(n: usize) -> Fixture {
+    let mut lhs_rng = StdRng::seed_from_u64(0);
+    let mut rhs_rng = StdRng::seed_from_u64(1);
+    let mut lvr = StdRng::seed_from_u64(2);
+    let mut rvr = StdRng::seed_from_u64(3);
+
+    let lhs_valid: Vec<bool> = (0..n).map(|_| lvr.random_bool(LHS_VALID_RATE)).collect();
+    let rhs_valid: Vec<bool> = (0..n).map(|_| rvr.random_bool(RHS_VALID_RATE)).collect();
+
+    // **Realistic null storage**: null lanes contain u32::MAX. Adding two such
+    // values overflows — a kernel that ignores validity will spuriously Err.
+    // Valid lanes carry bounded values so the success path is measured at lanes
+    // where overflow shouldn't fire.
+    let raw_lhs: Vec<u32> = (0..n)
+        .map(|i| {
+            if lhs_valid[i] {
+                lhs_rng.random_range(0..u16::MAX as u32)
+            } else {
+                u32::MAX
+            }
+        })
+        .collect();
+    let raw_rhs: Vec<u32> = (0..n)
+        .map(|i| {
+            if rhs_valid[i] {
+                rhs_rng.random_range(0..u16::MAX as u32)
+            } else {
+                u32::MAX
+            }
+        })
+        .collect();
+
+    let lhs: Buffer<u32> = raw_lhs.iter().copied().collect();
+    let rhs: Buffer<u32> = raw_rhs.iter().copied().collect();
+
+    let lhs_sanitized: Buffer<u32> = (0..n)
+        .map(|i| if lhs_valid[i] { raw_lhs[i] } else { 0 })
+        .collect();
+    let rhs_sanitized: Buffer<u32> = (0..n)
+        .map(|i| if rhs_valid[i] { raw_rhs[i] } else { 0 })
+        .collect();
+
+    let lhs_mask = {
+        let mut m = BitBufferMut::with_capacity(n);
+        for &v in &lhs_valid {
+            m.append(v);
+        }
+        m.freeze()
+    };
+    let rhs_mask = {
+        let mut m = BitBufferMut::with_capacity(n);
+        for &v in &rhs_valid {
+            m.append(v);
+        }
+        m.freeze()
+    };
+
+    let lhs_arrow = Arc::new(UInt32Array::new(
+        ScalarBuffer::from(raw_lhs),
+        Some(NullBuffer::from(lhs_valid)),
+    ));
+    let rhs_arrow = Arc::new(UInt32Array::new(
+        ScalarBuffer::from(raw_rhs),
+        Some(NullBuffer::from(rhs_valid)),
+    ));
+
+    Fixture {
+        lhs,
+        rhs,
+        lhs_sanitized,
+        rhs_sanitized,
+        lhs_mask,
+        rhs_mask,
+        lhs_arrow,
+        rhs_arrow,
+    }
+}
+
+fn alloc_out(n: usize) -> Vec<MaybeUninit<u32>> {
+    let mut out = Vec::with_capacity(n);
+    // SAFETY: `n <= capacity`, and `MaybeUninit<u32>` elements need no initialization.
+    unsafe { out.set_len(n) };
+    out
+}
+
+// ---------------------------------------------------------------------------
+// Variant 0: arrow_arith::numeric::add — baseline
+// ---------------------------------------------------------------------------
+
+#[divan::bench(args = SIZES)]
+fn arrow_add(bencher: Bencher, n: usize) {
+    let _ = n; // NOTE(review): no-op — `n` is used on the next line; safe to delete
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| (f.lhs_arrow.clone(), f.rhs_arrow.clone())) // `Arc` clones only
+        .bench_refs(|(lhs, rhs)| {
+            arrow_arith::numeric::add(lhs.as_ref() as &dyn Datum, rhs.as_ref() as &dyn Datum)
+                .unwrap()
+        });
+}
+
+// ---------------------------------------------------------------------------
+// Variant 1: try_map_with_mask + closure `|(a, b), _|` (value-only)
+// Fail tracking: bit-pack via the kernel.
+// LLVM DCEs per-lane mask extract.
+// ---------------------------------------------------------------------------
+
+#[divan::bench(args = SIZES)]
+fn bitpack_value_only(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            (
+                f.lhs.clone(),
+                f.rhs.clone(),
+                f.lhs_mask.clone(),
+                f.rhs_mask.clone(),
+            )
+        })
+        .bench_refs(|(lhs, rhs, lm, rm)| {
+            let combined = lm as &BitBuffer & rm as &BitBuffer;
+            let mut out = alloc_out(n);
+            try_map_with_mask(
+                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
+                &combined,
+                out.as_mut_slice(),
+                |(a, b), _valid| a.checked_add(b),
+            )
+            .unwrap();
+            (combined, out)
+        });
+}
+
+// ---------------------------------------------------------------------------
+// Variant 2: try_map_with_mask + closure `|(a, b), valid|` with if-else
+// Fail tracking: bit-pack via the kernel.
+// Closure explicitly suppresses null-lane fails (redundant with bit-pack filter).
+// ---------------------------------------------------------------------------
+
+#[divan::bench(args = SIZES)]
+fn bitpack_closure_suppresses_if_else(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            (
+                f.lhs.clone(),
+                f.rhs.clone(),
+                f.lhs_mask.clone(),
+                f.rhs_mask.clone(),
+            )
+        })
+        .bench_refs(|(lhs, rhs, lm, rm)| {
+            let combined = lm as &BitBuffer & rm as &BitBuffer;
+            let mut out = alloc_out(n);
+            try_map_with_mask(
+                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
+                &combined,
+                out.as_mut_slice(),
+                |(a, b), valid| {
+                    if valid { a.checked_add(b) } else { Some(0) }
+                },
+            )
+            .unwrap();
+            (combined, out)
+        });
+}
+
+// ---------------------------------------------------------------------------
+// Variant 3: try_map_with_mask + closure `.or_else(|| (!valid).then(...))`
+// Fail tracking: bit-pack via the kernel.
+// Lazy suppression: closure only consults `valid` when overflow actually fires.
+// ---------------------------------------------------------------------------
+
+#[divan::bench(args = SIZES)]
+fn bitpack_closure_suppresses_or_else(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            (
+                f.lhs.clone(),
+                f.rhs.clone(),
+                f.lhs_mask.clone(),
+                f.rhs_mask.clone(),
+            )
+        })
+        .bench_refs(|(lhs, rhs, lm, rm)| {
+            let combined = lm as &BitBuffer & rm as &BitBuffer;
+            let mut out = alloc_out(n);
+            try_map_with_mask(
+                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
+                &combined,
+                out.as_mut_slice(),
+                |(a, b), valid| a.checked_add(b).or_else(|| (!valid).then_some(0)),
+            )
+            .unwrap();
+            (combined, out)
+        });
+}
+
+// ---------------------------------------------------------------------------
+// Variant 4: try_map_with_mask + closure with `(a * valid).checked_add(b * valid)`
+// Fail tracking: bit-pack via the kernel.
+// The multiply-by-valid trick zeroes null-lane operands so they can't overflow.
+// ---------------------------------------------------------------------------
+
+#[divan::bench(args = SIZES)]
+fn bitpack_closure_mul_trick(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            (
+                f.lhs.clone(),
+                f.rhs.clone(),
+                f.lhs_mask.clone(),
+                f.rhs_mask.clone(),
+            )
+        })
+        .bench_refs(|(lhs, rhs, lm, rm)| {
+            let combined = lm as &BitBuffer & rm as &BitBuffer;
+            let mut out = alloc_out(n);
+            try_map_with_mask(
+                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
+                &combined,
+                out.as_mut_slice(),
+                |(a, b), valid| {
+                    let m = valid as u32;
+                    (a * m).checked_add(b * m)
+                },
+            )
+            .unwrap();
+            (combined, out)
+        });
+}
+
+// ---------------------------------------------------------------------------
+// Variant 5: hand-rolled, boolean fail_acc, closure suppresses nulls, cold replay
+// ---------------------------------------------------------------------------
+
+/// Hand-rolled kernel: boolean `fail_acc`, cold replay attribution.
+/// `f(lhs[i], rhs[i], mask_bit)` must return `Some(..)` on null lanes itself: any `None`
+/// is stored as `0` and yields `Err(i)` for the first lane the replay finds `None` at.
+#[inline]
+fn handrolled_boolean<F>(
+    lhs: &[u32],
+    rhs: &[u32],
+    mask: &BitBuffer,
+    out: &mut [MaybeUninit<u32>],
+    mut f: F,
+) -> Result<(), usize>
+where
+    F: FnMut(u32, u32, bool) -> Option<u32>,
+{
+    let len = lhs.len();
+    assert_eq!(len, rhs.len());
+    assert_eq!(len, mask.len());
+    assert_eq!(len, out.len());
+    let chunks = mask.chunks(); // assumes `iter()` yields only full 64-bit chunks — TODO confirm
+    let chunks_count = len / 64; // number of full 64-lane chunks
+    let remainder = len % 64; // lanes left over after the full chunks
+
+    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+        let base = chunk_idx * 64;
+        let mut fail_acc: u64 = 0; // non-zero once any lane in this chunk returned None
+        for bit_idx in 0..64 {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: i < len (full chunks only), and lhs/rhs/out all have length len (asserted).
+            let a = unsafe { *lhs.get_unchecked(i) };
+            let b = unsafe { *rhs.get_unchecked(i) };
+            let opt = f(a, b, bit);
+            fail_acc |= opt.is_none() as u64; // sticky; the lane is still written below
+            unsafe { out.get_unchecked_mut(i).write(opt.unwrap_or_default()) };
+        }
+        if fail_acc != 0 {
+            // Cold: find first failing lane (closure already suppressed nulls).
+            for bit_idx in 0..64 {
+                let i = base + bit_idx;
+                let bit = (src_chunk >> bit_idx) & 1 == 1;
+                let a = unsafe { *lhs.get_unchecked(i) }; // SAFETY: same bound as the hot loop
+                let b = unsafe { *rhs.get_unchecked(i) }; // SAFETY: same bound as the hot loop
+                if f(a, b, bit).is_none() {
+                    return Err(i);
+                }
+            }
+        }
+    }
+
+    if remainder != 0 {
+        let src_chunk = chunks.remainder_bits(); // presumably the trailing partial chunk — confirm
+        let base = chunks_count * 64;
+        let mut fail_acc: u64 = 0;
+        for bit_idx in 0..remainder {
+            let i = base + bit_idx; // i < chunks_count * 64 + remainder == len
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            let a = unsafe { *lhs.get_unchecked(i) }; // SAFETY: i < len == lhs.len()
+            let b = unsafe { *rhs.get_unchecked(i) }; // SAFETY: i < len == rhs.len()
+            let opt = f(a, b, bit);
+            fail_acc |= opt.is_none() as u64;
+            unsafe { out.get_unchecked_mut(i).write(opt.unwrap_or_default()) };
+        }
+        if fail_acc != 0 {
+            for bit_idx in 0..remainder {
+                let i = base + bit_idx;
+                let bit = (src_chunk >> bit_idx) & 1 == 1;
+                let a = unsafe { *lhs.get_unchecked(i) }; // SAFETY: i < len, as above
+                let b = unsafe { *rhs.get_unchecked(i) }; // SAFETY: i < len, as above
+                if f(a, b, bit).is_none() {
+                    return Err(i);
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
+#[divan::bench(args = SIZES)]
+fn boolean_closure_suppresses(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            (
+                f.lhs.clone(),
+                f.rhs.clone(),
+                f.lhs_mask.clone(),
+                f.rhs_mask.clone(),
+            )
+        })
+        .bench_refs(|(lhs, rhs, lm, rm)| {
+            let combined = lm as &BitBuffer & rm as &BitBuffer;
+            let mut out = alloc_out(n);
+            handrolled_boolean(
+                lhs.as_slice(),
+                rhs.as_slice(),
+                &combined,
+                out.as_mut_slice(),
+                |a, b, valid| {
+                    if valid { a.checked_add(b) } else { Some(0) }
+                },
+            )
+            .unwrap();
+            (combined, out)
+        });
+}
+
+// ---------------------------------------------------------------------------
+// Variant 6: hand-rolled pre-mask. Kernel zeros null-lane values via bit
+// broadcast, then unconditional add + overflow detect. Boolean fail_acc.
+// ---------------------------------------------------------------------------
+
+#[inline]
+fn handrolled_premask(
+    lhs: &[u32],
+    rhs: &[u32],
+    mask: &BitBuffer,
+    out: &mut [MaybeUninit<u32>],
+) -> Result<(), usize> { // Err(i): first valid lane in the failing chunk whose add overflows
+    let len = lhs.len();
+    assert_eq!(len, rhs.len());
+    assert_eq!(len, mask.len());
+    assert_eq!(len, out.len());
+    let chunks = mask.chunks(); // assumes `iter()` yields only full 64-bit chunks — TODO confirm
+    let chunks_count = len / 64; // number of full 64-lane chunks
+    let remainder = len % 64; // lanes left over after the full chunks
+
+    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+        let base = chunk_idx * 64;
+        let mut fail_acc: u64 = 0; // non-zero once any masked add in this chunk overflowed
+        for bit_idx in 0..64 {
+            // bit-broadcast: 0 → 0x00000000, 1 → 0xFFFFFFFF
+            let lane_mask = (((src_chunk >> bit_idx) & 1) as u32).wrapping_neg();
+            let i = base + bit_idx;
+            // SAFETY: i < len (full chunks only), and lhs/rhs/out all have length len (asserted).
+            let a = unsafe { *lhs.get_unchecked(i) } & lane_mask;
+            let b = unsafe { *rhs.get_unchecked(i) } & lane_mask;
+            let (sum, overflow) = a.overflowing_add(b); // null lanes compute 0 + 0
+            fail_acc |= overflow as u64;
+            unsafe { out.get_unchecked_mut(i).write(sum) };
+        }
+        if fail_acc != 0 {
+            // Cold: walk chunk to find first valid lane that actually overflows on
+            // the unmasked inputs. Null lanes were premasked to 0+0, can't overflow.
+            for bit_idx in 0..64 {
+                let i = base + bit_idx;
+                let bit = (src_chunk >> bit_idx) & 1 == 1;
+                if !bit {
+                    continue;
+                }
+                let a = unsafe { *lhs.get_unchecked(i) }; // SAFETY: same bound as the hot loop
+                let b = unsafe { *rhs.get_unchecked(i) }; // SAFETY: same bound as the hot loop
+                if a.checked_add(b).is_none() {
+                    return Err(i);
+                }
+            }
+        }
+    }
+
+    if remainder != 0 {
+        let src_chunk = chunks.remainder_bits(); // presumably the trailing partial chunk — confirm
+        let base = chunks_count * 64;
+        let mut fail_acc: u64 = 0;
+        for bit_idx in 0..remainder {
+            let lane_mask = (((src_chunk >> bit_idx) & 1) as u32).wrapping_neg();
+            let i = base + bit_idx; // i < chunks_count * 64 + remainder == len
+            let a = unsafe { *lhs.get_unchecked(i) } & lane_mask; // SAFETY: i < len == lhs.len()
+            let b = unsafe { *rhs.get_unchecked(i) } & lane_mask; // SAFETY: i < len == rhs.len()
+            let (sum, overflow) = a.overflowing_add(b);
+            fail_acc |= overflow as u64;
+            unsafe { out.get_unchecked_mut(i).write(sum) };
+        }
+        if fail_acc != 0 {
+            for bit_idx in 0..remainder {
+                let i = base + bit_idx;
+                let bit = (src_chunk >> bit_idx) & 1 == 1;
+                if !bit {
+                    continue;
+                }
+                let a = unsafe { *lhs.get_unchecked(i) }; // SAFETY: i < len, as above
+                let b = unsafe { *rhs.get_unchecked(i) }; // SAFETY: i < len, as above
+                if a.checked_add(b).is_none() {
+                    return Err(i);
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
+#[divan::bench(args = SIZES)]
+fn premask_then_simd(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            (
+                f.lhs.clone(),
+                f.rhs.clone(),
+                f.lhs_mask.clone(),
+                f.rhs_mask.clone(),
+            )
+        })
+        .bench_refs(|(lhs, rhs, lm, rm)| {
+            let combined = lm as &BitBuffer & rm as &BitBuffer;
+            let mut out = alloc_out(n);
+            handrolled_premask(lhs.as_slice(), rhs.as_slice(), &combined, out.as_mut_slice())
+                .unwrap();
+            (combined, out)
+        });
+}
+
+// ---------------------------------------------------------------------------
+// Variant 7: pure SIMD, no mask awareness — CEILING REFERENCE ONLY.
+// Incorrect for arrays where null lanes might overflow; benchmarked just to
+// show the best case (lowest time, i.e. throughput ceiling) for nullable add.
+// ---------------------------------------------------------------------------
+
+#[inline]
+fn handrolled_no_validity(
+    lhs: &[u32],
+    rhs: &[u32],
+    out: &mut [MaybeUninit<u32>],
+) -> Result<(), usize> {
+    assert_eq!(lhs.len(), rhs.len()); // equal lengths make the unchecked indexing below sound
+    assert_eq!(lhs.len(), out.len());
+    let mut fail = false; // sticky overflow flag; the loop never exits early
+    for i in 0..lhs.len() {
+        let a = unsafe { *lhs.get_unchecked(i) }; // SAFETY: i < lhs.len()
+        let b = unsafe { *rhs.get_unchecked(i) }; // SAFETY: rhs.len() == lhs.len() (asserted)
+        let (sum, overflow) = a.overflowing_add(b);
+        fail |= overflow;
+        unsafe { out.get_unchecked_mut(i).write(sum) }; // SAFETY: out.len() == lhs.len()
+    }
+    if fail { Err(0) } else { Ok(()) } // 0 is a placeholder; the failing lane is not tracked
+}
+
+/// Pure-SIMD ceiling on **pre-sanitized** input (null lanes pre-zeroed in the
+/// fixture, outside the timed region). Cannot run on the realistic
+/// `(lhs, rhs)` arrays because their null lanes hold `u32::MAX` and would
+/// Err — proven by [`assert_pure_simd_errs_on_realistic_data`].
+///
+/// Shows the SIMD-only arithmetic ceiling — what an ideal nullable add would
+/// cost if validity handling were free.
+#[divan::bench(args = SIZES)]
+fn pure_simd_no_validity_sanitized(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| (f.lhs_sanitized.clone(), f.rhs_sanitized.clone()))
+        .bench_refs(|(lhs, rhs)| {
+            let mut out = alloc_out(n);
+            handrolled_no_validity(lhs.as_slice(), rhs.as_slice(), out.as_mut_slice()).unwrap();
+            out
+        });
+}
+
+// ---------------------------------------------------------------------------
+// Parity assertions — must pass before divan runs benches.
+// ---------------------------------------------------------------------------
+
+/// Arrow and our bit-pack, boolean and pre-mask kernels must all Err on valid-lane overflow.
+fn assert_overflow_parity() {
+    let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
+    let rhs: Vec<u32> = vec![10, 20, 1, 40];
+    let valid = vec![true; 4];
+
+    let lhs_arrow = UInt32Array::new(
+        ScalarBuffer::from(lhs.clone()),
+        Some(NullBuffer::from(valid.clone())),
+    );
+    let rhs_arrow = UInt32Array::new(
+        ScalarBuffer::from(rhs.clone()),
+        Some(NullBuffer::from(valid.clone())),
+    );
+    let arrow_result =
+        arrow_arith::numeric::add(&lhs_arrow as &dyn Datum, &rhs_arrow as &dyn Datum);
+    assert!(arrow_result.is_err(), "arrow should Err on overflow");
+
+    let mask = {
+        let mut m = BitBufferMut::with_capacity(4);
+        for &v in &valid {
+            m.append(v);
+        }
+        m.freeze()
+    };
+    let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
+    let ours = try_map_with_mask(
+        LaneZip::new(lhs.as_slice(), rhs.as_slice()),
+        &mask,
+        out.as_mut_slice(),
+        |(a, b), _| a.checked_add(b),
+    );
+    assert!(ours.is_err(), "bitpack should Err on overflow");
+
+    let mut out2: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
+    let boolean = handrolled_boolean(&lhs, &rhs, &mask, &mut out2, |a, b, valid| {
+        if valid { a.checked_add(b) } else { Some(0) }
+    });
+    assert!(boolean.is_err(), "boolean should Err on overflow");
+
+    let mut out3: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
+    let prem = handrolled_premask(&lhs, &rhs, &mask, &mut out3);
+    assert!(prem.is_err(), "premask should Err on overflow");
+}
+
+/// The bit-pack (value-only), boolean, and pre-mask variants must NOT Err when only
+/// null lanes would overflow. (Pure-SIMD variant is excluded — it doesn't see validity.)
+fn assert_null_overflow_suppressed() {
+    // Lane 2 is null and contains overflowing values; valid lanes are safe.
+    let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
+    let rhs: Vec<u32> = vec![10, 20, 1, 40];
+    let valid = vec![true, true, false, true];
+
+    let mask = {
+        let mut m = BitBufferMut::with_capacity(4);
+        for &v in &valid {
+            m.append(v);
+        }
+        m.freeze()
+    };
+
+    // Bit-pack with value-only closure — kernel filters null-lane fails.
+    let mut out = alloc_out(4);
+    let r = try_map_with_mask(
+        LaneZip::new(lhs.as_slice(), rhs.as_slice()),
+        &mask,
+        out.as_mut_slice(),
+        |(a, b), _| a.checked_add(b),
+    );
+    assert!(r.is_ok(), "bitpack_value_only: null-lane overflow leaked");
+
+    // Boolean with closure that suppresses nulls.
+    let mut out = alloc_out(4);
+    let r = handrolled_boolean(&lhs, &rhs, &mask, &mut out, |a, b, valid| {
+        if valid { a.checked_add(b) } else { Some(0) }
+    });
+    assert!(r.is_ok(), "boolean_closure_suppresses: null-lane leaked");
+
+    // Pre-mask: kernel zeroes null-lane values.
+    let mut out = alloc_out(4);
+    let r = handrolled_premask(&lhs, &rhs, &mask, &mut out);
+    assert!(r.is_ok(), "premask_then_simd: null-lane overflow leaked");
+}
+
+/// Demonstrates that `handrolled_no_validity` is **incorrect** on realistic
+/// fixture inputs — i.e., when null lanes contain values that overflow on add.
+/// This is what justifies excluding pure_simd from the realistic bench and
+/// running it only on the sanitized inputs. Without this, the "ignore the
+/// mask" approach would look too fast because the test data lets it cheat.
+fn assert_pure_simd_errs_on_realistic_data() {
+    // Lane 2 is a "null lane" in arrow-style storage: bitmap says null, but
+    // the data buffer still holds an overflowing value. The realistic
+    // `fixture` does exactly this.
+    let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
+    let rhs: Vec<u32> = vec![10, 20, 1, 40];
+    let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
+
+    let r = handrolled_no_validity(&lhs, &rhs, &mut out);
+    assert!(
+        r.is_err(),
+        "pure_simd_no_validity should Err on realistic data (null lane has \
+         u32::MAX). If this passes, the bench fixture isn't exercising the \
+         unsafe-null-storage case and the pure_simd ceiling number is \
+         misleading — it's running on data the kernel happens to handle even \
+         without a mask."
+    );
+}
diff --git a/vortex-buffer/benches/pack_vs_unpack.rs b/vortex-buffer/benches/pack_vs_unpack.rs
new file mode 100644
index 00000000000..0ae41fb5573
--- /dev/null
+++ b/vortex-buffer/benches/pack_vs_unpack.rs
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Compare two strategies for handling validity in `try_map_with_mask`:
+//!
+//! 1. **Unpack the mask** — closure consults `valid` per-lane. Null lanes are
+//!    short-circuited inside the closure (return `Some(default)` immediately),
+//!    so the checked operation never runs with garbage. The kernel still does
+//!    its `fail_bits & src_chunk` post-filter, but it's a no-op because the
+//!    closure already produced `Some` at null lanes.
+//!
+//! 2. **Pack and filter** — closure ignores `_valid`. The checked operation
+//!    runs at every lane, including null lanes (where it may produce `None`
+//!    on garbage). The kernel's post-loop `fail_bits & src_chunk` filter
+//!    drops those null-lane fails. LLVM DCEs the per-lane mask extract since
+//!    the closure doesn't consult `valid`.
+//!
+//! Two ops × two strategies, plus `multiply`/`branchless` checked-add variants and arrow baselines.
+//!
+//! - `widen_u16_u32_*` — statically-infallible widening cast. `NumCast::from`
+//!   always returns `Some`; LLVM proves it and strips fail-tracking entirely.
+//! - `checked_add_u32_*` — genuinely fallible: `u32 + u32` can overflow.
+
+#![expect(clippy::unwrap_used)]
+
+use std::mem::MaybeUninit;
+use std::sync::Arc;
+
+use arrow_arith::numeric::add;
+use arrow_array::Datum;
+use arrow_array::UInt16Array;
+use arrow_array::UInt32Array;
+use arrow_buffer::NullBuffer;
+use arrow_buffer::ScalarBuffer;
+use arrow_cast::CastOptions;
+use arrow_cast::cast_with_options;
+use arrow_schema::DataType;
+use divan::Bencher;
+use num_traits::NumCast;
+use rand::SeedableRng;
+use rand::prelude::*;
+use rand::rngs::StdRng;
+use vortex_buffer::BitBuffer;
+use vortex_buffer::BitBufferMut;
+use vortex_buffer::Buffer;
+use vortex_buffer::lane_ops_indexed::LaneZip;
+use vortex_buffer::lane_ops_indexed::try_map_with_mask;
+
+fn main() {
+    divan::main();
+}
+
+const SIZES: &[usize] = &[4_096, 65_536, 1_048_576];
+
+struct Fixture {
+    values_u16: Buffer<u16>,
+    lhs_u32: Buffer<u32>,
+    rhs_u32: Buffer<u32>,
+    mask: BitBuffer,
+    arrow_u16: UInt16Array,
+    arrow_lhs: Arc<UInt32Array>,
+    arrow_rhs: Arc<UInt32Array>,
+}
+
+fn fixture(n: usize) -> Fixture {
+    let mut rng = StdRng::seed_from_u64(0xC0DE_BEEF);
+    // Operands are drawn from `0..u32::MAX / 2`, so `lhs + rhs` never overflows u32
+    // at any lane (null lanes included). All strategies succeed; we time the success path.
+    let raw_lhs: Vec<u32> = (0..n)
+        .map(|_| rng.random_range(0..(u32::MAX / 2)))
+        .collect();
+    let raw_rhs: Vec<u32> = (0..n)
+        .map(|_| rng.random_range(0..(u32::MAX / 2)))
+        .collect();
+    let raw_valid: Vec<bool> = (0..n).map(|_| rng.random_bool(0.8)).collect();
+
+    #[expect(clippy::cast_possible_truncation)]
+    let values_u16: Buffer<u16> = raw_lhs.iter().map(|&v| v as u16).collect();
+    let lhs_u32: Buffer<u32> = raw_lhs.iter().copied().collect();
+    let rhs_u32: Buffer<u32> = raw_rhs.iter().copied().collect();
+
+    let mask = {
+        let mut m = BitBufferMut::with_capacity(n);
+        for &v in &raw_valid {
+            m.append(v);
+        }
+        m.freeze()
+    };
+
+    #[expect(clippy::cast_possible_truncation)]
+    let arrow_u16 = UInt16Array::new(
+        ScalarBuffer::from(raw_lhs.iter().map(|&v| v as u16).collect::<Vec<u16>>()),
+        Some(NullBuffer::from(raw_valid.clone())),
+    );
+    let arrow_lhs = Arc::new(UInt32Array::new(
+        ScalarBuffer::from(raw_lhs),
+        Some(NullBuffer::from(raw_valid.clone())),
+    ));
+    let arrow_rhs = Arc::new(UInt32Array::new(
+        ScalarBuffer::from(raw_rhs),
+        Some(NullBuffer::from(raw_valid)),
+    ));
+
+    Fixture {
+        values_u16,
+        lhs_u32,
+        rhs_u32,
+        mask,
+        arrow_u16,
+        arrow_lhs,
+        arrow_rhs,
+    }
+}
+
+fn uninit_out<T>(n: usize) -> Vec<MaybeUninit<T>> {
+    let mut out = Vec::with_capacity(n);
+    // SAFETY: `with_capacity(n)` reserved room for `n` slots, and `MaybeUninit<T>` needs no init.
+    unsafe { out.set_len(n) };
+    out
+}
+
+const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions {
+    safe: false,
+    format_options: arrow_cast::display::FormatOptions::new(),
+};
+
+// -----------------------------------------------------------------------------
+// Widening cast u16 → u32 (statically infallible). NumCast::from never returns
+// None for widening, so the failure path is dead in both strategies.
+// -----------------------------------------------------------------------------
+
+/// Strategy 1 (unpack mask): closure consults `valid`, short-circuits at null
+/// lanes. For widening the short-circuit is dead anyway (no failure possible).
+#[divan::bench(args = SIZES)]
+fn widen_u16_u32_unpack_mask(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mask, mut out)| {
+            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| {
+                if !valid {
+                    return Some(0u32);
+                }
+                <u32 as NumCast>::from(v)
+            })
+            .unwrap();
+            out
+        });
+}
+
+/// Strategy 2 (pack and filter): closure ignores `_valid`. LLVM DCEs the
+/// per-lane mask extract; post-loop `& src_chunk` would filter null-lane fails
+/// (none happen for widening).
+#[divan::bench(args = SIZES)]
+fn widen_u16_u32_pack_and_filter(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mask, mut out)| {
+            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| {
+                <u32 as NumCast>::from(v)
+            })
+            .unwrap();
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn widen_u16_u32_arrow(bencher: Bencher, n: usize) {
+    let f = fixture(n); // Arrow baseline: checked (`safe: false`) u16 -> u32 cast on the same data.
+    bencher
+        .with_inputs(|| f.arrow_u16.clone())
+        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
+}
+
+// -----------------------------------------------------------------------------
+// Checked add u32 + u32 → u32 (genuinely fallible). LaneZip(lhs, rhs) drives
+// two-input lanewise.
+// -----------------------------------------------------------------------------
+
+/// Strategy 1 (unpack mask): closure short-circuits null lanes; `checked_add`
+/// only runs at valid lanes.
+#[divan::bench(args = SIZES)]
+fn checked_add_u32_unpack_mask(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            (
+                f.lhs_u32.clone(),
+                f.rhs_u32.clone(),
+                f.mask.clone(),
+                uninit_out::<u32>(n),
+            )
+        })
+        .bench_values(|(lhs, rhs, mask, mut out)| {
+            try_map_with_mask(
+                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
+                &mask,
+                out.as_mut_slice(),
+                |(a, b), valid| {
+                    if !valid {
+                        return Some(0u32);
+                    }
+                    a.checked_add(b)
+                },
+            )
+            .unwrap();
+            out
+        });
+}
+
+/// Strategy 2 (pack and filter): `checked_add` runs at every lane (including
+/// null lanes with garbage values); kernel's `fail_bits & src_chunk` post-filter
+/// drops any null-lane fails.
+#[divan::bench(args = SIZES)]
+fn checked_add_u32_pack_and_filter(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            (
+                f.lhs_u32.clone(),
+                f.rhs_u32.clone(),
+                f.mask.clone(),
+                uninit_out::<u32>(n),
+            )
+        })
+        .bench_values(|(lhs, rhs, mask, mut out)| {
+            try_map_with_mask(
+                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
+                &mask,
+                out.as_mut_slice(),
+                |(a, b), _valid| a.checked_add(b),
+            )
+            .unwrap();
+            out
+        });
+}
+
+// Asm-extraction helpers: `#[unsafe(no_mangle)] #[inline(never)]` so a single
+// `cargo rustc --emit=asm` produces clearly-labeled symbols to diff.
+
+#[unsafe(no_mangle)]
+#[inline(never)]
+pub fn asm_add_unpack_branchy(
+    lhs: &[u32],
+    rhs: &[u32],
+    mask: &BitBuffer,
+    out: &mut [MaybeUninit<u32>],
+) -> Result<(), usize> {
+    try_map_with_mask(
+        LaneZip::new(lhs, rhs),
+        mask,
+        out,
+        |(a, b), valid| {
+            if !valid {
+                return Some(0u32);
+            }
+            a.checked_add(b)
+        },
+    )
+}
+
+#[unsafe(no_mangle)]
+#[inline(never)]
+pub fn asm_add_unpack_branchless(
+    lhs: &[u32],
+    rhs: &[u32],
+    mask: &BitBuffer,
+    out: &mut [MaybeUninit<u32>],
+) -> Result<(), usize> {
+    try_map_with_mask(
+        LaneZip::new(lhs, rhs),
+        mask,
+        out,
+        |(a, b), valid| {
+            // Compute first, then select. No early-return; LLVM may if-convert.
+            let r = a.checked_add(b);
+            if valid { r } else { Some(0u32) }
+        },
+    )
+}
+
+#[unsafe(no_mangle)]
+#[inline(never)]
+pub fn asm_add_unpack_multiply(
+    lhs: &[u32],
+    rhs: &[u32],
+    mask: &BitBuffer,
+    out: &mut [MaybeUninit<u32>],
+) -> Result<(), usize> {
+    try_map_with_mask(
+        LaneZip::new(lhs, rhs),
+        mask,
+        out,
+        |(a, b), valid| {
+            // Neutralize null lanes via multiply (BIC); checked_add runs unconditionally.
+            let m = valid as u32;
+            (a * m).checked_add(b * m)
+        },
+    )
+}
+
+#[unsafe(no_mangle)]
+#[inline(never)]
+pub fn asm_add_pack_filter(
+    lhs: &[u32],
+    rhs: &[u32],
+    mask: &BitBuffer,
+    out: &mut [MaybeUninit<u32>],
+) -> Result<(), usize> {
+    try_map_with_mask(
+        LaneZip::new(lhs, rhs),
+        mask,
+        out,
+        |(a, b), _valid| a.checked_add(b),
+    )
+}
+
+/// Branchless-multiply variant of unpack_mask: scale lhs/rhs by `valid as u32` so
+/// the checked op runs at every lane (with zeros at null lanes — never overflows)
+/// and the kernel's post-loop `& src_chunk` filter still applies.
+#[divan::bench(args = SIZES)]
+fn checked_add_u32_unpack_multiply(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            (
+                f.lhs_u32.clone(),
+                f.rhs_u32.clone(),
+                f.mask.clone(),
+                uninit_out::<u32>(n),
+            )
+        })
+        .bench_values(|(lhs, rhs, mask, mut out)| {
+            try_map_with_mask(
+                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
+                &mask,
+                out.as_mut_slice(),
+                |(a, b), valid| {
+                    let m = valid as u32;
+                    (a * m).checked_add(b * m)
+                },
+            )
+            .unwrap();
+            out
+        });
+}
+
+/// Compute-first-then-select variant of unpack_mask: removes the early `return`,
+/// keeps the `valid` consult per-lane. Tests whether LLVM if-converts when both
+/// branches are pure expressions.
+#[divan::bench(args = SIZES)]
+fn checked_add_u32_unpack_branchless(bencher: Bencher, n: usize) {
+    let f = fixture(n);
+    bencher
+        .with_inputs(|| {
+            (
+                f.lhs_u32.clone(),
+                f.rhs_u32.clone(),
+                f.mask.clone(),
+                uninit_out::<u32>(n),
+            )
+        })
+        .bench_values(|(lhs, rhs, mask, mut out)| {
+            try_map_with_mask(
+                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
+                &mask,
+                out.as_mut_slice(),
+                |(a, b), valid| {
+                    let r = a.checked_add(b);
+                    if valid { r } else { Some(0u32) }
+                },
+            )
+            .unwrap();
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn checked_add_u32_arrow(bencher: Bencher, n: usize) {
+    let f = fixture(n); // Arrow baseline: `arrow_arith::numeric::add` over the same lhs/rhs/validity.
+    bencher
+        .with_inputs(|| (f.arrow_lhs.clone(), f.arrow_rhs.clone()))
+        .bench_refs(|(lhs, rhs)| {
+            let lhs_datum: &dyn Datum = lhs.as_ref();
+            let rhs_datum: &dyn Datum = rhs.as_ref();
+            add(lhs_datum, rhs_datum).unwrap()
+        });
+}
diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs
index 47887b92810..144f83a429a 100644
--- a/vortex-buffer/src/lane_ops_indexed.rs
+++ b/vortex-buffer/src/lane_ops_indexed.rs
@@ -22,52 +22,6 @@ use std::mem::MaybeUninit;
 
 use crate::BitBuffer;
 
-macro_rules! for_full_lanes {
-    ($base:expr, | $bit_idx:ident, $i:ident | $body:block) => {
-        for $bit_idx in 0..64 {
-            let $i = $base + $bit_idx;
-            $body
-        }
-    };
-}
-
-macro_rules! for_remainder_lanes {
-    ($base:expr, $remainder:expr, | $bit_idx:ident, $i:ident | $body:block) => {
-        for $bit_idx in 0..$remainder {
-            let $i = $base + $bit_idx;
-            $body
-        }
-    };
-}
-
-macro_rules! for_full_mask_lanes {
-    ($src_chunk:expr, $base:expr, | $bit_idx:ident, $i:ident, $valid:ident | $body:block) => {
-        for $bit_idx in 0..64 {
-            let $i = $base + $bit_idx;
-            let $valid = ($src_chunk >> $bit_idx) & 1 == 1;
-            $body
-        }
-    };
-}
-
-macro_rules! for_remainder_mask_lanes {
-    (
-        $src_chunk:expr,
-        $base:expr,
-        $remainder:expr, |
-        $bit_idx:ident,
-        $i:ident,
-        $valid:ident |
-        $body:block
-    ) => {
-        for $bit_idx in 0..$remainder {
-            let $i = $base + $bit_idx;
-            let $valid = ($src_chunk >> $bit_idx) & 1 == 1;
-            $body
-        }
-    };
-}
-
 /// A length-known source supporting unchecked indexed reads.
 ///
 /// Implemented for `&[T]` (with `T: Copy`) and for [`LaneZip`] over two `IndexedSource`s.
@@ -192,6 +146,30 @@ where
     S: IndexedSource,
     F: FnMut(S::Item, bool) -> R,
 {
+    /// Per-chunk worker: writes `f(values[i], mask bit)` to `out[i]` for each lane in
+    /// `[base, base + count)`. Called with literal `64` for full chunks and `remainder`
+    /// for the tail; `#[inline(always)]` lets the constant `64` reach the loop bound.
+    #[inline(always)]
+    fn chunk<S, R, F>(
+        values: &S,
+        out: &mut [MaybeUninit<R>],
+        f: &mut F,
+        src_chunk: u64,
+        base: usize,
+        count: usize,
+    ) where
+        S: IndexedSource,
+        F: FnMut(S::Item, bool) -> R,
+    {
+        for bit_idx in 0..count {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: caller ensures base + count <= len == out.len(); covers the read and write.
+            let v = unsafe { values.get_unchecked(i) };
+            unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
+        }
+    }
+
     let len = values.len();
     assert_eq!(len, mask.len(), "values and mask must have the same length");
     assert_eq!(out.len(), len, "out must have the same length as values");
@@ -201,25 +179,13 @@ where
     let remainder = len % 64;
 
     for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        let base = chunk_idx * 64;
-        // Inner loop is fixed-size 64 with independent per-lane reads — no iterator
-        // state, no cross-iteration dependency, so the auto-vectorizer can fuse
-        // 64 indexed loads into vector loads.
-        for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| {
-            // SAFETY: i < chunks_count * 64 <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
-        });
+        chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64);
     }
-
     if remainder != 0 {
-        let src_chunk = chunks.remainder_bits();
-        let base = chunks_count * 64;
-        for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| {
-            // SAFETY: i < len.
-            let v = unsafe { values.get_unchecked(i) };
-            unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
-        });
+        chunk(
+            &values, out, &mut f,
+            chunks.remainder_bits(), chunks_count * 64, remainder,
+        );
     }
 }
 
@@ -267,6 +233,38 @@ where
     R: Copy + Default,
     F: FnMut(S::Item, bool) -> Option<R>,
 {
+    /// Writes `f(..)` (or `R::default()` on `None`) per lane and bit-packs `is_none()`
+    /// into `fail_bits`; `& src_chunk` then drops null-lane fails. Returns the index of
+    /// the lowest failing *valid* lane in `[base, base + count)`, if any.
+    #[inline(always)]
+    fn chunk<S, R, F>(
+        values: &S,
+        out: &mut [MaybeUninit<R>],
+        f: &mut F,
+        src_chunk: u64,
+        base: usize,
+        count: usize,
+    ) -> Option<usize>
+    where
+        S: IndexedSource,
+        R: Copy + Default,
+        F: FnMut(S::Item, bool) -> Option<R>,
+    {
+        let mut fail_bits: u64 = 0;
+        for bit_idx in 0..count {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: caller ensures base + count <= len == out.len(); covers the read and write.
+            let v = unsafe { values.get_unchecked(i) };
+            let opt = f(v, bit);
+            fail_bits |= (opt.is_none() as u64) << bit_idx;
+            let r = opt.unwrap_or_default();
+            unsafe { out.get_unchecked_mut(i).write(r) };
+        }
+        let valid_failures = fail_bits & src_chunk;
+        (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize)
+    }
+
     let len = values.len();
     assert_eq!(len, mask.len(), "values and mask must have the same length");
     assert_eq!(out.len(), len, "out must have the same length as values");
@@ -276,49 +274,18 @@ where
     let remainder = len % 64;
 
     for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        let base = chunk_idx * 64;
-        // Bit-pack per-lane fails into a u64 at lane-position. `bit_idx` is a
-        // compile-time constant after unrolling, so the shift folds. The
-        // `src_chunk` here is the validity bitmap for this chunk; the closure
-        // still gets `bit` per lane — LLVM DCEs the per-lane mask extract if
-        // the closure ignores it.
-        let mut fail_bits: u64 = 0;
-        for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| {
-            // SAFETY: i < chunks_count * 64 <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            let opt = f(v, bit);
-            fail_bits |= (opt.is_none() as u64) << bit_idx;
-            let r = opt.unwrap_or_default();
-            // SAFETY: i < len.
-            unsafe { out.get_unchecked_mut(i).write(r) };
-        });
-        // Drop null-lane failures: only failures at lanes the mask marks as
-        // valid count. Direct attribution via trailing_zeros — no cold replay.
-        let valid_failures = fail_bits & src_chunk;
-        if valid_failures != 0 {
-            return Err(base + valid_failures.trailing_zeros() as usize);
+        if let Some(idx) = chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64) {
+            return Err(idx);
         }
     }
-
     if remainder != 0 {
-        let src_chunk = chunks.remainder_bits();
-        let base = chunks_count * 64;
-        let mut fail_bits: u64 = 0;
-        for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| {
-            // SAFETY: i < len.
-            let v = unsafe { values.get_unchecked(i) };
-            let opt = f(v, bit);
-            fail_bits |= (opt.is_none() as u64) << bit_idx;
-            let r = opt.unwrap_or_default();
-            // SAFETY: i < len.
-            unsafe { out.get_unchecked_mut(i).write(r) };
-        });
-        let valid_failures = fail_bits & src_chunk;
-        if valid_failures != 0 {
-            return Err(base + valid_failures.trailing_zeros() as usize);
+        if let Some(idx) = chunk(
+            &values, out, &mut f,
+            chunks.remainder_bits(), chunks_count * 64, remainder,
+        ) {
+            return Err(idx);
         }
     }
-
     Ok(())
 }
 
@@ -341,6 +308,25 @@ where
     S: IndexedSource,
     F: FnMut(S::Item) -> R,
 {
+    #[inline(always)]
+    fn chunk<S, R, F>(
+        values: &S,
+        out: &mut [MaybeUninit<R>],
+        f: &mut F,
+        base: usize,
+        count: usize,
+    ) where
+        S: IndexedSource,
+        F: FnMut(S::Item) -> R,
+    {
+        for bit_idx in 0..count {
+            let i = base + bit_idx;
+            // SAFETY: caller ensures base + count <= len == out.len(); covers the read and write.
+            let v = unsafe { values.get_unchecked(i) };
+            unsafe { out.get_unchecked_mut(i).write(f(v)) };
+        }
+    }
+
     let len = values.len();
     assert_eq!(out.len(), len, "out must have the same length as values");
 
@@ -348,21 +334,10 @@ where
     let remainder = len % 64;
 
     for chunk_idx in 0..chunks_count {
-        let base = chunk_idx * 64;
-        for_full_lanes!(base, |bit_idx, i| {
-            // SAFETY: i < chunks_count * 64 <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            unsafe { out.get_unchecked_mut(i).write(f(v)) };
-        });
+        chunk(&values, out, &mut f, chunk_idx * 64, 64);
     }
-
     if remainder != 0 {
-        let base = chunks_count * 64;
-        for_remainder_lanes!(base, remainder, |bit_idx, i| {
-            // SAFETY: i < len.
-            let v = unsafe { values.get_unchecked(i) };
-            unsafe { out.get_unchecked_mut(i).write(f(v)) };
-        });
+        chunk(&values, out, &mut f, chunks_count * 64, remainder);
     }
 }
 
@@ -397,6 +372,35 @@ where
     R: Copy + Default,
     F: FnMut(S::Item) -> Option<R>,
 {
+    /// Returns `true` if any lane in `[base, base+count)` failed (OR-reduced);
+    /// the cold attribution path is called at the kernel level so it can be
+    /// inlined separately for full vs remainder.
+    #[inline(always)]
+    fn chunk<S, R, F>(
+        values: &S,
+        out: &mut [MaybeUninit<R>],
+        f: &mut F,
+        base: usize,
+        count: usize,
+    ) -> bool
+    where
+        S: IndexedSource,
+        R: Copy + Default,
+        F: FnMut(S::Item) -> Option<R>,
+    {
+        let mut fail_acc: u64 = 0;
+        for bit_idx in 0..count {
+            let i = base + bit_idx;
+            // SAFETY: caller ensures base + count <= len == out.len(); covers the read and write.
+            let v = unsafe { values.get_unchecked(i) };
+            let opt = f(v);
+            fail_acc |= opt.is_none() as u64;
+            let r = opt.unwrap_or_default();
+            unsafe { out.get_unchecked_mut(i).write(r) };
+        }
+        fail_acc != 0
+    }
+
     let len = values.len();
     assert_eq!(out.len(), len, "out must have the same length as values");
 
@@ -405,38 +409,16 @@ where
 
     for chunk_idx in 0..chunks_count {
         let base = chunk_idx * 64;
-        let mut fail_acc: u64 = 0;
-        for_full_lanes!(base, |bit_idx, i| {
-            // SAFETY: i < chunks_count * 64 <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            let opt = f(v);
-            fail_acc |= opt.is_none() as u64;
-            let r = opt.unwrap_or_default();
-            // SAFETY: i < len.
-            unsafe { out.get_unchecked_mut(i).write(r) };
-        });
-        if fail_acc != 0 {
+        if chunk(&values, out, &mut f, base, 64) {
             return Err(attribute_failure_no_mask(&values, base, 64, &mut f));
         }
     }
-
     if remainder != 0 {
         let base = chunks_count * 64;
-        let mut fail_acc: u64 = 0;
-        for_remainder_lanes!(base, remainder, |bit_idx, i| {
-            // SAFETY: i < len.
-            let v = unsafe { values.get_unchecked(i) };
-            let opt = f(v);
-            fail_acc |= opt.is_none() as u64;
-            let r = opt.unwrap_or_default();
-            // SAFETY: i < len.
-            unsafe { out.get_unchecked_mut(i).write(r) };
-        });
-        if fail_acc != 0 {
+        if chunk(&values, out, &mut f, base, remainder) {
             return Err(attribute_failure_no_mask(&values, base, remainder, &mut f));
         }
     }
-
     Ok(())
 }
 
@@ -490,6 +472,27 @@ where
     S: IndexedSink,
     F: FnMut(S::Item, bool) -> S::Item,
 {
+    #[inline(always)]
+    fn chunk<S, F>(
+        values: &mut S,
+        f: &mut F,
+        src_chunk: u64,
+        base: usize,
+        count: usize,
+    ) where
+        S: IndexedSink,
+        F: FnMut(S::Item, bool) -> S::Item,
+    {
+        for bit_idx in 0..count {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: caller guarantees base + count <= len; covers both the get and the set.
+            let v = unsafe { values.get_unchecked(i) };
+            let r = f(v, bit);
+            unsafe { values.set_unchecked(i, r) };
+        }
+    }
+
     let len = values.len();
     assert_eq!(len, mask.len(), "values and mask must have the same length");
 
@@ -498,26 +501,13 @@ where
     let remainder = len % 64;
 
     for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        let base = chunk_idx * 64;
-        for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| {
-            // SAFETY: i < chunks_count * 64 <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            let r = f(v, bit);
-            // SAFETY: i < len.
-            unsafe { values.set_unchecked(i, r) };
-        });
+        chunk(&mut values, &mut f, src_chunk, chunk_idx * 64, 64);
     }
-
     if remainder != 0 {
-        let src_chunk = chunks.remainder_bits();
-        let base = chunks_count * 64;
-        for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| {
-            // SAFETY: i < len.
-            let v = unsafe { values.get_unchecked(i) };
-            let r = f(v, bit);
-            // SAFETY: i < len.
-            unsafe { values.set_unchecked(i, r) };
-        });
+        chunk(
+            &mut values, &mut f,
+            chunks.remainder_bits(), chunks_count * 64, remainder,
+        );
     }
 }
 
@@ -560,6 +550,39 @@ where
     S::Item: Default,
     F: FnMut(S::Item, bool) -> Option<S::Item>,
 {
+    /// Returns `Some(i as u32)` for the lowest lane `i` in `[base, base+count)` where `f`
+    /// returned `None` (null lanes are not filtered here), else `None`. NOTE(review): the cast
+    /// truncates once `i >= 2^32`, and `u32::MAX` doubles as the "no failure" sentinel.
+    /// `#[inline(always)]` so the literal `64` at the full-chunk call site const-propagates.
+    #[inline(always)]
+    #[allow(clippy::cast_possible_truncation)]
+    fn chunk<S, F>(
+        values: &mut S,
+        src_chunk: u64,
+        base: usize,
+        count: usize,
+        f: &mut F,
+    ) -> Option<u32>
+    where
+        S: IndexedSink,
+        S::Item: Default,
+        F: FnMut(S::Item, bool) -> Option<S::Item>,
+    {
+        let mut first_fail: u32 = u32::MAX;
+        for bit_idx in 0..count {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: caller guarantees `base + count <= values.len()` (covers get and set).
+            let v = unsafe { values.get_unchecked(i) };
+            let opt = f(v, bit);
+            let candidate = if opt.is_none() { i as u32 } else { u32::MAX };
+            first_fail = first_fail.min(candidate);
+            let r = opt.unwrap_or_default();
+            unsafe { values.set_unchecked(i, r) };
+        }
+        (first_fail != u32::MAX).then_some(first_fail)
+    }
+
     let len = values.len();
     assert_eq!(len, mask.len(), "values and mask must have the same length");
 
@@ -568,68 +591,22 @@ where
     let remainder = len % 64;
 
     for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        // `count = 64` is a literal; `#[inline(always)]` on the helper inlines its body
-        // into this loop and the compiler propagates 64 into the inner `0..count` bound,
-        // unrolling exactly as `for_full_mask_lanes!` would.
-        if let Some(failing) = try_inplace_chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f)
-        {
+        if let Some(failing) = chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) {
             return Err(failing as usize);
         }
     }
-
     if remainder != 0 {
-        // Runtime `count = remainder` — same shape as the prior remainder loop.
-        if let Some(failing) = try_inplace_chunk(
+        if let Some(failing) = chunk(
             &mut values,
-            chunks.remainder_bits(),
-            chunks_count * 64,
-            remainder,
+            chunks.remainder_bits(), chunks_count * 64, remainder,
             &mut f,
         ) {
             return Err(failing as usize);
         }
     }
-
     Ok(())
 }
 
-/// Per-chunk worker for [`try_map_with_mask_in_place`]. Body written once; the kernel
-/// calls this twice (with `count = 64` for full chunks, `count = remainder` for the
-/// tail). `#[inline(always)]` so the const-64 unroll for the full-chunk callers is
-/// preserved.
-///
-/// Returns `Some(first_failing_lane_index_as_u32)` if any lane in `[base, base+count)`
-/// failed (cast width-truncated since `i < 2^32` in any realistic batch), else `None`.
-#[inline(always)]
-#[allow(clippy::cast_possible_truncation)]
-fn try_inplace_chunk<S, F>(
-    values: &mut S,
-    src_chunk: u64,
-    base: usize,
-    count: usize,
-    f: &mut F,
-) -> Option<u32>
-where
-    S: IndexedSink,
-    S::Item: Default,
-    F: FnMut(S::Item, bool) -> Option<S::Item>,
-{
-    let mut first_fail: u32 = u32::MAX;
-    for bit_idx in 0..count {
-        let i = base + bit_idx;
-        let bit = (src_chunk >> bit_idx) & 1 == 1;
-        // SAFETY: caller guarantees `base + count <= values.len()`.
-        let v = unsafe { values.get_unchecked(i) };
-        let opt = f(v, bit);
-        let candidate = if opt.is_none() { i as u32 } else { u32::MAX };
-        first_fail = first_fail.min(candidate);
-        let r = opt.unwrap_or_default();
-        // SAFETY: same as above.
-        unsafe { values.set_unchecked(i, r) };
-    }
-    (first_fail != u32::MAX).then_some(first_fail)
-}
-
 /// Apply `f(value) -> bool` lane-by-lane, packing into `out` as `u64` words.
 ///
 /// This is the validity-free sibling of [`map_with_mask_to_bits`]. Use it when the
@@ -649,6 +626,22 @@ where
     S: IndexedSource,
     F: FnMut(S::Item) -> bool,
 {
+    #[inline(always)]
+    fn chunk<S, F>(values: &S, f: &mut F, base: usize, count: usize) -> u64
+    where
+        S: IndexedSource,
+        F: FnMut(S::Item) -> bool,
+    {
+        let mut packed = 0u64;
+        for bit_idx in 0..count {
+            let i = base + bit_idx;
+            // SAFETY: caller guarantees base + count <= len.
+            let v = unsafe { values.get_unchecked(i) };
+            packed |= (f(v) as u64) << bit_idx;
+        }
+        packed
+    }
+
     let len = values.len();
     assert_eq!(
         out.len(),
@@ -660,25 +653,12 @@ where
     let remainder = len % 64;
 
     for chunk_idx in 0..chunks_count {
-        let base = chunk_idx * 64;
-        let mut packed = 0u64;
-        for_full_lanes!(base, |bit_idx, i| {
-            // SAFETY: base + bit_idx < chunks_count * 64 <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            packed |= (f(v) as u64) << bit_idx;
-        });
+        let packed = chunk(&values, &mut f, chunk_idx * 64, 64);
         // SAFETY: chunk_idx < chunks_count <= out.len().
         unsafe { *out.get_unchecked_mut(chunk_idx) = packed };
     }
-
     if remainder != 0 {
-        let base = chunks_count * 64;
-        let mut packed = 0u64;
-        for_remainder_lanes!(base, remainder, |bit_idx, i| {
-            // SAFETY: base + bit_idx < len.
-            let v = unsafe { values.get_unchecked(i) };
-            packed |= (f(v) as u64) << bit_idx;
-        });
+        let packed = chunk(&values, &mut f, chunks_count * 64, remainder);
         // SAFETY: chunks_count < out.len() because remainder != 0.
         unsafe { *out.get_unchecked_mut(chunks_count) = packed };
     }
@@ -698,6 +678,29 @@ where
     S: IndexedSource,
     F: FnMut(S::Item, bool) -> bool,
 {
+    #[inline(always)]
+    fn chunk<S, F>(
+        values: &S,
+        f: &mut F,
+        src_chunk: u64,
+        base: usize,
+        count: usize,
+    ) -> u64
+    where
+        S: IndexedSource,
+        F: FnMut(S::Item, bool) -> bool,
+    {
+        let mut packed = 0u64;
+        for bit_idx in 0..count {
+            let i = base + bit_idx;
+            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: caller guarantees base + count <= len.
+            let v = unsafe { values.get_unchecked(i) };
+            packed |= (f(v, bit) as u64) << bit_idx;
+        }
+        packed
+    }
+
     let len = values.len();
     assert_eq!(len, mask.len(), "values and mask must have the same length");
     assert_eq!(
@@ -711,26 +714,15 @@ where
     let remainder = len % 64;
 
     for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        let base = chunk_idx * 64;
-        let mut packed = 0u64;
-        for_full_mask_lanes!(src_chunk, base, |bit_idx, i, bit| {
-            // SAFETY: i < chunks_count * 64 <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            packed |= (f(v, bit) as u64) << bit_idx;
-        });
+        let packed = chunk(&values, &mut f, src_chunk, chunk_idx * 64, 64);
         // SAFETY: chunk_idx < chunks_count <= out.len().
         unsafe { *out.get_unchecked_mut(chunk_idx) = packed };
     }
-
     if remainder != 0 {
-        let src_chunk = chunks.remainder_bits();
-        let base = chunks_count * 64;
-        let mut packed = 0u64;
-        for_remainder_mask_lanes!(src_chunk, base, remainder, |bit_idx, i, bit| {
-            // SAFETY: i < len.
-            let v = unsafe { values.get_unchecked(i) };
-            packed |= (f(v, bit) as u64) << bit_idx;
-        });
+        let packed = chunk(
+            &values, &mut f,
+            chunks.remainder_bits(), chunks_count * 64, remainder,
+        );
         // SAFETY: chunks_count < out.len() because remainder != 0.
         unsafe { *out.get_unchecked_mut(chunks_count) = packed };
     }

From d2bca9357ab16ee786778e8f765eafc3f27d0b49 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 14:58:56 +0100
Subject: [PATCH 08/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/arrays/primitive/compute/cast.rs      |  88 ++-
 vortex-buffer/src/lane_ops_indexed.rs         | 555 +++++++++---------
 2 files changed, 362 insertions(+), 281 deletions(-)

diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs
index 8cdd27cb5c5..ad0d1c8e399 100644
--- a/vortex-array/src/arrays/primitive/compute/cast.rs
+++ b/vortex-array/src/arrays/primitive/compute/cast.rs
@@ -1,13 +1,19 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use std::mem::align_of;
+use std::mem::size_of;
+
 use num_traits::AsPrimitive;
 use num_traits::NumCast;
+use vortex_buffer::BitBuffer;
 use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
+use vortex_buffer::lane_ops_indexed::ReinterpretSink;
 use vortex_buffer::lane_ops_indexed::map_no_validity;
 use vortex_buffer::lane_ops_indexed::try_map_no_validity;
 use vortex_buffer::lane_ops_indexed::try_map_with_mask;
+use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
 use vortex_error::vortex_err;
@@ -147,9 +153,21 @@ where
     // Skip the fallible kernel when the conversion is infallible by type alone (widening) or
     // when cached min/max prove every value fits in `T`.
     let target_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable);
-    if casts_losslessly_to(F::PTYPE, T::PTYPE)
-        || cached_values_fit_in(array, &target_dtype) == Some(true)
+    let infallible = casts_losslessly_to(F::PTYPE, T::PTYPE)
+        || cached_values_fit_in(array, &target_dtype) == Some(true);
+
+    // Same-bit-width in-place fast path: when F and T have the same byte width and the
+    // buffer is uniquely owned, mutate in place and transmute the wrapper. Saves the
+    // output allocation. Falls through to the out-of-place path when the buffer is shared
+    // (the common case under the current borrow-based kernel API).
+    let same_bit_width = F::PTYPE.byte_width() == T::PTYPE.byte_width();
+    if same_bit_width
+        && let Ok(buffer_mut) = array.into_owned().try_into_buffer_mut::<F>()
     {
+        return cast_buffer_in_place::<F, T>(buffer_mut, array, new_validity, ctx, infallible);
+    }
+
+    if infallible {
         let mut buffer = BufferMut::<T>::with_capacity(values.len());
         // Truncating `as`-cast — safe here because stats prove every valid value fits.
         // Null lanes' underlying garbage gets truncated/wrapped (harmless: the result
@@ -204,6 +222,72 @@ where
     Ok(PrimitiveArray::new(buffer, new_validity).into_array())
 }
 
+/// In-place cast of an owned `BufferMut<F>` to `BufferMut<T>` when `F` and `T` have the
+/// same byte width. Each slot is read as `F`, converted, and written back as `T`-bits
+/// using `BufferMut`'s transmute family. Avoids allocating a second output buffer.
+///
+/// Panics if `F` and `T` differ in size or alignment (callers pre-check `byte_width()`).
+fn cast_buffer_in_place<F, T>(
+    buffer: BufferMut<F>,
+    array: ArrayView<'_, Primitive>,
+    new_validity: Validity,
+    ctx: &mut ExecutionCtx,
+    infallible: bool,
+) -> VortexResult<ArrayRef>
+where
+    F: NativePType + AsPrimitive<T>,
+    T: NativePType,
+{
+    assert_eq!(size_of::<F>(), size_of::<T>(), "in-place cast requires equal size");
+    assert_eq!(align_of::<F>(), align_of::<T>(), "in-place cast requires equal alignment");
+
+    if infallible {
+        // `map_each_in_place` does the BufferMut<F> → BufferMut<T> transmute internally
+        // (same size + alignment for primitives of equal byte width) and walks each slot
+        // with the closure.
+        let result: BufferMut<T> = buffer.map_each_in_place(|v: F| v.as_());
+        return Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array());
+    }
+
+    let mask = array.validity()?.execute_mask(array.len(), ctx)?;
+    let overflow = || {
+        vortex_err!(
+            Compute: "Cannot cast {} to {} — value exceeds target range",
+            F::PTYPE, T::PTYPE,
+        )
+    };
+
+    // All-null short-circuit: zero out the buffer and skip the conversion loop entirely.
+    if matches!(mask, Mask::AllFalse(_)) {
+        // SAFETY: `F` and `T` have equal size and alignment (asserted at function entry).
+        let mut t_buf: BufferMut<T> = unsafe { buffer.transmute::<T>() };
+        t_buf.as_mut_slice().fill(T::zero());
+        return Ok(PrimitiveArray::new(t_buf.freeze(), new_validity).into_array());
+    }
+
+    let bit_buffer = match &mask {
+        Mask::AllTrue(n) => BitBuffer::new_set(*n),
+        Mask::AllFalse(_) => unreachable!("handled above"),
+        Mask::Values(m) => m.bit_buffer().clone(),
+    };
+
+    let mut buffer = buffer;
+    try_map_with_mask_in_place(
+        ReinterpretSink::<F, T>::new(buffer.as_mut_slice()),
+        &bit_buffer,
+        |f_val: F, valid| -> Option<T> {
+            <T as NumCast>::from(f_val).or_else(|| (!valid).then(T::zero))
+        },
+    )
+    .map_err(|_| overflow())?;
+
+    // SAFETY: equal size + alignment were asserted at function entry. Every F-slot
+    // now holds a valid T-bit pattern because `ReinterpretSink::set_unchecked` wrote a
+    // real `T` at every visited lane.
+    let result: BufferMut<T> = unsafe { buffer.transmute::<T>() };
+    Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array())
+}
+
 fn reinterpret(
     array: ArrayView<'_, Primitive>,
     new_ptype: PType,
diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs
index 144f83a429a..f8d028eb7b7 100644
--- a/vortex-buffer/src/lane_ops_indexed.rs
+++ b/vortex-buffer/src/lane_ops_indexed.rs
@@ -18,7 +18,10 @@
 
 #![allow(clippy::many_single_char_names)]
 
+use std::marker::PhantomData;
 use std::mem::MaybeUninit;
+use std::mem::align_of;
+use std::mem::size_of;
 
 use crate::BitBuffer;
 
@@ -75,18 +78,29 @@ impl<T: Copy> IndexedSource for &mut [T] {
 /// An [`IndexedSource`] that also supports unchecked indexed writes — the binding
 /// for in-place kernels.
 ///
+/// `Write` is the type written by `set_unchecked` and may differ from
+/// `IndexedSource::Item` (the read type). For the canonical `&mut [T]` impl
+/// both are `T`. The decoupling is what makes [`ReinterpretSink`] possible —
+/// a wrapper that reads `F` and writes `T` over the same backing memory when
+/// the two have identical size and alignment.
+///
 /// Implemented for `&mut [T]`; not implemented for [`LaneZip`] (you can't write a
 /// `(A, B)` pair back to two separate sources via a single index).
 pub trait IndexedSink: IndexedSource {
+    /// The per-lane write type. Equal to `<Self as IndexedSource>::Item` for
+    /// `&mut [T]`; different for [`ReinterpretSink`].
+    type Write: Copy;
+
     /// Write `value` into lane `i` without bounds checking.
     ///
     /// # Safety
     ///
     /// `i` must be strictly less than `self.len()`.
-    unsafe fn set_unchecked(&mut self, i: usize, value: Self::Item);
+    unsafe fn set_unchecked(&mut self, i: usize, value: Self::Write);
 }
 
 impl<T: Copy> IndexedSink for &mut [T] {
+    type Write = T;
     #[inline]
     unsafe fn set_unchecked(&mut self, i: usize, value: T) {
         // SAFETY: caller guarantees i < self.len().
@@ -94,6 +108,76 @@ impl<T: Copy> IndexedSink for &mut [T] {
     }
 }
 
+/// A sink that reads `F`-values and writes `T`-values over the same backing
+/// slice of `F`, reinterpreting each `T` as `F`-bits on write.
+///
+/// Requires `size_of::<F>() == size_of::<T>()` and `align_of::<F>() == align_of::<T>()`.
+/// Both hold for equal-width `NativePType` primitives (e.g. `u32` ↔ `f32`, `u64` ↔ `i64`).
+/// NOTE(review): `new` does not check that every `T` bit pattern is a valid `F`; that
+/// looks required for types like `bool`/`char`, since the slots stay typed `F` — confirm.
+///
+/// Use this when an in-place kernel needs to convert lanes between two types of identical
+/// width without allocating a second buffer. After the kernel completes every slot holds
+/// a valid `T`-bit pattern; recover a typed view via `BufferMut::transmute::<T>()`.
+pub struct ReinterpretSink<'a, F, T> {
+    slice: &'a mut [F],
+    _phantom: PhantomData<T>,
+}
+
+impl<'a, F, T> ReinterpretSink<'a, F, T> {
+    /// Construct a `ReinterpretSink` from `&mut [F]`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `size_of::<F>() != size_of::<T>()` or
+    /// `align_of::<F>() != align_of::<T>()`.
+    pub fn new(slice: &'a mut [F]) -> Self {
+        assert_eq!(
+            size_of::<F>(),
+            size_of::<T>(),
+            "ReinterpretSink requires F and T to have the same size",
+        );
+        assert_eq!(
+            align_of::<F>(),
+            align_of::<T>(),
+            "ReinterpretSink requires F and T to have the same alignment",
+        );
+        Self {
+            slice,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<F: Copy, T: Copy> IndexedSource for ReinterpretSink<'_, F, T> {
+    type Item = F;
+    #[inline]
+    fn len(&self) -> usize {
+        self.slice.len()
+    }
+    #[inline]
+    unsafe fn get_unchecked(&self, i: usize) -> F {
+        // SAFETY: caller guarantees i < self.slice.len(). Pointer arithmetic
+        // avoids method-resolution ambiguity between `<[F]>::get_unchecked` and
+        // `IndexedSource::get_unchecked`.
+        unsafe { *self.slice.as_ptr().add(i) }
+    }
+}
+
+impl<F: Copy, T: Copy> IndexedSink for ReinterpretSink<'_, F, T> {
+    type Write = T;
+    #[inline]
+    unsafe fn set_unchecked(&mut self, i: usize, value: T) {
+        // SAFETY: caller guarantees i < self.slice.len(); `new` enforces
+        // size_of::<F>() == size_of::<T>() and align_of::<F>() == align_of::<T>(),
+        // so the F-slot can hold a `T` without overflow or misalignment.
+        unsafe {
+            let ptr = self.slice.as_mut_ptr().add(i) as *mut T;
+            ptr.write(value);
+        }
+    }
+}
+
 /// Pair of two [`IndexedSource`]s of equal length. Yields `(A::Item, B::Item)` per lane.
 ///
 /// Use this to drive a binary kernel from two columns. Length equality is enforced
@@ -183,8 +267,12 @@ where
     }
     if remainder != 0 {
         chunk(
-            &values, out, &mut f,
-            chunks.remainder_bits(), chunks_count * 64, remainder,
+            &values,
+            out,
+            &mut f,
+            chunks.remainder_bits(),
+            chunks_count * 64,
+            remainder,
         );
     }
 }
@@ -278,13 +366,17 @@ where
             return Err(idx);
         }
     }
-    if remainder != 0 {
-        if let Some(idx) = chunk(
-            &values, out, &mut f,
-            chunks.remainder_bits(), chunks_count * 64, remainder,
-        ) {
-            return Err(idx);
-        }
+    if remainder != 0
+        && let Some(idx) = chunk(
+            &values,
+            out,
+            &mut f,
+            chunks.remainder_bits(),
+            chunks_count * 64,
+            remainder,
+        )
+    {
+        return Err(idx);
     }
     Ok(())
 }
@@ -309,13 +401,8 @@ where
     F: FnMut(S::Item) -> R,
 {
     #[inline(always)]
-    fn chunk<S, R, F>(
-        values: &S,
-        out: &mut [MaybeUninit<R>],
-        f: &mut F,
-        base: usize,
-        count: usize,
-    ) where
+    fn chunk<S, R, F>(values: &S, out: &mut [MaybeUninit<R>], f: &mut F, base: usize, count: usize)
+    where
         S: IndexedSource,
         F: FnMut(S::Item) -> R,
     {
@@ -460,9 +547,120 @@ where
     cold_scan(values, base, chunk_len, |_bit_idx, v| f(v).is_none())
 }
 
+/// In-place variant of [`map_no_validity`]. Each lane is replaced with `f(values[i])`.
+/// The source `S` must be writable (an [`IndexedSink`]).
+///
+/// The closure reads `S::Item` and returns `S::Write`. For the common case
+/// `S = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and write
+/// types can differ (e.g. read `f32`, write `u32`) over the same backing memory
+/// when sizes and alignments match.
+///
+/// As with [`map_no_validity`], use this only when the input is known
+/// non-nullable.
+#[inline]
+pub fn map_no_validity_in_place<S, F>(mut values: S, mut f: F)
+where
+    S: IndexedSink,
+    F: FnMut(S::Item) -> S::Write,
+{
+    #[inline(always)]
+    fn chunk<S, F>(values: &mut S, f: &mut F, base: usize, count: usize)
+    where
+        S: IndexedSink,
+        F: FnMut(S::Item) -> S::Write,
+    {
+        for bit_idx in 0..count {
+            let i = base + bit_idx;
+            // SAFETY: caller guarantees base + count <= len.
+            let v = unsafe { values.get_unchecked(i) };
+            let r = f(v);
+            // SAFETY: caller guarantees base + count <= len.
+            unsafe { values.set_unchecked(i, r) };
+        }
+    }
+
+    let len = values.len();
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for chunk_idx in 0..chunks_count {
+        chunk(&mut values, &mut f, chunk_idx * 64, 64);
+    }
+    if remainder != 0 {
+        chunk(&mut values, &mut f, chunks_count * 64, remainder);
+    }
+}
+
+/// In-place variant of [`try_map_no_validity`]. Each lane is replaced with
+/// `f(values[i])`, or `S::Write::default()` when `f` returns `None`. On failure
+/// returns `Err(first_failing_lane)`; the buffer state on `Err` is unspecified.
+///
+/// As with [`try_map_no_validity`], use this only when the input is known
+/// non-nullable — a `None` from `f` is treated as a failure regardless of any
+/// upstream validity bitmap.
+///
+/// ## Error attribution
+///
+/// Per-lane `is_none()` flags are folded branchlessly via `min` over the
+/// chunk-relative lane offset (always `< 64`), so the reported index stays exact
+/// for any `usize` length. Cold replay isn't viable here because the original
+/// input values have already been overwritten by the time we'd attribute it.
+#[inline]
+#[allow(clippy::cast_possible_truncation)]
+pub fn try_map_no_validity_in_place<S, F>(mut values: S, mut f: F) -> Result<(), usize>
+where
+    S: IndexedSink,
+    S::Write: Default,
+    F: FnMut(S::Item) -> Option<S::Write>,
+{
+    #[inline(always)]
+    #[allow(clippy::cast_possible_truncation)]
+    fn chunk<S, F>(values: &mut S, base: usize, count: usize, f: &mut F) -> Option<usize>
+    where
+        S: IndexedSink,
+        S::Write: Default,
+        F: FnMut(S::Item) -> Option<S::Write>,
+    {
+        let mut first_fail: u32 = u32::MAX;
+        for bit_idx in 0..count {
+            let i = base + bit_idx;
+            // SAFETY: caller guarantees base + count <= len.
+            let v = unsafe { values.get_unchecked(i) };
+            let opt = f(v);
+            let candidate = if opt.is_none() { bit_idx as u32 } else { u32::MAX };
+            first_fail = first_fail.min(candidate);
+            let r = opt.unwrap_or_default();
+            // SAFETY: caller guarantees base + count <= len.
+            unsafe { values.set_unchecked(i, r) };
+        }
+        (first_fail != u32::MAX).then(|| base + first_fail as usize)
+    }
+
+    let len = values.len();
+    let chunks_count = len / 64;
+    let remainder = len % 64;
+
+    for chunk_idx in 0..chunks_count {
+        if let Some(failing) = chunk(&mut values, chunk_idx * 64, 64, &mut f) {
+            return Err(failing);
+        }
+    }
+    if remainder != 0
+        && let Some(failing) = chunk(&mut values, chunks_count * 64, remainder, &mut f)
+    {
+        return Err(failing);
+    }
+    Ok(())
+}
+
 /// In-place variant of [`map_with_mask`]. Each lane is replaced with
 /// `f(values[i], mask[i])`. The source `S` must be writable (an [`IndexedSink`]).
 ///
+/// The closure reads `S::Item` and returns `S::Write`. For the common case
+/// `S = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and write
+/// types can differ (e.g. read `f32`, write `u32`) over the same backing
+/// memory when sizes and alignments match.
+///
 /// # Panics
 ///
 /// Panics if `values.len() != mask.len()`.
@@ -470,18 +668,13 @@ where
 pub fn map_with_mask_in_place<S, F>(mut values: S, mask: &BitBuffer, mut f: F)
 where
     S: IndexedSink,
-    F: FnMut(S::Item, bool) -> S::Item,
+    F: FnMut(S::Item, bool) -> S::Write,
 {
     #[inline(always)]
-    fn chunk<S, F>(
-        values: &mut S,
-        f: &mut F,
-        src_chunk: u64,
-        base: usize,
-        count: usize,
-    ) where
+    fn chunk<S, F>(values: &mut S, f: &mut F, src_chunk: u64, base: usize, count: usize)
+    where
         S: IndexedSink,
-        F: FnMut(S::Item, bool) -> S::Item,
+        F: FnMut(S::Item, bool) -> S::Write,
     {
         for bit_idx in 0..count {
             let i = base + bit_idx;
@@ -505,8 +698,11 @@ where
     }
     if remainder != 0 {
         chunk(
-            &mut values, &mut f,
-            chunks.remainder_bits(), chunks_count * 64, remainder,
+            &mut values,
+            &mut f,
+            chunks.remainder_bits(),
+            chunks_count * 64,
+            remainder,
         );
     }
 }
@@ -547,8 +743,8 @@ pub fn try_map_with_mask_in_place<S, F>(
 ) -> Result<(), usize>
 where
     S: IndexedSink,
-    S::Item: Default,
-    F: FnMut(S::Item, bool) -> Option<S::Item>,
+    S::Write: Default,
+    F: FnMut(S::Item, bool) -> Option<S::Write>,
 {
     /// Returns `Some(first_failing_lane_index_as_u32)` if any lane in
     /// `[base, base+count)` failed (cast width-truncated since `i < 2^32` in any
@@ -565,8 +761,8 @@ where
     ) -> Option<u32>
     where
         S: IndexedSink,
-        S::Item: Default,
-        F: FnMut(S::Item, bool) -> Option<S::Item>,
+        S::Write: Default,
+        F: FnMut(S::Item, bool) -> Option<S::Write>,
     {
         let mut first_fail: u32 = u32::MAX;
         for bit_idx in 0..count {
@@ -595,137 +791,18 @@ where
             return Err(failing as usize);
         }
     }
-    if remainder != 0 {
-        if let Some(failing) = chunk(
+    if remainder != 0
+        && let Some(failing) = chunk(
             &mut values,
-            chunks.remainder_bits(), chunks_count * 64, remainder,
+            chunks.remainder_bits(),
+            chunks_count * 64,
+            remainder,
             &mut f,
-        ) {
-            return Err(failing as usize);
-        }
-    }
-    Ok(())
-}
-
-/// Apply `f(value) -> bool` lane-by-lane, packing into `out` as `u64` words.
-///
-/// This is the validity-free sibling of [`map_with_mask_to_bits`]. Use it when the
-/// predicate is a pure function of the value (e.g. compare-to-constant on a primitive
-/// buffer) and combine the validity bitmap in a separate pass — splitting the work
-/// this way lets the value-compare loop autovectorize cleanly.
-///
-/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word
-/// beyond `len % 64` are written as `0`.
-///
-/// # Panics
-///
-/// Panics if `out.len() != values.len().div_ceil(64)`.
-#[inline]
-pub fn map_to_bits<S, F>(values: S, out: &mut [u64], mut f: F)
-where
-    S: IndexedSource,
-    F: FnMut(S::Item) -> bool,
-{
-    #[inline(always)]
-    fn chunk<S, F>(values: &S, f: &mut F, base: usize, count: usize) -> u64
-    where
-        S: IndexedSource,
-        F: FnMut(S::Item) -> bool,
+        )
     {
-        let mut packed = 0u64;
-        for bit_idx in 0..count {
-            let i = base + bit_idx;
-            // SAFETY: caller guarantees base + count <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            packed |= (f(v) as u64) << bit_idx;
-        }
-        packed
-    }
-
-    let len = values.len();
-    assert_eq!(
-        out.len(),
-        len.div_ceil(64),
-        "out must have len.div_ceil(64) words",
-    );
-
-    let chunks_count = len / 64;
-    let remainder = len % 64;
-
-    for chunk_idx in 0..chunks_count {
-        let packed = chunk(&values, &mut f, chunk_idx * 64, 64);
-        // SAFETY: chunk_idx < chunks_count <= out.len().
-        unsafe { *out.get_unchecked_mut(chunk_idx) = packed };
-    }
-    if remainder != 0 {
-        let packed = chunk(&values, &mut f, chunks_count * 64, remainder);
-        // SAFETY: chunks_count < out.len() because remainder != 0.
-        unsafe { *out.get_unchecked_mut(chunks_count) = packed };
-    }
-}
-
-/// Apply `f(value, valid) -> bool` lane-by-lane, packing into `out` as `u64` words.
-///
-/// `out.len()` must equal `values.len().div_ceil(64)`. Trailing bits in the final word
-/// beyond `len % 64` are written as `0`.
-///
-/// # Panics
-///
-/// Panics if `values.len() != mask.len()` or `out.len() != values.len().div_ceil(64)`.
-#[inline]
-pub fn map_with_mask_to_bits<S, F>(values: S, mask: &BitBuffer, out: &mut [u64], mut f: F)
-where
-    S: IndexedSource,
-    F: FnMut(S::Item, bool) -> bool,
-{
-    #[inline(always)]
-    fn chunk<S, F>(
-        values: &S,
-        f: &mut F,
-        src_chunk: u64,
-        base: usize,
-        count: usize,
-    ) -> u64
-    where
-        S: IndexedSource,
-        F: FnMut(S::Item, bool) -> bool,
-    {
-        let mut packed = 0u64;
-        for bit_idx in 0..count {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: caller guarantees base + count <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            packed |= (f(v, bit) as u64) << bit_idx;
-        }
-        packed
-    }
-
-    let len = values.len();
-    assert_eq!(len, mask.len(), "values and mask must have the same length");
-    assert_eq!(
-        out.len(),
-        len.div_ceil(64),
-        "out must have len.div_ceil(64) words",
-    );
-
-    let chunks = mask.chunks();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
-
-    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        let packed = chunk(&values, &mut f, src_chunk, chunk_idx * 64, 64);
-        // SAFETY: chunk_idx < chunks_count <= out.len().
-        unsafe { *out.get_unchecked_mut(chunk_idx) = packed };
-    }
-    if remainder != 0 {
-        let packed = chunk(
-            &values, &mut f,
-            chunks.remainder_bits(), chunks_count * 64, remainder,
-        );
-        // SAFETY: chunks_count < out.len() because remainder != 0.
-        unsafe { *out.get_unchecked_mut(chunks_count) = packed };
+        return Err(failing as usize);
     }
+    Ok(())
 }
 
 #[cfg(test)]
@@ -842,58 +919,6 @@ mod tests {
         }
     }
 
-    #[test]
-    fn map_with_mask_to_bits_aligned() {
-        let values: Vec<i32> = (0..128).collect();
-        let mask = BitBuffer::new_set(128);
-        let mut out = vec![0u64; 2];
-        map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| {
-            valid && v % 2 == 0
-        });
-        // Even numbers in [0, 128) set, odd unset.
-        for word_idx in 0..2 {
-            let word = out[word_idx];
-            for bit in 0..64 {
-                let i = word_idx * 64 + bit;
-                let expected = i % 2 == 0;
-                assert_eq!((word >> bit) & 1 == 1, expected, "lane {i}");
-            }
-        }
-    }
-
-    #[test]
-    fn map_with_mask_to_bits_partial_chunk() {
-        // 130 lanes — three u64 words, last word has only 2 valid bits.
-        let values: Vec<i32> = (0..130).collect();
-        let mask = BitBuffer::new_set(130);
-        let mut out = vec![0u64; 130usize.div_ceil(64)];
-        assert_eq!(out.len(), 3);
-        map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| {
-            valid && v >= 64
-        });
-        // Bits 64..128 set in word 1; bits 128..130 set in word 2.
-        assert_eq!(out[0], 0);
-        assert_eq!(out[1], u64::MAX);
-        assert_eq!(out[2], 0b11);
-    }
-
-    #[test]
-    fn map_with_mask_to_bits_offset() {
-        let big = BitBuffer::new_set(256);
-        let sliced = big.slice(13..143); // offset=13, len=130
-        assert_eq!(sliced.len(), 130);
-        let values: Vec<u8> = (0..130).map(|i| (i % 4) as u8).collect();
-        let mut out = vec![0u64; 130usize.div_ceil(64)];
-        map_with_mask_to_bits(values.as_slice(), &sliced, &mut out, |v, valid| {
-            valid && v == 0
-        });
-        for i in 0..130 {
-            let word = out[i / 64];
-            let bit = (word >> (i % 64)) & 1 == 1;
-            assert_eq!(bit, i % 4 == 0, "lane {i}");
-        }
-    }
-
     #[test]
     fn try_map_with_mask_all_ok() {
         let values: Vec<u64> = (0..200).collect();
@@ -1242,82 +1267,54 @@ mod tests {
     }
 
     #[test]
-    fn try_map_with_mask_in_place_partial_chunk_success() {
-        let mut values: Vec<u32> = (0..130).collect();
+    fn reinterpret_sink_same_width_f32_u32() {
+        // Read f32, write u32-bits in place. After transmuting the slice back to u32 we
+        // should see exactly the bit patterns the closure produced.
+        let mut buf: Vec<f32> = (0..130).map(|i| i as f32).collect();
         let mask = BitBuffer::new_set(130);
-        let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| Some(v + 1));
-        assert!(res.is_ok());
-        assert_eq!(values[0], 1);
-        assert_eq!(values[63], 64);
-        assert_eq!(values[64], 65);
-        assert_eq!(values[129], 130);
-    }
-
-    #[test]
-    fn map_to_bits_aligned() {
-        let values: Vec<i32> = (0..128).collect();
-        let mut out = vec![0u64; 2];
-        map_to_bits(values.as_slice(), &mut out, |v| v % 2 == 0);
-        for word_idx in 0..2 {
-            for bit in 0..64 {
-                let i = word_idx * 64 + bit;
-                let expected = i % 2 == 0;
-                assert_eq!((out[word_idx] >> bit) & 1 == 1, expected, "lane {i}");
-            }
+        try_map_with_mask_in_place(
+            ReinterpretSink::<f32, u32>::new(buf.as_mut_slice()),
+            &mask,
+            |f, _valid| Some(f.to_bits().wrapping_add(1)),
+        )
+        .unwrap();
+        // SAFETY: same size + alignment for f32 and u32; every slot now holds a u32 written by
+        // the closure.
+        let as_u32: &[u32] =
+            unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const u32, buf.len()) };
+        for (i, &got) in as_u32.iter().enumerate() {
+            assert_eq!(got, (i as f32).to_bits().wrapping_add(1), "lane {i}");
         }
     }
 
     #[test]
-    fn map_to_bits_partial_chunk() {
-        let values: Vec<i32> = (0..130).collect();
-        let mut out = vec![0u64; 130usize.div_ceil(64)];
-        assert_eq!(out.len(), 3);
-        map_to_bits(values.as_slice(), &mut out, |v| v >= 64);
-        assert_eq!(out[0], 0);
-        assert_eq!(out[1], u64::MAX);
-        assert_eq!(out[2], 0b11);
-    }
-
-    #[test]
-    fn map_to_bits_empty() {
-        let values: Vec<i32> = vec![];
-        let mut out: Vec<u64> = vec![];
-        map_to_bits(values.as_slice(), &mut out, |v| v > 0);
-    }
-
-    #[test]
-    fn map_to_bits_matches_fused_with_all_valid_mask() {
-        // map_to_bits + AND with an all-true mask must equal map_with_mask_to_bits.
-        let values: Vec<i64> = (0..200).map(|i| i % 7).collect();
+    fn reinterpret_sink_failure_reports_lane() {
+        // Closure fails at a specific lane; the kernel must report that lane index.
+        let mut buf: Vec<f32> = (0..200).map(|i| i as f32).collect();
         let mask = BitBuffer::new_set(200);
-
-        let mut a = vec![0u64; 200usize.div_ceil(64)];
-        map_with_mask_to_bits(values.as_slice(), &mask, &mut a, |v, valid| valid && v == 3);
-
-        let mut b = vec![0u64; 200usize.div_ceil(64)];
-        map_to_bits(values.as_slice(), &mut b, |v| v == 3);
-
-        assert_eq!(a, b);
+        let res = try_map_with_mask_in_place(
+            ReinterpretSink::<f32, u32>::new(buf.as_mut_slice()),
+            &mask,
+            |f, _valid| {
+                if f as u32 == 137 {
+                    None
+                } else {
+                    Some(f as u32)
+                }
+            },
+        );
+        assert_eq!(res, Err(137));
     }
 
     #[test]
-    fn map_with_mask_to_bits_validity_kills_lane() {
-        // Even if predicate is true, null lanes should produce false.
-        let values: Vec<i32> = vec![1; 70];
-        let mask = {
-            let mut m = BitBufferMut::with_capacity(70);
-            for i in 0..70 {
-                m.append(i >= 32); // first 32 lanes are null
-            }
-            m.freeze()
-        };
-        let mut out = vec![0u64; 70usize.div_ceil(64)];
-        map_with_mask_to_bits(values.as_slice(), &mask, &mut out, |v, valid| {
-            valid && v == 1
-        });
-        for i in 0..70 {
-            let bit = (out[i / 64] >> (i % 64)) & 1 == 1;
-            assert_eq!(bit, i >= 32, "lane {i}");
-        }
+    fn try_map_with_mask_in_place_partial_chunk_success() {
+        let mut values: Vec<u32> = (0..130).collect();
+        let mask = BitBuffer::new_set(130);
+        let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| Some(v + 1));
+        assert!(res.is_ok());
+        assert_eq!(values[0], 1);
+        assert_eq!(values[63], 64);
+        assert_eq!(values[64], 65);
+        assert_eq!(values[129], 130);
     }
 }

From 6fd7fc1212305994fe4879d7063c8e7f24e6b0c2 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 14:59:01 +0100
Subject: [PATCH 09/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-buffer/benches/cast_to_indexed.rs | 37 ------------------------
 1 file changed, 37 deletions(-)

diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs
index 848f50cd142..2751cdc8418 100644
--- a/vortex-buffer/benches/cast_to_indexed.rs
+++ b/vortex-buffer/benches/cast_to_indexed.rs
@@ -24,10 +24,8 @@ use vortex_buffer::BitBuffer;
 use vortex_buffer::BitBufferMut;
 use vortex_buffer::Buffer;
 use vortex_buffer::lane_ops_indexed::map_no_validity;
-use vortex_buffer::lane_ops_indexed::map_to_bits;
 use vortex_buffer::lane_ops_indexed::map_with_mask;
 use vortex_buffer::lane_ops_indexed::map_with_mask_in_place;
-use vortex_buffer::lane_ops_indexed::map_with_mask_to_bits;
 use vortex_buffer::lane_ops_indexed::try_map_no_validity;
 use vortex_buffer::lane_ops_indexed::try_map_with_mask;
 use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place;
@@ -37,7 +35,6 @@ fn main() {
 }
 
 const SIZES: &[usize] = &[4_096, 65_536, 1_048_576];
-const U32_THRESHOLD: u32 = u32::MAX / 2;
 
 struct Fixture {
     values_u64: Buffer<u64>,
@@ -284,40 +281,6 @@ fn try_map_with_mask_in_place_u32_checked_mul(bencher: Bencher, n: usize) {
         });
 }
 
-#[divan::bench(args = SIZES)]
-fn map_to_bits_u32_threshold(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-
-    bencher
-        .with_inputs(|| (f.values_u32.clone(), vec![0; n.div_ceil(64)]))
-        .bench_values(|(values, mut out)| {
-            map_to_bits(values.as_slice(), out.as_mut_slice(), |v| {
-                v >= U32_THRESHOLD
-            });
-            out
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn map_with_mask_to_bits_u32_threshold(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-
-    bencher
-        .with_inputs(|| {
-            (
-                f.values_u32.clone(),
-                f.mask.clone(),
-                vec![0; n.div_ceil(64)],
-            )
-        })
-        .bench_values(|(values, mask, mut out)| {
-            map_with_mask_to_bits(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| {
-                valid && v >= U32_THRESHOLD
-            });
-            out
-        });
-}
-
 // -----------------------------------------------------------------------------
 // Arrow-rs baselines. Two: one widening (u16 → u32, always succeeds) and one
 // narrowing (u64 → u32, can fail). Each pairs with the cast variants above of

From 72bca8b91ec4ad931fbf5d05755d9bf35435a146 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 15:16:53 +0100
Subject: [PATCH 10/21] Inline same-width in-place cast; delete pack_vs_unpack bench

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/arrays/primitive/compute/cast.rs      | 195 ++++-----
 vortex-buffer/benches/add_checked.rs          |   9 +-
 vortex-buffer/benches/pack_vs_unpack.rs       | 389 ------------------
 3 files changed, 90 insertions(+), 503 deletions(-)
 delete mode 100644 vortex-buffer/benches/pack_vs_unpack.rs

diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs
index ad0d1c8e399..8242b5845bd 100644
--- a/vortex-array/src/arrays/primitive/compute/cast.rs
+++ b/vortex-array/src/arrays/primitive/compute/cast.rs
@@ -1,17 +1,15 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-use std::mem::align_of;
-use std::mem::size_of;
-
 use num_traits::AsPrimitive;
 use num_traits::NumCast;
-use vortex_buffer::BitBuffer;
 use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
 use vortex_buffer::lane_ops_indexed::ReinterpretSink;
 use vortex_buffer::lane_ops_indexed::map_no_validity;
+use vortex_buffer::lane_ops_indexed::map_no_validity_in_place;
 use vortex_buffer::lane_ops_indexed::try_map_no_validity;
+use vortex_buffer::lane_ops_indexed::try_map_no_validity_in_place;
 use vortex_buffer::lane_ops_indexed::try_map_with_mask;
 use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place;
 use vortex_error::VortexResult;
@@ -132,7 +130,6 @@ where
     F: NativePType + AsPrimitive<T>,
     T: NativePType,
 {
-    let values = array.as_slice::<F>();
     let overflow = || {
         vortex_err!(
             Compute: "Cannot cast {} to {} — value exceeds target range",
@@ -156,65 +153,105 @@ where
     let infallible = casts_losslessly_to(F::PTYPE, T::PTYPE)
         || cached_values_fit_in(array, &target_dtype) == Some(true);
 
-    // Same-bit-width in-place fast path: when F and T have the same byte width and the
-    // buffer is uniquely owned, mutate in place and transmute the wrapper. Saves the
-    // output allocation. Falls through to the out-of-place path when the buffer is shared
-    // (the common case under the current borrow-based kernel API).
+    let len = array.len();
+
+    // Same-bit-width in-place fast path: when F and T have the same byte width, try to take
+    // unique ownership of the buffer. If successful, each kernel call site below mutates in
+    // place via `ReinterpretSink` and transmutes the wrapper at the end, saving the output
+    // allocation. Falls back to the out-of-place path (borrowed slice + fresh buffer) when
+    // the buffer is shared — the common case under the current borrow-based kernel API.
     let same_bit_width = F::PTYPE.byte_width() == T::PTYPE.byte_width();
-    if same_bit_width
-        && let Ok(buffer_mut) = array.into_owned().try_into_buffer_mut::<F>()
-    {
-        return cast_buffer_in_place::<F, T>(buffer_mut, array, new_validity, ctx, infallible);
-    }
+    let owned: Option<BufferMut<F>> = if same_bit_width {
+        array.into_owned().try_into_buffer_mut::<F>().ok()
+    } else {
+        None
+    };
+    let values: &[F] = array.as_slice::<F>();
 
     if infallible {
-        let mut buffer = BufferMut::<T>::with_capacity(values.len());
-        // Truncating `as`-cast — safe here because stats prove every valid value fits.
-        // Null lanes' underlying garbage gets truncated/wrapped (harmless: the result
-        // validity bitmap masks them downstream).
-        map_no_validity(
-            values,
-            &mut buffer.spare_capacity_mut()[..values.len()],
-            |v| v.as_(),
-        );
-        // SAFETY: map_no_validity initializes every lane.
-        unsafe { buffer.set_len(values.len()) };
-        return Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array());
+        // Truncating `as`-cast — safe here because static type analysis or cached stats prove
+        // every valid value fits. Null lanes' underlying garbage gets truncated/wrapped
+        // (harmless: the result validity bitmap masks them downstream).
+        return match owned {
+            Some(mut buf) => {
+                map_no_validity_in_place(
+                    ReinterpretSink::<F, T>::new(buf.as_mut_slice()),
+                    |v: F| v.as_(),
+                );
+                // SAFETY: same size + alignment for NativePType same-byte-width pairs;
+                // every F-slot was overwritten with a real `T` bit pattern.
+                let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
+                Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array())
+            }
+            None => {
+                let mut buffer = BufferMut::<T>::with_capacity(len);
+                map_no_validity(values, &mut buffer.spare_capacity_mut()[..len], |v| v.as_());
+                // SAFETY: map_no_validity initializes every lane.
+                unsafe { buffer.set_len(len) };
+                Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array())
+            }
+        };
     }
 
-    let mask = array.validity()?.execute_mask(array.len(), ctx)?;
+    let mask = array.validity()?.execute_mask(len, ctx)?;
 
-    let buffer: Buffer<T> = match &mask {
-        Mask::AllTrue(_) => {
-            let mut buffer = BufferMut::<T>::with_capacity(values.len());
-            try_map_no_validity(
-                values,
-                &mut buffer.spare_capacity_mut()[..values.len()],
-                |v| <T as NumCast>::from(v),
+    let buffer: Buffer<T> = match (&mask, owned) {
+        (Mask::AllTrue(_), Some(mut buf)) => {
+            try_map_no_validity_in_place(
+                ReinterpretSink::<F, T>::new(buf.as_mut_slice()),
+                |v: F| <T as NumCast>::from(v),
             )
             .map_err(|_| overflow())?;
+            // SAFETY: same size + alignment for NativePType same-byte-width pairs;
+            // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`.
+            let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
+            result.freeze()
+        }
+        (Mask::AllTrue(_), None) => {
+            let mut buffer = BufferMut::<T>::with_capacity(len);
+            try_map_no_validity(values, &mut buffer.spare_capacity_mut()[..len], |v| {
+                <T as NumCast>::from(v)
+            })
+            .map_err(|_| overflow())?;
             // SAFETY: try_map_no_validity returned Ok, so it initialized every lane.
-            unsafe { buffer.set_len(values.len()) };
+            unsafe { buffer.set_len(len) };
             buffer.freeze()
         }
-        Mask::AllFalse(_) => BufferMut::<T>::zeroed(values.len()).freeze(),
-        Mask::Values(m) => {
-            let mut buffer = BufferMut::<T>::with_capacity(values.len());
+        (Mask::AllFalse(_), Some(buf)) => {
+            // SAFETY: same size + alignment by NativePType same-byte-width invariant.
+            let mut t_buf: BufferMut<T> = unsafe { buf.transmute::<T>() };
+            t_buf.as_mut_slice().fill(T::zero());
+            t_buf.freeze()
+        }
+        (Mask::AllFalse(_), None) => BufferMut::<T>::zeroed(len).freeze(),
+        (Mask::Values(m), Some(mut buf)) => {
+            try_map_with_mask_in_place(
+                ReinterpretSink::<F, T>::new(buf.as_mut_slice()),
+                m.bit_buffer(),
+                |v: F, valid| <T as NumCast>::from(v).or_else(|| (!valid).then(T::zero)),
+            )
+            .map_err(|_| overflow())?;
+            // SAFETY: same size + alignment for NativePType same-byte-width pairs;
+            // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`.
+            let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
+            result.freeze()
+        }
+        (Mask::Values(m), None) => {
+            let mut buffer = BufferMut::<T>::with_capacity(len);
             try_map_with_mask(
                 values,
                 m.bit_buffer(),
-                &mut buffer.spare_capacity_mut()[..values.len()],
-                // Lazy validity: only consult `valid` on the failure branch. For
-                // widening / statically-infallible casts, `NumCast::from` is always
-                // `Some` so the `or_else` is provably dead — LLVM DCEs the validity
-                // path entirely, giving the same codegen as the maskless kernel.
-                // For narrowing, `valid` is only read at lanes that actually
-                // overflowed (a cold check on top of the cast).
+                &mut buffer.spare_capacity_mut()[..len],
+                // Lazy validity: only consult `valid` on the failure branch. For widening /
+                // statically-infallible casts, `NumCast::from` is always `Some` so the
+                // `or_else` is provably dead — LLVM DCEs the validity path entirely, giving
+                // the same codegen as the maskless kernel. For narrowing, `valid` is only
+                // read at lanes that actually overflowed (a cold check on top of the cast).
                 |v, valid| <T as NumCast>::from(v).or_else(|| (!valid).then(T::zero)),
             )
             .map_err(|_| overflow())?;
             // SAFETY: try_map_with_mask returned Ok, so it initialized every lane.
-            unsafe { buffer.set_len(values.len()) };
+            unsafe { buffer.set_len(len) };
             buffer.freeze()
         }
     };
@@ -222,72 +259,6 @@ where
     Ok(PrimitiveArray::new(buffer, new_validity).into_array())
 }
 
-/// In-place cast of an owned `BufferMut<F>` to `BufferMut<T>` when `F` and `T` have the
-/// same byte width. Each slot is read as `F`, converted, and written back as `T`-bits
-/// using `BufferMut`'s transmute family. Avoids allocating a second output buffer.
-///
-/// The caller has already verified `F::PTYPE.byte_width() == T::PTYPE.byte_width()`.
-fn cast_buffer_in_place<F, T>(
-    buffer: BufferMut<F>,
-    array: ArrayView<'_, Primitive>,
-    new_validity: Validity,
-    ctx: &mut ExecutionCtx,
-    infallible: bool,
-) -> VortexResult<ArrayRef>
-where
-    F: NativePType + AsPrimitive<T>,
-    T: NativePType,
-{
-    debug_assert_eq!(size_of::<F>(), size_of::<T>());
-    debug_assert_eq!(align_of::<F>(), align_of::<T>());
-
-    if infallible {
-        // `map_each_in_place` does the BufferMut<F> → BufferMut<T> transmute internally
-        // (same size + alignment for primitives of equal byte width) and walks each slot
-        // with the closure.
-        let result: BufferMut<T> = buffer.map_each_in_place(|v: F| v.as_());
-        return Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array());
-    }
-
-    let mask = array.validity()?.execute_mask(array.len(), ctx)?;
-    let overflow = || {
-        vortex_err!(
-            Compute: "Cannot cast {} to {} — value exceeds target range",
-            F::PTYPE, T::PTYPE,
-        )
-    };
-
-    // All-null short-circuit: zero out the buffer and skip the conversion loop entirely.
-    if matches!(mask, Mask::AllFalse(_)) {
-        // SAFETY: same size + alignment by NativePType same-byte-width invariant.
-        let mut t_buf: BufferMut<T> = unsafe { buffer.transmute::<T>() };
-        t_buf.as_mut_slice().fill(T::zero());
-        return Ok(PrimitiveArray::new(t_buf.freeze(), new_validity).into_array());
-    }
-
-    let bit_buffer = match &mask {
-        Mask::AllTrue(n) => BitBuffer::new_set(*n),
-        Mask::AllFalse(_) => unreachable!("handled above"),
-        Mask::Values(m) => m.bit_buffer().clone(),
-    };
-
-    let mut buffer = buffer;
-    try_map_with_mask_in_place(
-        ReinterpretSink::<F, T>::new(buffer.as_mut_slice()),
-        &bit_buffer,
-        |f_val: F, valid| -> Option<T> {
-            <T as NumCast>::from(f_val).or_else(|| (!valid).then(T::zero))
-        },
-    )
-    .map_err(|_| overflow())?;
-
-    // SAFETY: same size + alignment for NativePType same-byte-width pairs. Every F-slot
-    // now holds a valid T-bit pattern because `ReinterpretSink::set_unchecked` wrote a
-    // real `T` at every visited lane.
-    let result: BufferMut<T> = unsafe { buffer.transmute::<T>() };
-    Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array())
-}
-
 fn reinterpret(
     array: ArrayView<'_, Primitive>,
     new_ptype: PType,
diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs
index df857922d6f..ff4f5f64e9a 100644
--- a/vortex-buffer/benches/add_checked.rs
+++ b/vortex-buffer/benches/add_checked.rs
@@ -513,8 +513,13 @@ fn premask_then_simd(bencher: Bencher, n: usize) {
         .bench_refs(|(lhs, rhs, lm, rm)| {
             let combined = lm as &BitBuffer & rm as &BitBuffer;
             let mut out = alloc_out(n);
-            handrolled_premask(lhs.as_slice(), rhs.as_slice(), &combined, out.as_mut_slice())
-                .unwrap();
+            handrolled_premask(
+                lhs.as_slice(),
+                rhs.as_slice(),
+                &combined,
+                out.as_mut_slice(),
+            )
+            .unwrap();
             (combined, out)
         });
 }
diff --git a/vortex-buffer/benches/pack_vs_unpack.rs b/vortex-buffer/benches/pack_vs_unpack.rs
deleted file mode 100644
index 0ae41fb5573..00000000000
--- a/vortex-buffer/benches/pack_vs_unpack.rs
+++ /dev/null
@@ -1,389 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-//! Compare two strategies for handling validity in `try_map_with_mask`:
-//!
-//! 1. **Unpack the mask** — closure consults `valid` per-lane. Null lanes are
-//!    short-circuited inside the closure (return `Some(default)` immediately),
-//!    so the checked operation never runs with garbage. The kernel still does
-//!    its `fail_bits & src_chunk` post-filter, but it's a no-op because the
-//!    closure already produced `Some` at null lanes.
-//!
-//! 2. **Pack and filter** — closure ignores `_valid`. The checked operation
-//!    runs at every lane, including null lanes (where it may produce `None`
-//!    on garbage). The kernel's post-loop `fail_bits & src_chunk` filter
-//!    drops those null-lane fails. LLVM DCEs the per-lane mask extract since
-//!    the closure doesn't consult `valid`.
-//!
-//! Two ops × two strategies = four vortex benches, plus arrow baselines.
-//!
-//! - `widen_u16_u32_*` — statically-infallible widening cast. `NumCast::from`
-//!   always returns `Some`; LLVM proves it and strips fail-tracking entirely.
-//! - `checked_add_u32_*` — genuinely fallible: `u32 + u32` can overflow.
-
-#![expect(clippy::unwrap_used)]
-
-use std::mem::MaybeUninit;
-use std::sync::Arc;
-
-use arrow_arith::numeric::add;
-use arrow_array::Datum;
-use arrow_array::UInt16Array;
-use arrow_array::UInt32Array;
-use arrow_buffer::NullBuffer;
-use arrow_buffer::ScalarBuffer;
-use arrow_cast::CastOptions;
-use arrow_cast::cast_with_options;
-use arrow_schema::DataType;
-use divan::Bencher;
-use num_traits::NumCast;
-use rand::SeedableRng;
-use rand::prelude::*;
-use rand::rngs::StdRng;
-use vortex_buffer::BitBuffer;
-use vortex_buffer::BitBufferMut;
-use vortex_buffer::Buffer;
-use vortex_buffer::lane_ops_indexed::LaneZip;
-use vortex_buffer::lane_ops_indexed::try_map_with_mask;
-
-fn main() {
-    divan::main();
-}
-
-const SIZES: &[usize] = &[4_096, 65_536, 1_048_576];
-
-struct Fixture {
-    values_u16: Buffer<u16>,
-    lhs_u32: Buffer<u32>,
-    rhs_u32: Buffer<u32>,
-    mask: BitBuffer,
-    arrow_u16: UInt16Array,
-    arrow_lhs: Arc<UInt32Array>,
-    arrow_rhs: Arc<UInt32Array>,
-}
-
-fn fixture(n: usize) -> Fixture {
-    let mut rng = StdRng::seed_from_u64(0xC0DE_BEEF);
-    // Bounded so `u16 + u16` (as u32) and `u32 + u32` never overflow u32.
-    // Both strategies succeed; we measure success-path perf.
-    let raw_lhs: Vec<u32> = (0..n)
-        .map(|_| rng.random_range(0..(u32::MAX / 2)))
-        .collect();
-    let raw_rhs: Vec<u32> = (0..n)
-        .map(|_| rng.random_range(0..(u32::MAX / 2)))
-        .collect();
-    let raw_valid: Vec<bool> = (0..n).map(|_| rng.random_bool(0.8)).collect();
-
-    #[expect(clippy::cast_possible_truncation)]
-    let values_u16: Buffer<u16> = raw_lhs.iter().map(|&v| v as u16).collect();
-    let lhs_u32: Buffer<u32> = raw_lhs.iter().copied().collect();
-    let rhs_u32: Buffer<u32> = raw_rhs.iter().copied().collect();
-
-    let mask = {
-        let mut m = BitBufferMut::with_capacity(n);
-        for &v in &raw_valid {
-            m.append(v);
-        }
-        m.freeze()
-    };
-
-    #[expect(clippy::cast_possible_truncation)]
-    let arrow_u16 = UInt16Array::new(
-        ScalarBuffer::from(raw_lhs.iter().map(|&v| v as u16).collect::<Vec<u16>>()),
-        Some(NullBuffer::from(raw_valid.clone())),
-    );
-    let arrow_lhs = Arc::new(UInt32Array::new(
-        ScalarBuffer::from(raw_lhs),
-        Some(NullBuffer::from(raw_valid.clone())),
-    ));
-    let arrow_rhs = Arc::new(UInt32Array::new(
-        ScalarBuffer::from(raw_rhs),
-        Some(NullBuffer::from(raw_valid)),
-    ));
-
-    Fixture {
-        values_u16,
-        lhs_u32,
-        rhs_u32,
-        mask,
-        arrow_u16,
-        arrow_lhs,
-        arrow_rhs,
-    }
-}
-
-fn uninit_out<T>(n: usize) -> Vec<MaybeUninit<T>> {
-    let mut out = Vec::with_capacity(n);
-    // SAFETY: a `MaybeUninit<T>` does not require initialization.
-    unsafe { out.set_len(n) };
-    out
-}
-
-const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions {
-    safe: false,
-    format_options: arrow_cast::display::FormatOptions::new(),
-};
-
-// -----------------------------------------------------------------------------
-// Widening cast u16 → u32 (statically infallible). NumCast::from never returns
-// None for widening, so the failure path is dead in both strategies.
-// -----------------------------------------------------------------------------
-
-/// Strategy 1 (unpack mask): closure consults `valid`, short-circuits at null
-/// lanes. For widening the short-circuit is dead anyway (no failure possible).
-#[divan::bench(args = SIZES)]
-fn widen_u16_u32_unpack_mask(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
-        .bench_values(|(values, mask, mut out)| {
-            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| {
-                if !valid {
-                    return Some(0u32);
-                }
-                <u32 as NumCast>::from(v)
-            })
-            .unwrap();
-            out
-        });
-}
-
-/// Strategy 2 (pack and filter): closure ignores `_valid`. LLVM DCEs the
-/// per-lane mask extract; post-loop `& src_chunk` would filter null-lane fails
-/// (none happen for widening).
-#[divan::bench(args = SIZES)]
-fn widen_u16_u32_pack_and_filter(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
-        .bench_values(|(values, mask, mut out)| {
-            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| {
-                <u32 as NumCast>::from(v)
-            })
-            .unwrap();
-            out
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn widen_u16_u32_arrow(bencher: Bencher, _n: usize) {
-    let f = fixture(_n);
-    bencher
-        .with_inputs(|| f.arrow_u16.clone())
-        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
-}
-
-// -----------------------------------------------------------------------------
-// Checked add u32 + u32 → u32 (genuinely fallible). LaneZip(lhs, rhs) drives
-// two-input lanewise.
-// -----------------------------------------------------------------------------
-
-/// Strategy 1 (unpack mask): closure short-circuits null lanes; `checked_add`
-/// only runs at valid lanes.
-#[divan::bench(args = SIZES)]
-fn checked_add_u32_unpack_mask(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            (
-                f.lhs_u32.clone(),
-                f.rhs_u32.clone(),
-                f.mask.clone(),
-                uninit_out::<u32>(n),
-            )
-        })
-        .bench_values(|(lhs, rhs, mask, mut out)| {
-            try_map_with_mask(
-                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
-                &mask,
-                out.as_mut_slice(),
-                |(a, b), valid| {
-                    if !valid {
-                        return Some(0u32);
-                    }
-                    a.checked_add(b)
-                },
-            )
-            .unwrap();
-            out
-        });
-}
-
-/// Strategy 2 (pack and filter): `checked_add` runs at every lane (including
-/// null lanes with garbage values); kernel's `fail_bits & src_chunk` post-filter
-/// drops any null-lane fails.
-#[divan::bench(args = SIZES)]
-fn checked_add_u32_pack_and_filter(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            (
-                f.lhs_u32.clone(),
-                f.rhs_u32.clone(),
-                f.mask.clone(),
-                uninit_out::<u32>(n),
-            )
-        })
-        .bench_values(|(lhs, rhs, mask, mut out)| {
-            try_map_with_mask(
-                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
-                &mask,
-                out.as_mut_slice(),
-                |(a, b), _valid| a.checked_add(b),
-            )
-            .unwrap();
-            out
-        });
-}
-
-// Asm-extraction helpers: `#[unsafe(no_mangle)] #[inline(never)]` so a single
-// `cargo rustc --emit=asm` produces clearly-labeled symbols to diff.
-
-#[unsafe(no_mangle)]
-#[inline(never)]
-pub fn asm_add_unpack_branchy(
-    lhs: &[u32],
-    rhs: &[u32],
-    mask: &BitBuffer,
-    out: &mut [MaybeUninit<u32>],
-) -> Result<(), usize> {
-    try_map_with_mask(
-        LaneZip::new(lhs, rhs),
-        mask,
-        out,
-        |(a, b), valid| {
-            if !valid {
-                return Some(0u32);
-            }
-            a.checked_add(b)
-        },
-    )
-}
-
-#[unsafe(no_mangle)]
-#[inline(never)]
-pub fn asm_add_unpack_branchless(
-    lhs: &[u32],
-    rhs: &[u32],
-    mask: &BitBuffer,
-    out: &mut [MaybeUninit<u32>],
-) -> Result<(), usize> {
-    try_map_with_mask(
-        LaneZip::new(lhs, rhs),
-        mask,
-        out,
-        |(a, b), valid| {
-            // Compute first, then select. No early-return; LLVM may if-convert.
-            let r = a.checked_add(b);
-            if valid { r } else { Some(0u32) }
-        },
-    )
-}
-
-#[unsafe(no_mangle)]
-#[inline(never)]
-pub fn asm_add_unpack_multiply(
-    lhs: &[u32],
-    rhs: &[u32],
-    mask: &BitBuffer,
-    out: &mut [MaybeUninit<u32>],
-) -> Result<(), usize> {
-    try_map_with_mask(
-        LaneZip::new(lhs, rhs),
-        mask,
-        out,
-        |(a, b), valid| {
-            // Neutralize null lanes via multiply (BIC); checked_add runs unconditionally.
-            let m = valid as u32;
-            (a * m).checked_add(b * m)
-        },
-    )
-}
-
-#[unsafe(no_mangle)]
-#[inline(never)]
-pub fn asm_add_pack_filter(
-    lhs: &[u32],
-    rhs: &[u32],
-    mask: &BitBuffer,
-    out: &mut [MaybeUninit<u32>],
-) -> Result<(), usize> {
-    try_map_with_mask(
-        LaneZip::new(lhs, rhs),
-        mask,
-        out,
-        |(a, b), _valid| a.checked_add(b),
-    )
-}
-
-/// Branchless-multiply variant of unpack_mask: scale lhs/rhs by `valid as u32` so
-/// the checked op runs at every lane (with zeros at null lanes — never overflows)
-/// and the kernel's post-loop `& src_chunk` filter still applies.
-#[divan::bench(args = SIZES)]
-fn checked_add_u32_unpack_multiply(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            (
-                f.lhs_u32.clone(),
-                f.rhs_u32.clone(),
-                f.mask.clone(),
-                uninit_out::<u32>(n),
-            )
-        })
-        .bench_values(|(lhs, rhs, mask, mut out)| {
-            try_map_with_mask(
-                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
-                &mask,
-                out.as_mut_slice(),
-                |(a, b), valid| {
-                    let m = valid as u32;
-                    (a * m).checked_add(b * m)
-                },
-            )
-            .unwrap();
-            out
-        });
-}
-
-/// Compute-first-then-select variant of unpack_mask: removes the early `return`,
-/// keeps the `valid` consult per-lane. Tests whether LLVM if-converts when both
-/// branches are pure expressions.
-#[divan::bench(args = SIZES)]
-fn checked_add_u32_unpack_branchless(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            (
-                f.lhs_u32.clone(),
-                f.rhs_u32.clone(),
-                f.mask.clone(),
-                uninit_out::<u32>(n),
-            )
-        })
-        .bench_values(|(lhs, rhs, mask, mut out)| {
-            try_map_with_mask(
-                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
-                &mask,
-                out.as_mut_slice(),
-                |(a, b), valid| {
-                    let r = a.checked_add(b);
-                    if valid { r } else { Some(0u32) }
-                },
-            )
-            .unwrap();
-            out
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn checked_add_u32_arrow(bencher: Bencher, _n: usize) {
-    let f = fixture(_n);
-    bencher
-        .with_inputs(|| (f.arrow_lhs.clone(), f.arrow_rhs.clone()))
-        .bench_refs(|(lhs, rhs)| {
-            let lhs_datum: &dyn Datum = lhs.as_ref();
-            let rhs_datum: &dyn Datum = rhs.as_ref();
-            add(lhs_datum, rhs_datum).unwrap()
-        });
-}

From fe34ccbe7ffd4021d550907bd6bf85755970567f Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 15:44:41 +0100
Subject: [PATCH 11/21] Trim add_checked bench to two variants; drop pack_vs_unpack target

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-buffer/Cargo.toml             |   4 -
 vortex-buffer/benches/add_checked.rs | 521 ++++-----------------------
 2 files changed, 76 insertions(+), 449 deletions(-)

diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml
index 048d2612364..882de199818 100644
--- a/vortex-buffer/Cargo.toml
+++ b/vortex-buffer/Cargo.toml
@@ -62,7 +62,3 @@ harness = false
 [[bench]]
 name = "add_checked"
 harness = false
-
-[[bench]]
-name = "pack_vs_unpack"
-harness = false
diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs
index ff4f5f64e9a..2d4db4959e7 100644
--- a/vortex-buffer/benches/add_checked.rs
+++ b/vortex-buffer/benches/add_checked.rs
@@ -1,40 +1,28 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-//! Checked `u32 + u32 -> u32` over two nullable columns — exhaustive variant
-//! comparison.
+//! Checked `u32 + u32 -> u32` over two nullable columns.
 //!
-//! Variants differ along three axes:
+//! Two implementations:
 //!
-//! 1. **Closure suppression strategy** — how the closure (if any) handles null lanes
-//!    - `value_only`: `|(a,b), _|` ignores validity
-//!    - `if_else`: `|(a,b), valid| if valid { ... } else { Some(default) }`
-//!    - `or_else`: `|(a,b), valid| ....or_else(|| (!valid).then(...))`
-//!    - `mul_trick`: `(a * valid as u32).checked_add(b * valid as u32)`
+//! - [`bitpack_value_only`] — production path via [`try_map_with_mask`] with a
+//!   value-only closure. Per-lane `is_none()` flags are bit-packed and AND-ed
+//!   with the chunk validity word so null-lane overflow is filtered without
+//!   the closure ever inspecting `valid`.
+//! - [`premask_then_simd`] — hand-rolled ceiling. Bit-broadcasts each mask bit
+//!   to `0x00000000`/`0xFFFFFFFF`, ANDs into both operands (null lanes become
+//!   `0+0`), then unconditional `overflowing_add` with a per-chunk OR-reduced
+//!   `fail_acc` and cold scalar attribution. Same pattern that beat arrow on
+//!   the primitive cast bench (37 µs vs 55 µs).
 //!
-//! 2. **Fail tracking scheme**
-//!    - bit-pack: `fail_bits |= (is_none << bit_idx)`; chunk-AND with mask
-//!    - boolean: `fail_acc |= is_none as u64`; cold replay attribution
-//!
-//! 3. **Validity application**
-//!    - in closure: closure consumes `valid`
-//!    - post-mask: kernel ANDs fail bitmap with `src_chunk`
-//!    - pre-mask: kernel zeros null-lane values via bit-broadcast before SIMD add
-//!    - none: ignore validity (ceiling only — not correct for real inputs)
-//!
-//! All correctness-preserving variants are verified via [`assert_overflow_parity`]
-//! and [`assert_null_overflow_suppressed`] at startup. The `pure_simd_no_validity`
-//! variant is benched as a ceiling only — it does not respect nullability.
+//! Both are verified at startup via [`assert_overflow_parity`] (valid-lane
+//! overflow propagates as `Err`) and [`assert_null_overflow_suppressed`]
+//! (null-lane overflow does not).
 
 #![expect(clippy::unwrap_used)]
 
 use std::mem::MaybeUninit;
-use std::sync::Arc;
 
-use arrow_array::Datum;
-use arrow_array::UInt32Array;
-use arrow_buffer::NullBuffer;
-use arrow_buffer::ScalarBuffer;
 use divan::Bencher;
 use rand::SeedableRng;
 use rand::prelude::*;
@@ -47,7 +35,6 @@ use vortex_buffer::lane_ops_indexed::try_map_with_mask;
 fn main() {
     assert_overflow_parity();
     assert_null_overflow_suppressed();
-    assert_pure_simd_errs_on_realistic_data();
     divan::main();
 }
 
@@ -56,19 +43,13 @@ const LHS_VALID_RATE: f64 = 0.7;
 const RHS_VALID_RATE: f64 = 0.8;
 
 struct Fixture {
-    /// **Realistic** lhs: valid lanes bounded, null lanes `u32::MAX`.
-    /// A kernel that ignores validity will see overflow at null lanes.
+    /// Valid lanes carry bounded values; null lanes hold `u32::MAX` so a kernel
+    /// that ignores validity would Err on them. Both implementations under test
+    /// must suppress that.
     lhs: Buffer<u32>,
     rhs: Buffer<u32>,
-    /// **Sanitized** lhs: valid lanes bounded, null lanes pre-zeroed.
-    /// Used by `pure_simd_no_validity_sanitized` only — its precondition is
-    /// "someone already zeroed the nulls."
-    lhs_sanitized: Buffer<u32>,
-    rhs_sanitized: Buffer<u32>,
     lhs_mask: BitBuffer,
     rhs_mask: BitBuffer,
-    lhs_arrow: Arc<UInt32Array>,
-    rhs_arrow: Arc<UInt32Array>,
 }
 
 fn fixture(n: usize) -> Fixture {
@@ -80,11 +61,7 @@ fn fixture(n: usize) -> Fixture {
     let lhs_valid: Vec<bool> = (0..n).map(|_| lvr.random_bool(LHS_VALID_RATE)).collect();
     let rhs_valid: Vec<bool> = (0..n).map(|_| rvr.random_bool(RHS_VALID_RATE)).collect();
 
-    // **Realistic null storage**: null lanes contain u32::MAX. Adding two such
-    // values overflows — a kernel that ignores validity will spuriously Err.
-    // Valid lanes carry bounded values so the success path is measured at lanes
-    // where overflow shouldn't fire.
-    let raw_lhs: Vec<u32> = (0..n)
+    let lhs: Buffer<u32> = (0..n)
         .map(|i| {
             if lhs_valid[i] {
                 lhs_rng.random_range(0..u16::MAX as u32)
@@ -93,7 +70,7 @@ fn fixture(n: usize) -> Fixture {
             }
         })
         .collect();
-    let raw_rhs: Vec<u32> = (0..n)
+    let rhs: Buffer<u32> = (0..n)
         .map(|i| {
             if rhs_valid[i] {
                 rhs_rng.random_range(0..u16::MAX as u32)
@@ -103,16 +80,6 @@ fn fixture(n: usize) -> Fixture {
         })
         .collect();
 
-    let lhs: Buffer<u32> = raw_lhs.iter().copied().collect();
-    let rhs: Buffer<u32> = raw_rhs.iter().copied().collect();
-
-    let lhs_sanitized: Buffer<u32> = (0..n)
-        .map(|i| if lhs_valid[i] { raw_lhs[i] } else { 0 })
-        .collect();
-    let rhs_sanitized: Buffer<u32> = (0..n)
-        .map(|i| if rhs_valid[i] { raw_rhs[i] } else { 0 })
-        .collect();
-
     let lhs_mask = {
         let mut m = BitBufferMut::with_capacity(n);
         for &v in &lhs_valid {
@@ -128,24 +95,11 @@ fn fixture(n: usize) -> Fixture {
         m.freeze()
     };
 
-    let lhs_arrow = Arc::new(UInt32Array::new(
-        ScalarBuffer::from(raw_lhs),
-        Some(NullBuffer::from(lhs_valid)),
-    ));
-    let rhs_arrow = Arc::new(UInt32Array::new(
-        ScalarBuffer::from(raw_rhs),
-        Some(NullBuffer::from(rhs_valid)),
-    ));
-
     Fixture {
         lhs,
         rhs,
-        lhs_sanitized,
-        rhs_sanitized,
         lhs_mask,
         rhs_mask,
-        lhs_arrow,
-        rhs_arrow,
     }
 }
 
@@ -157,25 +111,7 @@ fn alloc_out(n: usize) -> Vec<MaybeUninit<u32>> {
 }
 
 // ---------------------------------------------------------------------------
-// Variant 0: arrow_arith::numeric::add — baseline
-// ---------------------------------------------------------------------------
-
-#[divan::bench(args = SIZES)]
-fn arrow_add(bencher: Bencher, n: usize) {
-    let _ = n;
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| (f.lhs_arrow.clone(), f.rhs_arrow.clone()))
-        .bench_refs(|(lhs, rhs)| {
-            arrow_arith::numeric::add(lhs.as_ref() as &dyn Datum, rhs.as_ref() as &dyn Datum)
-                .unwrap()
-        });
-}
-
-// ---------------------------------------------------------------------------
-// Variant 1: try_map_with_mask + closure `|(a, b), _|` (value-only)
-// Fail tracking: bit-pack via the kernel.
-// LLVM DCEs per-lane mask extract.
+// bitpack_value_only — production path via try_map_with_mask.
 // ---------------------------------------------------------------------------
 
 #[divan::bench(args = SIZES)]
@@ -205,228 +141,65 @@ fn bitpack_value_only(bencher: Bencher, n: usize) {
 }
 
 // ---------------------------------------------------------------------------
-// Variant 2: try_map_with_mask + closure `|(a, b), valid|` with if-else
-// Fail tracking: bit-pack via the kernel.
-// Closure explicitly suppresses null-lane fails (redundant with bit-pack filter).
-// ---------------------------------------------------------------------------
-
-#[divan::bench(args = SIZES)]
-fn bitpack_closure_suppresses_if_else(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            (
-                f.lhs.clone(),
-                f.rhs.clone(),
-                f.lhs_mask.clone(),
-                f.rhs_mask.clone(),
-            )
-        })
-        .bench_refs(|(lhs, rhs, lm, rm)| {
-            let combined = lm as &BitBuffer & rm as &BitBuffer;
-            let mut out = alloc_out(n);
-            try_map_with_mask(
-                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
-                &combined,
-                out.as_mut_slice(),
-                |(a, b), valid| {
-                    if valid { a.checked_add(b) } else { Some(0) }
-                },
-            )
-            .unwrap();
-            (combined, out)
-        });
-}
-
-// ---------------------------------------------------------------------------
-// Variant 3: try_map_with_mask + closure `.or_else(|| (!valid).then(...))`
-// Fail tracking: bit-pack via the kernel.
-// Lazy suppression: closure only consults `valid` when overflow actually fires.
-// ---------------------------------------------------------------------------
-
-#[divan::bench(args = SIZES)]
-fn bitpack_closure_suppresses_or_else(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            (
-                f.lhs.clone(),
-                f.rhs.clone(),
-                f.lhs_mask.clone(),
-                f.rhs_mask.clone(),
-            )
-        })
-        .bench_refs(|(lhs, rhs, lm, rm)| {
-            let combined = lm as &BitBuffer & rm as &BitBuffer;
-            let mut out = alloc_out(n);
-            try_map_with_mask(
-                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
-                &combined,
-                out.as_mut_slice(),
-                |(a, b), valid| a.checked_add(b).or_else(|| (!valid).then_some(0)),
-            )
-            .unwrap();
-            (combined, out)
-        });
-}
-
-// ---------------------------------------------------------------------------
-// Variant 4: try_map_with_mask + closure with `(a * valid).checked_add(b * valid)`
-// Fail tracking: bit-pack via the kernel.
-// The multiply-by-valid trick zeroes null-lane operands so they can't overflow.
-// ---------------------------------------------------------------------------
-
-#[divan::bench(args = SIZES)]
-fn bitpack_closure_mul_trick(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            (
-                f.lhs.clone(),
-                f.rhs.clone(),
-                f.lhs_mask.clone(),
-                f.rhs_mask.clone(),
-            )
-        })
-        .bench_refs(|(lhs, rhs, lm, rm)| {
-            let combined = lm as &BitBuffer & rm as &BitBuffer;
-            let mut out = alloc_out(n);
-            try_map_with_mask(
-                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
-                &combined,
-                out.as_mut_slice(),
-                |(a, b), valid| {
-                    let m = valid as u32;
-                    (a * m).checked_add(b * m)
-                },
-            )
-            .unwrap();
-            (combined, out)
-        });
-}
-
-// ---------------------------------------------------------------------------
-// Variant 5: hand-rolled, boolean fail_acc, closure suppresses nulls, cold replay
+// premask_then_simd — hand-rolled ceiling.
 // ---------------------------------------------------------------------------
 
-/// Hand-rolled kernel: boolean `fail_acc`, cold replay attribution.
-/// Closure is expected to suppress null-lane fails by returning `Some(...)`;
-/// `fail_acc` only fires for real valid-lane overflows.
 #[inline]
-fn handrolled_boolean<F>(
+fn handrolled_premask(
     lhs: &[u32],
     rhs: &[u32],
     mask: &BitBuffer,
     out: &mut [MaybeUninit<u32>],
-    mut f: F,
-) -> Result<(), usize>
-where
-    F: FnMut(u32, u32, bool) -> Option<u32>,
-{
-    let len = lhs.len();
-    assert_eq!(len, rhs.len());
-    assert_eq!(len, mask.len());
-    assert_eq!(len, out.len());
-    let chunks = mask.chunks();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
-
-    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        let base = chunk_idx * 64;
+) -> Result<(), usize> {
+    /// Per-chunk hot loop. Bit-broadcasts each validity bit to 0x00 / 0xFF,
+    /// ANDs both operands, then `overflowing_add`. Returns true if any lane in
+    /// `[base, base+count)` overflowed. `#[inline(always)]` keeps the literal
+    /// `64` at the full-chunk call site for const propagation.
+    #[inline(always)]
+    fn chunk(
+        lhs: &[u32],
+        rhs: &[u32],
+        out: &mut [MaybeUninit<u32>],
+        src_chunk: u64,
+        base: usize,
+        count: usize,
+    ) -> bool {
         let mut fail_acc: u64 = 0;
-        for bit_idx in 0..64 {
+        for bit_idx in 0..count {
             let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: i < len.
-            let a = unsafe { *lhs.get_unchecked(i) };
-            let b = unsafe { *rhs.get_unchecked(i) };
-            let opt = f(a, b, bit);
-            fail_acc |= opt.is_none() as u64;
-            unsafe { out.get_unchecked_mut(i).write(opt.unwrap_or_default()) };
-        }
-        if fail_acc != 0 {
-            // Cold: find first failing lane (closure already suppressed nulls).
-            for bit_idx in 0..64 {
-                let i = base + bit_idx;
-                let bit = (src_chunk >> bit_idx) & 1 == 1;
-                let a = unsafe { *lhs.get_unchecked(i) };
-                let b = unsafe { *rhs.get_unchecked(i) };
-                if f(a, b, bit).is_none() {
-                    return Err(i);
-                }
-            }
+            let lane_mask = (((src_chunk >> bit_idx) & 1) as u32).wrapping_neg();
+            // SAFETY: caller guarantees base + count <= len.
+            let a = unsafe { *lhs.get_unchecked(i) } & lane_mask;
+            let b = unsafe { *rhs.get_unchecked(i) } & lane_mask;
+            let (sum, overflow) = a.overflowing_add(b);
+            fail_acc |= overflow as u64;
+            // SAFETY: caller guarantees base + count <= len.
+            unsafe { out.get_unchecked_mut(i).write(sum) };
         }
+        fail_acc != 0
     }
 
-    if remainder != 0 {
-        let src_chunk = chunks.remainder_bits();
-        let base = chunks_count * 64;
-        let mut fail_acc: u64 = 0;
-        for bit_idx in 0..remainder {
+    /// Cold attribution. Walks the chunk on raw (unmasked) operands and reports
+    /// the first valid lane that overflows. Null lanes were premasked to `0+0`
+    /// in the hot loop so they cannot contribute here.
+    #[cold]
+    #[inline(never)]
+    fn attribute(lhs: &[u32], rhs: &[u32], src_chunk: u64, base: usize, count: usize) -> usize {
+        for bit_idx in 0..count {
+            if (src_chunk >> bit_idx) & 1 == 0 {
+                continue;
+            }
             let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
+            // SAFETY: caller guarantees base + count <= len.
             let a = unsafe { *lhs.get_unchecked(i) };
             let b = unsafe { *rhs.get_unchecked(i) };
-            let opt = f(a, b, bit);
-            fail_acc |= opt.is_none() as u64;
-            unsafe { out.get_unchecked_mut(i).write(opt.unwrap_or_default()) };
-        }
-        if fail_acc != 0 {
-            for bit_idx in 0..remainder {
-                let i = base + bit_idx;
-                let bit = (src_chunk >> bit_idx) & 1 == 1;
-                let a = unsafe { *lhs.get_unchecked(i) };
-                let b = unsafe { *rhs.get_unchecked(i) };
-                if f(a, b, bit).is_none() {
-                    return Err(i);
-                }
+            if a.checked_add(b).is_none() {
+                return i;
             }
         }
+        unreachable!("attribute called without a failing valid lane")
     }
-    Ok(())
-}
-
-#[divan::bench(args = SIZES)]
-fn boolean_closure_suppresses(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            (
-                f.lhs.clone(),
-                f.rhs.clone(),
-                f.lhs_mask.clone(),
-                f.rhs_mask.clone(),
-            )
-        })
-        .bench_refs(|(lhs, rhs, lm, rm)| {
-            let combined = lm as &BitBuffer & rm as &BitBuffer;
-            let mut out = alloc_out(n);
-            handrolled_boolean(
-                lhs.as_slice(),
-                rhs.as_slice(),
-                &combined,
-                out.as_mut_slice(),
-                |a, b, valid| {
-                    if valid { a.checked_add(b) } else { Some(0) }
-                },
-            )
-            .unwrap();
-            (combined, out)
-        });
-}
 
-// ---------------------------------------------------------------------------
-// Variant 6: hand-rolled pre-mask. Kernel zeros null-lane values via bit
-// broadcast, then unconditional add + overflow detect. Boolean fail_acc.
-// ---------------------------------------------------------------------------
-
-#[inline]
-fn handrolled_premask(
-    lhs: &[u32],
-    rhs: &[u32],
-    mask: &BitBuffer,
-    out: &mut [MaybeUninit<u32>],
-) -> Result<(), usize> {
     let len = lhs.len();
     assert_eq!(len, rhs.len());
     assert_eq!(len, mask.len());
@@ -437,62 +210,15 @@ fn handrolled_premask(
 
     for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
         let base = chunk_idx * 64;
-        let mut fail_acc: u64 = 0;
-        for bit_idx in 0..64 {
-            // bit-broadcast: 0 → 0x00000000, 1 → 0xFFFFFFFF
-            let lane_mask = (((src_chunk >> bit_idx) & 1) as u32).wrapping_neg();
-            let i = base + bit_idx;
-            // SAFETY: i < len.
-            let a = unsafe { *lhs.get_unchecked(i) } & lane_mask;
-            let b = unsafe { *rhs.get_unchecked(i) } & lane_mask;
-            let (sum, overflow) = a.overflowing_add(b);
-            fail_acc |= overflow as u64;
-            unsafe { out.get_unchecked_mut(i).write(sum) };
-        }
-        if fail_acc != 0 {
-            // Cold: walk chunk to find first valid lane that actually overflows on
-            // the unmasked inputs. Null lanes were premasked to 0+0, can't overflow.
-            for bit_idx in 0..64 {
-                let i = base + bit_idx;
-                let bit = (src_chunk >> bit_idx) & 1 == 1;
-                if !bit {
-                    continue;
-                }
-                let a = unsafe { *lhs.get_unchecked(i) };
-                let b = unsafe { *rhs.get_unchecked(i) };
-                if a.checked_add(b).is_none() {
-                    return Err(i);
-                }
-            }
+        if chunk(lhs, rhs, out, src_chunk, base, 64) {
+            return Err(attribute(lhs, rhs, src_chunk, base, 64));
         }
     }
-
     if remainder != 0 {
         let src_chunk = chunks.remainder_bits();
         let base = chunks_count * 64;
-        let mut fail_acc: u64 = 0;
-        for bit_idx in 0..remainder {
-            let lane_mask = (((src_chunk >> bit_idx) & 1) as u32).wrapping_neg();
-            let i = base + bit_idx;
-            let a = unsafe { *lhs.get_unchecked(i) } & lane_mask;
-            let b = unsafe { *rhs.get_unchecked(i) } & lane_mask;
-            let (sum, overflow) = a.overflowing_add(b);
-            fail_acc |= overflow as u64;
-            unsafe { out.get_unchecked_mut(i).write(sum) };
-        }
-        if fail_acc != 0 {
-            for bit_idx in 0..remainder {
-                let i = base + bit_idx;
-                let bit = (src_chunk >> bit_idx) & 1 == 1;
-                if !bit {
-                    continue;
-                }
-                let a = unsafe { *lhs.get_unchecked(i) };
-                let b = unsafe { *rhs.get_unchecked(i) };
-                if a.checked_add(b).is_none() {
-                    return Err(i);
-                }
-            }
+        if chunk(lhs, rhs, out, src_chunk, base, remainder) {
+            return Err(attribute(lhs, rhs, src_chunk, base, remainder));
         }
     }
     Ok(())
@@ -524,72 +250,16 @@ fn premask_then_simd(bencher: Bencher, n: usize) {
         });
 }
 
-// ---------------------------------------------------------------------------
-// Variant 7: pure SIMD, no mask awareness — CEILING REFERENCE ONLY.
-// Incorrect for arrays where null lanes might overflow; benchmarked just to
-// show the theoretical floor for nullable add.
-// ---------------------------------------------------------------------------
-
-#[inline]
-fn handrolled_no_validity(
-    lhs: &[u32],
-    rhs: &[u32],
-    out: &mut [MaybeUninit<u32>],
-) -> Result<(), usize> {
-    assert_eq!(lhs.len(), rhs.len());
-    assert_eq!(lhs.len(), out.len());
-    let mut fail = false;
-    for i in 0..lhs.len() {
-        let a = unsafe { *lhs.get_unchecked(i) };
-        let b = unsafe { *rhs.get_unchecked(i) };
-        let (sum, overflow) = a.overflowing_add(b);
-        fail |= overflow;
-        unsafe { out.get_unchecked_mut(i).write(sum) };
-    }
-    if fail { Err(0) } else { Ok(()) }
-}
-
-/// Pure-SIMD ceiling on **pre-sanitized** input (null lanes pre-zeroed in the
-/// fixture, outside the timed region). Cannot run on the realistic
-/// `(lhs, rhs)` arrays because their null lanes hold `u32::MAX` and would
-/// Err — proven by [`assert_pure_simd_errs_on_realistic_data`].
-///
-/// Showing the SIMD-only arithmetic floor — what an ideal nullable-add would
-/// look like if validity could be free.
-#[divan::bench(args = SIZES)]
-fn pure_simd_no_validity_sanitized(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| (f.lhs_sanitized.clone(), f.rhs_sanitized.clone()))
-        .bench_refs(|(lhs, rhs)| {
-            let mut out = alloc_out(n);
-            handrolled_no_validity(lhs.as_slice(), rhs.as_slice(), out.as_mut_slice()).unwrap();
-            out
-        });
-}
-
 // ---------------------------------------------------------------------------
 // Parity assertions — must pass before divan runs benches.
 // ---------------------------------------------------------------------------
 
-/// Both arrow and our kernel must Err on overflow at a valid lane.
+/// Both implementations must Err on overflow at a valid lane.
 fn assert_overflow_parity() {
     let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
     let rhs: Vec<u32> = vec![10, 20, 1, 40];
     let valid = vec![true; 4];
 
-    let lhs_arrow = UInt32Array::new(
-        ScalarBuffer::from(lhs.clone()),
-        Some(NullBuffer::from(valid.clone())),
-    );
-    let rhs_arrow = UInt32Array::new(
-        ScalarBuffer::from(rhs.clone()),
-        Some(NullBuffer::from(valid.clone())),
-    );
-    let arrow_result =
-        arrow_arith::numeric::add(&lhs_arrow as &dyn Datum, &rhs_arrow as &dyn Datum);
-    assert!(arrow_result.is_err(), "arrow should Err on overflow");
-
     let mask = {
         let mut m = BitBufferMut::with_capacity(4);
         for &v in &valid {
@@ -597,30 +267,24 @@ fn assert_overflow_parity() {
         }
         m.freeze()
     };
+
     let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
-    let ours = try_map_with_mask(
+    let bitpack = try_map_with_mask(
         LaneZip::new(lhs.as_slice(), rhs.as_slice()),
         &mask,
         out.as_mut_slice(),
         |(a, b), _| a.checked_add(b),
     );
-    assert!(ours.is_err(), "bitpack should Err on overflow");
+    assert!(bitpack.is_err(), "bitpack should Err on overflow");
 
-    let mut out2: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
-    let boolean = handrolled_boolean(&lhs, &rhs, &mask, &mut out2, |a, b, valid| {
-        if valid { a.checked_add(b) } else { Some(0) }
-    });
-    assert!(boolean.is_err(), "boolean should Err on overflow");
-
-    let mut out3: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
-    let prem = handrolled_premask(&lhs, &rhs, &mask, &mut out3);
+    let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
+    let prem = handrolled_premask(&lhs, &rhs, &mask, &mut out);
     assert!(prem.is_err(), "premask should Err on overflow");
 }
 
-/// All correctness-preserving variants must NOT Err when only null lanes
-/// would overflow. (Pure-SIMD variant is excluded — it doesn't see validity.)
+/// Both implementations must NOT Err when only null lanes would overflow.
 fn assert_null_overflow_suppressed() {
-    // Lane 2 is null and contains overflowing values; valid lanes are safe.
+    // Lane 2 is null and holds an overflowing value; valid lanes are safe.
     let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
     let rhs: Vec<u32> = vec![10, 20, 1, 40];
     let valid = vec![true, true, false, true];
@@ -633,49 +297,16 @@ fn assert_null_overflow_suppressed() {
         m.freeze()
     };
 
-    // Bit-pack with value-only closure — kernel filters null-lane fails.
     let mut out = alloc_out(4);
-    let r = try_map_with_mask(
+    let bitpack = try_map_with_mask(
         LaneZip::new(lhs.as_slice(), rhs.as_slice()),
         &mask,
         out.as_mut_slice(),
         |(a, b), _| a.checked_add(b),
     );
-    assert!(r.is_ok(), "bitpack_value_only: null-lane overflow leaked");
+    assert!(bitpack.is_ok(), "bitpack: null-lane overflow leaked");
 
-    // Boolean with closure that suppresses nulls.
     let mut out = alloc_out(4);
-    let r = handrolled_boolean(&lhs, &rhs, &mask, &mut out, |a, b, valid| {
-        if valid { a.checked_add(b) } else { Some(0) }
-    });
-    assert!(r.is_ok(), "boolean_closure_suppresses: null-lane leaked");
-
-    // Pre-mask: kernel zeroes null-lane values.
-    let mut out = alloc_out(4);
-    let r = handrolled_premask(&lhs, &rhs, &mask, &mut out);
-    assert!(r.is_ok(), "premask_then_simd: null-lane overflow leaked");
-}
-
-/// Demonstrates that `pure_simd_no_validity` is **incorrect** on realistic
-/// fixture inputs — i.e., when null lanes contain values that overflow on add.
-/// This is what justifies excluding pure_simd from the realistic bench and
-/// running it only on the sanitized inputs. Without this, the "ignore the
-/// mask" approach would look too fast because the test data lets it cheat.
-fn assert_pure_simd_errs_on_realistic_data() {
-    // Lane 2 is a "null lane" in arrow-style storage: bitmap says null, but
-    // the data buffer still holds an overflowing value. The realistic
-    // `fixture` does exactly this.
-    let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
-    let rhs: Vec<u32> = vec![10, 20, 1, 40];
-    let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
-
-    let r = handrolled_no_validity(&lhs, &rhs, &mut out);
-    assert!(
-        r.is_err(),
-        "pure_simd_no_validity should Err on realistic data (null lane has \
-         u32::MAX). If this passes, the bench fixture isn't exercising the \
-         unsafe-null-storage case and the pure_simd ceiling number is \
-         misleading — it's running on data the kernel happens to handle even \
-         without a mask."
-    );
+    let prem = handrolled_premask(&lhs, &rhs, &mask, &mut out);
+    assert!(prem.is_ok(), "premask: null-lane overflow leaked");
 }

From 4299cf0e8391dbbd1461a9ad7fd11904a7ae890a Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 15:59:20 +0100
Subject: [PATCH 12/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-buffer/benches/add_checked.rs | 159 +++------------------------
 1 file changed, 14 insertions(+), 145 deletions(-)

diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs
index 2d4db4959e7..5814e14262e 100644
--- a/vortex-buffer/benches/add_checked.rs
+++ b/vortex-buffer/benches/add_checked.rs
@@ -1,23 +1,14 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-//! Checked `u32 + u32 -> u32` over two nullable columns.
+//! Checked `u32 + u32 -> u32` over two nullable columns via [`try_map_with_mask`]
+//! with a value-only closure. Per-lane `is_none()` flags are bit-packed and
+//! AND-ed with the chunk validity word so null-lane overflow is filtered
+//! without the closure ever inspecting `valid`.
 //!
-//! Two implementations:
-//!
-//! - [`bitpack_value_only`] — production path via [`try_map_with_mask`] with a
-//!   value-only closure. Per-lane `is_none()` flags are bit-packed and AND-ed
-//!   with the chunk validity word so null-lane overflow is filtered without
-//!   the closure ever inspecting `valid`.
-//! - [`premask_then_simd`] — hand-rolled ceiling. Bit-broadcasts each mask bit
-//!   to `0x00000000`/`0xFFFFFFFF`, ANDs into both operands (null lanes become
-//!   `0+0`), then unconditional `overflowing_add` with a per-chunk OR-reduced
-//!   `fail_acc` and cold scalar attribution. Same pattern that beat arrow on
-//!   the primitive cast bench (37 µs vs 55 µs).
-//!
-//! Both are verified at startup via [`assert_overflow_parity`] (valid-lane
-//! overflow propagates as `Err`) and [`assert_null_overflow_suppressed`]
-//! (null-lane overflow does not).
+//! Verified at startup via [`assert_overflow_parity`] (valid-lane overflow
+//! propagates as `Err`) and [`assert_null_overflow_suppressed`] (null-lane
+//! overflow does not).
 
 #![expect(clippy::unwrap_used)]
 
@@ -44,7 +35,7 @@ const RHS_VALID_RATE: f64 = 0.8;
 
 struct Fixture {
     /// Valid lanes carry bounded values; null lanes hold `u32::MAX` so a kernel
-    /// that ignores validity would Err on them. Both implementations under test
+    /// that ignores validity would Err on them. The implementation under test
     /// must suppress that.
     lhs: Buffer<u32>,
     rhs: Buffer<u32>,
@@ -110,10 +101,6 @@ fn alloc_out(n: usize) -> Vec<MaybeUninit<u32>> {
     out
 }
 
-// ---------------------------------------------------------------------------
-// bitpack_value_only — production path via try_map_with_mask.
-// ---------------------------------------------------------------------------
-
 #[divan::bench(args = SIZES)]
 fn bitpack_value_only(bencher: Bencher, n: usize) {
     let f = fixture(n);
@@ -140,121 +127,11 @@ fn bitpack_value_only(bencher: Bencher, n: usize) {
         });
 }
 
-// ---------------------------------------------------------------------------
-// premask_then_simd — hand-rolled ceiling.
-// ---------------------------------------------------------------------------
-
-#[inline]
-fn handrolled_premask(
-    lhs: &[u32],
-    rhs: &[u32],
-    mask: &BitBuffer,
-    out: &mut [MaybeUninit<u32>],
-) -> Result<(), usize> {
-    /// Per-chunk hot loop. Bit-broadcasts each validity bit to 0x00 / 0xFF,
-    /// ANDs both operands, then `overflowing_add`. Returns true if any lane in
-    /// `[base, base+count)` overflowed. `#[inline(always)]` keeps the literal
-    /// `64` at the full-chunk call site for const propagation.
-    #[inline(always)]
-    fn chunk(
-        lhs: &[u32],
-        rhs: &[u32],
-        out: &mut [MaybeUninit<u32>],
-        src_chunk: u64,
-        base: usize,
-        count: usize,
-    ) -> bool {
-        let mut fail_acc: u64 = 0;
-        for bit_idx in 0..count {
-            let i = base + bit_idx;
-            let lane_mask = (((src_chunk >> bit_idx) & 1) as u32).wrapping_neg();
-            // SAFETY: caller guarantees base + count <= len.
-            let a = unsafe { *lhs.get_unchecked(i) } & lane_mask;
-            let b = unsafe { *rhs.get_unchecked(i) } & lane_mask;
-            let (sum, overflow) = a.overflowing_add(b);
-            fail_acc |= overflow as u64;
-            // SAFETY: caller guarantees base + count <= len.
-            unsafe { out.get_unchecked_mut(i).write(sum) };
-        }
-        fail_acc != 0
-    }
-
-    /// Cold attribution. Walks the chunk on raw (unmasked) operands and reports
-    /// the first valid lane that overflows. Null lanes were premasked to `0+0`
-    /// in the hot loop so they cannot contribute here.
-    #[cold]
-    #[inline(never)]
-    fn attribute(lhs: &[u32], rhs: &[u32], src_chunk: u64, base: usize, count: usize) -> usize {
-        for bit_idx in 0..count {
-            if (src_chunk >> bit_idx) & 1 == 0 {
-                continue;
-            }
-            let i = base + bit_idx;
-            // SAFETY: caller guarantees base + count <= len.
-            let a = unsafe { *lhs.get_unchecked(i) };
-            let b = unsafe { *rhs.get_unchecked(i) };
-            if a.checked_add(b).is_none() {
-                return i;
-            }
-        }
-        unreachable!("attribute called without a failing valid lane")
-    }
-
-    let len = lhs.len();
-    assert_eq!(len, rhs.len());
-    assert_eq!(len, mask.len());
-    assert_eq!(len, out.len());
-    let chunks = mask.chunks();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
-
-    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        let base = chunk_idx * 64;
-        if chunk(lhs, rhs, out, src_chunk, base, 64) {
-            return Err(attribute(lhs, rhs, src_chunk, base, 64));
-        }
-    }
-    if remainder != 0 {
-        let src_chunk = chunks.remainder_bits();
-        let base = chunks_count * 64;
-        if chunk(lhs, rhs, out, src_chunk, base, remainder) {
-            return Err(attribute(lhs, rhs, src_chunk, base, remainder));
-        }
-    }
-    Ok(())
-}
-
-#[divan::bench(args = SIZES)]
-fn premask_then_simd(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            (
-                f.lhs.clone(),
-                f.rhs.clone(),
-                f.lhs_mask.clone(),
-                f.rhs_mask.clone(),
-            )
-        })
-        .bench_refs(|(lhs, rhs, lm, rm)| {
-            let combined = lm as &BitBuffer & rm as &BitBuffer;
-            let mut out = alloc_out(n);
-            handrolled_premask(
-                lhs.as_slice(),
-                rhs.as_slice(),
-                &combined,
-                out.as_mut_slice(),
-            )
-            .unwrap();
-            (combined, out)
-        });
-}
-
 // ---------------------------------------------------------------------------
 // Parity assertions — must pass before divan runs benches.
 // ---------------------------------------------------------------------------
 
-/// Both implementations must Err on overflow at a valid lane.
+/// Overflow at a valid lane must propagate as `Err`.
 fn assert_overflow_parity() {
     let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
     let rhs: Vec<u32> = vec![10, 20, 1, 40];
@@ -269,20 +146,16 @@ fn assert_overflow_parity() {
     };
 
     let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
-    let bitpack = try_map_with_mask(
+    let r = try_map_with_mask(
         LaneZip::new(lhs.as_slice(), rhs.as_slice()),
         &mask,
         out.as_mut_slice(),
         |(a, b), _| a.checked_add(b),
     );
-    assert!(bitpack.is_err(), "bitpack should Err on overflow");
-
-    let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
-    let prem = handrolled_premask(&lhs, &rhs, &mask, &mut out);
-    assert!(prem.is_err(), "premask should Err on overflow");
+    assert!(r.is_err(), "bitpack should Err on overflow");
 }
 
-/// Both implementations must NOT Err when only null lanes would overflow.
+/// Overflow at a null lane must NOT propagate.
 fn assert_null_overflow_suppressed() {
     // Lane 2 is null and holds an overflowing value; valid lanes are safe.
     let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
@@ -298,15 +171,11 @@ fn assert_null_overflow_suppressed() {
     };
 
     let mut out = alloc_out(4);
-    let bitpack = try_map_with_mask(
+    let r = try_map_with_mask(
         LaneZip::new(lhs.as_slice(), rhs.as_slice()),
         &mask,
         out.as_mut_slice(),
         |(a, b), _| a.checked_add(b),
     );
-    assert!(bitpack.is_ok(), "bitpack: null-lane overflow leaked");
-
-    let mut out = alloc_out(4);
-    let prem = handrolled_premask(&lhs, &rhs, &mask, &mut out);
-    assert!(prem.is_ok(), "premask: null-lane overflow leaked");
+    assert!(r.is_ok(), "bitpack: null-lane overflow leaked");
 }

From 8e5945f5818ec904914a4dd632413ebd697bd782 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 16:10:23 +0100
Subject: [PATCH 13/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-buffer/src/lane_ops_indexed.rs | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs
index f8d028eb7b7..f3ade8eeda6 100644
--- a/vortex-buffer/src/lane_ops_indexed.rs
+++ b/vortex-buffer/src/lane_ops_indexed.rs
@@ -287,21 +287,6 @@ where
 /// The closure may also explicitly suppress null-lane failures by branching on
 /// `valid` itself; both behaviors compose.
 ///
-/// ## Hot loop
-///
-/// `fail_bits |= (opt.is_none() as u64) << bit_idx`. After unrolling, `bit_idx` is a
-/// compile-time constant per-iteration, so the shift folds. The closure receives
-/// `(value, valid)`; LLVM DCEs the per-lane `(src_chunk >> bit_idx) & 1` extract
-/// when the closure ignores `valid`, leaving a value-only SIMD body.
-///
-/// ## Attribution
-///
-/// `valid_failures = fail_bits & src_chunk` — non-zero only when at least one
-/// valid lane failed. `trailing_zeros()` gives the first failing valid lane.
-/// **No cold replay**: failure detection and lane attribution happen entirely in
-/// the hot loop. Worst-case bounded per chunk regardless of how many null lanes
-/// returned `None`.
-///
 /// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None`
 /// write `R::default()` into `out`, but the contents of `out` must not be relied
 /// upon when this function returns `Err`.
@@ -321,9 +306,6 @@ where
     R: Copy + Default,
     F: FnMut(S::Item, bool) -> Option<R>,
 {
-    /// Bit-packs `is_none()` into `fail_bits` at lane position; the post-loop
-    /// `& src_chunk` filter drops null-lane fails. Returns `Some(failing_idx)` if
-    /// any *valid* lane failed in `[base, base+count)`.
     #[inline(always)]
     fn chunk<S, R, F>(
         values: &S,
@@ -385,12 +367,6 @@ where
 /// closure invocation is treated as "happened", regardless of whether the lane
 /// is null. Use this only when the input is known non-nullable.
 ///
-/// For nullable inputs where the closure is infallible (no overflow / no error
-/// branch), prefer [`map_with_mask`]; for nullable inputs with a fallible
-/// closure, prefer [`try_map_with_mask`] — both correctly suppress
-/// null-lane logic. This kernel exists for the narrow "no validity exists"
-/// case (non-nullable column, internal pipelines, etc.).
-///
 /// # Panics
 ///
 /// Panics if `out.len() != values.len()`.

From e9aac1d057c55ac263a686a916f22a0a00a1571d Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 16:58:03 +0100
Subject: [PATCH 14/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/arrays/primitive/compute/cast.rs      |   69 +-
 vortex-buffer/benches/add_checked.rs          |   20 +-
 vortex-buffer/benches/cast_to_indexed.rs      |   34 +-
 vortex-buffer/src/lane_ops_indexed.rs         | 1019 +++++++++--------
 4 files changed, 577 insertions(+), 565 deletions(-)

diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs
index 8242b5845bd..9aef97e6c9d 100644
--- a/vortex-array/src/arrays/primitive/compute/cast.rs
+++ b/vortex-array/src/arrays/primitive/compute/cast.rs
@@ -5,13 +5,9 @@ use num_traits::AsPrimitive;
 use num_traits::NumCast;
 use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
+use vortex_buffer::lane_ops_indexed::IndexedSinkExt;
+use vortex_buffer::lane_ops_indexed::IndexedSourceExt;
 use vortex_buffer::lane_ops_indexed::ReinterpretSink;
-use vortex_buffer::lane_ops_indexed::map_no_validity;
-use vortex_buffer::lane_ops_indexed::map_no_validity_in_place;
-use vortex_buffer::lane_ops_indexed::try_map_no_validity;
-use vortex_buffer::lane_ops_indexed::try_map_no_validity_in_place;
-use vortex_buffer::lane_ops_indexed::try_map_with_mask;
-use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
 use vortex_error::vortex_err;
@@ -174,10 +170,8 @@ where
         // (harmless: the result validity bitmap masks them downstream).
         return match owned {
             Some(mut buf) => {
-                map_no_validity_in_place(
-                    ReinterpretSink::<F, T>::new(buf.as_mut_slice()),
-                    |v: F| v.as_(),
-                );
+                ReinterpretSink::<F, T>::new(buf.as_mut_slice())
+                    .map_no_validity_in_place(|v: F| v.as_());
                 // SAFETY: same size + alignment for NativePType same-byte-width pairs;
                 // every F-slot was overwritten with a real `T` bit pattern.
                 let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
@@ -185,7 +179,7 @@ where
             }
             None => {
                 let mut buffer = BufferMut::<T>::with_capacity(len);
-                map_no_validity(values, &mut buffer.spare_capacity_mut()[..len], |v| v.as_());
+                values.map_no_validity(&mut buffer.spare_capacity_mut()[..len], |v| v.as_());
                 // SAFETY: map_no_validity initializes every lane.
                 unsafe { buffer.set_len(len) };
                 Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array())
@@ -197,11 +191,9 @@ where
 
     let buffer: Buffer<T> = match (&mask, owned) {
         (Mask::AllTrue(_), Some(mut buf)) => {
-            try_map_no_validity_in_place(
-                ReinterpretSink::<F, T>::new(buf.as_mut_slice()),
-                |v: F| <T as NumCast>::from(v),
-            )
-            .map_err(|_| overflow())?;
+            ReinterpretSink::<F, T>::new(buf.as_mut_slice())
+                .try_map_no_validity_in_place(|v: F| <T as NumCast>::from(v))
+                .map_err(|_| overflow())?;
             // SAFETY: same size + alignment for NativePType same-byte-width pairs;
             // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`.
             let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
@@ -209,10 +201,11 @@ where
         }
         (Mask::AllTrue(_), None) => {
             let mut buffer = BufferMut::<T>::with_capacity(len);
-            try_map_no_validity(values, &mut buffer.spare_capacity_mut()[..len], |v| {
-                <T as NumCast>::from(v)
-            })
-            .map_err(|_| overflow())?;
+            values
+                .try_map_no_validity(&mut buffer.spare_capacity_mut()[..len], |v| {
+                    <T as NumCast>::from(v)
+                })
+                .map_err(|_| overflow())?;
             // SAFETY: try_map_no_validity returned Ok, so it initialized every lane.
             unsafe { buffer.set_len(len) };
             buffer.freeze()
@@ -225,12 +218,11 @@ where
         }
         (Mask::AllFalse(_), None) => BufferMut::<T>::zeroed(len).freeze(),
         (Mask::Values(m), Some(mut buf)) => {
-            try_map_with_mask_in_place(
-                ReinterpretSink::<F, T>::new(buf.as_mut_slice()),
-                m.bit_buffer(),
-                |v: F, valid| <T as NumCast>::from(v).or_else(|| (!valid).then(T::zero)),
-            )
-            .map_err(|_| overflow())?;
+            ReinterpretSink::<F, T>::new(buf.as_mut_slice())
+                .try_map_with_mask_in_place(m.bit_buffer(), |v: F, valid| {
+                    <T as NumCast>::from(v).or_else(|| (!valid).then(T::zero))
+                })
+                .map_err(|_| overflow())?;
             // SAFETY: same size + alignment for NativePType same-byte-width pairs;
             // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`.
             let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
@@ -238,18 +230,19 @@ where
         }
         (Mask::Values(m), None) => {
             let mut buffer = BufferMut::<T>::with_capacity(len);
-            try_map_with_mask(
-                values,
-                m.bit_buffer(),
-                &mut buffer.spare_capacity_mut()[..len],
-                // Lazy validity: only consult `valid` on the failure branch. For widening /
-                // statically-infallible casts, `NumCast::from` is always `Some` so the
-                // `or_else` is provably dead — LLVM DCEs the validity path entirely, giving
-                // the same codegen as the maskless kernel. For narrowing, `valid` is only
-                // read at lanes that actually overflowed (a cold check on top of the cast).
-                |v, valid| <T as NumCast>::from(v).or_else(|| (!valid).then(T::zero)),
-            )
-            .map_err(|_| overflow())?;
+            values
+                .try_map_with_mask(
+                    m.bit_buffer(),
+                    &mut buffer.spare_capacity_mut()[..len],
+                    // Lazy validity: only consult `valid` on the failure branch. For widening /
+                    // statically-infallible casts, `NumCast::from` is always `Some` so the
+                    // `or_else` is provably dead — LLVM DCEs the validity path entirely,
+                    // giving the same codegen as the maskless kernel. For narrowing, `valid`
+                    // is only read at lanes that actually overflowed (a cold check on top of
+                    // the cast).
+                    |v, valid| <T as NumCast>::from(v).or_else(|| (!valid).then(T::zero)),
+                )
+                .map_err(|_| overflow())?;
             // SAFETY: try_map_with_mask returned Ok, so it initialized every lane.
             unsafe { buffer.set_len(len) };
             buffer.freeze()
diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs
index 5814e14262e..5c838479a13 100644
--- a/vortex-buffer/benches/add_checked.rs
+++ b/vortex-buffer/benches/add_checked.rs
@@ -20,8 +20,8 @@ use rand::prelude::*;
 use vortex_buffer::BitBuffer;
 use vortex_buffer::BitBufferMut;
 use vortex_buffer::Buffer;
+use vortex_buffer::lane_ops_indexed::IndexedSourceExt;
 use vortex_buffer::lane_ops_indexed::LaneZip;
-use vortex_buffer::lane_ops_indexed::try_map_with_mask;
 
 fn main() {
     assert_overflow_parity();
@@ -116,13 +116,11 @@ fn bitpack_value_only(bencher: Bencher, n: usize) {
         .bench_refs(|(lhs, rhs, lm, rm)| {
             let combined = lm as &BitBuffer & rm as &BitBuffer;
             let mut out = alloc_out(n);
-            try_map_with_mask(
-                LaneZip::new(lhs.as_slice(), rhs.as_slice()),
-                &combined,
-                out.as_mut_slice(),
-                |(a, b), _valid| a.checked_add(b),
-            )
-            .unwrap();
+            LaneZip::new(lhs.as_slice(), rhs.as_slice())
+                .try_map_with_mask(&combined, out.as_mut_slice(), |(a, b), _valid| {
+                    a.checked_add(b)
+                })
+                .unwrap();
             (combined, out)
         });
 }
@@ -146,8 +144,7 @@ fn assert_overflow_parity() {
     };
 
     let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
-    let r = try_map_with_mask(
-        LaneZip::new(lhs.as_slice(), rhs.as_slice()),
+    let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_with_mask(
         &mask,
         out.as_mut_slice(),
         |(a, b), _| a.checked_add(b),
@@ -171,8 +168,7 @@ fn assert_null_overflow_suppressed() {
     };
 
     let mut out = alloc_out(4);
-    let r = try_map_with_mask(
-        LaneZip::new(lhs.as_slice(), rhs.as_slice()),
+    let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_with_mask(
         &mask,
         out.as_mut_slice(),
         |(a, b), _| a.checked_add(b),
diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs
index 2751cdc8418..bcc30669ccb 100644
--- a/vortex-buffer/benches/cast_to_indexed.rs
+++ b/vortex-buffer/benches/cast_to_indexed.rs
@@ -23,12 +23,8 @@ use rand::rngs::StdRng;
 use vortex_buffer::BitBuffer;
 use vortex_buffer::BitBufferMut;
 use vortex_buffer::Buffer;
-use vortex_buffer::lane_ops_indexed::map_no_validity;
-use vortex_buffer::lane_ops_indexed::map_with_mask;
-use vortex_buffer::lane_ops_indexed::map_with_mask_in_place;
-use vortex_buffer::lane_ops_indexed::try_map_no_validity;
-use vortex_buffer::lane_ops_indexed::try_map_with_mask;
-use vortex_buffer::lane_ops_indexed::try_map_with_mask_in_place;
+use vortex_buffer::lane_ops_indexed::IndexedSinkExt;
+use vortex_buffer::lane_ops_indexed::IndexedSourceExt;
 
 fn main() {
     divan::main();
@@ -129,11 +125,9 @@ fn map_no_validity_widen_u16_u32(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u16.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mut out)| {
-            map_no_validity(
-                values.as_slice(),
-                out.as_mut_slice(),
-                <u32 as From<u16>>::from,
-            );
+            values
+                .as_slice()
+                .map_no_validity(out.as_mut_slice(), <u32 as From<u16>>::from);
             out
         });
 }
@@ -145,7 +139,7 @@ fn map_with_mask_widen_u16_u32_zero_nulls(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mask, mut out)| {
-            map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| {
+            values.as_slice().map_with_mask(&mask, out.as_mut_slice(), |v, valid| {
                 <u32 as From<u16>>::from(v) * valid as u32
             });
             out
@@ -159,7 +153,7 @@ fn try_map_no_validity_narrow_u64_u32(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u64.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mut out)| {
-            try_map_no_validity(values.as_slice(), out.as_mut_slice(), |v| {
+            values.as_slice().try_map_no_validity(out.as_mut_slice(), |v| {
                 <u32 as NumCast>::from(v)
             })
             .unwrap();
@@ -178,7 +172,7 @@ fn try_map_with_mask_narrow_u64_u32_ignoring_valid(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mask, mut out)| {
-            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| {
+            values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| {
                 <u32 as NumCast>::from(v)
             })
             .unwrap();
@@ -193,7 +187,7 @@ fn try_map_with_mask_narrow_u64_u32_lazy_validity(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mask, mut out)| {
-            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| {
+            values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| {
                 <u32 as NumCast>::from(v).or_else(|| (!valid).then(u32::default))
             })
             .unwrap();
@@ -218,7 +212,7 @@ fn try_map_with_mask_narrow_u64_u32_value_only_filtered(bencher: Bencher, n: usi
             )
         })
         .bench_values(|(values, mask, mut out)| {
-            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| {
+            values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| {
                 <u32 as NumCast>::from(v)
             })
             .unwrap();
@@ -233,7 +227,7 @@ fn try_map_with_mask_widen_u16_u32_or_else(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mask, mut out)| {
-            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, valid| {
+            values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| {
                 Some(<u32 as From<u16>>::from(v)).or_else(|| (!valid).then(u32::default))
             })
             .unwrap();
@@ -248,7 +242,7 @@ fn try_map_with_mask_widen_u16_u32_maskless(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mask, mut out)| {
-            try_map_with_mask(values.as_slice(), &mask, out.as_mut_slice(), |v, _valid| {
+            values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| {
                 Some(<u32 as From<u16>>::from(v))
             })
             .unwrap();
@@ -263,7 +257,7 @@ fn map_with_mask_in_place_u32_zero_nulls(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u32.as_slice().to_vec(), f.mask.clone()))
         .bench_values(|(mut values, mask)| {
-            map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| v * valid as u32);
+            values.as_mut_slice().map_with_mask_in_place(&mask, |v, valid| v * valid as u32);
             values
         });
 }
@@ -275,7 +269,7 @@ fn try_map_with_mask_in_place_u32_checked_mul(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u32_small.as_slice().to_vec(), f.mask.clone()))
         .bench_values(|(mut values, mask)| {
-            try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2))
+            values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2))
                 .unwrap();
             values
         });
diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs
index f3ade8eeda6..683a03c5539 100644
--- a/vortex-buffer/src/lane_ops_indexed.rs
+++ b/vortex-buffer/src/lane_ops_indexed.rs
@@ -214,277 +214,296 @@ impl<A: IndexedSource, B: IndexedSource> IndexedSource for LaneZip<A, B> {
     }
 }
 
-/// Apply `f(value, valid)` lane-by-lane, writing `out[i] = f(values[i], mask[i])`.
+/// Extension trait providing lane-kernel methods on any [`IndexedSource`].
 ///
-/// All three inputs must have the same length. The output type `R` may differ from the
-/// input type `T` — this kernel is the building block for both same-type transforms
-/// (fill_null) and cross-type ones (cast). The caller is responsible for marking `out`
-/// initialized (e.g. by calling `BufferMut::set_len` after this returns).
-///
-/// # Panics
-///
-/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
-#[inline]
-pub fn map_with_mask<S, R, F>(values: S, mask: &BitBuffer, out: &mut [MaybeUninit<R>], mut f: F)
-where
-    S: IndexedSource,
-    F: FnMut(S::Item, bool) -> R,
-{
-    /// Per-chunk worker. Called twice (literal `64` for full chunks, `remainder`
-    /// for the tail). `#[inline(always)]` preserves the const-64 unroll at the
-    /// full-chunk call site via constant propagation through inlining.
-    #[inline(always)]
-    fn chunk<S, R, F>(
-        values: &S,
-        out: &mut [MaybeUninit<R>],
-        f: &mut F,
-        src_chunk: u64,
-        base: usize,
-        count: usize,
-    ) where
-        S: IndexedSource,
-        F: FnMut(S::Item, bool) -> R,
+/// All methods have default implementations and are inherited via the blanket
+/// `impl<S: IndexedSource> IndexedSourceExt for S` below. Bring the trait into
+/// scope (`use vortex_buffer::lane_ops_indexed::IndexedSourceExt;`) to call
+/// them with method syntax: `values.try_map_with_mask(&mask, &mut out, f)`.
+pub trait IndexedSourceExt: IndexedSource + Sized {
+    /// Apply `f(value, valid)` lane-by-lane, writing `out[i] = f(self[i], mask[i])`.
+    ///
+    /// All three inputs must have the same length. The output type `R` may differ from
+    /// the input type — this kernel is the building block for both same-type transforms
+    /// (fill_null) and cross-type ones (cast). The caller is responsible for marking
+    /// `out` initialized (e.g. by calling `BufferMut::set_len` after this returns).
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self.len() != mask.len()` or `out.len() != self.len()`.
+    #[inline]
+    fn map_with_mask<R, F>(self, mask: &BitBuffer, out: &mut [MaybeUninit<R>], mut f: F)
+    where
+        F: FnMut(Self::Item, bool) -> R,
     {
-        for bit_idx in 0..count {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: caller guarantees base + count <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
+        /// Per-chunk worker. Called twice (literal `64` for full chunks, `remainder`
+        /// for the tail). `#[inline(always)]` preserves the const-64 unroll at the
+        /// full-chunk call site via constant propagation through inlining.
+        #[inline(always)]
+        fn chunk<S, R, F>(
+            values: &S,
+            out: &mut [MaybeUninit<R>],
+            f: &mut F,
+            src_chunk: u64,
+            base: usize,
+            count: usize,
+        ) where
+            S: IndexedSource,
+            F: FnMut(S::Item, bool) -> R,
+        {
+            for bit_idx in 0..count {
+                let i = base + bit_idx;
+                let bit = (src_chunk >> bit_idx) & 1 == 1;
+                // SAFETY: caller guarantees base + count <= len.
+                let v = unsafe { values.get_unchecked(i) };
+                unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
+            }
         }
-    }
 
-    let len = values.len();
-    assert_eq!(len, mask.len(), "values and mask must have the same length");
-    assert_eq!(out.len(), len, "out must have the same length as values");
+        let values = self;
+        let len = values.len();
+        assert_eq!(len, mask.len(), "values and mask must have the same length");
+        assert_eq!(out.len(), len, "out must have the same length as values");
 
-    let chunks = mask.chunks();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
+        let chunks = mask.chunks();
+        let chunks_count = len / 64;
+        let remainder = len % 64;
 
-    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64);
-    }
-    if remainder != 0 {
-        chunk(
-            &values,
-            out,
-            &mut f,
-            chunks.remainder_bits(),
-            chunks_count * 64,
-            remainder,
-        );
+        for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+            chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64);
+        }
+        if remainder != 0 {
+            chunk(
+                &values,
+                out,
+                &mut f,
+                chunks.remainder_bits(),
+                chunks_count * 64,
+                remainder,
+            );
+        }
     }
-}
 
-/// Fallible variant of [`map_with_mask`]. `f` returns `Option<R>`; `None` indicates a
-/// per-lane failure (e.g. range overflow on a narrowing cast).
-///
-/// **Null-lane failures are filtered automatically.** If a null lane's stored value
-/// causes `f(v, false)` to return `None`, the kernel does *not* propagate that as
-/// `Err`. The per-lane `is_none()` flags are bit-packed into a `u64` at the lane's
-/// position, then ANDed with the chunk's validity bitmap — null-lane bits vanish.
-/// The closure may also explicitly suppress null-lane failures by branching on
-/// `valid` itself; both behaviors compose.
-///
-/// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None`
-/// write `R::default()` into `out`, but the contents of `out` must not be relied
-/// upon when this function returns `Err`.
-///
-/// # Panics
-///
-/// Panics if `values.len() != mask.len()` or `out.len() != values.len()`.
-#[inline]
-pub fn try_map_with_mask<S, R, F>(
-    values: S,
-    mask: &BitBuffer,
-    out: &mut [MaybeUninit<R>],
-    mut f: F,
-) -> Result<(), usize>
-where
-    S: IndexedSource,
-    R: Copy + Default,
-    F: FnMut(S::Item, bool) -> Option<R>,
-{
-    #[inline(always)]
-    fn chunk<S, R, F>(
-        values: &S,
+    /// Fallible variant of [`map_with_mask`]. `f` returns `Option<R>`; `None`
+    /// indicates a per-lane failure (e.g. range overflow on a narrowing cast).
+    ///
+    /// **Null-lane failures are filtered automatically.** If a null lane's stored
+    /// value causes `f(v, false)` to return `None`, the kernel does *not* propagate
+    /// that as `Err`. The per-lane `is_none()` flags are bit-packed into a `u64` at
+    /// the lane's position, then ANDed with the chunk's validity bitmap — null-lane
+    /// bits vanish. The closure may also explicitly suppress null-lane failures by
+    /// branching on `valid` itself; both behaviors compose.
+    ///
+    /// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None`
+    /// write `R::default()` into `out`, but the contents of `out` must not be relied
+    /// upon when this function returns `Err`.
+    ///
+    /// [`map_with_mask`]: IndexedSourceExt::map_with_mask
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self.len() != mask.len()` or `out.len() != self.len()`.
+    #[inline]
+    fn try_map_with_mask<R, F>(
+        self,
+        mask: &BitBuffer,
         out: &mut [MaybeUninit<R>],
-        f: &mut F,
-        src_chunk: u64,
-        base: usize,
-        count: usize,
-    ) -> Option<usize>
+        mut f: F,
+    ) -> Result<(), usize>
     where
-        S: IndexedSource,
         R: Copy + Default,
-        F: FnMut(S::Item, bool) -> Option<R>,
+        F: FnMut(Self::Item, bool) -> Option<R>,
     {
-        let mut fail_bits: u64 = 0;
-        for bit_idx in 0..count {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: caller guarantees base + count <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            let opt = f(v, bit);
-            fail_bits |= (opt.is_none() as u64) << bit_idx;
-            let r = opt.unwrap_or_default();
-            unsafe { out.get_unchecked_mut(i).write(r) };
+        #[inline(always)]
+        fn chunk<S, R, F>(
+            values: &S,
+            out: &mut [MaybeUninit<R>],
+            f: &mut F,
+            src_chunk: u64,
+            base: usize,
+            count: usize,
+        ) -> Option<usize>
+        where
+            S: IndexedSource,
+            R: Copy + Default,
+            F: FnMut(S::Item, bool) -> Option<R>,
+        {
+            let mut fail_bits: u64 = 0;
+            for bit_idx in 0..count {
+                let i = base + bit_idx;
+                let bit = (src_chunk >> bit_idx) & 1 == 1;
+                // SAFETY: caller guarantees base + count <= len.
+                let v = unsafe { values.get_unchecked(i) };
+                let opt = f(v, bit);
+                fail_bits |= (opt.is_none() as u64) << bit_idx;
+                let r = opt.unwrap_or_default();
+                unsafe { out.get_unchecked_mut(i).write(r) };
+            }
+            let valid_failures = fail_bits & src_chunk;
+            (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize)
         }
-        let valid_failures = fail_bits & src_chunk;
-        (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize)
-    }
 
-    let len = values.len();
-    assert_eq!(len, mask.len(), "values and mask must have the same length");
-    assert_eq!(out.len(), len, "out must have the same length as values");
+        let values = self;
+        let len = values.len();
+        assert_eq!(len, mask.len(), "values and mask must have the same length");
+        assert_eq!(out.len(), len, "out must have the same length as values");
 
-    let chunks = mask.chunks();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
+        let chunks = mask.chunks();
+        let chunks_count = len / 64;
+        let remainder = len % 64;
 
-    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        if let Some(idx) = chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64) {
+        for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+            if let Some(idx) = chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64) {
+                return Err(idx);
+            }
+        }
+        if remainder != 0
+            && let Some(idx) = chunk(
+                &values,
+                out,
+                &mut f,
+                chunks.remainder_bits(),
+                chunks_count * 64,
+                remainder,
+            )
+        {
             return Err(idx);
         }
+        Ok(())
     }
-    if remainder != 0
-        && let Some(idx) = chunk(
-            &values,
-            out,
-            &mut f,
-            chunks.remainder_bits(),
-            chunks_count * 64,
-            remainder,
-        )
-    {
-        return Err(idx);
-    }
-    Ok(())
-}
 
-/// Apply `f(value)` lane-by-lane with **no validity awareness at all** — every
-/// closure invocation is treated as "happened", regardless of whether the lane
-/// is null. Use this only when the input is known non-nullable.
-///
-/// # Panics
-///
-/// Panics if `out.len() != values.len()`.
-#[inline]
-pub fn map_no_validity<S, R, F>(values: S, out: &mut [MaybeUninit<R>], mut f: F)
-where
-    S: IndexedSource,
-    F: FnMut(S::Item) -> R,
-{
-    #[inline(always)]
-    fn chunk<S, R, F>(values: &S, out: &mut [MaybeUninit<R>], f: &mut F, base: usize, count: usize)
+    /// Apply `f(value)` lane-by-lane with **no validity awareness at all** — every
+    /// closure invocation is treated as "happened", regardless of whether the lane
+    /// is null. Use this only when the input is known non-nullable.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `out.len() != self.len()`.
+    #[inline]
+    fn map_no_validity<R, F>(self, out: &mut [MaybeUninit<R>], mut f: F)
     where
-        S: IndexedSource,
-        F: FnMut(S::Item) -> R,
+        F: FnMut(Self::Item) -> R,
     {
-        for bit_idx in 0..count {
-            let i = base + bit_idx;
-            // SAFETY: caller guarantees base + count <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            unsafe { out.get_unchecked_mut(i).write(f(v)) };
+        #[inline(always)]
+        fn chunk<S, R, F>(
+            values: &S,
+            out: &mut [MaybeUninit<R>],
+            f: &mut F,
+            base: usize,
+            count: usize,
+        ) where
+            S: IndexedSource,
+            F: FnMut(S::Item) -> R,
+        {
+            for bit_idx in 0..count {
+                let i = base + bit_idx;
+                // SAFETY: caller guarantees base + count <= len.
+                let v = unsafe { values.get_unchecked(i) };
+                unsafe { out.get_unchecked_mut(i).write(f(v)) };
+            }
         }
-    }
 
-    let len = values.len();
-    assert_eq!(out.len(), len, "out must have the same length as values");
+        let values = self;
+        let len = values.len();
+        assert_eq!(out.len(), len, "out must have the same length as values");
 
-    let chunks_count = len / 64;
-    let remainder = len % 64;
+        let chunks_count = len / 64;
+        let remainder = len % 64;
 
-    for chunk_idx in 0..chunks_count {
-        chunk(&values, out, &mut f, chunk_idx * 64, 64);
-    }
-    if remainder != 0 {
-        chunk(&values, out, &mut f, chunks_count * 64, remainder);
+        for chunk_idx in 0..chunks_count {
+            chunk(&values, out, &mut f, chunk_idx * 64, 64);
+        }
+        if remainder != 0 {
+            chunk(&values, out, &mut f, chunks_count * 64, remainder);
+        }
     }
-}
 
-/// Fallible map with **no validity awareness at all** — every `None` returned
-/// by the closure is treated as a failure, even at null lanes.
-///
-/// # Use this only for non-nullable inputs.
-///
-/// For nullable inputs with a fallible closure, use
-/// [`try_map_with_mask`] — it has the same value-only closure shape
-/// (and the same perf win) but **correctly suppresses null-lane failures**
-/// via per-chunk `fail_bits & mask_chunk`.
-///
-/// Using this kernel on a nullable input where a null lane's stored value
-/// would cause `f` to return `None` will produce a spurious `Err`. This is a
-/// correctness footgun on purpose — the name and this doc are how the API
-/// signals "you must know your input has no nulls."
-///
-/// On failure returns `Err(failing_lane_index)`.
-///
-/// # Panics
-///
-/// Panics if `out.len() != values.len()`.
-#[inline]
-pub fn try_map_no_validity<S, R, F>(
-    values: S,
-    out: &mut [MaybeUninit<R>],
-    mut f: F,
-) -> Result<(), usize>
-where
-    S: IndexedSource,
-    R: Copy + Default,
-    F: FnMut(S::Item) -> Option<R>,
-{
-    /// Returns `true` if any lane in `[base, base+count)` failed (OR-reduced);
-    /// the cold attribution path is called at the kernel level so it can be
-    /// inlined separately for full vs remainder.
-    #[inline(always)]
-    fn chunk<S, R, F>(
-        values: &S,
+    /// Fallible map with **no validity awareness at all** — every `None` returned
+    /// by the closure is treated as a failure, even at null lanes.
+    ///
+    /// # Use this only for non-nullable inputs.
+    ///
+    /// For nullable inputs with a fallible closure, use [`try_map_with_mask`] —
+    /// it has the same value-only closure shape (and the same perf win) but
+    /// **correctly suppresses null-lane failures** via per-chunk
+    /// `fail_bits & mask_chunk`.
+    ///
+    /// Using this kernel on a nullable input where a null lane's stored value
+    /// would cause `f` to return `None` will produce a spurious `Err`. This is a
+    /// correctness footgun on purpose — the name and this doc are how the API
+    /// signals "you must know your input has no nulls."
+    ///
+    /// On failure returns `Err(failing_lane_index)`.
+    ///
+    /// [`try_map_with_mask`]: IndexedSourceExt::try_map_with_mask
+    ///
+    /// # Panics
+    ///
+    /// Panics if `out.len() != self.len()`.
+    #[inline]
+    fn try_map_no_validity<R, F>(
+        self,
         out: &mut [MaybeUninit<R>],
-        f: &mut F,
-        base: usize,
-        count: usize,
-    ) -> bool
+        mut f: F,
+    ) -> Result<(), usize>
     where
-        S: IndexedSource,
         R: Copy + Default,
-        F: FnMut(S::Item) -> Option<R>,
+        F: FnMut(Self::Item) -> Option<R>,
     {
-        let mut fail_acc: u64 = 0;
-        for bit_idx in 0..count {
-            let i = base + bit_idx;
-            // SAFETY: caller guarantees base + count <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            let opt = f(v);
-            fail_acc |= opt.is_none() as u64;
-            let r = opt.unwrap_or_default();
-            unsafe { out.get_unchecked_mut(i).write(r) };
+        /// Returns `true` if any lane in `[base, base+count)` failed (OR-reduced);
+        /// the cold attribution path is called at the kernel level so it can be
+        /// inlined separately for full vs remainder.
+        #[inline(always)]
+        fn chunk<S, R, F>(
+            values: &S,
+            out: &mut [MaybeUninit<R>],
+            f: &mut F,
+            base: usize,
+            count: usize,
+        ) -> bool
+        where
+            S: IndexedSource,
+            R: Copy + Default,
+            F: FnMut(S::Item) -> Option<R>,
+        {
+            let mut fail_acc: u64 = 0;
+            for bit_idx in 0..count {
+                let i = base + bit_idx;
+                // SAFETY: caller guarantees base + count <= len.
+                let v = unsafe { values.get_unchecked(i) };
+                let opt = f(v);
+                fail_acc |= opt.is_none() as u64;
+                let r = opt.unwrap_or_default();
+                unsafe { out.get_unchecked_mut(i).write(r) };
+            }
+            fail_acc != 0
         }
-        fail_acc != 0
-    }
 
-    let len = values.len();
-    assert_eq!(out.len(), len, "out must have the same length as values");
+        let values = self;
+        let len = values.len();
+        assert_eq!(out.len(), len, "out must have the same length as values");
 
-    let chunks_count = len / 64;
-    let remainder = len % 64;
+        let chunks_count = len / 64;
+        let remainder = len % 64;
 
-    for chunk_idx in 0..chunks_count {
-        let base = chunk_idx * 64;
-        if chunk(&values, out, &mut f, base, 64) {
-            return Err(attribute_failure_no_mask(&values, base, 64, &mut f));
+        for chunk_idx in 0..chunks_count {
+            let base = chunk_idx * 64;
+            if chunk(&values, out, &mut f, base, 64) {
+                return Err(attribute_failure_no_mask(&values, base, 64, &mut f));
+            }
         }
-    }
-    if remainder != 0 {
-        let base = chunks_count * 64;
-        if chunk(&values, out, &mut f, base, remainder) {
-            return Err(attribute_failure_no_mask(&values, base, remainder, &mut f));
+        if remainder != 0 {
+            let base = chunks_count * 64;
+            if chunk(&values, out, &mut f, base, remainder) {
+                return Err(attribute_failure_no_mask(&values, base, remainder, &mut f));
+            }
         }
+        Ok(())
     }
-    Ok(())
 }
 
+impl<S: IndexedSource> IndexedSourceExt for S {}
+
 /// Shared cold scan: walks a chunk, returns the first lane index where
 /// `lane_fails(bit_idx, value)` returns `true`. Used by
 /// [`attribute_failure_no_mask`].
@@ -523,264 +542,277 @@ where
     cold_scan(values, base, chunk_len, |_bit_idx, v| f(v).is_none())
 }
 
-/// In-place variant of [`map_no_validity`]. Each lane is replaced with `f(values[i])`.
-/// The source `S` must be writable (an [`IndexedSink`]).
-///
-/// The closure reads `S::Item` and returns `S::Write`. For the common case
-/// `S = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and write
-/// types can differ (e.g. read `f32`, write `u32`) over the same backing memory
-/// when sizes and alignments match.
+/// Extension trait providing in-place lane-kernel methods on any [`IndexedSink`].
 ///
-/// As with [`map_no_validity`], use this only when the input is known
-/// non-nullable.
-#[inline]
-pub fn map_no_validity_in_place<S, F>(mut values: S, mut f: F)
-where
-    S: IndexedSink,
-    F: FnMut(S::Item) -> S::Write,
-{
-    #[inline(always)]
-    fn chunk<S, F>(values: &mut S, f: &mut F, base: usize, count: usize)
+/// All methods have default implementations and are inherited via the blanket
+/// `impl<S: IndexedSink> IndexedSinkExt for S` below. Bring the trait into scope
+/// (`use vortex_buffer::lane_ops_indexed::IndexedSinkExt;`) to call them with
+/// method syntax.
+pub trait IndexedSinkExt: IndexedSink + Sized {
+    /// In-place counterpart of [`IndexedSourceExt::map_no_validity`]. Each lane
+    /// is replaced with `f(self[i])`.
+    ///
+    /// The closure reads `Self::Item` and returns `Self::Write`. For the common
+    /// case `Self = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and
+    /// write types can differ (e.g. read `f32`, write `u32`) over the same
+    /// backing memory when sizes and alignments match.
+    ///
+    /// As with [`IndexedSourceExt::map_no_validity`], use this only when the
+    /// input is known non-nullable.
+    #[inline]
+    fn map_no_validity_in_place<F>(self, mut f: F)
     where
-        S: IndexedSink,
-        F: FnMut(S::Item) -> S::Write,
+        F: FnMut(Self::Item) -> Self::Write,
     {
-        for bit_idx in 0..count {
-            let i = base + bit_idx;
-            // SAFETY: caller guarantees base + count <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            let r = f(v);
-            // SAFETY: caller guarantees base + count <= len.
-            unsafe { values.set_unchecked(i, r) };
+        #[inline(always)]
+        fn chunk<S, F>(values: &mut S, f: &mut F, base: usize, count: usize)
+        where
+            S: IndexedSink,
+            F: FnMut(S::Item) -> S::Write,
+        {
+            for bit_idx in 0..count {
+                let i = base + bit_idx;
+                // SAFETY: caller guarantees base + count <= len.
+                let v = unsafe { values.get_unchecked(i) };
+                let r = f(v);
+                // SAFETY: caller guarantees base + count <= len.
+                unsafe { values.set_unchecked(i, r) };
+            }
         }
-    }
 
-    let len = values.len();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
+        let mut values = self;
+        let len = values.len();
+        let chunks_count = len / 64;
+        let remainder = len % 64;
 
-    for chunk_idx in 0..chunks_count {
-        chunk(&mut values, &mut f, chunk_idx * 64, 64);
-    }
-    if remainder != 0 {
-        chunk(&mut values, &mut f, chunks_count * 64, remainder);
+        for chunk_idx in 0..chunks_count {
+            chunk(&mut values, &mut f, chunk_idx * 64, 64);
+        }
+        if remainder != 0 {
+            chunk(&mut values, &mut f, chunks_count * 64, remainder);
+        }
     }
-}
 
-/// In-place variant of [`try_map_no_validity`]. Each lane is replaced with
-/// `f(values[i])`, or `S::Write::default()` when `f` returns `None`. On failure
-/// returns `Err(first_failing_lane)`; the buffer state on `Err` is unspecified.
-///
-/// As with [`try_map_no_validity`], use this only when the input is known
-/// non-nullable — a `None` from `f` is treated as a failure regardless of any
-/// upstream validity bitmap.
-///
-/// ## Error attribution
-///
-/// Per-lane `is_none()` flags are folded into `first_fail` via the same
-/// branchless `min` scheme as [`try_map_with_mask_in_place`]. Cold replay
-/// isn't viable here because the original input values have already been
-/// overwritten by the time we'd attribute the failure.
-#[inline]
-#[allow(clippy::cast_possible_truncation)]
-pub fn try_map_no_validity_in_place<S, F>(mut values: S, mut f: F) -> Result<(), usize>
-where
-    S: IndexedSink,
-    S::Write: Default,
-    F: FnMut(S::Item) -> Option<S::Write>,
-{
-    #[inline(always)]
+    /// In-place counterpart of [`IndexedSourceExt::try_map_no_validity`]. Each
+    /// lane is replaced with `f(self[i])`, or `Self::Write::default()` when `f`
+    /// returns `None`. On failure returns `Err(first_failing_lane)`; the buffer
+    /// state on `Err` is unspecified.
+    ///
+    /// As with [`IndexedSourceExt::try_map_no_validity`], use this only when the
+    /// input is known non-nullable — a `None` from `f` is treated as a failure
+    /// regardless of any upstream validity bitmap.
+    ///
+    /// ## Error attribution
+    ///
+    /// Per-lane `is_none()` flags are folded into `first_fail` via the same
+    /// branchless `min` scheme as [`try_map_with_mask_in_place`], so lane indices
+    /// are assumed to fit below `u32::MAX`. Cold replay isn't viable here because the
+    /// original input values have already been overwritten by the time we'd attribute the failure.
+    ///
+    /// [`try_map_with_mask_in_place`]: IndexedSinkExt::try_map_with_mask_in_place
+    #[inline]
     #[allow(clippy::cast_possible_truncation)]
-    fn chunk<S, F>(values: &mut S, base: usize, count: usize, f: &mut F) -> Option<u32>
+    fn try_map_no_validity_in_place<F>(self, mut f: F) -> Result<(), usize>
     where
-        S: IndexedSink,
-        S::Write: Default,
-        F: FnMut(S::Item) -> Option<S::Write>,
+        Self::Write: Default,
+        F: FnMut(Self::Item) -> Option<Self::Write>,
     {
-        let mut first_fail: u32 = u32::MAX;
-        for bit_idx in 0..count {
-            let i = base + bit_idx;
-            // SAFETY: caller guarantees base + count <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            let opt = f(v);
-            let candidate = if opt.is_none() { i as u32 } else { u32::MAX };
-            first_fail = first_fail.min(candidate);
-            let r = opt.unwrap_or_default();
-            // SAFETY: caller guarantees base + count <= len.
-            unsafe { values.set_unchecked(i, r) };
+        #[inline(always)]
+        #[allow(clippy::cast_possible_truncation)]
+        fn chunk<S, F>(values: &mut S, base: usize, count: usize, f: &mut F) -> Option<u32>
+        where
+            S: IndexedSink,
+            S::Write: Default,
+            F: FnMut(S::Item) -> Option<S::Write>,
+        {
+            let mut first_fail: u32 = u32::MAX;
+            for bit_idx in 0..count {
+                let i = base + bit_idx;
+                // SAFETY: caller guarantees base + count <= len.
+                let v = unsafe { values.get_unchecked(i) };
+                let opt = f(v);
+                let candidate = if opt.is_none() { i as u32 } else { u32::MAX };
+                first_fail = first_fail.min(candidate);
+                let r = opt.unwrap_or_default();
+                // SAFETY: caller guarantees base + count <= len.
+                unsafe { values.set_unchecked(i, r) };
+            }
+            (first_fail != u32::MAX).then_some(first_fail)
         }
-        (first_fail != u32::MAX).then_some(first_fail)
-    }
 
-    let len = values.len();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
+        let mut values = self;
+        let len = values.len();
+        let chunks_count = len / 64;
+        let remainder = len % 64;
 
-    for chunk_idx in 0..chunks_count {
-        if let Some(failing) = chunk(&mut values, chunk_idx * 64, 64, &mut f) {
+        for chunk_idx in 0..chunks_count {
+            if let Some(failing) = chunk(&mut values, chunk_idx * 64, 64, &mut f) {
+                return Err(failing as usize);
+            }
+        }
+        if remainder != 0
+            && let Some(failing) = chunk(&mut values, chunks_count * 64, remainder, &mut f)
+        {
             return Err(failing as usize);
         }
+        Ok(())
     }
-    if remainder != 0
-        && let Some(failing) = chunk(&mut values, chunks_count * 64, remainder, &mut f)
-    {
-        return Err(failing as usize);
-    }
-    Ok(())
-}
 
-/// In-place variant of [`map_with_mask`]. Each lane is replaced with
-/// `f(values[i], mask[i])`. The source `S` must be writable (an [`IndexedSink`]).
-///
-/// The closure reads `S::Item` and returns `S::Write`. For the common case
-/// `S = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and write
-/// types can differ (e.g. read `f32`, write `u32`) over the same backing
-/// memory when sizes and alignments match.
-///
-/// # Panics
-///
-/// Panics if `values.len() != mask.len()`.
-#[inline]
-pub fn map_with_mask_in_place<S, F>(mut values: S, mask: &BitBuffer, mut f: F)
-where
-    S: IndexedSink,
-    F: FnMut(S::Item, bool) -> S::Write,
-{
-    #[inline(always)]
-    fn chunk<S, F>(values: &mut S, f: &mut F, src_chunk: u64, base: usize, count: usize)
+    /// In-place counterpart of [`IndexedSourceExt::map_with_mask`]. Each lane
+    /// is replaced with `f(self[i], mask[i])`.
+    ///
+    /// The closure reads `Self::Item` and returns `Self::Write`. For the common
+    /// case `Self = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and
+    /// write types can differ (e.g. read `f32`, write `u32`) over the same
+    /// backing memory when sizes and alignments match.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self.len() != mask.len()`.
+    #[inline]
+    fn map_with_mask_in_place<F>(self, mask: &BitBuffer, mut f: F)
     where
-        S: IndexedSink,
-        F: FnMut(S::Item, bool) -> S::Write,
+        F: FnMut(Self::Item, bool) -> Self::Write,
     {
-        for bit_idx in 0..count {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: caller guarantees base + count <= len.
-            let v = unsafe { values.get_unchecked(i) };
-            let r = f(v, bit);
-            unsafe { values.set_unchecked(i, r) };
+        #[inline(always)]
+        fn chunk<S, F>(values: &mut S, f: &mut F, src_chunk: u64, base: usize, count: usize)
+        where
+            S: IndexedSink,
+            F: FnMut(S::Item, bool) -> S::Write,
+        {
+            for bit_idx in 0..count {
+                let i = base + bit_idx;
+                let bit = (src_chunk >> bit_idx) & 1 == 1;
+                // SAFETY: caller guarantees base + count <= len.
+                let v = unsafe { values.get_unchecked(i) };
+                let r = f(v, bit);
+                unsafe { values.set_unchecked(i, r) };
+            }
         }
-    }
 
-    let len = values.len();
-    assert_eq!(len, mask.len(), "values and mask must have the same length");
+        let mut values = self;
+        let len = values.len();
+        assert_eq!(len, mask.len(), "values and mask must have the same length");
 
-    let chunks = mask.chunks();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
+        let chunks = mask.chunks();
+        let chunks_count = len / 64;
+        let remainder = len % 64;
 
-    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        chunk(&mut values, &mut f, src_chunk, chunk_idx * 64, 64);
-    }
-    if remainder != 0 {
-        chunk(
-            &mut values,
-            &mut f,
-            chunks.remainder_bits(),
-            chunks_count * 64,
-            remainder,
-        );
+        for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+            chunk(&mut values, &mut f, src_chunk, chunk_idx * 64, 64);
+        }
+        if remainder != 0 {
+            chunk(
+                &mut values,
+                &mut f,
+                chunks.remainder_bits(),
+                chunks_count * 64,
+                remainder,
+            );
+        }
     }
-}
 
-/// In-place variant of [`try_map_with_mask`]. Each lane of `values` is replaced
-/// with `f(values[i], mask[i])`, or `S::Item::default()` if `f` returned `None`.
-/// On failure returns `Err(first_failing_lane)`; lanes before that point have been
-/// written, and lanes within the failing chunk hold their unwrapped-or-default
-/// result. The buffer state on `Err` is intentionally unspecified.
-///
-/// ## Error attribution
-///
-/// Per-lane `is_none()` flags are folded into `first_fail` via a branchless
-/// `min` of `(if is_none { i as u32 } else { u32::MAX })`. After the 64-lane
-/// loop, `first_fail` holds the smallest failing index in the chunk (or `MAX`
-/// if no failure). Vectorizes to NEON `bsl.16b` + `umin.4s` on AArch64. The
-/// cold replay scheme used by [`try_map_with_mask`] isn't viable here because
-/// the original input values have already been overwritten by the time we
-/// would attribute the failure.
-///
-/// ## Why in-place is slower at cache-resident sizes
-///
-/// At sizes that fit in L1/L2 the in-place kernel is ~1.5× slower than the
-/// out-of-place kernel despite having half the memory traffic, because input
-/// and output share memory and the compiler must be conservative reordering
-/// loads/stores across iterations. At sizes that exceed L2 the in-place kernel
-/// wins back the gap by avoiding the second buffer's DRAM read+write traffic.
-///
-/// # Panics
-///
-/// Panics if `values.len() != mask.len()`.
-#[inline]
-#[allow(clippy::cast_possible_truncation)]
-pub fn try_map_with_mask_in_place<S, F>(
-    mut values: S,
-    mask: &BitBuffer,
-    mut f: F,
-) -> Result<(), usize>
-where
-    S: IndexedSink,
-    S::Write: Default,
-    F: FnMut(S::Item, bool) -> Option<S::Write>,
-{
-    /// Returns `Some(first_failing_lane_index_as_u32)` if any lane in
-    /// `[base, base+count)` failed (cast width-truncated since `i < 2^32` in any
-    /// realistic batch), else `None`. `#[inline(always)]` so the literal `64` at the
-    /// full-chunk call site enables const-propagation through inlining.
-    #[inline(always)]
+    /// In-place counterpart of [`IndexedSourceExt::try_map_with_mask`]. Each
+    /// lane of `self` is replaced with `f(self[i], mask[i])`, or
+    /// `Self::Write::default()` if `f` returned `None`. On failure returns
+    /// `Err(first_failing_lane)`; lanes before that point have been written,
+    /// and lanes within the failing chunk hold their unwrapped-or-default
+    /// result. The buffer state on `Err` is intentionally unspecified.
+    ///
+    /// ## Error attribution
+    ///
+    /// Per-lane `is_none()` flags are folded into `first_fail` via a branchless
+    /// `min` of `(if is_none { i as u32 } else { u32::MAX })`. After each chunk's
+    /// loop, `first_fail` holds the smallest failing index in the chunk (or
+    /// `u32::MAX` if no failure), so lane indices are assumed to fit below `u32::MAX`.
+    /// Vectorizes to NEON `bsl.16b` + `umin.4s` on AArch64. The cold replay scheme
+    /// used by [`try_map_with_mask`] isn't viable here because the original input
+    /// values have already been overwritten by the time we would attribute the failure.
+    ///
+    /// ## Why in-place is slower at cache-resident sizes
+    ///
+    /// At sizes that fit in L1/L2 the in-place kernel is ~1.5× slower than the
+    /// out-of-place kernel despite having half the memory traffic, because
+    /// input and output share memory and the compiler must be conservative
+    /// reordering loads/stores across iterations. At sizes that exceed L2 the
+    /// in-place kernel wins back the gap by avoiding the second buffer's DRAM
+    /// read+write traffic.
+    ///
+    /// [`try_map_with_mask`]: IndexedSourceExt::try_map_with_mask
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self.len() != mask.len()`.
+    #[inline]
     #[allow(clippy::cast_possible_truncation)]
-    fn chunk<S, F>(
-        values: &mut S,
-        src_chunk: u64,
-        base: usize,
-        count: usize,
-        f: &mut F,
-    ) -> Option<u32>
+    fn try_map_with_mask_in_place<F>(
+        self,
+        mask: &BitBuffer,
+        mut f: F,
+    ) -> Result<(), usize>
     where
-        S: IndexedSink,
-        S::Write: Default,
-        F: FnMut(S::Item, bool) -> Option<S::Write>,
+        Self::Write: Default,
+        F: FnMut(Self::Item, bool) -> Option<Self::Write>,
     {
-        let mut first_fail: u32 = u32::MAX;
-        for bit_idx in 0..count {
-            let i = base + bit_idx;
-            let bit = (src_chunk >> bit_idx) & 1 == 1;
-            // SAFETY: caller guarantees `base + count <= values.len()`.
-            let v = unsafe { values.get_unchecked(i) };
-            let opt = f(v, bit);
-            let candidate = if opt.is_none() { i as u32 } else { u32::MAX };
-            first_fail = first_fail.min(candidate);
-            let r = opt.unwrap_or_default();
-            unsafe { values.set_unchecked(i, r) };
+        #[inline(always)]
+        #[allow(clippy::cast_possible_truncation)]
+        fn chunk<S, F>(
+            values: &mut S,
+            src_chunk: u64,
+            base: usize,
+            count: usize,
+            f: &mut F,
+        ) -> Option<u32>
+        where
+            S: IndexedSink,
+            S::Write: Default,
+            F: FnMut(S::Item, bool) -> Option<S::Write>,
+        {
+            let mut first_fail: u32 = u32::MAX;
+            for bit_idx in 0..count {
+                let i = base + bit_idx;
+                let bit = (src_chunk >> bit_idx) & 1 == 1;
+                // SAFETY: caller guarantees `base + count <= values.len()`.
+                let v = unsafe { values.get_unchecked(i) };
+                let opt = f(v, bit);
+                let candidate = if opt.is_none() { i as u32 } else { u32::MAX };
+                first_fail = first_fail.min(candidate);
+                let r = opt.unwrap_or_default();
+                unsafe { values.set_unchecked(i, r) };
+            }
+            (first_fail != u32::MAX).then_some(first_fail)
         }
-        (first_fail != u32::MAX).then_some(first_fail)
-    }
 
-    let len = values.len();
-    assert_eq!(len, mask.len(), "values and mask must have the same length");
+        let mut values = self;
+        let len = values.len();
+        assert_eq!(len, mask.len(), "values and mask must have the same length");
 
-    let chunks = mask.chunks();
-    let chunks_count = len / 64;
-    let remainder = len % 64;
+        let chunks = mask.chunks();
+        let chunks_count = len / 64;
+        let remainder = len % 64;
 
-    for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-        if let Some(failing) = chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) {
+        for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+            if let Some(failing) = chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) {
+                return Err(failing as usize);
+            }
+        }
+        if remainder != 0
+            && let Some(failing) = chunk(
+                &mut values,
+                chunks.remainder_bits(),
+                chunks_count * 64,
+                remainder,
+                &mut f,
+            )
+        {
             return Err(failing as usize);
         }
+        Ok(())
     }
-    if remainder != 0
-        && let Some(failing) = chunk(
-            &mut values,
-            chunks.remainder_bits(),
-            chunks_count * 64,
-            remainder,
-            &mut f,
-        )
-    {
-        return Err(failing as usize);
-    }
-    Ok(())
 }
 
+impl<S: IndexedSink> IndexedSinkExt for S {}
+
 #[cfg(test)]
 #[allow(clippy::cast_possible_truncation)]
 mod tests {
@@ -803,7 +835,7 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<i32>::uninit(); 10];
-        map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+        values.as_slice().map_with_mask(&mask, &mut out, |v, valid| {
             if valid { v } else { -1 }
         });
         assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]);
@@ -815,7 +847,7 @@ mod tests {
         let values: Vec<i32> = (0..130).collect();
         let mask = BitBuffer::new_set(130);
         let mut out = vec![MaybeUninit::<i32>::uninit(); 130];
-        map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+        values.as_slice().map_with_mask(&mask, &mut out, |v, valid| {
             if valid { v + 1 } else { 0 }
         });
         let got = write_t(out);
@@ -836,7 +868,7 @@ mod tests {
 
         let values: Vec<u32> = (0..65).collect();
         let mut out = vec![MaybeUninit::<u32>::uninit(); 65];
-        map_with_mask(values.as_slice(), &sliced, &mut out, |v, valid| {
+        values.as_slice().map_with_mask(&sliced, &mut out, |v, valid| {
             if valid { v } else { u32::MAX }
         });
         let got = write_t(out);
@@ -855,7 +887,7 @@ mod tests {
 
         let values: Vec<i16> = (0..130).map(|i| i as i16).collect();
         let mut out = vec![MaybeUninit::<i16>::uninit(); 130];
-        map_with_mask(values.as_slice(), &sliced, &mut out, |v, valid| {
+        values.as_slice().map_with_mask(&sliced, &mut out, |v, valid| {
             if valid { v } else { -1 }
         });
         let got = write_t(out);
@@ -867,7 +899,7 @@ mod tests {
         let values: Vec<i32> = vec![];
         let mask = BitBuffer::new_unset(0);
         let mut out: Vec<MaybeUninit<i32>> = vec![];
-        map_with_mask(values.as_slice(), &mask, &mut out, |v, _| v);
+        values.as_slice().map_with_mask(&mask, &mut out, |v, _| v);
     }
 
     #[test]
@@ -882,7 +914,7 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<i64>::uninit(); 100];
-        map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+        values.as_slice().map_with_mask(&mask, &mut out, |v, valid| {
             v * (valid as i64)
         });
         let got = write_t(out);
@@ -900,7 +932,7 @@ mod tests {
         let values: Vec<u64> = (0..200).collect();
         let mask = BitBuffer::new_set(200);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
             let scaled = v * valid as u64;
             (scaled <= u32::MAX as u64).then_some(scaled as u32)
         });
@@ -916,7 +948,7 @@ mod tests {
         values[137] = (u32::MAX as u64) + 1;
         let mask = BitBuffer::new_set(200);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
             let scaled = v * valid as u64;
             (scaled <= u32::MAX as u64).then_some(scaled as u32)
         });
@@ -932,7 +964,7 @@ mod tests {
         values[137] = u64::MAX;
         let mask = BitBuffer::new_set(200);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
             let scaled = v * valid as u64;
             (scaled <= u32::MAX as u64).then_some(scaled as u32)
         });
@@ -954,7 +986,7 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, _valid| {
+        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, _valid| {
             (v <= u32::MAX as u64).then_some(v as u32)
         });
         assert!(
@@ -981,7 +1013,7 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, _valid| {
+        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, _valid| {
             (v <= u32::MAX as u64).then_some(v as u32)
         });
         assert_eq!(res, Err(77));
@@ -1001,7 +1033,7 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
             let scaled = v * valid as u64;
             (scaled <= u32::MAX as u64).then_some(scaled as u32)
         });
@@ -1026,12 +1058,12 @@ mod tests {
 
         let mut branchless = vec![MaybeUninit::<u32>::uninit(); 130];
         let mut branchful = vec![MaybeUninit::<u32>::uninit(); 130];
-        try_map_with_mask(values.as_slice(), &mask, &mut branchless, |v, valid| {
+        values.as_slice().try_map_with_mask(&mask, &mut branchless, |v, valid| {
             let scaled = v * valid as u64;
             (scaled <= u32::MAX as u64).then_some(scaled as u32)
         })
         .unwrap();
-        try_map_with_mask(values.as_slice(), &mask, &mut branchful, |v, valid| {
+        values.as_slice().try_map_with_mask(&mask, &mut branchful, |v, valid| {
             if valid {
                 u32::try_from(v).ok()
             } else {
@@ -1048,7 +1080,7 @@ mod tests {
         let values: Vec<u64> = (0..130).collect();
         let mask = BitBuffer::new_set(130);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
             let scaled = v * valid as u64;
             (scaled <= u32::MAX as u64).then_some(scaled as u32)
         });
@@ -1069,7 +1101,7 @@ mod tests {
 
         let values: Vec<u64> = (0..130).collect();
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
             let scaled = v * valid as u64;
             (scaled <= u32::MAX as u64).then_some(scaled as u32)
         });
@@ -1089,7 +1121,7 @@ mod tests {
         let mut values: Vec<u64> = (0..130).collect();
         values[77] = u64::MAX;
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
             let scaled = v * valid as u64;
             (scaled <= u32::MAX as u64).then_some(scaled as u32)
         });
@@ -1115,7 +1147,7 @@ mod tests {
         // Stuff in an overflowing value; it must be neutralized by `* valid as u64`.
         values[2] = u64::MAX;
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
             let scaled = v * valid as u64;
             (scaled <= u32::MAX as u64).then_some(scaled as u32)
         });
@@ -1129,7 +1161,7 @@ mod tests {
         values[129] = (u32::MAX as u64) + 1;
         let mask = BitBuffer::new_set(130);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = try_map_with_mask(values.as_slice(), &mask, &mut out, |v, valid| {
+        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
             let scaled = v * valid as u64;
             (scaled <= u32::MAX as u64).then_some(scaled as u32)
         });
@@ -1146,7 +1178,7 @@ mod tests {
             }
             m.freeze()
         };
-        map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| {
+        values.as_mut_slice().map_with_mask_in_place(&mask, |v, valid| {
             v.wrapping_mul(valid as u32)
         });
         let expected: Vec<u32> = (0..130u32)
@@ -1159,7 +1191,7 @@ mod tests {
     fn try_map_with_mask_in_place_all_ok() {
         let mut values: Vec<u32> = (0..200).collect();
         let mask = BitBuffer::new_set(200);
-        let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| {
+        let res = values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, valid| {
             let scaled = v.wrapping_mul(valid as u32);
             scaled.checked_mul(2)
         });
@@ -1175,7 +1207,7 @@ mod tests {
         values[150] = u32::MAX;
         let mask = BitBuffer::new_set(200);
         let res =
-            try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2));
+            values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
         assert_eq!(res, Err(83));
     }
 
@@ -1186,7 +1218,7 @@ mod tests {
         values[100] = u32::MAX;
         let mask = BitBuffer::new_set(200);
         let res =
-            try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2));
+            values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
         assert_eq!(res, Err(80));
     }
 
@@ -1196,7 +1228,7 @@ mod tests {
         values[42] = u32::MAX;
         let mask = BitBuffer::new_set(200);
         let res =
-            try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2));
+            values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
         assert_eq!(res, Err(42));
     }
 
@@ -1211,7 +1243,7 @@ mod tests {
             }
             m.freeze()
         };
-        let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, valid| {
+        let res = values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, valid| {
             v.wrapping_mul(valid as u32).checked_mul(2)
         });
         assert!(res.is_ok());
@@ -1225,7 +1257,7 @@ mod tests {
         values[129] = u32::MAX;
         let mask = BitBuffer::new_set(130);
         let res =
-            try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2));
+            values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
         assert_eq!(res, Err(129));
     }
 
@@ -1238,7 +1270,7 @@ mod tests {
         let mut values: Vec<u32> = (0..130).collect();
         values[77] = u32::MAX;
         let res =
-            try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| v.checked_mul(2));
+            values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
         assert_eq!(res, Err(77));
     }
 
@@ -1248,12 +1280,9 @@ mod tests {
         // should see exactly the bit patterns the closure produced.
         let mut buf: Vec<f32> = (0..130).map(|i| i as f32).collect();
         let mask = BitBuffer::new_set(130);
-        try_map_with_mask_in_place(
-            ReinterpretSink::<f32, u32>::new(buf.as_mut_slice()),
-            &mask,
-            |f, _valid| Some(f.to_bits().wrapping_add(1)),
-        )
-        .unwrap();
+        ReinterpretSink::<f32, u32>::new(buf.as_mut_slice())
+            .try_map_with_mask_in_place(&mask, |f, _valid| Some(f.to_bits().wrapping_add(1)))
+            .unwrap();
         // SAFETY: same size + alignment for f32 and u32; every slot now holds a u32 written by
         // the closure.
         let as_u32: &[u32] =
@@ -1268,17 +1297,17 @@ mod tests {
         // Closure fails at a specific lane; the kernel must report that lane index.
         let mut buf: Vec<f32> = (0..200).map(|i| i as f32).collect();
         let mask = BitBuffer::new_set(200);
-        let res = try_map_with_mask_in_place(
-            ReinterpretSink::<f32, u32>::new(buf.as_mut_slice()),
-            &mask,
-            |f, _valid| {
-                if f as u32 == 137 {
-                    None
-                } else {
-                    Some(f as u32)
-                }
-            },
-        );
+        let res =
+            ReinterpretSink::<f32, u32>::new(buf.as_mut_slice()).try_map_with_mask_in_place(
+                &mask,
+                |f, _valid| {
+                    if f as u32 == 137 {
+                        None
+                    } else {
+                        Some(f as u32)
+                    }
+                },
+            );
         assert_eq!(res, Err(137));
     }
 
@@ -1286,7 +1315,7 @@ mod tests {
     fn try_map_with_mask_in_place_partial_chunk_success() {
         let mut values: Vec<u32> = (0..130).collect();
         let mask = BitBuffer::new_set(130);
-        let res = try_map_with_mask_in_place(values.as_mut_slice(), &mask, |v, _valid| Some(v + 1));
+        let res = values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| Some(v + 1));
         assert!(res.is_ok());
         assert_eq!(values[0], 1);
         assert_eq!(values[63], 64);

From d8d5463edfa54ed37032867917ec381a347368dc Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 17:06:39 +0100
Subject: [PATCH 15/21] Reformat lane_ops_indexed and cast benches; simplify bench closures

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/benches/cast_primitive.rs   |  18 +-
 vortex-buffer/benches/cast_to_indexed.rs |  74 ++++---
 vortex-buffer/src/lane_ops_indexed.rs    | 256 +++++++++++++----------
 3 files changed, 194 insertions(+), 154 deletions(-)

diff --git a/vortex-array/benches/cast_primitive.rs b/vortex-array/benches/cast_primitive.rs
index 0b67571e93d..d4279993068 100644
--- a/vortex-array/benches/cast_primitive.rs
+++ b/vortex-array/benches/cast_primitive.rs
@@ -56,13 +56,9 @@ fn cast_u16_to_u32(bencher: Bencher) {
 #[divan::bench(args = SIZES)]
 fn cast_u32_to_u8(bencher: Bencher, n: usize) {
     let mut rng = StdRng::seed_from_u64(42);
-    #[expect(clippy::cast_possible_truncation)]
     let arr = PrimitiveArray::from_option_iter((0..n).map(|_| {
-        if rng.random_bool(0.7) {
-            Some(rng.random_range(0..u8::MAX) as u32)
-        } else {
-            None
-        }
+        rng.random_bool(0.7)
+            .then(|| rng.random_range(0..u8::MAX) as u32)
     }))
     .into_array();
     bencher.with_inputs(|| arr.clone()).bench_refs(|a| {
@@ -78,13 +74,9 @@ fn cast_u32_to_u8(bencher: Bencher, n: usize) {
 #[divan::bench(args = SIZES)]
 fn cast_i32_to_u32(bencher: Bencher, n: usize) {
     let mut rng = StdRng::seed_from_u64(42);
-    let arr = PrimitiveArray::from_option_iter((0..n).map(|_| {
-        if rng.random_bool(0.7) {
-            Some(rng.random_range(0..i32::MAX))
-        } else {
-            None
-        }
-    }))
+    let arr = PrimitiveArray::from_option_iter(
+        (0..n).map(|_| rng.random_bool(0.7).then(|| rng.random_range(0..i32::MAX))),
+    )
     .into_array();
     bencher.with_inputs(|| arr.clone()).bench_refs(|a| {
         #[expect(clippy::unwrap_used)]
diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs
index bcc30669ccb..dedddc1733a 100644
--- a/vortex-buffer/benches/cast_to_indexed.rs
+++ b/vortex-buffer/benches/cast_to_indexed.rs
@@ -139,9 +139,11 @@ fn map_with_mask_widen_u16_u32_zero_nulls(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mask, mut out)| {
-            values.as_slice().map_with_mask(&mask, out.as_mut_slice(), |v, valid| {
-                <u32 as From<u16>>::from(v) * valid as u32
-            });
+            values
+                .as_slice()
+                .map_with_mask(&mask, out.as_mut_slice(), |v, valid| {
+                    <u32 as From<u16>>::from(v) * valid as u32
+                });
             out
         });
 }
@@ -153,10 +155,10 @@ fn try_map_no_validity_narrow_u64_u32(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u64.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mut out)| {
-            values.as_slice().try_map_no_validity(out.as_mut_slice(), |v| {
-                <u32 as NumCast>::from(v)
-            })
-            .unwrap();
+            values
+                .as_slice()
+                .try_map_no_validity(out.as_mut_slice(), <u32 as NumCast>::from)
+                .unwrap();
             out
         });
 }
@@ -172,10 +174,12 @@ fn try_map_with_mask_narrow_u64_u32_ignoring_valid(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mask, mut out)| {
-            values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| {
-                <u32 as NumCast>::from(v)
-            })
-            .unwrap();
+            values
+                .as_slice()
+                .try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| {
+                    <u32 as NumCast>::from(v)
+                })
+                .unwrap();
             out
         });
 }
@@ -187,10 +191,12 @@ fn try_map_with_mask_narrow_u64_u32_lazy_validity(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mask, mut out)| {
-            values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| {
-                <u32 as NumCast>::from(v).or_else(|| (!valid).then(u32::default))
-            })
-            .unwrap();
+            values
+                .as_slice()
+                .try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| {
+                    <u32 as NumCast>::from(v).or_else(|| (!valid).then(u32::default))
+                })
+                .unwrap();
             out
         });
 }
@@ -212,10 +218,12 @@ fn try_map_with_mask_narrow_u64_u32_value_only_filtered(bencher: Bencher, n: usi
             )
         })
         .bench_values(|(values, mask, mut out)| {
-            values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| {
-                <u32 as NumCast>::from(v)
-            })
-            .unwrap();
+            values
+                .as_slice()
+                .try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| {
+                    <u32 as NumCast>::from(v)
+                })
+                .unwrap();
             out
         });
 }
@@ -227,10 +235,12 @@ fn try_map_with_mask_widen_u16_u32_or_else(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mask, mut out)| {
-            values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| {
-                Some(<u32 as From<u16>>::from(v)).or_else(|| (!valid).then(u32::default))
-            })
-            .unwrap();
+            values
+                .as_slice()
+                .try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| {
+                    Some(<u32 as From<u16>>::from(v)).or_else(|| (!valid).then(u32::default))
+                })
+                .unwrap();
             out
         });
 }
@@ -242,10 +252,12 @@ fn try_map_with_mask_widen_u16_u32_maskless(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mask, mut out)| {
-            values.as_slice().try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| {
-                Some(<u32 as From<u16>>::from(v))
-            })
-            .unwrap();
+            values
+                .as_slice()
+                .try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| {
+                    Some(<u32 as From<u16>>::from(v))
+                })
+                .unwrap();
             out
         });
 }
@@ -257,7 +269,9 @@ fn map_with_mask_in_place_u32_zero_nulls(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u32.as_slice().to_vec(), f.mask.clone()))
         .bench_values(|(mut values, mask)| {
-            values.as_mut_slice().map_with_mask_in_place(&mask, |v, valid| v * valid as u32);
+            values
+                .as_mut_slice()
+                .map_with_mask_in_place(&mask, |v, valid| v * valid as u32);
             values
         });
 }
@@ -269,7 +283,9 @@ fn try_map_with_mask_in_place_u32_checked_mul(bencher: Bencher, n: usize) {
     bencher
         .with_inputs(|| (f.values_u32_small.as_slice().to_vec(), f.mask.clone()))
         .bench_values(|(mut values, mask)| {
-            values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2))
+            values
+                .as_mut_slice()
+                .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2))
                 .unwrap();
             values
         });
diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs
index 683a03c5539..98d3edf5473 100644
--- a/vortex-buffer/src/lane_ops_indexed.rs
+++ b/vortex-buffer/src/lane_ops_indexed.rs
@@ -441,11 +441,7 @@ pub trait IndexedSourceExt: IndexedSource + Sized {
     ///
     /// Panics if `out.len() != self.len()`.
     #[inline]
-    fn try_map_no_validity<R, F>(
-        self,
-        out: &mut [MaybeUninit<R>],
-        mut f: F,
-    ) -> Result<(), usize>
+    fn try_map_no_validity<R, F>(self, out: &mut [MaybeUninit<R>], mut f: F) -> Result<(), usize>
     where
         R: Copy + Default,
         F: FnMut(Self::Item) -> Option<R>,
@@ -745,11 +741,7 @@ pub trait IndexedSinkExt: IndexedSink + Sized {
     /// Panics if `self.len() != mask.len()`.
     #[inline]
     #[allow(clippy::cast_possible_truncation)]
-    fn try_map_with_mask_in_place<F>(
-        self,
-        mask: &BitBuffer,
-        mut f: F,
-    ) -> Result<(), usize>
+    fn try_map_with_mask_in_place<F>(self, mask: &BitBuffer, mut f: F) -> Result<(), usize>
     where
         Self::Write: Default,
         F: FnMut(Self::Item, bool) -> Option<Self::Write>,
@@ -835,9 +827,9 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<i32>::uninit(); 10];
-        values.as_slice().map_with_mask(&mask, &mut out, |v, valid| {
-            if valid { v } else { -1 }
-        });
+        values
+            .as_slice()
+            .map_with_mask(&mask, &mut out, |v, valid| if valid { v } else { -1 });
         assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]);
     }
 
@@ -847,9 +839,9 @@ mod tests {
         let values: Vec<i32> = (0..130).collect();
         let mask = BitBuffer::new_set(130);
         let mut out = vec![MaybeUninit::<i32>::uninit(); 130];
-        values.as_slice().map_with_mask(&mask, &mut out, |v, valid| {
-            if valid { v + 1 } else { 0 }
-        });
+        values
+            .as_slice()
+            .map_with_mask(&mask, &mut out, |v, valid| if valid { v + 1 } else { 0 });
         let got = write_t(out);
         assert_eq!(got.len(), 130);
         assert_eq!(got[0], 1);
@@ -868,9 +860,13 @@ mod tests {
 
         let values: Vec<u32> = (0..65).collect();
         let mut out = vec![MaybeUninit::<u32>::uninit(); 65];
-        values.as_slice().map_with_mask(&sliced, &mut out, |v, valid| {
-            if valid { v } else { u32::MAX }
-        });
+        values.as_slice().map_with_mask(
+            &sliced,
+            &mut out,
+            |v, valid| {
+                if valid { v } else { u32::MAX }
+            },
+        );
         let got = write_t(out);
         assert_eq!(got, (0..65).collect::<Vec<u32>>());
     }
@@ -887,9 +883,9 @@ mod tests {
 
         let values: Vec<i16> = (0..130).map(|i| i as i16).collect();
         let mut out = vec![MaybeUninit::<i16>::uninit(); 130];
-        values.as_slice().map_with_mask(&sliced, &mut out, |v, valid| {
-            if valid { v } else { -1 }
-        });
+        values
+            .as_slice()
+            .map_with_mask(&sliced, &mut out, |v, valid| if valid { v } else { -1 });
         let got = write_t(out);
         assert_eq!(got, (0..130).map(|i| i as i16).collect::<Vec<_>>());
     }
@@ -914,9 +910,9 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<i64>::uninit(); 100];
-        values.as_slice().map_with_mask(&mask, &mut out, |v, valid| {
-            v * (valid as i64)
-        });
+        values
+            .as_slice()
+            .map_with_mask(&mask, &mut out, |v, valid| v * (valid as i64));
         let got = write_t(out);
         for (i, &x) in got.iter().enumerate() {
             if i % 3 == 0 {
@@ -932,10 +928,12 @@ mod tests {
         let values: Vec<u64> = (0..200).collect();
         let mask = BitBuffer::new_set(200);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
+        let res = values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut out, |v, valid| {
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            });
         assert!(res.is_ok());
         let got = write_t(out);
         assert_eq!(got, (0..200u32).collect::<Vec<_>>());
@@ -948,10 +946,12 @@ mod tests {
         values[137] = (u32::MAX as u64) + 1;
         let mask = BitBuffer::new_set(200);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
+        let res = values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut out, |v, valid| {
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            });
         assert_eq!(res, Err(137));
     }
 
@@ -964,10 +964,12 @@ mod tests {
         values[137] = u64::MAX;
         let mask = BitBuffer::new_set(200);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
+        let res = values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut out, |v, valid| {
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            });
         assert_eq!(res, Err(50));
     }
 
@@ -986,9 +988,11 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, _valid| {
-            (v <= u32::MAX as u64).then_some(v as u32)
-        });
+        let res = values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut out, |v, _valid| {
+                (v <= u32::MAX as u64).then_some(v as u32)
+            });
         assert!(
             res.is_ok(),
             "null-lane overflow should be filtered by the cold path"
@@ -1013,9 +1017,11 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, _valid| {
-            (v <= u32::MAX as u64).then_some(v as u32)
-        });
+        let res = values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut out, |v, _valid| {
+                (v <= u32::MAX as u64).then_some(v as u32)
+            });
         assert_eq!(res, Err(77));
     }
 
@@ -1033,10 +1039,12 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
+        let res = values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut out, |v, valid| {
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            });
         assert!(res.is_ok());
         let got = write_t(out);
         assert_eq!(got[5], 0); // null-lane wrote default
@@ -1058,19 +1066,23 @@ mod tests {
 
         let mut branchless = vec![MaybeUninit::<u32>::uninit(); 130];
         let mut branchful = vec![MaybeUninit::<u32>::uninit(); 130];
-        values.as_slice().try_map_with_mask(&mask, &mut branchless, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        })
-        .unwrap();
-        values.as_slice().try_map_with_mask(&mask, &mut branchful, |v, valid| {
-            if valid {
-                u32::try_from(v).ok()
-            } else {
-                Some(0)
-            }
-        })
-        .unwrap();
+        values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut branchless, |v, valid| {
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            })
+            .unwrap();
+        values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut branchful, |v, valid| {
+                if valid {
+                    u32::try_from(v).ok()
+                } else {
+                    Some(0)
+                }
+            })
+            .unwrap();
 
         assert_eq!(write_t(branchful), write_t(branchless));
     }
@@ -1080,10 +1092,12 @@ mod tests {
         let values: Vec<u64> = (0..130).collect();
         let mask = BitBuffer::new_set(130);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
+        let res = values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut out, |v, valid| {
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            });
         assert!(res.is_ok());
         let got = write_t(out);
         assert_eq!(got.len(), 130);
@@ -1101,10 +1115,12 @@ mod tests {
 
         let values: Vec<u64> = (0..130).collect();
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
+        let res = values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut out, |v, valid| {
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            });
         assert!(res.is_ok());
         let got = write_t(out);
         assert_eq!(got, (0..130u32).collect::<Vec<_>>());
@@ -1121,10 +1137,12 @@ mod tests {
         let mut values: Vec<u64> = (0..130).collect();
         values[77] = u64::MAX;
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
+        let res = values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut out, |v, valid| {
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            });
         assert_eq!(res, Err(77));
     }
 
@@ -1147,10 +1165,12 @@ mod tests {
         // Stuff in an overflowing value; it must be neutralized by `* valid as u64`.
         values[2] = u64::MAX;
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
+        let res = values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut out, |v, valid| {
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            });
         assert!(res.is_ok(), "null lane should bypass the range check");
     }
 
@@ -1161,10 +1181,12 @@ mod tests {
         values[129] = (u32::MAX as u64) + 1;
         let mask = BitBuffer::new_set(130);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = values.as_slice().try_map_with_mask(&mask, &mut out, |v, valid| {
-            let scaled = v * valid as u64;
-            (scaled <= u32::MAX as u64).then_some(scaled as u32)
-        });
+        let res = values
+            .as_slice()
+            .try_map_with_mask(&mask, &mut out, |v, valid| {
+                let scaled = v * valid as u64;
+                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            });
         assert_eq!(res, Err(129));
     }
 
@@ -1178,9 +1200,9 @@ mod tests {
             }
             m.freeze()
         };
-        values.as_mut_slice().map_with_mask_in_place(&mask, |v, valid| {
-            v.wrapping_mul(valid as u32)
-        });
+        values
+            .as_mut_slice()
+            .map_with_mask_in_place(&mask, |v, valid| v.wrapping_mul(valid as u32));
         let expected: Vec<u32> = (0..130u32)
             .map(|v| if v % 2 == 0 { v } else { 0 })
             .collect();
@@ -1191,10 +1213,12 @@ mod tests {
     fn try_map_with_mask_in_place_all_ok() {
         let mut values: Vec<u32> = (0..200).collect();
         let mask = BitBuffer::new_set(200);
-        let res = values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, valid| {
-            let scaled = v.wrapping_mul(valid as u32);
-            scaled.checked_mul(2)
-        });
+        let res = values
+            .as_mut_slice()
+            .try_map_with_mask_in_place(&mask, |v, valid| {
+                let scaled = v.wrapping_mul(valid as u32);
+                scaled.checked_mul(2)
+            });
         assert!(res.is_ok());
         let expected: Vec<u32> = (0..200u32).map(|v| v * 2).collect();
         assert_eq!(values, expected);
@@ -1206,8 +1230,9 @@ mod tests {
         values[83] = u32::MAX;
         values[150] = u32::MAX;
         let mask = BitBuffer::new_set(200);
-        let res =
-            values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
+        let res = values
+            .as_mut_slice()
+            .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
         assert_eq!(res, Err(83));
     }
 
@@ -1217,8 +1242,9 @@ mod tests {
         values[80] = u32::MAX;
         values[100] = u32::MAX;
         let mask = BitBuffer::new_set(200);
-        let res =
-            values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
+        let res = values
+            .as_mut_slice()
+            .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
         assert_eq!(res, Err(80));
     }
 
@@ -1227,8 +1253,9 @@ mod tests {
         let mut values: Vec<u32> = (0..200).collect();
         values[42] = u32::MAX;
         let mask = BitBuffer::new_set(200);
-        let res =
-            values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
+        let res = values
+            .as_mut_slice()
+            .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
         assert_eq!(res, Err(42));
     }
 
@@ -1243,9 +1270,11 @@ mod tests {
             }
             m.freeze()
         };
-        let res = values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, valid| {
-            v.wrapping_mul(valid as u32).checked_mul(2)
-        });
+        let res = values
+            .as_mut_slice()
+            .try_map_with_mask_in_place(&mask, |v, valid| {
+                v.wrapping_mul(valid as u32).checked_mul(2)
+            });
         assert!(res.is_ok());
         assert_eq!(values[5], 0);
         assert_eq!(values[6], 12);
@@ -1256,8 +1285,9 @@ mod tests {
         let mut values: Vec<u32> = (0..130).collect();
         values[129] = u32::MAX;
         let mask = BitBuffer::new_set(130);
-        let res =
-            values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
+        let res = values
+            .as_mut_slice()
+            .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
         assert_eq!(res, Err(129));
     }
 
@@ -1269,8 +1299,9 @@ mod tests {
 
         let mut values: Vec<u32> = (0..130).collect();
         values[77] = u32::MAX;
-        let res =
-            values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
+        let res = values
+            .as_mut_slice()
+            .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
         assert_eq!(res, Err(77));
     }
 
@@ -1297,17 +1328,16 @@ mod tests {
         // Closure fails at a specific lane; the kernel must report that lane index.
         let mut buf: Vec<f32> = (0..200).map(|i| i as f32).collect();
         let mask = BitBuffer::new_set(200);
-        let res =
-            ReinterpretSink::<f32, u32>::new(buf.as_mut_slice()).try_map_with_mask_in_place(
-                &mask,
-                |f, _valid| {
-                    if f as u32 == 137 {
-                        None
-                    } else {
-                        Some(f as u32)
-                    }
-                },
-            );
+        let res = ReinterpretSink::<f32, u32>::new(buf.as_mut_slice()).try_map_with_mask_in_place(
+            &mask,
+            |f, _valid| {
+                if f as u32 == 137 {
+                    None
+                } else {
+                    Some(f as u32)
+                }
+            },
+        );
         assert_eq!(res, Err(137));
     }
 
@@ -1315,7 +1345,9 @@ mod tests {
     fn try_map_with_mask_in_place_partial_chunk_success() {
         let mut values: Vec<u32> = (0..130).collect();
         let mask = BitBuffer::new_set(130);
-        let res = values.as_mut_slice().try_map_with_mask_in_place(&mask, |v, _valid| Some(v + 1));
+        let res = values
+            .as_mut_slice()
+            .try_map_with_mask_in_place(&mask, |v, _valid| Some(v + 1));
         assert!(res.is_ok());
         assert_eq!(values[0], 1);
         assert_eq!(values[63], 64);

From 2556d5331a961abe734430d23efbc78cab1e131f Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 17:20:47 +0100
Subject: [PATCH 16/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/Cargo.toml               | 1 -
 vortex-buffer/src/lane_ops_indexed.rs | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml
index e5233ce7cc6..666a23c02c4 100644
--- a/vortex-array/Cargo.toml
+++ b/vortex-array/Cargo.toml
@@ -218,4 +218,3 @@ harness = false
 [[bench]]
 name = "to_arrow"
 harness = false
-
diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs
index 98d3edf5473..1244ef3d0b7 100644
--- a/vortex-buffer/src/lane_ops_indexed.rs
+++ b/vortex-buffer/src/lane_ops_indexed.rs
@@ -290,7 +290,7 @@ pub trait IndexedSourceExt: IndexedSource + Sized {
     /// **Null-lane failures are filtered automatically.** If a null lane's stored
     /// value causes `f(v, false)` to return `None`, the kernel does *not* propagate
     /// that as `Err`. The per-lane `is_none()` flags are bit-packed into a `u64` at
-    /// the lane's position, then ANDed with the chunk's validity bitmap — null-lane
+    /// the lane's position, then AND-combined with the chunk's validity bitmap — null-lane
     /// bits vanish. The closure may also explicitly suppress null-lane failures by
     /// branching on `valid` itself; both behaviors compose.
     ///

From aa8a6d181601704a14011204960bf1a7c8676e14 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 17:22:29 +0100
Subject: [PATCH 17/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/benches/cast_primitive.rs   | 16 +++++-----------
 vortex-buffer/benches/add_checked.rs     |  2 +-
 vortex-buffer/benches/cast_to_indexed.rs |  2 +-
 3 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/vortex-array/benches/cast_primitive.rs b/vortex-array/benches/cast_primitive.rs
index d4279993068..63eeb350e7a 100644
--- a/vortex-array/benches/cast_primitive.rs
+++ b/vortex-array/benches/cast_primitive.rs
@@ -18,22 +18,16 @@ fn main() {
     divan::main();
 }
 
-const N: usize = 100_000;
-
 // Sizes used for the fallible-path benches below. Kept small enough to fit in L2 so
 // the kernel cost shows up clearly rather than being hidden by DRAM bandwidth.
 const SIZES: &[usize] = &[65_536];
 
-#[divan::bench]
-fn cast_u16_to_u32(bencher: Bencher) {
+#[divan::bench(args = SIZES)]
+fn cast_u16_to_u32(bencher: Bencher, n: usize) {
     let mut rng = StdRng::seed_from_u64(42);
-    #[expect(clippy::cast_possible_truncation)]
-    let arr = PrimitiveArray::from_option_iter((0..N).map(|i| {
-        if rng.random_bool(0.5) {
-            None
-        } else {
-            Some(i as u16)
-        }
+    let arr = PrimitiveArray::from_option_iter((0..n).map(|i| {
+        #[expect(clippy::cast_possible_truncation)]
+        rng.random_bool(0.5).then(|| i as u16)
     }))
     .into_array();
     // Pre-compute min/max so values_fit_in is a cache hit during the benchmark.
diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs
index 5c838479a13..c568cffaa23 100644
--- a/vortex-buffer/benches/add_checked.rs
+++ b/vortex-buffer/benches/add_checked.rs
@@ -29,7 +29,7 @@ fn main() {
     divan::main();
 }
 
-const SIZES: &[usize] = &[4_096, 65_536, 1_048_576, 2_097_152, 4_194_304];
+const SIZES: &[usize] = &[65_536];
 const LHS_VALID_RATE: f64 = 0.7;
 const RHS_VALID_RATE: f64 = 0.8;
 
diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs
index dedddc1733a..524985baa6b 100644
--- a/vortex-buffer/benches/cast_to_indexed.rs
+++ b/vortex-buffer/benches/cast_to_indexed.rs
@@ -30,7 +30,7 @@ fn main() {
     divan::main();
 }
 
-const SIZES: &[usize] = &[4_096, 65_536, 1_048_576];
+const SIZES: &[usize] = &[65_536];
 
 struct Fixture {
     values_u64: Buffer<u64>,

From ca2ad88e52ffb6e657ce800f98a6214ed262697f Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 19:16:43 +0100
Subject: [PATCH 18/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/arrays/primitive/compute/cast.rs      |  38 +-
 vortex-buffer/benches/add_checked.rs          |  14 +-
 vortex-buffer/benches/cast_to_indexed.rs      | 195 ++----
 vortex-buffer/src/lane_ops_indexed.rs         | 556 +++++-------------
 4 files changed, 207 insertions(+), 596 deletions(-)

diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs
index 9aef97e6c9d..34bc6ba3445 100644
--- a/vortex-array/src/arrays/primitive/compute/cast.rs
+++ b/vortex-array/src/arrays/primitive/compute/cast.rs
@@ -108,9 +108,9 @@ impl CastKernel for Primitive {
 /// Cast values from `F` to `T`. Always routes through the fallible lane-op kernels with
 /// `NumCast::from`. The kernel branches once on the mask shape:
 ///
-/// - `Mask::AllTrue`  → [`try_map_no_validity`] — no per-lane validity work.
+/// - `Mask::AllTrue`  → [`try_map_into`] — no per-lane validity work.
 /// - `Mask::AllFalse` → bulk zero — the closure is never invoked.
-/// - `Mask::Values`   → [`try_map_with_mask`] — the closure neutralizes null lanes
+/// - `Mask::Values`   → [`try_map_masked_into`] — the closure neutralizes null lanes
 ///   via the `* valid as F` multiply trick so out-of-range null-lane values don't
 ///   trigger spurious errors.
 ///
@@ -170,8 +170,7 @@ where
         // (harmless: the result validity bitmap masks them downstream).
         return match owned {
             Some(mut buf) => {
-                ReinterpretSink::<F, T>::new(buf.as_mut_slice())
-                    .map_no_validity_in_place(|v: F| v.as_());
+                ReinterpretSink::<F, T>::new(buf.as_mut_slice()).map_into_in_place(|v: F| v.as_());
                 // SAFETY: same size + alignment for NativePType same-byte-width pairs;
                 // every F-slot was overwritten with a real `T` bit pattern.
                 let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
@@ -179,8 +178,8 @@ where
             }
             None => {
                 let mut buffer = BufferMut::<T>::with_capacity(len);
-                values.map_no_validity(&mut buffer.spare_capacity_mut()[..len], |v| v.as_());
-                // SAFETY: map_no_validity initializes every lane.
+                values.map_into(&mut buffer.spare_capacity_mut()[..len], |v| v.as_());
+                // SAFETY: map_into initializes every lane.
                 unsafe { buffer.set_len(len) };
                 Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array())
             }
@@ -192,7 +191,7 @@ where
     let buffer: Buffer<T> = match (&mask, owned) {
         (Mask::AllTrue(_), Some(mut buf)) => {
             ReinterpretSink::<F, T>::new(buf.as_mut_slice())
-                .try_map_no_validity_in_place(|v: F| <T as NumCast>::from(v))
+                .try_map_in_place(|v: F| <T as NumCast>::from(v))
                 .map_err(|_| overflow())?;
             // SAFETY: same size + alignment for NativePType same-byte-width pairs;
             // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`.
@@ -202,11 +201,11 @@ where
         (Mask::AllTrue(_), None) => {
             let mut buffer = BufferMut::<T>::with_capacity(len);
             values
-                .try_map_no_validity(&mut buffer.spare_capacity_mut()[..len], |v| {
+                .try_map_into(&mut buffer.spare_capacity_mut()[..len], |v| {
                     <T as NumCast>::from(v)
                 })
                 .map_err(|_| overflow())?;
-            // SAFETY: try_map_no_validity returned Ok, so it initialized every lane.
+            // SAFETY: try_map_into returned Ok, so it initialized every lane.
             unsafe { buffer.set_len(len) };
             buffer.freeze()
         }
@@ -219,9 +218,7 @@ where
         (Mask::AllFalse(_), None) => BufferMut::<T>::zeroed(len).freeze(),
         (Mask::Values(m), Some(mut buf)) => {
             ReinterpretSink::<F, T>::new(buf.as_mut_slice())
-                .try_map_with_mask_in_place(m.bit_buffer(), |v: F, valid| {
-                    <T as NumCast>::from(v).or_else(|| (!valid).then(T::zero))
-                })
+                .try_map_masked_in_place(m.bit_buffer(), |v: F| <T as NumCast>::from(v))
                 .map_err(|_| overflow())?;
             // SAFETY: same size + alignment for NativePType same-byte-width pairs;
             // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`.
@@ -230,20 +227,19 @@ where
         }
         (Mask::Values(m), None) => {
             let mut buffer = BufferMut::<T>::with_capacity(len);
+            // Null-lane failures (where the underlying garbage value can't be represented in
+            // `T`) are filtered automatically by `try_map_masked_into`'s post-loop
+            // `fail_bits & src_chunk` AND. The closure is value-only — LLVM proves it's
+            // statically infallible for widening casts and DCEs the fail-tracking, giving the
+            // same codegen as the maskless kernel.
             values
-                .try_map_with_mask(
+                .try_map_masked_into(
                     m.bit_buffer(),
                     &mut buffer.spare_capacity_mut()[..len],
-                    // Lazy validity: only consult `valid` on the failure branch. For widening /
-                    // statically-infallible casts, `NumCast::from` is always `Some` so the
-                    // `or_else` is provably dead — LLVM DCEs the validity path entirely,
-                    // giving the same codegen as the maskless kernel. For narrowing, `valid`
-                    // is only read at lanes that actually overflowed (a cold check on top of
-                    // the cast).
-                    |v, valid| <T as NumCast>::from(v).or_else(|| (!valid).then(T::zero)),
+                    |v| <T as NumCast>::from(v),
                 )
                 .map_err(|_| overflow())?;
-            // SAFETY: try_map_with_mask returned Ok, so it initialized every lane.
+            // SAFETY: try_map_masked_into returned Ok, so it initialized every lane.
             unsafe { buffer.set_len(len) };
             buffer.freeze()
         }
diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs
index c568cffaa23..4f71f085847 100644
--- a/vortex-buffer/benches/add_checked.rs
+++ b/vortex-buffer/benches/add_checked.rs
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-//! Checked `u32 + u32 -> u32` over two nullable columns via [`try_map_with_mask`]
+//! Checked `u32 + u32 -> u32` over two nullable columns via [`try_map_masked_into`]
 //! with a value-only closure. Per-lane `is_none()` flags are bit-packed and
 //! AND-ed with the chunk validity word so null-lane overflow is filtered
 //! without the closure ever inspecting `valid`.
@@ -117,9 +117,7 @@ fn bitpack_value_only(bencher: Bencher, n: usize) {
             let combined = lm as &BitBuffer & rm as &BitBuffer;
             let mut out = alloc_out(n);
             LaneZip::new(lhs.as_slice(), rhs.as_slice())
-                .try_map_with_mask(&combined, out.as_mut_slice(), |(a, b), _valid| {
-                    a.checked_add(b)
-                })
+                .try_map_masked_into(&combined, out.as_mut_slice(), |(a, b)| a.checked_add(b))
                 .unwrap();
             (combined, out)
         });
@@ -144,10 +142,10 @@ fn assert_overflow_parity() {
     };
 
     let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
-    let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_with_mask(
+    let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into(
         &mask,
         out.as_mut_slice(),
-        |(a, b), _| a.checked_add(b),
+        |(a, b)| a.checked_add(b),
     );
     assert!(r.is_err(), "bitpack should Err on overflow");
 }
@@ -168,10 +166,10 @@ fn assert_null_overflow_suppressed() {
     };
 
     let mut out = alloc_out(4);
-    let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_with_mask(
+    let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into(
         &mask,
         out.as_mut_slice(),
-        |(a, b), _| a.checked_add(b),
+        |(a, b)| a.checked_add(b),
     );
     assert!(r.is_ok(), "bitpack: null-lane overflow leaked");
 }
diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs
index 524985baa6b..5ab1041f5cc 100644
--- a/vortex-buffer/benches/cast_to_indexed.rs
+++ b/vortex-buffer/benches/cast_to_indexed.rs
@@ -13,9 +13,8 @@ use arrow_array::UInt64Array;
 use arrow_buffer::NullBuffer;
 use arrow_buffer::ScalarBuffer;
 use arrow_cast::CastOptions;
-use arrow_cast::cast_with_options;
-use arrow_schema::DataType;
 use divan::Bencher;
+use num_traits::AsPrimitive;
 use num_traits::NumCast;
 use rand::SeedableRng;
 use rand::prelude::*;
@@ -25,6 +24,7 @@ use vortex_buffer::BitBufferMut;
 use vortex_buffer::Buffer;
 use vortex_buffer::lane_ops_indexed::IndexedSinkExt;
 use vortex_buffer::lane_ops_indexed::IndexedSourceExt;
+use vortex_buffer::lane_ops_indexed::ReinterpretSink;
 
 fn main() {
     divan::main();
@@ -34,15 +34,11 @@ const SIZES: &[usize] = &[65_536];
 
 struct Fixture {
     values_u64: Buffer<u64>,
-    values_u64_invalid_overflows: Buffer<u64>,
-    values_u32: Buffer<u32>,
-    values_u32_small: Buffer<u32>,
     values_u16: Buffer<u16>,
+    /// Non-negative `i32` values (always representable as `u32`). Used by the
+    /// in-place-vs-out-of-place cast bench.
+    values_i32: Buffer<i32>,
     mask: BitBuffer,
-    /// `UInt64Array` baseline for arrow casts. Same values + validity as `values_u64` / `mask`.
-    arrow_u64: UInt64Array,
-    /// `UInt16Array` baseline. Same as `values_u16` / `mask`.
-    arrow_u16: UInt16Array,
 }
 
 fn fixture(n: usize) -> Fixture {
@@ -60,6 +56,14 @@ fn fixture(n: usize) -> Fixture {
         .map(|v| v as u16)
         .collect::<Buffer<u16>>();
 
+    // Non-negative i32 values (sign bit cleared) — every value fits in u32.
+    #[expect(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
+    let values_i32 = raw_values
+        .iter()
+        .copied()
+        .map(|v| (v as i32) & i32::MAX)
+        .collect::<Buffer<i32>>();
+
     #[expect(clippy::cast_possible_truncation)]
     let values_u32 = raw_values
         .iter()
@@ -94,21 +98,12 @@ fn fixture(n: usize) -> Fixture {
 
     Fixture {
         values_u64: raw_values.into(),
-        values_u64_invalid_overflows,
-        values_u32,
-        values_u32_small,
         values_u16,
+        values_i32,
         mask: BitBufferMut::from_iter(raw_valid).freeze(),
-        arrow_u64,
-        arrow_u16,
     }
 }
 
-const CAST_OPTS_CHECKED: CastOptions<'static> = CastOptions {
-    safe: false,
-    format_options: arrow_cast::display::FormatOptions::new(),
-};
-
 fn uninit_out<T>(n: usize) -> Vec<MaybeUninit<T>> {
     let mut out = Vec::with_capacity(n);
     // SAFETY: A `MaybeUninit<T>` does not require initialization.
@@ -119,37 +114,7 @@ fn uninit_out<T>(n: usize) -> Vec<MaybeUninit<T>> {
 }
 
 #[divan::bench(args = SIZES)]
-fn map_no_validity_widen_u16_u32(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-
-    bencher
-        .with_inputs(|| (f.values_u16.clone(), uninit_out::<u32>(n)))
-        .bench_values(|(values, mut out)| {
-            values
-                .as_slice()
-                .map_no_validity(out.as_mut_slice(), <u32 as From<u16>>::from);
-            out
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn map_with_mask_widen_u16_u32_zero_nulls(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-
-    bencher
-        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
-        .bench_values(|(values, mask, mut out)| {
-            values
-                .as_slice()
-                .map_with_mask(&mask, out.as_mut_slice(), |v, valid| {
-                    <u32 as From<u16>>::from(v) * valid as u32
-                });
-            out
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn try_map_no_validity_narrow_u64_u32(bencher: Bencher, n: usize) {
+fn try_map_into_narrow_u64_u32(bencher: Bencher, n: usize) {
     let f = fixture(n);
 
     bencher
@@ -157,158 +122,88 @@ fn try_map_no_validity_narrow_u64_u32(bencher: Bencher, n: usize) {
         .bench_values(|(values, mut out)| {
             values
                 .as_slice()
-                .try_map_no_validity(out.as_mut_slice(), <u32 as NumCast>::from)
+                .try_map_into(out.as_mut_slice(), <u32 as NumCast>::from)
                 .unwrap();
             out
         });
 }
 
-/// `try_map_with_mask` with a closure that **ignores `valid`**. Tests whether
-/// LLVM DCEs the per-lane `(src_chunk >> bit_idx) & 1` mask extract. Uses
-/// non-overflowing `values_u64` so the closure-ignores-valid spurious-failure
-/// case never triggers (would otherwise err on null-lane overflow).
 #[divan::bench(args = SIZES)]
-fn try_map_with_mask_narrow_u64_u32_ignoring_valid(bencher: Bencher, n: usize) {
+fn map_with_mask_narrow_u64_u32(bencher: Bencher, n: usize) {
     let f = fixture(n);
 
     bencher
-        .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::<u32>(n)))
-        .bench_values(|(values, mask, mut out)| {
-            values
-                .as_slice()
-                .try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| {
-                    <u32 as NumCast>::from(v)
-                })
-                .unwrap();
+        .with_inputs(|| (f.values_u64.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mut out)| {
+            values.as_slice().map_into(&mut out, |v| v.as_());
             out
         });
 }
 
+/// `try_map_masked_into_widen_u16_u32` and `map_with_mask_widen_u16_u32` have the same runtime,
+/// showing that `try_map_masked_into` is sufficient for map operations that always succeed.
 #[divan::bench(args = SIZES)]
-fn try_map_with_mask_narrow_u64_u32_lazy_validity(bencher: Bencher, n: usize) {
+fn try_map_masked_into_widen_u16_u32(bencher: Bencher, n: usize) {
     let f = fixture(n);
 
     bencher
-        .with_inputs(|| (f.values_u64.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mask, mut out)| {
             values
                 .as_slice()
-                .try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| {
-                    <u32 as NumCast>::from(v).or_else(|| (!valid).then(u32::default))
-                })
+                .try_map_masked_into(&mask, out.as_mut_slice(), |v| <u32 as NumCast>::from(v))
                 .unwrap();
             out
         });
 }
 
-/// Migrated from the old `try_map_validity_filtered` bench: same inputs (null
-/// lanes contain overflowing values) and same correctness expectation (no Err),
-/// but now driven through the merged `try_map_with_mask` with a `|v, _|` closure.
-/// The hot loop is value-only via DCE; the cold path filters null-lane failures.
 #[divan::bench(args = SIZES)]
-fn try_map_with_mask_narrow_u64_u32_value_only_filtered(bencher: Bencher, n: usize) {
+fn map_with_mask_widen_u16_u32(bencher: Bencher, n: usize) {
     let f = fixture(n);
 
     bencher
-        .with_inputs(|| {
-            (
-                f.values_u64_invalid_overflows.clone(),
-                f.mask.clone(),
-                uninit_out::<u32>(n),
-            )
-        })
-        .bench_values(|(values, mask, mut out)| {
-            values
-                .as_slice()
-                .try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| {
-                    <u32 as NumCast>::from(v)
-                })
-                .unwrap();
+        .with_inputs(|| (f.values_u16.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mut out)| {
+            values.as_slice().map_into(out.as_mut_slice(), |v| v.as_());
             out
         });
 }
 
-#[divan::bench(args = SIZES)]
-fn try_map_with_mask_widen_u16_u32_or_else(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-
-    bencher
-        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
-        .bench_values(|(values, mask, mut out)| {
-            values
-                .as_slice()
-                .try_map_with_mask(&mask, out.as_mut_slice(), |v, valid| {
-                    Some(<u32 as From<u16>>::from(v)).or_else(|| (!valid).then(u32::default))
-                })
-                .unwrap();
-            out
-        });
-}
+// -----------------------------------------------------------------------------
+// In-place vs out-of-place fallible cast i32 → u32 (same byte width).
+//
+// `try_map_masked_in_place` mutates the input via `ReinterpretSink` and
+// transmutes the wrapper — no output allocation. `try_map_masked_into` allocates
+// a fresh `BufferMut<u32>` and writes through it. Input values are all positive
+// `i32` so every lane succeeds; the two kernels do the same arithmetic, so any
+// delta is pure allocation + memory-traffic overhead.
+// -----------------------------------------------------------------------------
 
 #[divan::bench(args = SIZES)]
-fn try_map_with_mask_widen_u16_u32_maskless(bencher: Bencher, n: usize) {
+fn try_map_masked_into_narrow_i32_u32(bencher: Bencher, n: usize) {
     let f = fixture(n);
 
     bencher
-        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .with_inputs(|| (f.values_i32.clone(), f.mask.clone(), uninit_out::<u32>(n)))
         .bench_values(|(values, mask, mut out)| {
             values
                 .as_slice()
-                .try_map_with_mask(&mask, out.as_mut_slice(), |v, _valid| {
-                    Some(<u32 as From<u16>>::from(v))
-                })
+                .try_map_masked_into(&mask, out.as_mut_slice(), |v| <u32 as NumCast>::from(v))
                 .unwrap();
             out
         });
 }
 
 #[divan::bench(args = SIZES)]
-fn map_with_mask_in_place_u32_zero_nulls(bencher: Bencher, n: usize) {
+fn try_map_masked_into_in_place_narrow_i32_u32(bencher: Bencher, n: usize) {
     let f = fixture(n);
 
     bencher
-        .with_inputs(|| (f.values_u32.as_slice().to_vec(), f.mask.clone()))
+        .with_inputs(|| (f.values_i32.as_slice().to_vec(), f.mask.clone()))
         .bench_values(|(mut values, mask)| {
-            values
-                .as_mut_slice()
-                .map_with_mask_in_place(&mask, |v, valid| v * valid as u32);
-            values
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn try_map_with_mask_in_place_u32_checked_mul(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-
-    bencher
-        .with_inputs(|| (f.values_u32_small.as_slice().to_vec(), f.mask.clone()))
-        .bench_values(|(mut values, mask)| {
-            values
-                .as_mut_slice()
-                .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2))
+            ReinterpretSink::<i32, u32>::new(values.as_mut_slice())
+                .try_map_masked_in_place(&mask, |v| <u32 as NumCast>::from(v))
                 .unwrap();
             values
         });
 }
-
-// -----------------------------------------------------------------------------
-// Arrow-rs baselines. Two: one widening (u16 → u32, always succeeds) and one
-// narrowing (u64 → u32, can fail). Each pairs with the cast variants above of
-// matching direction.
-// -----------------------------------------------------------------------------
-
-#[divan::bench(args = SIZES)]
-fn arrow_cast_widen_u16_u32(bencher: Bencher, _n: usize) {
-    let f = fixture(_n);
-    bencher
-        .with_inputs(|| f.arrow_u16.clone())
-        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
-}
-
-#[divan::bench(args = SIZES)]
-fn arrow_cast_narrow_u64_u32(bencher: Bencher, _n: usize) {
-    let f = fixture(_n);
-    bencher
-        .with_inputs(|| f.arrow_u64.clone())
-        .bench_refs(|arr| cast_with_options(arr, &DataType::UInt32, &CAST_OPTS_CHECKED).unwrap());
-}
diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_ops_indexed.rs
index 1244ef3d0b7..810d76c0900 100644
--- a/vortex-buffer/src/lane_ops_indexed.rs
+++ b/vortex-buffer/src/lane_ops_indexed.rs
@@ -219,92 +219,35 @@ impl<A: IndexedSource, B: IndexedSource> IndexedSource for LaneZip<A, B> {
 /// All methods have default implementations and are inherited via the blanket
 /// `impl<S: IndexedSource> IndexedSourceExt for S` below. Bring the trait into
 /// scope (`use vortex_buffer::lane_ops_indexed::IndexedSourceExt;`) to call
-/// them with method syntax: `values.try_map_with_mask(&mask, &mut out, f)`.
+/// them with method syntax: `values.try_map_masked_into(&mask, &mut out, f)`.
 pub trait IndexedSourceExt: IndexedSource + Sized {
-    /// Apply `f(value, valid)` lane-by-lane, writing `out[i] = f(self[i], mask[i])`.
-    ///
-    /// All three inputs must have the same length. The output type `R` may differ from
-    /// the input type — this kernel is the building block for both same-type transforms
-    /// (fill_null) and cross-type ones (cast). The caller is responsible for marking
-    /// `out` initialized (e.g. by calling `BufferMut::set_len` after this returns).
-    ///
-    /// # Panics
-    ///
-    /// Panics if `self.len() != mask.len()` or `out.len() != self.len()`.
-    #[inline]
-    fn map_with_mask<R, F>(self, mask: &BitBuffer, out: &mut [MaybeUninit<R>], mut f: F)
-    where
-        F: FnMut(Self::Item, bool) -> R,
-    {
-        /// Per-chunk worker. Called twice (literal `64` for full chunks, `remainder`
-        /// for the tail). `#[inline(always)]` preserves the const-64 unroll at the
-        /// full-chunk call site via constant propagation through inlining.
-        #[inline(always)]
-        fn chunk<S, R, F>(
-            values: &S,
-            out: &mut [MaybeUninit<R>],
-            f: &mut F,
-            src_chunk: u64,
-            base: usize,
-            count: usize,
-        ) where
-            S: IndexedSource,
-            F: FnMut(S::Item, bool) -> R,
-        {
-            for bit_idx in 0..count {
-                let i = base + bit_idx;
-                let bit = (src_chunk >> bit_idx) & 1 == 1;
-                // SAFETY: caller guarantees base + count <= len.
-                let v = unsafe { values.get_unchecked(i) };
-                unsafe { out.get_unchecked_mut(i).write(f(v, bit)) };
-            }
-        }
-
-        let values = self;
-        let len = values.len();
-        assert_eq!(len, mask.len(), "values and mask must have the same length");
-        assert_eq!(out.len(), len, "out must have the same length as values");
-
-        let chunks = mask.chunks();
-        let chunks_count = len / 64;
-        let remainder = len % 64;
-
-        for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-            chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64);
-        }
-        if remainder != 0 {
-            chunk(
-                &values,
-                out,
-                &mut f,
-                chunks.remainder_bits(),
-                chunks_count * 64,
-                remainder,
-            );
-        }
-    }
-
     /// Fallible variant of [`map_with_mask`]. `f` returns `Option<R>`; `None`
     /// indicates a per-lane failure (e.g. range overflow on a narrowing cast).
     ///
-    /// **Null-lane failures are filtered automatically.** If a null lane's stored
-    /// value causes `f(v, false)` to return `None`, the kernel does *not* propagate
-    /// that as `Err`. The per-lane `is_none()` flags are bit-packed into a `u64` at
-    /// the lane's position, then AND-combined with the chunk's validity bitmap — null-lane
-    /// bits vanish. The closure may also explicitly suppress null-lane failures by
-    /// branching on `valid` itself; both behaviors compose.
+    /// **Null-lane failures are filtered automatically.** The closure is called on
+    /// every lane regardless of validity; if a null lane's stored value causes `f(v)`
+    /// to return `None`, the kernel does *not* propagate that as `Err`. The per-lane
+    /// `is_none()` flags are bit-packed into a `u64` at the lane's position, then
+    /// AND-combined with the chunk's validity bitmap — null-lane bits vanish.
+    ///
+    /// The closure shape is the same as [`try_map_into`] (`FnMut(Item) -> Option<R>`);
+    /// the mask parameter is what makes this kernel mask-aware. Callers that need to
+    /// distinguish null lanes inside the closure (e.g. to short-circuit an expensive
+    /// computation) should construct their own per-lane validity check externally; for
+    /// the common case, the kernel's automatic filter is sufficient.
     ///
     /// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None`
     /// write `R::default()` into `out`, but the contents of `out` must not be relied
     /// upon when this function returns `Err`.
     ///
     /// [`map_with_mask`]: IndexedSourceExt::map_with_mask
+    /// [`try_map_into`]: IndexedSourceExt::try_map_into
     ///
     /// # Panics
     ///
     /// Panics if `self.len() != mask.len()` or `out.len() != self.len()`.
     #[inline]
-    fn try_map_with_mask<R, F>(
+    fn try_map_masked_into<R, F>(
         self,
         mask: &BitBuffer,
         out: &mut [MaybeUninit<R>],
@@ -312,7 +255,7 @@ pub trait IndexedSourceExt: IndexedSource + Sized {
     ) -> Result<(), usize>
     where
         R: Copy + Default,
-        F: FnMut(Self::Item, bool) -> Option<R>,
+        F: FnMut(Self::Item) -> Option<R>,
     {
         #[inline(always)]
         fn chunk<S, R, F>(
@@ -326,15 +269,14 @@ pub trait IndexedSourceExt: IndexedSource + Sized {
         where
             S: IndexedSource,
             R: Copy + Default,
-            F: FnMut(S::Item, bool) -> Option<R>,
+            F: FnMut(S::Item) -> Option<R>,
         {
             let mut fail_bits: u64 = 0;
             for bit_idx in 0..count {
                 let i = base + bit_idx;
-                let bit = (src_chunk >> bit_idx) & 1 == 1;
                 // SAFETY: caller guarantees base + count <= len.
                 let v = unsafe { values.get_unchecked(i) };
-                let opt = f(v, bit);
+                let opt = f(v);
                 fail_bits |= (opt.is_none() as u64) << bit_idx;
                 let r = opt.unwrap_or_default();
                 unsafe { out.get_unchecked_mut(i).write(r) };
@@ -380,7 +322,7 @@ pub trait IndexedSourceExt: IndexedSource + Sized {
     ///
     /// Panics if `out.len() != self.len()`.
     #[inline]
-    fn map_no_validity<R, F>(self, out: &mut [MaybeUninit<R>], mut f: F)
+    fn map_into<R, F>(self, out: &mut [MaybeUninit<R>], mut f: F)
     where
         F: FnMut(Self::Item) -> R,
     {
@@ -423,7 +365,7 @@ pub trait IndexedSourceExt: IndexedSource + Sized {
     ///
     /// # Use this only for non-nullable inputs.
     ///
-    /// For nullable inputs with a fallible closure, use [`try_map_with_mask`] —
+    /// For nullable inputs with a fallible closure, use [`try_map_masked_into`] —
     /// it has the same value-only closure shape (and the same perf win) but
     /// **correctly suppresses null-lane failures** via per-chunk
     /// `fail_bits & mask_chunk`.
@@ -435,13 +377,13 @@ pub trait IndexedSourceExt: IndexedSource + Sized {
     ///
     /// On failure returns `Err(failing_lane_index)`.
     ///
-    /// [`try_map_with_mask`]: IndexedSourceExt::try_map_with_mask
+    /// [`try_map_masked_into`]: IndexedSourceExt::try_map_masked_into
     ///
     /// # Panics
     ///
     /// Panics if `out.len() != self.len()`.
     #[inline]
-    fn try_map_no_validity<R, F>(self, out: &mut [MaybeUninit<R>], mut f: F) -> Result<(), usize>
+    fn try_map_into<R, F>(self, out: &mut [MaybeUninit<R>], mut f: F) -> Result<(), usize>
     where
         R: Copy + Default,
         F: FnMut(Self::Item) -> Option<R>,
@@ -545,7 +487,7 @@ where
 /// (`use vortex_buffer::lane_ops_indexed::IndexedSinkExt;`) to call them with
 /// method syntax.
 pub trait IndexedSinkExt: IndexedSink + Sized {
-    /// In-place counterpart of [`IndexedSourceExt::map_no_validity`]. Each lane
+    /// In-place counterpart of [`IndexedSourceExt::map_into`]. Each lane
     /// is replaced with `f(self[i])`.
     ///
     /// The closure reads `Self::Item` and returns `Self::Write`. For the common
@@ -553,10 +495,10 @@ pub trait IndexedSinkExt: IndexedSink + Sized {
     /// write types can differ (e.g. read `f32`, write `u32`) over the same
     /// backing memory when sizes and alignments match.
     ///
-    /// As with [`IndexedSourceExt::map_no_validity`], use this only when the
+    /// As with [`IndexedSourceExt::map_into`], use this only when the
     /// input is known non-nullable.
     #[inline]
-    fn map_no_validity_in_place<F>(self, mut f: F)
+    fn map_into_in_place<F>(self, mut f: F)
     where
         F: FnMut(Self::Item) -> Self::Write,
     {
@@ -589,51 +531,46 @@ pub trait IndexedSinkExt: IndexedSink + Sized {
         }
     }
 
-    /// In-place counterpart of [`IndexedSourceExt::try_map_no_validity`]. Each
+    /// In-place counterpart of [`IndexedSourceExt::try_map_into`]. Each
     /// lane is replaced with `f(self[i])`, or `Self::Write::default()` when `f`
     /// returns `None`. On failure returns `Err(first_failing_lane)`; the buffer
     /// state on `Err` is unspecified.
     ///
-    /// As with [`IndexedSourceExt::try_map_no_validity`], use this only when the
-    /// input is known non-nullable — a `None` from `f` is treated as a failure
-    /// regardless of any upstream validity bitmap.
-    ///
     /// ## Error attribution
     ///
-    /// Per-lane `is_none()` flags are folded into `first_fail` via the same
-    /// branchless `min` scheme as [`try_map_with_mask_in_place`]. Cold replay
-    /// isn't viable here because the original input values have already been
-    /// overwritten by the time we'd attribute the failure.
+    /// Per-lane `is_none()` flags are bit-packed into a `u64` at the lane's
+    /// position — `fail_bits |= (opt.is_none() as u64) << bit_idx`. After the
+    /// 64-lane loop, `trailing_zeros()` of `fail_bits` recovers the first
+    /// failing lane index. `OR + shift` per lane is friendlier to the
+    /// autovectorizer than `min`/`csel` — see [`try_map_masked_in_place`] for
+    /// the same scheme over a masked variant.
     ///
-    /// [`try_map_with_mask_in_place`]: IndexedSinkExt::try_map_with_mask_in_place
+    /// [`try_map_masked_in_place`]: IndexedSinkExt::try_map_masked_in_place
     #[inline]
-    #[allow(clippy::cast_possible_truncation)]
-    fn try_map_no_validity_in_place<F>(self, mut f: F) -> Result<(), usize>
+    fn try_map_in_place<F>(self, mut f: F) -> Result<(), usize>
     where
         Self::Write: Default,
         F: FnMut(Self::Item) -> Option<Self::Write>,
     {
         #[inline(always)]
-        #[allow(clippy::cast_possible_truncation)]
-        fn chunk<S, F>(values: &mut S, base: usize, count: usize, f: &mut F) -> Option<u32>
+        fn chunk<S, F>(values: &mut S, base: usize, count: usize, f: &mut F) -> Option<usize>
         where
             S: IndexedSink,
             S::Write: Default,
             F: FnMut(S::Item) -> Option<S::Write>,
         {
-            let mut first_fail: u32 = u32::MAX;
+            let mut fail_bits: u64 = 0;
             for bit_idx in 0..count {
                 let i = base + bit_idx;
                 // SAFETY: caller guarantees base + count <= len.
                 let v = unsafe { values.get_unchecked(i) };
                 let opt = f(v);
-                let candidate = if opt.is_none() { i as u32 } else { u32::MAX };
-                first_fail = first_fail.min(candidate);
+                fail_bits |= (opt.is_none() as u64) << bit_idx;
                 let r = opt.unwrap_or_default();
                 // SAFETY: caller guarantees base + count <= len.
                 unsafe { values.set_unchecked(i, r) };
             }
-            (first_fail != u32::MAX).then_some(first_fail)
+            (fail_bits != 0).then_some(base + fail_bits.trailing_zeros() as usize)
         }
 
         let mut values = self;
@@ -643,85 +580,36 @@ pub trait IndexedSinkExt: IndexedSink + Sized {
 
         for chunk_idx in 0..chunks_count {
             if let Some(failing) = chunk(&mut values, chunk_idx * 64, 64, &mut f) {
-                return Err(failing as usize);
+                return Err(failing);
             }
         }
         if remainder != 0
             && let Some(failing) = chunk(&mut values, chunks_count * 64, remainder, &mut f)
         {
-            return Err(failing as usize);
+            return Err(failing);
         }
         Ok(())
     }
 
-    /// In-place counterpart of [`IndexedSourceExt::map_with_mask`]. Each lane
-    /// is replaced with `f(self[i], mask[i])`.
-    ///
-    /// The closure reads `Self::Item` and returns `Self::Write`. For the common
-    /// case `Self = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and
-    /// write types can differ (e.g. read `f32`, write `u32`) over the same
-    /// backing memory when sizes and alignments match.
-    ///
-    /// # Panics
+    /// In-place counterpart of [`IndexedSourceExt::try_map_masked_into`]. Each
+    /// lane of `self` is replaced with `f(self[i])`, or `Self::Write::default()`
+    /// if `f` returned `None`. On failure returns `Err(first_failing_lane)`;
+    /// lanes before that point have been written, and lanes within the failing
+    /// chunk hold their unwrapped-or-default result. The buffer state on `Err`
+    /// is intentionally unspecified.
     ///
-    /// Panics if `self.len() != mask.len()`.
-    #[inline]
-    fn map_with_mask_in_place<F>(self, mask: &BitBuffer, mut f: F)
-    where
-        F: FnMut(Self::Item, bool) -> Self::Write,
-    {
-        #[inline(always)]
-        fn chunk<S, F>(values: &mut S, f: &mut F, src_chunk: u64, base: usize, count: usize)
-        where
-            S: IndexedSink,
-            F: FnMut(S::Item, bool) -> S::Write,
-        {
-            for bit_idx in 0..count {
-                let i = base + bit_idx;
-                let bit = (src_chunk >> bit_idx) & 1 == 1;
-                // SAFETY: caller guarantees base + count <= len.
-                let v = unsafe { values.get_unchecked(i) };
-                let r = f(v, bit);
-                unsafe { values.set_unchecked(i, r) };
-            }
-        }
-
-        let mut values = self;
-        let len = values.len();
-        assert_eq!(len, mask.len(), "values and mask must have the same length");
-
-        let chunks = mask.chunks();
-        let chunks_count = len / 64;
-        let remainder = len % 64;
-
-        for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
-            chunk(&mut values, &mut f, src_chunk, chunk_idx * 64, 64);
-        }
-        if remainder != 0 {
-            chunk(
-                &mut values,
-                &mut f,
-                chunks.remainder_bits(),
-                chunks_count * 64,
-                remainder,
-            );
-        }
-    }
-
-    /// In-place counterpart of [`IndexedSourceExt::try_map_with_mask`]. Each
-    /// lane of `self` is replaced with `f(self[i], mask[i])`, or
-    /// `Self::Write::default()` if `f` returned `None`. On failure returns
-    /// `Err(first_failing_lane)`; lanes before that point have been written,
-    /// and lanes within the failing chunk hold their unwrapped-or-default
-    /// result. The buffer state on `Err` is intentionally unspecified.
+    /// **Null-lane failures are filtered automatically** — same semantics as
+    /// [`try_map_masked_into`]. The closure has no `valid` parameter; the kernel
+    /// AND-combines the chunk's accumulated `is_none()` bits with its validity
+    /// bitmap before attributing a failure.
     ///
     /// ## Error attribution
     ///
-    /// Per-lane `is_none()` flags are folded into `first_fail` via a branchless
-    /// `min` of `(if is_none { i as u32 } else { u32::MAX })`. After the 64-lane
-    /// loop, `first_fail` holds the smallest failing index in the chunk (or
-    /// `MAX` if no failure). Vectorizes to NEON `bsl.16b` + `umin.4s` on
-    /// AArch64. The cold replay scheme used by [`try_map_with_mask`] isn't
+    /// Per-lane `is_none()` flags are bit-packed into a `u64` at the lane's
+    /// position — `fail_bits |= (opt.is_none() as u64) << bit_idx`. After the
+    /// loop, `fail_bits & src_chunk` drops null-lane failures and
+    /// `trailing_zeros()` of the result recovers the lowest valid failing lane
+    /// index. The cold replay scheme used by [`try_map_masked_into`] isn't
     /// viable here because the original input values have already been
     /// overwritten by the time we would attribute the failure.
     ///
@@ -734,45 +622,47 @@ pub trait IndexedSinkExt: IndexedSink + Sized {
     /// in-place kernel wins back the gap by avoiding the second buffer's DRAM
     /// read+write traffic.
     ///
-    /// [`try_map_with_mask`]: IndexedSourceExt::try_map_with_mask
+    /// [`try_map_masked_into`]: IndexedSourceExt::try_map_masked_into
     ///
     /// # Panics
     ///
     /// Panics if `self.len() != mask.len()`.
     #[inline]
     #[allow(clippy::cast_possible_truncation)]
-    fn try_map_with_mask_in_place<F>(self, mask: &BitBuffer, mut f: F) -> Result<(), usize>
+    fn try_map_masked_in_place<F>(self, mask: &BitBuffer, mut f: F) -> Result<(), usize>
     where
         Self::Write: Default,
-        F: FnMut(Self::Item, bool) -> Option<Self::Write>,
+        F: FnMut(Self::Item) -> Option<Self::Write>,
     {
+        /// Bit-pack `is_none()` flags per lane, then AND with `src_chunk` post-loop to
+        /// drop null-lane failures — identical scheme to [`try_map_masked_into`]. The
+        /// per-lane attribution work is `OR + shift` (no `min`/`csel`), giving LLVM more
+        /// freedom to vectorize the value pipeline.
         #[inline(always)]
-        #[allow(clippy::cast_possible_truncation)]
         fn chunk<S, F>(
             values: &mut S,
             src_chunk: u64,
             base: usize,
             count: usize,
             f: &mut F,
-        ) -> Option<u32>
+        ) -> Option<usize>
         where
             S: IndexedSink,
             S::Write: Default,
-            F: FnMut(S::Item, bool) -> Option<S::Write>,
+            F: FnMut(S::Item) -> Option<S::Write>,
         {
-            let mut first_fail: u32 = u32::MAX;
+            let mut fail_bits: u64 = 0;
             for bit_idx in 0..count {
                 let i = base + bit_idx;
-                let bit = (src_chunk >> bit_idx) & 1 == 1;
                 // SAFETY: caller guarantees `base + count <= values.len()`.
                 let v = unsafe { values.get_unchecked(i) };
-                let opt = f(v, bit);
-                let candidate = if opt.is_none() { i as u32 } else { u32::MAX };
-                first_fail = first_fail.min(candidate);
+                let opt = f(v);
+                fail_bits |= (opt.is_none() as u64) << bit_idx;
                 let r = opt.unwrap_or_default();
                 unsafe { values.set_unchecked(i, r) };
             }
-            (first_fail != u32::MAX).then_some(first_fail)
+            let valid_failures = fail_bits & src_chunk;
+            (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize)
         }
 
         let mut values = self;
@@ -785,7 +675,7 @@ pub trait IndexedSinkExt: IndexedSink + Sized {
 
         for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
             if let Some(failing) = chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) {
-                return Err(failing as usize);
+                return Err(failing);
             }
         }
         if remainder != 0
@@ -797,7 +687,7 @@ pub trait IndexedSinkExt: IndexedSink + Sized {
                 &mut f,
             )
         {
-            return Err(failing as usize);
+            return Err(failing);
         }
         Ok(())
     }
@@ -817,146 +707,33 @@ mod tests {
     }
 
     #[test]
-    fn map_with_mask_aligned() {
-        let values: Vec<i32> = (0..10).collect();
-        let mask = {
-            let mut m = BitBufferMut::with_capacity(10);
-            for i in 0..10 {
-                m.append(i % 2 == 0);
-            }
-            m.freeze()
-        };
-        let mut out = vec![MaybeUninit::<i32>::uninit(); 10];
-        values
-            .as_slice()
-            .map_with_mask(&mask, &mut out, |v, valid| if valid { v } else { -1 });
-        assert_eq!(write_t(out), vec![0, -1, 2, -1, 4, -1, 6, -1, 8, -1]);
-    }
-
-    #[test]
-    fn map_with_mask_partial_chunk() {
-        // 130 lanes — two full u64 words + a 2-bit remainder.
-        let values: Vec<i32> = (0..130).collect();
-        let mask = BitBuffer::new_set(130);
-        let mut out = vec![MaybeUninit::<i32>::uninit(); 130];
-        values
-            .as_slice()
-            .map_with_mask(&mask, &mut out, |v, valid| if valid { v + 1 } else { 0 });
-        let got = write_t(out);
-        assert_eq!(got.len(), 130);
-        assert_eq!(got[0], 1);
-        assert_eq!(got[63], 64);
-        assert_eq!(got[64], 65);
-        assert_eq!(got[129], 130);
-    }
-
-    #[test]
-    fn map_with_mask_offset_mask() {
-        // Build a 128-bit all-true mask, then slice off the first 5 bits to force offset=5.
-        let big = BitBuffer::new_set(128);
-        let sliced = big.slice(5..70); // logical len = 65, offset = 5
-        assert_eq!(sliced.len(), 65);
-        assert_eq!(sliced.offset(), 5);
-
-        let values: Vec<u32> = (0..65).collect();
-        let mut out = vec![MaybeUninit::<u32>::uninit(); 65];
-        values.as_slice().map_with_mask(
-            &sliced,
-            &mut out,
-            |v, valid| {
-                if valid { v } else { u32::MAX }
-            },
-        );
-        let got = write_t(out);
-        assert_eq!(got, (0..65).collect::<Vec<u32>>());
-    }
-
-    #[test]
-    fn map_with_mask_offset_past_word() {
-        // Slicing past a full word still works. `BitBuffer::slice` normalizes the
-        // logical offset to `offset % 8` and bumps the underlying byte pointer,
-        // so `offset()` won't equal 70 here — what we exercise is that the kernel
-        // walks the chunked u64 view (which BitChunks handles internally).
-        let big = BitBuffer::new_set(256);
-        let sliced = big.slice(70..200);
-        assert_eq!(sliced.len(), 130);
-
-        let values: Vec<i16> = (0..130).map(|i| i as i16).collect();
-        let mut out = vec![MaybeUninit::<i16>::uninit(); 130];
-        values
-            .as_slice()
-            .map_with_mask(&sliced, &mut out, |v, valid| if valid { v } else { -1 });
-        let got = write_t(out);
-        assert_eq!(got, (0..130).map(|i| i as i16).collect::<Vec<_>>());
-    }
-
-    #[test]
-    fn map_with_mask_empty() {
-        let values: Vec<i32> = vec![];
-        let mask = BitBuffer::new_unset(0);
-        let mut out: Vec<MaybeUninit<i32>> = vec![];
-        values.as_slice().map_with_mask(&mask, &mut out, |v, _| v);
-    }
-
-    #[test]
-    fn map_with_mask_null_to_zero_branchless() {
-        // The trick from primitive/compute/cast.rs:147 — multiply by valid as T.
-        let values: Vec<i64> = (1..=100).collect();
-        let mask = {
-            let mut m = BitBufferMut::with_capacity(100);
-            for i in 0..100 {
-                m.append(i % 3 != 0);
-            }
-            m.freeze()
-        };
-        let mut out = vec![MaybeUninit::<i64>::uninit(); 100];
-        values
-            .as_slice()
-            .map_with_mask(&mask, &mut out, |v, valid| v * (valid as i64));
-        let got = write_t(out);
-        for (i, &x) in got.iter().enumerate() {
-            if i % 3 == 0 {
-                assert_eq!(x, 0);
-            } else {
-                assert_eq!(x, (i + 1) as i64);
-            }
-        }
-    }
-
-    #[test]
-    fn try_map_with_mask_all_ok() {
+    fn try_map_masked_into_all_ok() {
         let values: Vec<u64> = (0..200).collect();
         let mask = BitBuffer::new_set(200);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = values
-            .as_slice()
-            .try_map_with_mask(&mask, &mut out, |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            });
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
         assert!(res.is_ok());
         let got = write_t(out);
         assert_eq!(got, (0..200u32).collect::<Vec<_>>());
     }
 
     #[test]
-    fn try_map_with_mask_overflow_fails() {
+    fn try_map_masked_into_overflow_fails() {
         // Put an overflowing value at lane 137 — the kernel must report Err(137).
         let mut values: Vec<u64> = (0..200).collect();
         values[137] = (u32::MAX as u64) + 1;
         let mask = BitBuffer::new_set(200);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = values
-            .as_slice()
-            .try_map_with_mask(&mask, &mut out, |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            });
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
         assert_eq!(res, Err(137));
     }
 
     #[test]
-    fn try_map_with_mask_overflow_reports_first_failing_lane() {
+    fn try_map_masked_into_overflow_reports_first_failing_lane() {
         // Multiple failing lanes — must report the lowest index.
         let mut values: Vec<u64> = (0..200).collect();
         values[50] = u64::MAX;
@@ -964,17 +741,14 @@ mod tests {
         values[137] = u64::MAX;
         let mask = BitBuffer::new_set(200);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = values
-            .as_slice()
-            .try_map_with_mask(&mask, &mut out, |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            });
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
         assert_eq!(res, Err(50));
     }
 
     #[test]
-    fn try_map_with_mask_value_only_closure_filters_null_overflow() {
+    fn try_map_masked_into_value_only_closure_filters_null_overflow() {
         // `|v, _|` closure that ignores validity. A null lane with an overflowing
         // value MUST NOT cause Err — the kernel's cold-path mask filter rescues us.
         let mut values: Vec<u64> = (0..200).collect();
@@ -988,11 +762,9 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = values
-            .as_slice()
-            .try_map_with_mask(&mask, &mut out, |v, _valid| {
-                (v <= u32::MAX as u64).then_some(v as u32)
-            });
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
         assert!(
             res.is_ok(),
             "null-lane overflow should be filtered by the cold path"
@@ -1000,7 +772,7 @@ mod tests {
     }
 
     #[test]
-    fn try_map_with_mask_value_only_closure_reports_first_valid_failure() {
+    fn try_map_masked_into_value_only_closure_reports_first_valid_failure() {
         // Valid lane overflow must propagate — and the reported index must be
         // the lowest VALID failing lane, even if earlier null lanes also "failed"
         // their unconditional cast.
@@ -1017,16 +789,14 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = values
-            .as_slice()
-            .try_map_with_mask(&mask, &mut out, |v, _valid| {
-                (v <= u32::MAX as u64).then_some(v as u32)
-            });
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
         assert_eq!(res, Err(77));
     }
 
     #[test]
-    fn try_map_with_mask_null_lane_bypasses_check() {
+    fn try_map_masked_into_null_lane_bypasses_check() {
         // Null lanes are neutralized by `valid as u64` before the range check, so an
         // out-of-range value at a null lane must NOT trigger failure.
         let mut values: Vec<u64> = (0..200).collect();
@@ -1039,12 +809,9 @@ mod tests {
             m.freeze()
         };
         let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
-        let res = values
-            .as_slice()
-            .try_map_with_mask(&mask, &mut out, |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            });
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
         assert!(res.is_ok());
         let got = write_t(out);
         assert_eq!(got[5], 0); // null-lane wrote default
@@ -1052,7 +819,7 @@ mod tests {
     }
 
     #[test]
-    fn try_map_with_mask_branchful_matches_branchless() {
+    fn try_map_masked_into_branchful_matches_branchless() {
         let mut values: Vec<u64> = (0..130).map(|i| i as u64 * 7).collect();
         values[2] = u64::MAX;
         values[65] = u32::MAX as u64;
@@ -1068,36 +835,26 @@ mod tests {
         let mut branchful = vec![MaybeUninit::<u32>::uninit(); 130];
         values
             .as_slice()
-            .try_map_with_mask(&mask, &mut branchless, |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
+            .try_map_masked_into(&mask, &mut branchless, |v| {
+                (v <= u32::MAX as u64).then_some(v as u32)
             })
             .unwrap();
         values
             .as_slice()
-            .try_map_with_mask(&mask, &mut branchful, |v, valid| {
-                if valid {
-                    u32::try_from(v).ok()
-                } else {
-                    Some(0)
-                }
-            })
+            .try_map_masked_into(&mask, &mut branchful, |v| u32::try_from(v).ok())
             .unwrap();
 
         assert_eq!(write_t(branchful), write_t(branchless));
     }
 
     #[test]
-    fn try_map_with_mask_partial_chunk() {
+    fn try_map_masked_into_partial_chunk() {
         let values: Vec<u64> = (0..130).collect();
         let mask = BitBuffer::new_set(130);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = values
-            .as_slice()
-            .try_map_with_mask(&mask, &mut out, |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            });
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
         assert!(res.is_ok());
         let got = write_t(out);
         assert_eq!(got.len(), 130);
@@ -1105,7 +862,7 @@ mod tests {
     }
 
     #[test]
-    fn try_map_with_mask_sliced_mask_unaligned_offset() {
+    fn try_map_masked_into_sliced_mask_unaligned_offset() {
         // The mask's first byte is not word-aligned: slice off 13 bits, so the
         // underlying BitChunks iterator must shift across byte boundaries on every
         // 64-bit chunk it yields.
@@ -1115,19 +872,16 @@ mod tests {
 
         let values: Vec<u64> = (0..130).collect();
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = values
-            .as_slice()
-            .try_map_with_mask(&mask, &mut out, |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            });
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
         assert!(res.is_ok());
         let got = write_t(out);
         assert_eq!(got, (0..130u32).collect::<Vec<_>>());
     }
 
     #[test]
-    fn try_map_with_mask_sliced_mask_with_overflow() {
+    fn try_map_masked_into_sliced_mask_with_overflow() {
         // Sliced mask + overflowing value — the cold attribution path must report
         // the correct lane index in the sliced (post-offset) coordinate space.
         let big = BitBuffer::new_set(256);
@@ -1137,17 +891,14 @@ mod tests {
         let mut values: Vec<u64> = (0..130).collect();
         values[77] = u64::MAX;
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = values
-            .as_slice()
-            .try_map_with_mask(&mask, &mut out, |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            });
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
         assert_eq!(res, Err(77));
     }
 
     #[test]
-    fn try_map_with_mask_sliced_mask_null_lanes() {
+    fn try_map_masked_into_sliced_mask_null_lanes() {
         // Mix sliced offset with a non-trivial validity pattern. Null lanes must
         // not contribute to fail_acc, even when their underlying value would overflow.
         let mut m = BitBufferMut::with_capacity(256);
@@ -1165,102 +916,74 @@ mod tests {
         // Stuff in an overflowing value; it must be neutralized by `* valid as u64`.
         values[2] = u64::MAX;
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = values
-            .as_slice()
-            .try_map_with_mask(&mask, &mut out, |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            });
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
         assert!(res.is_ok(), "null lane should bypass the range check");
     }
 
     #[test]
-    fn try_map_with_mask_overflow_in_remainder() {
+    fn try_map_masked_into_overflow_in_remainder() {
         // Overflow in the trailing partial chunk (not aligned to 64).
         let mut values: Vec<u64> = (0..130).collect();
         values[129] = (u32::MAX as u64) + 1;
         let mask = BitBuffer::new_set(130);
         let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
-        let res = values
-            .as_slice()
-            .try_map_with_mask(&mask, &mut out, |v, valid| {
-                let scaled = v * valid as u64;
-                (scaled <= u32::MAX as u64).then_some(scaled as u32)
-            });
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
         assert_eq!(res, Err(129));
     }
 
     #[test]
-    fn map_with_mask_in_place_basic() {
-        let mut values: Vec<u32> = (0..130).collect();
-        let mask = {
-            let mut m = BitBufferMut::with_capacity(130);
-            for i in 0..130 {
-                m.append(i % 2 == 0);
-            }
-            m.freeze()
-        };
-        values
-            .as_mut_slice()
-            .map_with_mask_in_place(&mask, |v, valid| v.wrapping_mul(valid as u32));
-        let expected: Vec<u32> = (0..130u32)
-            .map(|v| if v % 2 == 0 { v } else { 0 })
-            .collect();
-        assert_eq!(values, expected);
-    }
-
-    #[test]
-    fn try_map_with_mask_in_place_all_ok() {
+    fn try_map_masked_in_place_all_ok() {
         let mut values: Vec<u32> = (0..200).collect();
         let mask = BitBuffer::new_set(200);
         let res = values
             .as_mut_slice()
-            .try_map_with_mask_in_place(&mask, |v, valid| {
-                let scaled = v.wrapping_mul(valid as u32);
-                scaled.checked_mul(2)
-            });
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
         assert!(res.is_ok());
         let expected: Vec<u32> = (0..200u32).map(|v| v * 2).collect();
         assert_eq!(values, expected);
     }
 
     #[test]
-    fn try_map_with_mask_in_place_first_failing_chunk_wins() {
+    fn try_map_masked_in_place_first_failing_chunk_wins() {
         let mut values: Vec<u32> = (0..200).collect();
         values[83] = u32::MAX;
         values[150] = u32::MAX;
         let mask = BitBuffer::new_set(200);
         let res = values
             .as_mut_slice()
-            .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
         assert_eq!(res, Err(83));
     }
 
     #[test]
-    fn try_map_with_mask_in_place_within_chunk_reports_lowest() {
+    fn try_map_masked_in_place_within_chunk_reports_lowest() {
         let mut values: Vec<u32> = (0..200).collect();
         values[80] = u32::MAX;
         values[100] = u32::MAX;
         let mask = BitBuffer::new_set(200);
         let res = values
             .as_mut_slice()
-            .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
         assert_eq!(res, Err(80));
     }
 
     #[test]
-    fn try_map_with_mask_in_place_single_failure_lane_exact() {
+    fn try_map_masked_in_place_single_failure_lane_exact() {
         let mut values: Vec<u32> = (0..200).collect();
         values[42] = u32::MAX;
         let mask = BitBuffer::new_set(200);
         let res = values
             .as_mut_slice()
-            .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
         assert_eq!(res, Err(42));
     }
 
     #[test]
-    fn try_map_with_mask_in_place_null_bypass() {
+    fn try_map_masked_in_place_null_bypass() {
         let mut values: Vec<u32> = (0..200).collect();
         values[5] = u32::MAX;
         let mask = {
@@ -1272,27 +995,26 @@ mod tests {
         };
         let res = values
             .as_mut_slice()
-            .try_map_with_mask_in_place(&mask, |v, valid| {
-                v.wrapping_mul(valid as u32).checked_mul(2)
-            });
-        assert!(res.is_ok());
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
+        assert!(res.is_ok(), "null-lane overflow should be filtered");
+        // Null lane was overwritten with default (0).
         assert_eq!(values[5], 0);
         assert_eq!(values[6], 12);
     }
 
     #[test]
-    fn try_map_with_mask_in_place_remainder_overflow() {
+    fn try_map_masked_in_place_remainder_overflow() {
         let mut values: Vec<u32> = (0..130).collect();
         values[129] = u32::MAX;
         let mask = BitBuffer::new_set(130);
         let res = values
             .as_mut_slice()
-            .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
         assert_eq!(res, Err(129));
     }
 
     #[test]
-    fn try_map_with_mask_in_place_sliced_mask() {
+    fn try_map_masked_in_place_sliced_mask() {
         let big = BitBuffer::new_set(256);
         let mask = big.slice(13..143);
         assert_eq!(mask.len(), 130);
@@ -1301,7 +1023,7 @@ mod tests {
         values[77] = u32::MAX;
         let res = values
             .as_mut_slice()
-            .try_map_with_mask_in_place(&mask, |v, _valid| v.checked_mul(2));
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
         assert_eq!(res, Err(77));
     }
 
@@ -1312,7 +1034,7 @@ mod tests {
         let mut buf: Vec<f32> = (0..130).map(|i| i as f32).collect();
         let mask = BitBuffer::new_set(130);
         ReinterpretSink::<f32, u32>::new(buf.as_mut_slice())
-            .try_map_with_mask_in_place(&mask, |f, _valid| Some(f.to_bits().wrapping_add(1)))
+            .try_map_masked_in_place(&mask, |f| Some(f.to_bits().wrapping_add(1)))
             .unwrap();
         // SAFETY: same size + alignment for f32 and u32; every slot now holds a u32 written by
         // the closure.
@@ -1328,9 +1050,9 @@ mod tests {
         // Closure fails at a specific lane; the kernel must report that lane index.
         let mut buf: Vec<f32> = (0..200).map(|i| i as f32).collect();
         let mask = BitBuffer::new_set(200);
-        let res = ReinterpretSink::<f32, u32>::new(buf.as_mut_slice()).try_map_with_mask_in_place(
+        let res = ReinterpretSink::<f32, u32>::new(buf.as_mut_slice()).try_map_masked_in_place(
             &mask,
-            |f, _valid| {
+            |f| {
                 if f as u32 == 137 {
                     None
                 } else {
@@ -1342,12 +1064,12 @@ mod tests {
     }
 
     #[test]
-    fn try_map_with_mask_in_place_partial_chunk_success() {
+    fn try_map_masked_in_place_partial_chunk_success() {
         let mut values: Vec<u32> = (0..130).collect();
         let mask = BitBuffer::new_set(130);
         let res = values
             .as_mut_slice()
-            .try_map_with_mask_in_place(&mask, |v, _valid| Some(v + 1));
+            .try_map_masked_in_place(&mask, |v| Some(v + 1));
         assert!(res.is_ok());
         assert_eq!(values[0], 1);
         assert_eq!(values[63], 64);

From 608111c8f9fe3d7369de372773df6e59a6330296 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 19:26:19 +0100
Subject: [PATCH 19/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock                                    |   4 -
 .../src/arrays/primitive/compute/cast.rs      |   6 +-
 vortex-buffer/Cargo.toml                      |  11 +-
 vortex-buffer/benches/add_checked.rs          | 175 ----------
 vortex-buffer/benches/cast_to_indexed.rs      | 209 ------------
 vortex-buffer/benches/lane_kernels.rs         | 313 ++++++++++++++++++
 .../{lane_ops_indexed.rs => lane_kernels.rs}  |   0
 vortex-buffer/src/lib.rs                      |   2 +-
 8 files changed, 318 insertions(+), 402 deletions(-)
 delete mode 100644 vortex-buffer/benches/add_checked.rs
 delete mode 100644 vortex-buffer/benches/cast_to_indexed.rs
 create mode 100644 vortex-buffer/benches/lane_kernels.rs
 rename vortex-buffer/src/{lane_ops_indexed.rs => lane_kernels.rs} (100%)

diff --git a/Cargo.lock b/Cargo.lock
index 9bb032d0d35..d29c91edf62 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9355,11 +9355,7 @@ dependencies = [
 name = "vortex-buffer"
 version = "0.1.0"
 dependencies = [
- "arrow-arith",
- "arrow-array",
  "arrow-buffer",
- "arrow-cast",
- "arrow-schema",
  "bitvec",
  "bytes",
  "codspeed-divan-compat",
diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs
index 34bc6ba3445..112173b269f 100644
--- a/vortex-array/src/arrays/primitive/compute/cast.rs
+++ b/vortex-array/src/arrays/primitive/compute/cast.rs
@@ -5,9 +5,9 @@ use num_traits::AsPrimitive;
 use num_traits::NumCast;
 use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
-use vortex_buffer::lane_ops_indexed::IndexedSinkExt;
-use vortex_buffer::lane_ops_indexed::IndexedSourceExt;
-use vortex_buffer::lane_ops_indexed::ReinterpretSink;
+use vortex_buffer::lane_kernels::IndexedSinkExt;
+use vortex_buffer::lane_kernels::IndexedSourceExt;
+use vortex_buffer::lane_kernels::ReinterpretSink;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
 use vortex_error::vortex_err;
diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml
index 882de199818..31b9d1c8570 100644
--- a/vortex-buffer/Cargo.toml
+++ b/vortex-buffer/Cargo.toml
@@ -37,11 +37,6 @@ vortex-error = { workspace = true }
 workspace = true
 
 [dev-dependencies]
-# arrow-* are used by cast_to_indexed / add_checked benches to compare against arrow-rs.
-arrow-arith = { workspace = true }
-arrow-array = { workspace = true }
-arrow-cast = { workspace = true }
-arrow-schema = { workspace = true }
 divan = { workspace = true }
 num-traits = { workspace = true }
 rand = { workspace = true }
@@ -56,9 +51,5 @@ name = "vortex_bitbuffer"
 harness = false
 
 [[bench]]
-name = "cast_to_indexed"
-harness = false
-
-[[bench]]
-name = "add_checked"
+name = "lane_kernels"
 harness = false
diff --git a/vortex-buffer/benches/add_checked.rs b/vortex-buffer/benches/add_checked.rs
deleted file mode 100644
index 4f71f085847..00000000000
--- a/vortex-buffer/benches/add_checked.rs
+++ /dev/null
@@ -1,175 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-//! Checked `u32 + u32 -> u32` over two nullable columns via [`try_map_masked_into`]
-//! with a value-only closure. Per-lane `is_none()` flags are bit-packed and
-//! AND-ed with the chunk validity word so null-lane overflow is filtered
-//! without the closure ever inspecting `valid`.
-//!
-//! Verified at startup via [`assert_overflow_parity`] (valid-lane overflow
-//! propagates as `Err`) and [`assert_null_overflow_suppressed`] (null-lane
-//! overflow does not).
-
-#![expect(clippy::unwrap_used)]
-
-use std::mem::MaybeUninit;
-
-use divan::Bencher;
-use rand::SeedableRng;
-use rand::prelude::*;
-use vortex_buffer::BitBuffer;
-use vortex_buffer::BitBufferMut;
-use vortex_buffer::Buffer;
-use vortex_buffer::lane_ops_indexed::IndexedSourceExt;
-use vortex_buffer::lane_ops_indexed::LaneZip;
-
-fn main() {
-    assert_overflow_parity();
-    assert_null_overflow_suppressed();
-    divan::main();
-}
-
-const SIZES: &[usize] = &[65_536];
-const LHS_VALID_RATE: f64 = 0.7;
-const RHS_VALID_RATE: f64 = 0.8;
-
-struct Fixture {
-    /// Valid lanes carry bounded values; null lanes hold `u32::MAX` so a kernel
-    /// that ignores validity would Err on them. The implementation under test
-    /// must suppress that.
-    lhs: Buffer<u32>,
-    rhs: Buffer<u32>,
-    lhs_mask: BitBuffer,
-    rhs_mask: BitBuffer,
-}
-
-fn fixture(n: usize) -> Fixture {
-    let mut lhs_rng = StdRng::seed_from_u64(0);
-    let mut rhs_rng = StdRng::seed_from_u64(1);
-    let mut lvr = StdRng::seed_from_u64(2);
-    let mut rvr = StdRng::seed_from_u64(3);
-
-    let lhs_valid: Vec<bool> = (0..n).map(|_| lvr.random_bool(LHS_VALID_RATE)).collect();
-    let rhs_valid: Vec<bool> = (0..n).map(|_| rvr.random_bool(RHS_VALID_RATE)).collect();
-
-    let lhs: Buffer<u32> = (0..n)
-        .map(|i| {
-            if lhs_valid[i] {
-                lhs_rng.random_range(0..u16::MAX as u32)
-            } else {
-                u32::MAX
-            }
-        })
-        .collect();
-    let rhs: Buffer<u32> = (0..n)
-        .map(|i| {
-            if rhs_valid[i] {
-                rhs_rng.random_range(0..u16::MAX as u32)
-            } else {
-                u32::MAX
-            }
-        })
-        .collect();
-
-    let lhs_mask = {
-        let mut m = BitBufferMut::with_capacity(n);
-        for &v in &lhs_valid {
-            m.append(v);
-        }
-        m.freeze()
-    };
-    let rhs_mask = {
-        let mut m = BitBufferMut::with_capacity(n);
-        for &v in &rhs_valid {
-            m.append(v);
-        }
-        m.freeze()
-    };
-
-    Fixture {
-        lhs,
-        rhs,
-        lhs_mask,
-        rhs_mask,
-    }
-}
-
-fn alloc_out(n: usize) -> Vec<MaybeUninit<u32>> {
-    let mut out = Vec::with_capacity(n);
-    // SAFETY: every lane is written before any read inside the kernel.
-    unsafe { out.set_len(n) };
-    out
-}
-
-#[divan::bench(args = SIZES)]
-fn bitpack_value_only(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-    bencher
-        .with_inputs(|| {
-            (
-                f.lhs.clone(),
-                f.rhs.clone(),
-                f.lhs_mask.clone(),
-                f.rhs_mask.clone(),
-            )
-        })
-        .bench_refs(|(lhs, rhs, lm, rm)| {
-            let combined = lm as &BitBuffer & rm as &BitBuffer;
-            let mut out = alloc_out(n);
-            LaneZip::new(lhs.as_slice(), rhs.as_slice())
-                .try_map_masked_into(&combined, out.as_mut_slice(), |(a, b)| a.checked_add(b))
-                .unwrap();
-            (combined, out)
-        });
-}
-
-// ---------------------------------------------------------------------------
-// Parity assertions — must pass before divan runs benches.
-// ---------------------------------------------------------------------------
-
-/// Overflow at a valid lane must propagate as `Err`.
-fn assert_overflow_parity() {
-    let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
-    let rhs: Vec<u32> = vec![10, 20, 1, 40];
-    let valid = vec![true; 4];
-
-    let mask = {
-        let mut m = BitBufferMut::with_capacity(4);
-        for &v in &valid {
-            m.append(v);
-        }
-        m.freeze()
-    };
-
-    let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
-    let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into(
-        &mask,
-        out.as_mut_slice(),
-        |(a, b)| a.checked_add(b),
-    );
-    assert!(r.is_err(), "bitpack should Err on overflow");
-}
-
-/// Overflow at a null lane must NOT propagate.
-fn assert_null_overflow_suppressed() {
-    // Lane 2 is null and holds an overflowing value; valid lanes are safe.
-    let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
-    let rhs: Vec<u32> = vec![10, 20, 1, 40];
-    let valid = vec![true, true, false, true];
-
-    let mask = {
-        let mut m = BitBufferMut::with_capacity(4);
-        for &v in &valid {
-            m.append(v);
-        }
-        m.freeze()
-    };
-
-    let mut out = alloc_out(4);
-    let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into(
-        &mask,
-        out.as_mut_slice(),
-        |(a, b)| a.checked_add(b),
-    );
-    assert!(r.is_ok(), "bitpack: null-lane overflow leaked");
-}
diff --git a/vortex-buffer/benches/cast_to_indexed.rs b/vortex-buffer/benches/cast_to_indexed.rs
deleted file mode 100644
index 5ab1041f5cc..00000000000
--- a/vortex-buffer/benches/cast_to_indexed.rs
+++ /dev/null
@@ -1,209 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-//! Coverage benchmark for the indexed lane-op variants used by primitive casts
-//! and bit-packing paths.
-
-#![expect(clippy::unwrap_used)]
-
-use std::mem::MaybeUninit;
-
-use arrow_array::UInt16Array;
-use arrow_array::UInt64Array;
-use arrow_buffer::NullBuffer;
-use arrow_buffer::ScalarBuffer;
-use arrow_cast::CastOptions;
-use divan::Bencher;
-use num_traits::AsPrimitive;
-use num_traits::NumCast;
-use rand::SeedableRng;
-use rand::prelude::*;
-use rand::rngs::StdRng;
-use vortex_buffer::BitBuffer;
-use vortex_buffer::BitBufferMut;
-use vortex_buffer::Buffer;
-use vortex_buffer::lane_ops_indexed::IndexedSinkExt;
-use vortex_buffer::lane_ops_indexed::IndexedSourceExt;
-use vortex_buffer::lane_ops_indexed::ReinterpretSink;
-
-fn main() {
-    divan::main();
-}
-
-const SIZES: &[usize] = &[65_536];
-
-struct Fixture {
-    values_u64: Buffer<u64>,
-    values_u16: Buffer<u16>,
-    /// Positive `i32` values (always representable as `u32`). Used by the
-    /// in-place-vs-out-of-place cast bench.
-    values_i32: Buffer<i32>,
-    mask: BitBuffer,
-}
-
-fn fixture(n: usize) -> Fixture {
-    let mut rng = StdRng::seed_from_u64(0xC457_1D3E);
-
-    let raw_values: Vec<u64> = (0..n)
-        .map(|_| rng.random_range(0..(u32::MAX as u64)))
-        .collect();
-    let raw_valid: Vec<bool> = (0..n).map(|_| rng.random_bool(0.8)).collect();
-
-    #[expect(clippy::cast_possible_truncation)]
-    let values_u16 = raw_values
-        .iter()
-        .copied()
-        .map(|v| v as u16)
-        .collect::<Buffer<u16>>();
-
-    // Positive i32 values (top bit cleared) — every value fits in u32.
-    #[expect(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
-    let values_i32 = raw_values
-        .iter()
-        .copied()
-        .map(|v| (v as i32) & i32::MAX)
-        .collect::<Buffer<i32>>();
-
-    #[expect(clippy::cast_possible_truncation)]
-    let values_u32 = raw_values
-        .iter()
-        .copied()
-        .map(|v| v as u32)
-        .collect::<Buffer<u32>>();
-
-    #[expect(clippy::cast_possible_truncation)]
-    let values_u32_small = raw_values
-        .iter()
-        .copied()
-        .map(|v| (v % ((u32::MAX as u64) / 2)) as u32)
-        .collect::<Buffer<u32>>();
-
-    let values_u64_invalid_overflows = raw_values
-        .iter()
-        .copied()
-        .zip(raw_valid.iter().copied())
-        .map(|(v, valid)| if valid { v } else { u64::MAX })
-        .collect::<Buffer<u64>>();
-
-    let arrow_u64 = UInt64Array::new(
-        ScalarBuffer::from(raw_values.clone()),
-        Some(NullBuffer::from(raw_valid.clone())),
-    );
-    #[expect(clippy::cast_possible_truncation)]
-    let raw_u16: Vec<u16> = raw_values.iter().map(|&v| v as u16).collect();
-    let arrow_u16 = UInt16Array::new(
-        ScalarBuffer::from(raw_u16),
-        Some(NullBuffer::from(raw_valid.clone())),
-    );
-
-    Fixture {
-        values_u64: raw_values.into(),
-        values_u16,
-        values_i32,
-        mask: BitBufferMut::from_iter(raw_valid).freeze(),
-    }
-}
-
-fn uninit_out<T>(n: usize) -> Vec<MaybeUninit<T>> {
-    let mut out = Vec::with_capacity(n);
-    // SAFETY: A `MaybeUninit<T>` does not require initialization.
-    unsafe {
-        out.set_len(n);
-    }
-    out
-}
-
-#[divan::bench(args = SIZES)]
-fn try_map_into_narrow_u64_u32(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-
-    bencher
-        .with_inputs(|| (f.values_u64.clone(), uninit_out::<u32>(n)))
-        .bench_values(|(values, mut out)| {
-            values
-                .as_slice()
-                .try_map_into(out.as_mut_slice(), <u32 as NumCast>::from)
-                .unwrap();
-            out
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn map_with_mask_narrow_u64_u32(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-
-    bencher
-        .with_inputs(|| (f.values_u64.clone(), uninit_out::<u32>(n)))
-        .bench_values(|(values, mut out)| {
-            values.as_slice().map_into(&mut out, |v| v.as_());
-            out
-        });
-}
-
-/// `try_map_masked_into_widen_u16_u32` and `map_with_mask_widen_u16_u32` have the same runtime
-/// and showing for always true map operations `try_map_masked_into` is sufficient.
-#[divan::bench(args = SIZES)]
-fn try_map_masked_into_widen_u16_u32(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-
-    bencher
-        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
-        .bench_values(|(values, mask, mut out)| {
-            values
-                .as_slice()
-                .try_map_masked_into(&mask, out.as_mut_slice(), |v| <u32 as NumCast>::from(v))
-                .unwrap();
-            out
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn map_with_mask_widen_u16_u32(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-
-    bencher
-        .with_inputs(|| (f.values_u16.clone(), uninit_out::<u32>(n)))
-        .bench_values(|(values, mut out)| {
-            values.as_slice().map_into(out.as_mut_slice(), |v| v.as_());
-            out
-        });
-}
-
-// -----------------------------------------------------------------------------
-// In-place vs out-of-place fallible cast i32 → u32 (same byte width).
-//
-// `try_map_masked_into_in_place` mutates the input via `ReinterpretSink` and
-// transmutes the wrapper — no output allocation. `try_map_masked_into` allocates
-// a fresh `BufferMut<u32>` and writes through it. Input values are all positive
-// `i32` so every lane succeeds; the two kernels do the same arithmetic, so any
-// delta is pure allocation + memory-traffic overhead.
-// -----------------------------------------------------------------------------
-
-#[divan::bench(args = SIZES)]
-fn try_map_masked_into_narrow_i32_u32(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-
-    bencher
-        .with_inputs(|| (f.values_i32.clone(), f.mask.clone(), uninit_out::<u32>(n)))
-        .bench_values(|(values, mask, mut out)| {
-            values
-                .as_slice()
-                .try_map_masked_into(&mask, out.as_mut_slice(), |v| <u32 as NumCast>::from(v))
-                .unwrap();
-            out
-        });
-}
-
-#[divan::bench(args = SIZES)]
-fn try_map_masked_into_in_place_narrow_i32_u32(bencher: Bencher, n: usize) {
-    let f = fixture(n);
-
-    bencher
-        .with_inputs(|| (f.values_i32.as_slice().to_vec(), f.mask.clone()))
-        .bench_values(|(mut values, mask)| {
-            ReinterpretSink::<i32, u32>::new(values.as_mut_slice())
-                .try_map_masked_in_place(&mask, |v| <u32 as NumCast>::from(v))
-                .unwrap();
-            values
-        });
-}
diff --git a/vortex-buffer/benches/lane_kernels.rs b/vortex-buffer/benches/lane_kernels.rs
new file mode 100644
index 00000000000..60ab967e1ed
--- /dev/null
+++ b/vortex-buffer/benches/lane_kernels.rs
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Coverage benchmark for the lane-kernel variants used by primitive casts,
+//! bit-packing paths, and `LaneZip` binary kernels.
+//!
+//! The checked-add parity assertions (run at startup, before any bench) verify
+//! that the bit-packed fail-tracking scheme:
+//!   - propagates valid-lane overflow as `Err`, and
+//!   - suppresses null-lane overflow without the closure ever inspecting `valid`.
+
+#![expect(clippy::unwrap_used)]
+
+use std::mem::MaybeUninit;
+
+use divan::Bencher;
+use num_traits::AsPrimitive;
+use num_traits::NumCast;
+use rand::SeedableRng;
+use rand::prelude::*;
+use rand::rngs::StdRng;
+use vortex_buffer::BitBuffer;
+use vortex_buffer::BitBufferMut;
+use vortex_buffer::Buffer;
+use vortex_buffer::lane_kernels::IndexedSinkExt;
+use vortex_buffer::lane_kernels::IndexedSourceExt;
+use vortex_buffer::lane_kernels::LaneZip;
+use vortex_buffer::lane_kernels::ReinterpretSink;
+
+fn main() {
+    assert_overflow_parity();
+    assert_null_overflow_suppressed();
+    divan::main();
+}
+
+const SIZES: &[usize] = &[65_536];
+
+// -----------------------------------------------------------------------------
+// Cast fixture (u64/u16/i32 lanes + a single validity mask).
+// -----------------------------------------------------------------------------
+
+struct CastFixture {
+    values_u64: Buffer<u64>,
+    values_u16: Buffer<u16>,
+    /// Non-negative `i32` values (always representable as `u32`). Used by the
+    /// in-place-vs-out-of-place cast benches.
+    values_i32: Buffer<i32>,
+    mask: BitBuffer,
+}
+
+fn cast_fixture(n: usize) -> CastFixture {
+    let mut rng = StdRng::seed_from_u64(0xC457_1D3E);
+
+    let raw_values: Vec<u64> = (0..n)
+        .map(|_| rng.random_range(0..(u32::MAX as u64)))
+        .collect();
+    let raw_valid: Vec<bool> = (0..n).map(|_| rng.random_bool(0.8)).collect();
+
+    #[expect(clippy::cast_possible_truncation)]
+    let values_u16 = raw_values
+        .iter()
+        .copied()
+        .map(|v| v as u16)
+        .collect::<Buffer<u16>>();
+
+    // Non-negative i32 values (sign bit cleared) — every value fits in u32.
+    #[expect(clippy::cast_possible_truncation)]
+    let values_i32 = raw_values
+        .iter()
+        .copied()
+        .map(|v| (v as i32) & i32::MAX)
+        .collect::<Buffer<i32>>();
+
+    CastFixture {
+        values_u64: raw_values.into(),
+        values_u16,
+        values_i32,
+        mask: BitBufferMut::from_iter(raw_valid).freeze(),
+    }
+}
+
+fn uninit_out<T>(n: usize) -> Vec<MaybeUninit<T>> {
+    let mut out = Vec::with_capacity(n);
+    // SAFETY: capacity is at least `n`, and `MaybeUninit<T>` needs no initialization.
+    unsafe {
+        out.set_len(n);
+    }
+    out
+}
+
+// -----------------------------------------------------------------------------
+// Cast benches (single-input, source -> output).
+// -----------------------------------------------------------------------------
+
+#[divan::bench(args = SIZES)]
+fn try_map_into_narrow_u64_u32(bencher: Bencher, n: usize) {
+    let f = cast_fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u64.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mut out)| {
+            values
+                .as_slice()
+                .try_map_into(out.as_mut_slice(), <u32 as NumCast>::from)
+                .unwrap();
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn map_with_mask_narrow_u64_u32(bencher: Bencher, n: usize) {
+    let f = cast_fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u64.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mut out)| {
+            values.as_slice().map_into(&mut out, |v| v.as_());
+            out
+        });
+}
+
+/// `try_map_masked_into_widen_u16_u32` and `map_with_mask_widen_u16_u32` have the
+/// same runtime, so for closures that never fail (always return `Some`)
+/// `try_map_masked_into` is sufficient.
+#[divan::bench(args = SIZES)]
+fn try_map_masked_into_widen_u16_u32(bencher: Bencher, n: usize) {
+    let f = cast_fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mask, mut out)| {
+            values
+                .as_slice()
+                .try_map_masked_into(&mask, out.as_mut_slice(), <u32 as NumCast>::from)
+                .unwrap();
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn map_with_mask_widen_u16_u32(bencher: Bencher, n: usize) {
+    let f = cast_fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u16.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mut out)| {
+            values.as_slice().map_into(out.as_mut_slice(), |v| v.as_());
+            out
+        });
+}
+
+// -----------------------------------------------------------------------------
+// In-place vs out-of-place fallible cast i32 → u32 (same byte width).
+//
+// `try_map_masked_in_place` rewrites the input buffer through `ReinterpretSink`,
+// so it needs no separate output. `try_map_masked_into` writes to a distinct
+// `Vec<MaybeUninit<u32>>` that `with_inputs` allocates up front. Input values are
+// all non-negative `i32`, so every lane succeeds; both kernels do the same
+// arithmetic, so any delta should come mostly from the extra output buffer.
+// -----------------------------------------------------------------------------
+
+#[divan::bench(args = SIZES)]
+fn try_map_masked_into_narrow_i32_u32(bencher: Bencher, n: usize) {
+    let f = cast_fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_i32.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mask, mut out)| {
+            values
+                .as_slice()
+                .try_map_masked_into(&mask, out.as_mut_slice(), <u32 as NumCast>::from)
+                .unwrap();
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn try_map_masked_in_place_narrow_i32_u32(bencher: Bencher, n: usize) {
+    let f = cast_fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_i32.as_slice().to_vec(), f.mask.clone()))
+        .bench_values(|(mut values, mask)| {
+            ReinterpretSink::<i32, u32>::new(values.as_mut_slice())
+                .try_map_masked_in_place(&mask, <u32 as NumCast>::from)
+                .unwrap();
+            values
+        });
+}
+
+// -----------------------------------------------------------------------------
+// LaneZip binary kernel: checked `u32 + u32 -> u32` over two nullable columns.
+//
+// Per-lane `is_none()` flags are bit-packed and AND-ed with the chunk validity
+// word, so null-lane overflow is filtered without the closure inspecting `valid`.
+// Verified at startup via parity assertions (`assert_overflow_parity` and
+// `assert_null_overflow_suppressed`).
+// -----------------------------------------------------------------------------
+
+const ADD_LHS_VALID_RATE: f64 = 0.7;
+const ADD_RHS_VALID_RATE: f64 = 0.8;
+
+struct AddFixture {
+    /// Valid lanes carry bounded values; null lanes hold `u32::MAX` so a kernel
+    /// that ignores validity would `Err` on them. The implementation under test
+    /// must suppress that.
+    lhs: Buffer<u32>,
+    rhs: Buffer<u32>,
+    lhs_mask: BitBuffer,
+    rhs_mask: BitBuffer,
+}
+
+fn add_fixture(n: usize) -> AddFixture {
+    let mut lhs_rng = StdRng::seed_from_u64(0);
+    let mut rhs_rng = StdRng::seed_from_u64(1);
+    let mut lvr = StdRng::seed_from_u64(2);
+    let mut rvr = StdRng::seed_from_u64(3);
+
+    let lhs_valid: Vec<bool> = (0..n)
+        .map(|_| lvr.random_bool(ADD_LHS_VALID_RATE))
+        .collect();
+    let rhs_valid: Vec<bool> = (0..n)
+        .map(|_| rvr.random_bool(ADD_RHS_VALID_RATE))
+        .collect();
+
+    let lhs: Buffer<u32> = (0..n)
+        .map(|i| {
+            if lhs_valid[i] {
+                lhs_rng.random_range(0..u16::MAX as u32)
+            } else {
+                u32::MAX
+            }
+        })
+        .collect();
+    let rhs: Buffer<u32> = (0..n)
+        .map(|i| {
+            if rhs_valid[i] {
+                rhs_rng.random_range(0..u16::MAX as u32)
+            } else {
+                u32::MAX
+            }
+        })
+        .collect();
+
+    let lhs_mask = BitBufferMut::from_iter(lhs_valid).freeze();
+    let rhs_mask = BitBufferMut::from_iter(rhs_valid).freeze();
+
+    AddFixture {
+        lhs,
+        rhs,
+        lhs_mask,
+        rhs_mask,
+    }
+}
+
+#[divan::bench(args = SIZES)]
+fn lanezip_checked_add_u32(bencher: Bencher, n: usize) {
+    let f = add_fixture(n);
+    bencher
+        .with_inputs(|| {
+            (
+                f.lhs.clone(),
+                f.rhs.clone(),
+                f.lhs_mask.clone(),
+                f.rhs_mask.clone(),
+            )
+        })
+        .bench_refs(|(lhs, rhs, lm, rm)| {
+            let combined = lm as &BitBuffer & rm as &BitBuffer;
+            let mut out = uninit_out::<u32>(n);
+            LaneZip::new(lhs.as_slice(), rhs.as_slice())
+                .try_map_masked_into(&combined, out.as_mut_slice(), |(a, b)| a.checked_add(b))
+                .unwrap();
+            (combined, out)
+        });
+}
+
+// -----------------------------------------------------------------------------
+// Parity assertions — must pass before divan runs benches.
+// -----------------------------------------------------------------------------
+
+/// Overflow at a valid lane must propagate as `Err`.
+fn assert_overflow_parity() {
+    let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
+    let rhs: Vec<u32> = vec![10, 20, 1, 40];
+    let valid = vec![true; 4];
+
+    let mask = BitBufferMut::from_iter(valid).freeze();
+    let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
+    let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into(
+        &mask,
+        out.as_mut_slice(),
+        |(a, b)| a.checked_add(b),
+    );
+    assert!(r.is_err(), "bitpack should Err on overflow");
+}
+
+/// Overflow at a null lane must NOT propagate.
+fn assert_null_overflow_suppressed() {
+    // Lane 2 is null and holds an overflowing value; valid lanes are safe.
+    let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
+    let rhs: Vec<u32> = vec![10, 20, 1, 40];
+    let valid = vec![true, true, false, true];
+
+    let mask = BitBufferMut::from_iter(valid).freeze();
+    let mut out = uninit_out::<u32>(4);
+    let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into(
+        &mask,
+        out.as_mut_slice(),
+        |(a, b)| a.checked_add(b),
+    );
+    assert!(r.is_ok(), "bitpack: null-lane overflow leaked");
+}
diff --git a/vortex-buffer/src/lane_ops_indexed.rs b/vortex-buffer/src/lane_kernels.rs
similarity index 100%
rename from vortex-buffer/src/lane_ops_indexed.rs
rename to vortex-buffer/src/lane_kernels.rs
diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs
index 5fe7a4cf40d..667a1f11a9d 100644
--- a/vortex-buffer/src/lib.rs
+++ b/vortex-buffer/src/lib.rs
@@ -62,7 +62,7 @@ mod buffer_mut;
 mod bytes;
 mod r#const;
 mod debug;
-pub mod lane_ops_indexed;
+pub mod lane_kernels;
 mod macros;
 #[cfg(feature = "memmap2")]
 mod memmap2;

From d0a7806df88f19093eafe4fe8e0e39c5b00dbd0a Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 19:35:36 +0100
Subject: [PATCH 20/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-buffer/src/lane_kernels.rs | 69 +++++++++++++++----------------
 1 file changed, 33 insertions(+), 36 deletions(-)

diff --git a/vortex-buffer/src/lane_kernels.rs b/vortex-buffer/src/lane_kernels.rs
index 810d76c0900..d1e0d9b5f2e 100644
--- a/vortex-buffer/src/lane_kernels.rs
+++ b/vortex-buffer/src/lane_kernels.rs
@@ -16,8 +16,6 @@
 //! Both kernels handle a mask with a non-byte-aligned offset and with a logical `len`
 //! shorter than the underlying byte buffer, via [`BitBuffer::chunks`].
 
-#![allow(clippy::many_single_char_names)]
-
 use std::marker::PhantomData;
 use std::mem::MaybeUninit;
 use std::mem::align_of;
@@ -273,13 +271,13 @@ pub trait IndexedSourceExt: IndexedSource + Sized {
         {
             let mut fail_bits: u64 = 0;
             for bit_idx in 0..count {
-                let i = base + bit_idx;
+                let idx = base + bit_idx;
                 // SAFETY: caller guarantees base + count <= len.
-                let v = unsafe { values.get_unchecked(i) };
-                let opt = f(v);
+                let val = unsafe { values.get_unchecked(idx) };
+                let opt = f(val);
                 fail_bits |= (opt.is_none() as u64) << bit_idx;
-                let r = opt.unwrap_or_default();
-                unsafe { out.get_unchecked_mut(i).write(r) };
+                let result = opt.unwrap_or_default();
+                unsafe { out.get_unchecked_mut(idx).write(result) };
             }
             let valid_failures = fail_bits & src_chunk;
             (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize)
@@ -338,10 +336,10 @@ pub trait IndexedSourceExt: IndexedSource + Sized {
             F: FnMut(S::Item) -> R,
         {
             for bit_idx in 0..count {
-                let i = base + bit_idx;
+                let idx = base + bit_idx;
                 // SAFETY: caller guarantees base + count <= len.
-                let v = unsafe { values.get_unchecked(i) };
-                unsafe { out.get_unchecked_mut(i).write(f(v)) };
+                let val = unsafe { values.get_unchecked(idx) };
+                unsafe { out.get_unchecked_mut(idx).write(f(val)) };
             }
         }
 
@@ -406,13 +404,13 @@ pub trait IndexedSourceExt: IndexedSource + Sized {
         {
             let mut fail_acc: u64 = 0;
             for bit_idx in 0..count {
-                let i = base + bit_idx;
+                let idx = base + bit_idx;
                 // SAFETY: caller guarantees base + count <= len.
-                let v = unsafe { values.get_unchecked(i) };
-                let opt = f(v);
+                let val = unsafe { values.get_unchecked(idx) };
+                let opt = f(val);
                 fail_acc |= opt.is_none() as u64;
-                let r = opt.unwrap_or_default();
-                unsafe { out.get_unchecked_mut(i).write(r) };
+                let result = opt.unwrap_or_default();
+                unsafe { out.get_unchecked_mut(idx).write(result) };
             }
             fail_acc != 0
         }
@@ -459,11 +457,11 @@ where
     S: IndexedSource,
 {
     for bit_idx in 0..chunk_len {
-        let i = base + bit_idx;
-        // SAFETY: caller guarantees i < values.len().
-        let v = unsafe { values.get_unchecked(i) };
-        if lane_fails(bit_idx, v) {
-            return i;
+        let idx = base + bit_idx;
+        // SAFETY: caller guarantees idx < values.len().
+        let val = unsafe { values.get_unchecked(idx) };
+        if lane_fails(bit_idx, val) {
+            return idx;
         }
     }
     unreachable!("cold_scan called without a failing lane")
@@ -477,7 +475,7 @@ where
     S: IndexedSource,
     F: FnMut(S::Item) -> Option<R>,
 {
-    cold_scan(values, base, chunk_len, |_bit_idx, v| f(v).is_none())
+    cold_scan(values, base, chunk_len, |_bit_idx, val| f(val).is_none())
 }
 
 /// Extension trait providing in-place lane-kernel methods on any [`IndexedSink`].
@@ -509,12 +507,12 @@ pub trait IndexedSinkExt: IndexedSink + Sized {
             F: FnMut(S::Item) -> S::Write,
         {
             for bit_idx in 0..count {
-                let i = base + bit_idx;
+                let idx = base + bit_idx;
                 // SAFETY: caller guarantees base + count <= len.
-                let v = unsafe { values.get_unchecked(i) };
-                let r = f(v);
+                let val = unsafe { values.get_unchecked(idx) };
+                let result = f(val);
                 // SAFETY: caller guarantees base + count <= len.
-                unsafe { values.set_unchecked(i, r) };
+                unsafe { values.set_unchecked(idx, result) };
             }
         }
 
@@ -561,14 +559,14 @@ pub trait IndexedSinkExt: IndexedSink + Sized {
         {
             let mut fail_bits: u64 = 0;
             for bit_idx in 0..count {
-                let i = base + bit_idx;
+                let idx = base + bit_idx;
                 // SAFETY: caller guarantees base + count <= len.
-                let v = unsafe { values.get_unchecked(i) };
-                let opt = f(v);
+                let val = unsafe { values.get_unchecked(idx) };
+                let opt = f(val);
                 fail_bits |= (opt.is_none() as u64) << bit_idx;
-                let r = opt.unwrap_or_default();
+                let result = opt.unwrap_or_default();
                 // SAFETY: caller guarantees base + count <= len.
-                unsafe { values.set_unchecked(i, r) };
+                unsafe { values.set_unchecked(idx, result) };
             }
             (fail_bits != 0).then_some(base + fail_bits.trailing_zeros() as usize)
         }
@@ -628,7 +626,6 @@ pub trait IndexedSinkExt: IndexedSink + Sized {
     ///
     /// Panics if `self.len() != mask.len()`.
     #[inline]
-    #[allow(clippy::cast_possible_truncation)]
     fn try_map_masked_in_place<F>(self, mask: &BitBuffer, mut f: F) -> Result<(), usize>
     where
         Self::Write: Default,
@@ -653,13 +650,13 @@ pub trait IndexedSinkExt: IndexedSink + Sized {
         {
             let mut fail_bits: u64 = 0;
             for bit_idx in 0..count {
-                let i = base + bit_idx;
+                let idx = base + bit_idx;
                 // SAFETY: caller guarantees `base + count <= values.len()`.
-                let v = unsafe { values.get_unchecked(i) };
-                let opt = f(v);
+                let val = unsafe { values.get_unchecked(idx) };
+                let opt = f(val);
                 fail_bits |= (opt.is_none() as u64) << bit_idx;
-                let r = opt.unwrap_or_default();
-                unsafe { values.set_unchecked(i, r) };
+                let result = opt.unwrap_or_default();
+                unsafe { values.set_unchecked(idx, result) };
             }
             let valid_failures = fail_bits & src_chunk;
             (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize)

From fc9b5e857434054953cc74bcd0e1fb49005fc00f Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 27 May 2026 19:50:10 +0100
Subject: [PATCH 21/21] f

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/arrays/primitive/compute/cast.rs      | 56 +++----------------
 1 file changed, 9 insertions(+), 47 deletions(-)

diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs
index 112173b269f..82bbb1c0d23 100644
--- a/vortex-array/src/arrays/primitive/compute/cast.rs
+++ b/vortex-array/src/arrays/primitive/compute/cast.rs
@@ -105,18 +105,7 @@ impl CastKernel for Primitive {
     }
 }
 
-/// Cast values from `F` to `T`. Always routes through the fallible lane-op kernels with
-/// `NumCast::from`. The kernel branches once on the mask shape:
-///
-/// - `Mask::AllTrue`  → [`try_map_into`] — no per-lane validity work.
-/// - `Mask::AllFalse` → bulk zero — the closure is never invoked.
-/// - `Mask::Values`   → [`try_map_masked_into`] — the closure neutralizes null lanes
-///   via the `* valid as F` multiply trick so out-of-range null-lane values don't
-///   trigger spurious errors.
-///
-/// For statically-infallible casts (e.g. widening) LLVM proves `NumCast::from` always
-/// returns `Some` and strips the fail-tracking machinery, generating the same bare
-/// `ushll` widen loop the old hand-written `as_()` fast path produced.
+/// Cast Primitive values from `F` to `T`.
 fn cast_values<F, T>(
     array: ArrayView<'_, Primitive>,
     new_validity: Validity,
@@ -134,28 +123,18 @@ where
     };
 
     // Returns `true` if every value of `from` is representable in `to` without loss.
-    //
-    // Equivalent to `from.least_supertype(to) == Some(to)`, i.e. the value domain of `from`
-    // is a subset of `to`'s. This is the static-only check — it does not consult any array
-    // statistics. Used to short-circuit checked casts when the conversion is infallible by
-    // type alone (widening uint→uint, signed→signed, u8→i16, i32→f64, etc.).
     fn casts_losslessly_to(from: PType, to: PType) -> bool {
         from.least_supertype(to) == Some(to)
     }
 
-    // Skip the fallible kernel when the conversion is infallible by type alone (widening) or
-    // when cached min/max prove every value fits in `T`.
+    // Skip the fallible kernel when type widening or (cached) min/max prove every value fits.
     let target_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable);
     let infallible = casts_losslessly_to(F::PTYPE, T::PTYPE)
         || cached_values_fit_in(array, &target_dtype) == Some(true);
 
     let len = array.len();
 
-    // Same-bit-width in-place fast path: when F and T have the same byte width, try to take
-    // unique ownership of the buffer. If successful, each kernel call site below mutates in
-    // place via `ReinterpretSink` and transmutes the wrapper at the end, saving the output
-    // allocation. Falls back to the out-of-place path (borrowed slice + fresh buffer) when
-    // the buffer is shared — the common case under the current borrow-based kernel API.
+    // If F and T have the same byte width, try to take unique ownership of the buffer.
     let same_bit_width = F::PTYPE.byte_width() == T::PTYPE.byte_width();
     let owned: Option<BufferMut<F>> = if same_bit_width {
         array.into_owned().try_into_buffer_mut::<F>().ok()
@@ -165,14 +144,10 @@ where
     let values: &[F] = array.as_slice::<F>();
 
     if infallible {
-        // Truncating `as`-cast — safe here because static type analysis or cached stats prove
-        // every valid value fits. Null lanes' underlying garbage gets truncated/wrapped
-        // (harmless: the result validity bitmap masks them downstream).
         return match owned {
             Some(mut buf) => {
                 ReinterpretSink::<F, T>::new(buf.as_mut_slice()).map_into_in_place(|v: F| v.as_());
-                // SAFETY: same size + alignment for NativePType same-byte-width pairs;
-                // every F-slot was overwritten with a real `T` bit pattern.
+                // SAFETY: `F` and `T` are same-byte-width NativePTypes, so size and alignment match.
                 let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
                 Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array())
             }
@@ -193,8 +168,7 @@ where
             ReinterpretSink::<F, T>::new(buf.as_mut_slice())
                 .try_map_in_place(|v: F| <T as NumCast>::from(v))
                 .map_err(|_| overflow())?;
-            // SAFETY: same size + alignment for NativePType same-byte-width pairs;
-            // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`.
+            // SAFETY: `F` and `T` are same-byte-width NativePTypes, so size and alignment match.
             let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
             result.freeze()
         }
@@ -205,33 +179,21 @@ where
                     <T as NumCast>::from(v)
                 })
                 .map_err(|_| overflow())?;
-            // SAFETY: try_map_into returned Ok, so it initialized every lane.
+            // SAFETY: `try_map_into` returned Ok, so every lane is initialized.
             unsafe { buffer.set_len(len) };
             buffer.freeze()
         }
-        (Mask::AllFalse(_), Some(buf)) => {
-            // SAFETY: same size + alignment by NativePType same-byte-width invariant.
-            let mut t_buf: BufferMut<T> = unsafe { buf.transmute::<T>() };
-            t_buf.as_mut_slice().fill(T::zero());
-            t_buf.freeze()
-        }
-        (Mask::AllFalse(_), None) => BufferMut::<T>::zeroed(len).freeze(),
+        (Mask::AllFalse(_), _) => BufferMut::<T>::zeroed(len).freeze(),
         (Mask::Values(m), Some(mut buf)) => {
             ReinterpretSink::<F, T>::new(buf.as_mut_slice())
                 .try_map_masked_in_place(m.bit_buffer(), |v: F| <T as NumCast>::from(v))
                 .map_err(|_| overflow())?;
-            // SAFETY: same size + alignment for NativePType same-byte-width pairs;
-            // every F-slot now holds a `T` bit pattern written by `ReinterpretSink`.
+            // SAFETY: `F` and `T` are same-byte-width NativePTypes, so size and alignment match.
             let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
             result.freeze()
         }
         (Mask::Values(m), None) => {
             let mut buffer = BufferMut::<T>::with_capacity(len);
-            // Null-lane failures (where the underlying garbage value can't be represented in
-            // `T`) are filtered automatically by `try_map_masked_into`'s post-loop
-            // `fail_bits & src_chunk` AND. The closure is value-only — LLVM proves it's
-            // statically infallible for widening casts and DCEs the fail-tracking, giving the
-            // same codegen as the maskless kernel.
             values
                 .try_map_masked_into(
                     m.bit_buffer(),
@@ -239,7 +201,7 @@ where
                     |v| <T as NumCast>::from(v),
                 )
                 .map_err(|_| overflow())?;
-            // SAFETY: try_map_masked_into returned Ok, so it initialized every lane.
+            // SAFETY: `try_map_masked_into` returned Ok, so every lane is initialized.
             unsafe { buffer.set_len(len) };
             buffer.freeze()
         }