diff --git a/Cargo.lock b/Cargo.lock
index 045c72176fd..d29c91edf62 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9362,6 +9362,7 @@ dependencies = [
  "itertools 0.14.0",
  "memmap2",
  "num-traits",
+ "rand 0.10.1",
  "rstest",
  "serde",
  "simdutf8",
diff --git a/vortex-array/benches/cast_primitive.rs b/vortex-array/benches/cast_primitive.rs
index 86895fb2ce7..63eeb350e7a 100644
--- a/vortex-array/benches/cast_primitive.rs
+++ b/vortex-array/benches/cast_primitive.rs
@@ -18,18 +18,16 @@ fn main() {
     divan::main();
 }
 
-const N: usize = 100_000;
+// Sizes used for the fallible-path benches below. Kept small enough to fit in L2 so
+// the kernel cost shows up clearly rather than being hidden by DRAM bandwidth.
+const SIZES: &[usize] = &[65_536];
 
-#[divan::bench]
-fn cast_u16_to_u32(bencher: Bencher) {
+#[divan::bench(args = SIZES)]
+fn cast_u16_to_u32(bencher: Bencher, n: usize) {
     let mut rng = StdRng::seed_from_u64(42);
-    #[expect(clippy::cast_possible_truncation)]
-    let arr = PrimitiveArray::from_option_iter((0..N).map(|i| {
-        if rng.random_bool(0.5) {
-            None
-        } else {
-            Some(i as u16)
-        }
+    let arr = PrimitiveArray::from_option_iter((0..n).map(|i| {
+        #[expect(clippy::cast_possible_truncation)]
+        rng.random_bool(0.5).then(|| i as u16)
     }))
     .into_array();
     // Pre-compute min/max so values_fit_in is a cache hit during the benchmark.
@@ -46,3 +44,38 @@ fn cast_u16_to_u32(bencher: Bencher) {
             .execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())
     });
 }
+
+/// Narrowing fallible cast that goes through `try_map_with_mask`. Inputs are bounded
+/// so every value fits, isolating the kernel's per-lane checked-cast overhead.
+#[divan::bench(args = SIZES)]
+fn cast_u32_to_u8(bencher: Bencher, n: usize) {
+    let mut rng = StdRng::seed_from_u64(42);
+    let arr = PrimitiveArray::from_option_iter((0..n).map(|_| {
+        rng.random_bool(0.7)
+            .then(|| rng.random_range(0..u8::MAX) as u32)
+    }))
+    .into_array();
+    bencher.with_inputs(|| arr.clone()).bench_refs(|a| {
+        #[expect(clippy::unwrap_used)]
+        a.cast(DType::Primitive(PType::U8, Nullability::Nullable))
+            .unwrap()
+            .execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())
+    });
+}
+
+/// Sign-change cast i32 → u32. Values are non-negative so the kernel succeeds
+/// but still pays the per-lane `try_from` check.
+#[divan::bench(args = SIZES)]
+fn cast_i32_to_u32(bencher: Bencher, n: usize) {
+    let mut rng = StdRng::seed_from_u64(42);
+    let arr = PrimitiveArray::from_option_iter(
+        (0..n).map(|_| rng.random_bool(0.7).then(|| rng.random_range(0..i32::MAX))),
+    )
+    .into_array();
+    bencher.with_inputs(|| arr.clone()).bench_refs(|a| {
+        #[expect(clippy::unwrap_used)]
+        a.cast(DType::Primitive(PType::U32, Nullability::Nullable))
+            .unwrap()
+            .execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())
+    });
+}
diff --git a/vortex-array/src/arrays/primitive/compute/cast.rs b/vortex-array/src/arrays/primitive/compute/cast.rs
index 10c0b8d6eba..82bbb1c0d23 100644
--- a/vortex-array/src/arrays/primitive/compute/cast.rs
+++ b/vortex-array/src/arrays/primitive/compute/cast.rs
@@ -5,6 +5,9 @@ use num_traits::AsPrimitive;
 use num_traits::NumCast;
 use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
+use vortex_buffer::lane_kernels::IndexedSinkExt;
+use vortex_buffer::lane_kernels::IndexedSourceExt;
+use vortex_buffer::lane_kernels::ReinterpretSink;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
 use vortex_error::vortex_err;
@@ -102,9 +105,7 @@ impl CastKernel for Primitive {
     }
 }
 
-/// Cast values from `F` to `T`. For infallible casts this is a pure pass; for fallible casts
-/// each valid value goes through a checked `NumCast::from` and the kernel bails if any of them
-/// overflow `T`. Invalid positions use the wrapping `as` cast since their values are masked out.
+/// Cast Primitive values from `F` to `T`.
 fn cast_values<F, T>(
     array: ArrayView<'_, Primitive>,
     new_validity: Validity,
@@ -114,53 +115,101 @@ where
     F: NativePType + AsPrimitive<T>,
     T: NativePType,
 {
-    let values = array.as_slice::<F>();
-
-    // Fast path: statically infallible, or cached min/max prove every valid value fits in `T`.
-    // The cached check never triggers a stats computation — if the bounds aren't already known
-    // we fall through to the per-lane loop below.
-    if values_always_fit(F::PTYPE, T::PTYPE) || values_fit_in(array, T::PTYPE, ctx, false) {
-        return Ok(PrimitiveArray::new(cast::<F, T>(values), new_validity).into_array());
-    }
-
-    // TODO(joe): if the values source and target have the same bit-width we can
-    // mutate in place.
-
-    // Fallible: invalid lanes are pre-multiplied to zero so the checked cast always succeeds for
-    // them; valid lanes go through `NumCast::from` and the whole cast bails on the first overflow.
-    let mask = array.validity()?.execute_mask(array.len(), ctx)?;
     let overflow = || {
         vortex_err!(
             Compute: "Cannot cast {} to {} — value exceeds target range",
             F::PTYPE, T::PTYPE,
         )
     };
-    let buffer: Buffer<T> = match &mask {
-        Mask::AllTrue(_) => BufferMut::try_from_trusted_len_iter(
+
+    // Returns `true` if every value of `from` is representable in `to` without loss.
+    fn casts_losslessly_to(from: PType, to: PType) -> bool {
+        from.least_supertype(to) == Some(to)
+    }
+
+    // Skip the fallible kernel when type widening or (cached) min/max prove every value fits.
+    let target_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable);
+    let infallible = casts_losslessly_to(F::PTYPE, T::PTYPE)
+        || cached_values_fit_in(array, &target_dtype) == Some(true);
+
+    let len = array.len();
+
+    // If F and T have the same byte width, try to take unique ownership of the buffer.
+    let same_bit_width = F::PTYPE.byte_width() == T::PTYPE.byte_width();
+    let owned: Option<BufferMut<F>> = if same_bit_width {
+        array.into_owned().try_into_buffer_mut::<F>().ok()
+    } else {
+        None
+    };
+    let values: &[F] = array.as_slice::<F>();
+
+    if infallible {
+        return match owned {
+            Some(mut buf) => {
+                ReinterpretSink::<F, T>::new(buf.as_mut_slice()).map_into_in_place(|v: F| v.as_());
+                // SAFETY: same size + alignment for NativePType
+                let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
+                Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array())
+            }
+            None => {
+                let mut buffer = BufferMut::<T>::with_capacity(len);
+                values.map_into(&mut buffer.spare_capacity_mut()[..len], |v| v.as_());
+                // SAFETY: map_into initializes every lane.
+                unsafe { buffer.set_len(len) };
+                Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array())
+            }
+        };
+    }
+
+    let mask = array.validity()?.execute_mask(len, ctx)?;
+
+    let buffer: Buffer<T> = match (&mask, owned) {
+        (Mask::AllTrue(_), Some(mut buf)) => {
+            ReinterpretSink::<F, T>::new(buf.as_mut_slice())
+                .try_map_in_place(|v: F| <T as NumCast>::from(v))
+                .map_err(|_| overflow())?;
+            // SAFETY: same size + alignment for NativePType
+            let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
+            result.freeze()
+        }
+        (Mask::AllTrue(_), None) => {
+            let mut buffer = BufferMut::<T>::with_capacity(len);
             values
-                .iter()
-                .map(|&v| <T as NumCast>::from(v).ok_or_else(overflow)),
-        )?
-        .freeze(),
-        Mask::AllFalse(_) => BufferMut::<T>::zeroed(values.len()).freeze(),
-        Mask::Values(m) => BufferMut::try_from_trusted_len_iter(
-            values.iter().zip(m.bit_buffer().iter()).map(|(&v, valid)| {
-                let factor = if valid { F::one() } else { F::zero() };
-                <T as NumCast>::from(v * factor).ok_or_else(overflow)
-            }),
-        )?
-        .freeze(),
+                .try_map_into(&mut buffer.spare_capacity_mut()[..len], |v| {
+                    <T as NumCast>::from(v)
+                })
+                .map_err(|_| overflow())?;
+            // SAFETY: initialized every lane.
+            unsafe { buffer.set_len(len) };
+            buffer.freeze()
+        }
+        (Mask::AllFalse(_), _) => BufferMut::<T>::zeroed(len).freeze(),
+        (Mask::Values(m), Some(mut buf)) => {
+            ReinterpretSink::<F, T>::new(buf.as_mut_slice())
+                .try_map_masked_in_place(m.bit_buffer(), |v: F| <T as NumCast>::from(v))
+                .map_err(|_| overflow())?;
+            // SAFETY: same size + alignment for NativePType
+            let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
+            result.freeze()
+        }
+        (Mask::Values(m), None) => {
+            let mut buffer = BufferMut::<T>::with_capacity(len);
+            values
+                .try_map_masked_into(
+                    m.bit_buffer(),
+                    &mut buffer.spare_capacity_mut()[..len],
+                    |v| <T as NumCast>::from(v),
+                )
+                .map_err(|_| overflow())?;
+            // SAFETY: initialized every lane.
+            unsafe { buffer.set_len(len) };
+            buffer.freeze()
+        }
     };
 
     Ok(PrimitiveArray::new(buffer, new_validity).into_array())
 }
 
-/// Out-of-range values at invalid positions are truncated/wrapped by `as`, which is fine because
-/// they are masked out by validity.
-fn cast<F: NativePType + AsPrimitive<T>, T: NativePType>(array: &[F]) -> Buffer<T> {
-    BufferMut::from_trusted_len_iter(array.iter().map(|&src| src.as_())).freeze()
-}
-
 fn reinterpret(
     array: ArrayView<'_, Primitive>,
     new_ptype: PType,
@@ -178,23 +227,6 @@ fn reinterpret(
     .into_array()
 }
 
-/// Returns `true` if every value of `src` is guaranteed representable in `target` without
-/// overflow. Precision may be lost (e.g. large integers cast to `f32`), but the cast can never
-/// produce an out-of-range result.
-fn values_always_fit(src: PType, target: PType) -> bool {
-    if src == target {
-        return true;
-    }
-    if src.is_int() && target.is_int() {
-        return target.byte_width() > src.byte_width()
-            && (src.is_unsigned_int() || target.is_signed_int());
-    }
-    if src.is_float() && target.is_float() {
-        return target.byte_width() > src.byte_width();
-    }
-    src.is_int() && matches!(target, PType::F32 | PType::F64)
-}
-
 /// Returns `true` if all valid values in `array` are representable as `target_ptype`.
 ///
 /// Cached min/max statistics are consulted first. If either bound is missing, the function either
diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml
index ae9d7e6cc05..31b9d1c8570 100644
--- a/vortex-buffer/Cargo.toml
+++ b/vortex-buffer/Cargo.toml
@@ -39,6 +39,7 @@ workspace = true
 [dev-dependencies]
 divan = { workspace = true }
 num-traits = { workspace = true }
+rand = { workspace = true }
 rstest = { workspace = true }
 
 [[bench]]
@@ -48,3 +49,7 @@ harness = false
 [[bench]]
 name = "vortex_bitbuffer"
 harness = false
+
+[[bench]]
+name = "lane_kernels"
+harness = false
diff --git a/vortex-buffer/benches/lane_kernels.rs b/vortex-buffer/benches/lane_kernels.rs
new file mode 100644
index 00000000000..60ab967e1ed
--- /dev/null
+++ b/vortex-buffer/benches/lane_kernels.rs
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Coverage benchmark for the lane-kernel variants used by primitive casts,
+//! bit-packing paths, and `LaneZip` binary kernels.
+//!
+//! `add_checked` parity assertions (run at startup) verify that the bit-packed
+//! fail-tracking scheme:
+//!   - propagates valid-lane overflow as `Err`, and
+//!   - suppresses null-lane overflow without the closure ever inspecting `valid`.
+
+#![expect(clippy::unwrap_used)]
+
+use std::mem::MaybeUninit;
+
+use divan::Bencher;
+use num_traits::AsPrimitive;
+use num_traits::NumCast;
+use rand::SeedableRng;
+use rand::prelude::*;
+use rand::rngs::StdRng;
+use vortex_buffer::BitBuffer;
+use vortex_buffer::BitBufferMut;
+use vortex_buffer::Buffer;
+use vortex_buffer::lane_kernels::IndexedSinkExt;
+use vortex_buffer::lane_kernels::IndexedSourceExt;
+use vortex_buffer::lane_kernels::LaneZip;
+use vortex_buffer::lane_kernels::ReinterpretSink;
+
+fn main() {
+    assert_overflow_parity();
+    assert_null_overflow_suppressed();
+    divan::main();
+}
+
+const SIZES: &[usize] = &[65_536];
+
+// -----------------------------------------------------------------------------
+// Cast fixture (u64/u16/i32 lanes + a single validity mask).
+// -----------------------------------------------------------------------------
+
+struct CastFixture {
+    values_u64: Buffer<u64>,
+    values_u16: Buffer<u16>,
+    /// Positive `i32` values (always representable as `u32`). Used by the
+    /// in-place-vs-out-of-place cast bench.
+    values_i32: Buffer<i32>,
+    mask: BitBuffer,
+}
+
+fn cast_fixture(n: usize) -> CastFixture {
+    let mut rng = StdRng::seed_from_u64(0xC457_1D3E);
+
+    let raw_values: Vec<u64> = (0..n)
+        .map(|_| rng.random_range(0..(u32::MAX as u64)))
+        .collect();
+    let raw_valid: Vec<bool> = (0..n).map(|_| rng.random_bool(0.8)).collect();
+
+    #[expect(clippy::cast_possible_truncation)]
+    let values_u16 = raw_values
+        .iter()
+        .copied()
+        .map(|v| v as u16)
+        .collect::<Buffer<u16>>();
+
+    // Positive i32 values (top bit cleared) — every value fits in u32.
+    #[expect(clippy::cast_possible_truncation)]
+    let values_i32 = raw_values
+        .iter()
+        .copied()
+        .map(|v| (v as i32) & i32::MAX)
+        .collect::<Buffer<i32>>();
+
+    CastFixture {
+        values_u64: raw_values.into(),
+        values_u16,
+        values_i32,
+        mask: BitBufferMut::from_iter(raw_valid).freeze(),
+    }
+}
+
+fn uninit_out<T>(n: usize) -> Vec<MaybeUninit<T>> {
+    let mut out = Vec::with_capacity(n);
+    // SAFETY: A `MaybeUninit<T>` does not require initialization.
+    unsafe {
+        out.set_len(n);
+    }
+    out
+}
+
+// -----------------------------------------------------------------------------
+// Cast benches (single-input, source -> output).
+// -----------------------------------------------------------------------------
+
+#[divan::bench(args = SIZES)]
+fn try_map_into_narrow_u64_u32(bencher: Bencher, n: usize) {
+    let f = cast_fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u64.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mut out)| {
+            values
+                .as_slice()
+                .try_map_into(out.as_mut_slice(), <u32 as NumCast>::from)
+                .unwrap();
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn map_with_mask_narrow_u64_u32(bencher: Bencher, n: usize) {
+    let f = cast_fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u64.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mut out)| {
+            values.as_slice().map_into(&mut out, |v| v.as_());
+            out
+        });
+}
+
+/// `try_map_masked_into_widen_u16_u32` and `map_with_mask_widen_u16_u32` have the
+/// same runtime — for always-true map operations `try_map_masked_into` is
+/// sufficient.
+#[divan::bench(args = SIZES)]
+fn try_map_masked_into_widen_u16_u32(bencher: Bencher, n: usize) {
+    let f = cast_fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u16.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mask, mut out)| {
+            values
+                .as_slice()
+                .try_map_masked_into(&mask, out.as_mut_slice(), <u32 as NumCast>::from)
+                .unwrap();
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn map_with_mask_widen_u16_u32(bencher: Bencher, n: usize) {
+    let f = cast_fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_u16.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mut out)| {
+            values.as_slice().map_into(out.as_mut_slice(), |v| v.as_());
+            out
+        });
+}
+
+// -----------------------------------------------------------------------------
+// In-place vs out-of-place fallible cast i32 → u32 (same byte width).
+//
+// `try_map_masked_in_place` mutates the input via `ReinterpretSink` and
+// transmutes the wrapper — no output allocation. `try_map_masked_into` allocates
+// a fresh `BufferMut<u32>` and writes through it. Input values are all positive
+// `i32` so every lane succeeds; the two kernels do the same arithmetic, so any
+// delta is allocation + memory-traffic overhead.
+// -----------------------------------------------------------------------------
+
+#[divan::bench(args = SIZES)]
+fn try_map_masked_into_narrow_i32_u32(bencher: Bencher, n: usize) {
+    let f = cast_fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_i32.clone(), f.mask.clone(), uninit_out::<u32>(n)))
+        .bench_values(|(values, mask, mut out)| {
+            values
+                .as_slice()
+                .try_map_masked_into(&mask, out.as_mut_slice(), <u32 as NumCast>::from)
+                .unwrap();
+            out
+        });
+}
+
+#[divan::bench(args = SIZES)]
+fn try_map_masked_in_place_narrow_i32_u32(bencher: Bencher, n: usize) {
+    let f = cast_fixture(n);
+
+    bencher
+        .with_inputs(|| (f.values_i32.as_slice().to_vec(), f.mask.clone()))
+        .bench_values(|(mut values, mask)| {
+            ReinterpretSink::<i32, u32>::new(values.as_mut_slice())
+                .try_map_masked_in_place(&mask, <u32 as NumCast>::from)
+                .unwrap();
+            values
+        });
+}
+
+// -----------------------------------------------------------------------------
+// LaneZip binary kernel: checked `u32 + u32 -> u32` over two nullable columns.
+//
+// Per-lane `is_none()` flags are bit-packed and AND-ed with the chunk validity
+// word, so null-lane overflow is filtered without the closure inspecting `valid`.
+// Verified at startup via parity assertions (`assert_overflow_parity` and
+// `assert_null_overflow_suppressed`).
+// -----------------------------------------------------------------------------
+
+const ADD_LHS_VALID_RATE: f64 = 0.7;
+const ADD_RHS_VALID_RATE: f64 = 0.8;
+
+struct AddFixture {
+    /// Valid lanes carry bounded values; null lanes hold `u32::MAX` so a kernel
+    /// that ignores validity would `Err` on them. The implementation under test
+    /// must suppress that.
+    lhs: Buffer<u32>,
+    rhs: Buffer<u32>,
+    lhs_mask: BitBuffer,
+    rhs_mask: BitBuffer,
+}
+
+fn add_fixture(n: usize) -> AddFixture {
+    let mut lhs_rng = StdRng::seed_from_u64(0);
+    let mut rhs_rng = StdRng::seed_from_u64(1);
+    let mut lvr = StdRng::seed_from_u64(2);
+    let mut rvr = StdRng::seed_from_u64(3);
+
+    let lhs_valid: Vec<bool> = (0..n)
+        .map(|_| lvr.random_bool(ADD_LHS_VALID_RATE))
+        .collect();
+    let rhs_valid: Vec<bool> = (0..n)
+        .map(|_| rvr.random_bool(ADD_RHS_VALID_RATE))
+        .collect();
+
+    let lhs: Buffer<u32> = (0..n)
+        .map(|i| {
+            if lhs_valid[i] {
+                lhs_rng.random_range(0..u16::MAX as u32)
+            } else {
+                u32::MAX
+            }
+        })
+        .collect();
+    let rhs: Buffer<u32> = (0..n)
+        .map(|i| {
+            if rhs_valid[i] {
+                rhs_rng.random_range(0..u16::MAX as u32)
+            } else {
+                u32::MAX
+            }
+        })
+        .collect();
+
+    let lhs_mask = BitBufferMut::from_iter(lhs_valid).freeze();
+    let rhs_mask = BitBufferMut::from_iter(rhs_valid).freeze();
+
+    AddFixture {
+        lhs,
+        rhs,
+        lhs_mask,
+        rhs_mask,
+    }
+}
+
+#[divan::bench(args = SIZES)]
+fn lanezip_checked_add_u32(bencher: Bencher, n: usize) {
+    let f = add_fixture(n);
+    bencher
+        .with_inputs(|| {
+            (
+                f.lhs.clone(),
+                f.rhs.clone(),
+                f.lhs_mask.clone(),
+                f.rhs_mask.clone(),
+            )
+        })
+        .bench_refs(|(lhs, rhs, lm, rm)| {
+            let combined = lm as &BitBuffer & rm as &BitBuffer;
+            let mut out = uninit_out::<u32>(n);
+            LaneZip::new(lhs.as_slice(), rhs.as_slice())
+                .try_map_masked_into(&combined, out.as_mut_slice(), |(a, b)| a.checked_add(b))
+                .unwrap();
+            (combined, out)
+        });
+}
+
+// -----------------------------------------------------------------------------
+// Parity assertions — must pass before divan runs benches.
+// -----------------------------------------------------------------------------
+
+/// Overflow at a valid lane must propagate as `Err`.
+fn assert_overflow_parity() {
+    let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
+    let rhs: Vec<u32> = vec![10, 20, 1, 40];
+    let valid = vec![true; 4];
+
+    let mask = BitBufferMut::from_iter(valid).freeze();
+    let mut out: Vec<MaybeUninit<u32>> = (0..4).map(|_| MaybeUninit::uninit()).collect();
+    let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into(
+        &mask,
+        out.as_mut_slice(),
+        |(a, b)| a.checked_add(b),
+    );
+    assert!(r.is_err(), "bitpack should Err on overflow");
+}
+
+/// Overflow at a null lane must NOT propagate.
+fn assert_null_overflow_suppressed() {
+    // Lane 2 is null and holds an overflowing value; valid lanes are safe.
+    let lhs: Vec<u32> = vec![1, 2, u32::MAX, 4];
+    let rhs: Vec<u32> = vec![10, 20, 1, 40];
+    let valid = vec![true, true, false, true];
+
+    let mask = BitBufferMut::from_iter(valid).freeze();
+    let mut out = uninit_out::<u32>(4);
+    let r = LaneZip::new(lhs.as_slice(), rhs.as_slice()).try_map_masked_into(
+        &mask,
+        out.as_mut_slice(),
+        |(a, b)| a.checked_add(b),
+    );
+    assert!(r.is_ok(), "bitpack: null-lane overflow leaked");
+}
diff --git a/vortex-buffer/src/lane_kernels.rs b/vortex-buffer/src/lane_kernels.rs
new file mode 100644
index 00000000000..d1e0d9b5f2e
--- /dev/null
+++ b/vortex-buffer/src/lane_kernels.rs
@@ -0,0 +1,1076 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Elementwise lane kernels over indexed sources.
+//!
+//! Replaces `&[T]` with an [`IndexedSource`] trait: each lane read is
+//! `unsafe fn get_unchecked(i) -> Item`, independent across iterations. For `&[T]`
+//! this inlines to the same indexed load as the slice kernel; for `LaneZip(&[A], &[B])`
+//! it gives two independent indexed reads per lane — both shapes the auto-vectorizer
+//! handles.
+//!
+//! See `vortex-buffer/HISTORY.md` for the iterator-API investigation that motivated
+//! this design.
+//!
+//! The output is always a caller-provided `&mut` slice — these kernels never allocate.
+//! Both kernels handle a mask with a non-byte-aligned offset and with a logical `len`
+//! shorter than the underlying byte buffer, via [`BitBuffer::chunks`].
+
+use std::marker::PhantomData;
+use std::mem::MaybeUninit;
+use std::mem::align_of;
+use std::mem::size_of;
+
+use crate::BitBuffer;
+
+/// A length-known source supporting unchecked indexed reads.
+///
+/// Implemented for `&[T]` (with `T: Copy`) and for [`LaneZip`] over two `IndexedSource`s.
+/// The kernels in this module require this trait instead of `Iterator` so that lane
+/// reads carry no inter-iteration data dependency — the autovectorizer treats each
+/// lane independently.
+pub trait IndexedSource {
+    /// The per-lane item type. Must be `Copy` so the kernels can pass it through
+    /// the closure by value without extra moves.
+    type Item: Copy;
+    /// Logical lane count.
+    fn len(&self) -> usize;
+    /// Returns true when there are no lanes.
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+    /// Read the lane at `i` without bounds checking.
+    ///
+    /// # Safety
+    ///
+    /// `i` must be strictly less than `self.len()`.
+    unsafe fn get_unchecked(&self, i: usize) -> Self::Item;
+}
+
+impl<T: Copy> IndexedSource for &[T] {
+    type Item = T;
+    #[inline]
+    fn len(&self) -> usize {
+        <[T]>::len(self)
+    }
+    #[inline]
+    unsafe fn get_unchecked(&self, i: usize) -> T {
+        // SAFETY: caller guarantees i < self.len().
+        unsafe { *<[T]>::get_unchecked(self, i) }
+    }
+}
+
+impl<T: Copy> IndexedSource for &mut [T] {
+    type Item = T;
+    #[inline]
+    fn len(&self) -> usize {
+        <[T]>::len(self)
+    }
+    #[inline]
+    unsafe fn get_unchecked(&self, i: usize) -> T {
+        // SAFETY: caller guarantees i < self.len().
+        unsafe { *<[T]>::get_unchecked(self, i) }
+    }
+}
+
+/// An [`IndexedSource`] that also supports unchecked indexed writes — the binding
+/// for in-place kernels.
+///
+/// `Write` is the type written by `set_unchecked` and may differ from
+/// `IndexedSource::Item` (the read type). For the canonical `&mut [T]` impl
+/// both are `T`. The decoupling is what makes [`ReinterpretSink`] possible —
+/// a wrapper that reads `F` and writes `T` over the same backing memory when
+/// the two have identical size and alignment.
+///
+/// Implemented for `&mut [T]`; not implemented for [`LaneZip`] (you can't write a
+/// `(A, B)` pair back to two separate sources via a single index).
+pub trait IndexedSink: IndexedSource {
+    /// The per-lane write type. Equal to `<Self as IndexedSource>::Item` for
+    /// `&mut [T]`; different for [`ReinterpretSink`].
+    type Write: Copy;
+
+    /// Write `value` into lane `i` without bounds checking.
+    ///
+    /// # Safety
+    ///
+    /// `i` must be strictly less than `self.len()`.
+    unsafe fn set_unchecked(&mut self, i: usize, value: Self::Write);
+}
+
+impl<T: Copy> IndexedSink for &mut [T] {
+    type Write = T;
+    #[inline]
+    unsafe fn set_unchecked(&mut self, i: usize, value: T) {
+        // SAFETY: caller guarantees i < self.len().
+        unsafe { *<[T]>::get_unchecked_mut(self, i) = value };
+    }
+}
+
+/// A sink that reads `F`-values and writes `T`-values over the same backing
+/// slice of `F`, reinterpreting each `T` as `F`-bits on write.
+///
+/// Requires `size_of::<F>() == size_of::<T>()` and `align_of::<F>() == align_of::<T>()`.
+/// Both hold for any pair of `NativePType` primitives with equal byte width
+/// (e.g. `u32` ↔ `f32`, `u64` ↔ `i64`, `f64` ↔ `u64`).
+///
+/// Use this when an in-place kernel needs to convert lanes between two
+/// types of identical width without allocating a second buffer. After the
+/// kernel completes every slot holds a valid `T`-bit pattern; the caller
+/// can recover a typed view via `BufferMut::transmute::<T>()`.
+pub struct ReinterpretSink<'a, F, T> {
+    slice: &'a mut [F],
+    _phantom: PhantomData<T>,
+}
+
+impl<'a, F, T> ReinterpretSink<'a, F, T> {
+    /// Construct a `ReinterpretSink` from `&mut [F]`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `size_of::<F>() != size_of::<T>()` or
+    /// `align_of::<F>() != align_of::<T>()`.
+    pub fn new(slice: &'a mut [F]) -> Self {
+        assert_eq!(
+            size_of::<F>(),
+            size_of::<T>(),
+            "ReinterpretSink requires F and T to have the same size",
+        );
+        assert_eq!(
+            align_of::<F>(),
+            align_of::<T>(),
+            "ReinterpretSink requires F and T to have the same alignment",
+        );
+        Self {
+            slice,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<F: Copy, T: Copy> IndexedSource for ReinterpretSink<'_, F, T> {
+    type Item = F;
+    #[inline]
+    fn len(&self) -> usize {
+        self.slice.len()
+    }
+    #[inline]
+    unsafe fn get_unchecked(&self, i: usize) -> F {
+        // SAFETY: caller guarantees i < self.slice.len(). Pointer arithmetic
+        // avoids method-resolution ambiguity between `<[F]>::get_unchecked` and
+        // `IndexedSource::get_unchecked`.
+        unsafe { *self.slice.as_ptr().add(i) }
+    }
+}
+
+impl<F: Copy, T: Copy> IndexedSink for ReinterpretSink<'_, F, T> {
+    type Write = T;
+    #[inline]
+    unsafe fn set_unchecked(&mut self, i: usize, value: T) {
+        // SAFETY: caller guarantees i < self.slice.len(); `new` enforces
+        // size_of::<F>() == size_of::<T>() and align_of::<F>() == align_of::<T>(),
+        // so the F-slot can hold a `T` without overflow or misalignment.
+        unsafe {
+            let ptr = self.slice.as_mut_ptr().add(i) as *mut T;
+            ptr.write(value);
+        }
+    }
+}
+
+/// Pair of two [`IndexedSource`]s of equal length. Yields `(A::Item, B::Item)` per lane.
+///
+/// Use this to drive a binary kernel from two columns. Length equality is enforced
+/// at construction.
+pub struct LaneZip<A, B>(pub A, pub B);
+
+impl<A: IndexedSource, B: IndexedSource> LaneZip<A, B> {
+    /// Build a `LaneZip` from two equal-length sources.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the two operands have different lengths.
+    pub fn new(a: A, b: B) -> Self {
+        assert_eq!(
+            a.len(),
+            b.len(),
+            "LaneZip operands must have the same length"
+        );
+        Self(a, b)
+    }
+}
+
+impl<A: IndexedSource, B: IndexedSource> IndexedSource for LaneZip<A, B> {
+    type Item = (A::Item, B::Item);
+    #[inline]
+    fn len(&self) -> usize {
+        debug_assert_eq!(self.0.len(), self.1.len());
+        self.0.len()
+    }
+    #[inline]
+    unsafe fn get_unchecked(&self, i: usize) -> (A::Item, B::Item) {
+        // SAFETY: caller guarantees i < self.len(); `new` enforces matching lengths.
+        unsafe { (self.0.get_unchecked(i), self.1.get_unchecked(i)) }
+    }
+}
+
+/// Extension trait providing lane-kernel methods on any [`IndexedSource`].
+///
+/// All methods have default implementations and are inherited via the blanket
+/// `impl<S: IndexedSource> IndexedSourceExt for S` below. Bring the trait into
+/// scope (`use vortex_buffer::lane_ops_indexed::IndexedSourceExt;`) to call
+/// them with method syntax: `values.try_map_masked_into(&mask, &mut out, f)`.
+pub trait IndexedSourceExt: IndexedSource + Sized {
+    /// Fallible variant of [`map_with_mask`]. `f` returns `Option<R>`; `None`
+    /// indicates a per-lane failure (e.g. range overflow on a narrowing cast).
+    ///
+    /// **Null-lane failures are filtered automatically.** The closure is called on
+    /// every lane regardless of validity; if a null lane's stored value causes `f(v)`
+    /// to return `None`, the kernel does *not* propagate that as `Err`. The per-lane
+    /// `is_none()` flags are bit-packed into a `u64` at the lane's position, then
+    /// AND-combined with the chunk's validity bitmap — null-lane bits vanish.
+    ///
+    /// The closure shape is the same as [`try_map_into`] (`FnMut(Item) -> Option<R>`);
+    /// the mask parameter is what makes this kernel mask-aware. Callers that need to
+    /// distinguish null lanes inside the closure (e.g. to short-circuit an expensive
+    /// computation) should construct their own per-lane validity check externally; for
+    /// the common case, the kernel's automatic filter is sufficient.
+    ///
+    /// On failure returns `Err(failing_lane_index)`. Lanes whose `f` returned `None`
+    /// write `R::default()` into `out`, but the contents of `out` must not be relied
+    /// upon when this function returns `Err`.
+    ///
+    /// [`map_with_mask`]: IndexedSourceExt::map_with_mask
+    /// [`try_map_into`]: IndexedSourceExt::try_map_into
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self.len() != mask.len()` or `out.len() != self.len()`.
+    #[inline]
+    fn try_map_masked_into<R, F>(
+        self,
+        mask: &BitBuffer,
+        out: &mut [MaybeUninit<R>],
+        mut f: F,
+    ) -> Result<(), usize>
+    where
+        R: Copy + Default,
+        F: FnMut(Self::Item) -> Option<R>,
+    {
+        #[inline(always)]
+        fn chunk<S, R, F>(
+            values: &S,
+            out: &mut [MaybeUninit<R>],
+            f: &mut F,
+            src_chunk: u64,
+            base: usize,
+            count: usize,
+        ) -> Option<usize>
+        where
+            S: IndexedSource,
+            R: Copy + Default,
+            F: FnMut(S::Item) -> Option<R>,
+        {
+            let mut fail_bits: u64 = 0;
+            for bit_idx in 0..count {
+                let idx = base + bit_idx;
+                // SAFETY: caller guarantees base + count <= len.
+                let val = unsafe { values.get_unchecked(idx) };
+                let opt = f(val);
+                fail_bits |= (opt.is_none() as u64) << bit_idx;
+                let result = opt.unwrap_or_default();
+                unsafe { out.get_unchecked_mut(idx).write(result) };
+            }
+            let valid_failures = fail_bits & src_chunk;
+            (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize)
+        }
+
+        let values = self;
+        let len = values.len();
+        assert_eq!(len, mask.len(), "values and mask must have the same length");
+        assert_eq!(out.len(), len, "out must have the same length as values");
+
+        let chunks = mask.chunks();
+        let chunks_count = len / 64;
+        let remainder = len % 64;
+
+        for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+            if let Some(idx) = chunk(&values, out, &mut f, src_chunk, chunk_idx * 64, 64) {
+                return Err(idx);
+            }
+        }
+        if remainder != 0
+            && let Some(idx) = chunk(
+                &values,
+                out,
+                &mut f,
+                chunks.remainder_bits(),
+                chunks_count * 64,
+                remainder,
+            )
+        {
+            return Err(idx);
+        }
+        Ok(())
+    }
+
+    /// Apply `f(value)` lane-by-lane with **no validity awareness at all** — every
+    /// closure invocation is treated as "happened", regardless of whether the lane
+    /// is null. Use this only when the input is known non-nullable.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `out.len() != self.len()`.
+    #[inline]
+    fn map_into<R, F>(self, out: &mut [MaybeUninit<R>], mut f: F)
+    where
+        F: FnMut(Self::Item) -> R,
+    {
+        #[inline(always)]
+        fn chunk<S, R, F>(
+            values: &S,
+            out: &mut [MaybeUninit<R>],
+            f: &mut F,
+            base: usize,
+            count: usize,
+        ) where
+            S: IndexedSource,
+            F: FnMut(S::Item) -> R,
+        {
+            for bit_idx in 0..count {
+                let idx = base + bit_idx;
+                // SAFETY: caller guarantees base + count <= len.
+                let val = unsafe { values.get_unchecked(idx) };
+                unsafe { out.get_unchecked_mut(idx).write(f(val)) };
+            }
+        }
+
+        let values = self;
+        let len = values.len();
+        assert_eq!(out.len(), len, "out must have the same length as values");
+
+        let chunks_count = len / 64;
+        let remainder = len % 64;
+
+        for chunk_idx in 0..chunks_count {
+            chunk(&values, out, &mut f, chunk_idx * 64, 64);
+        }
+        if remainder != 0 {
+            chunk(&values, out, &mut f, chunks_count * 64, remainder);
+        }
+    }
+
+    /// Fallible map with **no validity awareness at all** — every `None` returned
+    /// by the closure is treated as a failure, even at null lanes.
+    ///
+    /// # Use this only for non-nullable inputs.
+    ///
+    /// For nullable inputs with a fallible closure, use [`try_map_masked_into`] —
+    /// it has the same value-only closure shape (and the same perf win) but
+    /// **correctly suppresses null-lane failures** via per-chunk
+    /// `fail_bits & mask_chunk`.
+    ///
+    /// Using this kernel on a nullable input where a null lane's stored value
+    /// would cause `f` to return `None` will produce a spurious `Err`. This is a
+    /// correctness footgun on purpose — the name and this doc are how the API
+    /// signals "you must know your input has no nulls."
+    ///
+    /// On failure returns `Err(failing_lane_index)`.
+    ///
+    /// [`try_map_masked_into`]: IndexedSourceExt::try_map_masked_into
+    ///
+    /// # Panics
+    ///
+    /// Panics if `out.len() != self.len()`.
+    #[inline]
+    fn try_map_into<R, F>(self, out: &mut [MaybeUninit<R>], mut f: F) -> Result<(), usize>
+    where
+        R: Copy + Default,
+        F: FnMut(Self::Item) -> Option<R>,
+    {
+        /// Returns `true` if any lane in `[base, base+count)` failed (OR-reduced);
+        /// the cold attribution path is called at the kernel level so it can be
+        /// inlined separately for full vs remainder.
+        #[inline(always)]
+        fn chunk<S, R, F>(
+            values: &S,
+            out: &mut [MaybeUninit<R>],
+            f: &mut F,
+            base: usize,
+            count: usize,
+        ) -> bool
+        where
+            S: IndexedSource,
+            R: Copy + Default,
+            F: FnMut(S::Item) -> Option<R>,
+        {
+            let mut fail_acc: u64 = 0;
+            for bit_idx in 0..count {
+                let idx = base + bit_idx;
+                // SAFETY: caller guarantees base + count <= len.
+                let val = unsafe { values.get_unchecked(idx) };
+                let opt = f(val);
+                fail_acc |= opt.is_none() as u64;
+                let result = opt.unwrap_or_default();
+                unsafe { out.get_unchecked_mut(idx).write(result) };
+            }
+            fail_acc != 0
+        }
+
+        let values = self;
+        let len = values.len();
+        assert_eq!(out.len(), len, "out must have the same length as values");
+
+        let chunks_count = len / 64;
+        let remainder = len % 64;
+
+        for chunk_idx in 0..chunks_count {
+            let base = chunk_idx * 64;
+            if chunk(&values, out, &mut f, base, 64) {
+                return Err(attribute_failure_no_mask(&values, base, 64, &mut f));
+            }
+        }
+        if remainder != 0 {
+            let base = chunks_count * 64;
+            if chunk(&values, out, &mut f, base, remainder) {
+                return Err(attribute_failure_no_mask(&values, base, remainder, &mut f));
+            }
+        }
+        Ok(())
+    }
+}
+
+impl<S: IndexedSource> IndexedSourceExt for S {}
+
+/// Shared cold scan: walks a chunk, returns the first lane index where
+/// `lane_fails(bit_idx, value)` returns `true`. Used by
+/// [`attribute_failure_no_mask`].
+///
+/// Caller guarantees `base + chunk_len <= values.len()`.
+#[cold]
+#[inline(never)]
+fn cold_scan<S>(
+    values: &S,
+    base: usize,
+    chunk_len: usize,
+    mut lane_fails: impl FnMut(usize /* bit_idx */, S::Item) -> bool,
+) -> usize
+where
+    S: IndexedSource,
+{
+    for bit_idx in 0..chunk_len {
+        let idx = base + bit_idx;
+        // SAFETY: caller guarantees idx < values.len().
+        let val = unsafe { values.get_unchecked(idx) };
+        if lane_fails(bit_idx, val) {
+            return idx;
+        }
+    }
+    unreachable!("cold_scan called without a failing lane")
+}
+
+/// Cold attribution for the no-mask variant. Replays `f` over the chunk to find
+/// the first lane that returns `None`.
+#[inline]
+fn attribute_failure_no_mask<S, R, F>(values: &S, base: usize, chunk_len: usize, f: &mut F) -> usize
+where
+    S: IndexedSource,
+    F: FnMut(S::Item) -> Option<R>,
+{
+    cold_scan(values, base, chunk_len, |_bit_idx, val| f(val).is_none())
+}
+
+/// Extension trait providing in-place lane-kernel methods on any [`IndexedSink`].
+///
+/// All methods have default implementations and are inherited via the blanket
+/// `impl<S: IndexedSink> IndexedSinkExt for S` below. Bring the trait into scope
+/// (`use vortex_buffer::lane_ops_indexed::IndexedSinkExt;`) to call them with
+/// method syntax.
+pub trait IndexedSinkExt: IndexedSink + Sized {
+    /// In-place counterpart of [`IndexedSourceExt::map_into`]. Each lane
+    /// is replaced with `f(self[i])`.
+    ///
+    /// The closure reads `Self::Item` and returns `Self::Write`. For the common
+    /// case `Self = &mut [T]` both are `T`; for [`ReinterpretSink`] the read and
+    /// write types can differ (e.g. read `f32`, write `u32`) over the same
+    /// backing memory when sizes and alignments match.
+    ///
+    /// As with [`IndexedSourceExt::map_into`], use this only when the
+    /// input is known non-nullable.
+    #[inline]
+    fn map_into_in_place<F>(self, mut f: F)
+    where
+        F: FnMut(Self::Item) -> Self::Write,
+    {
+        #[inline(always)]
+        fn chunk<S, F>(values: &mut S, f: &mut F, base: usize, count: usize)
+        where
+            S: IndexedSink,
+            F: FnMut(S::Item) -> S::Write,
+        {
+            for bit_idx in 0..count {
+                let idx = base + bit_idx;
+                // SAFETY: caller guarantees base + count <= len.
+                let val = unsafe { values.get_unchecked(idx) };
+                let result = f(val);
+                // SAFETY: caller guarantees base + count <= len.
+                unsafe { values.set_unchecked(idx, result) };
+            }
+        }
+
+        let mut values = self;
+        let len = values.len();
+        let chunks_count = len / 64;
+        let remainder = len % 64;
+
+        for chunk_idx in 0..chunks_count {
+            chunk(&mut values, &mut f, chunk_idx * 64, 64);
+        }
+        if remainder != 0 {
+            chunk(&mut values, &mut f, chunks_count * 64, remainder);
+        }
+    }
+
+    /// In-place counterpart of [`IndexedSourceExt::try_map_into`]. Each
+    /// lane is replaced with `f(self[i])`, or `Self::Write::default()` when `f`
+    /// returns `None`. On failure returns `Err(first_failing_lane)`; the buffer
+    /// state on `Err` is unspecified.
+    ///
+    /// ## Error attribution
+    ///
+    /// Per-lane `is_none()` flags are bit-packed into a `u64` at the lane's
+    /// position — `fail_bits |= (opt.is_none() as u64) << bit_idx`. After the
+    /// 64-lane loop, `trailing_zeros()` of `fail_bits` recovers the first
+    /// failing lane index. `OR + shift` per lane is friendlier to the
+    /// autovectorizer than `min`/`csel` — see [`try_map_masked_in_place`] for
+    /// the same scheme over a masked variant.
+    ///
+    /// [`try_map_masked_in_place`]: IndexedSinkExt::try_map_masked_in_place
+    #[inline]
+    fn try_map_in_place<F>(self, mut f: F) -> Result<(), usize>
+    where
+        Self::Write: Default,
+        F: FnMut(Self::Item) -> Option<Self::Write>,
+    {
+        #[inline(always)]
+        fn chunk<S, F>(values: &mut S, base: usize, count: usize, f: &mut F) -> Option<usize>
+        where
+            S: IndexedSink,
+            S::Write: Default,
+            F: FnMut(S::Item) -> Option<S::Write>,
+        {
+            let mut fail_bits: u64 = 0;
+            for bit_idx in 0..count {
+                let idx = base + bit_idx;
+                // SAFETY: caller guarantees base + count <= len.
+                let val = unsafe { values.get_unchecked(idx) };
+                let opt = f(val);
+                fail_bits |= (opt.is_none() as u64) << bit_idx;
+                let result = opt.unwrap_or_default();
+                // SAFETY: caller guarantees base + count <= len.
+                unsafe { values.set_unchecked(idx, result) };
+            }
+            (fail_bits != 0).then_some(base + fail_bits.trailing_zeros() as usize)
+        }
+
+        let mut values = self;
+        let len = values.len();
+        let chunks_count = len / 64;
+        let remainder = len % 64;
+
+        for chunk_idx in 0..chunks_count {
+            if let Some(failing) = chunk(&mut values, chunk_idx * 64, 64, &mut f) {
+                return Err(failing);
+            }
+        }
+        if remainder != 0
+            && let Some(failing) = chunk(&mut values, chunks_count * 64, remainder, &mut f)
+        {
+            return Err(failing);
+        }
+        Ok(())
+    }
+
+    /// In-place counterpart of [`IndexedSourceExt::try_map_masked_into`]. Each
+    /// lane of `self` is replaced with `f(self[i])`, or `Self::Write::default()`
+    /// if `f` returned `None`. On failure returns `Err(first_failing_lane)`;
+    /// lanes before that point have been written, and lanes within the failing
+    /// chunk hold their unwrapped-or-default result. The buffer state on `Err`
+    /// is intentionally unspecified.
+    ///
+    /// **Null-lane failures are filtered automatically** — same semantics as
+    /// [`try_map_masked_into`]. The closure has no `valid` parameter; the kernel
+    /// AND-combines `is_none()` with the chunk's validity bitmap before folding
+    /// it into the attribution accumulator.
+    ///
+    /// ## Error attribution
+    ///
+    /// Per-lane `(is_none && valid)` flags are folded into `first_fail` via a
+    /// branchless `min` of `(if is_none && valid { i as u32 } else { u32::MAX })`.
+    /// After the 64-lane loop, `first_fail` holds the smallest valid failing index
+    /// in the chunk (or `MAX` if none). Vectorizes to NEON `bsl.16b` + `umin.4s`
+    /// on AArch64. The cold replay scheme used by [`try_map_masked_into`] isn't
+    /// viable here because the original input values have already been
+    /// overwritten by the time we would attribute the failure.
+    ///
+    /// ## Why in-place is slower at cache-resident sizes
+    ///
+    /// At sizes that fit in L1/L2 the in-place kernel is ~1.5× slower than the
+    /// out-of-place kernel despite having half the memory traffic, because
+    /// input and output share memory and the compiler must be conservative
+    /// reordering loads/stores across iterations. At sizes that exceed L2 the
+    /// in-place kernel wins back the gap by avoiding the second buffer's DRAM
+    /// read+write traffic.
+    ///
+    /// [`try_map_masked_into`]: IndexedSourceExt::try_map_masked_into
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self.len() != mask.len()`.
+    #[inline]
+    fn try_map_masked_in_place<F>(self, mask: &BitBuffer, mut f: F) -> Result<(), usize>
+    where
+        Self::Write: Default,
+        F: FnMut(Self::Item) -> Option<Self::Write>,
+    {
+        /// Bit-pack `is_none()` flags per lane, then AND with `src_chunk` post-loop to
+        /// drop null-lane failures — identical scheme to [`try_map_masked_into`]. The
+        /// per-lane attribution work is `OR + shift` (no `min`/`csel`), giving LLVM more
+        /// freedom to vectorize the value pipeline.
+        #[inline(always)]
+        fn chunk<S, F>(
+            values: &mut S,
+            src_chunk: u64,
+            base: usize,
+            count: usize,
+            f: &mut F,
+        ) -> Option<usize>
+        where
+            S: IndexedSink,
+            S::Write: Default,
+            F: FnMut(S::Item) -> Option<S::Write>,
+        {
+            let mut fail_bits: u64 = 0;
+            for bit_idx in 0..count {
+                let idx = base + bit_idx;
+                // SAFETY: caller guarantees `base + count <= values.len()`.
+                let val = unsafe { values.get_unchecked(idx) };
+                let opt = f(val);
+                fail_bits |= (opt.is_none() as u64) << bit_idx;
+                let result = opt.unwrap_or_default();
+                unsafe { values.set_unchecked(idx, result) };
+            }
+            let valid_failures = fail_bits & src_chunk;
+            (valid_failures != 0).then_some(base + valid_failures.trailing_zeros() as usize)
+        }
+
+        let mut values = self;
+        let len = values.len();
+        assert_eq!(len, mask.len(), "values and mask must have the same length");
+
+        let chunks = mask.chunks();
+        let chunks_count = len / 64;
+        let remainder = len % 64;
+
+        for (chunk_idx, src_chunk) in chunks.iter().enumerate() {
+            if let Some(failing) = chunk(&mut values, src_chunk, chunk_idx * 64, 64, &mut f) {
+                return Err(failing);
+            }
+        }
+        if remainder != 0
+            && let Some(failing) = chunk(
+                &mut values,
+                chunks.remainder_bits(),
+                chunks_count * 64,
+                remainder,
+                &mut f,
+            )
+        {
+            return Err(failing);
+        }
+        Ok(())
+    }
+}
+
+impl<S: IndexedSink> IndexedSinkExt for S {}
+
+#[cfg(test)]
+#[allow(clippy::cast_possible_truncation)]
+mod tests {
+    use super::*;
+    use crate::BitBufferMut;
+
+    fn write_t<T: Copy>(out: Vec<MaybeUninit<T>>) -> Vec<T> {
+        // SAFETY: tests always fully initialize the buffer.
+        unsafe { std::mem::transmute(out) }
+    }
+
+    #[test]
+    fn try_map_masked_into_all_ok() {
+        let values: Vec<u64> = (0..200).collect();
+        let mask = BitBuffer::new_set(200);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
+        assert!(res.is_ok());
+        let got = write_t(out);
+        assert_eq!(got, (0..200u32).collect::<Vec<_>>());
+    }
+
+    #[test]
+    fn try_map_masked_into_overflow_fails() {
+        // Put an overflowing value at lane 137 — the kernel must report Err(137).
+        let mut values: Vec<u64> = (0..200).collect();
+        values[137] = (u32::MAX as u64) + 1;
+        let mask = BitBuffer::new_set(200);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
+        assert_eq!(res, Err(137));
+    }
+
+    #[test]
+    fn try_map_masked_into_overflow_reports_first_failing_lane() {
+        // Multiple failing lanes — must report the lowest index.
+        let mut values: Vec<u64> = (0..200).collect();
+        values[50] = u64::MAX;
+        values[51] = u64::MAX;
+        values[137] = u64::MAX;
+        let mask = BitBuffer::new_set(200);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
+        assert_eq!(res, Err(50));
+    }
+
+    #[test]
+    fn try_map_masked_into_value_only_closure_filters_null_overflow() {
+        // `|v, _|` closure that ignores validity. A null lane with an overflowing
+        // value MUST NOT cause Err — the kernel's cold-path mask filter rescues us.
+        let mut values: Vec<u64> = (0..200).collect();
+        values[5] = u64::MAX; // null lane with overflowing value
+        values[42] = u64::MAX; // null lane with overflowing value
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(200);
+            for i in 0..200 {
+                m.append(i != 5 && i != 42);
+            }
+            m.freeze()
+        };
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
+        assert!(
+            res.is_ok(),
+            "null-lane overflow should be filtered by the cold path"
+        );
+    }
+
+    #[test]
+    fn try_map_masked_into_value_only_closure_reports_first_valid_failure() {
+        // Valid lane overflow must propagate — and the reported index must be
+        // the lowest VALID failing lane, even if earlier null lanes also "failed"
+        // their unconditional cast.
+        let mut values: Vec<u64> = (0..200).collect();
+        values[5] = u64::MAX; // null lane — filtered out
+        values[42] = u64::MAX; // null lane — filtered out
+        values[77] = u64::MAX; // VALID lane — should be reported
+        values[100] = u64::MAX; // VALID lane — higher index, ignored
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(200);
+            for i in 0..200 {
+                m.append(i != 5 && i != 42);
+            }
+            m.freeze()
+        };
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
+        assert_eq!(res, Err(77));
+    }
+
+    #[test]
+    fn try_map_masked_into_null_lane_bypasses_check() {
+        // Null lanes are neutralized by `valid as u64` before the range check, so an
+        // out-of-range value at a null lane must NOT trigger failure.
+        let mut values: Vec<u64> = (0..200).collect();
+        values[5] = u64::MAX;
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(200);
+            for i in 0..200 {
+                m.append(i != 5);
+            }
+            m.freeze()
+        };
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 200];
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
+        assert!(res.is_ok());
+        let got = write_t(out);
+        assert_eq!(got[5], 0); // null-lane wrote default
+        assert_eq!(got[6], 6);
+    }
+
+    #[test]
+    fn try_map_masked_into_branchful_matches_branchless() {
+        let mut values: Vec<u64> = (0..130).map(|i| i as u64 * 7).collect();
+        values[2] = u64::MAX;
+        values[65] = u32::MAX as u64;
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(130);
+            for i in 0..130 {
+                m.append(!matches!(i, 2 | 17 | 99));
+            }
+            m.freeze()
+        };
+
+        let mut branchless = vec![MaybeUninit::<u32>::uninit(); 130];
+        let mut branchful = vec![MaybeUninit::<u32>::uninit(); 130];
+        values
+            .as_slice()
+            .try_map_masked_into(&mask, &mut branchless, |v| {
+                (v <= u32::MAX as u64).then_some(v as u32)
+            })
+            .unwrap();
+        values
+            .as_slice()
+            .try_map_masked_into(&mask, &mut branchful, |v| u32::try_from(v).ok())
+            .unwrap();
+
+        assert_eq!(write_t(branchful), write_t(branchless));
+    }
+
+    #[test]
+    fn try_map_masked_into_partial_chunk() {
+        let values: Vec<u64> = (0..130).collect();
+        let mask = BitBuffer::new_set(130);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
+        assert!(res.is_ok());
+        let got = write_t(out);
+        assert_eq!(got.len(), 130);
+        assert_eq!(got[129], 129);
+    }
+
+    #[test]
+    fn try_map_masked_into_sliced_mask_unaligned_offset() {
+        // The mask's first byte is not word-aligned: slice off 13 bits, so the
+        // underlying BitChunks iterator must shift across byte boundaries on every
+        // 64-bit chunk it yields.
+        let big = BitBuffer::new_set(256);
+        let mask = big.slice(13..143); // logical len = 130, bit offset = 13 % 8 = 5
+        assert_eq!(mask.len(), 130);
+
+        let values: Vec<u64> = (0..130).collect();
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
+        assert!(res.is_ok());
+        let got = write_t(out);
+        assert_eq!(got, (0..130u32).collect::<Vec<_>>());
+    }
+
+    #[test]
+    fn try_map_masked_into_sliced_mask_with_overflow() {
+        // Sliced mask + overflowing value — the cold attribution path must report
+        // the correct lane index in the sliced (post-offset) coordinate space.
+        let big = BitBuffer::new_set(256);
+        let mask = big.slice(13..143);
+        assert_eq!(mask.len(), 130);
+
+        let mut values: Vec<u64> = (0..130).collect();
+        values[77] = u64::MAX;
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
+        assert_eq!(res, Err(77));
+    }
+
+    #[test]
+    fn try_map_masked_into_sliced_mask_null_lanes() {
+        // Mix sliced offset with a non-trivial validity pattern. Null lanes must
+        // not contribute to fail_acc, even when their underlying value would overflow.
+        let mut m = BitBufferMut::with_capacity(256);
+        for i in 0..256 {
+            m.append(i % 3 != 0);
+        }
+        let big = m.freeze();
+        let mask = big.slice(13..143);
+        assert_eq!(mask.len(), 130);
+
+        // After the 13-lane slice, original index `13 + j` becomes lane `j`.
+        // Lane `j` is valid iff `(13 + j) % 3 != 0`.
+        let mut values: Vec<u64> = (0..130).collect();
+        // Pick a lane that is INVALID in the sliced coords: 13+2 = 15, 15 % 3 == 0 → invalid.
+        // Stuff in an overflowing value; it must be neutralized by `* valid as u64`.
+        values[2] = u64::MAX;
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
+        assert!(res.is_ok(), "null lane should bypass the range check");
+    }
+
+    #[test]
+    fn try_map_masked_into_overflow_in_remainder() {
+        // Overflow in the trailing partial chunk (not aligned to 64).
+        let mut values: Vec<u64> = (0..130).collect();
+        values[129] = (u32::MAX as u64) + 1;
+        let mask = BitBuffer::new_set(130);
+        let mut out = vec![MaybeUninit::<u32>::uninit(); 130];
+        let res = values.as_slice().try_map_masked_into(&mask, &mut out, |v| {
+            (v <= u32::MAX as u64).then_some(v as u32)
+        });
+        assert_eq!(res, Err(129));
+    }
+
+    #[test]
+    fn try_map_masked_in_place_all_ok() {
+        let mut values: Vec<u32> = (0..200).collect();
+        let mask = BitBuffer::new_set(200);
+        let res = values
+            .as_mut_slice()
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
+        assert!(res.is_ok());
+        let expected: Vec<u32> = (0..200u32).map(|v| v * 2).collect();
+        assert_eq!(values, expected);
+    }
+
+    #[test]
+    fn try_map_masked_in_place_first_failing_chunk_wins() {
+        let mut values: Vec<u32> = (0..200).collect();
+        values[83] = u32::MAX;
+        values[150] = u32::MAX;
+        let mask = BitBuffer::new_set(200);
+        let res = values
+            .as_mut_slice()
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
+        assert_eq!(res, Err(83));
+    }
+
+    #[test]
+    fn try_map_masked_in_place_within_chunk_reports_lowest() {
+        let mut values: Vec<u32> = (0..200).collect();
+        values[80] = u32::MAX;
+        values[100] = u32::MAX;
+        let mask = BitBuffer::new_set(200);
+        let res = values
+            .as_mut_slice()
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
+        assert_eq!(res, Err(80));
+    }
+
+    #[test]
+    fn try_map_masked_in_place_single_failure_lane_exact() {
+        let mut values: Vec<u32> = (0..200).collect();
+        values[42] = u32::MAX;
+        let mask = BitBuffer::new_set(200);
+        let res = values
+            .as_mut_slice()
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
+        assert_eq!(res, Err(42));
+    }
+
+    #[test]
+    fn try_map_masked_in_place_null_bypass() {
+        let mut values: Vec<u32> = (0..200).collect();
+        values[5] = u32::MAX;
+        let mask = {
+            let mut m = BitBufferMut::with_capacity(200);
+            for i in 0..200 {
+                m.append(i != 5);
+            }
+            m.freeze()
+        };
+        let res = values
+            .as_mut_slice()
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
+        assert!(res.is_ok(), "null-lane overflow should be filtered");
+        // Null lane was overwritten with default (0).
+        assert_eq!(values[5], 0);
+        assert_eq!(values[6], 12);
+    }
+
+    #[test]
+    fn try_map_masked_in_place_remainder_overflow() {
+        let mut values: Vec<u32> = (0..130).collect();
+        values[129] = u32::MAX;
+        let mask = BitBuffer::new_set(130);
+        let res = values
+            .as_mut_slice()
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
+        assert_eq!(res, Err(129));
+    }
+
+    #[test]
+    fn try_map_masked_in_place_sliced_mask() {
+        let big = BitBuffer::new_set(256);
+        let mask = big.slice(13..143);
+        assert_eq!(mask.len(), 130);
+
+        let mut values: Vec<u32> = (0..130).collect();
+        values[77] = u32::MAX;
+        let res = values
+            .as_mut_slice()
+            .try_map_masked_in_place(&mask, |v| v.checked_mul(2));
+        assert_eq!(res, Err(77));
+    }
+
+    #[test]
+    fn reinterpret_sink_same_width_f32_u32() {
+        // Read f32, write u32-bits in place. After transmuting the slice back to u32 we
+        // should see exactly the bit patterns the closure produced.
+        let mut buf: Vec<f32> = (0..130).map(|i| i as f32).collect();
+        let mask = BitBuffer::new_set(130);
+        ReinterpretSink::<f32, u32>::new(buf.as_mut_slice())
+            .try_map_masked_in_place(&mask, |f| Some(f.to_bits().wrapping_add(1)))
+            .unwrap();
+        // SAFETY: same size + alignment for f32 and u32; every slot now holds a u32 written by
+        // the closure.
+        let as_u32: &[u32] =
+            unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const u32, buf.len()) };
+        for (i, &got) in as_u32.iter().enumerate() {
+            assert_eq!(got, (i as f32).to_bits().wrapping_add(1), "lane {i}");
+        }
+    }
+
+    #[test]
+    fn reinterpret_sink_failure_reports_lane() {
+        // Closure fails at a specific lane; the kernel must report that lane index.
+        let mut buf: Vec<f32> = (0..200).map(|i| i as f32).collect();
+        let mask = BitBuffer::new_set(200);
+        let res = ReinterpretSink::<f32, u32>::new(buf.as_mut_slice()).try_map_masked_in_place(
+            &mask,
+            |f| {
+                if f as u32 == 137 {
+                    None
+                } else {
+                    Some(f as u32)
+                }
+            },
+        );
+        assert_eq!(res, Err(137));
+    }
+
+    #[test]
+    fn try_map_masked_in_place_partial_chunk_success() {
+        let mut values: Vec<u32> = (0..130).collect();
+        let mask = BitBuffer::new_set(130);
+        let res = values
+            .as_mut_slice()
+            .try_map_masked_in_place(&mask, |v| Some(v + 1));
+        assert!(res.is_ok());
+        assert_eq!(values[0], 1);
+        assert_eq!(values[63], 64);
+        assert_eq!(values[64], 65);
+        assert_eq!(values[129], 130);
+    }
+}
diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs
index 8319fffa387..667a1f11a9d 100644
--- a/vortex-buffer/src/lib.rs
+++ b/vortex-buffer/src/lib.rs
@@ -62,6 +62,7 @@ mod buffer_mut;
 mod bytes;
 mod r#const;
 mod debug;
+pub mod lane_kernels;
 mod macros;
 #[cfg(feature = "memmap2")]
 mod memmap2;