From 329d81c97e41016e1b8cb9882b6bd228bb13873e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 14 May 2026 14:06:54 +0000
Subject: [PATCH 1/2] Eq/NotEq compare-constant fast path for bit-packed arrays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A bit-packed lane holds values in `[0, 2^bit_width - 1]`. When the RHS constant
sits outside that range, no packed lane can equal it, so:

  Eq    -> false everywhere
  NotEq -> true  everywhere

modulo patches (which carry the real value) and validity. Detecting the range
is an `O(1)` `i128` check on the constant alone — strictly cheaper than
encoding `c` into the bit-packed representation.

Register a `CompareKernel` for `BitPacked` that short-circuits this case. With
no patches and no nulls it returns a `ConstantArray<bool>` (also `O(1)`);
otherwise it allocates a `BitBuffer`, fills it with the constant result, and
overlays the per-position outcome at each patch index. Ordering operators
(`Lt`/`Lte`/`Gt`/`Gte`) and in-range constants fall through to the canonical
decompress + Arrow compare path; tests exercise both fall-throughs.

Signed-off-by: Claude <noreply@anthropic.com>
---
 encodings/fastlanes/public-api.lock           |   4 +
 .../src/bitpacking/compute/compare.rs         | 323 ++++++++++++++++++
 .../fastlanes/src/bitpacking/compute/mod.rs   |   1 +
 .../src/bitpacking/vtable/kernels.rs          |   2 +
 4 files changed, 330 insertions(+)
 create mode 100644 encodings/fastlanes/src/bitpacking/compute/compare.rs
diff --git a/encodings/fastlanes/public-api.lock b/encodings/fastlanes/public-api.lock
index 4f0ce3df18c..527dda21442 100644
--- a/encodings/fastlanes/public-api.lock
+++ b/encodings/fastlanes/public-api.lock
@@ -190,6 +190,10 @@ impl vortex_array::arrays::slice::SliceReduce for vortex_fastlanes::BitPacked
 
 pub fn vortex_fastlanes::BitPacked::slice(vortex_array::array::view::ArrayView<'_, Self>, core::ops::range::Range<usize>) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::erased::ArrayRef>>
 
+impl vortex_array::scalar_fn::fns::binary::compare::CompareKernel for vortex_fastlanes::BitPacked
+
+pub fn vortex_fastlanes::BitPacked::compare(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::operators::CompareOperator, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::erased::ArrayRef>>
+
 impl vortex_array::scalar_fn::fns::cast::kernel::CastKernel for vortex_fastlanes::BitPacked
 
 pub fn vortex_fastlanes::BitPacked::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::erased::ArrayRef>>
diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs
new file mode 100644
index 00000000000..900b7bd9f60
--- /dev/null
+++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Fast-path `Eq` / `NotEq` comparison against a constant.
+//!
+//! When the constant cannot fit in the packable range `[0, 2^bit_width - 1]`, no value
+//! stored in the packed buffer can equal it, so:
+//!
+//! * `Eq`    → every position is `false` (modulo patches/validity).
+//! * `NotEq` → every position is `true`  (modulo patches/validity).
+//!
+//! Detecting this is an `O(1)` range check on the constant — strictly cheaper than
+//! encoding `c` into the bit-packed representation. The check is layout-agnostic and
+//! does not touch the packed buffer.
+//!
+//! In-range constants and ordering operators (`Lt`/`Lte`/`Gt`/`Gte`) currently fall
+//! through to the canonical decompress + Arrow compare path.
+
+use num_traits::ToPrimitive;
+use vortex_array::ArrayRef;
+use vortex_array::ArrayView;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::primitive::NativeValue;
+use vortex_array::dtype::IntegerPType;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::Nullability;
+use vortex_array::match_each_integer_ptype;
+use vortex_array::match_each_unsigned_integer_ptype;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar_fn::fns::binary::CompareKernel;
+use vortex_array::scalar_fn::fns::operators::CompareOperator;
+use vortex_buffer::BitBufferMut;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+
+use crate::BitPacked;
+use crate::BitPackedArrayExt;
+
+impl CompareKernel for BitPacked {
+    fn compare(
+        lhs: ArrayView<'_, Self>,
+        rhs: &ArrayRef,
+        operator: CompareOperator,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<ArrayRef>> {
+        // Only `Eq` / `NotEq` are accelerated here. Ordering operators (`Lt`, `Lte`, `Gt`,
+        // `Gte`) need either a SWAR less-than over the packed bytes or unpack-then-compare;
+        // both are out of scope for this commit and fall through to the canonical path.
+        if !matches!(operator, CompareOperator::Eq | CompareOperator::NotEq) {
+            return Ok(None);
+        }
+
+        let Some(constant) = rhs.as_constant() else {
+            return Ok(None);
+        };
+        let Some(constant) = constant.as_primitive_opt() else {
+            return Ok(None);
+        };
+
+        match_each_integer_ptype!(constant.ptype(), |T| {
+            compare_eq_constant::<T>(
+                lhs,
+                constant
+                    .typed_value::<T>()
+                    .vortex_expect("null scalar handled in adaptor"),
+                rhs.dtype().nullability(),
+                operator,
+                ctx,
+            )
+        })
+    }
+}
+
+/// Returns `true` if `constant` cannot fit in the packable range `[0, 2^bit_width - 1]`.
+///
+/// `O(1)` check on the constant; never inspects the packed buffer.
+#[inline]
+fn constant_out_of_packable_range<T>(constant: T, bit_width: u8) -> bool
+where
+    T: NativePType + ToPrimitive,
+{
+    let Some(c) = constant.to_i128() else {
+        return false;
+    };
+    let max = (1i128 << bit_width) - 1;
+    c < 0 || c > max
+}
+
+fn compare_eq_constant<T>(
+    lhs: ArrayView<'_, BitPacked>,
+    constant: T,
+    rhs_nullability: Nullability,
+    operator: CompareOperator,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<ArrayRef>>
+where
+    T: NativePType + ToPrimitive,
+{
+    if !constant_out_of_packable_range(constant, lhs.bit_width()) {
+        // Constant fits in the packable range, so at least some packed lanes could match
+        // it. The fast path doesn't apply.
+        return Ok(None);
+    }
+
+    // Every packed lane disagrees with `constant`. `Eq` is `false` everywhere, `NotEq` is
+    // `true` everywhere — modulo patches (which carry the real value) and validity.
+    let packed_lane_result = matches!(operator, CompareOperator::NotEq);
+    let len = lhs.len();
+    let validity = lhs.validity()?;
+    let patches = lhs.patches();
+    let result_nullability = lhs.dtype().nullability() | rhs_nullability;
+
+    // Hot path: no patches, no nulls — every position has the same boolean result, so we
+    // return a `ConstantArray<bool>` in `O(1)`.
+    if patches.is_none() && validity.no_nulls() {
+        return Ok(Some(
+            ConstantArray::new(Scalar::bool(packed_lane_result, result_nullability), len)
+                .into_array(),
+        ));
+    }
+
+    let mut bits = BitBufferMut::full(packed_lane_result, len);
+
+    if let Some(patches) = patches {
+        let indices = patches.indices().clone().execute::<PrimitiveArray>(ctx)?;
+        let values = patches.values().clone().execute::<PrimitiveArray>(ctx)?;
+        let patches_offset = patches.offset();
+
+        match_each_unsigned_integer_ptype!(indices.ptype(), |I| {
+            apply_eq_patches::<T, I>(
+                &mut bits,
+                indices.as_slice::<I>(),
+                values.as_slice::<T>(),
+                patches_offset,
+                operator,
+                constant,
+            );
+        });
+    }
+
+    let validity = validity.union_nullability(rhs_nullability);
+    Ok(Some(BoolArray::new(bits.freeze(), validity).into_array()))
+}
+
+fn apply_eq_patches<T, I>(
+    bits: &mut BitBufferMut,
+    indices: &[I],
+    values: &[T],
+    indices_offset: usize,
+    operator: CompareOperator,
+    constant: T,
+) where
+    T: NativePType,
+    I: IntegerPType,
+{
+    // Only Eq/NotEq reach this point (see `CompareKernel::compare`).
+    let cmp: fn(T, T) -> bool = match operator {
+        CompareOperator::Eq => |l, r| NativeValue(l) == NativeValue(r),
+        CompareOperator::NotEq => |l, r| NativeValue(l) != NativeValue(r),
+        _ => unreachable!("only Eq/NotEq reach the bitpacked compare-constant fast path"),
+    };
+
+    let len = bits.len();
+    for (&raw_idx, &value) in indices.iter().zip(values.iter()) {
+        let i: usize = raw_idx.as_();
+        if i < indices_offset {
+            continue;
+        }
+        let pos = i - indices_offset;
+        if pos >= len {
+            break;
+        }
+        if cmp(value, constant) {
+            bits.set(pos);
+        } else {
+            bits.unset(pos);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::LazyLock;
+
+    use rstest::rstest;
+    use vortex_array::IntoArray;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::arrays::BoolArray;
+    use vortex_array::arrays::ConstantArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::assert_arrays_eq;
+    use vortex_array::builtins::ArrayBuiltins;
+    use vortex_array::dtype::DType;
+    use vortex_array::dtype::Nullability;
+    use vortex_array::scalar::Scalar;
+    use vortex_array::scalar_fn::fns::operators::Operator;
+    use vortex_array::session::ArraySession;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+    use vortex_session::VortexSession;
+
+    use crate::BitPackedArrayExt;
+    use crate::BitPackedData;
+    use crate::bitpacking::compute::compare::constant_out_of_packable_range;
+
+    static SESSION: LazyLock<VortexSession> =
+        LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
+
+    #[test]
+    fn range_check_is_o1() {
+        // 8-bit packable range is [0, 255].
+        assert!(constant_out_of_packable_range::<i32>(256, 8));
+        assert!(constant_out_of_packable_range::<i32>(-1, 8));
+        assert!(!constant_out_of_packable_range::<i32>(255, 8));
+        assert!(!constant_out_of_packable_range::<i32>(0, 8));
+    }
+
+    #[rstest]
+    #[case(Operator::Eq, false)]
+    #[case(Operator::NotEq, true)]
+    fn eq_above_range_no_patches(#[case] op: Operator, #[case] expected: bool) -> VortexResult<()> {
+        let mut ctx = SESSION.create_execution_ctx();
+        // 999 is above the 8-bit packable range; no packed lane matches.
+        let packed = BitPackedData::encode(
+            &PrimitiveArray::from_iter([1u32, 2, 3, 250, 100]).into_array(),
+            8,
+            &mut ctx,
+        )?;
+        let result = packed
+            .into_array()
+            .binary(ConstantArray::new(999u32, 5).into_array(), op)?
+            .execute::<BoolArray>(&mut ctx)?;
+        assert_arrays_eq!(result, BoolArray::from_iter([expected; 5]));
+        Ok(())
+    }
+
+    #[rstest]
+    #[case(Operator::Eq)]
+    #[case(Operator::NotEq)]
+    fn eq_above_range_with_patches(#[case] op: Operator) -> VortexResult<()> {
+        // bit_width=4 packable range is [0, 15]; out-of-range values become patches.
+        let mut ctx = SESSION.create_execution_ctx();
+        let values = buffer![1u32, 5, 1000, 7, 1000, 14];
+        let constant = 1000u32;
+
+        let packed = BitPackedData::encode(&values.clone().into_array(), 4, &mut ctx)?;
+        assert!(packed.patches().is_some());
+
+        let result = packed
+            .into_array()
+            .binary(ConstantArray::new(constant, values.len()).into_array(), op)?
+            .execute::<BoolArray>(&mut ctx)?;
+
+        let expected: Vec<bool> = values
+            .iter()
+            .map(|v| match op {
+                Operator::Eq => *v == constant,
+                Operator::NotEq => *v != constant,
+                _ => unreachable!(),
+            })
+            .collect();
+        assert_arrays_eq!(result, BoolArray::from_iter(expected));
+        Ok(())
+    }
+
+    #[test]
+    fn ordering_falls_through() -> VortexResult<()> {
+        // Ordering ops aren't accelerated yet; they go through the canonical path and
+        // must still return a correct answer.
+        let mut ctx = SESSION.create_execution_ctx();
+        let values = [1u32, 2, 3, 250, 100];
+        let packed =
+            BitPackedData::encode(&PrimitiveArray::from_iter(values).into_array(), 8, &mut ctx)?;
+        let result = packed
+            .into_array()
+            .binary(ConstantArray::new(999u32, 5).into_array(), Operator::Lt)?
+            .execute::<BoolArray>(&mut ctx)?;
+        assert_arrays_eq!(
+            result,
+            BoolArray::from_iter(values.iter().map(|v| *v < 999))
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn eq_in_range_falls_through() -> VortexResult<()> {
+        // In-range constants must defer to the canonical path.
+        let mut ctx = SESSION.create_execution_ctx();
+        let values = [1u32, 2, 3, 250, 100];
+        let packed =
+            BitPackedData::encode(&PrimitiveArray::from_iter(values).into_array(), 8, &mut ctx)?;
+        let result = packed
+            .into_array()
+            .binary(ConstantArray::new(100u32, 5).into_array(), Operator::Eq)?
+            .execute::<BoolArray>(&mut ctx)?;
+        assert_arrays_eq!(
+            result,
+            BoolArray::from_iter(values.iter().map(|v| *v == 100))
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn eq_nullable_constant() -> VortexResult<()> {
+        let mut ctx = SESSION.create_execution_ctx();
+        let packed = BitPackedData::encode(
+            &PrimitiveArray::from_iter([1u32, 2, 3]).into_array(),
+            4,
+            &mut ctx,
+        )?;
+        let rhs = ConstantArray::new(Scalar::primitive(999u32, Nullability::Nullable), 3);
+        let result = packed
+            .into_array()
+            .binary(rhs.into_array(), Operator::Eq)?
+            .execute::<BoolArray>(&mut ctx)?;
+        assert_eq!(result.dtype(), &DType::Bool(Nullability::Nullable));
+        Ok(())
+    }
+}
diff --git a/encodings/fastlanes/src/bitpacking/compute/mod.rs b/encodings/fastlanes/src/bitpacking/compute/mod.rs
index 2501d952356..169546d311c 100644
--- a/encodings/fastlanes/src/bitpacking/compute/mod.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/mod.rs
@@ -2,6 +2,7 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 mod cast;
+mod compare;
 mod filter;
 pub(crate) mod is_constant;
 mod slice;
diff --git a/encodings/fastlanes/src/bitpacking/vtable/kernels.rs b/encodings/fastlanes/src/bitpacking/vtable/kernels.rs
index cb020dc2ce9..29ec9ea8982 100644
--- a/encodings/fastlanes/src/bitpacking/vtable/kernels.rs
+++ b/encodings/fastlanes/src/bitpacking/vtable/kernels.rs
@@ -5,12 +5,14 @@ use vortex_array::arrays::dict::TakeExecuteAdaptor;
 use vortex_array::arrays::filter::FilterExecuteAdaptor;
 use vortex_array::arrays::slice::SliceExecuteAdaptor;
 use vortex_array::kernel::ParentKernelSet;
+use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor;
 use vortex_array::scalar_fn::fns::cast::CastExecuteAdaptor;
 
 use crate::BitPacked;
 
 pub(crate) const PARENT_KERNELS: ParentKernelSet<BitPacked> = ParentKernelSet::new(&[
     ParentKernelSet::lift(&CastExecuteAdaptor(BitPacked)),
+    ParentKernelSet::lift(&CompareExecuteAdaptor(BitPacked)),
     ParentKernelSet::lift(&FilterExecuteAdaptor(BitPacked)),
     ParentKernelSet::lift(&SliceExecuteAdaptor(BitPacked)),
     ParentKernelSet::lift(&TakeExecuteAdaptor(BitPacked)),

From a20f09db29de291de88e9cd1e66a93291783a9e9 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 14 May 2026 14:10:37 +0000
Subject: [PATCH 2/2] Extend bit-packed compare-constant to ordering, add
 bitpack_constant kernel and benches, plan in-range ordering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ordering operators (Lt/Lte/Gt/Gte) now use the same out-of-range short-circuit
as Eq/NotEq: when `c` lies outside `[0, 2^bit_width - 1]`, every packed lane has
the same `Ordering` relative to `c`, so each of the six operators collapses to
a constant boolean (modulo patches and validity).

Add a constant-only pack kernel `bitpack_constant` that builds the FastLanes
bit pattern for a `[constant; len]` input without calling `BitPacking::pack`.
For constant input every lane produces the same `bit_width` output words; we
compute those words analytically — each output word's `j`-th bit is bit
`(k * T_bits + j) mod bit_width` of `c` — then `memset` each word `LANES` times
into a stack chunk template and `memcpy` the template into every full chunk.
The standard packer is only invoked for the partial tail (zero-padded past
`len`). `bitpack_encode_constant` wraps the buffer up as a `BitPackedArray`.
A bitwise equivalence rstest covers byte-identity with `BitPacking::pack`
across lengths, widths, and constants.

Bench `bitpack_constant` (analytical vs full `bitpack_encode`) on a small,
fast grid: at 64 K u32 elements the analytical kernel is roughly 23-62x faster
than the full encoder, since it skips the histogram, min-scan, patches gather,
and per-chunk SIMD pack call.

Bench `bitpack_compare` (out-of-range fast path vs explicit
"decompress + Arrow compare" baseline): 1.4-1.5 µs constant-array setup vs
8-125 µs for the baseline across `bit_width ∈ {4, 16}`, `len ∈ {1024, 65536}`
and Eq/Lt.

Add a `value_fits_bit_width` helper on `BitPackedData` exposing the same O(1)
range check used internally.

Plan how to accelerate **in-range** ordering comparisons in
`encodings/fastlanes/docs/inrange_compare_plan.md`: compare the packed array
against the packed constant via SWAR less-than per supported bit width, derive
the four ordering operators from one `Lt` primitive, and benchmark against
the canonical SIMD baseline before landing.

Signed-off-by: Claude <noreply@anthropic.com>
---
 encodings/fastlanes/Cargo.toml                |   8 +
 .../fastlanes/benches/bitpack_compare.rs      | 108 +++++++++++
 .../fastlanes/benches/bitpack_constant.rs     |  53 ++++++
 .../fastlanes/docs/inrange_compare_plan.md    | 149 +++++++++++++++
 encodings/fastlanes/public-api.lock           |   6 +
 .../src/bitpacking/array/bitpack_compress.rs  | 169 +++++++++++++++++
 .../fastlanes/src/bitpacking/array/mod.rs     |  23 +++
 .../src/bitpacking/compute/compare.rs         | 175 +++++++++++-------
 8 files changed, 620 insertions(+), 71 deletions(-)
 create mode 100644 encodings/fastlanes/benches/bitpack_compare.rs
 create mode 100644 encodings/fastlanes/benches/bitpack_constant.rs
 create mode 100644 encodings/fastlanes/docs/inrange_compare_plan.md

diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml
index a14e19389bc..5eaf9fa4bf5 100644
--- a/encodings/fastlanes/Cargo.toml
+++ b/encodings/fastlanes/Cargo.toml
@@ -59,3 +59,11 @@ required-features = ["_test-harness"]
 name = "bit_transpose"
 harness = false
 required-features = ["_test-harness"]
+
+[[bench]]
+name = "bitpack_constant"
+harness = false
+
+[[bench]]
+name = "bitpack_compare"
+harness = false
diff --git a/encodings/fastlanes/benches/bitpack_compare.rs b/encodings/fastlanes/benches/bitpack_compare.rs
new file mode 100644
index 00000000000..939c401ee38
--- /dev/null
+++ b/encodings/fastlanes/benches/bitpack_compare.rs
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Compare an already-packed `BitPackedArray` against a constant value. Compares the
+//! out-of-range fast path (constant outside `[0, 2^bit_width - 1]`) against an explicit
+//! "decompress, then compare" baseline.
+//!
+//! Sized to finish quickly. Run with `cargo bench -p vortex-fastlanes --bench bitpack_compare`.
+
+#![expect(clippy::unwrap_used)]
+#![expect(clippy::cast_possible_truncation)]
+
+use divan::Bencher;
+use divan::counter::ItemsCount;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::builtins::ArrayBuiltins;
+use vortex_array::scalar_fn::fns::operators::Operator;
+use vortex_array::validity::Validity;
+use vortex_buffer::BufferMut;
+use vortex_fastlanes::BitPackedData;
+
+fn main() {
+    divan::main();
+}
+
+const LENS: &[usize] = &[1024, 64 * 1024];
+const BIT_WIDTHS: &[u8] = &[4, 16];
+
+/// Build a packed array of varied in-range values, plus an out-of-range constant RHS for
+/// the fast-path benches.
+fn build_inputs<const BW: u8>(len: usize) -> (ArrayRef, ArrayRef, ExecutionCtx) {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let buf: BufferMut<u32> = (0..len).map(|i| (i as u32) % (1 << BW)).collect();
+    let array = BitPackedData::encode(
+        &PrimitiveArray::new(buf.freeze(), Validity::NonNullable).into_array(),
+        BW,
+        &mut ctx,
+    )
+    .unwrap()
+    .into_array();
+    // 1 << BW is just past the packable range, so the out-of-range fast path fires.
+    let constant = 1u32 << BW;
+    let rhs = ConstantArray::new(constant, len).into_array();
+    (array, rhs, ctx)
+}
+
+#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
+fn fast_eq_out_of_range<const BW: u8>(bencher: Bencher, len: usize) {
+    let (array, rhs, mut ctx) = build_inputs::<BW>(len);
+    bencher.counter(ItemsCount::new(len)).bench_local(|| {
+        array
+            .clone()
+            .binary(rhs.clone(), Operator::Eq)
+            .unwrap()
+            .execute::<BoolArray>(&mut ctx)
+            .unwrap()
+    });
+}
+
+#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
+fn baseline_eq<const BW: u8>(bencher: Bencher, len: usize) {
+    let (array, rhs, mut ctx) = build_inputs::<BW>(len);
+    bencher.counter(ItemsCount::new(len)).bench_local(|| {
+        // What the fallback would do: materialize the unpacked primitive, then run Arrow
+        // compare on it.
+        let primitive = array.clone().execute::<PrimitiveArray>(&mut ctx).unwrap();
+        primitive
+            .into_array()
+            .binary(rhs.clone(), Operator::Eq)
+            .unwrap()
+            .execute::<BoolArray>(&mut ctx)
+            .unwrap()
+    });
+}
+
+#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
+fn fast_lt_out_of_range<const BW: u8>(bencher: Bencher, len: usize) {
+    let (array, rhs, mut ctx) = build_inputs::<BW>(len);
+    bencher.counter(ItemsCount::new(len)).bench_local(|| {
+        array
+            .clone()
+            .binary(rhs.clone(), Operator::Lt)
+            .unwrap()
+            .execute::<BoolArray>(&mut ctx)
+            .unwrap()
+    });
+}
+
+#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
+fn baseline_lt<const BW: u8>(bencher: Bencher, len: usize) {
+    let (array, rhs, mut ctx) = build_inputs::<BW>(len);
+    bencher.counter(ItemsCount::new(len)).bench_local(|| {
+        let primitive = array.clone().execute::<PrimitiveArray>(&mut ctx).unwrap();
+        primitive
+            .into_array()
+            .binary(rhs.clone(), Operator::Lt)
+            .unwrap()
+            .execute::<BoolArray>(&mut ctx)
+            .unwrap()
+    });
+}
diff --git a/encodings/fastlanes/benches/bitpack_constant.rs b/encodings/fastlanes/benches/bitpack_constant.rs
new file mode 100644
index 00000000000..a5e06e53972
--- /dev/null
+++ b/encodings/fastlanes/benches/bitpack_constant.rs
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Compare the fast constant bit-packing path against the standard `bitpack_encode`
+//! pipeline on a uniform-constant input.
+//!
+//! Sized to finish quickly. Run with `cargo bench -p vortex-fastlanes --bench bitpack_constant`.
+
+#![expect(clippy::unwrap_used)]
+
+use divan::Bencher;
+use divan::black_box;
+use divan::counter::ItemsCount;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::validity::Validity;
+use vortex_buffer::BufferMut;
+use vortex_fastlanes::bitpack_compress::bitpack_encode;
+use vortex_fastlanes::bitpack_compress::bitpack_encode_constant;
+
+fn main() {
+    divan::main();
+}
+
+const LENS: &[usize] = &[1024, 64 * 1024];
+const BIT_WIDTHS: &[u8] = &[4, 16];
+
+const CONSTANT: u32 = 7;
+
+#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
+fn full_encode<const BW: u8>(bencher: Bencher, len: usize) {
+    let buf: BufferMut<u32> = (0..len).map(|_| CONSTANT).collect();
+    let arr = PrimitiveArray::new(buf.freeze(), Validity::NonNullable);
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+
+    bencher
+        .counter(ItemsCount::new(len))
+        .bench_local(|| bitpack_encode(black_box(&arr), black_box(BW), None, &mut ctx).unwrap());
+}
+
+#[divan::bench(args = LENS, consts = BIT_WIDTHS)]
+fn fast_encode<const BW: u8>(bencher: Bencher, len: usize) {
+    bencher.counter(ItemsCount::new(len)).bench_local(|| {
+        bitpack_encode_constant::<u32>(
+            black_box(CONSTANT),
+            black_box(BW),
+            black_box(len),
+            Validity::NonNullable,
+        )
+        .unwrap()
+    });
+}
diff --git a/encodings/fastlanes/docs/inrange_compare_plan.md b/encodings/fastlanes/docs/inrange_compare_plan.md
new file mode 100644
index 00000000000..4399f83dc14
--- /dev/null
+++ b/encodings/fastlanes/docs/inrange_compare_plan.md
@@ -0,0 +1,149 @@
+# Plan: in-range constant compare for bit-packed arrays
+
+## Status today
+
+`encodings/fastlanes/src/bitpacking/compute/compare.rs` accelerates
+`BitPackedArray op ConstantArray` only when `c` is **outside** the packable range
+`[0, 2^bit_width - 1]`. That case reduces every packed lane to the same boolean
+under `op`, so the result is a `ConstantArray<bool>` (no work on the buffer) or a
+`BitBuffer` filled with that constant plus a per-position overlay at any patched
+indices.
+
+**In-range** constants (those that could equal a packed lane) fall through to the
+canonical "decompress to `PrimitiveArray`, then run Arrow's vectorized compare"
+path. For `Eq`/`NotEq`/`Lt`/`Lte`/`Gt`/`Gte` this is correct but does two SIMD
+passes' worth of work (unpack + compare) and writes the unpacked primitive to
+memory along the way.
+
+## Why the obvious approach (FastLanes `unpack_cmp`) doesn't win
+
+`fastlanes::BitPackingCompare::unchecked_unpack_cmp` fuses unpack + compare and
+emits `[bool; 1024]` without materializing the primitive array. It is
+`#[inline(never)]` and applies the comparator closure to every element
+individually. We tried wiring it in: at 65 K u32 elements (bit_width 4) the
+fused path measured ~170 µs against ~91 µs for the canonical "unpack then
+Arrow compare" path. Both Arrow's primitive compare and FastLanes' `unpack` are
+heavily SIMD-vectorized; the per-element closure call defeats vectorization in
+`unpack_cmp`. Reverted in commit `cc586c6`.
+
+## Proposal: bit-parallel compare on the packed buffer
+
+Pack the constant into a 1024-element template once (we already have a
+constant-only pack kernel in `bitpack_compress::bitpack_constant`, which
+synthesizes the FastLanes bit pattern analytically — no `BitPacking::pack`
+call). Then for each 1024-chunk of the input, do the comparison directly on the
+packed bytes via SIMD/SWAR. No materialization, less memory traffic
+(`~3W·128` bytes per chunk vs `12·1024` bytes for unpack + compare on `u32`),
+and the loop is fully vectorizable.
+
+### Equality (`Eq` / `NotEq`)
+
+The clean case.
+
+```
+diff = packed_chunk  ^  c_packed_chunk        // SIMD XOR per word
+eq_per_element = "every W-bit slot of diff is zero"
+```
+
+Per the FastLanes layout, lane `l`'s `W` output words contain bits
+`[k·T, (k+1)·T)` of the per-lane stream `c, c, c, …` for `k ∈ 0..W`. After
+XOR with the same-layout `c_packed`, element `r`'s `W` bits land at known
+positions inside the lane's `W` output words.
+
+* **`W` divides `T` (W ∈ {1, 2, 4, 8, 16, 32} for u32):** each element's `W`
+  bits are contained in a single output word. The classic SWAR "any byte is
+  zero" trick works for `W = 8`:
+  ```
+  let v = diff_word;
+  let zero_byte = !v & (v.wrapping_sub(0x01010101)) & 0x80808080;
+  // bit 7 of each byte set iff that byte was 0
+  ```
+  Analogous masks `0x55555555` (`W=2`), `0x11111111` (`W=4`),
+  `0x00010001` (`W=16`) cover the other power-of-2 widths.
+* **`W` does not divide `T` (e.g. 3, 5, 7, 9, 11, 13, 15):** elements straddle
+  word boundaries. The "OR-reduce W shifted copies" idea still applies but the
+  mask depends on the rotation; easiest implementation is per-width SWAR
+  unrolled at compile time via `match_each_bit_width!`.
+
+Pack the resulting per-element bits into the output `BitBuffer`. We already do
+this for the out-of-range short-circuit's patches overlay; the same code
+applies.
+
+### Ordering (`Lt` / `Lte` / `Gt` / `Gte`)
+
+The harder case. Two routes; pick one per width.
+
+#### Route A — SWAR less-than (preferred for `W ∈ {8, 16, 32}`)
+
+For `W = 8` and `u32` storage, each output word holds 4 packed elements as
+bytes. The classic SWAR unsigned less-than is:
+
+```
+let A = packed_word;
+let B = c_packed_word;                        // a constant per chunk
+let mask = 0x80808080;
+let lt = ((A | mask) - (B & !mask)) ^ ((A ^ B) | mask);
+let lt_bits = lt & mask;                      // high bit per byte = 1 iff A_byte < B_byte
+```
+
+Extract bit 7 of each byte (e.g. `_pext_u32(lt_bits, 0x80808080)` on BMI2, or a
+shift-and-mask sequence) and pack into the result `BitBuffer`. `W = 16` uses
+`0x80008000`; `W = 32` is the trivial single-element-per-word case.
+
+Derive the other three operators from `Lt`:
+* `Gt(a, c) = Lt(c, a)` → swap operands.
+* `Lte(a, c) = !Gt(a, c)` → SWAR less-than with swapped operands, then invert.
+* `Gte(a, c) = !Lt(a, c)` → invert.
+
+For `W = 4` the same SWAR pattern works on nibbles with mask `0x88888888`.
+
+#### Route B — bit-sliced compare (covers all `W`)
+
+Generic alternative: for each output word, treat the contained `W`-bit slots
+as a vertical stack of `T_bits / W` slot values, and run the standard
+bit-sliced comparator on the lane's `W` output words at once. This is
+layout-aware (uses the FastLanes lane order) but doesn't need per-width
+SWAR masks. Slower than Route A on widths Route A supports, but simpler to
+write and works uniformly.
+
+### Patches and validity
+
+Same overlay pattern as the out-of-range path: compute the per-position
+ordering bit from the packed buffer, then for each `(idx, value)` patch set the
+bit at `idx - patches.offset()` to `op(value, c)`. Apply the validity mask
+at the end via `BoolArray::new(bits, validity)`.
+
+### Sliced arrays
+
+`lhs.offset() != 0` means the first chunk's packed bytes do not align with
+element 0; defer to the canonical path until we have proper offset handling
+inside the SWAR loop (drop the first `offset` bits before writing).
+
+## Order of work
+
+1. **`Eq` in-range via XOR + SWAR zero-detect.** Add the per-width SWAR masks
+   for `W ∈ {1, 2, 4, 8, 16, 32}` first; widths in between can fall through
+   to canonical until step 3. NotEq is the same kernel inverted.
+2. **`Gt` / `Gte` in-range via SWAR less-than.** Land `W ∈ {8, 16, 32}`,
+   derive the four ordering operators from a single `Lt(a, b)` primitive.
+3. **Non-power-of-2 widths.** Pick Route B (bit-sliced compare) or
+   per-width SWAR; benchmark.
+4. **Sliced offsets and patches.** Handle `offset != 0` inside the SWAR loop
+   so we don't fall back on sliced inputs.
+
+Each step is independently shippable; the kernel already returns `Ok(None)`
+for any case it doesn't accelerate, so the canonical path remains the
+correctness fallback throughout.
+
+## Benchmarks to land alongside
+
+Add cases to `benches/bitpack_compare.rs` for an **in-range** constant
+(currently only the out-of-range fast path is benched there). Compare:
+
+* the SWAR fast path
+* the canonical "execute to `PrimitiveArray`, then Arrow compare" baseline
+
+across `bit_width ∈ {4, 8, 16}` and `len ∈ {1 024, 65 536}` for both `Eq`
+and `Gt`. We need to beat the baseline at 64 K to be worth landing —
+otherwise the canonical path's SIMD throughput is already the right answer
+and we should drop this idea.
diff --git a/encodings/fastlanes/public-api.lock b/encodings/fastlanes/public-api.lock
index 527dda21442..d63eb857ed8 100644
--- a/encodings/fastlanes/public-api.lock
+++ b/encodings/fastlanes/public-api.lock
@@ -18,8 +18,12 @@ pub mod vortex_fastlanes::bitpack_compress
 
 pub fn vortex_fastlanes::bitpack_compress::bit_width_histogram(vortex_array::array::view::ArrayView<'_, vortex_array::arrays::primitive::vtable::Primitive>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<alloc::vec::Vec<usize>>
 
+pub fn vortex_fastlanes::bitpack_compress::bitpack_constant<T: vortex_array::dtype::ptype::NativePType + fastlanes::bitpacking::BitPacking>(T, u8, usize) -> vortex_buffer::buffer::Buffer<T>
+
 pub fn vortex_fastlanes::bitpack_compress::bitpack_encode(&vortex_array::arrays::primitive::vtable::PrimitiveArray, u8, core::option::Option<&[usize]>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_fastlanes::BitPackedArray>
 
+pub fn vortex_fastlanes::bitpack_compress::bitpack_encode_constant<T: vortex_array::dtype::ptype::NativePType + fastlanes::bitpacking::BitPacking + num_traits::cast::ToPrimitive>(T, u8, usize, vortex_array::validity::Validity) -> vortex_error::VortexResult<vortex_fastlanes::BitPackedArray>
+
 pub unsafe fn vortex_fastlanes::bitpack_compress::bitpack_encode_unchecked(vortex_array::arrays::primitive::vtable::PrimitiveArray, u8) -> vortex_error::VortexResult<vortex_fastlanes::BitPackedArray>
 
 pub fn vortex_fastlanes::bitpack_compress::bitpack_primitive<T: vortex_array::dtype::ptype::NativePType + fastlanes::bitpacking::BitPacking>(&[T], u8) -> vortex_buffer::buffer::Buffer<T>
@@ -224,6 +228,8 @@ pub fn vortex_fastlanes::BitPackedData::try_new(vortex_array::buffer::BufferHand
 
 pub fn vortex_fastlanes::BitPackedData::unpacked_chunks<T: vortex_fastlanes::unpack_iter::BitPacked>(&self, &vortex_array::dtype::DType, usize) -> vortex_error::VortexResult<vortex_fastlanes::unpack_iter::BitUnpackedChunks<T>>
 
+pub fn vortex_fastlanes::BitPackedData::value_fits_bit_width<T: vortex_array::dtype::ptype::NativePType + num_traits::cast::ToPrimitive>(&self, T) -> core::option::Option<bool>
+
 impl core::clone::Clone for vortex_fastlanes::BitPackedData
 
 pub fn vortex_fastlanes::BitPackedData::clone(&self) -> vortex_fastlanes::BitPackedData
diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_compress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_compress.rs
index 2e517819059..acac34f4fae 100644
--- a/encodings/fastlanes/src/bitpacking/array/bitpack_compress.rs
+++ b/encodings/fastlanes/src/bitpacking/array/bitpack_compress.rs
@@ -194,6 +194,143 @@ pub fn bitpack_primitive<T: NativePType + BitPacking>(array: &[T], bit_width: u8
     output.freeze()
 }
 
+/// Build the bit-packed buffer for a `[constant; len]` input without calling the
+/// SIMD packer.
+///
+/// The FastLanes packing kernel runs `LANES` independent lane packers in parallel, each
+/// consuming `T = 8 * size_of::<T>()` input values and producing `bit_width` output words.
+/// When every input value equals `constant`, all `LANES` lane packers produce the same
+/// `bit_width` words. We compute those words analytically — looping over `T` bits per
+/// output word with a single `OR`/shift — then replicate the lane pattern across the
+/// chunk and the chunk pattern across the buffer with `memset`/`memcpy`. No call to
+/// `BitPacking::pack` is involved for any full chunk.
+///
+/// The trailing partial chunk (when `len % 1024 != 0`) is zero-padded past `len`, so it
+/// has a different pattern than the full template. It is built by re-using the analytical
+/// kernel only when `len % 1024` is itself a multiple of `T` (so the padded boundary
+/// aligns with a lane row); otherwise we fall back to a single `unchecked_pack` call for
+/// that final chunk only.
+///
+/// # Preconditions
+///
+/// * `constant` must fit in `bit_width`, i.e., `(constant as u64) < (1 << bit_width)`.
+/// * `0 < bit_width <= size_of::<T>() * 8`.
+pub fn bitpack_constant<T: NativePType + BitPacking>(
+    constant: T,
+    bit_width: u8,
+    len: usize,
+) -> Buffer<T> {
+    if bit_width == 0 || len == 0 {
+        return Buffer::<T>::empty();
+    }
+    let w = bit_width as usize;
+    let t_bits = 8 * size_of::<T>();
+    let lanes = 1024 / t_bits;
+    let packed_len = 128 * w / size_of::<T>();
+    debug_assert_eq!(packed_len, w * lanes);
+
+    let num_chunks = len.div_ceil(1024);
+    let num_full_chunks = len / 1024;
+
+    let mut output = BufferMut::<T>::with_capacity(num_chunks * packed_len);
+
+    if num_full_chunks > 0 {
+        // One full chunk's bit pattern: `w` distinct output words, each replicated `lanes`
+        // times. Build the template on the stack with `lane_word`-sized `memset`s, then
+        // `memcpy` it into the output for every full chunk.
+        let lane_words = constant_lane_words::<T>(constant, w);
+        let mut chunk: [T; 1024] = [T::zero(); 1024];
+        for (k, &word) in lane_words.iter().enumerate() {
+            chunk[k * lanes..(k + 1) * lanes].fill(word);
+        }
+        let template = &chunk[..packed_len];
+        for _ in 0..num_full_chunks {
+            output.extend_from_slice(template);
+        }
+    }
+
+    if num_chunks > num_full_chunks {
+        // Tail chunk gets zero-padded past `len % 1024`, so it differs from the full
+        // template. Use the standard packer for this single chunk.
+        let last_chunk_size = len % 1024;
+        let mut last_chunk: [T; 1024] = [T::zero(); 1024];
+        last_chunk[..last_chunk_size].fill(constant);
+        let tail_start = output.len();
+        unsafe {
+            output.set_len(tail_start + packed_len);
+            BitPacking::unchecked_pack(w, &last_chunk, &mut output[tail_start..][..packed_len]);
+        }
+    }
+
+    output.freeze()
+}
+
+/// Compute the `bit_width` output words that every FastLanes lane produces when packing
+/// `T = 8 * size_of::<T>()` copies of `constant`.
+///
+/// For constant input, each lane packs a periodic bit-stream of period `bit_width` made
+/// of the low `bit_width` bits of `constant`. Output word `k` contains bits
+/// `[k * T, (k + 1) * T)` of that stream, so its `j`-th bit equals bit
+/// `(k * T + j) mod bit_width` of `constant`.
+fn constant_lane_words<T: NativePType + BitPacking>(constant: T, bit_width: usize) -> Vec<T> {
+    let t_bits = 8 * size_of::<T>();
+    let mask = if bit_width == t_bits {
+        !T::zero()
+    } else {
+        (T::one() << bit_width) - T::one()
+    };
+    let s = constant & mask;
+    (0..bit_width)
+        .map(|k| {
+            let mut word = T::zero();
+            for j in 0..t_bits {
+                let bit_in_s = (k * t_bits + j) % bit_width;
+                let bit = (s >> bit_in_s) & T::one();
+                word = word | (bit << j);
+            }
+            word
+        })
+        .collect()
+}
+
+/// Encode a length-`len` array of `constant` values as a [`BitPackedArray`] without
+/// running the standard encode pipeline.
+///
+/// Returns an error if `constant` does not fit in `bit_width`, or if `bit_width` is too
+/// large for `T`.
+pub fn bitpack_encode_constant<T: NativePType + BitPacking + num_traits::ToPrimitive>(
+    constant: T,
+    bit_width: u8,
+    len: usize,
+    validity: Validity,
+) -> VortexResult<BitPackedArray> {
+    if bit_width as usize >= T::PTYPE.bit_width() {
+        vortex_bail!(
+            InvalidArgument: "Cannot pack - specified bit width {bit_width} >= {}",
+            T::PTYPE.bit_width()
+        );
+    }
+    let c = constant
+        .to_i128()
+        .ok_or_else(|| vortex_error::vortex_err!("cannot cast constant to i128"))?;
+    if c < 0 || c > (1i128 << bit_width) - 1 {
+        vortex_bail!(
+            InvalidArgument: "constant {c} does not fit in bit_width {bit_width}"
+        );
+    }
+
+    let packed = bitpack_constant(constant, bit_width, len).into_byte_buffer();
+    BitPacked::try_new(
+        BufferHandle::new_host(packed),
+        T::PTYPE,
+        validity,
+        None,
+        bit_width,
+        len,
+        0,
+    )
+}
+
 pub fn gather_patches(
     parray: &PrimitiveArray,
     bit_width: u8,
@@ -650,4 +787,36 @@ mod test {
         assert_arrays_eq!(chunk_offsets, PrimitiveArray::from_iter([0u64]));
         Ok(())
     }
+
+    #[rstest::rstest]
+    #[case::aligned_1024(1024u32, 7, 5)]
+    #[case::aligned_multi(8192u32, 7, 5)]
+    #[case::partial_tail(2050u32, 7, 5)]
+    #[case::small(13u32, 5, 17)]
+    #[case::large_bitwidth(1_000_000u32, 18, 200_000)]
+    fn bitpack_constant_matches_full_encode(
+        #[case] len: u32,
+        #[case] bit_width: u8,
+        #[case] constant: u32,
+    ) -> VortexResult<()> {
+        let mut ctx = SESSION.create_execution_ctx();
+        let input = PrimitiveArray::from_iter(std::iter::repeat_n(constant, len as usize));
+
+        let slow = bitpack_encode(&input, bit_width, None, &mut ctx)?;
+        let fast = bitpack_encode_constant::<u32>(
+            constant,
+            bit_width,
+            len as usize,
+            Validity::NonNullable,
+        )?;
+
+        let slow_packed = slow.packed().clone().unwrap_host();
+        let fast_packed = fast.packed().clone().unwrap_host();
+        assert_eq!(slow_packed.as_slice(), fast_packed.as_slice());
+
+        // Unpack fast result and verify roundtrip.
+        let unpacked = fast.into_array().execute::<PrimitiveArray>(&mut ctx)?;
+        assert_arrays_eq!(unpacked, input);
+        Ok(())
+    }
 }
diff --git a/encodings/fastlanes/src/bitpacking/array/mod.rs b/encodings/fastlanes/src/bitpacking/array/mod.rs
index e5c64252fbc..b109d53d64c 100644
--- a/encodings/fastlanes/src/bitpacking/array/mod.rs
+++ b/encodings/fastlanes/src/bitpacking/array/mod.rs
@@ -273,6 +273,29 @@ impl BitPackedData {
     pub fn max_packed_value(&self) -> usize {
         (1 << self.bit_width()) - 1
     }
+
+    /// Test whether `value` can be represented as a packed lane in this array, i.e. whether
+    /// it falls in the range `[0, 2^bit_width - 1]`.
+    ///
+    /// This is an `O(1)` check that never inspects the packed buffer and is strictly cheaper
+    /// than encoding `value` into the bit-packed representation. It is the building block for
+    /// fast comparison kernels that can short-circuit when the constant cannot match any
+    /// packed lane.
+    ///
+    /// Returns `None` if `value` cannot be losslessly converted to `i128` (which never
+    /// happens for the integer types supported by bit-packing).
+    #[inline]
+    pub fn value_fits_bit_width<T: NativePType + num_traits::ToPrimitive>(
+        &self,
+        value: T,
+    ) -> Option<bool> {
+        let v = value.to_i128()?;
+        if v < 0 {
+            return Some(false);
+        }
+        let max = (1i128 << self.bit_width()) - 1;
+        Some(v <= max)
+    }
 }
 
 pub trait BitPackedArrayExt: BitPackedArraySlotsExt {
diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs
index 900b7bd9f60..ef29f0b4608 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs
@@ -1,20 +1,28 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-//! Fast-path `Eq` / `NotEq` comparison against a constant.
+//! Fast-path comparison against a constant for bit-packed arrays.
 //!
-//! When the constant cannot fit in the packable range `[0, 2^bit_width - 1]`, no value
-//! stored in the packed buffer can equal it, so:
+//! A bit-packed lane holds values in `[0, 2^bit_width - 1]`. When the RHS constant sits
+//! outside that range, every packed lane has the same `Ordering` relative to `c`:
 //!
-//! * `Eq`    → every position is `false` (modulo patches/validity).
-//! * `NotEq` → every position is `true`  (modulo patches/validity).
+//! * `c > 2^bit_width - 1` (above range) → every packed lane is `< c`
+//! * `c < 0` (below range) → every packed lane is `> c` (packed values are non-negative)
 //!
-//! Detecting this is an `O(1)` range check on the constant — strictly cheaper than
-//! encoding `c` into the bit-packed representation. The check is layout-agnostic and
-//! does not touch the packed buffer.
+//! That collapses each of the six comparison operators to a constant boolean (modulo
+//! patches and validity), so the result is either a `ConstantArray<bool>` (`O(1)`) or a
+//! `BitBuffer` filled with that constant and overlaid with per-position results at any
+//! patched indices.
 //!
-//! In-range constants and ordering operators (`Lt`/`Lte`/`Gt`/`Gte`) currently fall
-//! through to the canonical decompress + Arrow compare path.
+//! Detecting whether the constant falls in the packable range is an `O(1)` `i128` check
+//! on the constant alone — strictly cheaper than encoding `c` into the bit-packed
+//! representation, and layout-agnostic.
+//!
+//! **In-range constants** (those that could match a packed lane) fall through to the
+//! canonical decompress + Arrow compare path. See `docs/inrange_compare_plan.md` for the
+//! plan to accelerate that case for ordering operators.
+
+use std::cmp::Ordering;
 
 use num_traits::ToPrimitive;
 use vortex_array::ArrayRef;
@@ -47,13 +55,6 @@ impl CompareKernel for BitPacked {
         operator: CompareOperator,
         ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<ArrayRef>> {
-        // Only `Eq` / `NotEq` are accelerated here. Ordering operators (`Lt`, `Lte`, `Gt`,
-        // `Gte`) need either a SWAR less-than over the packed bytes or unpack-then-compare;
-        // both are out of scope for this commit and fall through to the canonical path.
-        if !matches!(operator, CompareOperator::Eq | CompareOperator::NotEq) {
-            return Ok(None);
-        }
-
         let Some(constant) = rhs.as_constant() else {
             return Ok(None);
         };
@@ -62,7 +63,7 @@ impl CompareKernel for BitPacked {
         };
 
         match_each_integer_ptype!(constant.ptype(), |T| {
-            compare_eq_constant::<T>(
+            compare_constant::<T>(
                 lhs,
                 constant
                     .typed_value::<T>()
@@ -75,22 +76,45 @@ impl CompareKernel for BitPacked {
     }
 }
 
-/// Returns `true` if `constant` cannot fit in the packable range `[0, 2^bit_width - 1]`.
+/// Ordering of every packed lane vs `constant` when `constant` is outside the packable
+/// range. Returns `None` when `constant` itself fits in the range (no fast path applies).
 ///
 /// `O(1)` check on the constant; never inspects the packed buffer.
 #[inline]
-fn constant_out_of_packable_range<T>(constant: T, bit_width: u8) -> bool
+fn constant_relation_to_packed<T>(constant: T, bit_width: u8) -> Option<Ordering>
 where
     T: NativePType + ToPrimitive,
 {
-    let Some(c) = constant.to_i128() else {
-        return false;
-    };
+    let c = constant.to_i128()?;
+    if c < 0 {
+        return Some(Ordering::Greater);
+    }
     let max = (1i128 << bit_width) - 1;
-    c < 0 || c > max
+    if c > max {
+        return Some(Ordering::Less);
+    }
+    None
 }
 
-fn compare_eq_constant<T>(
+/// Reduce `lane op constant` to a constant boolean when every packed lane has the same
+/// ordering relation to `constant`.
+#[inline]
+fn reduce_constant(relation: Ordering, operator: CompareOperator) -> bool {
+    match (operator, relation) {
+        (CompareOperator::Eq, _) => false,
+        (CompareOperator::NotEq, _) => true,
+        (CompareOperator::Lt, Ordering::Less) => true,
+        (CompareOperator::Lt, _) => false,
+        (CompareOperator::Lte, Ordering::Less | Ordering::Equal) => true,
+        (CompareOperator::Lte, _) => false,
+        (CompareOperator::Gt, Ordering::Greater) => true,
+        (CompareOperator::Gt, _) => false,
+        (CompareOperator::Gte, Ordering::Greater | Ordering::Equal) => true,
+        (CompareOperator::Gte, _) => false,
+    }
+}
+
+fn compare_constant<T>(
     lhs: ArrayView<'_, BitPacked>,
     constant: T,
     rhs_nullability: Nullability,
@@ -100,22 +124,19 @@ fn compare_eq_constant<T>(
 where
     T: NativePType + ToPrimitive,
 {
-    if !constant_out_of_packable_range(constant, lhs.bit_width()) {
-        // Constant fits in the packable range, so at least some packed lanes could match
-        // it. The fast path doesn't apply.
+    let Some(relation) = constant_relation_to_packed(constant, lhs.bit_width()) else {
+        // In-range constants currently fall through to the canonical path. See
+        // `docs/inrange_compare_plan.md` for the plan to accelerate Lt/Lte/Gt/Gte here.
         return Ok(None);
-    }
+    };
 
-    // Every packed lane disagrees with `constant`. `Eq` is `false` everywhere, `NotEq` is
-    // `true` everywhere — modulo patches (which carry the real value) and validity.
-    let packed_lane_result = matches!(operator, CompareOperator::NotEq);
+    let packed_lane_result = reduce_constant(relation, operator);
     let len = lhs.len();
     let validity = lhs.validity()?;
     let patches = lhs.patches();
     let result_nullability = lhs.dtype().nullability() | rhs_nullability;
 
-    // Hot path: no patches, no nulls — every position has the same boolean result, so we
-    // return a `ConstantArray<bool>` in `O(1)`.
+    // Hot path: no patches, no nulls — every position has the same boolean result.
     if patches.is_none() && validity.no_nulls() {
         return Ok(Some(
             ConstantArray::new(Scalar::bool(packed_lane_result, result_nullability), len)
@@ -131,7 +152,7 @@ where
         let patches_offset = patches.offset();
 
         match_each_unsigned_integer_ptype!(indices.ptype(), |I| {
-            apply_eq_patches::<T, I>(
+            apply_patches::<T, I>(
                 &mut bits,
                 indices.as_slice::<I>(),
                 values.as_slice::<T>(),
@@ -146,7 +167,7 @@ where
     Ok(Some(BoolArray::new(bits.freeze(), validity).into_array()))
 }
 
-fn apply_eq_patches<T, I>(
+fn apply_patches<T, I>(
     bits: &mut BitBufferMut,
     indices: &[I],
     values: &[T],
@@ -157,11 +178,13 @@ fn apply_eq_patches<T, I>(
     T: NativePType,
     I: IntegerPType,
 {
-    // Only Eq/NotEq reach this point (see `CompareKernel::compare`).
     let cmp: fn(T, T) -> bool = match operator {
         CompareOperator::Eq => |l, r| NativeValue(l) == NativeValue(r),
         CompareOperator::NotEq => |l, r| NativeValue(l) != NativeValue(r),
-        _ => unreachable!("only Eq/NotEq reach the bitpacked compare-constant fast path"),
+        CompareOperator::Lt => |l, r| NativeValue(l) < NativeValue(r),
+        CompareOperator::Lte => |l, r| NativeValue(l) <= NativeValue(r),
+        CompareOperator::Gt => |l, r| NativeValue(l) > NativeValue(r),
+        CompareOperator::Gte => |l, r| NativeValue(l) >= NativeValue(r),
     };
 
     let len = bits.len();
@@ -184,6 +207,7 @@ fn apply_eq_patches<T, I>(
 
 #[cfg(test)]
 mod tests {
+    use std::cmp::Ordering;
     use std::sync::LazyLock;
 
     use rstest::rstest;
@@ -205,26 +229,36 @@ mod tests {
 
     use crate::BitPackedArrayExt;
     use crate::BitPackedData;
-    use crate::bitpacking::compute::compare::constant_out_of_packable_range;
+    use crate::bitpacking::compute::compare::constant_relation_to_packed;
 
     static SESSION: LazyLock<VortexSession> =
         LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
 
     #[test]
     fn range_check_is_o1() {
-        // 8-bit packable range is [0, 255].
-        assert!(constant_out_of_packable_range::<i32>(256, 8));
-        assert!(constant_out_of_packable_range::<i32>(-1, 8));
-        assert!(!constant_out_of_packable_range::<i32>(255, 8));
-        assert!(!constant_out_of_packable_range::<i32>(0, 8));
+        // For an 8-bit packable range of [0, 255]:
+        assert_eq!(
+            constant_relation_to_packed::<i32>(256, 8),
+            Some(Ordering::Less)
+        );
+        assert_eq!(
+            constant_relation_to_packed::<i32>(-1, 8),
+            Some(Ordering::Greater)
+        );
+        assert_eq!(constant_relation_to_packed::<i32>(255, 8), None);
+        assert_eq!(constant_relation_to_packed::<i32>(0, 8), None);
     }
 
     #[rstest]
     #[case(Operator::Eq, false)]
     #[case(Operator::NotEq, true)]
-    fn eq_above_range_no_patches(#[case] op: Operator, #[case] expected: bool) -> VortexResult<()> {
+    #[case(Operator::Lt, true)]
+    #[case(Operator::Lte, true)]
+    #[case(Operator::Gt, false)]
+    #[case(Operator::Gte, false)]
+    fn above_range_no_patches(#[case] op: Operator, #[case] expected: bool) -> VortexResult<()> {
         let mut ctx = SESSION.create_execution_ctx();
-        // 999 is above the 8-bit packable range; no packed lane matches.
+        // 999 is above the 8-bit packable range; every packed lane is < 999.
         let packed = BitPackedData::encode(
             &PrimitiveArray::from_iter([1u32, 2, 3, 250, 100]).into_array(),
             8,
@@ -240,8 +274,8 @@ mod tests {
 
     #[rstest]
     #[case(Operator::Eq)]
-    #[case(Operator::NotEq)]
-    fn eq_above_range_with_patches(#[case] op: Operator) -> VortexResult<()> {
+    #[case(Operator::Lt)]
+    fn above_range_with_patches(#[case] op: Operator) -> VortexResult<()> {
         // bit_width=4 packable range is [0, 15]; out-of-range values become patches.
         let mut ctx = SESSION.create_execution_ctx();
         let values = buffer![1u32, 5, 1000, 7, 1000, 14];
@@ -255,40 +289,39 @@ mod tests {
             .binary(ConstantArray::new(constant, values.len()).into_array(), op)?
             .execute::<BoolArray>(&mut ctx)?;
 
-        let expected: Vec<bool> = values
-            .iter()
-            .map(|v| match op {
-                Operator::Eq => *v == constant,
-                Operator::NotEq => *v != constant,
-                _ => unreachable!(),
-            })
-            .collect();
-        assert_arrays_eq!(result, BoolArray::from_iter(expected));
+        let cmp: fn(u32, u32) -> bool = match op {
+            Operator::Eq => |l, r| l == r,
+            Operator::Lt => |l, r| l < r,
+            _ => unreachable!(),
+        };
+        assert_arrays_eq!(
+            result,
+            BoolArray::from_iter(values.iter().map(|v| cmp(*v, constant)))
+        );
         Ok(())
     }
 
     #[test]
-    fn ordering_falls_through() -> VortexResult<()> {
-        // Ordering ops aren't accelerated yet; they go through the canonical path and
-        // must still return a correct answer.
+    fn below_range_signed() -> VortexResult<()> {
+        // Packed signed values are non-negative, so -5 is always less than every lane.
         let mut ctx = SESSION.create_execution_ctx();
-        let values = [1u32, 2, 3, 250, 100];
-        let packed =
-            BitPackedData::encode(&PrimitiveArray::from_iter(values).into_array(), 8, &mut ctx)?;
+        let packed = BitPackedData::encode(
+            &PrimitiveArray::from_iter([0i32, 7, 15, 3, 12]).into_array(),
+            4,
+            &mut ctx,
+        )?;
+        let len = packed.len();
         let result = packed
             .into_array()
-            .binary(ConstantArray::new(999u32, 5).into_array(), Operator::Lt)?
+            .binary(ConstantArray::new(-5i32, len).into_array(), Operator::Gt)?
             .execute::<BoolArray>(&mut ctx)?;
-        assert_arrays_eq!(
-            result,
-            BoolArray::from_iter(values.iter().map(|v| *v < 999))
-        );
+        assert_arrays_eq!(result, BoolArray::from_iter([true; 5]));
         Ok(())
     }
 
     #[test]
-    fn eq_in_range_falls_through() -> VortexResult<()> {
-        // In-range constants must defer to the canonical path.
+    fn in_range_falls_through() -> VortexResult<()> {
+        // 100 is in the 8-bit packable range; fall through to the canonical path.
         let mut ctx = SESSION.create_execution_ctx();
         let values = [1u32, 2, 3, 250, 100];
         let packed =
@@ -305,7 +338,7 @@ mod tests {
     }
 
     #[test]
-    fn eq_nullable_constant() -> VortexResult<()> {
+    fn nullable_constant() -> VortexResult<()> {
         let mut ctx = SESSION.create_execution_ctx();
         let packed = BitPackedData::encode(
             &PrimitiveArray::from_iter([1u32, 2, 3]).into_array(),