From faccc95a0231001c4877320073e0ebe8ea2902f6 Mon Sep 17 00:00:00 2001 From: Matt Katz Date: Wed, 20 May 2026 15:52:43 -0400 Subject: [PATCH 01/13] remove rebuild from take Signed-off-by: Matthew Katz --- .../src/arrays/listview/compute/take.rs | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/vortex-array/src/arrays/listview/compute/take.rs b/vortex-array/src/arrays/listview/compute/take.rs index 04e404a846e..a7715d927bf 100644 --- a/vortex-array/src/arrays/listview/compute/take.rs +++ b/vortex-array/src/arrays/listview/compute/take.rs @@ -4,7 +4,6 @@ use num_traits::Zero; use vortex_error::VortexResult; -use super::REBUILD_DENSITY_THRESHOLD; use crate::ArrayRef; use crate::ExecutionCtx; use crate::IntoArray; @@ -14,7 +13,6 @@ use crate::arrays::ListViewArray; use crate::arrays::dict::TakeExecute; use crate::arrays::dict::TakeReduce; use crate::arrays::listview::ListViewArrayExt; -use crate::arrays::listview::ListViewRebuildMode; use crate::builtins::ArrayBuiltins; use crate::dtype::Nullability; use crate::match_each_integer_ptype; @@ -23,14 +21,6 @@ use crate::scalar::Scalar; /// Metadata-only take for [`ListViewArray`]. impl TakeReduce for ListView { fn take(array: ArrayView<'_, ListView>, indices: &ArrayRef) -> VortexResult> { - // Approximate element density by the fraction of list rows retained. Assumes roughly - // uniform list sizes; good enough to decide whether dragging along the full `elements` - // buffer is worth avoiding a rebuild. - let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32; - if kept_row_fraction < REBUILD_DENSITY_THRESHOLD { - return Ok(None); - } - Ok(Some(apply_take(array, indices)?.into_array())) } } @@ -45,21 +35,7 @@ impl TakeExecute for ListView { indices: &ArrayRef, _ctx: &mut ExecutionCtx, ) -> VortexResult> { - let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32; - let taken = apply_take(array, indices)?; - - if kept_row_fraction < REBUILD_DENSITY_THRESHOLD { - // TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter` - // compute functions have run, at the "top" of the operator tree. However, we cannot do - // this right now, so we will just rebuild every time (similar to `ListArray`). - Ok(Some( - taken - .rebuild(ListViewRebuildMode::MakeZeroCopyToList)? - .into_array(), - )) - } else { - Ok(Some(taken.into_array())) - } + Ok(Some(apply_take(array, indices)?.into_array())) } } From c8b117fafff5ad5a6335b9e277a8b96e6603c7d0 Mon Sep 17 00:00:00 2001 From: Matt Katz Date: Wed, 20 May 2026 16:04:14 -0400 Subject: [PATCH 02/13] remove from filter Signed-off-by: Matthew Katz --- vortex-array/src/arrays/filter/execute/listview.rs | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/vortex-array/src/arrays/filter/execute/listview.rs b/vortex-array/src/arrays/filter/execute/listview.rs index a9849383ec4..6f19cfc6bc5 100644 --- a/vortex-array/src/arrays/filter/execute/listview.rs +++ b/vortex-array/src/arrays/filter/execute/listview.rs @@ -55,18 +55,7 @@ pub fn filter_listview(array: &ListViewArray, selection_mask: &Arc) // - Offsets and sizes are derived from existing valid child arrays. // - Offsets and sizes have the same length (both filtered by `selection_mask`). // - Validity matches the filtered array's nullability. - let new_array = unsafe { - ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity) - }; - - let kept_row_fraction = selection_mask.true_count() as f32 / array.sizes().len() as f32; - if kept_row_fraction < listview::compute::REBUILD_DENSITY_THRESHOLD { - new_array - .rebuild(ListViewRebuildMode::MakeZeroCopyToList) - .vortex_expect("ListViewArray rebuild to zero-copy List should always succeed") - } else { - new_array - } + unsafe { ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity) } } #[cfg(test)] From 54df45ae2d92a9dc836aaaed4d3b129b3f683ff2 Mon Sep 17 00:00:00 2001 From: Matt Katz Date: Wed, 20 May 2026 17:22:25 -0400 Subject: [PATCH 03/13] add density calculation and estimation Signed-off-by: Matthew Katz --- vortex-array/src/arrays/listview/array.rs | 92 +++++++++++++++++++++ vortex-array/src/arrays/listview/rebuild.rs | 11 +++ 2 files changed, 103 insertions(+) diff --git a/vortex-array/src/arrays/listview/array.rs b/vortex-array/src/arrays/listview/array.rs index 5c9350ed072..d44c390dc85 100644 --- a/vortex-array/src/arrays/listview/array.rs +++ b/vortex-array/src/arrays/listview/array.rs @@ -7,11 +7,13 @@ use std::sync::Arc; use num_traits::AsPrimitive; use smallvec::smallvec; +use vortex_buffer::BitBufferMut; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; use vortex_error::vortex_err; +use vortex_mask::Mask; use crate::ArrayRef; use crate::ArraySlots; @@ -19,6 +21,7 @@ use crate::LEGACY_SESSION; #[expect(deprecated)] use crate::ToCanonical as _; use crate::VortexSessionExecute; +use crate::aggregate_fn::fns::sum::sum; use crate::array::Array; use crate::array::ArrayParts; use crate::array::TypedArrayRef; @@ -30,6 +33,9 @@ use crate::arrays::PrimitiveArray; use crate::arrays::bool; use crate::dtype::DType; use crate::dtype::IntegerPType; +use crate::expr::stats::Precision; +use crate::expr::stats::Stat; +use crate::expr::stats::StatsProvider; use crate::match_each_integer_ptype; use crate::validity::Validity; @@ -396,6 +402,92 @@ pub trait ListViewArrayExt: TypedArrayRef { let sizes_primitive = self.sizes().to_primitive(); validate_zctl(self.elements(), offsets_primitive, sizes_primitive).is_ok() } + + /// Returns a [`Mask`] of length `elements.len()` where each bit is set iff that + /// position in `elements` is referenced by at least one view. + /// + /// Walks every `(offset, size)` pair, canonicalizes both `offsets` and `sizes`, + /// and allocates a `BitBuffer` of length `elements.len()`, so it is extremely costly. + /// + /// Returns `None` when `elements` is empty. + fn compute_referenced_elements_mask(&self) -> Option { + let len = self.elements().len(); + if len == 0 { + return None; + } + + let offsets_dtype = self.offsets().dtype(); + let sizes_dtype = self.sizes().dtype(); + + #[expect(deprecated)] + let offsets_primitive = self.offsets().to_primitive(); + #[expect(deprecated)] + let sizes_primitive = self.sizes().to_primitive(); + + let mut buf = BitBufferMut::new_unset(len); + let offset_len = self.as_ref().len(); + + match_each_integer_ptype!(offsets_dtype.as_ptype(), |O| { + match_each_integer_ptype!(sizes_dtype.as_ptype(), |S| { + let offsets_slice = offsets_primitive.as_slice::(); + let sizes_slice = sizes_primitive.as_slice::(); + + (0..offset_len).for_each(|i| { + let start = offsets_slice[i] as usize; + let size = sizes_slice[i] as usize; + buf.fill_range(start, start + size, true); + }); + }) + }); + + Some(Mask::from_buffer(buf.freeze())) + } + + /// Exact fraction of `elements` referenced by some view, in `[0.0, 1.0]`. Extremely costly. + /// + /// Returns `None` when `elements` is empty. + fn compute_density(&self) -> Option { + self.compute_referenced_elements_mask() + .map(|mask| match mask { + Mask::AllTrue(_) => 1.0, + Mask::AllFalse(_) => 0.0, + Mask::Values(values) => values.true_count() as f32 / self.elements().len() as f32, + }) + } + + /// Upper-bound estimate of [`compute_density`](Self::compute_density) via + /// `sum(sizes) / elements.len()`, clamped to `[0.0, 1.0]`. + /// + /// Exact for non-overlapping views, but overcounts when multiple views share the same elements. + /// + /// Returns `Ok(None)` when `elements` is empty + fn estimate_density(&self) -> VortexResult> { + let n_elts = self.elements().len(); + if n_elts == 0 { + return Ok(None); + } + + let sizes = self.sizes(); + if sizes.is_empty() { + return Ok(Some(0.0)); + } + + // Try to fetch the cached sum stat, otherwise fall back to calculating it on the spot + let sizes_sum = if let Some(Precision::Exact(scalar)) = sizes.statistics().get(Stat::Sum) + && let Some(sum) = scalar.as_primitive().as_::() + { + sum + } else { + sum(sizes, &mut LEGACY_SESSION.create_execution_ctx())? + .as_primitive() + .as_::() + .unwrap() + }; + + let estimate = (sizes_sum as f32 / n_elts as f32).clamp(0.0, 1.0); + + Ok(Some(estimate)) + } } impl> ListViewArrayExt for T {} diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 416d73bb92a..68774b3a56a 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -15,6 +15,7 @@ use crate::aggregate_fn::fns::min_max::min_max; use crate::arrays::ConstantArray; use crate::arrays::ListViewArray; use crate::arrays::listview::ListViewArrayExt; +use crate::arrays::listview::compute::REBUILD_DENSITY_THRESHOLD; use crate::builders::builder_with_capacity; use crate::builtins::ArrayBuiltins; use crate::dtype::DType; @@ -376,6 +377,16 @@ impl ListViewArray { self.rebuild_zero_copy_to_list() } } + + fn should_rebuild(&self, exact: bool) -> bool { + let density = if exact { + self.compute_density() + } else { + self.estimate_density().ok().flatten() + }; + + density.unwrap_or(1.0) < REBUILD_DENSITY_THRESHOLD + } } #[cfg(test)] From 3196d0f260452a8d6883558c118135251a263c70 Mon Sep 17 00:00:00 2001 From: Matt Katz Date: Thu, 21 May 2026 09:53:39 -0400 Subject: [PATCH 04/13] add tests and remove from filter Signed-off-by: Matthew Katz --- .../src/arrays/filter/execute/listview.rs | 2 - .../src/arrays/listview/tests/density.rs | 142 ++++++++++++++++++ vortex-array/src/arrays/listview/tests/mod.rs | 1 + 3 files changed, 143 insertions(+), 2 deletions(-) create mode 100644 vortex-array/src/arrays/listview/tests/density.rs diff --git a/vortex-array/src/arrays/filter/execute/listview.rs b/vortex-array/src/arrays/filter/execute/listview.rs index 6f19cfc6bc5..b3f787133ab 100644 --- a/vortex-array/src/arrays/filter/execute/listview.rs +++ b/vortex-array/src/arrays/filter/execute/listview.rs @@ -9,9 +9,7 @@ use vortex_mask::MaskValues; use crate::arrays::ListViewArray; use crate::arrays::filter::execute::filter_validity; -use crate::arrays::listview; use crate::arrays::listview::ListViewArrayExt; -use crate::arrays::listview::ListViewRebuildMode; /// [`ListViewArray`] filter implementation. /// diff --git a/vortex-array/src/arrays/listview/tests/density.rs b/vortex-array/src/arrays/listview/tests/density.rs new file mode 100644 index 00000000000..9708eac81d8 --- /dev/null +++ b/vortex-array/src/arrays/listview/tests/density.rs @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Tests for `compute_referenced_elements_mask`, `compute_density`, and +//! `estimate_density` on `ListViewArray`. + +use vortex_buffer::buffer; +use vortex_error::VortexResult; +use vortex_mask::Mask; + +use super::common::create_basic_listview; +use super::common::create_empty_lists_listview; +use super::common::create_large_listview; +use super::common::create_overlapping_listview; +use crate::IntoArray; +use crate::arrays::ListViewArray; +use crate::arrays::PrimitiveArray; +use crate::arrays::listview::ListViewArrayExt; +use crate::expr::stats::Precision; +use crate::expr::stats::Stat; +use crate::scalar::ScalarValue; +use crate::validity::Validity; + +const EPS: f32 = 1e-6; + +#[test] +fn full_density_no_overlap() -> VortexResult<()> { + let lv = create_basic_listview(); + let exact = lv.compute_density().expect("non-empty elements"); + let est = lv.estimate_density()?.expect("non-empty elements"); + + assert!((exact - 1.0).abs() < EPS, "exact density {exact}"); + assert!((est - 1.0).abs() < EPS, "estimate density {est}"); + Ok(()) +} + +#[test] +fn sparse_no_overlap_matches_exact() -> VortexResult<()> { + let lv = create_large_listview(); + let exact = lv.compute_density().expect("non-empty"); + let est = lv.estimate_density()?.expect("non-empty"); + + assert!((exact - 0.5).abs() < EPS, "exact density {exact}"); + assert!((est - 0.5).abs() < EPS, "estimate density {est}"); + Ok(()) +} + +#[test] +fn all_empty_lists_is_zero_density() -> VortexResult<()> { + let lv = create_empty_lists_listview(); + let exact = lv.compute_density().expect("elements has length 1"); + let est = lv.estimate_density()?.expect("elements has length 1"); + + assert_eq!(exact, 0.0); + assert_eq!(est, 0.0); + Ok(()) +} + +#[test] +fn overlap_full_coverage_clamps_estimate() -> VortexResult<()> { + let lv = create_overlapping_listview(); + let exact = lv.compute_density().expect("non-empty"); + let est = lv.estimate_density()?.expect("non-empty"); + + assert!((exact - 1.0).abs() < EPS, "exact density {exact}"); + assert!((est - 1.0).abs() < EPS, "estimate density {est}"); + Ok(()) +} + +#[test] +fn overlap_differential_exact_lower_than_estimate() -> VortexResult<()> { + // Two rows both pointing at elements[0..5) over a 20-element buffer. + // Unique referenced = 5 → exact = 0.25 + // sum(sizes) = 10 → estimate = 0.50 + let elements = PrimitiveArray::from_iter(0i32..20).into_array(); + let offsets = buffer![0u32, 0].into_array(); + let sizes = buffer![5u32, 5].into_array(); + let lv = ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable)?; + + let exact = lv.compute_density().expect("non-empty"); + let est = lv.estimate_density()?.expect("non-empty"); + + assert!((exact - 0.25).abs() < EPS, "exact density {exact}"); + assert!((est - 0.50).abs() < EPS, "estimate density {est}"); + assert!(est > exact, "estimate must overcount overlapping views"); + Ok(()) +} + +#[test] +fn empty_elements_returns_none() -> VortexResult<()> { + let elements = PrimitiveArray::from_iter::<[i32; 0]>([]).into_array(); + let offsets = buffer![0u32; 0].into_array(); + let sizes = buffer![0u32; 0].into_array(); + let lv = ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable)?; + + assert!(lv.compute_density().is_none()); + assert!(lv.estimate_density()?.is_none()); + Ok(()) +} + +#[test] +fn estimate_uses_cached_sum_stat() -> VortexResult<()> { + let lv = create_basic_listview(); + // Pre-populate Stat::Sum with a deliberately-wrong 5 so we can prove + // estimate_density reads from the cache instead of computing fresh. + lv.sizes() + .statistics() + .set(Stat::Sum, Precision::Exact(ScalarValue::from(5u64))); + + let est = lv.estimate_density()?.expect("non-empty"); + assert!( + (est - 0.5).abs() < EPS, + "estimate {est} should reflect cached Sum=5, not computed Sum=10", + ); + Ok(()) +} + +#[test] +fn referenced_mask_set_bits_match_views() -> VortexResult<()> { + // create_large_listview: 10 lists of size 50 at offsets [0,100,200,...,900]. + // Bits [i*100 .. i*100+50) set, the rest unset. + let lv = create_large_listview(); + let mask = lv + .compute_referenced_elements_mask() + .expect("non-empty elements"); + let bits = match mask { + Mask::Values(v) => v, + other => panic!("expected Values mask for partial coverage, got {other:?}"), + }; + + assert_eq!(bits.true_count(), 500); + // Spot-check the boundaries of the first and last views. + let bb = bits.bit_buffer(); + assert!(bb.value(0)); + assert!(bb.value(49)); + assert!(!bb.value(50)); + assert!(!bb.value(99)); + assert!(bb.value(100)); + assert!(bb.value(949)); + assert!(!bb.value(950)); + Ok(()) +} diff --git a/vortex-array/src/arrays/listview/tests/mod.rs b/vortex-array/src/arrays/listview/tests/mod.rs index dea78fd8a97..5e0c357282e 100644 --- a/vortex-array/src/arrays/listview/tests/mod.rs +++ b/vortex-array/src/arrays/listview/tests/mod.rs @@ -4,6 +4,7 @@ pub(super) mod common; mod basic; +mod density; mod filter; mod nested; mod nullability; From a2aa05f920786507703ab9cb33590dd085087967 Mon Sep 17 00:00:00 2001 From: Matthew Katz Date: Thu, 21 May 2026 10:42:24 -0400 Subject: [PATCH 05/13] fix tests Signed-off-by: Matthew Katz --- .../src/arrays/listview/tests/common.rs | 20 ++++++ .../src/arrays/listview/tests/density.rs | 63 +++++++------------ 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/vortex-array/src/arrays/listview/tests/common.rs b/vortex-array/src/arrays/listview/tests/common.rs index d8f25f522cf..dc8bc94fb18 100644 --- a/vortex-array/src/arrays/listview/tests/common.rs +++ b/vortex-array/src/arrays/listview/tests/common.rs @@ -22,6 +22,15 @@ pub fn create_basic_listview() -> ListViewArray { } } +/// Creates a sparse ListView with two overlap regions +/// `[[0,1,2], [1,2], [18, 19], [19]]` over 20 elements. +pub fn create_sparse_overlapping_listview() -> ListViewArray { + let elements = buffer![0i32..20].into_array(); + let offsets = buffer![0u32, 1, 18, 19].into_array(); + let sizes = buffer![3u32, 2, 2, 1].into_array(); + ListViewArray::new(elements, offsets, sizes, Validity::NonNullable) +} + /// Creates a nullable ListView: [[10,20], null, [50]] pub fn create_nullable_listview() -> ListViewArray { let elements = buffer![10i32, 20, 30, 40, 50].into_array(); @@ -45,6 +54,17 @@ pub fn create_empty_lists_listview() -> ListViewArray { } } +/// Creates a ListView with empty lists and elements: [[]] +pub fn create_empty_elements_listview() -> ListViewArray { + let elements = PrimitiveArray::from_iter::<[i32; 0]>([]).into_array(); + let offsets = buffer![0u32; 0].into_array(); + let sizes = buffer![0u32; 0].into_array(); + unsafe { + ListViewArray::new_unchecked(elements, offsets, sizes, Validity::NonNullable) + .with_zero_copy_to_list(true) + } +} + /// Creates a ListView with overlapping lists and out-of-order offsets /// Lists: [[5,6,7], [2,3], [8,9], [0,1], [1,2,3,4]] pub fn create_overlapping_listview() -> ListViewArray { diff --git a/vortex-array/src/arrays/listview/tests/density.rs b/vortex-array/src/arrays/listview/tests/density.rs index 9708eac81d8..ea1aa8d7a26 100644 --- a/vortex-array/src/arrays/listview/tests/density.rs +++ b/vortex-array/src/arrays/listview/tests/density.rs @@ -4,7 +4,6 @@ //! Tests for `compute_referenced_elements_mask`, `compute_density`, and //! `estimate_density` on `ListViewArray`. -use vortex_buffer::buffer; use vortex_error::VortexResult; use vortex_mask::Mask; @@ -12,14 +11,12 @@ use super::common::create_basic_listview; use super::common::create_empty_lists_listview; use super::common::create_large_listview; use super::common::create_overlapping_listview; -use crate::IntoArray; -use crate::arrays::ListViewArray; -use crate::arrays::PrimitiveArray; +use super::common::create_sparse_overlapping_listview; use crate::arrays::listview::ListViewArrayExt; +use crate::arrays::listview::tests::common::create_empty_elements_listview; use crate::expr::stats::Precision; use crate::expr::stats::Stat; use crate::scalar::ScalarValue; -use crate::validity::Validity; const EPS: f32 = 1e-6; @@ -29,8 +26,8 @@ fn full_density_no_overlap() -> VortexResult<()> { let exact = lv.compute_density().expect("non-empty elements"); let est = lv.estimate_density()?.expect("non-empty elements"); - assert!((exact - 1.0).abs() < EPS, "exact density {exact}"); - assert!((est - 1.0).abs() < EPS, "estimate density {est}"); + assert!((exact - 1.0).abs() < EPS); + assert!((est - 1.0).abs() < EPS); Ok(()) } @@ -40,8 +37,8 @@ fn sparse_no_overlap_matches_exact() -> VortexResult<()> { let exact = lv.compute_density().expect("non-empty"); let est = lv.estimate_density()?.expect("non-empty"); - assert!((exact - 0.5).abs() < EPS, "exact density {exact}"); - assert!((est - 0.5).abs() < EPS, "estimate density {est}"); + assert!((exact - 0.5).abs() < EPS); + assert!((est - 0.5).abs() < EPS); Ok(()) } @@ -62,36 +59,26 @@ fn overlap_full_coverage_clamps_estimate() -> VortexResult<()> { let exact = lv.compute_density().expect("non-empty"); let est = lv.estimate_density()?.expect("non-empty"); - assert!((exact - 1.0).abs() < EPS, "exact density {exact}"); - assert!((est - 1.0).abs() < EPS, "estimate density {est}"); + assert!((exact - 1.0).abs() < EPS); + assert!((est - 1.0).abs() < EPS); Ok(()) } #[test] fn overlap_differential_exact_lower_than_estimate() -> VortexResult<()> { - // Two rows both pointing at elements[0..5) over a 20-element buffer. - // Unique referenced = 5 → exact = 0.25 - // sum(sizes) = 10 → estimate = 0.50 - let elements = PrimitiveArray::from_iter(0i32..20).into_array(); - let offsets = buffer![0u32, 0].into_array(); - let sizes = buffer![5u32, 5].into_array(); - let lv = ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable)?; + let lv = create_sparse_overlapping_listview(); let exact = lv.compute_density().expect("non-empty"); let est = lv.estimate_density()?.expect("non-empty"); - assert!((exact - 0.25).abs() < EPS, "exact density {exact}"); - assert!((est - 0.50).abs() < EPS, "estimate density {est}"); - assert!(est > exact, "estimate must overcount overlapping views"); + assert!((exact - 0.25).abs() < EPS); + assert!((est - 0.40).abs() < EPS); Ok(()) } #[test] fn empty_elements_returns_none() -> VortexResult<()> { - let elements = PrimitiveArray::from_iter::<[i32; 0]>([]).into_array(); - let offsets = buffer![0u32; 0].into_array(); - let sizes = buffer![0u32; 0].into_array(); - let lv = ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable)?; + let lv = create_empty_elements_listview(); assert!(lv.compute_density().is_none()); assert!(lv.estimate_density()?.is_none()); @@ -108,35 +95,27 @@ fn estimate_uses_cached_sum_stat() -> VortexResult<()> { .set(Stat::Sum, Precision::Exact(ScalarValue::from(5u64))); let est = lv.estimate_density()?.expect("non-empty"); - assert!( - (est - 0.5).abs() < EPS, - "estimate {est} should reflect cached Sum=5, not computed Sum=10", - ); + assert!((est - 0.5).abs() < EPS); Ok(()) } #[test] fn referenced_mask_set_bits_match_views() -> VortexResult<()> { - // create_large_listview: 10 lists of size 50 at offsets [0,100,200,...,900]. - // Bits [i*100 .. i*100+50) set, the rest unset. - let lv = create_large_listview(); + let lv = create_sparse_overlapping_listview(); let mask = lv .compute_referenced_elements_mask() .expect("non-empty elements"); let bits = match mask { Mask::Values(v) => v, - other => panic!("expected Values mask for partial coverage, got {other:?}"), + _ => panic!("expected Values mask"), }; - assert_eq!(bits.true_count(), 500); - // Spot-check the boundaries of the first and last views. + assert_eq!(bits.true_count(), 5); let bb = bits.bit_buffer(); - assert!(bb.value(0)); - assert!(bb.value(49)); - assert!(!bb.value(50)); - assert!(!bb.value(99)); - assert!(bb.value(100)); - assert!(bb.value(949)); - assert!(!bb.value(950)); + for i in 0..3 { + assert!(bb.value(i)); + } + assert!(bb.value(18)); + assert!(bb.value(19)); Ok(()) } From 9164418cc80f2e555276a0f6096fb86e2fbcbaa3 Mon Sep 17 00:00:00 2001 From: Matthew Katz Date: Thu, 21 May 2026 10:56:21 -0400 Subject: [PATCH 06/13] fix lint Signed-off-by: Matthew Katz --- vortex-array/src/arrays/listview/array.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vortex-array/src/arrays/listview/array.rs b/vortex-array/src/arrays/listview/array.rs index d44c390dc85..c80e0fa1796 100644 --- a/vortex-array/src/arrays/listview/array.rs +++ b/vortex-array/src/arrays/listview/array.rs @@ -433,7 +433,9 @@ pub trait ListViewArrayExt: TypedArrayRef { let sizes_slice = sizes_primitive.as_slice::(); (0..offset_len).for_each(|i| { + #[allow(clippy::cast_possible_truncation)] let start = offsets_slice[i] as usize; + #[allow(clippy::cast_possible_truncation)] let size = sizes_slice[i] as usize; buf.fill_range(start, start + size, true); }); @@ -481,7 +483,7 @@ pub trait ListViewArrayExt: TypedArrayRef { sum(sizes, &mut LEGACY_SESSION.create_execution_ctx())? .as_primitive() .as_::() - .unwrap() + .expect("sum should cast to u64") }; let estimate = (sizes_sum as f32 / n_elts as f32).clamp(0.0, 1.0); From f4d3e7923431c0ba87f809d9bcba1e72aa34688e Mon Sep 17 00:00:00 2001 From: Matthew Katz Date: Thu, 21 May 2026 11:52:57 -0400 Subject: [PATCH 07/13] move const and fix lint Signed-off-by: Matthew Katz --- vortex-array/src/arrays/listview/array.rs | 2 +- vortex-array/src/arrays/listview/compute/mod.rs | 12 ------------ vortex-array/src/arrays/listview/rebuild.rs | 13 ++++++++++++- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/vortex-array/src/arrays/listview/array.rs b/vortex-array/src/arrays/listview/array.rs index c80e0fa1796..31f61d9bfa2 100644 --- a/vortex-array/src/arrays/listview/array.rs +++ b/vortex-array/src/arrays/listview/array.rs @@ -483,7 +483,7 @@ pub trait ListViewArrayExt: TypedArrayRef { sum(sizes, &mut LEGACY_SESSION.create_execution_ctx())? .as_primitive() .as_::() - .expect("sum should cast to u64") + .ok_or_else(|| vortex_err!("could not cast sum of sizes to u64"))? }; let estimate = (sizes_sum as f32 / n_elts as f32).clamp(0.0, 1.0); diff --git a/vortex-array/src/arrays/listview/compute/mod.rs b/vortex-array/src/arrays/listview/compute/mod.rs index 3ea82cafb33..9a43503c4b5 100644 --- a/vortex-array/src/arrays/listview/compute/mod.rs +++ b/vortex-array/src/arrays/listview/compute/mod.rs @@ -6,15 +6,3 @@ mod mask; pub(crate) mod rules; mod slice; mod take; - -/// The threshold below which we rebuild the elements of a listview. -/// -/// We don't touch `elements` on the metadata-only path since reorganizing it can be expensive. -/// However, we also don't want to drag around a large amount of garbage data when the selection -/// is sparse. Below this fraction of list rows retained, the rebuild is worth it. -/// Rebuilding is needed when exporting the ListView's elements. -/// -// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter` -// compute functions have run, at the "top" of the operator tree. However, we cannot do this -// right now, so we will just rebuild every time (similar to [`ListArray`]). -pub(crate) const REBUILD_DENSITY_THRESHOLD: f32 = 0.1; diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 68774b3a56a..3e54b7b57ab 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -15,7 +15,6 @@ use crate::aggregate_fn::fns::min_max::min_max; use crate::arrays::ConstantArray; use crate::arrays::ListViewArray; use crate::arrays::listview::ListViewArrayExt; -use crate::arrays::listview::compute::REBUILD_DENSITY_THRESHOLD; use crate::builders::builder_with_capacity; use crate::builtins::ArrayBuiltins; use crate::dtype::DType; @@ -26,6 +25,18 @@ use crate::match_each_integer_ptype; use crate::scalar::Scalar; use crate::scalar_fn::fns::operators::Operator; +/// The threshold below which we rebuild the elements of a listview. +/// +/// We don't touch `elements` on the metadata-only path since reorganizing it can be expensive. +/// However, we also don't want to drag around a large amount of garbage data when the selection +/// is sparse. Below this fraction of list rows retained, the rebuild is worth it. +/// Rebuilding is needed when exporting the ListView's elements. +/// +// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter` +// compute functions have run, at the "top" of the operator tree. However, we cannot do this +// right now, so we will just rebuild every time (similar to [`ListArray`]). +const REBUILD_DENSITY_THRESHOLD: f32 = 0.1; + /// Modes for rebuilding a [`ListViewArray`]. pub enum ListViewRebuildMode { /// Removes all unused data and flattens out all list data, such that the array is zero-copyable From 9b91eab6b70e84dcb39c26858f2985d472461284 Mon Sep 17 00:00:00 2001 From: Matthew Katz Date: Thu, 21 May 2026 11:58:02 -0400 Subject: [PATCH 08/13] update public api Signed-off-by: Matthew Katz --- vortex-array/public-api.lock | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index c5184c008cd..068c98b73d6 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -4446,8 +4446,14 @@ pub vortex_array::arrays::listview::ListViewDataParts::validity: vortex_array::v pub trait vortex_array::arrays::listview::ListViewArrayExt: vortex_array::TypedArrayRef +pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_density(&self) -> core::option::Option + +pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_referenced_elements_mask(&self) -> core::option::Option + pub fn vortex_array::arrays::listview::ListViewArrayExt::elements(&self) -> &vortex_array::ArrayRef +pub fn vortex_array::arrays::listview::ListViewArrayExt::estimate_density(&self) -> vortex_error::VortexResult> + pub fn vortex_array::arrays::listview::ListViewArrayExt::list_elements_at(&self, usize) -> vortex_error::VortexResult pub fn vortex_array::arrays::listview::ListViewArrayExt::listview_validity(&self) -> vortex_array::validity::Validity @@ -4466,8 +4472,14 @@ pub fn vortex_array::arrays::listview::ListViewArrayExt::verify_is_zero_copy_to_ impl> vortex_array::arrays::listview::ListViewArrayExt for T +pub fn T::compute_density(&self) -> core::option::Option + +pub fn T::compute_referenced_elements_mask(&self) -> core::option::Option + pub fn T::elements(&self) -> &vortex_array::ArrayRef +pub fn T::estimate_density(&self) -> vortex_error::VortexResult> + pub fn T::list_elements_at(&self, usize) -> vortex_error::VortexResult pub fn T::listview_validity(&self) -> vortex_array::validity::Validity From 52c39168125e03c366485243764803b1098519e3 Mon Sep 17 00:00:00 2001 From: Matthew Katz Date: Thu, 21 May 2026 12:22:35 -0400 Subject: [PATCH 09/13] add rebuild to duckdb exporter Signed-off-by: Matthew Katz --- vortex-array/public-api.lock | 2 ++ vortex-array/src/arrays/listview/rebuild.rs | 2 +- vortex-duckdb/src/exporter/list_view.rs | 10 +++++++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index 068c98b73d6..543211d6bf2 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -23966,6 +23966,8 @@ impl vortex_array::Array pub fn vortex_array::Array::rebuild(&self, vortex_array::arrays::listview::ListViewRebuildMode) -> vortex_error::VortexResult +pub fn vortex_array::Array::should_rebuild(&self, bool) -> bool + impl vortex_array::Array pub fn vortex_array::Array::try_new(vortex_array::ArrayRef, vortex_array::validity::Validity) -> vortex_error::VortexResult diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 3e54b7b57ab..a0b67c3186f 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -389,7 +389,7 @@ impl ListViewArray { } } - fn should_rebuild(&self, exact: bool) -> bool { + pub fn should_rebuild(&self, exact: bool) -> bool { let density = if exact { self.compute_density() } else { diff --git a/vortex-duckdb/src/exporter/list_view.rs b/vortex-duckdb/src/exporter/list_view.rs index 0a3c718b465..9ec07f33a28 100644 --- a/vortex-duckdb/src/exporter/list_view.rs +++ b/vortex-duckdb/src/exporter/list_view.rs @@ -11,6 +11,7 @@ use vortex::array::ExecutionCtx; use vortex::array::arrays::ListViewArray; use vortex::array::arrays::PrimitiveArray; use vortex::array::arrays::listview::ListViewDataParts; +use vortex::array::arrays::listview::ListViewRebuildMode; use vortex::array::match_each_integer_ptype; use vortex::array::validity::Validity; use vortex::dtype::IntegerPType; @@ -50,13 +51,20 @@ pub(crate) fn new_exporter( ctx: &mut ExecutionCtx, ) -> VortexResult> { let len = array.len(); + + let compact_array = if array.should_rebuild(false) { + array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)? + } else { + array + }; + let ListViewDataParts { elements_dtype, elements, offsets, sizes, validity, - } = array.into_data_parts(); + } = compact_array.into_data_parts(); // Cache an `elements` vector up front so that future exports can reference it. let num_elements = elements.len(); From 94cf0b562beb11354b41da98b5aa5263e2618e73 Mon Sep 17 00:00:00 2001 From: Matthew Katz Date: Thu, 21 May 2026 15:04:05 -0400 Subject: [PATCH 10/13] address comments Signed-off-by: Matthew Katz --- vortex-array/src/arrays/listview/array.rs | 72 +++++++++---------- .../src/arrays/listview/compute/take.rs | 3 - vortex-array/src/arrays/listview/rebuild.rs | 22 +++--- .../src/arrays/listview/tests/density.rs | 49 +++++++++---- vortex-array/src/stats/array.rs | 2 + vortex-duckdb/src/exporter/list_view.rs | 2 +- 6 files changed, 80 insertions(+), 70 deletions(-) diff --git a/vortex-array/src/arrays/listview/array.rs b/vortex-array/src/arrays/listview/array.rs index 31f61d9bfa2..5d27b29fd0c 100644 --- a/vortex-array/src/arrays/listview/array.rs +++ b/vortex-array/src/arrays/listview/array.rs @@ -17,11 +17,11 @@ use vortex_mask::Mask; use crate::ArrayRef; use crate::ArraySlots; +use crate::ExecutionCtx; use crate::LEGACY_SESSION; #[expect(deprecated)] use crate::ToCanonical as _; use crate::VortexSessionExecute; -use crate::aggregate_fn::fns::sum::sum; use crate::array::Array; use crate::array::ArrayParts; use crate::array::TypedArrayRef; @@ -33,9 +33,7 @@ use crate::arrays::PrimitiveArray; use crate::arrays::bool; use crate::dtype::DType; use crate::dtype::IntegerPType; -use crate::expr::stats::Precision; use crate::expr::stats::Stat; -use crate::expr::stats::StatsProvider; use crate::match_each_integer_ptype; use crate::validity::Validity; @@ -409,52 +407,51 @@ pub trait ListViewArrayExt: TypedArrayRef { /// Walks every `(offset, size)` pair, canonicalizes both `offsets` and `sizes`, /// and allocates a `BitBuffer` of length `elements.len()`, so it is extremely costly. /// - /// Returns `None` when `elements` is empty. - fn compute_referenced_elements_mask(&self) -> Option { + /// Returns `Ok(None)` when `elements` is empty. + fn compute_referenced_elements_mask( + &self, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { let len = self.elements().len(); if len == 0 { - return None; + return Ok(None); } - let offsets_dtype = self.offsets().dtype(); - let sizes_dtype = self.sizes().dtype(); - - #[expect(deprecated)] - let offsets_primitive = self.offsets().to_primitive(); - #[expect(deprecated)] - let sizes_primitive = self.sizes().to_primitive(); + let offsets_primitive = self.offsets().clone().execute::(ctx)?; + let sizes_primitive = self.sizes().clone().execute::(ctx)?; let mut buf = BitBufferMut::new_unset(len); let offset_len = self.as_ref().len(); - match_each_integer_ptype!(offsets_dtype.as_ptype(), |O| { - match_each_integer_ptype!(sizes_dtype.as_ptype(), |S| { + match_each_integer_ptype!(offsets_primitive.ptype(), |O| { + match_each_integer_ptype!(sizes_primitive.ptype(), |S| { let offsets_slice = offsets_primitive.as_slice::(); let sizes_slice = sizes_primitive.as_slice::(); - (0..offset_len).for_each(|i| { - #[allow(clippy::cast_possible_truncation)] - let start = offsets_slice[i] as usize; - #[allow(clippy::cast_possible_truncation)] - let size = sizes_slice[i] as usize; + for i in 0..offset_len { + let start = + usize::try_from(offsets_slice[i]).vortex_expect("offset must fit in usize"); + let size = + usize::try_from(sizes_slice[i]).vortex_expect("size must fit in usize"); buf.fill_range(start, start + size, true); - }); + } }) }); - Some(Mask::from_buffer(buf.freeze())) + Ok(Some(Mask::from_buffer(buf.freeze()))) } /// Exact fraction of `elements` referenced by some view, in `[0.0, 1.0]`. Extremely costly. /// - /// Returns `None` when `elements` is empty. - fn compute_density(&self) -> Option { - self.compute_referenced_elements_mask() + /// Returns `Ok(None)` when `elements` is empty. + fn compute_density(&self, ctx: &mut ExecutionCtx) -> VortexResult> { + Ok(self + .compute_referenced_elements_mask(ctx)? .map(|mask| match mask { Mask::AllTrue(_) => 1.0, Mask::AllFalse(_) => 0.0, Mask::Values(values) => values.true_count() as f32 / self.elements().len() as f32, - }) + })) } /// Upper-bound estimate of [`compute_density`](Self::compute_density) via @@ -462,8 +459,8 @@ pub trait ListViewArrayExt: TypedArrayRef { /// /// Exact for non-overlapping views, but overcounts when multiple views share the same elements. /// - /// Returns `Ok(None)` when `elements` is empty - fn estimate_density(&self) -> VortexResult> { + /// Returns `Ok(None)` when `elements` is empty. + fn estimate_density(&self, ctx: &mut ExecutionCtx) -> VortexResult> { let n_elts = self.elements().len(); if n_elts == 0 { return Ok(None); @@ -474,17 +471,14 @@ pub trait ListViewArrayExt: TypedArrayRef { return Ok(Some(0.0)); } - // Try to fetch the cached sum stat, otherwise fall back to calculating it on the spot - let sizes_sum = if let Some(Precision::Exact(scalar)) = sizes.statistics().get(Stat::Sum) - && let Some(sum) = scalar.as_primitive().as_::() - { - sum - } else { - sum(sizes, &mut LEGACY_SESSION.create_execution_ctx())? - .as_primitive() - .as_::() - .ok_or_else(|| vortex_err!("could not cast sum of sizes to u64"))? - }; + // compute_stat short-circuits on a cached exact Sum and otherwise computes-and-caches. + let sizes_sum = sizes + .statistics() + .compute_stat(Stat::Sum, ctx)? + .ok_or_else(|| vortex_err!("Sum stat unavailable for sizes"))? + .as_primitive() + .as_::() + .ok_or_else(|| vortex_err!("could not cast sum of sizes to u64"))?; let estimate = (sizes_sum as f32 / n_elts as f32).clamp(0.0, 1.0); diff --git a/vortex-array/src/arrays/listview/compute/take.rs b/vortex-array/src/arrays/listview/compute/take.rs index a7715d927bf..2b6c016d2c3 100644 --- a/vortex-array/src/arrays/listview/compute/take.rs +++ b/vortex-array/src/arrays/listview/compute/take.rs @@ -26,9 +26,6 @@ impl TakeReduce for ListView { } /// Execution-path take for [`ListViewArray`]. -/// -/// This does the same metadata-only take as [`TakeReduce`], but also rebuilds the array if the -/// resulting array will be less dense than `REBUILD_DENSITY_THRESHOLD`. impl TakeExecute for ListView { fn take( array: ArrayView<'_, ListView>, diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index a0b67c3186f..acd26e77881 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -6,6 +6,7 @@ use vortex_buffer::BufferMut; use vortex_error::VortexExpect; use vortex_error::VortexResult; +use crate::ExecutionCtx; use crate::IntoArray; use crate::LEGACY_SESSION; #[expect(deprecated)] @@ -25,16 +26,11 @@ use crate::match_each_integer_ptype; use crate::scalar::Scalar; use crate::scalar_fn::fns::operators::Operator; -/// The threshold below which we rebuild the elements of a listview. +/// Density threshold to decide whether to rebuild a sparse `ListViewArray`. /// -/// We don't touch `elements` on the metadata-only path since reorganizing it can be expensive. -/// However, we also don't want to drag around a large amount of garbage data when the selection -/// is sparse. Below this fraction of list rows retained, the rebuild is worth it. -/// Rebuilding is needed when exporting the ListView's elements. -/// -// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter` -// compute functions have run, at the "top" of the operator tree. However, we cannot do this -// right now, so we will just rebuild every time (similar to [`ListArray`]). +/// A `ListViewArray` can accumulate unreferenced bytes in its `elements` buffer after +/// metadata-only operations like `take` and `filter`. When density (referenced fraction of `elements`) +/// falls below this threshold, the benefits of a rebuild may outweigh its cost. const REBUILD_DENSITY_THRESHOLD: f32 = 0.1; /// Modes for rebuilding a [`ListViewArray`]. @@ -389,14 +385,14 @@ impl ListViewArray { } } - pub fn should_rebuild(&self, exact: bool) -> bool { + pub fn should_rebuild(&self, exact: bool, ctx: &mut ExecutionCtx) -> VortexResult { let density = if exact { - self.compute_density() + self.compute_density(ctx)? } else { - self.estimate_density().ok().flatten() + self.estimate_density(ctx)? }; - density.unwrap_or(1.0) < REBUILD_DENSITY_THRESHOLD + Ok(density.unwrap_or(1.0) < REBUILD_DENSITY_THRESHOLD) } } diff --git a/vortex-array/src/arrays/listview/tests/density.rs b/vortex-array/src/arrays/listview/tests/density.rs index ea1aa8d7a26..33be7040736 100644 --- a/vortex-array/src/arrays/listview/tests/density.rs +++ b/vortex-array/src/arrays/listview/tests/density.rs @@ -6,25 +6,35 @@ use vortex_error::VortexResult; use vortex_mask::Mask; +use vortex_session::VortexSession; use super::common::create_basic_listview; use super::common::create_empty_lists_listview; use super::common::create_large_listview; use super::common::create_overlapping_listview; use super::common::create_sparse_overlapping_listview; +use crate::ExecutionCtx; +use crate::VortexSessionExecute; use crate::arrays::listview::ListViewArrayExt; use crate::arrays::listview::tests::common::create_empty_elements_listview; use crate::expr::stats::Precision; use crate::expr::stats::Stat; use crate::scalar::ScalarValue; +use crate::session::ArraySession; const EPS: f32 = 1e-6; +fn test_execution_ctx() -> ExecutionCtx { + let session = VortexSession::empty().with::(); + session.create_execution_ctx() +} + #[test] fn full_density_no_overlap() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); let lv = create_basic_listview(); - let exact = lv.compute_density().expect("non-empty elements"); - let est = lv.estimate_density()?.expect("non-empty elements"); + let exact = lv.compute_density(&mut ctx)?.expect("non-empty elements"); + let est = lv.estimate_density(&mut ctx)?.expect("non-empty elements"); assert!((exact - 1.0).abs() < EPS); assert!((est - 1.0).abs() < EPS); @@ -33,9 +43,10 @@ fn full_density_no_overlap() -> VortexResult<()> { #[test] fn sparse_no_overlap_matches_exact() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); let lv = create_large_listview(); - let exact = lv.compute_density().expect("non-empty"); - let est = lv.estimate_density()?.expect("non-empty"); + let exact = lv.compute_density(&mut ctx)?.expect("non-empty"); + let est = lv.estimate_density(&mut ctx)?.expect("non-empty"); assert!((exact - 0.5).abs() < EPS); assert!((est - 0.5).abs() < EPS); @@ -44,9 +55,14 @@ fn sparse_no_overlap_matches_exact() -> VortexResult<()> { #[test] fn all_empty_lists_is_zero_density() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); let lv = create_empty_lists_listview(); - let exact = lv.compute_density().expect("elements has length 1"); - let est = lv.estimate_density()?.expect("elements has length 1"); + let exact = lv + .compute_density(&mut ctx)? + .expect("elements has length 1"); + let est = lv + .estimate_density(&mut ctx)? + .expect("elements has length 1"); assert_eq!(exact, 0.0); assert_eq!(est, 0.0); @@ -55,9 +71,10 @@ fn all_empty_lists_is_zero_density() -> VortexResult<()> { #[test] fn overlap_full_coverage_clamps_estimate() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); let lv = create_overlapping_listview(); - let exact = lv.compute_density().expect("non-empty"); - let est = lv.estimate_density()?.expect("non-empty"); + let exact = lv.compute_density(&mut ctx)?.expect("non-empty"); + let est = lv.estimate_density(&mut ctx)?.expect("non-empty"); assert!((exact - 1.0).abs() < EPS); assert!((est - 1.0).abs() < EPS); @@ -66,10 +83,11 @@ fn overlap_full_coverage_clamps_estimate() -> VortexResult<()> { #[test] fn overlap_differential_exact_lower_than_estimate() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); let lv = create_sparse_overlapping_listview(); - let exact = lv.compute_density().expect("non-empty"); - let est = lv.estimate_density()?.expect("non-empty"); + let exact = lv.compute_density(&mut ctx)?.expect("non-empty"); + let est = lv.estimate_density(&mut ctx)?.expect("non-empty"); assert!((exact - 0.25).abs() < EPS); assert!((est - 0.40).abs() < EPS); @@ -78,15 +96,17 @@ fn overlap_differential_exact_lower_than_estimate() -> VortexResult<()> { #[test] fn empty_elements_returns_none() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); let lv = create_empty_elements_listview(); - assert!(lv.compute_density().is_none()); - assert!(lv.estimate_density()?.is_none()); + assert!(lv.compute_density(&mut ctx)?.is_none()); + assert!(lv.estimate_density(&mut ctx)?.is_none()); Ok(()) } #[test] fn estimate_uses_cached_sum_stat() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); let lv = create_basic_listview(); // Pre-populate Stat::Sum with a deliberately-wrong 5 so we can prove // estimate_density reads from the cache instead of computing fresh. @@ -94,16 +114,17 @@ fn estimate_uses_cached_sum_stat() -> VortexResult<()> { .statistics() .set(Stat::Sum, Precision::Exact(ScalarValue::from(5u64))); - let est = lv.estimate_density()?.expect("non-empty"); + let est = lv.estimate_density(&mut ctx)?.expect("non-empty"); assert!((est - 0.5).abs() < EPS); Ok(()) } #[test] fn referenced_mask_set_bits_match_views() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); let lv = create_sparse_overlapping_listview(); let mask = lv - .compute_referenced_elements_mask() + .compute_referenced_elements_mask(&mut ctx)? .expect("non-empty elements"); let bits = match mask { Mask::Values(v) => v, diff --git a/vortex-array/src/stats/array.rs b/vortex-array/src/stats/array.rs index fd41090c528..7718237c6a5 100644 --- a/vortex-array/src/stats/array.rs +++ b/vortex-array/src/stats/array.rs @@ -153,6 +153,8 @@ impl StatsSetRef<'_> { f(&mut lock.iter()) } + /// Returns the value of `stat` by either fetching it from cache if it exists and is [`Precision::Exact`], or falling back to + /// computation. pub fn compute_stat(&self, stat: Stat, ctx: &mut ExecutionCtx) -> VortexResult> { // If it's already computed and exact, we can return it. if let Some(Precision::Exact(s)) = self.get(stat) { diff --git a/vortex-duckdb/src/exporter/list_view.rs b/vortex-duckdb/src/exporter/list_view.rs index 9ec07f33a28..41b5b2bead8 100644 --- a/vortex-duckdb/src/exporter/list_view.rs +++ b/vortex-duckdb/src/exporter/list_view.rs @@ -52,7 +52,7 @@ pub(crate) fn new_exporter( ) -> VortexResult> { let len = array.len(); - let compact_array = if array.should_rebuild(false) { + let compact_array = if array.should_rebuild(false, ctx)? { array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)? } else { array From 15dd4d308b1a2ed8d5eb1037da96c6bcd061952b Mon Sep 17 00:00:00 2001 From: Matthew Katz Date: Thu, 21 May 2026 15:53:05 -0400 Subject: [PATCH 11/13] maybe rebuild at arrow boundary Signed-off-by: Matthew Katz --- vortex-array/src/arrow/executor/list_view.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vortex-array/src/arrow/executor/list_view.rs b/vortex-array/src/arrow/executor/list_view.rs index ef858fa9916..81f756b3518 100644 --- a/vortex-array/src/arrow/executor/list_view.rs +++ b/vortex-array/src/arrow/executor/list_view.rs @@ -11,10 +11,10 @@ use vortex_error::vortex_ensure; use crate::ArrayRef; use crate::ExecutionCtx; -use crate::arrays::ListView; use crate::arrays::ListViewArray; use crate::arrays::PrimitiveArray; use crate::arrays::listview::ListViewDataParts; +use crate::arrays::listview::ListViewRebuildMode; use crate::arrow::executor::validity::to_arrow_null_buffer; use crate::arrow::session::ArrowSessionExt; use crate::builtins::ArrayBuiltins; @@ -27,15 +27,16 @@ pub(super) fn to_arrow_list_view( elements_field: &FieldRef, ctx: &mut ExecutionCtx, ) -> VortexResult { - // Check for Vortex ListViewArray and convert directly. - let array = match array.try_downcast::() { - Ok(array) => return list_view_to_list_view::(array, elements_field, ctx), - Err(array) => array, + let array = array.execute::(ctx)?; + + // If array is sparse, rebuild before handing it to Arrow + let array = if array.should_rebuild(false, ctx)? { + array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)? + } else { + array }; - // Otherwise, we execute to ListViewArray and convert. - let list_view_array = array.execute::(ctx)?; - list_view_to_list_view::(list_view_array, elements_field, ctx) + list_view_to_list_view::(array, elements_field, ctx) } fn list_view_to_list_view( From 14b76c9a526b1d8decda0e64bcb791c0b5f14092 Mon Sep 17 00:00:00 2001 From: Matthew Katz Date: Thu, 21 May 2026 16:00:10 -0400 Subject: [PATCH 12/13] lint Signed-off-by: Matthew Katz --- vortex-array/src/arrays/listview/array.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/vortex-array/src/arrays/listview/array.rs b/vortex-array/src/arrays/listview/array.rs index 5d27b29fd0c..edd1492fcca 100644 --- a/vortex-array/src/arrays/listview/array.rs +++ b/vortex-array/src/arrays/listview/array.rs @@ -408,6 +408,7 @@ pub trait ListViewArrayExt: TypedArrayRef { /// and allocates a `BitBuffer` of length `elements.len()`, so it is extremely costly. /// /// Returns `Ok(None)` when `elements` is empty. + #[allow(clippy::cognitive_complexity, clippy::unnecessary_fallible_conversions)] fn compute_referenced_elements_mask( &self, ctx: &mut ExecutionCtx, From cd7c97e8488eeafcebb9a0f947fb730aad492dd8 Mon Sep 17 00:00:00 2001 From: Matthew Katz Date: Thu, 21 May 2026 16:03:40 -0400 Subject: [PATCH 13/13] public api Signed-off-by: Matthew Katz --- vortex-array/public-api.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index 543211d6bf2..c76f588c402 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -4446,13 +4446,13 @@ pub vortex_array::arrays::listview::ListViewDataParts::validity: vortex_array::v pub trait vortex_array::arrays::listview::ListViewArrayExt: vortex_array::TypedArrayRef -pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_density(&self) -> core::option::Option +pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult> -pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_referenced_elements_mask(&self) -> core::option::Option +pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_referenced_elements_mask(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult> pub fn vortex_array::arrays::listview::ListViewArrayExt::elements(&self) -> &vortex_array::ArrayRef -pub fn vortex_array::arrays::listview::ListViewArrayExt::estimate_density(&self) -> vortex_error::VortexResult> +pub fn vortex_array::arrays::listview::ListViewArrayExt::estimate_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult> pub fn vortex_array::arrays::listview::ListViewArrayExt::list_elements_at(&self, usize) -> vortex_error::VortexResult @@ -4472,13 +4472,13 @@ pub fn vortex_array::arrays::listview::ListViewArrayExt::verify_is_zero_copy_to_ impl> vortex_array::arrays::listview::ListViewArrayExt for T -pub fn T::compute_density(&self) -> core::option::Option +pub fn T::compute_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult> -pub fn T::compute_referenced_elements_mask(&self) -> core::option::Option +pub fn T::compute_referenced_elements_mask(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult> pub fn T::elements(&self) -> &vortex_array::ArrayRef -pub fn T::estimate_density(&self) -> vortex_error::VortexResult> +pub fn T::estimate_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult> pub fn T::list_elements_at(&self, usize) -> vortex_error::VortexResult @@ -23966,7 +23966,7 @@ impl vortex_array::Array pub fn vortex_array::Array::rebuild(&self, vortex_array::arrays::listview::ListViewRebuildMode) -> vortex_error::VortexResult -pub fn vortex_array::Array::should_rebuild(&self, bool) -> bool +pub fn vortex_array::Array::should_rebuild(&self, bool, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult impl vortex_array::Array