Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions vortex-array/public-api.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4446,8 +4446,14 @@ pub vortex_array::arrays::listview::ListViewDataParts::validity: vortex_array::v

pub trait vortex_array::arrays::listview::ListViewArrayExt: vortex_array::TypedArrayRef<vortex_array::arrays::ListView>

pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<f32>>

pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_referenced_elements_mask(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_mask::Mask>>

pub fn vortex_array::arrays::listview::ListViewArrayExt::elements(&self) -> &vortex_array::ArrayRef

pub fn vortex_array::arrays::listview::ListViewArrayExt::estimate_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<f32>>

pub fn vortex_array::arrays::listview::ListViewArrayExt::list_elements_at(&self, usize) -> vortex_error::VortexResult<vortex_array::ArrayRef>

pub fn vortex_array::arrays::listview::ListViewArrayExt::listview_validity(&self) -> vortex_array::validity::Validity
Expand All @@ -4466,8 +4472,14 @@ pub fn vortex_array::arrays::listview::ListViewArrayExt::verify_is_zero_copy_to_

impl<T: vortex_array::TypedArrayRef<vortex_array::arrays::ListView>> vortex_array::arrays::listview::ListViewArrayExt for T

pub fn T::compute_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<f32>>

pub fn T::compute_referenced_elements_mask(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_mask::Mask>>

pub fn T::elements(&self) -> &vortex_array::ArrayRef

pub fn T::estimate_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<f32>>

pub fn T::list_elements_at(&self, usize) -> vortex_error::VortexResult<vortex_array::ArrayRef>

pub fn T::listview_validity(&self) -> vortex_array::validity::Validity
Expand Down Expand Up @@ -23954,6 +23966,8 @@ impl vortex_array::Array<vortex_array::arrays::ListView>

pub fn vortex_array::Array<vortex_array::arrays::ListView>::rebuild(&self, vortex_array::arrays::listview::ListViewRebuildMode) -> vortex_error::VortexResult<vortex_array::arrays::ListViewArray>

pub fn vortex_array::Array<vortex_array::arrays::ListView>::should_rebuild(&self, bool, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<bool>

impl vortex_array::Array<vortex_array::arrays::Masked>

pub fn vortex_array::Array<vortex_array::arrays::Masked>::try_new(vortex_array::ArrayRef, vortex_array::validity::Validity) -> vortex_error::VortexResult<Self>
Expand Down
15 changes: 1 addition & 14 deletions vortex-array/src/arrays/filter/execute/listview.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@ use vortex_mask::MaskValues;

use crate::arrays::ListViewArray;
use crate::arrays::filter::execute::filter_validity;
use crate::arrays::listview;
use crate::arrays::listview::ListViewArrayExt;
use crate::arrays::listview::ListViewRebuildMode;

/// [`ListViewArray`] filter implementation.
///
Expand Down Expand Up @@ -55,18 +53,7 @@ pub fn filter_listview(array: &ListViewArray, selection_mask: &Arc<MaskValues>)
// - Offsets and sizes are derived from existing valid child arrays.
// - Offsets and sizes have the same length (both filtered by `selection_mask`).
// - Validity matches the filtered array's nullability.
let new_array = unsafe {
ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity)
};

let kept_row_fraction = selection_mask.true_count() as f32 / array.sizes().len() as f32;
if kept_row_fraction < listview::compute::REBUILD_DENSITY_THRESHOLD {
new_array
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)
.vortex_expect("ListViewArray rebuild to zero-copy List should always succeed")
} else {
new_array
}
unsafe { ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity) }
}

#[cfg(test)]
Expand Down
89 changes: 89 additions & 0 deletions vortex-array/src/arrays/listview/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,17 @@ use std::sync::Arc;

use num_traits::AsPrimitive;
use smallvec::smallvec;
use vortex_buffer::BitBufferMut;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_error::vortex_bail;
use vortex_error::vortex_ensure;
use vortex_error::vortex_err;
use vortex_mask::Mask;

use crate::ArrayRef;
use crate::ArraySlots;
use crate::ExecutionCtx;
use crate::LEGACY_SESSION;
#[expect(deprecated)]
use crate::ToCanonical as _;
Expand All @@ -30,6 +33,7 @@ use crate::arrays::PrimitiveArray;
use crate::arrays::bool;
use crate::dtype::DType;
use crate::dtype::IntegerPType;
use crate::expr::stats::Stat;
use crate::match_each_integer_ptype;
use crate::validity::Validity;

Expand Down Expand Up @@ -396,6 +400,91 @@ pub trait ListViewArrayExt: TypedArrayRef<ListView> {
let sizes_primitive = self.sizes().to_primitive();
validate_zctl(self.elements(), offsets_primitive, sizes_primitive).is_ok()
}

/// Returns a [`Mask`] of length `elements.len()` where each bit is set iff that
/// position in `elements` is referenced by at least one view.
///
/// Walks every `(offset, size)` pair, canonicalizes both `offsets` and `sizes`,
/// and allocates a `BitBuffer` of length `elements.len()`, so it is extremely costly.
///
/// Returns `Ok(None)` when `elements` is empty.
///
/// # Errors
///
/// Propagates any error raised while executing `offsets` or `sizes` into a
/// [`PrimitiveArray`].
///
/// # Panics
///
/// Presumably panics (through `vortex_expect`) if an offset or size does not fit in `usize`,
/// e.g. a negative value in a signed integer array — TODO confirm `vortex_expect` semantics.
// NOTE(review): `unnecessary_fallible_conversions` is presumably allowed because the macros
// below also instantiate the loop for integer types where `usize: From<_>` holds, which is
// exactly what that lint flags — confirm before removing the allow.
#[allow(clippy::cognitive_complexity, clippy::unnecessary_fallible_conversions)]
fn compute_referenced_elements_mask(
&self,
ctx: &mut ExecutionCtx,
) -> VortexResult<Option<Mask>> {
// An empty `elements` child has no positions to describe, so there is no mask to build.
let len = self.elements().len();
if len == 0 {
return Ok(None);
}

// Materialize both children as primitive arrays so they can be read as plain slices.
let offsets_primitive = self.offsets().clone().execute::<PrimitiveArray>(ctx)?;
let sizes_primitive = self.sizes().clone().execute::<PrimitiveArray>(ctx)?;

// One bit per position in `elements`, all initially unset.
let mut buf = BitBufferMut::new_unset(len);
// Length of the list-view array itself; presumably one `(offset, size)` pair per row.
let offset_len = self.as_ref().len();

// `offsets` and `sizes` may use different integer types, hence the nested dispatch.
match_each_integer_ptype!(offsets_primitive.ptype(), |O| {
match_each_integer_ptype!(sizes_primitive.ptype(), |S| {
let offsets_slice = offsets_primitive.as_slice::<O>();
let sizes_slice = sizes_primitive.as_slice::<S>();

for i in 0..offset_len {
let start =
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't use a fallible convert use the numtrait infallible one.

less branching

usize::try_from(offsets_slice[i]).vortex_expect("offset must fit in usize");
let size =
usize::try_from(sizes_slice[i]).vortex_expect("size must fit in usize");
// Mark the view's range as referenced (presumably the half-open range
// `start..start + size`); overlapping views simply set the same bits again.
buf.fill_range(start, start + size, true);
}
})
});

Ok(Some(Mask::from_buffer(buf.freeze())))
}

/// Computes the exact density of this list view: the share of `elements` positions that at
/// least one view points at, as a value in `[0.0, 1.0]`.
///
/// The full referenced-elements mask is built via
/// [`compute_referenced_elements_mask`](Self::compute_referenced_elements_mask), which makes
/// this call extremely costly.
///
/// Yields `Ok(None)` when no mask is produced, i.e. when `elements` is empty.
fn compute_density(&self, ctx: &mut ExecutionCtx) -> VortexResult<Option<f32>> {
    let Some(referenced) = self.compute_referenced_elements_mask(ctx)? else {
        return Ok(None);
    };

    let density = match referenced {
        // Every position is referenced.
        Mask::AllTrue(_) => 1.0,
        // No position is referenced.
        Mask::AllFalse(_) => 0.0,
        // Mixed: referenced positions over the total number of positions.
        Mask::Values(values) => values.true_count() as f32 / self.elements().len() as f32,
    };

    Ok(Some(density))
}

/// Upper-bound estimate of [`compute_density`](Self::compute_density) via
/// `sum(sizes) / elements.len()`, clamped to `[0.0, 1.0]`.
///
/// Exact for non-overlapping views, but overcounts when multiple views share the same elements.
///
/// Returns `Ok(None)` when `elements` is empty, and `Ok(Some(0.0))` when `elements` is
/// non-empty but `sizes` is empty (nothing can be referenced).
///
/// # Errors
///
/// Returns an error if computing the `Sum` statistic of `sizes` fails, if that statistic is
/// unavailable, or if the sum cannot be represented as a `u64`.
fn estimate_density(&self, ctx: &mut ExecutionCtx) -> VortexResult<Option<f32>> {
// Density is undefined without any elements.
let n_elts = self.elements().len();
if n_elts == 0 {
return Ok(None);
}

// No sizes means no views reference anything, so skip the statistics machinery.
let sizes = self.sizes();
if sizes.is_empty() {
return Ok(Some(0.0));
}

// compute_stat short-circuits on a cached exact Sum and otherwise computes-and-caches.
let sizes_sum = sizes
.statistics()
.compute_stat(Stat::Sum, ctx)?
.ok_or_else(|| vortex_err!("Sum stat unavailable for sizes"))?
.as_primitive()
.as_::<u64>()
.ok_or_else(|| vortex_err!("could not cast sum of sizes to u64"))?;

// The clamp is load-bearing, not defensive: overlapping views are counted once per view, so
// `sizes_sum` can legitimately exceed `n_elts` and the raw ratio can be greater than 1.0.
// The lower bound can never trigger because both operands are unsigned.
let estimate = (sizes_sum as f32 / n_elts as f32).clamp(0.0, 1.0);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how can this go above or below 0,1 maybe a panic here.

or debug assert

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer a debug assert here


Ok(Some(estimate))
}
}
impl<T: TypedArrayRef<ListView>> ListViewArrayExt for T {}

Expand Down
12 changes: 0 additions & 12 deletions vortex-array/src/arrays/listview/compute/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,3 @@ mod mask;
pub(crate) mod rules;
mod slice;
mod take;

/// The threshold below which we rebuild the elements of a listview.
///
/// We don't touch `elements` on the metadata-only path since reorganizing it can be expensive.
/// However, we also don't want to drag around a large amount of garbage data when the selection
/// is sparse. Below this fraction of list rows retained, the rebuild is worth it.
/// Rebuilding is needed when exporting the ListView's elements.
///
// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
// compute functions have run, at the "top" of the operator tree. However, we cannot do this
// right now, so we will just rebuild every time (similar to [`ListArray`]).
pub(crate) const REBUILD_DENSITY_THRESHOLD: f32 = 0.1;
29 changes: 1 addition & 28 deletions vortex-array/src/arrays/listview/compute/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
use num_traits::Zero;
use vortex_error::VortexResult;

use super::REBUILD_DENSITY_THRESHOLD;
use crate::ArrayRef;
use crate::ExecutionCtx;
use crate::IntoArray;
Expand All @@ -14,7 +13,6 @@ use crate::arrays::ListViewArray;
use crate::arrays::dict::TakeExecute;
use crate::arrays::dict::TakeReduce;
use crate::arrays::listview::ListViewArrayExt;
use crate::arrays::listview::ListViewRebuildMode;
use crate::builtins::ArrayBuiltins;
use crate::dtype::Nullability;
use crate::match_each_integer_ptype;
Expand All @@ -23,43 +21,18 @@ use crate::scalar::Scalar;
/// Metadata-only take for [`ListViewArray`].
impl TakeReduce for ListView {
fn take(array: ArrayView<'_, ListView>, indices: &ArrayRef) -> VortexResult<Option<ArrayRef>> {
// Approximate element density by the fraction of list rows retained. Assumes roughly
// uniform list sizes; good enough to decide whether dragging along the full `elements`
// buffer is worth avoiding a rebuild.
let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32;
if kept_row_fraction < REBUILD_DENSITY_THRESHOLD {
return Ok(None);
}

Ok(Some(apply_take(array, indices)?.into_array()))
}
}

/// Execution-path take for [`ListViewArray`].
///
/// This does the same metadata-only take as [`TakeReduce`], but also rebuilds the array if the
/// resulting array will be less dense than `REBUILD_DENSITY_THRESHOLD`.
impl TakeExecute for ListView {
fn take(
array: ArrayView<'_, ListView>,
indices: &ArrayRef,
_ctx: &mut ExecutionCtx,
) -> VortexResult<Option<ArrayRef>> {
let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32;
let taken = apply_take(array, indices)?;

if kept_row_fraction < REBUILD_DENSITY_THRESHOLD {
// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
// compute functions have run, at the "top" of the operator tree. However, we cannot do
// this right now, so we will just rebuild every time (similar to `ListArray`).
Ok(Some(
taken
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?
.into_array(),
))
} else {
Ok(Some(taken.into_array()))
}
Ok(Some(apply_take(array, indices)?.into_array()))
}
}

Expand Down
18 changes: 18 additions & 0 deletions vortex-array/src/arrays/listview/rebuild.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use vortex_buffer::BufferMut;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;

use crate::ExecutionCtx;
use crate::IntoArray;
use crate::LEGACY_SESSION;
#[expect(deprecated)]
Expand All @@ -25,6 +26,13 @@ use crate::match_each_integer_ptype;
use crate::scalar::Scalar;
use crate::scalar_fn::fns::operators::Operator;

/// Density threshold to decide whether to rebuild a sparse `ListViewArray`.
///
/// A `ListViewArray` can accumulate unreferenced bytes in its `elements` buffer after
/// metadata-only operations like `take` and `filter`. When density (referenced fraction of `elements`)
/// falls below this threshold, the benefits of a rebuild may outweigh its cost.
///
/// NOTE: this value is a heuristic guess rather than a measured optimum. The best cut-off
/// likely differs between element dtypes and between consumers of the list, so treat it as a
/// default, not as a guarantee that rebuilding pays off.
const REBUILD_DENSITY_THRESHOLD: f32 = 0.1;
Comment thread
mhk197 marked this conversation as resolved.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Explain that it's a guess, and that the right value likely differs across element dtypes and across users of the list.


/// Modes for rebuilding a [`ListViewArray`].
pub enum ListViewRebuildMode {
/// Removes all unused data and flattens out all list data, such that the array is zero-copyable
Expand Down Expand Up @@ -376,6 +384,16 @@ impl ListViewArray {
self.rebuild_zero_copy_to_list()
}
}

/// Reports whether this array's density is below `REBUILD_DENSITY_THRESHOLD`, i.e. whether
/// a rebuild is likely worth its cost.
///
/// Density is the fraction of `elements` referenced by at least one view. The `exact` flag
/// selects how it is obtained:
///
/// - `exact == true`: uses `compute_density`, which is precise but documented as extremely
///   costly.
/// - `exact == false`: uses `estimate_density`, which is an upper bound on the real density
///   when views overlap. It can therefore return `false` for an array that the exact check
///   would flag for rebuilding.
///
/// When no density is available (`None`, which both helpers document as the empty-`elements`
/// case), the density is treated as `1.0`, so the result is `false`.
///
/// This method only answers the question; it never performs the rebuild itself.
///
/// # Errors
///
/// Propagates any error from the selected density computation.
pub fn should_rebuild(&self, exact: bool, ctx: &mut ExecutionCtx) -> VortexResult<bool> {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment. explain this. also this looks like a footgun.

add the docstr with a detail message here or point to the threshold!

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Like Joe said, this seems like a footgun. I think that the caller should just decide, and the REBUILD_DENSITY_THRESHOLD can just be DEFAULT_REBUILD_DENSITY_THRESHOLD or similar

let density = if exact {
self.compute_density(ctx)?
} else {
self.estimate_density(ctx)?
};

// A missing density counts as fully dense (1.0), which never falls below the threshold.
Ok(density.unwrap_or(1.0) < REBUILD_DENSITY_THRESHOLD)
}
}

#[cfg(test)]
Expand Down
20 changes: 20 additions & 0 deletions vortex-array/src/arrays/listview/tests/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@ pub fn create_basic_listview() -> ListViewArray {
}
}

/// Builds a sparse, non-nullable ListView whose views cluster in two overlapping regions at
/// either end of a 20-element buffer: `[[0,1,2], [1,2], [18, 19], [19]]`.
///
/// Positions 3 through 17 of `elements` are not referenced by any view.
// NOTE(review): this relies on `buffer![0i32..20]` yielding the 20 values `0..20` rather than
// a single `Range` value — confirm against the `buffer!` macro definition.
pub fn create_sparse_overlapping_listview() -> ListViewArray {
    ListViewArray::new(
        buffer![0i32..20].into_array(),
        buffer![0u32, 1, 18, 19].into_array(),
        buffer![3u32, 2, 2, 1].into_array(),
        Validity::NonNullable,
    )
}

/// Creates a nullable ListView: [[10,20], null, [50]]
pub fn create_nullable_listview() -> ListViewArray {
let elements = buffer![10i32, 20, 30, 40, 50].into_array();
Expand All @@ -45,6 +54,17 @@ pub fn create_empty_lists_listview() -> ListViewArray {
}
}

/// Creates a completely empty, non-nullable ListView: zero lists over zero elements (`[]`).
///
/// Unlike a ListView that contains empty lists, this array has no rows at all: `elements`,
/// `offsets`, and `sizes` are all zero-length. The result is flagged as zero-copy-to-list.
pub fn create_empty_elements_listview() -> ListViewArray {
    let no_elements = PrimitiveArray::from_iter::<[i32; 0]>([]).into_array();
    let no_offsets = buffer![0u32; 0].into_array();
    let no_sizes = buffer![0u32; 0].into_array();

    // SAFETY: `offsets` and `sizes` have the same (zero) length and the validity is
    // non-nullable, so there is no view that could point outside `elements`. With no views
    // there is also nothing that could overlap or be out of order, which is presumably what
    // the zero-copy-to-list flag requires.
    unsafe {
        let listview =
            ListViewArray::new_unchecked(no_elements, no_offsets, no_sizes, Validity::NonNullable);
        listview.with_zero_copy_to_list(true)
    }
}

/// Creates a ListView with overlapping lists and out-of-order offsets
/// Lists: [[5,6,7], [2,3], [8,9], [0,1], [1,2,3,4]]
pub fn create_overlapping_listview() -> ListViewArray {
Expand Down
Loading
Loading