From baceda26783d13b8ce083cce4fb190f87c66109b Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 8 May 2026 09:31:37 -0400 Subject: [PATCH 01/11] Sparse Dictionary Canonicalize Signed-off-by: Nicholas Gates --- vortex-array/src/arrays/dict/vtable/mod.rs | 322 +++++++++++++++++++++ vortex-duckdb/src/exporter/cache.rs | 2 - vortex-duckdb/src/exporter/dict.rs | 193 ++++++++++-- 3 files changed, 494 insertions(+), 23 deletions(-) diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs index 4893be5c7ed..dd5b87a772f 100644 --- a/vortex-array/src/arrays/dict/vtable/mod.rs +++ b/vortex-array/src/arrays/dict/vtable/mod.rs @@ -4,12 +4,15 @@ use std::hash::Hasher; use kernel::PARENT_KERNELS; +use num_traits::FromPrimitive; use prost::Message; +use vortex_buffer::Buffer; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; use vortex_error::vortex_err; use vortex_error::vortex_panic; +use vortex_mask::Mask; use vortex_session::VortexSession; use vortex_session::registry::CachedId; @@ -24,6 +27,7 @@ use crate::ArrayEq; use crate::ArrayHash; use crate::ArrayRef; use crate::Canonical; +use crate::IntoArray; use crate::Precision; use crate::array::Array; use crate::array::ArrayId; @@ -32,16 +36,19 @@ use crate::array::ArrayView; use crate::array::VTable; use crate::arrays::ConstantArray; use crate::arrays::Primitive; +use crate::arrays::PrimitiveArray; use crate::arrays::dict::DictArrayExt; use crate::arrays::dict::DictArraySlotsExt; use crate::arrays::dict::compute::rules::PARENT_RULES; use crate::arrays::dict::execute::take_canonical; use crate::buffer::BufferHandle; use crate::dtype::DType; +use crate::dtype::IntegerPType; use crate::dtype::Nullability; use crate::dtype::PType; use crate::executor::ExecutionCtx; use crate::executor::ExecutionResult; +use crate::match_each_integer_ptype; use crate::require_child; use crate::scalar::Scalar; use crate::serde::ArrayChildren; @@ -54,6 +61,11 @@ mod validity; /// A [`Dict`]-encoded Vortex array. pub type DictArray = Array; +// TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values +// encoding, code count, unique-code count, and exporter/canonicalization costs. +const SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD: usize = 4; +const SPARSE_CANONICALIZE_SAMPLE_SIZE: usize = 128; + #[derive(Clone, Debug)] pub struct Dict; @@ -185,6 +197,10 @@ impl VTable for Dict { ))); } + if let Some(canonical) = sparse_canonicalize_dict(&array, ctx)? { + return Ok(ExecutionResult::done(canonical)); + } + let array = require_child!(array, array.values(), DictSlots::VALUES => AnyCanonical); let DictParts { values, codes, .. } = array.into_parts(); @@ -213,3 +229,309 @@ impl VTable for Dict { PARENT_KERNELS.execute(array, parent, child_idx, ctx) } } + +struct SparseDictCodes { + unique_codes: PrimitiveArray, + remapped_codes: PrimitiveArray, +} + +fn sparse_canonicalize_dict( + array: &DictArray, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let codes = array.codes().as_::().into_owned(); + let Some(sparse_codes) = collect_sparse_codes(&codes, array.values().len(), ctx)? else { + return Ok(None); + }; + + let values = array.values(); + let unique_values_parent = DictArray::new( + sparse_codes.unique_codes.clone().into_array(), + values.clone(), + ) + .into_array(); + let unique_values = if let Some(taken_values) = + values.execute_parent(&unique_values_parent, DictSlots::VALUES, ctx)? + { + taken_values.execute::(ctx)?.into_array() + } else { + let canonical_values = values.clone().execute::(ctx)?.into_array(); + DictArray::new(sparse_codes.unique_codes.into_array(), canonical_values) + .into_array() + .execute::(ctx)? + .into_array() + }; + + let compact_dict = unsafe { + DictArray::new_unchecked(sparse_codes.remapped_codes.into_array(), unique_values) + .set_all_values_referenced(true) + }; + + compact_dict + .into_array() + .execute::(ctx) + .map(Some) +} + +fn collect_sparse_codes( + codes: &PrimitiveArray, + values_len: usize, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let validity = codes.validity()?; + let validity_mask = validity.execute_mask(codes.len(), ctx)?; + + if !should_collect_sparse_codes(codes, values_len, &validity_mask) { + return Ok(None); + } + + let Some(sparse_codes) = match_each_integer_ptype!(codes.ptype(), |P| { + collect_sparse_codes_typed::

(codes, values_len, validity_mask, validity)? + }) else { + return Ok(None); + }; + + Ok(Some(sparse_codes)) +} + +fn should_collect_sparse_codes( + codes: &PrimitiveArray, + values_len: usize, + validity_mask: &Mask, +) -> bool { + if codes.is_empty() || values_len == 0 || validity_mask.true_count() == 0 { + return false; + } + + if codes + .len() + .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) + < values_len + { + return true; + } + + let Some(estimated_unique_codes) = match_each_integer_ptype!(codes.ptype(), |P| { + estimate_code_cardinality::

(codes, validity_mask) + }) else { + return false; + }; + + estimated_unique_codes.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) + < values_len +} + +fn estimate_code_cardinality( + codes: &PrimitiveArray, + validity_mask: &Mask, +) -> Option { + let sample_count = codes.len().min(SPARSE_CANONICALIZE_SAMPLE_SIZE); + let mut observed_codes = Vec::<(usize, usize)>::new(); + + for sample_idx in 0..sample_count { + let idx = sample_index(sample_idx, codes.len(), sample_count); + if !validity_mask.value(idx) { + continue; + } + + let code = codes.as_slice::

()[idx].as_(); + if let Some((_, count)) = observed_codes + .iter_mut() + .find(|(observed, _)| *observed == code) + { + *count += 1; + } else { + observed_codes.push((code, 1)); + } + } + + if observed_codes.is_empty() { + return None; + } + + let unique_count = observed_codes.len(); + let singleton_count = observed_codes + .iter() + .filter(|(_, count)| *count == 1) + .count(); + let doubleton_count = observed_codes + .iter() + .filter(|(_, count)| *count == 2) + .count(); + + let unseen_estimate = if doubleton_count == 0 { + singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2 + } else { + div_ceil( + singleton_count.saturating_mul(singleton_count), + 2 * doubleton_count, + ) + }; + + Some(unique_count.saturating_add(unseen_estimate)) +} + +fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize { + debug_assert!(len > 0); + debug_assert!(sample_count > 0); + + let sample_idx = sample_idx as u128; + let len = len as u128; + let sample_count = sample_count as u128; + let bucket_start = sample_idx * len / sample_count; + let bucket_end = (sample_idx + 1) * len / sample_count; + + ((bucket_start + bucket_end) / 2).min(len - 1) as usize +} + +fn div_ceil(numerator: usize, denominator: usize) -> usize { + debug_assert!(denominator > 0); + numerator / denominator + usize::from(!numerator.is_multiple_of(denominator)) +} + +fn collect_sparse_codes_typed( + codes: &PrimitiveArray, + values_len: usize, + validity_mask: Mask, + validity: Validity, +) -> VortexResult> { + let mut value_remap = vec![usize::MAX; values_len]; + let mut unique_codes = Vec::new(); + let mut remapped_codes = Vec::with_capacity(codes.len()); + + for (idx, &code) in codes.as_slice::

().iter().enumerate() { + if !validity_mask.value(idx) { + remapped_codes.push(P::default()); + continue; + } + + let old_code = code.as_(); + let mut new_code = value_remap[old_code]; + if new_code == usize::MAX { + new_code = unique_codes.len(); + value_remap[old_code] = new_code; + unique_codes.push(old_code as u64); + } + + remapped_codes.push(P::from_usize(new_code).unwrap_or_else(|| { + vortex_panic!( + "compacted dictionary code {new_code} does not fit in {}", + P::PTYPE + ) + })); + } + + if unique_codes + .len() + .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) + >= values_len + { + return Ok(None); + } + + Ok(Some(SparseDictCodes { + unique_codes: PrimitiveArray::new(Buffer::from_iter(unique_codes), Validity::NonNullable), + remapped_codes: PrimitiveArray::new(Buffer::from_iter(remapped_codes), validity), + })) +} + +#[cfg(test)] +mod tests { + use vortex_error::VortexResult; + + use super::*; + use crate::LEGACY_SESSION; + use crate::VortexSessionExecute; + use crate::assert_arrays_eq; + + #[test] + fn collect_sparse_codes_remaps_unique_values() -> VortexResult<()> { + let codes = PrimitiveArray::from_option_iter([Some(50u32), None, Some(70), Some(50)]); + let Some(sparse) = + collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())? + else { + panic!("codes are sparse"); + }; + + assert_arrays_eq!( + sparse.unique_codes.into_array(), + PrimitiveArray::from_iter([50u64, 70]).into_array() + ); + assert_arrays_eq!( + sparse.remapped_codes.into_array(), + PrimitiveArray::from_option_iter([Some(0u32), None, Some(1), Some(0)]).into_array() + ); + + Ok(()) + } + + #[test] + fn sampled_sparse_codes_remaps_repeated_large_codes() -> VortexResult<()> { + let codes = PrimitiveArray::from_iter((0..1024).map(|_| 42u32)); + let Some(sparse) = + collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())? + else { + panic!("sampled codes are sparse"); + }; + + assert_arrays_eq!( + sparse.unique_codes.into_array(), + PrimitiveArray::from_iter([42u64]).into_array() + ); + assert_arrays_eq!( + sparse.remapped_codes.into_array(), + PrimitiveArray::from_iter((0..1024).map(|_| 0u32)).into_array() + ); + + Ok(()) + } + + #[test] + fn dense_sample_skips_sparse_code_collection() -> VortexResult<()> { + let codes = PrimitiveArray::from_iter((0..1024).map(|idx| (idx % 100) as u32)); + + assert!( + collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())? + .is_none() + ); + + Ok(()) + } + + #[test] + fn sparse_dict_canonicalizes_correctly() -> VortexResult<()> { + let dict = DictArray::new( + PrimitiveArray::from_option_iter([Some(50u32), None, Some(70), Some(50)]).into_array(), + PrimitiveArray::from_iter(0..100i32).into_array(), + ); + + let actual = dict + .into_array() + .execute::(&mut LEGACY_SESSION.create_execution_ctx())? + .into_array(); + + assert_arrays_eq!( + actual, + PrimitiveArray::from_option_iter([Some(50i32), None, Some(70), Some(50)]) + ); + + Ok(()) + } + + #[test] + fn sampled_sparse_dict_canonicalizes_repeated_codes() -> VortexResult<()> { + let dict = DictArray::new( + PrimitiveArray::from_iter((0..1024).map(|_| 42u32)).into_array(), + PrimitiveArray::from_iter(0..100i32).into_array(), + ); + + let actual = dict + .into_array() + .execute::(&mut LEGACY_SESSION.create_execution_ctx())? + .into_array(); + + assert_arrays_eq!(actual, PrimitiveArray::from_iter((0..1024).map(|_| 42i32))); + + Ok(()) + } +} diff --git a/vortex-duckdb/src/exporter/cache.rs b/vortex-duckdb/src/exporter/cache.rs index 2f495ba9608..e20efbf7117 100644 --- a/vortex-duckdb/src/exporter/cache.rs +++ b/vortex-duckdb/src/exporter/cache.rs @@ -5,7 +5,6 @@ use std::sync::Arc; use parking_lot::Mutex; use vortex::array::ArrayRef; -use vortex::array::Canonical; use vortex_utils::aliases::dash_map::DashMap; use crate::duckdb::ReusableDict; @@ -20,6 +19,5 @@ use crate::duckdb::Vector; pub struct ConversionCache { pub dict_cache: DashMap, pub values_cache: DashMap>)>, - pub canonical_cache: DashMap, pub file_index: usize, } diff --git a/vortex-duckdb/src/exporter/dict.rs b/vortex-duckdb/src/exporter/dict.rs index da03cd4254c..be14569a48c 100644 --- a/vortex-duckdb/src/exporter/dict.rs +++ b/vortex-duckdb/src/exporter/dict.rs @@ -33,6 +33,11 @@ struct DictExporter { codes_type: PhantomData, } +// TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values +// encoding, code count, unique-code count, and exporter/canonicalization costs. +const SPARSE_EXPORT_CODES_PER_VALUE_THRESHOLD: usize = 4; +const SPARSE_EXPORT_SAMPLE_SIZE: usize = 128; + pub(crate) fn new_exporter_with_flatten( array: &DictArray, cache: &ConversionCache, @@ -42,21 +47,21 @@ pub(crate) fn new_exporter_with_flatten( ) -> VortexResult> { // Grab the cache dictionary values. let values = array.values(); - let codes = array.codes(); - let codes_len = codes.len(); + let codes_array = array.codes(); + let codes_len = codes_array.len(); if let Some(constant) = values.as_opt::() { return constant::new_exporter_with_mask( ConstantArray::new(constant.scalar().clone(), codes_len), - codes.validity()?.execute_mask(codes_len, ctx)?, + codes_array.validity()?.execute_mask(codes_len, ctx)?, cache, ctx, ); } - let codes_mask = codes.validity()?.execute_mask(codes_len, ctx)?; + let codes_mask = codes_array.validity()?.execute_mask(codes_len, ctx)?; - match codes_mask { + match &codes_mask { Mask::AllTrue(_) => {} Mask::AllFalse(_) => return Ok(all_invalid::new_exporter()), Mask::Values(_) => { @@ -67,25 +72,24 @@ pub(crate) fn new_exporter_with_flatten( } let values_key = values.addr(); - let codes = array.codes().clone().execute::(ctx)?; + let codes = codes_array.clone().execute::(ctx)?; + + if !flatten && should_export_sparse(&codes, values.len(), &codes_mask) { + return new_array_exporter( + array + .clone() + .into_array() + .execute::(ctx)? + .into_array(), + cache, + ctx, + ); + } let reusable_dict = if flatten { - let canonical = cache - .canonical_cache - .get(&values_key) - .map(|entry| entry.value().1.clone()); - let canonical = match canonical { - Some(c) => c, - None => { - let canonical = values.clone().execute::(ctx)?; - cache - .canonical_cache - .insert(values_key, (values.clone(), canonical.clone())); - canonical - } - }; return new_array_exporter( - DictArray::new(array.codes().clone(), canonical.into_array()) + array + .clone() .into_array() .execute::(ctx)? .into_array(), @@ -129,6 +133,96 @@ pub(crate) fn new_exporter_with_flatten( }) } +fn should_export_sparse(codes: &PrimitiveArray, values_len: usize, codes_mask: &Mask) -> bool { + if codes.is_empty() || values_len == 0 || codes_mask.true_count() == 0 { + return false; + } + + if codes + .len() + .saturating_mul(SPARSE_EXPORT_CODES_PER_VALUE_THRESHOLD) + < values_len + { + return true; + } + + let Some(estimated_unique_codes) = match_each_integer_ptype!(codes.ptype(), |I| { + estimate_code_cardinality::(codes, codes_mask) + }) else { + return false; + }; + + estimated_unique_codes.saturating_mul(SPARSE_EXPORT_CODES_PER_VALUE_THRESHOLD) < values_len +} + +fn estimate_code_cardinality( + codes: &PrimitiveArray, + codes_mask: &Mask, +) -> Option { + let sample_count = codes.len().min(SPARSE_EXPORT_SAMPLE_SIZE); + let mut observed_codes = Vec::<(usize, usize)>::new(); + + for sample_idx in 0..sample_count { + let idx = sample_index(sample_idx, codes.len(), sample_count); + if !codes_mask.value(idx) { + continue; + } + + let code = codes.as_slice::()[idx].as_(); + if let Some((_, count)) = observed_codes + .iter_mut() + .find(|(observed, _)| *observed == code) + { + *count += 1; + } else { + observed_codes.push((code, 1)); + } + } + + if observed_codes.is_empty() { + return None; + } + + let unique_count = observed_codes.len(); + let singleton_count = observed_codes + .iter() + .filter(|(_, count)| *count == 1) + .count(); + let doubleton_count = observed_codes + .iter() + .filter(|(_, count)| *count == 2) + .count(); + + let unseen_estimate = if doubleton_count == 0 { + singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2 + } else { + div_ceil( + singleton_count.saturating_mul(singleton_count), + 2 * doubleton_count, + ) + }; + + Some(unique_count.saturating_add(unseen_estimate)) +} + +fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize { + debug_assert!(len > 0); + debug_assert!(sample_count > 0); + + let sample_idx = sample_idx as u128; + let len = len as u128; + let sample_count = sample_count as u128; + let bucket_start = sample_idx * len / sample_count; + let bucket_end = (sample_idx + 1) * len / sample_count; + + ((bucket_start + bucket_end) / 2).min(len - 1) as usize +} + +fn div_ceil(numerator: usize, denominator: usize) -> usize { + debug_assert!(denominator > 0); + numerator / denominator + usize::from(!numerator.is_multiple_of(denominator)) +} + impl> ColumnExporter for DictExporter { fn export( &self, @@ -285,6 +379,63 @@ mod tests { Ok(()) } + #[test] + fn test_sparse_dict_exports_flat() -> VortexResult<()> { + let arr = DictArray::new( + PrimitiveArray::from_iter([50u32, 70]).into_array(), + PrimitiveArray::from_iter(0..100i32).into_array(), + ); + + let mut chunk = DataChunk::new([LogicalType::new(cpp::duckdb_type::DUCKDB_TYPE_INTEGER)]); + + new_exporter(&arr, &ConversionCache::default())?.export( + 0, + 2, + chunk.get_vector_mut(0), + &mut SESSION.create_execution_ctx(), + )?; + chunk.set_len(2); + + assert_eq!( + format!("{}", String::try_from(&*chunk)?), + r#"Chunk - [1 Columns] +- FLAT INTEGER: 2 = [ 50, 70] +"# + ); + + Ok(()) + } + + #[test] + fn test_sampled_sparse_dict_exports_flat() -> VortexResult<()> { + let arr = DictArray::new( + PrimitiveArray::from_iter((0..32).map(|_| 42u32)).into_array(), + PrimitiveArray::from_iter(0..100i32).into_array(), + ); + + let mut chunk = DataChunk::new([LogicalType::new(cpp::duckdb_type::DUCKDB_TYPE_INTEGER)]); + + new_exporter(&arr, &ConversionCache::default())?.export( + 0, + 32, + chunk.get_vector_mut(0), + &mut SESSION.create_execution_ctx(), + )?; + chunk.set_len(32); + + let expected_values = std::iter::repeat_n("42", 32).collect::>().join(", "); + assert_eq!( + format!("{}", String::try_from(&*chunk)?), + format!( + r#"Chunk - [1 Columns] +- FLAT INTEGER: 32 = [ {expected_values}] +"# + ) + ); + + Ok(()) + } + #[ignore = "TODO(connor)[4809]: Exporters do not correctly handle empty vectors"] #[test] fn test_export_empty_dict() -> VortexResult<()> { From 90bc59c9e076445ae19c50f80699872fb5e92f49 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 8 May 2026 09:38:32 -0400 Subject: [PATCH 02/11] Document sparse dictionary cardinality estimation Signed-off-by: Nicholas Gates --- .../src/arrays/dict/vtable/cardinality.rs | 85 +++++++++++++++++ vortex-array/src/arrays/dict/vtable/mod.rs | 92 +++++-------------- vortex-duckdb/src/exporter/dict.rs | 70 +------------- .../src/exporter/dict_cardinality.rs | 84 +++++++++++++++++ vortex-duckdb/src/exporter/mod.rs | 1 + 5 files changed, 193 insertions(+), 139 deletions(-) create mode 100644 vortex-array/src/arrays/dict/vtable/cardinality.rs create mode 100644 vortex-duckdb/src/exporter/dict_cardinality.rs diff --git a/vortex-array/src/arrays/dict/vtable/cardinality.rs b/vortex-array/src/arrays/dict/vtable/cardinality.rs new file mode 100644 index 00000000000..848abec2997 --- /dev/null +++ b/vortex-array/src/arrays/dict/vtable/cardinality.rs @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_mask::Mask; + +use crate::arrays::PrimitiveArray; +use crate::dtype::IntegerPType; + +const SAMPLE_SIZE: usize = 128; + +pub(super) fn estimate_code_cardinality( + codes: &PrimitiveArray, + validity_mask: &Mask, +) -> Option { + let sample_count = codes.len().min(SAMPLE_SIZE); + let mut observed_codes = Vec::<(usize, usize)>::new(); + + // Sample deterministic bucket midpoints instead of using randomness. The estimate only gates + // whether to run the exact pass; correctness never depends on the sample. + for sample_idx in 0..sample_count { + let idx = sample_index(sample_idx, codes.len(), sample_count); + if !validity_mask.value(idx) { + continue; + } + + let code = codes.as_slice::

()[idx].as_(); + if let Some((_, count)) = observed_codes + .iter_mut() + .find(|(observed, _)| *observed == code) + { + *count += 1; + } else { + observed_codes.push((code, 1)); + } + } + + estimate_cardinality_from_observations(&observed_codes) +} + +fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Option { + if observed_codes.is_empty() { + return None; + } + + let unique_count = observed_codes.len(); + let singleton_count = observed_codes + .iter() + .filter(|(_, count)| *count == 1) + .count(); + let doubleton_count = observed_codes + .iter() + .filter(|(_, count)| *count == 2) + .count(); + + // Chao1-style lower-bias estimate for unseen codes. Repeated samples keep the estimate small + // for low-cardinality code streams; many singleton samples make dense streams look expensive. + let unseen_estimate = if doubleton_count == 0 { + singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2 + } else { + div_ceil( + singleton_count.saturating_mul(singleton_count), + 2 * doubleton_count, + ) + }; + + Some(unique_count.saturating_add(unseen_estimate)) +} + +fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize { + debug_assert!(len > 0); + debug_assert!(sample_count > 0); + + let sample_idx = sample_idx as u128; + let len = len as u128; + let sample_count = sample_count as u128; + let bucket_start = sample_idx * len / sample_count; + let bucket_end = (sample_idx + 1) * len / sample_count; + + ((bucket_start + bucket_end) / 2).min(len - 1) as usize +} + +fn div_ceil(numerator: usize, denominator: usize) -> usize { + debug_assert!(denominator > 0); + numerator / denominator + usize::from(!numerator.is_multiple_of(denominator)) +} diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs index dd5b87a772f..3b5070fc23e 100644 --- a/vortex-array/src/arrays/dict/vtable/mod.rs +++ b/vortex-array/src/arrays/dict/vtable/mod.rs @@ -54,6 +54,7 @@ use crate::scalar::Scalar; use crate::serde::ArrayChildren; use crate::validity::Validity; +mod cardinality; mod kernel; mod operations; mod validity; @@ -64,7 +65,6 @@ pub type DictArray = Array; // TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values // encoding, code count, unique-code count, and exporter/canonicalization costs. const SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD: usize = 4; -const SPARSE_CANONICALIZE_SAMPLE_SIZE: usize = 128; #[derive(Clone, Debug)] pub struct Dict; @@ -231,7 +231,9 @@ impl VTable for Dict { } struct SparseDictCodes { + /// Original dictionary value indices that are actually referenced by the live codes. unique_codes: PrimitiveArray, + /// Codes rewritten to index into `unique_codes` instead of the original values array. remapped_codes: PrimitiveArray, } @@ -244,6 +246,10 @@ fn sparse_canonicalize_dict( return Ok(None); }; + // Build a temporary parent that represents `values.take(unique_codes)`. Calling + // `execute_parent` on the values child lets encodings such as FSST/VarBin sparse-take just + // the referenced dictionary values. If the child has no specialized parent execution, fall + // back to canonicalizing all values and then taking from the canonical array. let values = array.values(); let unique_values_parent = DictArray::new( sparse_codes.unique_codes.clone().into_array(), @@ -262,6 +268,9 @@ fn sparse_canonicalize_dict( .into_array() }; + // Now the dictionary is dense over its compacted values, so normal dictionary execution only + // takes from the small `unique_values` array. This avoids `values.take(codes)` preserving a + // large dictionary with many unused values. let compact_dict = unsafe { DictArray::new_unchecked(sparse_codes.remapped_codes.into_array(), unique_values) .set_all_values_referenced(true) @@ -281,6 +290,8 @@ fn collect_sparse_codes( let validity = codes.validity()?; let validity_mask = validity.execute_mask(codes.len(), ctx)?; + // The exact pass below scans every code and allocates a remap table sized to the values array. + // Do it only when a cheap upper bound/sample says the dictionary is likely sparse enough. if !should_collect_sparse_codes(codes, values_len, &validity_mask) { return Ok(None); } @@ -303,6 +314,8 @@ fn should_collect_sparse_codes( return false; } + // If even the worst case "every live code is unique" is sparse, skip sampling and go straight + // to the exact remap pass. if codes .len() .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) @@ -311,8 +324,10 @@ fn should_collect_sparse_codes( return true; } + // Otherwise sample first. This catches cases like many live rows all referencing the same + // dictionary value without forcing dense dictionaries through the exact remap scan. let Some(estimated_unique_codes) = match_each_integer_ptype!(codes.ptype(), |P| { - estimate_code_cardinality::

(codes, validity_mask) + cardinality::estimate_code_cardinality::

(codes, validity_mask) }) else { return false; }; @@ -321,74 +336,6 @@ fn should_collect_sparse_codes( < values_len } -fn estimate_code_cardinality( - codes: &PrimitiveArray, - validity_mask: &Mask, -) -> Option { - let sample_count = codes.len().min(SPARSE_CANONICALIZE_SAMPLE_SIZE); - let mut observed_codes = Vec::<(usize, usize)>::new(); - - for sample_idx in 0..sample_count { - let idx = sample_index(sample_idx, codes.len(), sample_count); - if !validity_mask.value(idx) { - continue; - } - - let code = codes.as_slice::

()[idx].as_(); - if let Some((_, count)) = observed_codes - .iter_mut() - .find(|(observed, _)| *observed == code) - { - *count += 1; - } else { - observed_codes.push((code, 1)); - } - } - - if observed_codes.is_empty() { - return None; - } - - let unique_count = observed_codes.len(); - let singleton_count = observed_codes - .iter() - .filter(|(_, count)| *count == 1) - .count(); - let doubleton_count = observed_codes - .iter() - .filter(|(_, count)| *count == 2) - .count(); - - let unseen_estimate = if doubleton_count == 0 { - singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2 - } else { - div_ceil( - singleton_count.saturating_mul(singleton_count), - 2 * doubleton_count, - ) - }; - - Some(unique_count.saturating_add(unseen_estimate)) -} - -fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize { - debug_assert!(len > 0); - debug_assert!(sample_count > 0); - - let sample_idx = sample_idx as u128; - let len = len as u128; - let sample_count = sample_count as u128; - let bucket_start = sample_idx * len / sample_count; - let bucket_end = (sample_idx + 1) * len / sample_count; - - ((bucket_start + bucket_end) / 2).min(len - 1) as usize -} - -fn div_ceil(numerator: usize, denominator: usize) -> usize { - debug_assert!(denominator > 0); - numerator / denominator + usize::from(!numerator.is_multiple_of(denominator)) -} - fn collect_sparse_codes_typed( codes: &PrimitiveArray, values_len: usize, @@ -399,6 +346,8 @@ fn collect_sparse_codes_typed( let mut unique_codes = Vec::new(); let mut remapped_codes = Vec::with_capacity(codes.len()); + // `value_remap[old_code]` stores the compact code assigned to an original dictionary value. + // `usize::MAX` means that value has not been referenced yet. for (idx, &code) in codes.as_slice::

().iter().enumerate() { if !validity_mask.value(idx) { remapped_codes.push(P::default()); @@ -421,6 +370,9 @@ fn collect_sparse_codes_typed( })); } + // The sample only decides whether the exact pass is worth trying. Recheck the real + // cardinality before compacting so a misleading sparse-looking sample cannot pessimize dense + // dictionaries. if unique_codes .len() .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) diff --git a/vortex-duckdb/src/exporter/dict.rs b/vortex-duckdb/src/exporter/dict.rs index be14569a48c..c611a803d94 100644 --- a/vortex-duckdb/src/exporter/dict.rs +++ b/vortex-duckdb/src/exporter/dict.rs @@ -24,6 +24,7 @@ use crate::exporter::ColumnExporter; use crate::exporter::all_invalid; use crate::exporter::cache::ConversionCache; use crate::exporter::constant; +use crate::exporter::dict_cardinality::estimate_code_cardinality; use crate::exporter::new_array_exporter; struct DictExporter { @@ -36,7 +37,6 @@ struct DictExporter { // TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values // encoding, code count, unique-code count, and exporter/canonicalization costs. const SPARSE_EXPORT_CODES_PER_VALUE_THRESHOLD: usize = 4; -const SPARSE_EXPORT_SAMPLE_SIZE: usize = 128; pub(crate) fn new_exporter_with_flatten( array: &DictArray, @@ -155,74 +155,6 @@ fn should_export_sparse(codes: &PrimitiveArray, values_len: usize, codes_mask: & estimated_unique_codes.saturating_mul(SPARSE_EXPORT_CODES_PER_VALUE_THRESHOLD) < values_len } -fn estimate_code_cardinality( - codes: &PrimitiveArray, - codes_mask: &Mask, -) -> Option { - let sample_count = codes.len().min(SPARSE_EXPORT_SAMPLE_SIZE); - let mut observed_codes = Vec::<(usize, usize)>::new(); - - for sample_idx in 0..sample_count { - let idx = sample_index(sample_idx, codes.len(), sample_count); - if !codes_mask.value(idx) { - continue; - } - - let code = codes.as_slice::()[idx].as_(); - if let Some((_, count)) = observed_codes - .iter_mut() - .find(|(observed, _)| *observed == code) - { - *count += 1; - } else { - observed_codes.push((code, 1)); - } - } - - if observed_codes.is_empty() { - return None; - } - - let unique_count = observed_codes.len(); - let singleton_count = observed_codes - .iter() - .filter(|(_, count)| *count == 1) - .count(); - let doubleton_count = observed_codes - .iter() - .filter(|(_, count)| *count == 2) - .count(); - - let unseen_estimate = if doubleton_count == 0 { - singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2 - } else { - div_ceil( - singleton_count.saturating_mul(singleton_count), - 2 * doubleton_count, - ) - }; - - Some(unique_count.saturating_add(unseen_estimate)) -} - -fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize { - debug_assert!(len > 0); - debug_assert!(sample_count > 0); - - let sample_idx = sample_idx as u128; - let len = len as u128; - let sample_count = sample_count as u128; - let bucket_start = sample_idx * len / sample_count; - let bucket_end = (sample_idx + 1) * len / sample_count; - - ((bucket_start + bucket_end) / 2).min(len - 1) as usize -} - -fn div_ceil(numerator: usize, denominator: usize) -> usize { - debug_assert!(denominator > 0); - numerator / denominator + usize::from(!numerator.is_multiple_of(denominator)) -} - impl> ColumnExporter for DictExporter { fn export( &self, diff --git a/vortex-duckdb/src/exporter/dict_cardinality.rs b/vortex-duckdb/src/exporter/dict_cardinality.rs new file mode 100644 index 00000000000..37f0f55dfb9 --- /dev/null +++ b/vortex-duckdb/src/exporter/dict_cardinality.rs @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex::array::arrays::PrimitiveArray; +use vortex::dtype::IntegerPType; +use vortex::mask::Mask; + +const SAMPLE_SIZE: usize = 128; + +pub(super) fn estimate_code_cardinality( + codes: &PrimitiveArray, + codes_mask: &Mask, +) -> Option { + let sample_count = codes.len().min(SAMPLE_SIZE); + let mut observed_codes = Vec::<(usize, usize)>::new(); + + // This mirrors the array-side sparse dictionary gate. The exporter needs the estimate before + // it decides between a reusable DuckDB dictionary and executing the Vortex dictionary away. + // Correctness does not depend on the estimate; it only decides whether to take the compacting + // path. + for sample_idx in 0..sample_count { + let idx = sample_index(sample_idx, codes.len(), sample_count); + if !codes_mask.value(idx) { + continue; + } + + let code = codes.as_slice::()[idx].as_(); + if let Some((_, count)) = observed_codes + .iter_mut() + .find(|(observed, _)| *observed == code) + { + *count += 1; + } else { + observed_codes.push((code, 1)); + } + } + + estimate_cardinality_from_observations(&observed_codes) +} + +fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Option { + if observed_codes.is_empty() { + return None; + } + + let unique_count = observed_codes.len(); + let singleton_count = observed_codes + .iter() + .filter(|(_, count)| *count == 1) + .count(); + let doubleton_count = observed_codes + .iter() + .filter(|(_, count)| *count == 2) + .count(); + + let unseen_estimate = if doubleton_count == 0 { + singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2 + } else { + div_ceil( + singleton_count.saturating_mul(singleton_count), + 2 * doubleton_count, + ) + }; + + Some(unique_count.saturating_add(unseen_estimate)) +} + +fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize { + debug_assert!(len > 0); + debug_assert!(sample_count > 0); + + let sample_idx = sample_idx as u128; + let len = len as u128; + let sample_count = sample_count as u128; + let bucket_start = sample_idx * len / sample_count; + let bucket_end = (sample_idx + 1) * len / sample_count; + + ((bucket_start + bucket_end) / 2).min(len - 1) as usize +} + +fn div_ceil(numerator: usize, denominator: usize) -> usize { + debug_assert!(denominator > 0); + numerator / denominator + usize::from(!numerator.is_multiple_of(denominator)) +} diff --git a/vortex-duckdb/src/exporter/mod.rs b/vortex-duckdb/src/exporter/mod.rs index 517776f5521..9c01e2f35e5 100644 --- a/vortex-duckdb/src/exporter/mod.rs +++ b/vortex-duckdb/src/exporter/mod.rs @@ -7,6 +7,7 @@ mod cache; mod constant; mod decimal; mod dict; +mod dict_cardinality; mod fixed_size_list; mod list; mod list_view; From 4137b92ab4401f849332cea9065a26518ee46944 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 8 May 2026 09:49:13 -0400 Subject: [PATCH 03/11] Document sparse cardinality estimators Signed-off-by: Nicholas Gates --- .../src/arrays/dict/vtable/cardinality.rs | 19 +++++++++++++++++++ .../src/exporter/dict_cardinality.rs | 18 ++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/vortex-array/src/arrays/dict/vtable/cardinality.rs b/vortex-array/src/arrays/dict/vtable/cardinality.rs index 848abec2997..d97b26fd3e5 100644 --- a/vortex-array/src/arrays/dict/vtable/cardinality.rs +++ b/vortex-array/src/arrays/dict/vtable/cardinality.rs @@ -1,6 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +//! Sampling-based cardinality estimation for dictionary codes. +//! +//! This module is used only as a cheap gate before the exact sparse-dictionary remap pass. The +//! estimate may be conservative or noisy, but correctness does not depend on it: callers must still +//! collect the exact unique code set and re-check the sparse threshold before compacting. + use vortex_mask::Mask; use crate::arrays::PrimitiveArray; @@ -8,6 +14,11 @@ use crate::dtype::IntegerPType; const SAMPLE_SIZE: usize = 128; +/// Estimate the number of distinct non-null dictionary codes. +/// +/// The estimator samples deterministic bucket midpoints so repeated executions make the same +/// compaction decision for the same input. Returning `None` means no valid sampled codes were seen. +/// A returned value should only be used to decide whether an exact pass is worth attempting. pub(super) fn estimate_code_cardinality( codes: &PrimitiveArray, validity_mask: &Mask, @@ -37,6 +48,10 @@ pub(super) fn estimate_code_cardinality( estimate_cardinality_from_observations(&observed_codes) } +/// Estimate total cardinality from `(code, observed_count)` sample observations. +/// +/// The correction is Chao1-style: singleton-heavy samples imply more unseen codes, while repeated +/// observations imply the code stream is likely low-cardinality. fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Option { if observed_codes.is_empty() { return None; @@ -66,6 +81,10 @@ fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Some(unique_count.saturating_add(unseen_estimate)) } +/// Return the midpoint index for one deterministic sampling bucket. +/// +/// Splitting the full code range into buckets avoids clustering all samples near the start while +/// avoiding RNG state in a hot execution path. fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize { debug_assert!(len > 0); debug_assert!(sample_count > 0); diff --git a/vortex-duckdb/src/exporter/dict_cardinality.rs b/vortex-duckdb/src/exporter/dict_cardinality.rs index 37f0f55dfb9..1a56fa1a954 100644 --- a/vortex-duckdb/src/exporter/dict_cardinality.rs +++ b/vortex-duckdb/src/exporter/dict_cardinality.rs @@ -1,12 +1,22 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +//! Sampling-based cardinality estimation for DuckDB dictionary export. +//! +//! The exporter uses this as a cheap routing hint before choosing between a reusable DuckDB +//! dictionary and executing the Vortex dictionary into a flat vector. Correctness does not depend on +//! the estimate: the compacting path still executes Vortex's dictionary canonicalization logic. + use vortex::array::arrays::PrimitiveArray; use vortex::dtype::IntegerPType; use vortex::mask::Mask; const SAMPLE_SIZE: usize = 128; +/// Estimate the number of distinct non-null dictionary codes in a DuckDB export batch. +/// +/// Returning `None` means no valid sampled codes were seen. A returned estimate should be treated +/// only as a cost signal for whether the exporter should call into Vortex dictionary execution. pub(super) fn estimate_code_cardinality( codes: &PrimitiveArray, codes_mask: &Mask, @@ -38,6 +48,10 @@ pub(super) fn estimate_code_cardinality( estimate_cardinality_from_observations(&observed_codes) } +/// Estimate total cardinality from `(code, observed_count)` sample observations. +/// +/// The correction is Chao1-style: singleton-heavy samples imply more unseen codes, while repeated +/// observations imply the selected code stream is likely low-cardinality. fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Option { if observed_codes.is_empty() { return None; @@ -65,6 +79,10 @@ fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Some(unique_count.saturating_add(unseen_estimate)) } +/// Return the midpoint index for one deterministic sampling bucket. +/// +/// Bucket midpoint sampling gives coverage across the whole code vector without introducing RNG +/// state or nondeterministic exporter decisions. fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize { debug_assert!(len > 0); debug_assert!(sample_count > 0); From 4d65cf147d8b57270881cfb457ac9f7c6f76bda8 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 8 May 2026 11:23:32 -0400 Subject: [PATCH 04/11] Share dictionary referenced-values mask Signed-off-by: Nicholas Gates --- vortex-array/src/arrays/dict/array.rs | 91 +++++++++++++--------- vortex-array/src/arrays/dict/vtable/mod.rs | 60 +++++++------- 2 files changed, 85 insertions(+), 66 deletions(-) diff --git a/vortex-array/src/arrays/dict/array.rs b/vortex-array/src/arrays/dict/array.rs index eafbc9f6642..0bb998c88c8 100644 --- a/vortex-array/src/arrays/dict/array.rs +++ b/vortex-array/src/arrays/dict/array.rs @@ -10,6 +10,7 @@ use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; use vortex_mask::AllOr; +use vortex_mask::Mask; use crate::ArrayRef; use crate::LEGACY_SESSION; @@ -21,6 +22,7 @@ use crate::array::ArrayParts; use crate::array::TypedArrayRef; use crate::array_slots; use crate::arrays::Dict; +use crate::arrays::PrimitiveArray; use crate::dtype::DType; use crate::dtype::PType; use crate::match_each_integer_ptype; @@ -148,46 +150,63 @@ pub trait DictArrayExt: TypedArrayRef + DictArraySlotsExt { .execute_mask(codes.len(), &mut LEGACY_SESSION.create_execution_ctx())?; #[expect(deprecated)] let codes_primitive = self.codes().to_primitive(); - let values_len = self.values().len(); - - let init_value = !referenced; - let referenced_value = referenced; - - let mut values_vec = vec![init_value; values_len]; - match codes_validity.bit_buffer() { - AllOr::All => { - match_each_integer_ptype!(codes_primitive.ptype(), |P| { - #[allow( - clippy::cast_possible_truncation, - clippy::cast_sign_loss, - reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index" - )] - for &idx in codes_primitive.as_slice::

() { - values_vec[idx as usize] = referenced_value; - } - }); - } - AllOr::None => {} - AllOr::Some(mask) => { - match_each_integer_ptype!(codes_primitive.ptype(), |P| { - let codes = codes_primitive.as_slice::

(); - - #[allow( - clippy::cast_possible_truncation, - clippy::cast_sign_loss, - reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index" - )] - mask.set_indices().for_each(|idx| { - values_vec[codes[idx] as usize] = referenced_value; - }); + compute_referenced_values_mask_from_codes( + &codes_primitive, + self.values().len(), + &codes_validity, + referenced, + ) + } +} +impl> DictArrayExt for T {} + +/// Build an exact bitmap over dictionary values referenced by valid codes. +/// +/// The sampling-based sparse dictionary estimator only decides whether an exact pass is likely to +/// be worthwhile. This helper is the exact pass: aggregate kernels use it to ignore unreferenced +/// values, and sparse dictionary canonicalization uses it to compact values before remapping codes. +pub(crate) fn compute_referenced_values_mask_from_codes( + codes_primitive: &PrimitiveArray, + values_len: usize, + codes_validity: &Mask, + referenced: bool, +) -> VortexResult { + let init_value = !referenced; + let referenced_value = referenced; + + let mut values_vec = vec![init_value; values_len]; + match codes_validity.bit_buffer() { + AllOr::All => { + match_each_integer_ptype!(codes_primitive.ptype(), |P| { + #[allow( + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index" + )] + for &idx in codes_primitive.as_slice::

() { + values_vec[idx as usize] = referenced_value; + } + }); + } + AllOr::None => {} + AllOr::Some(mask) => { + match_each_integer_ptype!(codes_primitive.ptype(), |P| { + let codes = codes_primitive.as_slice::

(); + + #[allow( + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index" + )] + mask.set_indices().for_each(|idx| { + values_vec[codes[idx] as usize] = referenced_value; }); - } + }); } - - Ok(BitBuffer::from(values_vec)) } + + Ok(BitBuffer::from(values_vec)) } -impl> DictArrayExt for T {} /// Concrete parts of a [`DictArray`](super::DictArray) after iterative execution. pub struct DictParts { diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs index 3b5070fc23e..0a8f9c3f8aa 100644 --- a/vortex-array/src/arrays/dict/vtable/mod.rs +++ b/vortex-array/src/arrays/dict/vtable/mod.rs @@ -6,6 +6,7 @@ use std::hash::Hasher; use kernel::PARENT_KERNELS; use num_traits::FromPrimitive; use prost::Message; +use vortex_buffer::BitBuffer; use vortex_buffer::Buffer; use vortex_error::VortexResult; use vortex_error::vortex_bail; @@ -22,6 +23,7 @@ use super::DictOwnedExt; use super::DictParts; use super::array::DictSlots; use super::array::DictSlotsView; +use super::array::compute_referenced_values_mask_from_codes; use crate::AnyCanonical; use crate::ArrayEq; use crate::ArrayHash; @@ -296,11 +298,16 @@ fn collect_sparse_codes( return Ok(None); } - let Some(sparse_codes) = match_each_integer_ptype!(codes.ptype(), |P| { - collect_sparse_codes_typed::

(codes, values_len, validity_mask, validity)? - }) else { + let referenced_values = + compute_referenced_values_mask_from_codes(codes, values_len, &validity_mask, true)?; + let unique_count = referenced_values.true_count(); + if unique_count.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) >= values_len { return Ok(None); - }; + } + + let sparse_codes = match_each_integer_ptype!(codes.ptype(), |P| { + collect_sparse_codes_typed::

(codes, referenced_values, validity_mask, validity)? + }); Ok(Some(sparse_codes)) } @@ -338,16 +345,24 @@ fn should_collect_sparse_codes( fn collect_sparse_codes_typed( codes: &PrimitiveArray, - values_len: usize, + referenced_values: BitBuffer, validity_mask: Mask, validity: Validity, -) -> VortexResult> { - let mut value_remap = vec![usize::MAX; values_len]; - let mut unique_codes = Vec::new(); - let mut remapped_codes = Vec::with_capacity(codes.len()); +) -> VortexResult { + let unique_count = referenced_values.true_count(); + let mut value_remap = vec![usize::MAX; referenced_values.len()]; + let mut unique_codes = Vec::with_capacity(unique_count); + + // Reuse the same exact referenced-values bitmap as the dictionary aggregate kernels. Walking + // the bitmap assigns compact codes in original dictionary order, which keeps compaction + // deterministic and independent of the first live row that happened to reference each value. + for old_code in referenced_values.set_indices() { + let new_code = unique_codes.len(); + value_remap[old_code] = new_code; + unique_codes.push(old_code as u64); + } - // `value_remap[old_code]` stores the compact code assigned to an original dictionary value. - // `usize::MAX` means that value has not been referenced yet. + let mut remapped_codes = Vec::with_capacity(codes.len()); for (idx, &code) in codes.as_slice::

().iter().enumerate() { if !validity_mask.value(idx) { remapped_codes.push(P::default()); @@ -355,12 +370,8 @@ fn collect_sparse_codes_typed( } let old_code = code.as_(); - let mut new_code = value_remap[old_code]; - if new_code == usize::MAX { - new_code = unique_codes.len(); - value_remap[old_code] = new_code; - unique_codes.push(old_code as u64); - } + let new_code = value_remap[old_code]; + debug_assert_ne!(new_code, usize::MAX); remapped_codes.push(P::from_usize(new_code).unwrap_or_else(|| { vortex_panic!( @@ -370,21 +381,10 @@ fn collect_sparse_codes_typed( })); } - // The sample only decides whether the exact pass is worth trying. Recheck the real - // cardinality before compacting so a misleading sparse-looking sample cannot pessimize dense - // dictionaries. - if unique_codes - .len() - .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) - >= values_len - { - return Ok(None); - } - - Ok(Some(SparseDictCodes { + Ok(SparseDictCodes { unique_codes: PrimitiveArray::new(Buffer::from_iter(unique_codes), Validity::NonNullable), remapped_codes: PrimitiveArray::new(Buffer::from_iter(remapped_codes), validity), - })) + }) } #[cfg(test)] From 4ab2dc5d3086e0d1a5f1f9004dd91a3e70209fee Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 8 May 2026 11:44:48 -0400 Subject: [PATCH 05/11] Skip sparse dict compaction for referenced dictionaries Signed-off-by: Nicholas Gates --- vortex-array/src/arrays/dict/vtable/mod.rs | 7 +++++++ vortex-duckdb/src/exporter/dict.rs | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs index b03f356460e..6b93f7029fa 100644 --- a/vortex-array/src/arrays/dict/vtable/mod.rs +++ b/vortex-array/src/arrays/dict/vtable/mod.rs @@ -244,6 +244,13 @@ fn sparse_canonicalize_dict( array: &DictArray, ctx: &mut ExecutionCtx, ) -> VortexResult> { + // If metadata tells us every dictionary value is referenced, there is no garbage to compact. + // This also keeps hot paths such as dictionary comparisons from paying the sparse estimator + // cost when they produce dense, all-referenced result dictionaries. + if array.has_all_values_referenced() { + return Ok(None); + } + let codes = array.codes().as_::().into_owned(); let Some(sparse_codes) = collect_sparse_codes(&codes, array.values().len(), ctx)? else { return Ok(None); diff --git a/vortex-duckdb/src/exporter/dict.rs b/vortex-duckdb/src/exporter/dict.rs index 616f519fcf0..0728779cdf6 100644 --- a/vortex-duckdb/src/exporter/dict.rs +++ b/vortex-duckdb/src/exporter/dict.rs @@ -11,6 +11,7 @@ use vortex::array::arrays::Constant; use vortex::array::arrays::ConstantArray; use vortex::array::arrays::DictArray; use vortex::array::arrays::PrimitiveArray; +use vortex::array::arrays::dict::DictArrayExt; use vortex::array::arrays::dict::DictArraySlotsExt; use vortex::array::match_each_integer_ptype; use vortex::dtype::IntegerPType; @@ -74,7 +75,10 @@ pub(crate) fn new_exporter_with_flatten( let values_key = values.addr(); let codes = codes_array.clone().execute::(ctx)?; - if !flatten && should_export_sparse(&codes, values.len(), &codes_mask) { + if !flatten + && !array.has_all_values_referenced() + && should_export_sparse(&codes, values.len(), &codes_mask) + { return new_array_exporter( array .clone() From 13ae594513a28f4b4fb56919bb847156fd9cc210 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 8 May 2026 12:03:22 -0400 Subject: [PATCH 06/11] Avoid sparse dict overhead on dense paths Signed-off-by: Nicholas Gates --- vortex-array/src/arrays/dict/compute/slice.rs | 68 +++++++++++++++++-- .../src/arrays/dict/vtable/cardinality.rs | 30 ++++++++ vortex-array/src/arrays/dict/vtable/mod.rs | 18 ++--- 3 files changed, 101 insertions(+), 15 deletions(-) diff --git a/vortex-array/src/arrays/dict/compute/slice.rs b/vortex-array/src/arrays/dict/compute/slice.rs index 509fa7535d8..04434bb593a 100644 --- a/vortex-array/src/arrays/dict/compute/slice.rs +++ b/vortex-array/src/arrays/dict/compute/slice.rs @@ -12,25 +12,36 @@ use crate::arrays::Constant; use crate::arrays::ConstantArray; use crate::arrays::Dict; use crate::arrays::DictArray; +use crate::arrays::dict::DictArrayExt; use crate::arrays::dict::DictArraySlotsExt; use crate::arrays::slice::SliceReduce; use crate::scalar::Scalar; impl SliceReduce for Dict { fn slice(array: ArrayView<'_, Self>, range: Range) -> VortexResult> { + let is_full_slice = range.len() == array.len(); let sliced_code = array.codes().slice(range)?; // TODO(joe): if the range is size 1 replace with a constant array if let Some(code) = sliced_code.as_opt::() { let code = code.scalar().as_primitive().as_::(); return if let Some(code) = code { let values = array.values().slice(code..code + 1)?; - Ok(Some( - DictArray::new( + // SAFETY: the only dictionary value is referenced by every non-null code when the + // slice is non-empty. An empty code stream cannot reference a non-empty values + // array. + let sliced = unsafe { + DictArray::new_unchecked( ConstantArray::new(0u8, sliced_code.len()).into_array(), values, ) - .into_array(), - )) + }; + if sliced_code.is_empty() { + Ok(Some(sliced.into_array())) + } else { + Ok(Some(unsafe { + sliced.set_all_values_referenced(true).into_array() + })) + } } else { Ok(Some( ConstantArray::new(Scalar::null(array.dtype().clone()), sliced_code.len()) @@ -39,9 +50,18 @@ impl SliceReduce for Dict { }; } // SAFETY: slicing the codes preserves invariants. - Ok(Some( - unsafe { DictArray::new_unchecked(sliced_code, array.values().clone()) }.into_array(), - )) + let sliced = unsafe { DictArray::new_unchecked(sliced_code, array.values().clone()) }; + if is_full_slice { + // A full-length slice preserves the exact code stream, so the referenced-values + // metadata remains sound. Partial slices may drop the only reference to a value. + Ok(Some(unsafe { + sliced + .set_all_values_referenced(array.has_all_values_referenced()) + .into_array() + })) + } else { + Ok(Some(sliced.into_array())) + } } } @@ -51,8 +71,10 @@ mod tests { use vortex_error::VortexResult; use crate::IntoArray; + use crate::arrays::Dict; use crate::arrays::DictArray; use crate::arrays::PrimitiveArray; + use crate::arrays::dict::DictArrayExt; use crate::arrays::dict::compute::slice::ConstantArray; use crate::assert_arrays_eq; use crate::dtype::DType; @@ -84,4 +106,36 @@ mod tests { assert_arrays_eq!(sliced, expected); Ok(()) } + + #[test] + fn full_slice_preserves_all_values_referenced_metadata() -> VortexResult<()> { + let dict = unsafe { + DictArray::new_unchecked( + buffer![0u8, 1].into_array(), + buffer![10i32, 20].into_array(), + ) + .set_all_values_referenced(true) + }; + + let sliced = dict.slice(0..2)?; + + assert!(sliced.as_::().has_all_values_referenced()); + Ok(()) + } + + #[test] + fn partial_slice_drops_all_values_referenced_metadata() -> VortexResult<()> { + let dict = unsafe { + DictArray::new_unchecked( + buffer![0u8, 1].into_array(), + buffer![10i32, 20].into_array(), + ) + .set_all_values_referenced(true) + }; + + let sliced = dict.slice(0..1)?; + + assert!(!sliced.as_::().has_all_values_referenced()); + Ok(()) + } } diff --git a/vortex-array/src/arrays/dict/vtable/cardinality.rs b/vortex-array/src/arrays/dict/vtable/cardinality.rs index d97b26fd3e5..ad8fa24bdd0 100644 --- a/vortex-array/src/arrays/dict/vtable/cardinality.rs +++ b/vortex-array/src/arrays/dict/vtable/cardinality.rs @@ -13,6 +13,36 @@ use crate::arrays::PrimitiveArray; use crate::dtype::IntegerPType; const SAMPLE_SIZE: usize = 128; +const REPEATED_CODE_PROBE_SIZE: usize = 16; + +/// Return whether a small deterministic probe observes a repeated non-null code. +/// +/// Sparse canonicalization always has a cheap worst-case gate before it samples. This probe is the +/// next, cheaper filter for cases that are not sparse by row count alone: dense dictionaries should +/// not pay the full estimator cost unless the code stream first shows evidence of repeated codes. +/// A `true` result only means "run the estimator"; it is not enough to compact by itself. +pub(super) fn has_repeated_code_sample( + codes: &PrimitiveArray, + validity_mask: &Mask, +) -> bool { + let sample_count = codes.len().min(REPEATED_CODE_PROBE_SIZE); + let mut observed_codes = Vec::::with_capacity(sample_count); + + for sample_idx in 0..sample_count { + let idx = sample_index(sample_idx, codes.len(), sample_count); + if !validity_mask.value(idx) { + continue; + } + + let code = codes.as_slice::

()[idx].as_(); + if observed_codes.contains(&code) { + return true; + } + observed_codes.push(code); + } + + false +} /// Estimate the number of distinct non-null dictionary codes. /// diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs index 6b93f7029fa..8ece354aee7 100644 --- a/vortex-array/src/arrays/dict/vtable/mod.rs +++ b/vortex-array/src/arrays/dict/vtable/mod.rs @@ -200,7 +200,9 @@ impl VTable for Dict { ))); } - if let Some(canonical) = sparse_canonicalize_dict(&array, ctx)? { + if !array.has_all_values_referenced() + && let Some(canonical) = sparse_canonicalize_dict(&array, ctx)? + { return Ok(ExecutionResult::done(canonical)); } @@ -244,13 +246,6 @@ fn sparse_canonicalize_dict( array: &DictArray, ctx: &mut ExecutionCtx, ) -> VortexResult> { - // If metadata tells us every dictionary value is referenced, there is no garbage to compact. - // This also keeps hot paths such as dictionary comparisons from paying the sparse estimator - // cost when they produce dense, all-referenced result dictionaries. - if array.has_all_values_referenced() { - return Ok(None); - } - let codes = array.codes().as_::().into_owned(); let Some(sparse_codes) = collect_sparse_codes(&codes, array.values().len(), ctx)? else { return Ok(None); @@ -341,6 +336,13 @@ fn should_collect_sparse_codes( // Otherwise sample first. This catches cases like many live rows all referencing the same // dictionary value without forcing dense dictionaries through the exact remap scan. + let worth_sampling = match_each_integer_ptype!(codes.ptype(), |P| { + cardinality::has_repeated_code_sample::

(codes, validity_mask) + }); + if !worth_sampling { + return false; + } + let Some(estimated_unique_codes) = match_each_integer_ptype!(codes.ptype(), |P| { cardinality::estimate_code_cardinality::

(codes, validity_mask) }) else { From fd59336d825aa66a13f75a6dbea17544ad694883 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 8 May 2026 12:27:59 -0400 Subject: [PATCH 07/11] Gate sparse dict sampling by shape Signed-off-by: Nicholas Gates --- vortex-array/src/arrays/dict/vtable/mod.rs | 34 ++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs index 8ece354aee7..cbe8d7bf8c9 100644 --- a/vortex-array/src/arrays/dict/vtable/mod.rs +++ b/vortex-array/src/arrays/dict/vtable/mod.rs @@ -68,6 +68,8 @@ pub type DictArray = Array; // TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values // encoding, code count, unique-code count, and exporter/canonicalization costs. const SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD: usize = 4; +const SPARSE_CANONICALIZE_SAMPLED_CODES_PER_VALUE_THRESHOLD: usize = 2; +const SPARSE_CANONICALIZE_MIN_SAMPLED_VALUES_LEN: usize = 512; #[derive(Clone, Debug)] pub struct Dict; @@ -200,7 +202,8 @@ impl VTable for Dict { ))); } - if !array.has_all_values_referenced() + if should_consider_sparse_canonicalize(array.codes().len(), array.values().len()) + && !array.has_all_values_referenced() && let Some(canonical) = sparse_canonicalize_dict(&array, ctx)? { return Ok(ExecutionResult::done(canonical)); @@ -242,6 +245,8 @@ struct SparseDictCodes { remapped_codes: PrimitiveArray, } +#[cold] +#[inline(never)] fn sparse_canonicalize_dict( array: &DictArray, ctx: &mut ExecutionCtx, @@ -334,6 +339,10 @@ fn should_collect_sparse_codes( return true; } + if !should_sample_sparse_canonicalize(codes.len(), values_len) { + return false; + } + // Otherwise sample first. This catches cases like many live rows all referencing the same // dictionary value without forcing dense dictionaries through the exact remap scan. let worth_sampling = match_each_integer_ptype!(codes.ptype(), |P| { @@ -353,6 +362,21 @@ fn should_collect_sparse_codes( < values_len } +#[inline] +fn should_consider_sparse_canonicalize(codes_len: usize, values_len: usize) -> bool { + codes_len.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) < values_len + || should_sample_sparse_canonicalize(codes_len, values_len) +} + +#[inline] +fn should_sample_sparse_canonicalize(codes_len: usize, values_len: usize) -> bool { + // Sampling is only a preflight for cases that are not sparse by row count alone. Keep it away + // from tiny dictionary domains and near-dense slices where the estimator overhead dominates. + values_len >= SPARSE_CANONICALIZE_MIN_SAMPLED_VALUES_LEN + && codes_len.saturating_mul(SPARSE_CANONICALIZE_SAMPLED_CODES_PER_VALUE_THRESHOLD) + < values_len +} + fn collect_sparse_codes_typed( codes: &PrimitiveArray, referenced_values: BitBuffer, @@ -431,7 +455,7 @@ mod tests { fn sampled_sparse_codes_remaps_repeated_large_codes() -> VortexResult<()> { let codes = PrimitiveArray::from_iter((0..1024).map(|_| 42u32)); let Some(sparse) = - collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())? + collect_sparse_codes(&codes, 3000, &mut LEGACY_SESSION.create_execution_ctx())? else { panic!("sampled codes are sparse"); }; @@ -450,10 +474,10 @@ mod tests { #[test] fn dense_sample_skips_sparse_code_collection() -> VortexResult<()> { - let codes = PrimitiveArray::from_iter((0..1024).map(|idx| (idx % 100) as u32)); + let codes = PrimitiveArray::from_iter((0..1024).map(|idx| idx as u32)); assert!( - collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())? + collect_sparse_codes(&codes, 3000, &mut LEGACY_SESSION.create_execution_ctx())? .is_none() ); @@ -484,7 +508,7 @@ mod tests { fn sampled_sparse_dict_canonicalizes_repeated_codes() -> VortexResult<()> { let dict = DictArray::new( PrimitiveArray::from_iter((0..1024).map(|_| 42u32)).into_array(), - PrimitiveArray::from_iter(0..100i32).into_array(), + PrimitiveArray::from_iter(0..3000i32).into_array(), ); let actual = dict From 58bb1d444f3c6d3b1afd52bc4a836f4bf71f6184 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 8 May 2026 12:36:44 -0400 Subject: [PATCH 08/11] Keep sparse dict helpers cold Signed-off-by: Nicholas Gates --- vortex-array/src/arrays/dict/vtable/mod.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs index cbe8d7bf8c9..74d96d2575b 100644 --- a/vortex-array/src/arrays/dict/vtable/mod.rs +++ b/vortex-array/src/arrays/dict/vtable/mod.rs @@ -292,6 +292,8 @@ fn sparse_canonicalize_dict( .map(Some) } +#[cold] +#[inline(never)] fn collect_sparse_codes( codes: &PrimitiveArray, values_len: usize, @@ -320,6 +322,8 @@ fn collect_sparse_codes( Ok(Some(sparse_codes)) } +#[cold] +#[inline(never)] fn should_collect_sparse_codes( codes: &PrimitiveArray, values_len: usize, @@ -377,6 +381,8 @@ fn should_sample_sparse_canonicalize(codes_len: usize, values_len: usize) -> boo < values_len } +#[cold] +#[inline(never)] fn collect_sparse_codes_typed( codes: &PrimitiveArray, referenced_values: BitBuffer, From d6e109619adbcdd1a019972fe6a2e37cc27561f5 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 8 May 2026 13:16:46 -0400 Subject: [PATCH 09/11] Avoid sparse dict compaction over filters Signed-off-by: Nicholas Gates --- .../src/arrays/dict/vtable/cardinality.rs | 17 +- vortex-array/src/arrays/dict/vtable/mod.rs | 314 +---------------- vortex-array/src/arrays/dict/vtable/sparse.rs | 320 ++++++++++++++++++ 3 files changed, 335 insertions(+), 316 deletions(-) create mode 100644 vortex-array/src/arrays/dict/vtable/sparse.rs diff --git a/vortex-array/src/arrays/dict/vtable/cardinality.rs b/vortex-array/src/arrays/dict/vtable/cardinality.rs index ad8fa24bdd0..50cb4037e96 100644 --- a/vortex-array/src/arrays/dict/vtable/cardinality.rs +++ b/vortex-array/src/arrays/dict/vtable/cardinality.rs @@ -10,8 +10,6 @@ use vortex_mask::Mask; use crate::arrays::PrimitiveArray; -use crate::dtype::IntegerPType; - const SAMPLE_SIZE: usize = 128; const REPEATED_CODE_PROBE_SIZE: usize = 16; @@ -21,10 +19,7 @@ const REPEATED_CODE_PROBE_SIZE: usize = 16; /// next, cheaper filter for cases that are not sparse by row count alone: dense dictionaries should /// not pay the full estimator cost unless the code stream first shows evidence of repeated codes. /// A `true` result only means "run the estimator"; it is not enough to compact by itself. -pub(super) fn has_repeated_code_sample( - codes: &PrimitiveArray, - validity_mask: &Mask, -) -> bool { +pub(super) fn has_repeated_code_sample(codes: &PrimitiveArray, validity_mask: &Mask) -> bool { let sample_count = codes.len().min(REPEATED_CODE_PROBE_SIZE); let mut observed_codes = Vec::::with_capacity(sample_count); @@ -34,7 +29,7 @@ pub(super) fn has_repeated_code_sample( continue; } - let code = codes.as_slice::

()[idx].as_(); + let code = code_at(codes, idx); if observed_codes.contains(&code) { return true; } @@ -49,7 +44,7 @@ pub(super) fn has_repeated_code_sample( /// The estimator samples deterministic bucket midpoints so repeated executions make the same /// compaction decision for the same input. Returning `None` means no valid sampled codes were seen. /// A returned value should only be used to decide whether an exact pass is worth attempting. -pub(super) fn estimate_code_cardinality( +pub(super) fn estimate_code_cardinality( codes: &PrimitiveArray, validity_mask: &Mask, ) -> Option { @@ -64,7 +59,7 @@ pub(super) fn estimate_code_cardinality( continue; } - let code = codes.as_slice::

()[idx].as_(); + let code = code_at(codes, idx); if let Some((_, count)) = observed_codes .iter_mut() .find(|(observed, _)| *observed == code) @@ -128,6 +123,10 @@ fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize { ((bucket_start + bucket_end) / 2).min(len - 1) as usize } +fn code_at(codes: &PrimitiveArray, idx: usize) -> usize { + usize::try_from(codes.as_slice::()[idx]).unwrap_or(usize::MAX) +} + fn div_ceil(numerator: usize, denominator: usize) -> usize { debug_assert!(denominator > 0); numerator / denominator + usize::from(!numerator.is_multiple_of(denominator)) diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs index 74d96d2575b..7ca71b49d49 100644 --- a/vortex-array/src/arrays/dict/vtable/mod.rs +++ b/vortex-array/src/arrays/dict/vtable/mod.rs @@ -4,17 +4,13 @@ use std::hash::Hasher; use kernel::PARENT_KERNELS; -use num_traits::FromPrimitive; use prost::Message; use smallvec::smallvec; -use vortex_buffer::BitBuffer; -use vortex_buffer::Buffer; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; use vortex_error::vortex_err; use vortex_error::vortex_panic; -use vortex_mask::Mask; use vortex_session::VortexSession; use vortex_session::registry::CachedId; @@ -24,13 +20,11 @@ use super::DictOwnedExt; use super::DictParts; use super::array::DictSlots; use super::array::DictSlotsView; -use super::array::compute_referenced_values_mask_from_codes; use crate::AnyCanonical; use crate::ArrayEq; use crate::ArrayHash; use crate::ArrayRef; use crate::Canonical; -use crate::IntoArray; use crate::Precision; use crate::array::Array; use crate::array::ArrayId; @@ -38,20 +32,18 @@ use crate::array::ArrayParts; use crate::array::ArrayView; use crate::array::VTable; use crate::arrays::ConstantArray; +use crate::arrays::Filter; use crate::arrays::Primitive; -use crate::arrays::PrimitiveArray; use crate::arrays::dict::DictArrayExt; use crate::arrays::dict::DictArraySlotsExt; use crate::arrays::dict::compute::rules::PARENT_RULES; use crate::arrays::dict::execute::take_canonical; use crate::buffer::BufferHandle; use crate::dtype::DType; -use crate::dtype::IntegerPType; use crate::dtype::Nullability; use crate::dtype::PType; use crate::executor::ExecutionCtx; use crate::executor::ExecutionResult; -use crate::match_each_integer_ptype; use crate::require_child; use crate::scalar::Scalar; use crate::serde::ArrayChildren; @@ -60,17 +52,12 @@ use crate::validity::Validity; mod cardinality; mod kernel; mod operations; +mod sparse; mod validity; /// A [`Dict`]-encoded Vortex array. pub type DictArray = Array; -// TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values -// encoding, code count, unique-code count, and exporter/canonicalization costs. -const SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD: usize = 4; -const SPARSE_CANONICALIZE_SAMPLED_CODES_PER_VALUE_THRESHOLD: usize = 2; -const SPARSE_CANONICALIZE_MIN_SAMPLED_VALUES_LEN: usize = 512; - #[derive(Clone, Debug)] pub struct Dict; @@ -202,9 +189,12 @@ impl VTable for Dict { ))); } - if should_consider_sparse_canonicalize(array.codes().len(), array.values().len()) + if sparse::should_consider_sparse_canonicalize(array.codes().len(), array.values().len()) && !array.has_all_values_referenced() - && let Some(canonical) = sparse_canonicalize_dict(&array, ctx)? + // `take(FilterArray(...))` is also represented as a dictionary. Compacting that shape + // burns time before the lazy filter has a chance to push the row mask into its child. + && !array.values().is::() + && let Some(canonical) = sparse::sparse_canonicalize_dict(&array, ctx)? { return Ok(ExecutionResult::done(canonical)); } @@ -237,293 +227,3 @@ impl VTable for Dict { PARENT_KERNELS.execute(array, parent, child_idx, ctx) } } - -struct SparseDictCodes { - /// Original dictionary value indices that are actually referenced by the live codes. - unique_codes: PrimitiveArray, - /// Codes rewritten to index into `unique_codes` instead of the original values array. - remapped_codes: PrimitiveArray, -} - -#[cold] -#[inline(never)] -fn sparse_canonicalize_dict( - array: &DictArray, - ctx: &mut ExecutionCtx, -) -> VortexResult> { - let codes = array.codes().as_::().into_owned(); - let Some(sparse_codes) = collect_sparse_codes(&codes, array.values().len(), ctx)? else { - return Ok(None); - }; - - // Build a temporary parent that represents `values.take(unique_codes)`. Calling - // `execute_parent` on the values child lets encodings such as FSST/VarBin sparse-take just - // the referenced dictionary values. If the child has no specialized parent execution, fall - // back to canonicalizing all values and then taking from the canonical array. - let values = array.values(); - let unique_values_parent = DictArray::new( - sparse_codes.unique_codes.clone().into_array(), - values.clone(), - ) - .into_array(); - let unique_values = if let Some(taken_values) = - values.execute_parent(&unique_values_parent, DictSlots::VALUES, ctx)? - { - taken_values.execute::(ctx)?.into_array() - } else { - let canonical_values = values.clone().execute::(ctx)?.into_array(); - DictArray::new(sparse_codes.unique_codes.into_array(), canonical_values) - .into_array() - .execute::(ctx)? - .into_array() - }; - - // Now the dictionary is dense over its compacted values, so normal dictionary execution only - // takes from the small `unique_values` array. This avoids `values.take(codes)` preserving a - // large dictionary with many unused values. - let compact_dict = unsafe { - DictArray::new_unchecked(sparse_codes.remapped_codes.into_array(), unique_values) - .set_all_values_referenced(true) - }; - - compact_dict - .into_array() - .execute::(ctx) - .map(Some) -} - -#[cold] -#[inline(never)] -fn collect_sparse_codes( - codes: &PrimitiveArray, - values_len: usize, - ctx: &mut ExecutionCtx, -) -> VortexResult> { - let validity = codes.validity()?; - let validity_mask = validity.execute_mask(codes.len(), ctx)?; - - // The exact pass below scans every code and allocates a remap table sized to the values array. - // Do it only when a cheap upper bound/sample says the dictionary is likely sparse enough. - if !should_collect_sparse_codes(codes, values_len, &validity_mask) { - return Ok(None); - } - - let referenced_values = - compute_referenced_values_mask_from_codes(codes, values_len, &validity_mask, true)?; - let unique_count = referenced_values.true_count(); - if unique_count.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) >= values_len { - return Ok(None); - } - - let sparse_codes = match_each_integer_ptype!(codes.ptype(), |P| { - collect_sparse_codes_typed::

(codes, referenced_values, validity_mask, validity)? - }); - - Ok(Some(sparse_codes)) -} - -#[cold] -#[inline(never)] -fn should_collect_sparse_codes( - codes: &PrimitiveArray, - values_len: usize, - validity_mask: &Mask, -) -> bool { - if codes.is_empty() || values_len == 0 || validity_mask.true_count() == 0 { - return false; - } - - // If even the worst case "every live code is unique" is sparse, skip sampling and go straight - // to the exact remap pass. - if codes - .len() - .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) - < values_len - { - return true; - } - - if !should_sample_sparse_canonicalize(codes.len(), values_len) { - return false; - } - - // Otherwise sample first. This catches cases like many live rows all referencing the same - // dictionary value without forcing dense dictionaries through the exact remap scan. - let worth_sampling = match_each_integer_ptype!(codes.ptype(), |P| { - cardinality::has_repeated_code_sample::

(codes, validity_mask) - }); - if !worth_sampling { - return false; - } - - let Some(estimated_unique_codes) = match_each_integer_ptype!(codes.ptype(), |P| { - cardinality::estimate_code_cardinality::

(codes, validity_mask) - }) else { - return false; - }; - - estimated_unique_codes.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) - < values_len -} - -#[inline] -fn should_consider_sparse_canonicalize(codes_len: usize, values_len: usize) -> bool { - codes_len.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) < values_len - || should_sample_sparse_canonicalize(codes_len, values_len) -} - -#[inline] -fn should_sample_sparse_canonicalize(codes_len: usize, values_len: usize) -> bool { - // Sampling is only a preflight for cases that are not sparse by row count alone. Keep it away - // from tiny dictionary domains and near-dense slices where the estimator overhead dominates. - values_len >= SPARSE_CANONICALIZE_MIN_SAMPLED_VALUES_LEN - && codes_len.saturating_mul(SPARSE_CANONICALIZE_SAMPLED_CODES_PER_VALUE_THRESHOLD) - < values_len -} - -#[cold] -#[inline(never)] -fn collect_sparse_codes_typed( - codes: &PrimitiveArray, - referenced_values: BitBuffer, - validity_mask: Mask, - validity: Validity, -) -> VortexResult { - let unique_count = referenced_values.true_count(); - let mut value_remap = vec![usize::MAX; referenced_values.len()]; - let mut unique_codes = Vec::with_capacity(unique_count); - - // Reuse the same exact referenced-values bitmap as the dictionary aggregate kernels. Walking - // the bitmap assigns compact codes in original dictionary order, which keeps compaction - // deterministic and independent of the first live row that happened to reference each value. - for old_code in referenced_values.set_indices() { - let new_code = unique_codes.len(); - value_remap[old_code] = new_code; - unique_codes.push(old_code as u64); - } - - let mut remapped_codes = Vec::with_capacity(codes.len()); - for (idx, &code) in codes.as_slice::

().iter().enumerate() { - if !validity_mask.value(idx) { - remapped_codes.push(P::default()); - continue; - } - - let old_code = code.as_(); - let new_code = value_remap[old_code]; - debug_assert_ne!(new_code, usize::MAX); - - remapped_codes.push(P::from_usize(new_code).unwrap_or_else(|| { - vortex_panic!( - "compacted dictionary code {new_code} does not fit in {}", - P::PTYPE - ) - })); - } - - Ok(SparseDictCodes { - unique_codes: PrimitiveArray::new(Buffer::from_iter(unique_codes), Validity::NonNullable), - remapped_codes: PrimitiveArray::new(Buffer::from_iter(remapped_codes), validity), - }) -} - -#[cfg(test)] -mod tests { - use vortex_error::VortexResult; - - use super::*; - use crate::LEGACY_SESSION; - use crate::VortexSessionExecute; - use crate::assert_arrays_eq; - - #[test] - fn collect_sparse_codes_remaps_unique_values() -> VortexResult<()> { - let codes = PrimitiveArray::from_option_iter([Some(50u32), None, Some(70), Some(50)]); - let Some(sparse) = - collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())? - else { - panic!("codes are sparse"); - }; - - assert_arrays_eq!( - sparse.unique_codes.into_array(), - PrimitiveArray::from_iter([50u64, 70]).into_array() - ); - assert_arrays_eq!( - sparse.remapped_codes.into_array(), - PrimitiveArray::from_option_iter([Some(0u32), None, Some(1), Some(0)]).into_array() - ); - - Ok(()) - } - - #[test] - fn sampled_sparse_codes_remaps_repeated_large_codes() -> VortexResult<()> { - let codes = PrimitiveArray::from_iter((0..1024).map(|_| 42u32)); - let Some(sparse) = - collect_sparse_codes(&codes, 3000, &mut LEGACY_SESSION.create_execution_ctx())? - else { - panic!("sampled codes are sparse"); - }; - - assert_arrays_eq!( - sparse.unique_codes.into_array(), - PrimitiveArray::from_iter([42u64]).into_array() - ); - assert_arrays_eq!( - sparse.remapped_codes.into_array(), - PrimitiveArray::from_iter((0..1024).map(|_| 0u32)).into_array() - ); - - Ok(()) - } - - #[test] - fn dense_sample_skips_sparse_code_collection() -> VortexResult<()> { - let codes = PrimitiveArray::from_iter((0..1024).map(|idx| idx as u32)); - - assert!( - collect_sparse_codes(&codes, 3000, &mut LEGACY_SESSION.create_execution_ctx())? - .is_none() - ); - - Ok(()) - } - - #[test] - fn sparse_dict_canonicalizes_correctly() -> VortexResult<()> { - let dict = DictArray::new( - PrimitiveArray::from_option_iter([Some(50u32), None, Some(70), Some(50)]).into_array(), - PrimitiveArray::from_iter(0..100i32).into_array(), - ); - - let actual = dict - .into_array() - .execute::(&mut LEGACY_SESSION.create_execution_ctx())? - .into_array(); - - assert_arrays_eq!( - actual, - PrimitiveArray::from_option_iter([Some(50i32), None, Some(70), Some(50)]) - ); - - Ok(()) - } - - #[test] - fn sampled_sparse_dict_canonicalizes_repeated_codes() -> VortexResult<()> { - let dict = DictArray::new( - PrimitiveArray::from_iter((0..1024).map(|_| 42u32)).into_array(), - PrimitiveArray::from_iter(0..3000i32).into_array(), - ); - - let actual = dict - .into_array() - .execute::(&mut LEGACY_SESSION.create_execution_ctx())? - .into_array(); - - assert_arrays_eq!(actual, PrimitiveArray::from_iter((0..1024).map(|_| 42i32))); - - Ok(()) - } -} diff --git a/vortex-array/src/arrays/dict/vtable/sparse.rs b/vortex-array/src/arrays/dict/vtable/sparse.rs new file mode 100644 index 00000000000..1552df6ea5d --- /dev/null +++ b/vortex-array/src/arrays/dict/vtable/sparse.rs @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_buffer::BitBuffer; +use vortex_buffer::Buffer; +use vortex_error::VortexResult; +use vortex_error::vortex_panic; +use vortex_mask::Mask; + +use super::super::array::DictSlots; +use super::super::array::compute_referenced_values_mask_from_codes; +use super::DictArray; +use super::cardinality; +use crate::Canonical; +use crate::IntoArray; +use crate::arrays::Primitive; +use crate::arrays::PrimitiveArray; +use crate::arrays::dict::DictArraySlotsExt; +use crate::builtins::ArrayBuiltins; +use crate::dtype::DType; +use crate::dtype::PType; +use crate::executor::ExecutionCtx; +use crate::validity::Validity; + +// TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values +// encoding, code count, unique-code count, and exporter/canonicalization costs. +const SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD: usize = 4; +const SPARSE_CANONICALIZE_SAMPLED_CODES_PER_VALUE_THRESHOLD: usize = 2; +const SPARSE_CANONICALIZE_MIN_SAMPLED_VALUES_LEN: usize = 512; + +struct SparseDictCodes { + /// Original dictionary value indices that are actually referenced by the live codes. + unique_codes: PrimitiveArray, + /// Codes rewritten to index into `unique_codes` instead of the original values array. + remapped_codes: PrimitiveArray, +} + +#[cold] +#[inline(never)] +pub(super) fn sparse_canonicalize_dict( + array: &DictArray, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let codes = array.codes().as_::().into_owned(); + let Some(sparse_codes) = collect_sparse_codes(&codes, array.values().len(), ctx)? else { + return Ok(None); + }; + + // Build a temporary parent that represents `values.take(unique_codes)`. Calling + // `execute_parent` on the values child lets encodings such as FSST/VarBin sparse-take just + // the referenced dictionary values. If the child has no specialized parent execution, fall + // back to canonicalizing all values and then taking from the canonical array. + let values = array.values(); + let unique_values_parent = DictArray::new( + sparse_codes.unique_codes.clone().into_array(), + values.clone(), + ) + .into_array(); + let unique_values = if let Some(taken_values) = + values.execute_parent(&unique_values_parent, DictSlots::VALUES, ctx)? + { + taken_values.execute::(ctx)?.into_array() + } else { + let canonical_values = values.clone().execute::(ctx)?.into_array(); + DictArray::new(sparse_codes.unique_codes.into_array(), canonical_values) + .into_array() + .execute::(ctx)? + .into_array() + }; + + // Now the dictionary is dense over its compacted values, so normal dictionary execution only + // takes from the small `unique_values` array. This avoids `values.take(codes)` preserving a + // large dictionary with many unused values. + let compact_dict = unsafe { + DictArray::new_unchecked(sparse_codes.remapped_codes.into_array(), unique_values) + .set_all_values_referenced(true) + }; + + compact_dict + .into_array() + .execute::(ctx) + .map(Some) +} + +#[cold] +#[inline(never)] +fn collect_sparse_codes( + codes: &PrimitiveArray, + values_len: usize, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let validity = codes.validity()?; + let validity_mask = validity.execute_mask(codes.len(), ctx)?; + let codes = codes_as_u64(codes, ctx)?; + + // The exact pass below scans every code and allocates a remap table sized to the values array. + // Do it only when a cheap upper bound/sample says the dictionary is likely sparse enough. + if !should_collect_sparse_codes(&codes, values_len, &validity_mask) { + return Ok(None); + } + + let referenced_values = + compute_referenced_values_mask_from_codes(&codes, values_len, &validity_mask, true)?; + let unique_count = referenced_values.true_count(); + if unique_count.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) >= values_len { + return Ok(None); + } + + collect_sparse_codes_u64(&codes, referenced_values, validity_mask, validity).map(Some) +} + +fn codes_as_u64(codes: &PrimitiveArray, ctx: &mut ExecutionCtx) -> VortexResult { + if codes.ptype() == PType::U64 { + return Ok(codes.clone()); + } + + codes + .clone() + .into_array() + .cast(DType::Primitive(PType::U64, codes.dtype().nullability()))? + .execute::(ctx) +} + +#[cold] +#[inline(never)] +fn should_collect_sparse_codes( + codes: &PrimitiveArray, + values_len: usize, + validity_mask: &Mask, +) -> bool { + if codes.is_empty() || values_len == 0 || validity_mask.true_count() == 0 { + return false; + } + + // If even the worst case "every live code is unique" is sparse, skip sampling and go straight + // to the exact remap pass. + if codes + .len() + .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) + < values_len + { + return true; + } + + if !should_sample_sparse_canonicalize(codes.len(), values_len) { + return false; + } + + // Otherwise sample first. This catches cases like many live rows all referencing the same + // dictionary value without forcing dense dictionaries through the exact remap scan. + if !cardinality::has_repeated_code_sample(codes, validity_mask) { + return false; + } + + let Some(estimated_unique_codes) = cardinality::estimate_code_cardinality(codes, validity_mask) + else { + return false; + }; + + estimated_unique_codes.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) + < values_len +} + +#[inline] +pub(super) fn should_consider_sparse_canonicalize(codes_len: usize, values_len: usize) -> bool { + codes_len.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) < values_len + || should_sample_sparse_canonicalize(codes_len, values_len) +} + +#[inline] +fn should_sample_sparse_canonicalize(codes_len: usize, values_len: usize) -> bool { + // Sampling is only a preflight for cases that are not sparse by row count alone. Keep it away + // from tiny dictionary domains and near-dense slices where the estimator overhead dominates. + values_len >= SPARSE_CANONICALIZE_MIN_SAMPLED_VALUES_LEN + && codes_len.saturating_mul(SPARSE_CANONICALIZE_SAMPLED_CODES_PER_VALUE_THRESHOLD) + < values_len +} + +#[cold] +#[inline(never)] +fn collect_sparse_codes_u64( + codes: &PrimitiveArray, + referenced_values: BitBuffer, + validity_mask: Mask, + validity: Validity, +) -> VortexResult { + let unique_count = referenced_values.true_count(); + let mut value_remap = vec![usize::MAX; referenced_values.len()]; + let mut unique_codes = Vec::with_capacity(unique_count); + + // Reuse the same exact referenced-values bitmap as the dictionary aggregate kernels. Walking + // the bitmap assigns compact codes in original dictionary order, which keeps compaction + // deterministic and independent of the first live row that happened to reference each value. + for old_code in referenced_values.set_indices() { + let new_code = unique_codes.len(); + value_remap[old_code] = new_code; + unique_codes.push(old_code as u64); + } + + let mut remapped_codes = Vec::with_capacity(codes.len()); + for (idx, &code) in codes.as_slice::().iter().enumerate() { + if !validity_mask.value(idx) { + remapped_codes.push(0); + continue; + } + + let old_code = usize::try_from(code) + .unwrap_or_else(|_| vortex_panic!("dictionary code {code} does not fit usize")); + let new_code = value_remap[old_code]; + debug_assert_ne!(new_code, usize::MAX); + + remapped_codes.push(new_code as u64); + } + + Ok(SparseDictCodes { + unique_codes: PrimitiveArray::new(Buffer::from_iter(unique_codes), Validity::NonNullable), + remapped_codes: PrimitiveArray::new(Buffer::from_iter(remapped_codes), validity), + }) +} + +#[cfg(test)] +mod tests { + use vortex_error::VortexResult; + + use super::*; + use crate::LEGACY_SESSION; + use crate::VortexSessionExecute; + use crate::assert_arrays_eq; + + #[test] + fn collect_sparse_codes_remaps_unique_values() -> VortexResult<()> { + let codes = PrimitiveArray::from_option_iter([Some(50u32), None, Some(70), Some(50)]); + let Some(sparse) = + collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())? + else { + panic!("codes are sparse"); + }; + + assert_arrays_eq!( + sparse.unique_codes.into_array(), + PrimitiveArray::from_iter([50u64, 70]).into_array() + ); + assert_arrays_eq!( + sparse.remapped_codes.into_array(), + PrimitiveArray::from_option_iter([Some(0u64), None, Some(1), Some(0)]).into_array() + ); + + Ok(()) + } + + #[test] + fn sampled_sparse_codes_remaps_repeated_large_codes() -> VortexResult<()> { + let codes = PrimitiveArray::from_iter((0..1024).map(|_| 42u32)); + let Some(sparse) = + collect_sparse_codes(&codes, 3000, &mut LEGACY_SESSION.create_execution_ctx())? + else { + panic!("sampled codes are sparse"); + }; + + assert_arrays_eq!( + sparse.unique_codes.into_array(), + PrimitiveArray::from_iter([42u64]).into_array() + ); + assert_arrays_eq!( + sparse.remapped_codes.into_array(), + PrimitiveArray::from_iter((0..1024).map(|_| 0u64)).into_array() + ); + + Ok(()) + } + + #[test] + fn dense_sample_skips_sparse_code_collection() -> VortexResult<()> { + let codes = PrimitiveArray::from_iter((0..1024).map(|idx| idx as u32)); + + assert!( + collect_sparse_codes(&codes, 3000, &mut LEGACY_SESSION.create_execution_ctx())? + .is_none() + ); + + Ok(()) + } + + #[test] + fn sparse_dict_canonicalizes_correctly() -> VortexResult<()> { + let dict = DictArray::new( + PrimitiveArray::from_option_iter([Some(50u32), None, Some(70), Some(50)]).into_array(), + PrimitiveArray::from_iter(0..100i32).into_array(), + ); + + let actual = dict + .into_array() + .execute::(&mut LEGACY_SESSION.create_execution_ctx())? + .into_array(); + + assert_arrays_eq!( + actual, + PrimitiveArray::from_option_iter([Some(50i32), None, Some(70), Some(50)]) + ); + + Ok(()) + } + + #[test] + fn sampled_sparse_dict_canonicalizes_repeated_codes() -> VortexResult<()> { + let dict = DictArray::new( + PrimitiveArray::from_iter((0..1024).map(|_| 42u32)).into_array(), + PrimitiveArray::from_iter(0..3000i32).into_array(), + ); + + let actual = dict + .into_array() + .execute::(&mut LEGACY_SESSION.create_execution_ctx())? + .into_array(); + + assert_arrays_eq!(actual, PrimitiveArray::from_iter((0..1024).map(|_| 42i32))); + + Ok(()) + } +} From 54b8dba4b5f29238f000161368e6b7d3fa94a55e Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 8 May 2026 13:27:43 -0400 Subject: [PATCH 10/11] Skip sparse dict gate for filter values Signed-off-by: Nicholas Gates --- vortex-array/src/arrays/dict/vtable/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs index 7ca71b49d49..ab679dc27d1 100644 --- a/vortex-array/src/arrays/dict/vtable/mod.rs +++ b/vortex-array/src/arrays/dict/vtable/mod.rs @@ -189,11 +189,11 @@ impl VTable for Dict { ))); } - if sparse::should_consider_sparse_canonicalize(array.codes().len(), array.values().len()) - && !array.has_all_values_referenced() + if !array.has_all_values_referenced() // `take(FilterArray(...))` is also represented as a dictionary. Compacting that shape // burns time before the lazy filter has a chance to push the row mask into its child. && !array.values().is::() + && sparse::should_consider_sparse_canonicalize(array.codes().len(), array.values().len()) && let Some(canonical) = sparse::sparse_canonicalize_dict(&array, ctx)? { return Ok(ExecutionResult::done(canonical)); From 22be09b1e5c03807776ccd2c688bed59f48b7530 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 8 May 2026 14:04:53 -0400 Subject: [PATCH 11/11] Deduplicate cardinality code Signed-off-by: Nicholas Gates --- vortex-array/public-api.lock | 6 ++ .../arrays/dict/{vtable => }/cardinality.rs | 24 +++-- vortex-array/src/arrays/dict/mod.rs | 1 + vortex-array/src/arrays/dict/vtable/mod.rs | 1 - vortex-array/src/arrays/dict/vtable/sparse.rs | 7 +- vortex-duckdb/src/exporter/dict.rs | 2 +- .../src/exporter/dict_cardinality.rs | 102 ------------------ vortex-duckdb/src/exporter/mod.rs | 1 - 8 files changed, 25 insertions(+), 119 deletions(-) rename vortex-array/src/arrays/dict/{vtable => }/cardinality.rs (88%) delete mode 100644 vortex-duckdb/src/exporter/dict_cardinality.rs diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index 8d5cc75c895..5e838a6ffcf 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -2268,6 +2268,12 @@ pub type vortex_array::arrays::decimal::DecimalArray = vortex_array::Array(&vortex_array::arrays::PrimitiveArray, &vortex_mask::Mask) -> core::option::Option + +pub fn vortex_array::arrays::dict::cardinality::has_repeated_code_sample(&vortex_array::arrays::PrimitiveArray, &vortex_mask::Mask) -> bool + pub mod vortex_array::arrays::dict::vtable pub struct vortex_array::arrays::dict::vtable::Dict diff --git a/vortex-array/src/arrays/dict/vtable/cardinality.rs b/vortex-array/src/arrays/dict/cardinality.rs similarity index 88% rename from vortex-array/src/arrays/dict/vtable/cardinality.rs rename to vortex-array/src/arrays/dict/cardinality.rs index 50cb4037e96..fb8ed84da5b 100644 --- a/vortex-array/src/arrays/dict/vtable/cardinality.rs +++ b/vortex-array/src/arrays/dict/cardinality.rs @@ -3,13 +3,16 @@ //! Sampling-based cardinality estimation for dictionary codes. //! -//! This module is used only as a cheap gate before the exact sparse-dictionary remap pass. The -//! estimate may be conservative or noisy, but correctness does not depend on it: callers must still -//! collect the exact unique code set and re-check the sparse threshold before compacting. +//! This module is used only as a cheap gate before the exact sparse-dictionary remap pass and as a +//! routing hint for downstream exporters. The estimate may be conservative or noisy, but +//! correctness does not depend on it: callers must still collect the exact unique code set and +//! re-check the sparse threshold before compacting. use vortex_mask::Mask; use crate::arrays::PrimitiveArray; +use crate::dtype::IntegerPType; + const SAMPLE_SIZE: usize = 128; const REPEATED_CODE_PROBE_SIZE: usize = 16; @@ -19,7 +22,10 @@ const REPEATED_CODE_PROBE_SIZE: usize = 16; /// next, cheaper filter for cases that are not sparse by row count alone: dense dictionaries should /// not pay the full estimator cost unless the code stream first shows evidence of repeated codes. /// A `true` result only means "run the estimator"; it is not enough to compact by itself. -pub(super) fn has_repeated_code_sample(codes: &PrimitiveArray, validity_mask: &Mask) -> bool { +pub fn has_repeated_code_sample( + codes: &PrimitiveArray, + validity_mask: &Mask, +) -> bool { let sample_count = codes.len().min(REPEATED_CODE_PROBE_SIZE); let mut observed_codes = Vec::::with_capacity(sample_count); @@ -29,7 +35,7 @@ pub(super) fn has_repeated_code_sample(codes: &PrimitiveArray, validity_mask: &M continue; } - let code = code_at(codes, idx); + let code: usize = codes.as_slice::()[idx].as_(); if observed_codes.contains(&code) { return true; } @@ -44,7 +50,7 @@ pub(super) fn has_repeated_code_sample(codes: &PrimitiveArray, validity_mask: &M /// The estimator samples deterministic bucket midpoints so repeated executions make the same /// compaction decision for the same input. Returning `None` means no valid sampled codes were seen. /// A returned value should only be used to decide whether an exact pass is worth attempting. -pub(super) fn estimate_code_cardinality( +pub fn estimate_code_cardinality( codes: &PrimitiveArray, validity_mask: &Mask, ) -> Option { @@ -59,7 +65,7 @@ pub(super) fn estimate_code_cardinality( continue; } - let code = code_at(codes, idx); + let code: usize = codes.as_slice::()[idx].as_(); if let Some((_, count)) = observed_codes .iter_mut() .find(|(observed, _)| *observed == code) @@ -123,10 +129,6 @@ fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize { ((bucket_start + bucket_end) / 2).min(len - 1) as usize } -fn code_at(codes: &PrimitiveArray, idx: usize) -> usize { - usize::try_from(codes.as_slice::()[idx]).unwrap_or(usize::MAX) -} - fn div_ceil(numerator: usize, denominator: usize) -> usize { debug_assert!(denominator > 0); numerator / denominator + usize::from(!numerator.is_multiple_of(denominator)) diff --git a/vortex-array/src/arrays/dict/mod.rs b/vortex-array/src/arrays/dict/mod.rs index 0414eea7def..f0bba784794 100644 --- a/vortex-array/src/arrays/dict/mod.rs +++ b/vortex-array/src/arrays/dict/mod.rs @@ -14,6 +14,7 @@ pub use arbitrary::ArbitraryDictArray; mod array; pub use array::*; +pub mod cardinality; pub(crate) mod compute; mod execute; diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs index ab679dc27d1..a0cde010147 100644 --- a/vortex-array/src/arrays/dict/vtable/mod.rs +++ b/vortex-array/src/arrays/dict/vtable/mod.rs @@ -49,7 +49,6 @@ use crate::scalar::Scalar; use crate::serde::ArrayChildren; use crate::validity::Validity; -mod cardinality; mod kernel; mod operations; mod sparse; diff --git a/vortex-array/src/arrays/dict/vtable/sparse.rs b/vortex-array/src/arrays/dict/vtable/sparse.rs index 1552df6ea5d..23944dc0b29 100644 --- a/vortex-array/src/arrays/dict/vtable/sparse.rs +++ b/vortex-array/src/arrays/dict/vtable/sparse.rs @@ -9,8 +9,8 @@ use vortex_mask::Mask; use super::super::array::DictSlots; use super::super::array::compute_referenced_values_mask_from_codes; +use super::super::cardinality; use super::DictArray; -use super::cardinality; use crate::Canonical; use crate::IntoArray; use crate::arrays::Primitive; @@ -148,11 +148,12 @@ fn should_collect_sparse_codes( // Otherwise sample first. This catches cases like many live rows all referencing the same // dictionary value without forcing dense dictionaries through the exact remap scan. - if !cardinality::has_repeated_code_sample(codes, validity_mask) { + if !cardinality::has_repeated_code_sample::(codes, validity_mask) { return false; } - let Some(estimated_unique_codes) = cardinality::estimate_code_cardinality(codes, validity_mask) + let Some(estimated_unique_codes) = + cardinality::estimate_code_cardinality::(codes, validity_mask) else { return false; }; diff --git a/vortex-duckdb/src/exporter/dict.rs b/vortex-duckdb/src/exporter/dict.rs index 0728779cdf6..eec71dc08a6 100644 --- a/vortex-duckdb/src/exporter/dict.rs +++ b/vortex-duckdb/src/exporter/dict.rs @@ -13,6 +13,7 @@ use vortex::array::arrays::DictArray; use vortex::array::arrays::PrimitiveArray; use vortex::array::arrays::dict::DictArrayExt; use vortex::array::arrays::dict::DictArraySlotsExt; +use vortex::array::arrays::dict::cardinality::estimate_code_cardinality; use vortex::array::match_each_integer_ptype; use vortex::dtype::IntegerPType; use vortex::error::VortexResult; @@ -25,7 +26,6 @@ use crate::exporter::ColumnExporter; use crate::exporter::all_invalid; use crate::exporter::cache::ConversionCache; use crate::exporter::constant; -use crate::exporter::dict_cardinality::estimate_code_cardinality; use crate::exporter::new_array_exporter; struct DictExporter { diff --git a/vortex-duckdb/src/exporter/dict_cardinality.rs b/vortex-duckdb/src/exporter/dict_cardinality.rs deleted file mode 100644 index 1a56fa1a954..00000000000 --- a/vortex-duckdb/src/exporter/dict_cardinality.rs +++ /dev/null @@ -1,102 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Sampling-based cardinality estimation for DuckDB dictionary export. -//! -//! The exporter uses this as a cheap routing hint before choosing between a reusable DuckDB -//! dictionary and executing the Vortex dictionary into a flat vector. Correctness does not depend on -//! the estimate: the compacting path still executes Vortex's dictionary canonicalization logic. - -use vortex::array::arrays::PrimitiveArray; -use vortex::dtype::IntegerPType; -use vortex::mask::Mask; - -const SAMPLE_SIZE: usize = 128; - -/// Estimate the number of distinct non-null dictionary codes in a DuckDB export batch. -/// -/// Returning `None` means no valid sampled codes were seen. A returned estimate should be treated -/// only as a cost signal for whether the exporter should call into Vortex dictionary execution. -pub(super) fn estimate_code_cardinality( - codes: &PrimitiveArray, - codes_mask: &Mask, -) -> Option { - let sample_count = codes.len().min(SAMPLE_SIZE); - let mut observed_codes = Vec::<(usize, usize)>::new(); - - // This mirrors the array-side sparse dictionary gate. The exporter needs the estimate before - // it decides between a reusable DuckDB dictionary and executing the Vortex dictionary away. - // Correctness does not depend on the estimate; it only decides whether to take the compacting - // path. - for sample_idx in 0..sample_count { - let idx = sample_index(sample_idx, codes.len(), sample_count); - if !codes_mask.value(idx) { - continue; - } - - let code = codes.as_slice::()[idx].as_(); - if let Some((_, count)) = observed_codes - .iter_mut() - .find(|(observed, _)| *observed == code) - { - *count += 1; - } else { - observed_codes.push((code, 1)); - } - } - - estimate_cardinality_from_observations(&observed_codes) -} - -/// Estimate total cardinality from `(code, observed_count)` sample observations. -/// -/// The correction is Chao1-style: singleton-heavy samples imply more unseen codes, while repeated -/// observations imply the selected code stream is likely low-cardinality. -fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Option { - if observed_codes.is_empty() { - return None; - } - - let unique_count = observed_codes.len(); - let singleton_count = observed_codes - .iter() - .filter(|(_, count)| *count == 1) - .count(); - let doubleton_count = observed_codes - .iter() - .filter(|(_, count)| *count == 2) - .count(); - - let unseen_estimate = if doubleton_count == 0 { - singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2 - } else { - div_ceil( - singleton_count.saturating_mul(singleton_count), - 2 * doubleton_count, - ) - }; - - Some(unique_count.saturating_add(unseen_estimate)) -} - -/// Return the midpoint index for one deterministic sampling bucket. -/// -/// Bucket midpoint sampling gives coverage across the whole code vector without introducing RNG -/// state or nondeterministic exporter decisions. -fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize { - debug_assert!(len > 0); - debug_assert!(sample_count > 0); - - let sample_idx = sample_idx as u128; - let len = len as u128; - let sample_count = sample_count as u128; - let bucket_start = sample_idx * len / sample_count; - let bucket_end = (sample_idx + 1) * len / sample_count; - - ((bucket_start + bucket_end) / 2).min(len - 1) as usize -} - -fn div_ceil(numerator: usize, denominator: usize) -> usize { - debug_assert!(denominator > 0); - numerator / denominator + usize::from(!numerator.is_multiple_of(denominator)) -} diff --git a/vortex-duckdb/src/exporter/mod.rs b/vortex-duckdb/src/exporter/mod.rs index 9c01e2f35e5..517776f5521 100644 --- a/vortex-duckdb/src/exporter/mod.rs +++ b/vortex-duckdb/src/exporter/mod.rs @@ -7,7 +7,6 @@ mod cache; mod constant; mod decimal; mod dict; -mod dict_cardinality; mod fixed_size_list; mod list; mod list_view;