From baceda26783d13b8ce083cce4fb190f87c66109b Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Fri, 8 May 2026 09:31:37 -0400
Subject: [PATCH 01/11] Sparse Dictionary Canonicalize

Signed-off-by: Nicholas Gates <nick@nickgates.com>
---
 vortex-array/src/arrays/dict/vtable/mod.rs | 322 +++++++++++++++++++++
 vortex-duckdb/src/exporter/cache.rs        |   2 -
 vortex-duckdb/src/exporter/dict.rs         | 193 ++++++++++--
 3 files changed, 494 insertions(+), 23 deletions(-)
diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs
index 4893be5c7ed..dd5b87a772f 100644
--- a/vortex-array/src/arrays/dict/vtable/mod.rs
+++ b/vortex-array/src/arrays/dict/vtable/mod.rs
@@ -4,12 +4,15 @@
 use std::hash::Hasher;
 
 use kernel::PARENT_KERNELS;
+use num_traits::FromPrimitive;
 use prost::Message;
+use vortex_buffer::Buffer;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
 use vortex_error::vortex_ensure;
 use vortex_error::vortex_err;
 use vortex_error::vortex_panic;
+use vortex_mask::Mask;
 use vortex_session::VortexSession;
 use vortex_session::registry::CachedId;
 
@@ -24,6 +27,7 @@ use crate::ArrayEq;
 use crate::ArrayHash;
 use crate::ArrayRef;
 use crate::Canonical;
+use crate::IntoArray;
 use crate::Precision;
 use crate::array::Array;
 use crate::array::ArrayId;
@@ -32,16 +36,19 @@ use crate::array::ArrayView;
 use crate::array::VTable;
 use crate::arrays::ConstantArray;
 use crate::arrays::Primitive;
+use crate::arrays::PrimitiveArray;
 use crate::arrays::dict::DictArrayExt;
 use crate::arrays::dict::DictArraySlotsExt;
 use crate::arrays::dict::compute::rules::PARENT_RULES;
 use crate::arrays::dict::execute::take_canonical;
 use crate::buffer::BufferHandle;
 use crate::dtype::DType;
+use crate::dtype::IntegerPType;
 use crate::dtype::Nullability;
 use crate::dtype::PType;
 use crate::executor::ExecutionCtx;
 use crate::executor::ExecutionResult;
+use crate::match_each_integer_ptype;
 use crate::require_child;
 use crate::scalar::Scalar;
 use crate::serde::ArrayChildren;
@@ -54,6 +61,11 @@ mod validity;
 /// A [`Dict`]-encoded Vortex array.
 pub type DictArray = Array<Dict>;
 
+// TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values
+// encoding, code count, unique-code count, and exporter/canonicalization costs.
+const SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD: usize = 4;
+const SPARSE_CANONICALIZE_SAMPLE_SIZE: usize = 128;
+
 #[derive(Clone, Debug)]
 pub struct Dict;
 
@@ -185,6 +197,10 @@ impl VTable for Dict {
             )));
         }
 
+        if let Some(canonical) = sparse_canonicalize_dict(&array, ctx)? {
+            return Ok(ExecutionResult::done(canonical));
+        }
+
         let array = require_child!(array, array.values(), DictSlots::VALUES => AnyCanonical);
 
         let DictParts { values, codes, .. } = array.into_parts();
@@ -213,3 +229,309 @@ impl VTable for Dict {
         PARENT_KERNELS.execute(array, parent, child_idx, ctx)
     }
 }
+
+struct SparseDictCodes {
+    unique_codes: PrimitiveArray,
+    remapped_codes: PrimitiveArray,
+}
+
+fn sparse_canonicalize_dict(
+    array: &DictArray,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<Canonical>> {
+    let codes = array.codes().as_::<Primitive>().into_owned();
+    let Some(sparse_codes) = collect_sparse_codes(&codes, array.values().len(), ctx)? else {
+        return Ok(None);
+    };
+
+    let values = array.values();
+    let unique_values_parent = DictArray::new(
+        sparse_codes.unique_codes.clone().into_array(),
+        values.clone(),
+    )
+    .into_array();
+    let unique_values = if let Some(taken_values) =
+        values.execute_parent(&unique_values_parent, DictSlots::VALUES, ctx)?
+    {
+        taken_values.execute::<Canonical>(ctx)?.into_array()
+    } else {
+        let canonical_values = values.clone().execute::<Canonical>(ctx)?.into_array();
+        DictArray::new(sparse_codes.unique_codes.into_array(), canonical_values)
+            .into_array()
+            .execute::<Canonical>(ctx)?
+            .into_array()
+    };
+
+    let compact_dict = unsafe {
+        DictArray::new_unchecked(sparse_codes.remapped_codes.into_array(), unique_values)
+            .set_all_values_referenced(true)
+    };
+
+    compact_dict
+        .into_array()
+        .execute::<Canonical>(ctx)
+        .map(Some)
+}
+
+fn collect_sparse_codes(
+    codes: &PrimitiveArray,
+    values_len: usize,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<SparseDictCodes>> {
+    let validity = codes.validity()?;
+    let validity_mask = validity.execute_mask(codes.len(), ctx)?;
+
+    if !should_collect_sparse_codes(codes, values_len, &validity_mask) {
+        return Ok(None);
+    }
+
+    let Some(sparse_codes) = match_each_integer_ptype!(codes.ptype(), |P| {
+        collect_sparse_codes_typed::<P>(codes, values_len, validity_mask, validity)?
+    }) else {
+        return Ok(None);
+    };
+
+    Ok(Some(sparse_codes))
+}
+
+fn should_collect_sparse_codes(
+    codes: &PrimitiveArray,
+    values_len: usize,
+    validity_mask: &Mask,
+) -> bool {
+    if codes.is_empty() || values_len == 0 || validity_mask.true_count() == 0 {
+        return false;
+    }
+
+    if codes
+        .len()
+        .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD)
+        < values_len
+    {
+        return true;
+    }
+
+    let Some(estimated_unique_codes) = match_each_integer_ptype!(codes.ptype(), |P| {
+        estimate_code_cardinality::<P>(codes, validity_mask)
+    }) else {
+        return false;
+    };
+
+    estimated_unique_codes.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD)
+        < values_len
+}
+
+fn estimate_code_cardinality<P: IntegerPType>(
+    codes: &PrimitiveArray,
+    validity_mask: &Mask,
+) -> Option<usize> {
+    let sample_count = codes.len().min(SPARSE_CANONICALIZE_SAMPLE_SIZE);
+    let mut observed_codes = Vec::<(usize, usize)>::new();
+
+    for sample_idx in 0..sample_count {
+        let idx = sample_index(sample_idx, codes.len(), sample_count);
+        if !validity_mask.value(idx) {
+            continue;
+        }
+
+        let code = codes.as_slice::<P>()[idx].as_();
+        if let Some((_, count)) = observed_codes
+            .iter_mut()
+            .find(|(observed, _)| *observed == code)
+        {
+            *count += 1;
+        } else {
+            observed_codes.push((code, 1));
+        }
+    }
+
+    if observed_codes.is_empty() {
+        return None;
+    }
+
+    let unique_count = observed_codes.len();
+    let singleton_count = observed_codes
+        .iter()
+        .filter(|(_, count)| *count == 1)
+        .count();
+    let doubleton_count = observed_codes
+        .iter()
+        .filter(|(_, count)| *count == 2)
+        .count();
+
+    let unseen_estimate = if doubleton_count == 0 {
+        singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2
+    } else {
+        div_ceil(
+            singleton_count.saturating_mul(singleton_count),
+            2 * doubleton_count,
+        )
+    };
+
+    Some(unique_count.saturating_add(unseen_estimate))
+}
+
+fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
+    debug_assert!(len > 0);
+    debug_assert!(sample_count > 0);
+
+    let sample_idx = sample_idx as u128;
+    let len = len as u128;
+    let sample_count = sample_count as u128;
+    let bucket_start = sample_idx * len / sample_count;
+    let bucket_end = (sample_idx + 1) * len / sample_count;
+
+    ((bucket_start + bucket_end) / 2).min(len - 1) as usize
+}
+
+fn div_ceil(numerator: usize, denominator: usize) -> usize {
+    debug_assert!(denominator > 0);
+    numerator / denominator + usize::from(!numerator.is_multiple_of(denominator))
+}
+
+fn collect_sparse_codes_typed<P: IntegerPType + FromPrimitive>(
+    codes: &PrimitiveArray,
+    values_len: usize,
+    validity_mask: Mask,
+    validity: Validity,
+) -> VortexResult<Option<SparseDictCodes>> {
+    let mut value_remap = vec![usize::MAX; values_len];
+    let mut unique_codes = Vec::new();
+    let mut remapped_codes = Vec::with_capacity(codes.len());
+
+    for (idx, &code) in codes.as_slice::<P>().iter().enumerate() {
+        if !validity_mask.value(idx) {
+            remapped_codes.push(P::default());
+            continue;
+        }
+
+        let old_code = code.as_();
+        let mut new_code = value_remap[old_code];
+        if new_code == usize::MAX {
+            new_code = unique_codes.len();
+            value_remap[old_code] = new_code;
+            unique_codes.push(old_code as u64);
+        }
+
+        remapped_codes.push(P::from_usize(new_code).unwrap_or_else(|| {
+            vortex_panic!(
+                "compacted dictionary code {new_code} does not fit in {}",
+                P::PTYPE
+            )
+        }));
+    }
+
+    if unique_codes
+        .len()
+        .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD)
+        >= values_len
+    {
+        return Ok(None);
+    }
+
+    Ok(Some(SparseDictCodes {
+        unique_codes: PrimitiveArray::new(Buffer::from_iter(unique_codes), Validity::NonNullable),
+        remapped_codes: PrimitiveArray::new(Buffer::from_iter(remapped_codes), validity),
+    }))
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_error::VortexResult;
+
+    use super::*;
+    use crate::LEGACY_SESSION;
+    use crate::VortexSessionExecute;
+    use crate::assert_arrays_eq;
+
+    #[test]
+    fn collect_sparse_codes_remaps_unique_values() -> VortexResult<()> {
+        let codes = PrimitiveArray::from_option_iter([Some(50u32), None, Some(70), Some(50)]);
+        let Some(sparse) =
+            collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())?
+        else {
+            panic!("codes are sparse");
+        };
+
+        assert_arrays_eq!(
+            sparse.unique_codes.into_array(),
+            PrimitiveArray::from_iter([50u64, 70]).into_array()
+        );
+        assert_arrays_eq!(
+            sparse.remapped_codes.into_array(),
+            PrimitiveArray::from_option_iter([Some(0u32), None, Some(1), Some(0)]).into_array()
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn sampled_sparse_codes_remaps_repeated_large_codes() -> VortexResult<()> {
+        let codes = PrimitiveArray::from_iter((0..1024).map(|_| 42u32));
+        let Some(sparse) =
+            collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())?
+        else {
+            panic!("sampled codes are sparse");
+        };
+
+        assert_arrays_eq!(
+            sparse.unique_codes.into_array(),
+            PrimitiveArray::from_iter([42u64]).into_array()
+        );
+        assert_arrays_eq!(
+            sparse.remapped_codes.into_array(),
+            PrimitiveArray::from_iter((0..1024).map(|_| 0u32)).into_array()
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn dense_sample_skips_sparse_code_collection() -> VortexResult<()> {
+        let codes = PrimitiveArray::from_iter((0..1024).map(|idx| (idx % 100) as u32));
+
+        assert!(
+            collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())?
+                .is_none()
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn sparse_dict_canonicalizes_correctly() -> VortexResult<()> {
+        let dict = DictArray::new(
+            PrimitiveArray::from_option_iter([Some(50u32), None, Some(70), Some(50)]).into_array(),
+            PrimitiveArray::from_iter(0..100i32).into_array(),
+        );
+
+        let actual = dict
+            .into_array()
+            .execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())?
+            .into_array();
+
+        assert_arrays_eq!(
+            actual,
+            PrimitiveArray::from_option_iter([Some(50i32), None, Some(70), Some(50)])
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn sampled_sparse_dict_canonicalizes_repeated_codes() -> VortexResult<()> {
+        let dict = DictArray::new(
+            PrimitiveArray::from_iter((0..1024).map(|_| 42u32)).into_array(),
+            PrimitiveArray::from_iter(0..100i32).into_array(),
+        );
+
+        let actual = dict
+            .into_array()
+            .execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())?
+            .into_array();
+
+        assert_arrays_eq!(actual, PrimitiveArray::from_iter((0..1024).map(|_| 42i32)));
+
+        Ok(())
+    }
+}
diff --git a/vortex-duckdb/src/exporter/cache.rs b/vortex-duckdb/src/exporter/cache.rs
index 2f495ba9608..e20efbf7117 100644
--- a/vortex-duckdb/src/exporter/cache.rs
+++ b/vortex-duckdb/src/exporter/cache.rs
@@ -5,7 +5,6 @@ use std::sync::Arc;
 
 use parking_lot::Mutex;
 use vortex::array::ArrayRef;
-use vortex::array::Canonical;
 use vortex_utils::aliases::dash_map::DashMap;
 
 use crate::duckdb::ReusableDict;
@@ -20,6 +19,5 @@ use crate::duckdb::Vector;
 pub struct ConversionCache {
     pub dict_cache: DashMap<usize, (ArrayRef, ReusableDict)>,
     pub values_cache: DashMap<usize, (ArrayRef, Arc<Mutex<Vector>>)>,
-    pub canonical_cache: DashMap<usize, (ArrayRef, Canonical)>,
     pub file_index: usize,
 }
diff --git a/vortex-duckdb/src/exporter/dict.rs b/vortex-duckdb/src/exporter/dict.rs
index da03cd4254c..be14569a48c 100644
--- a/vortex-duckdb/src/exporter/dict.rs
+++ b/vortex-duckdb/src/exporter/dict.rs
@@ -33,6 +33,11 @@ struct DictExporter<I: IntegerPType> {
     codes_type: PhantomData<I>,
 }
 
+// TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values
+// encoding, code count, unique-code count, and exporter/canonicalization costs.
+const SPARSE_EXPORT_CODES_PER_VALUE_THRESHOLD: usize = 4;
+const SPARSE_EXPORT_SAMPLE_SIZE: usize = 128;
+
 pub(crate) fn new_exporter_with_flatten(
     array: &DictArray,
     cache: &ConversionCache,
@@ -42,21 +47,21 @@ pub(crate) fn new_exporter_with_flatten(
 ) -> VortexResult<Box<dyn ColumnExporter>> {
     // Grab the cache dictionary values.
     let values = array.values();
-    let codes = array.codes();
-    let codes_len = codes.len();
+    let codes_array = array.codes();
+    let codes_len = codes_array.len();
 
     if let Some(constant) = values.as_opt::<Constant>() {
         return constant::new_exporter_with_mask(
             ConstantArray::new(constant.scalar().clone(), codes_len),
-            codes.validity()?.execute_mask(codes_len, ctx)?,
+            codes_array.validity()?.execute_mask(codes_len, ctx)?,
             cache,
             ctx,
         );
     }
 
-    let codes_mask = codes.validity()?.execute_mask(codes_len, ctx)?;
+    let codes_mask = codes_array.validity()?.execute_mask(codes_len, ctx)?;
 
-    match codes_mask {
+    match &codes_mask {
         Mask::AllTrue(_) => {}
         Mask::AllFalse(_) => return Ok(all_invalid::new_exporter()),
         Mask::Values(_) => {
@@ -67,25 +72,24 @@ pub(crate) fn new_exporter_with_flatten(
     }
 
     let values_key = values.addr();
-    let codes = array.codes().clone().execute::<PrimitiveArray>(ctx)?;
+    let codes = codes_array.clone().execute::<PrimitiveArray>(ctx)?;
+
+    if !flatten && should_export_sparse(&codes, values.len(), &codes_mask) {
+        return new_array_exporter(
+            array
+                .clone()
+                .into_array()
+                .execute::<Canonical>(ctx)?
+                .into_array(),
+            cache,
+            ctx,
+        );
+    }
 
     let reusable_dict = if flatten {
-        let canonical = cache
-            .canonical_cache
-            .get(&values_key)
-            .map(|entry| entry.value().1.clone());
-        let canonical = match canonical {
-            Some(c) => c,
-            None => {
-                let canonical = values.clone().execute::<Canonical>(ctx)?;
-                cache
-                    .canonical_cache
-                    .insert(values_key, (values.clone(), canonical.clone()));
-                canonical
-            }
-        };
         return new_array_exporter(
-            DictArray::new(array.codes().clone(), canonical.into_array())
+            array
+                .clone()
                 .into_array()
                 .execute::<Canonical>(ctx)?
                 .into_array(),
@@ -129,6 +133,96 @@ pub(crate) fn new_exporter_with_flatten(
     })
 }
 
+fn should_export_sparse(codes: &PrimitiveArray, values_len: usize, codes_mask: &Mask) -> bool {
+    if codes.is_empty() || values_len == 0 || codes_mask.true_count() == 0 {
+        return false;
+    }
+
+    if codes
+        .len()
+        .saturating_mul(SPARSE_EXPORT_CODES_PER_VALUE_THRESHOLD)
+        < values_len
+    {
+        return true;
+    }
+
+    let Some(estimated_unique_codes) = match_each_integer_ptype!(codes.ptype(), |I| {
+        estimate_code_cardinality::<I>(codes, codes_mask)
+    }) else {
+        return false;
+    };
+
+    estimated_unique_codes.saturating_mul(SPARSE_EXPORT_CODES_PER_VALUE_THRESHOLD) < values_len
+}
+
+fn estimate_code_cardinality<I: IntegerPType>(
+    codes: &PrimitiveArray,
+    codes_mask: &Mask,
+) -> Option<usize> {
+    let sample_count = codes.len().min(SPARSE_EXPORT_SAMPLE_SIZE);
+    let mut observed_codes = Vec::<(usize, usize)>::new();
+
+    for sample_idx in 0..sample_count {
+        let idx = sample_index(sample_idx, codes.len(), sample_count);
+        if !codes_mask.value(idx) {
+            continue;
+        }
+
+        let code = codes.as_slice::<I>()[idx].as_();
+        if let Some((_, count)) = observed_codes
+            .iter_mut()
+            .find(|(observed, _)| *observed == code)
+        {
+            *count += 1;
+        } else {
+            observed_codes.push((code, 1));
+        }
+    }
+
+    if observed_codes.is_empty() {
+        return None;
+    }
+
+    let unique_count = observed_codes.len();
+    let singleton_count = observed_codes
+        .iter()
+        .filter(|(_, count)| *count == 1)
+        .count();
+    let doubleton_count = observed_codes
+        .iter()
+        .filter(|(_, count)| *count == 2)
+        .count();
+
+    let unseen_estimate = if doubleton_count == 0 {
+        singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2
+    } else {
+        div_ceil(
+            singleton_count.saturating_mul(singleton_count),
+            2 * doubleton_count,
+        )
+    };
+
+    Some(unique_count.saturating_add(unseen_estimate))
+}
+
+fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
+    debug_assert!(len > 0);
+    debug_assert!(sample_count > 0);
+
+    let sample_idx = sample_idx as u128;
+    let len = len as u128;
+    let sample_count = sample_count as u128;
+    let bucket_start = sample_idx * len / sample_count;
+    let bucket_end = (sample_idx + 1) * len / sample_count;
+
+    ((bucket_start + bucket_end) / 2).min(len - 1) as usize
+}
+
+fn div_ceil(numerator: usize, denominator: usize) -> usize {
+    debug_assert!(denominator > 0);
+    numerator / denominator + usize::from(!numerator.is_multiple_of(denominator))
+}
+
 impl<I: IntegerPType + AsPrimitive<u32>> ColumnExporter for DictExporter<I> {
     fn export(
         &self,
@@ -285,6 +379,63 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_sparse_dict_exports_flat() -> VortexResult<()> {
+        let arr = DictArray::new(
+            PrimitiveArray::from_iter([50u32, 70]).into_array(),
+            PrimitiveArray::from_iter(0..100i32).into_array(),
+        );
+
+        let mut chunk = DataChunk::new([LogicalType::new(cpp::duckdb_type::DUCKDB_TYPE_INTEGER)]);
+
+        new_exporter(&arr, &ConversionCache::default())?.export(
+            0,
+            2,
+            chunk.get_vector_mut(0),
+            &mut SESSION.create_execution_ctx(),
+        )?;
+        chunk.set_len(2);
+
+        assert_eq!(
+            format!("{}", String::try_from(&*chunk)?),
+            r#"Chunk - [1 Columns]
+- FLAT INTEGER: 2 = [ 50, 70]
+"#
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_sampled_sparse_dict_exports_flat() -> VortexResult<()> {
+        let arr = DictArray::new(
+            PrimitiveArray::from_iter((0..32).map(|_| 42u32)).into_array(),
+            PrimitiveArray::from_iter(0..100i32).into_array(),
+        );
+
+        let mut chunk = DataChunk::new([LogicalType::new(cpp::duckdb_type::DUCKDB_TYPE_INTEGER)]);
+
+        new_exporter(&arr, &ConversionCache::default())?.export(
+            0,
+            32,
+            chunk.get_vector_mut(0),
+            &mut SESSION.create_execution_ctx(),
+        )?;
+        chunk.set_len(32);
+
+        let expected_values = std::iter::repeat_n("42", 32).collect::<Vec<_>>().join(", ");
+        assert_eq!(
+            format!("{}", String::try_from(&*chunk)?),
+            format!(
+                r#"Chunk - [1 Columns]
+- FLAT INTEGER: 32 = [ {expected_values}]
+"#
+            )
+        );
+
+        Ok(())
+    }
+
     #[ignore = "TODO(connor)[4809]: Exporters do not correctly handle empty vectors"]
     #[test]
     fn test_export_empty_dict() -> VortexResult<()> {

From 90bc59c9e076445ae19c50f80699872fb5e92f49 Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Fri, 8 May 2026 09:38:32 -0400
Subject: [PATCH 02/11] Document sparse dictionary cardinality estimation

Signed-off-by: Nicholas Gates <nick@nickgates.com>
---
 .../src/arrays/dict/vtable/cardinality.rs     | 85 +++++++++++++++++
 vortex-array/src/arrays/dict/vtable/mod.rs    | 92 +++++--------------
 vortex-duckdb/src/exporter/dict.rs            | 70 +-------------
 .../src/exporter/dict_cardinality.rs          | 84 +++++++++++++++++
 vortex-duckdb/src/exporter/mod.rs             |  1 +
 5 files changed, 193 insertions(+), 139 deletions(-)
 create mode 100644 vortex-array/src/arrays/dict/vtable/cardinality.rs
 create mode 100644 vortex-duckdb/src/exporter/dict_cardinality.rs

diff --git a/vortex-array/src/arrays/dict/vtable/cardinality.rs b/vortex-array/src/arrays/dict/vtable/cardinality.rs
new file mode 100644
index 00000000000..848abec2997
--- /dev/null
+++ b/vortex-array/src/arrays/dict/vtable/cardinality.rs
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use vortex_mask::Mask;
+
+use crate::arrays::PrimitiveArray;
+use crate::dtype::IntegerPType;
+
+const SAMPLE_SIZE: usize = 128;
+
+pub(super) fn estimate_code_cardinality<P: IntegerPType>(
+    codes: &PrimitiveArray,
+    validity_mask: &Mask,
+) -> Option<usize> {
+    let sample_count = codes.len().min(SAMPLE_SIZE);
+    let mut observed_codes = Vec::<(usize, usize)>::new();
+
+    // Sample deterministic bucket midpoints instead of using randomness. The estimate only gates
+    // whether to run the exact pass; correctness never depends on the sample.
+    for sample_idx in 0..sample_count {
+        let idx = sample_index(sample_idx, codes.len(), sample_count);
+        if !validity_mask.value(idx) {
+            continue;
+        }
+
+        let code = codes.as_slice::<P>()[idx].as_();
+        if let Some((_, count)) = observed_codes
+            .iter_mut()
+            .find(|(observed, _)| *observed == code)
+        {
+            *count += 1;
+        } else {
+            observed_codes.push((code, 1));
+        }
+    }
+
+    estimate_cardinality_from_observations(&observed_codes)
+}
+
+fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Option<usize> {
+    if observed_codes.is_empty() {
+        return None;
+    }
+
+    let unique_count = observed_codes.len();
+    let singleton_count = observed_codes
+        .iter()
+        .filter(|(_, count)| *count == 1)
+        .count();
+    let doubleton_count = observed_codes
+        .iter()
+        .filter(|(_, count)| *count == 2)
+        .count();
+
+    // Chao1-style lower-bias estimate for unseen codes. Repeated samples keep the estimate small
+    // for low-cardinality code streams; many singleton samples make dense streams look expensive.
+    let unseen_estimate = if doubleton_count == 0 {
+        singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2
+    } else {
+        div_ceil(
+            singleton_count.saturating_mul(singleton_count),
+            2 * doubleton_count,
+        )
+    };
+
+    Some(unique_count.saturating_add(unseen_estimate))
+}
+
+fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
+    debug_assert!(len > 0);
+    debug_assert!(sample_count > 0);
+
+    let sample_idx = sample_idx as u128;
+    let len = len as u128;
+    let sample_count = sample_count as u128;
+    let bucket_start = sample_idx * len / sample_count;
+    let bucket_end = (sample_idx + 1) * len / sample_count;
+
+    ((bucket_start + bucket_end) / 2).min(len - 1) as usize
+}
+
+fn div_ceil(numerator: usize, denominator: usize) -> usize {
+    debug_assert!(denominator > 0);
+    numerator / denominator + usize::from(!numerator.is_multiple_of(denominator))
+}
diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs
index dd5b87a772f..3b5070fc23e 100644
--- a/vortex-array/src/arrays/dict/vtable/mod.rs
+++ b/vortex-array/src/arrays/dict/vtable/mod.rs
@@ -54,6 +54,7 @@ use crate::scalar::Scalar;
 use crate::serde::ArrayChildren;
 use crate::validity::Validity;
 
+mod cardinality;
 mod kernel;
 mod operations;
 mod validity;
@@ -64,7 +65,6 @@ pub type DictArray = Array<Dict>;
 // TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values
 // encoding, code count, unique-code count, and exporter/canonicalization costs.
 const SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD: usize = 4;
-const SPARSE_CANONICALIZE_SAMPLE_SIZE: usize = 128;
 
 #[derive(Clone, Debug)]
 pub struct Dict;
@@ -231,7 +231,9 @@ impl VTable for Dict {
 }
 
 struct SparseDictCodes {
+    /// Original dictionary value indices that are actually referenced by the live codes.
     unique_codes: PrimitiveArray,
+    /// Codes rewritten to index into `unique_codes` instead of the original values array.
     remapped_codes: PrimitiveArray,
 }
 
@@ -244,6 +246,10 @@ fn sparse_canonicalize_dict(
         return Ok(None);
     };
 
+    // Build a temporary parent that represents `values.take(unique_codes)`. Calling
+    // `execute_parent` on the values child lets encodings such as FSST/VarBin sparse-take just
+    // the referenced dictionary values. If the child has no specialized parent execution, fall
+    // back to canonicalizing all values and then taking from the canonical array.
     let values = array.values();
     let unique_values_parent = DictArray::new(
         sparse_codes.unique_codes.clone().into_array(),
@@ -262,6 +268,9 @@ fn sparse_canonicalize_dict(
             .into_array()
     };
 
+    // Now the dictionary is dense over its compacted values, so normal dictionary execution only
+    // takes from the small `unique_values` array. This avoids `values.take(codes)` preserving a
+    // large dictionary with many unused values.
     let compact_dict = unsafe {
         DictArray::new_unchecked(sparse_codes.remapped_codes.into_array(), unique_values)
             .set_all_values_referenced(true)
@@ -281,6 +290,8 @@ fn collect_sparse_codes(
     let validity = codes.validity()?;
     let validity_mask = validity.execute_mask(codes.len(), ctx)?;
 
+    // The exact pass below scans every code and allocates a remap table sized to the values array.
+    // Do it only when a cheap upper bound/sample says the dictionary is likely sparse enough.
     if !should_collect_sparse_codes(codes, values_len, &validity_mask) {
         return Ok(None);
     }
@@ -303,6 +314,8 @@ fn should_collect_sparse_codes(
         return false;
     }
 
+    // If even the worst case "every live code is unique" is sparse, skip sampling and go straight
+    // to the exact remap pass.
     if codes
         .len()
         .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD)
@@ -311,8 +324,10 @@ fn should_collect_sparse_codes(
         return true;
     }
 
+    // Otherwise sample first. This catches cases like many live rows all referencing the same
+    // dictionary value without forcing dense dictionaries through the exact remap scan.
     let Some(estimated_unique_codes) = match_each_integer_ptype!(codes.ptype(), |P| {
-        estimate_code_cardinality::<P>(codes, validity_mask)
+        cardinality::estimate_code_cardinality::<P>(codes, validity_mask)
     }) else {
         return false;
     };
@@ -321,74 +336,6 @@ fn should_collect_sparse_codes(
         < values_len
 }
 
-fn estimate_code_cardinality<P: IntegerPType>(
-    codes: &PrimitiveArray,
-    validity_mask: &Mask,
-) -> Option<usize> {
-    let sample_count = codes.len().min(SPARSE_CANONICALIZE_SAMPLE_SIZE);
-    let mut observed_codes = Vec::<(usize, usize)>::new();
-
-    for sample_idx in 0..sample_count {
-        let idx = sample_index(sample_idx, codes.len(), sample_count);
-        if !validity_mask.value(idx) {
-            continue;
-        }
-
-        let code = codes.as_slice::<P>()[idx].as_();
-        if let Some((_, count)) = observed_codes
-            .iter_mut()
-            .find(|(observed, _)| *observed == code)
-        {
-            *count += 1;
-        } else {
-            observed_codes.push((code, 1));
-        }
-    }
-
-    if observed_codes.is_empty() {
-        return None;
-    }
-
-    let unique_count = observed_codes.len();
-    let singleton_count = observed_codes
-        .iter()
-        .filter(|(_, count)| *count == 1)
-        .count();
-    let doubleton_count = observed_codes
-        .iter()
-        .filter(|(_, count)| *count == 2)
-        .count();
-
-    let unseen_estimate = if doubleton_count == 0 {
-        singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2
-    } else {
-        div_ceil(
-            singleton_count.saturating_mul(singleton_count),
-            2 * doubleton_count,
-        )
-    };
-
-    Some(unique_count.saturating_add(unseen_estimate))
-}
-
-fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
-    debug_assert!(len > 0);
-    debug_assert!(sample_count > 0);
-
-    let sample_idx = sample_idx as u128;
-    let len = len as u128;
-    let sample_count = sample_count as u128;
-    let bucket_start = sample_idx * len / sample_count;
-    let bucket_end = (sample_idx + 1) * len / sample_count;
-
-    ((bucket_start + bucket_end) / 2).min(len - 1) as usize
-}
-
-fn div_ceil(numerator: usize, denominator: usize) -> usize {
-    debug_assert!(denominator > 0);
-    numerator / denominator + usize::from(!numerator.is_multiple_of(denominator))
-}
-
 fn collect_sparse_codes_typed<P: IntegerPType + FromPrimitive>(
     codes: &PrimitiveArray,
     values_len: usize,
@@ -399,6 +346,8 @@ fn collect_sparse_codes_typed<P: IntegerPType + FromPrimitive>(
     let mut unique_codes = Vec::new();
     let mut remapped_codes = Vec::with_capacity(codes.len());
 
+    // `value_remap[old_code]` stores the compact code assigned to an original dictionary value.
+    // `usize::MAX` means that value has not been referenced yet.
     for (idx, &code) in codes.as_slice::<P>().iter().enumerate() {
         if !validity_mask.value(idx) {
             remapped_codes.push(P::default());
@@ -421,6 +370,9 @@ fn collect_sparse_codes_typed<P: IntegerPType + FromPrimitive>(
         }));
     }
 
+    // The sample only decides whether the exact pass is worth trying. Recheck the real
+    // cardinality before compacting so a misleading sparse-looking sample cannot pessimize dense
+    // dictionaries.
     if unique_codes
         .len()
         .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD)
diff --git a/vortex-duckdb/src/exporter/dict.rs b/vortex-duckdb/src/exporter/dict.rs
index be14569a48c..c611a803d94 100644
--- a/vortex-duckdb/src/exporter/dict.rs
+++ b/vortex-duckdb/src/exporter/dict.rs
@@ -24,6 +24,7 @@ use crate::exporter::ColumnExporter;
 use crate::exporter::all_invalid;
 use crate::exporter::cache::ConversionCache;
 use crate::exporter::constant;
+use crate::exporter::dict_cardinality::estimate_code_cardinality;
 use crate::exporter::new_array_exporter;
 
 struct DictExporter<I: IntegerPType> {
@@ -36,7 +37,6 @@ struct DictExporter<I: IntegerPType> {
 // TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values
 // encoding, code count, unique-code count, and exporter/canonicalization costs.
 const SPARSE_EXPORT_CODES_PER_VALUE_THRESHOLD: usize = 4;
-const SPARSE_EXPORT_SAMPLE_SIZE: usize = 128;
 
 pub(crate) fn new_exporter_with_flatten(
     array: &DictArray,
@@ -155,74 +155,6 @@ fn should_export_sparse(codes: &PrimitiveArray, values_len: usize, codes_mask: &
     estimated_unique_codes.saturating_mul(SPARSE_EXPORT_CODES_PER_VALUE_THRESHOLD) < values_len
 }
 
-fn estimate_code_cardinality<I: IntegerPType>(
-    codes: &PrimitiveArray,
-    codes_mask: &Mask,
-) -> Option<usize> {
-    let sample_count = codes.len().min(SPARSE_EXPORT_SAMPLE_SIZE);
-    let mut observed_codes = Vec::<(usize, usize)>::new();
-
-    for sample_idx in 0..sample_count {
-        let idx = sample_index(sample_idx, codes.len(), sample_count);
-        if !codes_mask.value(idx) {
-            continue;
-        }
-
-        let code = codes.as_slice::<I>()[idx].as_();
-        if let Some((_, count)) = observed_codes
-            .iter_mut()
-            .find(|(observed, _)| *observed == code)
-        {
-            *count += 1;
-        } else {
-            observed_codes.push((code, 1));
-        }
-    }
-
-    if observed_codes.is_empty() {
-        return None;
-    }
-
-    let unique_count = observed_codes.len();
-    let singleton_count = observed_codes
-        .iter()
-        .filter(|(_, count)| *count == 1)
-        .count();
-    let doubleton_count = observed_codes
-        .iter()
-        .filter(|(_, count)| *count == 2)
-        .count();
-
-    let unseen_estimate = if doubleton_count == 0 {
-        singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2
-    } else {
-        div_ceil(
-            singleton_count.saturating_mul(singleton_count),
-            2 * doubleton_count,
-        )
-    };
-
-    Some(unique_count.saturating_add(unseen_estimate))
-}
-
-fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
-    debug_assert!(len > 0);
-    debug_assert!(sample_count > 0);
-
-    let sample_idx = sample_idx as u128;
-    let len = len as u128;
-    let sample_count = sample_count as u128;
-    let bucket_start = sample_idx * len / sample_count;
-    let bucket_end = (sample_idx + 1) * len / sample_count;
-
-    ((bucket_start + bucket_end) / 2).min(len - 1) as usize
-}
-
-fn div_ceil(numerator: usize, denominator: usize) -> usize {
-    debug_assert!(denominator > 0);
-    numerator / denominator + usize::from(!numerator.is_multiple_of(denominator))
-}
-
 impl<I: IntegerPType + AsPrimitive<u32>> ColumnExporter for DictExporter<I> {
     fn export(
         &self,
diff --git a/vortex-duckdb/src/exporter/dict_cardinality.rs b/vortex-duckdb/src/exporter/dict_cardinality.rs
new file mode 100644
index 00000000000..37f0f55dfb9
--- /dev/null
+++ b/vortex-duckdb/src/exporter/dict_cardinality.rs
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use vortex::array::arrays::PrimitiveArray;
+use vortex::dtype::IntegerPType;
+use vortex::mask::Mask;
+
+const SAMPLE_SIZE: usize = 128;
+
+pub(super) fn estimate_code_cardinality<I: IntegerPType>(
+    codes: &PrimitiveArray,
+    codes_mask: &Mask,
+) -> Option<usize> {
+    let sample_count = codes.len().min(SAMPLE_SIZE);
+    let mut observed_codes = Vec::<(usize, usize)>::new();
+
+    // This mirrors the array-side sparse dictionary gate. The exporter needs the estimate before
+    // it decides between a reusable DuckDB dictionary and executing the Vortex dictionary away.
+    // Correctness does not depend on the estimate; it only decides whether to take the compacting
+    // path.
+    for sample_idx in 0..sample_count {
+        let idx = sample_index(sample_idx, codes.len(), sample_count);
+        if !codes_mask.value(idx) {
+            continue;
+        }
+
+        let code = codes.as_slice::<I>()[idx].as_();
+        if let Some((_, count)) = observed_codes
+            .iter_mut()
+            .find(|(observed, _)| *observed == code)
+        {
+            *count += 1;
+        } else {
+            observed_codes.push((code, 1));
+        }
+    }
+
+    estimate_cardinality_from_observations(&observed_codes)
+}
+
+fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Option<usize> {
+    if observed_codes.is_empty() {
+        return None;
+    }
+
+    let unique_count = observed_codes.len();
+    let singleton_count = observed_codes
+        .iter()
+        .filter(|(_, count)| *count == 1)
+        .count();
+    let doubleton_count = observed_codes
+        .iter()
+        .filter(|(_, count)| *count == 2)
+        .count();
+
+    let unseen_estimate = if doubleton_count == 0 {
+        singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2
+    } else {
+        div_ceil(
+            singleton_count.saturating_mul(singleton_count),
+            2 * doubleton_count,
+        )
+    };
+
+    Some(unique_count.saturating_add(unseen_estimate))
+}
+
+fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
+    debug_assert!(len > 0);
+    debug_assert!(sample_count > 0);
+
+    let sample_idx = sample_idx as u128;
+    let len = len as u128;
+    let sample_count = sample_count as u128;
+    let bucket_start = sample_idx * len / sample_count;
+    let bucket_end = (sample_idx + 1) * len / sample_count;
+
+    ((bucket_start + bucket_end) / 2).min(len - 1) as usize
+}
+
+fn div_ceil(numerator: usize, denominator: usize) -> usize {
+    debug_assert!(denominator > 0);
+    numerator / denominator + usize::from(!numerator.is_multiple_of(denominator))
+}
diff --git a/vortex-duckdb/src/exporter/mod.rs b/vortex-duckdb/src/exporter/mod.rs
index 517776f5521..9c01e2f35e5 100644
--- a/vortex-duckdb/src/exporter/mod.rs
+++ b/vortex-duckdb/src/exporter/mod.rs
@@ -7,6 +7,7 @@ mod cache;
 mod constant;
 mod decimal;
 mod dict;
+mod dict_cardinality;
 mod fixed_size_list;
 mod list;
 mod list_view;

From 4137b92ab4401f849332cea9065a26518ee46944 Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Fri, 8 May 2026 09:49:13 -0400
Subject: [PATCH 03/11] Document sparse cardinality estimators

Signed-off-by: Nicholas Gates <nick@nickgates.com>
---
 .../src/arrays/dict/vtable/cardinality.rs     | 19 +++++++++++++++++++
 .../src/exporter/dict_cardinality.rs          | 18 ++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/vortex-array/src/arrays/dict/vtable/cardinality.rs b/vortex-array/src/arrays/dict/vtable/cardinality.rs
index 848abec2997..d97b26fd3e5 100644
--- a/vortex-array/src/arrays/dict/vtable/cardinality.rs
+++ b/vortex-array/src/arrays/dict/vtable/cardinality.rs
@@ -1,6 +1,12 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+//! Sampling-based cardinality estimation for dictionary codes.
+//!
+//! This module is used only as a cheap gate before the exact sparse-dictionary remap pass. The
+//! estimate may be conservative or noisy, but correctness does not depend on it: callers must still
+//! collect the exact unique code set and re-check the sparse threshold before compacting.
+
 use vortex_mask::Mask;
 
 use crate::arrays::PrimitiveArray;
@@ -8,6 +14,11 @@ use crate::dtype::IntegerPType;
 
 const SAMPLE_SIZE: usize = 128;
 
+/// Estimate the number of distinct non-null dictionary codes.
+///
+/// The estimator samples deterministic bucket midpoints so repeated executions make the same
+/// compaction decision for the same input. Returning `None` means no valid sampled codes were seen.
+/// A returned value should only be used to decide whether an exact pass is worth attempting.
 pub(super) fn estimate_code_cardinality<P: IntegerPType>(
     codes: &PrimitiveArray,
     validity_mask: &Mask,
@@ -37,6 +48,10 @@ pub(super) fn estimate_code_cardinality<P: IntegerPType>(
     estimate_cardinality_from_observations(&observed_codes)
 }
 
+/// Estimate total cardinality from `(code, observed_count)` sample observations.
+///
+/// The correction is Chao1-style: singleton-heavy samples imply more unseen codes, while repeated
+/// observations imply the code stream is likely low-cardinality.
 fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Option<usize> {
     if observed_codes.is_empty() {
         return None;
@@ -66,6 +81,10 @@ fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) ->
     Some(unique_count.saturating_add(unseen_estimate))
 }
 
+/// Return the midpoint index for one deterministic sampling bucket.
+///
+/// Splitting the full code range into buckets avoids clustering all samples near the start while
+/// avoiding RNG state in a hot execution path.
 fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
     debug_assert!(len > 0);
     debug_assert!(sample_count > 0);
diff --git a/vortex-duckdb/src/exporter/dict_cardinality.rs b/vortex-duckdb/src/exporter/dict_cardinality.rs
index 37f0f55dfb9..1a56fa1a954 100644
--- a/vortex-duckdb/src/exporter/dict_cardinality.rs
+++ b/vortex-duckdb/src/exporter/dict_cardinality.rs
@@ -1,12 +1,22 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+//! Sampling-based cardinality estimation for DuckDB dictionary export.
+//!
+//! The exporter uses this as a cheap routing hint before choosing between a reusable DuckDB
+//! dictionary and executing the Vortex dictionary into a flat vector. Correctness does not depend on
+//! the estimate: the compacting path still executes Vortex's dictionary canonicalization logic.
+
 use vortex::array::arrays::PrimitiveArray;
 use vortex::dtype::IntegerPType;
 use vortex::mask::Mask;
 
 const SAMPLE_SIZE: usize = 128;
 
+/// Estimate the number of distinct non-null dictionary codes in a DuckDB export batch.
+///
+/// Returning `None` means no valid sampled codes were seen. A returned estimate should be treated
+/// only as a cost signal for whether the exporter should call into Vortex dictionary execution.
 pub(super) fn estimate_code_cardinality<I: IntegerPType>(
     codes: &PrimitiveArray,
     codes_mask: &Mask,
@@ -38,6 +48,10 @@ pub(super) fn estimate_code_cardinality<I: IntegerPType>(
     estimate_cardinality_from_observations(&observed_codes)
 }
 
+/// Estimate total cardinality from `(code, observed_count)` sample observations.
+///
+/// The correction is Chao1-style: singleton-heavy samples imply more unseen codes, while repeated
+/// observations imply the selected code stream is likely low-cardinality.
 fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Option<usize> {
     if observed_codes.is_empty() {
         return None;
@@ -65,6 +79,10 @@ fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) ->
     Some(unique_count.saturating_add(unseen_estimate))
 }
 
+/// Return the midpoint index for one deterministic sampling bucket.
+///
+/// Bucket midpoint sampling gives coverage across the whole code vector without introducing RNG
+/// state or nondeterministic exporter decisions.
 fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
     debug_assert!(len > 0);
     debug_assert!(sample_count > 0);

From 4d65cf147d8b57270881cfb457ac9f7c6f76bda8 Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Fri, 8 May 2026 11:23:32 -0400
Subject: [PATCH 04/11] Share dictionary referenced-values mask

Signed-off-by: Nicholas Gates <nick@nickgates.com>
---
 vortex-array/src/arrays/dict/array.rs      | 91 +++++++++++++---------
 vortex-array/src/arrays/dict/vtable/mod.rs | 60 +++++++-------
 2 files changed, 85 insertions(+), 66 deletions(-)

diff --git a/vortex-array/src/arrays/dict/array.rs b/vortex-array/src/arrays/dict/array.rs
index eafbc9f6642..0bb998c88c8 100644
--- a/vortex-array/src/arrays/dict/array.rs
+++ b/vortex-array/src/arrays/dict/array.rs
@@ -10,6 +10,7 @@ use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
 use vortex_error::vortex_ensure;
 use vortex_mask::AllOr;
+use vortex_mask::Mask;
 
 use crate::ArrayRef;
 use crate::LEGACY_SESSION;
@@ -21,6 +22,7 @@ use crate::array::ArrayParts;
 use crate::array::TypedArrayRef;
 use crate::array_slots;
 use crate::arrays::Dict;
+use crate::arrays::PrimitiveArray;
 use crate::dtype::DType;
 use crate::dtype::PType;
 use crate::match_each_integer_ptype;
@@ -148,46 +150,63 @@ pub trait DictArrayExt: TypedArrayRef<Dict> + DictArraySlotsExt {
             .execute_mask(codes.len(), &mut LEGACY_SESSION.create_execution_ctx())?;
         #[expect(deprecated)]
         let codes_primitive = self.codes().to_primitive();
-        let values_len = self.values().len();
-
-        let init_value = !referenced;
-        let referenced_value = referenced;
-
-        let mut values_vec = vec![init_value; values_len];
-        match codes_validity.bit_buffer() {
-            AllOr::All => {
-                match_each_integer_ptype!(codes_primitive.ptype(), |P| {
-                    #[allow(
-                        clippy::cast_possible_truncation,
-                        clippy::cast_sign_loss,
-                        reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index"
-                    )]
-                    for &idx in codes_primitive.as_slice::<P>() {
-                        values_vec[idx as usize] = referenced_value;
-                    }
-                });
-            }
-            AllOr::None => {}
-            AllOr::Some(mask) => {
-                match_each_integer_ptype!(codes_primitive.ptype(), |P| {
-                    let codes = codes_primitive.as_slice::<P>();
-
-                    #[allow(
-                        clippy::cast_possible_truncation,
-                        clippy::cast_sign_loss,
-                        reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index"
-                    )]
-                    mask.set_indices().for_each(|idx| {
-                        values_vec[codes[idx] as usize] = referenced_value;
-                    });
+        compute_referenced_values_mask_from_codes(
+            &codes_primitive,
+            self.values().len(),
+            &codes_validity,
+            referenced,
+        )
+    }
+}
+impl<T: TypedArrayRef<Dict>> DictArrayExt for T {}
+
+/// Build an exact bitmap over dictionary values referenced by valid codes.
+///
+/// The sampling-based sparse dictionary estimator only decides whether an exact pass is likely to
+/// be worthwhile. This helper is the exact pass: aggregate kernels use it to ignore unreferenced
+/// values, and sparse dictionary canonicalization uses it to compact values before remapping codes.
+pub(crate) fn compute_referenced_values_mask_from_codes(
+    codes_primitive: &PrimitiveArray,
+    values_len: usize,
+    codes_validity: &Mask,
+    referenced: bool,
+) -> VortexResult<BitBuffer> {
+    let init_value = !referenced;
+    let referenced_value = referenced;
+
+    let mut values_vec = vec![init_value; values_len];
+    match codes_validity.bit_buffer() {
+        AllOr::All => {
+            match_each_integer_ptype!(codes_primitive.ptype(), |P| {
+                #[allow(
+                    clippy::cast_possible_truncation,
+                    clippy::cast_sign_loss,
+                    reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index"
+                )]
+                for &idx in codes_primitive.as_slice::<P>() {
+                    values_vec[idx as usize] = referenced_value;
+                }
+            });
+        }
+        AllOr::None => {}
+        AllOr::Some(mask) => {
+            match_each_integer_ptype!(codes_primitive.ptype(), |P| {
+                let codes = codes_primitive.as_slice::<P>();
+
+                #[allow(
+                    clippy::cast_possible_truncation,
+                    clippy::cast_sign_loss,
+                    reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index"
+                )]
+                mask.set_indices().for_each(|idx| {
+                    values_vec[codes[idx] as usize] = referenced_value;
                 });
-            }
+            });
         }
-
-        Ok(BitBuffer::from(values_vec))
     }
+
+    Ok(BitBuffer::from(values_vec))
 }
-impl<T: TypedArrayRef<Dict>> DictArrayExt for T {}
 
 /// Concrete parts of a [`DictArray`](super::DictArray) after iterative execution.
 pub struct DictParts {
diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs
index 3b5070fc23e..0a8f9c3f8aa 100644
--- a/vortex-array/src/arrays/dict/vtable/mod.rs
+++ b/vortex-array/src/arrays/dict/vtable/mod.rs
@@ -6,6 +6,7 @@ use std::hash::Hasher;
 use kernel::PARENT_KERNELS;
 use num_traits::FromPrimitive;
 use prost::Message;
+use vortex_buffer::BitBuffer;
 use vortex_buffer::Buffer;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
@@ -22,6 +23,7 @@ use super::DictOwnedExt;
 use super::DictParts;
 use super::array::DictSlots;
 use super::array::DictSlotsView;
+use super::array::compute_referenced_values_mask_from_codes;
 use crate::AnyCanonical;
 use crate::ArrayEq;
 use crate::ArrayHash;
@@ -296,11 +298,16 @@ fn collect_sparse_codes(
         return Ok(None);
     }
 
-    let Some(sparse_codes) = match_each_integer_ptype!(codes.ptype(), |P| {
-        collect_sparse_codes_typed::<P>(codes, values_len, validity_mask, validity)?
-    }) else {
+    let referenced_values =
+        compute_referenced_values_mask_from_codes(codes, values_len, &validity_mask, true)?;
+    let unique_count = referenced_values.true_count();
+    if unique_count.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) >= values_len {
         return Ok(None);
-    };
+    }
+
+    let sparse_codes = match_each_integer_ptype!(codes.ptype(), |P| {
+        collect_sparse_codes_typed::<P>(codes, referenced_values, validity_mask, validity)?
+    });
 
     Ok(Some(sparse_codes))
 }
@@ -338,16 +345,24 @@ fn should_collect_sparse_codes(
 
 fn collect_sparse_codes_typed<P: IntegerPType + FromPrimitive>(
     codes: &PrimitiveArray,
-    values_len: usize,
+    referenced_values: BitBuffer,
     validity_mask: Mask,
     validity: Validity,
-) -> VortexResult<Option<SparseDictCodes>> {
-    let mut value_remap = vec![usize::MAX; values_len];
-    let mut unique_codes = Vec::new();
-    let mut remapped_codes = Vec::with_capacity(codes.len());
+) -> VortexResult<SparseDictCodes> {
+    let unique_count = referenced_values.true_count();
+    let mut value_remap = vec![usize::MAX; referenced_values.len()];
+    let mut unique_codes = Vec::with_capacity(unique_count);
+
+    // Reuse the same exact referenced-values bitmap as the dictionary aggregate kernels. Walking
+    // the bitmap assigns compact codes in original dictionary order, which keeps compaction
+    // deterministic and independent of the first live row that happened to reference each value.
+    for old_code in referenced_values.set_indices() {
+        let new_code = unique_codes.len();
+        value_remap[old_code] = new_code;
+        unique_codes.push(old_code as u64);
+    }
 
-    // `value_remap[old_code]` stores the compact code assigned to an original dictionary value.
-    // `usize::MAX` means that value has not been referenced yet.
+    let mut remapped_codes = Vec::with_capacity(codes.len());
     for (idx, &code) in codes.as_slice::<P>().iter().enumerate() {
         if !validity_mask.value(idx) {
             remapped_codes.push(P::default());
@@ -355,12 +370,8 @@ fn collect_sparse_codes_typed<P: IntegerPType + FromPrimitive>(
         }
 
         let old_code = code.as_();
-        let mut new_code = value_remap[old_code];
-        if new_code == usize::MAX {
-            new_code = unique_codes.len();
-            value_remap[old_code] = new_code;
-            unique_codes.push(old_code as u64);
-        }
+        let new_code = value_remap[old_code];
+        debug_assert_ne!(new_code, usize::MAX);
 
         remapped_codes.push(P::from_usize(new_code).unwrap_or_else(|| {
             vortex_panic!(
@@ -370,21 +381,10 @@ fn collect_sparse_codes_typed<P: IntegerPType + FromPrimitive>(
         }));
     }
 
-    // The sample only decides whether the exact pass is worth trying. Recheck the real
-    // cardinality before compacting so a misleading sparse-looking sample cannot pessimize dense
-    // dictionaries.
-    if unique_codes
-        .len()
-        .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD)
-        >= values_len
-    {
-        return Ok(None);
-    }
-
-    Ok(Some(SparseDictCodes {
+    Ok(SparseDictCodes {
         unique_codes: PrimitiveArray::new(Buffer::from_iter(unique_codes), Validity::NonNullable),
         remapped_codes: PrimitiveArray::new(Buffer::from_iter(remapped_codes), validity),
-    }))
+    })
 }
 
 #[cfg(test)]

From 4ab2dc5d3086e0d1a5f1f9004dd91a3e70209fee Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Fri, 8 May 2026 11:44:48 -0400
Subject: [PATCH 05/11] Skip sparse dict compaction for referenced dictionaries

Signed-off-by: Nicholas Gates <nick@nickgates.com>
---
 vortex-array/src/arrays/dict/vtable/mod.rs | 7 +++++++
 vortex-duckdb/src/exporter/dict.rs         | 6 +++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs
index b03f356460e..6b93f7029fa 100644
--- a/vortex-array/src/arrays/dict/vtable/mod.rs
+++ b/vortex-array/src/arrays/dict/vtable/mod.rs
@@ -244,6 +244,13 @@ fn sparse_canonicalize_dict(
     array: &DictArray,
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<Option<Canonical>> {
+    // If metadata tells us every dictionary value is referenced, there is no garbage to compact.
+    // This also keeps hot paths such as dictionary comparisons from paying the sparse estimator
+    // cost when they produce dense, all-referenced result dictionaries.
+    if array.has_all_values_referenced() {
+        return Ok(None);
+    }
+
     let codes = array.codes().as_::<Primitive>().into_owned();
     let Some(sparse_codes) = collect_sparse_codes(&codes, array.values().len(), ctx)? else {
         return Ok(None);
diff --git a/vortex-duckdb/src/exporter/dict.rs b/vortex-duckdb/src/exporter/dict.rs
index 616f519fcf0..0728779cdf6 100644
--- a/vortex-duckdb/src/exporter/dict.rs
+++ b/vortex-duckdb/src/exporter/dict.rs
@@ -11,6 +11,7 @@ use vortex::array::arrays::Constant;
 use vortex::array::arrays::ConstantArray;
 use vortex::array::arrays::DictArray;
 use vortex::array::arrays::PrimitiveArray;
+use vortex::array::arrays::dict::DictArrayExt;
 use vortex::array::arrays::dict::DictArraySlotsExt;
 use vortex::array::match_each_integer_ptype;
 use vortex::dtype::IntegerPType;
@@ -74,7 +75,10 @@ pub(crate) fn new_exporter_with_flatten(
     let values_key = values.addr();
     let codes = codes_array.clone().execute::<PrimitiveArray>(ctx)?;
 
-    if !flatten && should_export_sparse(&codes, values.len(), &codes_mask) {
+    if !flatten
+        && !array.has_all_values_referenced()
+        && should_export_sparse(&codes, values.len(), &codes_mask)
+    {
         return new_array_exporter(
             array
                 .clone()

From 13ae594513a28f4b4fb56919bb847156fd9cc210 Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Fri, 8 May 2026 12:03:22 -0400
Subject: [PATCH 06/11] Avoid sparse dict overhead on dense paths

Signed-off-by: Nicholas Gates <nick@nickgates.com>
---
 vortex-array/src/arrays/dict/compute/slice.rs | 68 +++++++++++++++++--
 .../src/arrays/dict/vtable/cardinality.rs     | 30 ++++++++
 vortex-array/src/arrays/dict/vtable/mod.rs    | 18 ++---
 3 files changed, 101 insertions(+), 15 deletions(-)

diff --git a/vortex-array/src/arrays/dict/compute/slice.rs b/vortex-array/src/arrays/dict/compute/slice.rs
index 509fa7535d8..04434bb593a 100644
--- a/vortex-array/src/arrays/dict/compute/slice.rs
+++ b/vortex-array/src/arrays/dict/compute/slice.rs
@@ -12,25 +12,36 @@ use crate::arrays::Constant;
 use crate::arrays::ConstantArray;
 use crate::arrays::Dict;
 use crate::arrays::DictArray;
+use crate::arrays::dict::DictArrayExt;
 use crate::arrays::dict::DictArraySlotsExt;
 use crate::arrays::slice::SliceReduce;
 use crate::scalar::Scalar;
 
 impl SliceReduce for Dict {
     fn slice(array: ArrayView<'_, Self>, range: Range<usize>) -> VortexResult<Option<ArrayRef>> {
+        let is_full_slice = range.len() == array.len();
         let sliced_code = array.codes().slice(range)?;
         // TODO(joe): if the range is size 1 replace with a constant array
         if let Some(code) = sliced_code.as_opt::<Constant>() {
             let code = code.scalar().as_primitive().as_::<usize>();
             return if let Some(code) = code {
                 let values = array.values().slice(code..code + 1)?;
-                Ok(Some(
-                    DictArray::new(
+                // SAFETY: the only dictionary value is referenced by every non-null code when the
+                // slice is non-empty. An empty code stream cannot reference a non-empty values
+                // array.
+                let sliced = unsafe {
+                    DictArray::new_unchecked(
                         ConstantArray::new(0u8, sliced_code.len()).into_array(),
                         values,
                     )
-                    .into_array(),
-                ))
+                };
+                if sliced_code.is_empty() {
+                    Ok(Some(sliced.into_array()))
+                } else {
+                    Ok(Some(unsafe {
+                        sliced.set_all_values_referenced(true).into_array()
+                    }))
+                }
             } else {
                 Ok(Some(
                     ConstantArray::new(Scalar::null(array.dtype().clone()), sliced_code.len())
@@ -39,9 +50,18 @@ impl SliceReduce for Dict {
             };
         }
         // SAFETY: slicing the codes preserves invariants.
-        Ok(Some(
-            unsafe { DictArray::new_unchecked(sliced_code, array.values().clone()) }.into_array(),
-        ))
+        let sliced = unsafe { DictArray::new_unchecked(sliced_code, array.values().clone()) };
+        if is_full_slice {
+            // A full-length slice preserves the exact code stream, so the referenced-values
+            // metadata remains sound. Partial slices may drop the only reference to a value.
+            Ok(Some(unsafe {
+                sliced
+                    .set_all_values_referenced(array.has_all_values_referenced())
+                    .into_array()
+            }))
+        } else {
+            Ok(Some(sliced.into_array()))
+        }
     }
 }
 
@@ -51,8 +71,10 @@ mod tests {
     use vortex_error::VortexResult;
 
     use crate::IntoArray;
+    use crate::arrays::Dict;
     use crate::arrays::DictArray;
     use crate::arrays::PrimitiveArray;
+    use crate::arrays::dict::DictArrayExt;
     use crate::arrays::dict::compute::slice::ConstantArray;
     use crate::assert_arrays_eq;
     use crate::dtype::DType;
@@ -84,4 +106,36 @@ mod tests {
         assert_arrays_eq!(sliced, expected);
         Ok(())
     }
+
+    #[test]
+    fn full_slice_preserves_all_values_referenced_metadata() -> VortexResult<()> {
+        let dict = unsafe {
+            DictArray::new_unchecked(
+                buffer![0u8, 1].into_array(),
+                buffer![10i32, 20].into_array(),
+            )
+            .set_all_values_referenced(true)
+        };
+
+        let sliced = dict.slice(0..2)?;
+
+        assert!(sliced.as_::<Dict>().has_all_values_referenced());
+        Ok(())
+    }
+
+    #[test]
+    fn partial_slice_drops_all_values_referenced_metadata() -> VortexResult<()> {
+        let dict = unsafe {
+            DictArray::new_unchecked(
+                buffer![0u8, 1].into_array(),
+                buffer![10i32, 20].into_array(),
+            )
+            .set_all_values_referenced(true)
+        };
+
+        let sliced = dict.slice(0..1)?;
+
+        assert!(!sliced.as_::<Dict>().has_all_values_referenced());
+        Ok(())
+    }
 }
diff --git a/vortex-array/src/arrays/dict/vtable/cardinality.rs b/vortex-array/src/arrays/dict/vtable/cardinality.rs
index d97b26fd3e5..ad8fa24bdd0 100644
--- a/vortex-array/src/arrays/dict/vtable/cardinality.rs
+++ b/vortex-array/src/arrays/dict/vtable/cardinality.rs
@@ -13,6 +13,36 @@ use crate::arrays::PrimitiveArray;
 use crate::dtype::IntegerPType;
 
 const SAMPLE_SIZE: usize = 128;
+const REPEATED_CODE_PROBE_SIZE: usize = 16;
+
+/// Return whether a small deterministic probe observes a repeated non-null code.
+///
+/// Sparse canonicalization always has a cheap worst-case gate before it samples. This probe is the
+/// next, cheaper filter for cases that are not sparse by row count alone: dense dictionaries should
+/// not pay the full estimator cost unless the code stream first shows evidence of repeated codes.
+/// A `true` result only means "run the estimator"; it is not enough to compact by itself.
+pub(super) fn has_repeated_code_sample<P: IntegerPType>(
+    codes: &PrimitiveArray,
+    validity_mask: &Mask,
+) -> bool {
+    let sample_count = codes.len().min(REPEATED_CODE_PROBE_SIZE);
+    let mut observed_codes = Vec::<usize>::with_capacity(sample_count);
+
+    for sample_idx in 0..sample_count {
+        let idx = sample_index(sample_idx, codes.len(), sample_count);
+        if !validity_mask.value(idx) {
+            continue;
+        }
+
+        let code = codes.as_slice::<P>()[idx].as_();
+        if observed_codes.contains(&code) {
+            return true;
+        }
+        observed_codes.push(code);
+    }
+
+    false
+}
 
 /// Estimate the number of distinct non-null dictionary codes.
 ///
diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs
index 6b93f7029fa..8ece354aee7 100644
--- a/vortex-array/src/arrays/dict/vtable/mod.rs
+++ b/vortex-array/src/arrays/dict/vtable/mod.rs
@@ -200,7 +200,9 @@ impl VTable for Dict {
             )));
         }
 
-        if let Some(canonical) = sparse_canonicalize_dict(&array, ctx)? {
+        if !array.has_all_values_referenced()
+            && let Some(canonical) = sparse_canonicalize_dict(&array, ctx)?
+        {
             return Ok(ExecutionResult::done(canonical));
         }
 
@@ -244,13 +246,6 @@ fn sparse_canonicalize_dict(
     array: &DictArray,
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<Option<Canonical>> {
-    // If metadata tells us every dictionary value is referenced, there is no garbage to compact.
-    // This also keeps hot paths such as dictionary comparisons from paying the sparse estimator
-    // cost when they produce dense, all-referenced result dictionaries.
-    if array.has_all_values_referenced() {
-        return Ok(None);
-    }
-
     let codes = array.codes().as_::<Primitive>().into_owned();
     let Some(sparse_codes) = collect_sparse_codes(&codes, array.values().len(), ctx)? else {
         return Ok(None);
@@ -341,6 +336,13 @@ fn should_collect_sparse_codes(
 
     // Otherwise sample first. This catches cases like many live rows all referencing the same
     // dictionary value without forcing dense dictionaries through the exact remap scan.
+    let worth_sampling = match_each_integer_ptype!(codes.ptype(), |P| {
+        cardinality::has_repeated_code_sample::<P>(codes, validity_mask)
+    });
+    if !worth_sampling {
+        return false;
+    }
+
     let Some(estimated_unique_codes) = match_each_integer_ptype!(codes.ptype(), |P| {
         cardinality::estimate_code_cardinality::<P>(codes, validity_mask)
     }) else {

From fd59336d825aa66a13f75a6dbea17544ad694883 Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Fri, 8 May 2026 12:27:59 -0400
Subject: [PATCH 07/11] Gate sparse dict sampling by shape

Signed-off-by: Nicholas Gates <nick@nickgates.com>
---
 vortex-array/src/arrays/dict/vtable/mod.rs | 34 ++++++++++++++++++----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs
index 8ece354aee7..cbe8d7bf8c9 100644
--- a/vortex-array/src/arrays/dict/vtable/mod.rs
+++ b/vortex-array/src/arrays/dict/vtable/mod.rs
@@ -68,6 +68,8 @@ pub type DictArray = Array<Dict>;
 // TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values
 // encoding, code count, unique-code count, and exporter/canonicalization costs.
 const SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD: usize = 4;
+const SPARSE_CANONICALIZE_SAMPLED_CODES_PER_VALUE_THRESHOLD: usize = 2;
+const SPARSE_CANONICALIZE_MIN_SAMPLED_VALUES_LEN: usize = 512;
 
 #[derive(Clone, Debug)]
 pub struct Dict;
@@ -200,7 +202,8 @@ impl VTable for Dict {
             )));
         }
 
-        if !array.has_all_values_referenced()
+        if should_consider_sparse_canonicalize(array.codes().len(), array.values().len())
+            && !array.has_all_values_referenced()
             && let Some(canonical) = sparse_canonicalize_dict(&array, ctx)?
         {
             return Ok(ExecutionResult::done(canonical));
@@ -242,6 +245,8 @@ struct SparseDictCodes {
     remapped_codes: PrimitiveArray,
 }
 
+#[cold]
+#[inline(never)]
 fn sparse_canonicalize_dict(
     array: &DictArray,
     ctx: &mut ExecutionCtx,
@@ -334,6 +339,10 @@ fn should_collect_sparse_codes(
         return true;
     }
 
+    if !should_sample_sparse_canonicalize(codes.len(), values_len) {
+        return false;
+    }
+
     // Otherwise sample first. This catches cases like many live rows all referencing the same
     // dictionary value without forcing dense dictionaries through the exact remap scan.
     let worth_sampling = match_each_integer_ptype!(codes.ptype(), |P| {
@@ -353,6 +362,21 @@ fn should_collect_sparse_codes(
         < values_len
 }
 
+#[inline]
+fn should_consider_sparse_canonicalize(codes_len: usize, values_len: usize) -> bool {
+    codes_len.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) < values_len
+        || should_sample_sparse_canonicalize(codes_len, values_len)
+}
+
+#[inline]
+fn should_sample_sparse_canonicalize(codes_len: usize, values_len: usize) -> bool {
+    // Sampling is only a preflight for cases that are not sparse by row count alone. Keep it away
+    // from tiny dictionary domains and near-dense slices where the estimator overhead dominates.
+    values_len >= SPARSE_CANONICALIZE_MIN_SAMPLED_VALUES_LEN
+        && codes_len.saturating_mul(SPARSE_CANONICALIZE_SAMPLED_CODES_PER_VALUE_THRESHOLD)
+            < values_len
+}
+
 fn collect_sparse_codes_typed<P: IntegerPType + FromPrimitive>(
     codes: &PrimitiveArray,
     referenced_values: BitBuffer,
@@ -431,7 +455,7 @@ mod tests {
     fn sampled_sparse_codes_remaps_repeated_large_codes() -> VortexResult<()> {
         let codes = PrimitiveArray::from_iter((0..1024).map(|_| 42u32));
         let Some(sparse) =
-            collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())?
+            collect_sparse_codes(&codes, 3000, &mut LEGACY_SESSION.create_execution_ctx())?
         else {
             panic!("sampled codes are sparse");
         };
@@ -450,10 +474,10 @@ mod tests {
 
     #[test]
     fn dense_sample_skips_sparse_code_collection() -> VortexResult<()> {
-        let codes = PrimitiveArray::from_iter((0..1024).map(|idx| (idx % 100) as u32));
+        let codes = PrimitiveArray::from_iter((0..1024).map(|idx| idx as u32));
 
         assert!(
-            collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())?
+            collect_sparse_codes(&codes, 3000, &mut LEGACY_SESSION.create_execution_ctx())?
                 .is_none()
         );
 
@@ -484,7 +508,7 @@ mod tests {
     fn sampled_sparse_dict_canonicalizes_repeated_codes() -> VortexResult<()> {
         let dict = DictArray::new(
             PrimitiveArray::from_iter((0..1024).map(|_| 42u32)).into_array(),
-            PrimitiveArray::from_iter(0..100i32).into_array(),
+            PrimitiveArray::from_iter(0..3000i32).into_array(),
         );
 
         let actual = dict

From 58bb1d444f3c6d3b1afd52bc4a836f4bf71f6184 Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Fri, 8 May 2026 12:36:44 -0400
Subject: [PATCH 08/11] Keep sparse dict helpers cold

Signed-off-by: Nicholas Gates <nick@nickgates.com>
---
 vortex-array/src/arrays/dict/vtable/mod.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs
index cbe8d7bf8c9..74d96d2575b 100644
--- a/vortex-array/src/arrays/dict/vtable/mod.rs
+++ b/vortex-array/src/arrays/dict/vtable/mod.rs
@@ -292,6 +292,8 @@ fn sparse_canonicalize_dict(
         .map(Some)
 }
 
+#[cold]
+#[inline(never)]
 fn collect_sparse_codes(
     codes: &PrimitiveArray,
     values_len: usize,
@@ -320,6 +322,8 @@ fn collect_sparse_codes(
     Ok(Some(sparse_codes))
 }
 
+#[cold]
+#[inline(never)]
 fn should_collect_sparse_codes(
     codes: &PrimitiveArray,
     values_len: usize,
@@ -377,6 +381,8 @@ fn should_sample_sparse_canonicalize(codes_len: usize, values_len: usize) -> boo
             < values_len
 }
 
+#[cold]
+#[inline(never)]
 fn collect_sparse_codes_typed<P: IntegerPType + FromPrimitive>(
     codes: &PrimitiveArray,
     referenced_values: BitBuffer,

From d6e109619adbcdd1a019972fe6a2e37cc27561f5 Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Fri, 8 May 2026 13:16:46 -0400
Subject: [PATCH 09/11] Avoid sparse dict compaction over filters

Signed-off-by: Nicholas Gates <nick@nickgates.com>
---
 .../src/arrays/dict/vtable/cardinality.rs     |  17 +-
 vortex-array/src/arrays/dict/vtable/mod.rs    | 314 +----------------
 vortex-array/src/arrays/dict/vtable/sparse.rs | 320 ++++++++++++++++++
 3 files changed, 335 insertions(+), 316 deletions(-)
 create mode 100644 vortex-array/src/arrays/dict/vtable/sparse.rs

diff --git a/vortex-array/src/arrays/dict/vtable/cardinality.rs b/vortex-array/src/arrays/dict/vtable/cardinality.rs
index ad8fa24bdd0..50cb4037e96 100644
--- a/vortex-array/src/arrays/dict/vtable/cardinality.rs
+++ b/vortex-array/src/arrays/dict/vtable/cardinality.rs
@@ -10,8 +10,6 @@
 use vortex_mask::Mask;
 
 use crate::arrays::PrimitiveArray;
-use crate::dtype::IntegerPType;
-
 const SAMPLE_SIZE: usize = 128;
 const REPEATED_CODE_PROBE_SIZE: usize = 16;
 
@@ -21,10 +19,7 @@ const REPEATED_CODE_PROBE_SIZE: usize = 16;
 /// next, cheaper filter for cases that are not sparse by row count alone: dense dictionaries should
 /// not pay the full estimator cost unless the code stream first shows evidence of repeated codes.
 /// A `true` result only means "run the estimator"; it is not enough to compact by itself.
-pub(super) fn has_repeated_code_sample<P: IntegerPType>(
-    codes: &PrimitiveArray,
-    validity_mask: &Mask,
-) -> bool {
+pub(super) fn has_repeated_code_sample(codes: &PrimitiveArray, validity_mask: &Mask) -> bool {
     let sample_count = codes.len().min(REPEATED_CODE_PROBE_SIZE);
     let mut observed_codes = Vec::<usize>::with_capacity(sample_count);
 
@@ -34,7 +29,7 @@ pub(super) fn has_repeated_code_sample<P: IntegerPType>(
             continue;
         }
 
-        let code = codes.as_slice::<P>()[idx].as_();
+        let code = code_at(codes, idx);
         if observed_codes.contains(&code) {
             return true;
         }
@@ -49,7 +44,7 @@ pub(super) fn has_repeated_code_sample<P: IntegerPType>(
 /// The estimator samples deterministic bucket midpoints so repeated executions make the same
 /// compaction decision for the same input. Returning `None` means no valid sampled codes were seen.
 /// A returned value should only be used to decide whether an exact pass is worth attempting.
-pub(super) fn estimate_code_cardinality<P: IntegerPType>(
+pub(super) fn estimate_code_cardinality(
     codes: &PrimitiveArray,
     validity_mask: &Mask,
 ) -> Option<usize> {
@@ -64,7 +59,7 @@ pub(super) fn estimate_code_cardinality<P: IntegerPType>(
             continue;
         }
 
-        let code = codes.as_slice::<P>()[idx].as_();
+        let code = code_at(codes, idx);
         if let Some((_, count)) = observed_codes
             .iter_mut()
             .find(|(observed, _)| *observed == code)
@@ -128,6 +123,10 @@ fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
     ((bucket_start + bucket_end) / 2).min(len - 1) as usize
 }
 
+fn code_at(codes: &PrimitiveArray, idx: usize) -> usize {
+    usize::try_from(codes.as_slice::<u64>()[idx]).unwrap_or(usize::MAX)
+}
+
 fn div_ceil(numerator: usize, denominator: usize) -> usize {
     debug_assert!(denominator > 0);
     numerator / denominator + usize::from(!numerator.is_multiple_of(denominator))
diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs
index 74d96d2575b..7ca71b49d49 100644
--- a/vortex-array/src/arrays/dict/vtable/mod.rs
+++ b/vortex-array/src/arrays/dict/vtable/mod.rs
@@ -4,17 +4,13 @@
 use std::hash::Hasher;
 
 use kernel::PARENT_KERNELS;
-use num_traits::FromPrimitive;
 use prost::Message;
 use smallvec::smallvec;
-use vortex_buffer::BitBuffer;
-use vortex_buffer::Buffer;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
 use vortex_error::vortex_ensure;
 use vortex_error::vortex_err;
 use vortex_error::vortex_panic;
-use vortex_mask::Mask;
 use vortex_session::VortexSession;
 use vortex_session::registry::CachedId;
 
@@ -24,13 +20,11 @@ use super::DictOwnedExt;
 use super::DictParts;
 use super::array::DictSlots;
 use super::array::DictSlotsView;
-use super::array::compute_referenced_values_mask_from_codes;
 use crate::AnyCanonical;
 use crate::ArrayEq;
 use crate::ArrayHash;
 use crate::ArrayRef;
 use crate::Canonical;
-use crate::IntoArray;
 use crate::Precision;
 use crate::array::Array;
 use crate::array::ArrayId;
@@ -38,20 +32,18 @@ use crate::array::ArrayParts;
 use crate::array::ArrayView;
 use crate::array::VTable;
 use crate::arrays::ConstantArray;
+use crate::arrays::Filter;
 use crate::arrays::Primitive;
-use crate::arrays::PrimitiveArray;
 use crate::arrays::dict::DictArrayExt;
 use crate::arrays::dict::DictArraySlotsExt;
 use crate::arrays::dict::compute::rules::PARENT_RULES;
 use crate::arrays::dict::execute::take_canonical;
 use crate::buffer::BufferHandle;
 use crate::dtype::DType;
-use crate::dtype::IntegerPType;
 use crate::dtype::Nullability;
 use crate::dtype::PType;
 use crate::executor::ExecutionCtx;
 use crate::executor::ExecutionResult;
-use crate::match_each_integer_ptype;
 use crate::require_child;
 use crate::scalar::Scalar;
 use crate::serde::ArrayChildren;
@@ -60,17 +52,12 @@ use crate::validity::Validity;
 mod cardinality;
 mod kernel;
 mod operations;
+mod sparse;
 mod validity;
 
 /// A [`Dict`]-encoded Vortex array.
 pub type DictArray = Array<Dict>;
 
-// TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values
-// encoding, code count, unique-code count, and exporter/canonicalization costs.
-const SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD: usize = 4;
-const SPARSE_CANONICALIZE_SAMPLED_CODES_PER_VALUE_THRESHOLD: usize = 2;
-const SPARSE_CANONICALIZE_MIN_SAMPLED_VALUES_LEN: usize = 512;
-
 #[derive(Clone, Debug)]
 pub struct Dict;
 
@@ -202,9 +189,12 @@ impl VTable for Dict {
             )));
         }
 
-        if should_consider_sparse_canonicalize(array.codes().len(), array.values().len())
+        if sparse::should_consider_sparse_canonicalize(array.codes().len(), array.values().len())
             && !array.has_all_values_referenced()
-            && let Some(canonical) = sparse_canonicalize_dict(&array, ctx)?
+            // `take(FilterArray(...))` is also represented as a dictionary. Compacting that shape
+            // burns time before the lazy filter has a chance to push the row mask into its child.
+            && !array.values().is::<Filter>()
+            && let Some(canonical) = sparse::sparse_canonicalize_dict(&array, ctx)?
         {
             return Ok(ExecutionResult::done(canonical));
         }
@@ -237,293 +227,3 @@ impl VTable for Dict {
         PARENT_KERNELS.execute(array, parent, child_idx, ctx)
     }
 }
-
-struct SparseDictCodes {
-    /// Original dictionary value indices that are actually referenced by the live codes.
-    unique_codes: PrimitiveArray,
-    /// Codes rewritten to index into `unique_codes` instead of the original values array.
-    remapped_codes: PrimitiveArray,
-}
-
-#[cold]
-#[inline(never)]
-fn sparse_canonicalize_dict(
-    array: &DictArray,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<Option<Canonical>> {
-    let codes = array.codes().as_::<Primitive>().into_owned();
-    let Some(sparse_codes) = collect_sparse_codes(&codes, array.values().len(), ctx)? else {
-        return Ok(None);
-    };
-
-    // Build a temporary parent that represents `values.take(unique_codes)`. Calling
-    // `execute_parent` on the values child lets encodings such as FSST/VarBin sparse-take just
-    // the referenced dictionary values. If the child has no specialized parent execution, fall
-    // back to canonicalizing all values and then taking from the canonical array.
-    let values = array.values();
-    let unique_values_parent = DictArray::new(
-        sparse_codes.unique_codes.clone().into_array(),
-        values.clone(),
-    )
-    .into_array();
-    let unique_values = if let Some(taken_values) =
-        values.execute_parent(&unique_values_parent, DictSlots::VALUES, ctx)?
-    {
-        taken_values.execute::<Canonical>(ctx)?.into_array()
-    } else {
-        let canonical_values = values.clone().execute::<Canonical>(ctx)?.into_array();
-        DictArray::new(sparse_codes.unique_codes.into_array(), canonical_values)
-            .into_array()
-            .execute::<Canonical>(ctx)?
-            .into_array()
-    };
-
-    // Now the dictionary is dense over its compacted values, so normal dictionary execution only
-    // takes from the small `unique_values` array. This avoids `values.take(codes)` preserving a
-    // large dictionary with many unused values.
-    let compact_dict = unsafe {
-        DictArray::new_unchecked(sparse_codes.remapped_codes.into_array(), unique_values)
-            .set_all_values_referenced(true)
-    };
-
-    compact_dict
-        .into_array()
-        .execute::<Canonical>(ctx)
-        .map(Some)
-}
-
-#[cold]
-#[inline(never)]
-fn collect_sparse_codes(
-    codes: &PrimitiveArray,
-    values_len: usize,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<Option<SparseDictCodes>> {
-    let validity = codes.validity()?;
-    let validity_mask = validity.execute_mask(codes.len(), ctx)?;
-
-    // The exact pass below scans every code and allocates a remap table sized to the values array.
-    // Do it only when a cheap upper bound/sample says the dictionary is likely sparse enough.
-    if !should_collect_sparse_codes(codes, values_len, &validity_mask) {
-        return Ok(None);
-    }
-
-    let referenced_values =
-        compute_referenced_values_mask_from_codes(codes, values_len, &validity_mask, true)?;
-    let unique_count = referenced_values.true_count();
-    if unique_count.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) >= values_len {
-        return Ok(None);
-    }
-
-    let sparse_codes = match_each_integer_ptype!(codes.ptype(), |P| {
-        collect_sparse_codes_typed::<P>(codes, referenced_values, validity_mask, validity)?
-    });
-
-    Ok(Some(sparse_codes))
-}
-
-#[cold]
-#[inline(never)]
-fn should_collect_sparse_codes(
-    codes: &PrimitiveArray,
-    values_len: usize,
-    validity_mask: &Mask,
-) -> bool {
-    if codes.is_empty() || values_len == 0 || validity_mask.true_count() == 0 {
-        return false;
-    }
-
-    // If even the worst case "every live code is unique" is sparse, skip sampling and go straight
-    // to the exact remap pass.
-    if codes
-        .len()
-        .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD)
-        < values_len
-    {
-        return true;
-    }
-
-    if !should_sample_sparse_canonicalize(codes.len(), values_len) {
-        return false;
-    }
-
-    // Otherwise sample first. This catches cases like many live rows all referencing the same
-    // dictionary value without forcing dense dictionaries through the exact remap scan.
-    let worth_sampling = match_each_integer_ptype!(codes.ptype(), |P| {
-        cardinality::has_repeated_code_sample::<P>(codes, validity_mask)
-    });
-    if !worth_sampling {
-        return false;
-    }
-
-    let Some(estimated_unique_codes) = match_each_integer_ptype!(codes.ptype(), |P| {
-        cardinality::estimate_code_cardinality::<P>(codes, validity_mask)
-    }) else {
-        return false;
-    };
-
-    estimated_unique_codes.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD)
-        < values_len
-}
-
-#[inline]
-fn should_consider_sparse_canonicalize(codes_len: usize, values_len: usize) -> bool {
-    codes_len.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) < values_len
-        || should_sample_sparse_canonicalize(codes_len, values_len)
-}
-
-#[inline]
-fn should_sample_sparse_canonicalize(codes_len: usize, values_len: usize) -> bool {
-    // Sampling is only a preflight for cases that are not sparse by row count alone. Keep it away
-    // from tiny dictionary domains and near-dense slices where the estimator overhead dominates.
-    values_len >= SPARSE_CANONICALIZE_MIN_SAMPLED_VALUES_LEN
-        && codes_len.saturating_mul(SPARSE_CANONICALIZE_SAMPLED_CODES_PER_VALUE_THRESHOLD)
-            < values_len
-}
-
-#[cold]
-#[inline(never)]
-fn collect_sparse_codes_typed<P: IntegerPType + FromPrimitive>(
-    codes: &PrimitiveArray,
-    referenced_values: BitBuffer,
-    validity_mask: Mask,
-    validity: Validity,
-) -> VortexResult<SparseDictCodes> {
-    let unique_count = referenced_values.true_count();
-    let mut value_remap = vec![usize::MAX; referenced_values.len()];
-    let mut unique_codes = Vec::with_capacity(unique_count);
-
-    // Reuse the same exact referenced-values bitmap as the dictionary aggregate kernels. Walking
-    // the bitmap assigns compact codes in original dictionary order, which keeps compaction
-    // deterministic and independent of the first live row that happened to reference each value.
-    for old_code in referenced_values.set_indices() {
-        let new_code = unique_codes.len();
-        value_remap[old_code] = new_code;
-        unique_codes.push(old_code as u64);
-    }
-
-    let mut remapped_codes = Vec::with_capacity(codes.len());
-    for (idx, &code) in codes.as_slice::<P>().iter().enumerate() {
-        if !validity_mask.value(idx) {
-            remapped_codes.push(P::default());
-            continue;
-        }
-
-        let old_code = code.as_();
-        let new_code = value_remap[old_code];
-        debug_assert_ne!(new_code, usize::MAX);
-
-        remapped_codes.push(P::from_usize(new_code).unwrap_or_else(|| {
-            vortex_panic!(
-                "compacted dictionary code {new_code} does not fit in {}",
-                P::PTYPE
-            )
-        }));
-    }
-
-    Ok(SparseDictCodes {
-        unique_codes: PrimitiveArray::new(Buffer::from_iter(unique_codes), Validity::NonNullable),
-        remapped_codes: PrimitiveArray::new(Buffer::from_iter(remapped_codes), validity),
-    })
-}
-
-#[cfg(test)]
-mod tests {
-    use vortex_error::VortexResult;
-
-    use super::*;
-    use crate::LEGACY_SESSION;
-    use crate::VortexSessionExecute;
-    use crate::assert_arrays_eq;
-
-    #[test]
-    fn collect_sparse_codes_remaps_unique_values() -> VortexResult<()> {
-        let codes = PrimitiveArray::from_option_iter([Some(50u32), None, Some(70), Some(50)]);
-        let Some(sparse) =
-            collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())?
-        else {
-            panic!("codes are sparse");
-        };
-
-        assert_arrays_eq!(
-            sparse.unique_codes.into_array(),
-            PrimitiveArray::from_iter([50u64, 70]).into_array()
-        );
-        assert_arrays_eq!(
-            sparse.remapped_codes.into_array(),
-            PrimitiveArray::from_option_iter([Some(0u32), None, Some(1), Some(0)]).into_array()
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn sampled_sparse_codes_remaps_repeated_large_codes() -> VortexResult<()> {
-        let codes = PrimitiveArray::from_iter((0..1024).map(|_| 42u32));
-        let Some(sparse) =
-            collect_sparse_codes(&codes, 3000, &mut LEGACY_SESSION.create_execution_ctx())?
-        else {
-            panic!("sampled codes are sparse");
-        };
-
-        assert_arrays_eq!(
-            sparse.unique_codes.into_array(),
-            PrimitiveArray::from_iter([42u64]).into_array()
-        );
-        assert_arrays_eq!(
-            sparse.remapped_codes.into_array(),
-            PrimitiveArray::from_iter((0..1024).map(|_| 0u32)).into_array()
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn dense_sample_skips_sparse_code_collection() -> VortexResult<()> {
-        let codes = PrimitiveArray::from_iter((0..1024).map(|idx| idx as u32));
-
-        assert!(
-            collect_sparse_codes(&codes, 3000, &mut LEGACY_SESSION.create_execution_ctx())?
-                .is_none()
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn sparse_dict_canonicalizes_correctly() -> VortexResult<()> {
-        let dict = DictArray::new(
-            PrimitiveArray::from_option_iter([Some(50u32), None, Some(70), Some(50)]).into_array(),
-            PrimitiveArray::from_iter(0..100i32).into_array(),
-        );
-
-        let actual = dict
-            .into_array()
-            .execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())?
-            .into_array();
-
-        assert_arrays_eq!(
-            actual,
-            PrimitiveArray::from_option_iter([Some(50i32), None, Some(70), Some(50)])
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn sampled_sparse_dict_canonicalizes_repeated_codes() -> VortexResult<()> {
-        let dict = DictArray::new(
-            PrimitiveArray::from_iter((0..1024).map(|_| 42u32)).into_array(),
-            PrimitiveArray::from_iter(0..3000i32).into_array(),
-        );
-
-        let actual = dict
-            .into_array()
-            .execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())?
-            .into_array();
-
-        assert_arrays_eq!(actual, PrimitiveArray::from_iter((0..1024).map(|_| 42i32)));
-
-        Ok(())
-    }
-}
diff --git a/vortex-array/src/arrays/dict/vtable/sparse.rs b/vortex-array/src/arrays/dict/vtable/sparse.rs
new file mode 100644
index 00000000000..1552df6ea5d
--- /dev/null
+++ b/vortex-array/src/arrays/dict/vtable/sparse.rs
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use vortex_buffer::BitBuffer;
+use vortex_buffer::Buffer;
+use vortex_error::VortexResult;
+use vortex_error::vortex_panic;
+use vortex_mask::Mask;
+
+use super::super::array::DictSlots;
+use super::super::array::compute_referenced_values_mask_from_codes;
+use super::DictArray;
+use super::cardinality;
+use crate::Canonical;
+use crate::IntoArray;
+use crate::arrays::Primitive;
+use crate::arrays::PrimitiveArray;
+use crate::arrays::dict::DictArraySlotsExt;
+use crate::builtins::ArrayBuiltins;
+use crate::dtype::DType;
+use crate::dtype::PType;
+use crate::executor::ExecutionCtx;
+use crate::validity::Validity;
+
+// TODO: Replace this fixed sparse-dictionary threshold with a cost model that accounts for values
+// encoding, code count, unique-code count, and exporter/canonicalization costs.
+const SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD: usize = 4;
+const SPARSE_CANONICALIZE_SAMPLED_CODES_PER_VALUE_THRESHOLD: usize = 2;
+const SPARSE_CANONICALIZE_MIN_SAMPLED_VALUES_LEN: usize = 512;
+
+struct SparseDictCodes {
+    /// Original dictionary value indices that are actually referenced by the live codes.
+    unique_codes: PrimitiveArray,
+    /// Codes rewritten to index into `unique_codes` instead of the original values array.
+    remapped_codes: PrimitiveArray,
+}
+
+#[cold]
+#[inline(never)]
+pub(super) fn sparse_canonicalize_dict(
+    array: &DictArray,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<Canonical>> {
+    let codes = array.codes().as_::<Primitive>().into_owned();
+    let Some(sparse_codes) = collect_sparse_codes(&codes, array.values().len(), ctx)? else {
+        return Ok(None);
+    };
+
+    // Build a temporary parent that represents `values.take(unique_codes)`. Calling
+    // `execute_parent` on the values child lets encodings such as FSST/VarBin sparse-take just
+    // the referenced dictionary values. If the child has no specialized parent execution, fall
+    // back to canonicalizing all values and then taking from the canonical array.
+    let values = array.values();
+    let unique_values_parent = DictArray::new(
+        sparse_codes.unique_codes.clone().into_array(),
+        values.clone(),
+    )
+    .into_array();
+    let unique_values = if let Some(taken_values) =
+        values.execute_parent(&unique_values_parent, DictSlots::VALUES, ctx)?
+    {
+        taken_values.execute::<Canonical>(ctx)?.into_array()
+    } else {
+        let canonical_values = values.clone().execute::<Canonical>(ctx)?.into_array();
+        DictArray::new(sparse_codes.unique_codes.into_array(), canonical_values)
+            .into_array()
+            .execute::<Canonical>(ctx)?
+            .into_array()
+    };
+
+    // Now the dictionary is dense over its compacted values, so normal dictionary execution only
+    // takes from the small `unique_values` array. This avoids `values.take(codes)` preserving a
+    // large dictionary with many unused values.
+    let compact_dict = unsafe {
+        DictArray::new_unchecked(sparse_codes.remapped_codes.into_array(), unique_values)
+            .set_all_values_referenced(true)
+    };
+
+    compact_dict
+        .into_array()
+        .execute::<Canonical>(ctx)
+        .map(Some)
+}
+
+#[cold]
+#[inline(never)]
+fn collect_sparse_codes(
+    codes: &PrimitiveArray,
+    values_len: usize,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<SparseDictCodes>> {
+    let validity = codes.validity()?;
+    let validity_mask = validity.execute_mask(codes.len(), ctx)?;
+    let codes = codes_as_u64(codes, ctx)?;
+
+    // The exact pass below scans every code and allocates a remap table sized to the values array.
+    // Do it only when a cheap upper bound/sample says the dictionary is likely sparse enough.
+    if !should_collect_sparse_codes(&codes, values_len, &validity_mask) {
+        return Ok(None);
+    }
+
+    let referenced_values =
+        compute_referenced_values_mask_from_codes(&codes, values_len, &validity_mask, true)?;
+    let unique_count = referenced_values.true_count();
+    if unique_count.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) >= values_len {
+        return Ok(None);
+    }
+
+    collect_sparse_codes_u64(&codes, referenced_values, validity_mask, validity).map(Some)
+}
+
+fn codes_as_u64(codes: &PrimitiveArray, ctx: &mut ExecutionCtx) -> VortexResult<PrimitiveArray> {
+    if codes.ptype() == PType::U64 {
+        return Ok(codes.clone());
+    }
+
+    codes
+        .clone()
+        .into_array()
+        .cast(DType::Primitive(PType::U64, codes.dtype().nullability()))?
+        .execute::<PrimitiveArray>(ctx)
+}
+
+#[cold]
+#[inline(never)]
+fn should_collect_sparse_codes(
+    codes: &PrimitiveArray,
+    values_len: usize,
+    validity_mask: &Mask,
+) -> bool {
+    if codes.is_empty() || values_len == 0 || validity_mask.true_count() == 0 {
+        return false;
+    }
+
+    // If even the worst case "every live code is unique" is sparse, skip sampling and go straight
+    // to the exact remap pass.
+    if codes
+        .len()
+        .saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD)
+        < values_len
+    {
+        return true;
+    }
+
+    if !should_sample_sparse_canonicalize(codes.len(), values_len) {
+        return false;
+    }
+
+    // Otherwise sample first. This catches cases like many live rows all referencing the same
+    // dictionary value without forcing dense dictionaries through the exact remap scan.
+    if !cardinality::has_repeated_code_sample(codes, validity_mask) {
+        return false;
+    }
+
+    let Some(estimated_unique_codes) = cardinality::estimate_code_cardinality(codes, validity_mask)
+    else {
+        return false;
+    };
+
+    estimated_unique_codes.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD)
+        < values_len
+}
+
+#[inline]
+pub(super) fn should_consider_sparse_canonicalize(codes_len: usize, values_len: usize) -> bool {
+    codes_len.saturating_mul(SPARSE_CANONICALIZE_CODES_PER_VALUE_THRESHOLD) < values_len
+        || should_sample_sparse_canonicalize(codes_len, values_len)
+}
+
+#[inline]
+fn should_sample_sparse_canonicalize(codes_len: usize, values_len: usize) -> bool {
+    // Sampling is only a preflight for cases that are not sparse by row count alone. Keep it away
+    // from tiny dictionary domains and near-dense slices where the estimator overhead dominates.
+    values_len >= SPARSE_CANONICALIZE_MIN_SAMPLED_VALUES_LEN
+        && codes_len.saturating_mul(SPARSE_CANONICALIZE_SAMPLED_CODES_PER_VALUE_THRESHOLD)
+            < values_len
+}
+
+#[cold]
+#[inline(never)]
+fn collect_sparse_codes_u64(
+    codes: &PrimitiveArray,
+    referenced_values: BitBuffer,
+    validity_mask: Mask,
+    validity: Validity,
+) -> VortexResult<SparseDictCodes> {
+    let unique_count = referenced_values.true_count();
+    let mut value_remap = vec![usize::MAX; referenced_values.len()];
+    let mut unique_codes = Vec::with_capacity(unique_count);
+
+    // Reuse the same exact referenced-values bitmap as the dictionary aggregate kernels. Walking
+    // the bitmap assigns compact codes in original dictionary order, which keeps compaction
+    // deterministic and independent of the first live row that happened to reference each value.
+    for old_code in referenced_values.set_indices() {
+        let new_code = unique_codes.len();
+        value_remap[old_code] = new_code;
+        unique_codes.push(old_code as u64);
+    }
+
+    let mut remapped_codes = Vec::with_capacity(codes.len());
+    for (idx, &code) in codes.as_slice::<u64>().iter().enumerate() {
+        if !validity_mask.value(idx) {
+            remapped_codes.push(0);
+            continue;
+        }
+
+        let old_code = usize::try_from(code)
+            .unwrap_or_else(|_| vortex_panic!("dictionary code {code} does not fit usize"));
+        let new_code = value_remap[old_code];
+        debug_assert_ne!(new_code, usize::MAX);
+
+        remapped_codes.push(new_code as u64);
+    }
+
+    Ok(SparseDictCodes {
+        unique_codes: PrimitiveArray::new(Buffer::from_iter(unique_codes), Validity::NonNullable),
+        remapped_codes: PrimitiveArray::new(Buffer::from_iter(remapped_codes), validity),
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_error::VortexResult;
+
+    use super::*;
+    use crate::LEGACY_SESSION;
+    use crate::VortexSessionExecute;
+    use crate::assert_arrays_eq;
+
+    #[test]
+    fn collect_sparse_codes_remaps_unique_values() -> VortexResult<()> {
+        let codes = PrimitiveArray::from_option_iter([Some(50u32), None, Some(70), Some(50)]);
+        let Some(sparse) =
+            collect_sparse_codes(&codes, 100, &mut LEGACY_SESSION.create_execution_ctx())?
+        else {
+            panic!("codes are sparse");
+        };
+
+        assert_arrays_eq!(
+            sparse.unique_codes.into_array(),
+            PrimitiveArray::from_iter([50u64, 70]).into_array()
+        );
+        assert_arrays_eq!(
+            sparse.remapped_codes.into_array(),
+            PrimitiveArray::from_option_iter([Some(0u64), None, Some(1), Some(0)]).into_array()
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn sampled_sparse_codes_remaps_repeated_large_codes() -> VortexResult<()> {
+        let codes = PrimitiveArray::from_iter((0..1024).map(|_| 42u32));
+        let Some(sparse) =
+            collect_sparse_codes(&codes, 3000, &mut LEGACY_SESSION.create_execution_ctx())?
+        else {
+            panic!("sampled codes are sparse");
+        };
+
+        assert_arrays_eq!(
+            sparse.unique_codes.into_array(),
+            PrimitiveArray::from_iter([42u64]).into_array()
+        );
+        assert_arrays_eq!(
+            sparse.remapped_codes.into_array(),
+            PrimitiveArray::from_iter((0..1024).map(|_| 0u64)).into_array()
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn dense_sample_skips_sparse_code_collection() -> VortexResult<()> {
+        let codes = PrimitiveArray::from_iter((0..1024).map(|idx| idx as u32));
+
+        assert!(
+            collect_sparse_codes(&codes, 3000, &mut LEGACY_SESSION.create_execution_ctx())?
+                .is_none()
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn sparse_dict_canonicalizes_correctly() -> VortexResult<()> {
+        let dict = DictArray::new(
+            PrimitiveArray::from_option_iter([Some(50u32), None, Some(70), Some(50)]).into_array(),
+            PrimitiveArray::from_iter(0..100i32).into_array(),
+        );
+
+        let actual = dict
+            .into_array()
+            .execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())?
+            .into_array();
+
+        assert_arrays_eq!(
+            actual,
+            PrimitiveArray::from_option_iter([Some(50i32), None, Some(70), Some(50)])
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn sampled_sparse_dict_canonicalizes_repeated_codes() -> VortexResult<()> {
+        let dict = DictArray::new(
+            PrimitiveArray::from_iter((0..1024).map(|_| 42u32)).into_array(),
+            PrimitiveArray::from_iter(0..3000i32).into_array(),
+        );
+
+        let actual = dict
+            .into_array()
+            .execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())?
+            .into_array();
+
+        assert_arrays_eq!(actual, PrimitiveArray::from_iter((0..1024).map(|_| 42i32)));
+
+        Ok(())
+    }
+}

From 54b8dba4b5f29238f000161368e6b7d3fa94a55e Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Fri, 8 May 2026 13:27:43 -0400
Subject: [PATCH 10/11] Skip sparse dict gate for filter values

Signed-off-by: Nicholas Gates <nick@nickgates.com>
---
 vortex-array/src/arrays/dict/vtable/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs
index 7ca71b49d49..ab679dc27d1 100644
--- a/vortex-array/src/arrays/dict/vtable/mod.rs
+++ b/vortex-array/src/arrays/dict/vtable/mod.rs
@@ -189,11 +189,11 @@ impl VTable for Dict {
             )));
         }
 
-        if sparse::should_consider_sparse_canonicalize(array.codes().len(), array.values().len())
-            && !array.has_all_values_referenced()
+        if !array.has_all_values_referenced()
             // `take(FilterArray(...))` is also represented as a dictionary. Compacting that shape
             // burns time before the lazy filter has a chance to push the row mask into its child.
             && !array.values().is::<Filter>()
+            && sparse::should_consider_sparse_canonicalize(array.codes().len(), array.values().len())
             && let Some(canonical) = sparse::sparse_canonicalize_dict(&array, ctx)?
         {
             return Ok(ExecutionResult::done(canonical));

From 22be09b1e5c03807776ccd2c688bed59f48b7530 Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Fri, 8 May 2026 14:04:53 -0400
Subject: [PATCH 11/11] Deduplicate cardinality code

Signed-off-by: Nicholas Gates <nick@nickgates.com>
---
 vortex-array/public-api.lock                  |   6 ++
 .../arrays/dict/{vtable => }/cardinality.rs   |  24 +++--
 vortex-array/src/arrays/dict/mod.rs           |   1 +
 vortex-array/src/arrays/dict/vtable/mod.rs    |   1 -
 vortex-array/src/arrays/dict/vtable/sparse.rs |   7 +-
 vortex-duckdb/src/exporter/dict.rs            |   2 +-
 .../src/exporter/dict_cardinality.rs          | 102 ------------------
 vortex-duckdb/src/exporter/mod.rs             |   1 -
 8 files changed, 25 insertions(+), 119 deletions(-)
 rename vortex-array/src/arrays/dict/{vtable => }/cardinality.rs (88%)
 delete mode 100644 vortex-duckdb/src/exporter/dict_cardinality.rs

diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock
index 8d5cc75c895..5e838a6ffcf 100644
--- a/vortex-array/public-api.lock
+++ b/vortex-array/public-api.lock
@@ -2268,6 +2268,12 @@ pub type vortex_array::arrays::decimal::DecimalArray = vortex_array::Array<vorte
 
 pub mod vortex_array::arrays::dict
 
+pub mod vortex_array::arrays::dict::cardinality
+
+pub fn vortex_array::arrays::dict::cardinality::estimate_code_cardinality<I: vortex_array::dtype::IntegerPType>(&vortex_array::arrays::PrimitiveArray, &vortex_mask::Mask) -> core::option::Option<usize>
+
+pub fn vortex_array::arrays::dict::cardinality::has_repeated_code_sample<I: vortex_array::dtype::IntegerPType>(&vortex_array::arrays::PrimitiveArray, &vortex_mask::Mask) -> bool
+
 pub mod vortex_array::arrays::dict::vtable
 
 pub struct vortex_array::arrays::dict::vtable::Dict
diff --git a/vortex-array/src/arrays/dict/vtable/cardinality.rs b/vortex-array/src/arrays/dict/cardinality.rs
similarity index 88%
rename from vortex-array/src/arrays/dict/vtable/cardinality.rs
rename to vortex-array/src/arrays/dict/cardinality.rs
index 50cb4037e96..fb8ed84da5b 100644
--- a/vortex-array/src/arrays/dict/vtable/cardinality.rs
+++ b/vortex-array/src/arrays/dict/cardinality.rs
@@ -3,13 +3,16 @@
 
 //! Sampling-based cardinality estimation for dictionary codes.
 //!
-//! This module is used only as a cheap gate before the exact sparse-dictionary remap pass. The
-//! estimate may be conservative or noisy, but correctness does not depend on it: callers must still
-//! collect the exact unique code set and re-check the sparse threshold before compacting.
+//! This module is used only as a cheap gate before the exact sparse-dictionary remap pass and as a
+//! routing hint for downstream exporters. The estimate may be conservative or noisy, but
+//! correctness does not depend on it: callers must still collect the exact unique code set and
+//! re-check the sparse threshold before compacting.
 
 use vortex_mask::Mask;
 
 use crate::arrays::PrimitiveArray;
+use crate::dtype::IntegerPType;
+
 const SAMPLE_SIZE: usize = 128;
 const REPEATED_CODE_PROBE_SIZE: usize = 16;
 
@@ -19,7 +22,10 @@ const REPEATED_CODE_PROBE_SIZE: usize = 16;
 /// next, cheaper filter for cases that are not sparse by row count alone: dense dictionaries should
 /// not pay the full estimator cost unless the code stream first shows evidence of repeated codes.
 /// A `true` result only means "run the estimator"; it is not enough to compact by itself.
-pub(super) fn has_repeated_code_sample(codes: &PrimitiveArray, validity_mask: &Mask) -> bool {
+pub fn has_repeated_code_sample<I: IntegerPType>(
+    codes: &PrimitiveArray,
+    validity_mask: &Mask,
+) -> bool {
     let sample_count = codes.len().min(REPEATED_CODE_PROBE_SIZE);
     let mut observed_codes = Vec::<usize>::with_capacity(sample_count);
 
@@ -29,7 +35,7 @@ pub(super) fn has_repeated_code_sample(codes: &PrimitiveArray, validity_mask: &M
             continue;
         }
 
-        let code = code_at(codes, idx);
+        let code: usize = codes.as_slice::<I>()[idx].as_();
         if observed_codes.contains(&code) {
             return true;
         }
@@ -44,7 +50,7 @@ pub(super) fn has_repeated_code_sample(codes: &PrimitiveArray, validity_mask: &M
 /// The estimator samples deterministic bucket midpoints so repeated executions make the same
 /// compaction decision for the same input. Returning `None` means no valid sampled codes were seen.
 /// A returned value should only be used to decide whether an exact pass is worth attempting.
-pub(super) fn estimate_code_cardinality(
+pub fn estimate_code_cardinality<I: IntegerPType>(
     codes: &PrimitiveArray,
     validity_mask: &Mask,
 ) -> Option<usize> {
@@ -59,7 +65,7 @@ pub(super) fn estimate_code_cardinality(
             continue;
         }
 
-        let code = code_at(codes, idx);
+        let code: usize = codes.as_slice::<I>()[idx].as_();
         if let Some((_, count)) = observed_codes
             .iter_mut()
             .find(|(observed, _)| *observed == code)
@@ -123,10 +129,6 @@ fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
     ((bucket_start + bucket_end) / 2).min(len - 1) as usize
 }
 
-fn code_at(codes: &PrimitiveArray, idx: usize) -> usize {
-    usize::try_from(codes.as_slice::<u64>()[idx]).unwrap_or(usize::MAX)
-}
-
 fn div_ceil(numerator: usize, denominator: usize) -> usize {
     debug_assert!(denominator > 0);
     numerator / denominator + usize::from(!numerator.is_multiple_of(denominator))
diff --git a/vortex-array/src/arrays/dict/mod.rs b/vortex-array/src/arrays/dict/mod.rs
index 0414eea7def..f0bba784794 100644
--- a/vortex-array/src/arrays/dict/mod.rs
+++ b/vortex-array/src/arrays/dict/mod.rs
@@ -14,6 +14,7 @@ pub use arbitrary::ArbitraryDictArray;
 mod array;
 pub use array::*;
 
+pub mod cardinality;
 pub(crate) mod compute;
 mod execute;
 
diff --git a/vortex-array/src/arrays/dict/vtable/mod.rs b/vortex-array/src/arrays/dict/vtable/mod.rs
index ab679dc27d1..a0cde010147 100644
--- a/vortex-array/src/arrays/dict/vtable/mod.rs
+++ b/vortex-array/src/arrays/dict/vtable/mod.rs
@@ -49,7 +49,6 @@ use crate::scalar::Scalar;
 use crate::serde::ArrayChildren;
 use crate::validity::Validity;
 
-mod cardinality;
 mod kernel;
 mod operations;
 mod sparse;
diff --git a/vortex-array/src/arrays/dict/vtable/sparse.rs b/vortex-array/src/arrays/dict/vtable/sparse.rs
index 1552df6ea5d..23944dc0b29 100644
--- a/vortex-array/src/arrays/dict/vtable/sparse.rs
+++ b/vortex-array/src/arrays/dict/vtable/sparse.rs
@@ -9,8 +9,8 @@ use vortex_mask::Mask;
 
 use super::super::array::DictSlots;
 use super::super::array::compute_referenced_values_mask_from_codes;
+use super::super::cardinality;
 use super::DictArray;
-use super::cardinality;
 use crate::Canonical;
 use crate::IntoArray;
 use crate::arrays::Primitive;
@@ -148,11 +148,12 @@ fn should_collect_sparse_codes(
 
     // Otherwise sample first. This catches cases like many live rows all referencing the same
     // dictionary value without forcing dense dictionaries through the exact remap scan.
-    if !cardinality::has_repeated_code_sample(codes, validity_mask) {
+    if !cardinality::has_repeated_code_sample::<u64>(codes, validity_mask) {
         return false;
     }
 
-    let Some(estimated_unique_codes) = cardinality::estimate_code_cardinality(codes, validity_mask)
+    let Some(estimated_unique_codes) =
+        cardinality::estimate_code_cardinality::<u64>(codes, validity_mask)
     else {
         return false;
     };
diff --git a/vortex-duckdb/src/exporter/dict.rs b/vortex-duckdb/src/exporter/dict.rs
index 0728779cdf6..eec71dc08a6 100644
--- a/vortex-duckdb/src/exporter/dict.rs
+++ b/vortex-duckdb/src/exporter/dict.rs
@@ -13,6 +13,7 @@ use vortex::array::arrays::DictArray;
 use vortex::array::arrays::PrimitiveArray;
 use vortex::array::arrays::dict::DictArrayExt;
 use vortex::array::arrays::dict::DictArraySlotsExt;
+use vortex::array::arrays::dict::cardinality::estimate_code_cardinality;
 use vortex::array::match_each_integer_ptype;
 use vortex::dtype::IntegerPType;
 use vortex::error::VortexResult;
@@ -25,7 +26,6 @@ use crate::exporter::ColumnExporter;
 use crate::exporter::all_invalid;
 use crate::exporter::cache::ConversionCache;
 use crate::exporter::constant;
-use crate::exporter::dict_cardinality::estimate_code_cardinality;
 use crate::exporter::new_array_exporter;
 
 struct DictExporter<I: IntegerPType> {
diff --git a/vortex-duckdb/src/exporter/dict_cardinality.rs b/vortex-duckdb/src/exporter/dict_cardinality.rs
deleted file mode 100644
index 1a56fa1a954..00000000000
--- a/vortex-duckdb/src/exporter/dict_cardinality.rs
+++ /dev/null
@@ -1,102 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-//! Sampling-based cardinality estimation for DuckDB dictionary export.
-//!
-//! The exporter uses this as a cheap routing hint before choosing between a reusable DuckDB
-//! dictionary and executing the Vortex dictionary into a flat vector. Correctness does not depend on
-//! the estimate: the compacting path still executes Vortex's dictionary canonicalization logic.
-
-use vortex::array::arrays::PrimitiveArray;
-use vortex::dtype::IntegerPType;
-use vortex::mask::Mask;
-
-const SAMPLE_SIZE: usize = 128;
-
-/// Estimate the number of distinct non-null dictionary codes in a DuckDB export batch.
-///
-/// Returning `None` means no valid sampled codes were seen. A returned estimate should be treated
-/// only as a cost signal for whether the exporter should call into Vortex dictionary execution.
-pub(super) fn estimate_code_cardinality<I: IntegerPType>(
-    codes: &PrimitiveArray,
-    codes_mask: &Mask,
-) -> Option<usize> {
-    let sample_count = codes.len().min(SAMPLE_SIZE);
-    let mut observed_codes = Vec::<(usize, usize)>::new();
-
-    // This mirrors the array-side sparse dictionary gate. The exporter needs the estimate before
-    // it decides between a reusable DuckDB dictionary and executing the Vortex dictionary away.
-    // Correctness does not depend on the estimate; it only decides whether to take the compacting
-    // path.
-    for sample_idx in 0..sample_count {
-        let idx = sample_index(sample_idx, codes.len(), sample_count);
-        if !codes_mask.value(idx) {
-            continue;
-        }
-
-        let code = codes.as_slice::<I>()[idx].as_();
-        if let Some((_, count)) = observed_codes
-            .iter_mut()
-            .find(|(observed, _)| *observed == code)
-        {
-            *count += 1;
-        } else {
-            observed_codes.push((code, 1));
-        }
-    }
-
-    estimate_cardinality_from_observations(&observed_codes)
-}
-
-/// Estimate total cardinality from `(code, observed_count)` sample observations.
-///
-/// The correction is Chao1-style: singleton-heavy samples imply more unseen codes, while repeated
-/// observations imply the selected code stream is likely low-cardinality.
-fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Option<usize> {
-    if observed_codes.is_empty() {
-        return None;
-    }
-
-    let unique_count = observed_codes.len();
-    let singleton_count = observed_codes
-        .iter()
-        .filter(|(_, count)| *count == 1)
-        .count();
-    let doubleton_count = observed_codes
-        .iter()
-        .filter(|(_, count)| *count == 2)
-        .count();
-
-    let unseen_estimate = if doubleton_count == 0 {
-        singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2
-    } else {
-        div_ceil(
-            singleton_count.saturating_mul(singleton_count),
-            2 * doubleton_count,
-        )
-    };
-
-    Some(unique_count.saturating_add(unseen_estimate))
-}
-
-/// Return the midpoint index for one deterministic sampling bucket.
-///
-/// Bucket midpoint sampling gives coverage across the whole code vector without introducing RNG
-/// state or nondeterministic exporter decisions.
-fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
-    debug_assert!(len > 0);
-    debug_assert!(sample_count > 0);
-
-    let sample_idx = sample_idx as u128;
-    let len = len as u128;
-    let sample_count = sample_count as u128;
-    let bucket_start = sample_idx * len / sample_count;
-    let bucket_end = (sample_idx + 1) * len / sample_count;
-
-    ((bucket_start + bucket_end) / 2).min(len - 1) as usize
-}
-
-fn div_ceil(numerator: usize, denominator: usize) -> usize {
-    debug_assert!(denominator > 0);
-    numerator / denominator + usize::from(!numerator.is_multiple_of(denominator))
-}
diff --git a/vortex-duckdb/src/exporter/mod.rs b/vortex-duckdb/src/exporter/mod.rs
index 9c01e2f35e5..517776f5521 100644
--- a/vortex-duckdb/src/exporter/mod.rs
+++ b/vortex-duckdb/src/exporter/mod.rs
@@ -7,7 +7,6 @@ mod cache;
 mod constant;
 mod decimal;
 mod dict;
-mod dict_cardinality;
 mod fixed_size_list;
 mod list;
 mod list_view;