diff --git a/Cargo.lock b/Cargo.lock index 9c3afb6808f..23ee851ffdb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10417,6 +10417,7 @@ dependencies = [ "vortex-fastlanes", "vortex-fsst", "vortex-mask", + "vortex-onpair", "vortex-pco", "vortex-runend", "vortex-sequence", @@ -10757,6 +10758,7 @@ dependencies = [ "vortex-layout", "vortex-mask", "vortex-metrics", + "vortex-onpair", "vortex-pco", "vortex-runend", "vortex-scan", @@ -10964,6 +10966,45 @@ dependencies = [ "vortex-cuda-macros", ] +[[package]] +name = "vortex-onpair" +version = "0.1.0" +dependencies = [ + "codspeed-divan-compat", + "memchr", + "prost 0.14.3", + "rstest", + "vortex-array", + "vortex-buffer", + "vortex-error", + "vortex-mask", + "vortex-onpair-sys", + "vortex-session", +] + +[[package]] +name = "vortex-onpair-rs" +version = "0.1.0" +dependencies = [ + "aho-corasick", + "arrow-array 58.2.0", + "arrow-schema 58.2.0", + "codspeed-divan-compat", + "hashbrown 0.17.1", + "memchr", + "parquet 58.2.0", + "rand 0.10.1", + "rstest", + "vortex-onpair-sys", +] + +[[package]] +name = "vortex-onpair-sys" +version = "0.1.0" +dependencies = [ + "cmake", +] + [[package]] name = "vortex-parquet-variant" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 3ad6f177778..382c34c3db1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,6 +54,10 @@ members = [ "encodings/zstd", "encodings/bytebool", "encodings/parquet-variant", + # Experimental encodings + "encodings/experimental/onpair", + "encodings/experimental/onpair-sys", + "encodings/experimental/onpair-rs", # Benchmarks "benchmarks/lance-bench", "benchmarks/compress-bench", @@ -289,6 +293,9 @@ vortex-ipc = { version = "0.1.0", path = "./vortex-ipc", default-features = fals vortex-layout = { version = "0.1.0", path = "./vortex-layout", default-features = false } vortex-mask = { version = "0.1.0", path = "./vortex-mask", default-features = false } vortex-metrics = { version = "0.1.0", path = "./vortex-metrics", default-features = false } +vortex-onpair = { 
version = "0.1.0", path = "./encodings/experimental/onpair", default-features = false } +vortex-onpair-rs = { version = "0.1.0", path = "./encodings/experimental/onpair-rs", default-features = false } +vortex-onpair-sys = { version = "0.1.0", path = "./encodings/experimental/onpair-sys", default-features = false } vortex-pco = { version = "0.1.0", path = "./encodings/pco", default-features = false } vortex-proto = { version = "0.1.0", path = "./vortex-proto", default-features = false } vortex-runend = { version = "0.1.0", path = "./encodings/runend", default-features = false } diff --git a/encodings/experimental/onpair-rs/Cargo.toml b/encodings/experimental/onpair-rs/Cargo.toml new file mode 100644 index 00000000000..f44ec00428a --- /dev/null +++ b/encodings/experimental/onpair-rs/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "vortex-onpair-rs" +description = "Pure-Rust port of the OnPair short-string compression library" +authors = { workspace = true } +categories = { workspace = true } +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +readme = "README.md" +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] +aho-corasick = { workspace = true } +hashbrown = { workspace = true } +memchr = { workspace = true } +rand = { workspace = true } + +[dev-dependencies] +arrow-array = { workspace = true } +arrow-schema = { workspace = true } +divan = { workspace = true } +parquet = { workspace = true } +rstest = { workspace = true } +vortex-onpair-sys = { workspace = true } + +[[bench]] +name = "clickbench" +harness = false diff --git a/encodings/experimental/onpair-rs/README.md b/encodings/experimental/onpair-rs/README.md new file mode 100644 index 00000000000..623ebbe410e --- /dev/null +++ b/encodings/experimental/onpair-rs/README.md @@ -0,0 +1,11 @@ +# onpair-lib + 
+Pure-Rust port of the training + encoding parts of +[`onpair_cpp`](https://github.com/gargiulofrancesco/onpair_cpp). + +Scope is limited to what `vortex-onpair` actually consumes from +`vortex-onpair-sys`: `Column::compress` (BPE-style dictionary training plus +LSB-first bit-packed token encoding) and raw access to the resulting parts +(dictionary bytes/offsets, packed token stream, per-row boundaries). Decode, +LIKE, and EQ predicates are already pure Rust in `vortex-onpair` and reuse the +same `parts()` layout. diff --git a/encodings/experimental/onpair-rs/benches/clickbench.rs b/encodings/experimental/onpair-rs/benches/clickbench.rs new file mode 100644 index 00000000000..b53b85fcbee --- /dev/null +++ b/encodings/experimental/onpair-rs/benches/clickbench.rs @@ -0,0 +1,440 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +#![allow( + clippy::cast_possible_truncation, + clippy::clone_on_ref_ptr, + clippy::expect_used, + clippy::many_single_char_names, + clippy::missing_panics_doc, + clippy::unwrap_in_result, + clippy::unwrap_used +)] +// +// End-to-end benchmark suite over a real parquet file (ClickBench-style +// hits or any UTF-8 string column). +// +// Data source resolution, in order: +// 1. env var `ONPAIR_BENCH_PARQUET` — path to a parquet file +// (e.g. ClickBench `hits.parquet`). Optionally set +// `ONPAIR_BENCH_COLUMN` to pick a specific UTF-8 column; otherwise +// we pick the first Utf8 / LargeUtf8 / Utf8View column in schema +// order. +// 2. `/tmp/userdata1.parquet` if present (small real-world parquet, +// good for smoke runs). +// 3. A synthetic ClickBench-shaped URL corpus (100 000 rows of +// repetitive URLs with realistic prefix sharing). 
+// +// Each benchmark runs in one of two configurations: +// * the full pipeline (`train_and_compress`) +// * a single op against an already-built `Column` +// +// Run with: cargo bench -p vortex-onpair-rs --bench clickbench + +use std::env; +use std::fs::File; +use std::path::PathBuf; +use std::sync::OnceLock; + +use arrow_array::Array; +use arrow_array::cast::AsArray; +use divan::Bencher; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use vortex_onpair_rs::AhoCorasickAutomaton; +use vortex_onpair_rs::Column; +use vortex_onpair_rs::EqAutomaton; +use vortex_onpair_rs::KmpAutomaton; +use vortex_onpair_rs::OnPairTrainingConfig; +use vortex_onpair_rs::PrefixAutomaton; +use vortex_onpair_rs::and; +use vortex_onpair_rs::not; + +const BITS_CONFIGS: &[u32] = &[12, 16]; + +/// Pack the corpus rows into `(bytes, offsets)`: the concatenated row bytes plus `rows + 1` cumulative `u64` offsets starting at 0. +fn pack(strings: &[Vec]) -> (Vec, Vec) { + let mut bytes = Vec::with_capacity(strings.iter().map(|s| s.len()).sum()); + let mut offsets = Vec::with_capacity(strings.len() + 1); + offsets.push(0u64); + for s in strings { + bytes.extend_from_slice(s); + offsets.push(bytes.len() as u64); + } + (bytes, offsets) +} + +// ───────────────────────────────────────────────────────────────────────────── +// Corpus loading. +// ───────────────────────────────────────────────────────────────────────────── + +struct Corpus { + /// Where the rows came from (printed at startup). + source: String, + rows: Vec>, + /// Bytes packed once, reused across benches. 
+ bytes: Vec, + offsets: Vec, + total_bytes: usize, +} + +fn corpus() -> &'static Corpus { + static CORPUS: OnceLock = OnceLock::new(); + CORPUS.get_or_init(|| { + let (source, rows) = load_corpus(); + let (bytes, offsets) = pack(&rows); + let total_bytes = bytes.len(); + let c = Corpus { + source, + rows, + bytes, + offsets, + total_bytes, + }; + eprintln!( + "[onpair bench] corpus: {} ({} rows, {:.2} MiB)", + c.source, + c.rows.len(), + c.total_bytes as f64 / (1024.0 * 1024.0) + ); + c + }) +} + +fn load_corpus() -> (String, Vec>) { + if let Ok(path) = env::var("ONPAIR_BENCH_PARQUET") + && let Some(rows) = read_parquet_strings(&PathBuf::from(&path)) + { + return (format!("{path} (env)"), rows); + } + let fallback = PathBuf::from("/tmp/userdata1.parquet"); + if fallback.exists() + && let Some(rows) = read_parquet_strings(&fallback) + { + return (format!("{} (auto-detected)", fallback.display()), rows); + } + let rows = synthetic_clickbench_urls(100_000); + ("synthetic ClickBench-shaped URL corpus".to_string(), rows) +} + +/// Load the first UTF-8-typed column from a parquet file and return its rows +/// as byte vectors (nulls become empty). Honours `ONPAIR_BENCH_COLUMN` if set. +fn read_parquet_strings(path: &PathBuf) -> Option>> { + let file = File::open(path).ok()?; + let builder = ParquetRecordBatchReaderBuilder::try_new(file).ok()?; + let schema = builder.schema().clone(); + + let col_name = env::var("ONPAIR_BENCH_COLUMN").ok(); + let picked = match col_name.as_deref() { + Some(name) => schema.fields().iter().position(|f| f.name() == name)?, + None => { + // Pick first Utf8 / LargeUtf8 / Utf8View column. + schema.fields().iter().position(|f| { + use arrow_schema::DataType::*; + matches!(f.data_type(), Utf8 | LargeUtf8 | Utf8View) + })? 
+ } + }; + let col_field = schema.fields().get(picked).unwrap().clone(); + eprintln!( + "[onpair bench] reading column #{picked} `{}` ({})", + col_field.name(), + col_field.data_type() + ); + + let mut rows: Vec> = Vec::new(); + let reader = builder.build().ok()?; + for batch in reader.flatten() { + let arr = batch.column(picked); + use arrow_schema::DataType::*; + match arr.data_type() { + Utf8 => { + for s in arr.as_string::().iter() { + rows.push(s.unwrap_or("").as_bytes().to_vec()); + } + } + LargeUtf8 => { + for s in arr.as_string::().iter() { + rows.push(s.unwrap_or("").as_bytes().to_vec()); + } + } + Utf8View => { + for s in arr.as_string_view().iter() { + rows.push(s.unwrap_or("").as_bytes().to_vec()); + } + } + _ => return None, + } + } + Some(rows) +} + +/// Generate `n` URLs whose distribution roughly matches ClickBench's URL column: +/// * heavy prefix sharing on `https://`, `http://`, `ftp://` +/// * a handful of repeating host roots +/// * variable path / query parts +fn synthetic_clickbench_urls(n: usize) -> Vec> { + const HOSTS: &[&str] = &[ + "https://www.yandex.ru", + "https://www.google.com", + "https://news.ycombinator.com", + "https://www.example.com", + "https://docs.example.org", + "https://api.example.net", + "http://m.yandex.ru", + "https://maps.example.com", + "https://shop.example.com", + "ftp://files.example.com", + ]; + const PATHS: &[&str] = &[ + "/", + "/page", + "/news", + "/search?q=", + "/profile", + "/login", + "/api/v1/data", + "/static/asset.png", + "/blog/post-", + "/feed.xml", + "/sitemap.xml", + "/users/", + "/admin/dashboard", + "/categories/electronics", + "/cart/checkout", + ]; + const TAILS: &[&str] = &["", "alpha", "beta", "gamma", "delta", "001", "002", "003"]; + let mut out = Vec::with_capacity(n); + let mut x = 0x9E3779B97F4A7C15u64; + for _ in 0..n { + // SplitMix64-style state advance — deterministic, no rand dep. 
+ x = x.wrapping_add(0x9E3779B97F4A7C15); + let h = HOSTS[(x as usize) % HOSTS.len()]; + let p = PATHS[((x >> 16) as usize) % PATHS.len()]; + let t = TAILS[((x >> 32) as usize) % TAILS.len()]; + let n = (x >> 48) as u16; + out.push(format!("{h}{p}{t}{n}").into_bytes()); + } + out +} + +// ───────────────────────────────────────────────────────────────────────────── +// Helpers shared by the bench groups. +// ───────────────────────────────────────────────────────────────────────────── + +fn compress_column(bits: u32) -> Column { + let c = corpus(); + let cfg = OnPairTrainingConfig { + bits, + threshold: 0.5, + seed: 42, + }; + Column::compress(&c.bytes, &c.offsets, cfg).unwrap() +} + +/// Needle for the substring (`contains`) benchmarks: a word that occurs in +/// many of the synthetic corpus's host names. +fn substring_needle() -> &'static [u8] { + b"example" +} + +fn equality_needle() -> Vec { + corpus() + .rows + .get(corpus().rows.len() / 2) + .cloned() + .unwrap_or_default() +} + +fn prefix_needle() -> &'static [u8] { + b"https://" +} + +// ───────────────────────────────────────────────────────────────────────────── +// Benches. 
+// ───────────────────────────────────────────────────────────────────────────── + +#[divan::bench(args = BITS_CONFIGS)] +fn train_and_compress(bencher: Bencher, bits: u32) { + let c = corpus(); + bencher + .counter(divan::counter::BytesCount::new(c.total_bytes)) + .bench(|| { + let cfg = OnPairTrainingConfig { + bits, + threshold: 0.5, + seed: 42, + }; + Column::compress( + divan::black_box(&c.bytes), + divan::black_box(&c.offsets), + cfg, + ) + .unwrap() + }); +} + +#[divan::bench(args = BITS_CONFIGS)] +fn decompress_row_random(bencher: Bencher, bits: u32) { + let col = compress_column(bits); + let n = col.len(); + let mut buf = Vec::with_capacity(256); + let mut x = 0xC2B2AE3D27D4EB4Fu64; + bencher.bench_local(|| { + x = x.wrapping_mul(0x9E3779B97F4A7C15).wrapping_add(1); + let row = (x as usize) % n; + let _ = col.decompress_row(divan::black_box(row), &mut buf); + divan::black_box(&buf); + }); +} + +#[divan::bench(args = BITS_CONFIGS)] +fn decode_all(bencher: Bencher, bits: u32) { + let c = corpus(); + let col = compress_column(bits); + bencher + .counter(divan::counter::BytesCount::new(c.total_bytes)) + .bench(|| { + divan::black_box(col.decode_all()); + }); +} + +// ── Bitmap (decompress-then-match) predicates ───────────────────────────────── + +#[divan::bench(args = BITS_CONFIGS)] +fn equals_bitmap(bencher: Bencher, bits: u32) { + let col = compress_column(bits); + let needle = equality_needle(); + bencher + .counter(divan::counter::ItemsCount::new(col.len())) + .bench(|| divan::black_box(col.equals_bitmap(&needle))); +} + +#[divan::bench(args = BITS_CONFIGS)] +fn starts_with_bitmap(bencher: Bencher, bits: u32) { + let col = compress_column(bits); + bencher + .counter(divan::counter::ItemsCount::new(col.len())) + .bench(|| divan::black_box(col.starts_with_bitmap(prefix_needle()))); +} + +#[divan::bench(args = BITS_CONFIGS)] +fn contains_bitmap(bencher: Bencher, bits: u32) { + let col = compress_column(bits); + bencher + 
.counter(divan::counter::ItemsCount::new(col.len())) + .bench(|| divan::black_box(col.contains_bitmap(substring_needle()))); +} + +#[divan::bench(args = BITS_CONFIGS)] +fn multi_pattern_bitmap(bencher: Bencher, bits: u32) { + let col = compress_column(bits); + let needles: &[&[u8]] = &[b"example", b"yandex", b"google", b"news"]; + bencher + .counter(divan::counter::ItemsCount::new(col.len())) + .bench(|| divan::black_box(col.multi_pattern_bitmap(needles))); +} + +// ── Token-automaton (compressed-domain) predicates ──────────────────────────── + +#[divan::bench(args = BITS_CONFIGS)] +fn eq_automaton(bencher: Bencher, bits: u32) { + let col = compress_column(bits); + let dict = col.dictionary().clone(); + let needle = equality_needle(); + bencher + .counter(divan::counter::ItemsCount::new(col.len())) + .bench(|| { + let eq = EqAutomaton::new(&needle, &dict); + divan::black_box(col.scan_bitmap(eq)); + }); +} + +#[divan::bench(args = BITS_CONFIGS)] +fn prefix_automaton(bencher: Bencher, bits: u32) { + let col = compress_column(bits); + let dict = col.dictionary().clone(); + bencher + .counter(divan::counter::ItemsCount::new(col.len())) + .bench(|| { + let pa = PrefixAutomaton::new(prefix_needle(), &dict); + divan::black_box(col.scan_bitmap(pa)); + }); +} + +#[divan::bench(args = BITS_CONFIGS)] +fn kmp_automaton(bencher: Bencher, bits: u32) { + let col = compress_column(bits); + let dict = col.dictionary().clone(); + bencher + .counter(divan::counter::ItemsCount::new(col.len())) + .bench(|| { + let kmp = KmpAutomaton::new(substring_needle(), &dict); + divan::black_box(col.scan_bitmap(kmp)); + }); +} + +#[divan::bench(args = BITS_CONFIGS)] +fn ac_automaton(bencher: Bencher, bits: u32) { + let col = compress_column(bits); + let dict = col.dictionary().clone(); + let needles: &[&[u8]] = &[b"example", b"yandex", b"google", b"news"]; + bencher + .counter(divan::counter::ItemsCount::new(col.len())) + .bench(|| { + let ac = AhoCorasickAutomaton::new(needles, &dict); + 
divan::black_box(col.scan_bitmap(ac)); + }); +} + +#[divan::bench(args = BITS_CONFIGS)] +fn and_not_compressed_domain(bencher: Bencher, bits: u32) { + let col = compress_column(bits); + let dict = col.dictionary().clone(); + bencher + .counter(divan::counter::ItemsCount::new(col.len())) + .bench(|| { + let mut a = KmpAutomaton::new(b"example", &dict); + let mut b = KmpAutomaton::new(b"yandex", &dict); + divan::black_box(col.scan_bitmap(and(&mut a, not(&mut b)))); + }); +} + +// ── C++ comparison via vortex-onpair-sys ────────────────────────────────────── + +#[divan::bench(args = BITS_CONFIGS)] +fn cpp_train_and_compress(bencher: Bencher, bits: u32) { + use vortex_onpair_sys::Column as CppColumn; + use vortex_onpair_sys::OnPairTrainingConfig as CppCfg; + let c = corpus(); + bencher + .counter(divan::counter::BytesCount::new(c.total_bytes)) + .bench(|| { + let cfg = CppCfg { + bits, + threshold: 0.5, + seed: 42, + }; + CppColumn::compress(&c.bytes, &c.offsets, cfg).unwrap() + }); +} + +#[divan::bench(args = BITS_CONFIGS)] +fn cpp_contains_bitmap(bencher: Bencher, bits: u32) { + use vortex_onpair_sys::Column as CppColumn; + use vortex_onpair_sys::OnPairTrainingConfig as CppCfg; + let c = corpus(); + let cfg = CppCfg { + bits, + threshold: 0.5, + seed: 42, + }; + let col = CppColumn::compress(&c.bytes, &c.offsets, cfg).unwrap(); + bencher + .counter(divan::counter::ItemsCount::new(col.len())) + .bench(|| divan::black_box(col.contains_bitmap(substring_needle()))); +} + +fn main() { + // Touch the corpus so the source line prints before divan begins. 
+ let _ = corpus(); + divan::main(); +} diff --git a/encodings/experimental/onpair-rs/public-api.lock b/encodings/experimental/onpair-rs/public-api.lock new file mode 100644 index 00000000000..e486544a9a3 --- /dev/null +++ b/encodings/experimental/onpair-rs/public-api.lock @@ -0,0 +1,1375 @@ +pub mod vortex_onpair_rs + +pub mod vortex_onpair_rs::aho_corasick + +pub struct vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton + +impl vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::from_trie(&vortex_onpair_rs::aho_corasick::AhoCorasickTrie, &vortex_onpair_rs::dict::Dictionary) -> Self + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::new(&[&[u8]], &vortex_onpair_rs::dict::Dictionary) -> Self + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +pub struct vortex_onpair_rs::aho_corasick::AhoCorasickTrie + +impl vortex_onpair_rs::aho_corasick::AhoCorasickTrie + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::advance(&self, vortex_onpair_rs::aho_corasick::AcState, u8) -> vortex_onpair_rs::aho_corasick::AcState + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::edge_labels(&self, vortex_onpair_rs::aho_corasick::AcState) -> &[u8] + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::fail_link(&self, vortex_onpair_rs::aho_corasick::AcState) -> vortex_onpair_rs::aho_corasick::AcState + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::is_accepting(&self, vortex_onpair_rs::aho_corasick::AcState) -> bool + +pub fn 
vortex_onpair_rs::aho_corasick::AhoCorasickTrie::new(&[&[u8]]) -> Self + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::num_patterns(&self) -> usize + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::num_states(&self) -> usize + +pub type vortex_onpair_rs::aho_corasick::AcState = u16 + +pub mod vortex_onpair_rs::automaton + +pub struct vortex_onpair_rs::automaton::And(pub A, pub B) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::And + +pub fn vortex_onpair_rs::automaton::And::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::And::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::And::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::And::step(&mut self, vortex_onpair_rs::types::Token) + +pub struct vortex_onpair_rs::automaton::EqAutomaton + +impl vortex_onpair_rs::automaton::EqAutomaton + +pub fn vortex_onpair_rs::automaton::EqAutomaton::new(&[u8], &vortex_onpair_rs::dict::Dictionary) -> Self + +pub fn vortex_onpair_rs::automaton::EqAutomaton::query_length(&self) -> usize + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::EqAutomaton + +pub fn vortex_onpair_rs::automaton::EqAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::EqAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::EqAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::EqAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +pub struct vortex_onpair_rs::automaton::Negated(pub A) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::Negated + +pub fn vortex_onpair_rs::automaton::Negated::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Negated::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Negated::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::Negated::step(&mut self, vortex_onpair_rs::types::Token) + +pub struct 
vortex_onpair_rs::automaton::Or(pub A, pub B) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::Or + +pub fn vortex_onpair_rs::automaton::Or::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Or::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Or::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::Or::step(&mut self, vortex_onpair_rs::types::Token) + +pub struct vortex_onpair_rs::automaton::PrefixAutomaton + +impl vortex_onpair_rs::automaton::PrefixAutomaton + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::new(&[u8], &vortex_onpair_rs::dict::Dictionary) -> Self + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::query_length(&self) -> usize + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::PrefixAutomaton + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +pub trait vortex_onpair_rs::automaton::TokenAutomaton + +pub fn vortex_onpair_rs::automaton::TokenAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::TokenAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::TokenAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::TokenAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::reset(&mut self) + +pub fn 
vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::EqAutomaton + +pub fn vortex_onpair_rs::automaton::EqAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::EqAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::EqAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::EqAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::PrefixAutomaton + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::kmp::KmpAutomaton + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for &mut A + +pub fn &mut A::is_accepted(&self) -> bool + +pub fn &mut A::is_dead(&self) -> bool + +pub fn &mut A::reset(&mut self) + +pub fn &mut A::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::And + +pub fn vortex_onpair_rs::automaton::And::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::And::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::And::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::And::step(&mut self, vortex_onpair_rs::types::Token) + +impl 
vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::Or + +pub fn vortex_onpair_rs::automaton::Or::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Or::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Or::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::Or::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::Negated + +pub fn vortex_onpair_rs::automaton::Negated::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Negated::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Negated::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::Negated::step(&mut self, vortex_onpair_rs::types::Token) + +pub fn vortex_onpair_rs::automaton::and(A, B) -> vortex_onpair_rs::automaton::And + +pub fn vortex_onpair_rs::automaton::not(A) -> vortex_onpair_rs::automaton::Negated + +pub fn vortex_onpair_rs::automaton::or(A, B) -> vortex_onpair_rs::automaton::Or + +pub mod vortex_onpair_rs::bits + +pub struct vortex_onpair_rs::bits::BitWriter<'a> + +impl<'a> vortex_onpair_rs::bits::BitWriter<'a> + +pub fn vortex_onpair_rs::bits::BitWriter<'a>::flush(&mut self) + +pub fn vortex_onpair_rs::bits::BitWriter<'a>::new(&'a mut vortex_onpair_rs::store::Store) -> Self + +pub fn vortex_onpair_rs::bits::BitWriter<'a>::tokens_written(&self) -> usize + +pub fn vortex_onpair_rs::bits::BitWriter<'a>::write(&mut self, vortex_onpair_rs::types::Token) + +impl core::ops::drop::Drop for vortex_onpair_rs::bits::BitWriter<'_> + +pub fn vortex_onpair_rs::bits::BitWriter<'_>::drop(&mut self) + +pub struct vortex_onpair_rs::bits::TokenCursor<'a, const BITS: u32> + +impl<'a, const BITS: u32> vortex_onpair_rs::bits::TokenCursor<'a, BITS> + +pub fn vortex_onpair_rs::bits::TokenCursor<'a, BITS>::has_more(&self) -> bool + +pub fn vortex_onpair_rs::bits::TokenCursor<'a, BITS>::new(&'a [u64], vortex_onpair_rs::types::StreamSpan) -> Self + +pub fn 
vortex_onpair_rs::bits::TokenCursor<'a, BITS>::new_unbound(&'a [u64]) -> Self + +pub fn vortex_onpair_rs::bits::TokenCursor<'a, BITS>::next(&mut self) -> vortex_onpair_rs::types::Token + +pub fn vortex_onpair_rs::bits::TokenCursor<'a, BITS>::remaining(&self) -> u32 + +pub fn vortex_onpair_rs::bits::TokenCursor<'a, BITS>::reset_to(&mut self, vortex_onpair_rs::types::StreamSpan) + +pub fn vortex_onpair_rs::bits::read_bits_lsb(&[u64], usize, u32) -> u16 + +pub fn vortex_onpair_rs::bits::unpack_codes_to_u16(&[u64], usize, u32) -> alloc::vec::Vec + +pub mod vortex_onpair_rs::column + +pub struct vortex_onpair_rs::column::Column + +impl vortex_onpair_rs::column::Column + +pub fn vortex_onpair_rs::column::Column::bits(&self) -> u32 + +pub fn vortex_onpair_rs::column::Column::compress(&[u8], &[u64], vortex_onpair_rs::config::OnPairTrainingConfig) -> core::result::Result + +pub fn vortex_onpair_rs::column::Column::contains_bitmap(&self, &[u8]) -> alloc::vec::Vec + +pub fn vortex_onpair_rs::column::Column::decode_all(&self) -> (alloc::vec::Vec, alloc::vec::Vec) + +pub fn vortex_onpair_rs::column::Column::decompress_row(&self, usize, &mut alloc::vec::Vec) -> core::result::Result<(), vortex_onpair_rs::config::Error> + +pub fn vortex_onpair_rs::column::Column::dict_size(&self) -> usize + +pub fn vortex_onpair_rs::column::Column::dictionary(&self) -> &vortex_onpair_rs::dict::Dictionary + +pub fn vortex_onpair_rs::column::Column::equals_bitmap(&self, &[u8]) -> alloc::vec::Vec + +pub fn vortex_onpair_rs::column::Column::is_empty(&self) -> bool + +pub fn vortex_onpair_rs::column::Column::len(&self) -> usize + +pub fn vortex_onpair_rs::column::Column::multi_pattern_bitmap(&self, &[&[u8]]) -> alloc::vec::Vec + +pub fn vortex_onpair_rs::column::Column::parts(&self) -> core::result::Result, vortex_onpair_rs::config::Error> + +pub fn vortex_onpair_rs::column::Column::scan(&self, A) -> alloc::vec::Vec + +pub fn vortex_onpair_rs::column::Column::scan_bitmap(&self, A) -> alloc::vec::Vec + 
+pub fn vortex_onpair_rs::column::Column::scan_with(&self, A, F) + +pub fn vortex_onpair_rs::column::Column::starts_with_bitmap(&self, &[u8]) -> alloc::vec::Vec + +impl core::clone::Clone for vortex_onpair_rs::column::Column + +pub fn vortex_onpair_rs::column::Column::clone(&self) -> vortex_onpair_rs::column::Column + +impl core::fmt::Debug for vortex_onpair_rs::column::Column + +pub fn vortex_onpair_rs::column::Column::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_onpair_rs::column::Parts<'a> + +pub vortex_onpair_rs::column::Parts::bits: u32 + +pub vortex_onpair_rs::column::Parts::codes_boundaries: &'a [u32] + +pub vortex_onpair_rs::column::Parts::codes_packed: &'a [u64] + +pub vortex_onpair_rs::column::Parts::dict_bytes: &'a [u8] + +pub vortex_onpair_rs::column::Parts::dict_offsets: &'a [u32] + +pub vortex_onpair_rs::column::Parts::num_rows: usize + +impl<'a> core::clone::Clone for vortex_onpair_rs::column::Parts<'a> + +pub fn vortex_onpair_rs::column::Parts<'a>::clone(&self) -> vortex_onpair_rs::column::Parts<'a> + +impl<'a> core::marker::Copy for vortex_onpair_rs::column::Parts<'a> + +pub mod vortex_onpair_rs::config + +pub enum vortex_onpair_rs::config::Error + +pub vortex_onpair_rs::config::Error::BadFormat + +pub vortex_onpair_rs::config::Error::Internal + +pub vortex_onpair_rs::config::Error::InvalidArg + +pub vortex_onpair_rs::config::Error::Oom + +pub vortex_onpair_rs::config::Error::OutOfRange + +impl core::clone::Clone for vortex_onpair_rs::config::Error + +pub fn vortex_onpair_rs::config::Error::clone(&self) -> vortex_onpair_rs::config::Error + +impl core::cmp::Eq for vortex_onpair_rs::config::Error + +impl core::cmp::PartialEq for vortex_onpair_rs::config::Error + +pub fn vortex_onpair_rs::config::Error::eq(&self, &vortex_onpair_rs::config::Error) -> bool + +impl core::error::Error for vortex_onpair_rs::config::Error + +impl core::fmt::Debug for vortex_onpair_rs::config::Error + +pub fn 
vortex_onpair_rs::config::Error::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_onpair_rs::config::Error + +pub fn vortex_onpair_rs::config::Error::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::config::Error + +impl core::marker::StructuralPartialEq for vortex_onpair_rs::config::Error + +pub enum vortex_onpair_rs::config::ThresholdSpec + +pub vortex_onpair_rs::config::ThresholdSpec::Dynamic(vortex_onpair_rs::config::DynamicThreshold) + +pub vortex_onpair_rs::config::ThresholdSpec::Fixed(vortex_onpair_rs::config::FixedThreshold) + +impl core::clone::Clone for vortex_onpair_rs::config::ThresholdSpec + +pub fn vortex_onpair_rs::config::ThresholdSpec::clone(&self) -> vortex_onpair_rs::config::ThresholdSpec + +impl core::default::Default for vortex_onpair_rs::config::ThresholdSpec + +pub fn vortex_onpair_rs::config::ThresholdSpec::default() -> Self + +impl core::fmt::Debug for vortex_onpair_rs::config::ThresholdSpec + +pub fn vortex_onpair_rs::config::ThresholdSpec::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::config::ThresholdSpec + +pub struct vortex_onpair_rs::config::DynamicThreshold + +pub vortex_onpair_rs::config::DynamicThreshold::sample_fraction: f64 + +impl core::clone::Clone for vortex_onpair_rs::config::DynamicThreshold + +pub fn vortex_onpair_rs::config::DynamicThreshold::clone(&self) -> vortex_onpair_rs::config::DynamicThreshold + +impl core::cmp::PartialEq for vortex_onpair_rs::config::DynamicThreshold + +pub fn vortex_onpair_rs::config::DynamicThreshold::eq(&self, &vortex_onpair_rs::config::DynamicThreshold) -> bool + +impl core::default::Default for vortex_onpair_rs::config::DynamicThreshold + +pub fn vortex_onpair_rs::config::DynamicThreshold::default() -> Self + +impl core::fmt::Debug for vortex_onpair_rs::config::DynamicThreshold + +pub fn 
vortex_onpair_rs::config::DynamicThreshold::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::config::DynamicThreshold + +impl core::marker::StructuralPartialEq for vortex_onpair_rs::config::DynamicThreshold + +pub struct vortex_onpair_rs::config::FixedThreshold + +pub vortex_onpair_rs::config::FixedThreshold::value: u8 + +impl core::clone::Clone for vortex_onpair_rs::config::FixedThreshold + +pub fn vortex_onpair_rs::config::FixedThreshold::clone(&self) -> vortex_onpair_rs::config::FixedThreshold + +impl core::cmp::Eq for vortex_onpair_rs::config::FixedThreshold + +impl core::cmp::PartialEq for vortex_onpair_rs::config::FixedThreshold + +pub fn vortex_onpair_rs::config::FixedThreshold::eq(&self, &vortex_onpair_rs::config::FixedThreshold) -> bool + +impl core::fmt::Debug for vortex_onpair_rs::config::FixedThreshold + +pub fn vortex_onpair_rs::config::FixedThreshold::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::config::FixedThreshold + +impl core::marker::StructuralPartialEq for vortex_onpair_rs::config::FixedThreshold + +#[repr(C)] pub struct vortex_onpair_rs::config::OnPairTrainingConfig + +pub vortex_onpair_rs::config::OnPairTrainingConfig::bits: u32 + +pub vortex_onpair_rs::config::OnPairTrainingConfig::seed: u64 + +pub vortex_onpair_rs::config::OnPairTrainingConfig::threshold: f64 + +impl core::clone::Clone for vortex_onpair_rs::config::OnPairTrainingConfig + +pub fn vortex_onpair_rs::config::OnPairTrainingConfig::clone(&self) -> vortex_onpair_rs::config::OnPairTrainingConfig + +impl core::convert::From for vortex_onpair_rs::config::TrainingConfig + +pub fn vortex_onpair_rs::config::TrainingConfig::from(vortex_onpair_rs::config::OnPairTrainingConfig) -> Self + +impl core::fmt::Debug for vortex_onpair_rs::config::OnPairTrainingConfig + +pub fn vortex_onpair_rs::config::OnPairTrainingConfig::fmt(&self, &mut core::fmt::Formatter<'_>) -> 
core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::config::OnPairTrainingConfig + +pub struct vortex_onpair_rs::config::TrainingConfig + +pub vortex_onpair_rs::config::TrainingConfig::bits: vortex_onpair_rs::types::BitWidth + +pub vortex_onpair_rs::config::TrainingConfig::seed: core::option::Option + +pub vortex_onpair_rs::config::TrainingConfig::threshold: vortex_onpair_rs::config::ThresholdSpec + +impl core::clone::Clone for vortex_onpair_rs::config::TrainingConfig + +pub fn vortex_onpair_rs::config::TrainingConfig::clone(&self) -> vortex_onpair_rs::config::TrainingConfig + +impl core::convert::From for vortex_onpair_rs::config::TrainingConfig + +pub fn vortex_onpair_rs::config::TrainingConfig::from(vortex_onpair_rs::config::OnPairTrainingConfig) -> Self + +impl core::default::Default for vortex_onpair_rs::config::TrainingConfig + +pub fn vortex_onpair_rs::config::TrainingConfig::default() -> Self + +impl core::fmt::Debug for vortex_onpair_rs::config::TrainingConfig + +pub fn vortex_onpair_rs::config::TrainingConfig::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub const vortex_onpair_rs::config::DEFAULT_DICT12_CONFIG: vortex_onpair_rs::config::OnPairTrainingConfig + +pub mod vortex_onpair_rs::dict + +pub struct vortex_onpair_rs::dict::Dictionary + +pub vortex_onpair_rs::dict::Dictionary::bytes: alloc::vec::Vec + +pub vortex_onpair_rs::dict::Dictionary::offsets: alloc::vec::Vec + +impl vortex_onpair_rs::dict::Dictionary + +pub fn vortex_onpair_rs::dict::Dictionary::bytes_used(&self) -> usize + +pub fn vortex_onpair_rs::dict::Dictionary::data(&self, vortex_onpair_rs::types::Token) -> &[u8] + +pub fn vortex_onpair_rs::dict::Dictionary::num_tokens(&self) -> usize + +pub fn vortex_onpair_rs::dict::Dictionary::pad_for_decoder(&mut self) + +pub fn vortex_onpair_rs::dict::Dictionary::prefix_range(&self, &[u8]) -> vortex_onpair_rs::types::TokenRange + +pub fn vortex_onpair_rs::dict::Dictionary::span(&self, 
vortex_onpair_rs::types::Token) -> vortex_onpair_rs::types::ByteSpan + +pub fn vortex_onpair_rs::dict::Dictionary::token_size(&self, vortex_onpair_rs::types::Token) -> usize + +impl core::clone::Clone for vortex_onpair_rs::dict::Dictionary + +pub fn vortex_onpair_rs::dict::Dictionary::clone(&self) -> vortex_onpair_rs::dict::Dictionary + +impl core::default::Default for vortex_onpair_rs::dict::Dictionary + +pub fn vortex_onpair_rs::dict::Dictionary::default() -> vortex_onpair_rs::dict::Dictionary + +impl core::fmt::Debug for vortex_onpair_rs::dict::Dictionary + +pub fn vortex_onpair_rs::dict::Dictionary::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub mod vortex_onpair_rs::kmp + +pub struct vortex_onpair_rs::kmp::KmpAutomaton + +impl vortex_onpair_rs::kmp::KmpAutomaton + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::new(&[u8], &vortex_onpair_rs::dict::Dictionary) -> Self + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::pattern_length(&self) -> usize + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::sparse_range_count(&self) -> usize + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::kmp::KmpAutomaton + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +pub mod vortex_onpair_rs::lpm + +pub struct vortex_onpair_rs::lpm::LongestPrefixMatcher + +impl vortex_onpair_rs::lpm::LongestPrefixMatcher + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::find_longest_match(&self, &[u8]) -> (vortex_onpair_rs::types::Token, usize) + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::from_dictionary(&vortex_onpair_rs::dict::Dictionary) -> Self + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::insert(&mut self, &[u8]) -> vortex_onpair_rs::types::Token + +pub fn 
vortex_onpair_rs::lpm::LongestPrefixMatcher::new() -> Self + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::size(&self) -> usize + +impl core::clone::Clone for vortex_onpair_rs::lpm::LongestPrefixMatcher + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::clone(&self) -> vortex_onpair_rs::lpm::LongestPrefixMatcher + +impl core::default::Default for vortex_onpair_rs::lpm::LongestPrefixMatcher + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::default() -> vortex_onpair_rs::lpm::LongestPrefixMatcher + +impl core::fmt::Debug for vortex_onpair_rs::lpm::LongestPrefixMatcher + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub mod vortex_onpair_rs::parser + +pub fn vortex_onpair_rs::parser::parse(&[u8], &[u32], usize, &vortex_onpair_rs::lpm::LongestPrefixMatcher, vortex_onpair_rs::types::BitWidth, &mut vortex_onpair_rs::store::Store) + +pub mod vortex_onpair_rs::store + +pub struct vortex_onpair_rs::store::Store + +pub vortex_onpair_rs::store::Store::bit_width: vortex_onpair_rs::types::BitWidth + +pub vortex_onpair_rs::store::Store::boundaries: alloc::vec::Vec + +pub vortex_onpair_rs::store::Store::packed: alloc::vec::Vec + +impl vortex_onpair_rs::store::Store + +pub fn vortex_onpair_rs::store::Store::bytes_used(&self) -> usize + +pub fn vortex_onpair_rs::store::Store::num_strings(&self) -> usize + +pub fn vortex_onpair_rs::store::Store::num_tokens(&self) -> usize + +pub fn vortex_onpair_rs::store::Store::string_span(&self, usize) -> vortex_onpair_rs::types::StreamSpan + +impl core::clone::Clone for vortex_onpair_rs::store::Store + +pub fn vortex_onpair_rs::store::Store::clone(&self) -> vortex_onpair_rs::store::Store + +impl core::default::Default for vortex_onpair_rs::store::Store + +pub fn vortex_onpair_rs::store::Store::default() -> vortex_onpair_rs::store::Store + +impl core::fmt::Debug for vortex_onpair_rs::store::Store + +pub fn vortex_onpair_rs::store::Store::fmt(&self, &mut 
core::fmt::Formatter<'_>) -> core::fmt::Result + +pub mod vortex_onpair_rs::tokenize + +pub fn vortex_onpair_rs::tokenize::tokenize(&[u8], &vortex_onpair_rs::dict::Dictionary) -> alloc::vec::Vec + +pub fn vortex_onpair_rs::tokenize::tokenize_with(&[u8], &vortex_onpair_rs::lpm::LongestPrefixMatcher) -> alloc::vec::Vec + +pub mod vortex_onpair_rs::trainer + +pub struct vortex_onpair_rs::trainer::TrainResult + +pub vortex_onpair_rs::trainer::TrainResult::dict: vortex_onpair_rs::dict::Dictionary + +pub vortex_onpair_rs::trainer::TrainResult::lpm: vortex_onpair_rs::lpm::LongestPrefixMatcher + +impl core::clone::Clone for vortex_onpair_rs::trainer::TrainResult + +pub fn vortex_onpair_rs::trainer::TrainResult::clone(&self) -> vortex_onpair_rs::trainer::TrainResult + +impl core::fmt::Debug for vortex_onpair_rs::trainer::TrainResult + +pub fn vortex_onpair_rs::trainer::TrainResult::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub fn vortex_onpair_rs::trainer::train(&[u8], &[u32], usize, &vortex_onpair_rs::config::TrainingConfig) -> vortex_onpair_rs::trainer::TrainResult + +pub mod vortex_onpair_rs::types + +pub struct vortex_onpair_rs::types::ByteSpan + +pub vortex_onpair_rs::types::ByteSpan::begin: u32 + +pub vortex_onpair_rs::types::ByteSpan::end: u32 + +impl vortex_onpair_rs::types::ByteSpan + +pub const fn vortex_onpair_rs::types::ByteSpan::size(self) -> u32 + +impl core::clone::Clone for vortex_onpair_rs::types::ByteSpan + +pub fn vortex_onpair_rs::types::ByteSpan::clone(&self) -> vortex_onpair_rs::types::ByteSpan + +impl core::cmp::Eq for vortex_onpair_rs::types::ByteSpan + +impl core::cmp::PartialEq for vortex_onpair_rs::types::ByteSpan + +pub fn vortex_onpair_rs::types::ByteSpan::eq(&self, &vortex_onpair_rs::types::ByteSpan) -> bool + +impl core::fmt::Debug for vortex_onpair_rs::types::ByteSpan + +pub fn vortex_onpair_rs::types::ByteSpan::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for 
vortex_onpair_rs::types::ByteSpan + +impl core::marker::StructuralPartialEq for vortex_onpair_rs::types::ByteSpan + +pub struct vortex_onpair_rs::types::StreamSpan + +pub vortex_onpair_rs::types::StreamSpan::begin: u32 + +pub vortex_onpair_rs::types::StreamSpan::end: u32 + +impl vortex_onpair_rs::types::StreamSpan + +pub const fn vortex_onpair_rs::types::StreamSpan::size(self) -> u32 + +impl core::clone::Clone for vortex_onpair_rs::types::StreamSpan + +pub fn vortex_onpair_rs::types::StreamSpan::clone(&self) -> vortex_onpair_rs::types::StreamSpan + +impl core::cmp::Eq for vortex_onpair_rs::types::StreamSpan + +impl core::cmp::PartialEq for vortex_onpair_rs::types::StreamSpan + +pub fn vortex_onpair_rs::types::StreamSpan::eq(&self, &vortex_onpair_rs::types::StreamSpan) -> bool + +impl core::fmt::Debug for vortex_onpair_rs::types::StreamSpan + +pub fn vortex_onpair_rs::types::StreamSpan::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::types::StreamSpan + +impl core::marker::StructuralPartialEq for vortex_onpair_rs::types::StreamSpan + +pub struct vortex_onpair_rs::types::TokenRange + +pub vortex_onpair_rs::types::TokenRange::begin: vortex_onpair_rs::types::Token + +pub vortex_onpair_rs::types::TokenRange::last: vortex_onpair_rs::types::Token + +impl vortex_onpair_rs::types::TokenRange + +pub const fn vortex_onpair_rs::types::TokenRange::contains(self, vortex_onpair_rs::types::Token) -> bool + +pub const fn vortex_onpair_rs::types::TokenRange::empty(self) -> bool + +pub const fn vortex_onpair_rs::types::TokenRange::size(self) -> u32 + +impl core::clone::Clone for vortex_onpair_rs::types::TokenRange + +pub fn vortex_onpair_rs::types::TokenRange::clone(&self) -> vortex_onpair_rs::types::TokenRange + +impl core::cmp::Eq for vortex_onpair_rs::types::TokenRange + +impl core::cmp::PartialEq for vortex_onpair_rs::types::TokenRange + +pub fn vortex_onpair_rs::types::TokenRange::eq(&self, 
&vortex_onpair_rs::types::TokenRange) -> bool + +impl core::default::Default for vortex_onpair_rs::types::TokenRange + +pub fn vortex_onpair_rs::types::TokenRange::default() -> Self + +impl core::fmt::Debug for vortex_onpair_rs::types::TokenRange + +pub fn vortex_onpair_rs::types::TokenRange::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::types::TokenRange + +impl core::marker::StructuralPartialEq for vortex_onpair_rs::types::TokenRange + +pub const vortex_onpair_rs::types::MAX_TOKEN_SIZE: usize + +pub const fn vortex_onpair_rs::types::is_valid_bits(vortex_onpair_rs::types::BitWidth) -> bool + +pub const fn vortex_onpair_rs::types::max_dict_size(vortex_onpair_rs::types::BitWidth) -> usize + +pub type vortex_onpair_rs::types::BitWidth = u8 + +pub type vortex_onpair_rs::types::Token = u16 + +pub macro vortex_onpair_rs::dispatch_bits! + +pub enum vortex_onpair_rs::Error + +pub vortex_onpair_rs::Error::BadFormat + +pub vortex_onpair_rs::Error::Internal + +pub vortex_onpair_rs::Error::InvalidArg + +pub vortex_onpair_rs::Error::Oom + +pub vortex_onpair_rs::Error::OutOfRange + +impl core::clone::Clone for vortex_onpair_rs::config::Error + +pub fn vortex_onpair_rs::config::Error::clone(&self) -> vortex_onpair_rs::config::Error + +impl core::cmp::Eq for vortex_onpair_rs::config::Error + +impl core::cmp::PartialEq for vortex_onpair_rs::config::Error + +pub fn vortex_onpair_rs::config::Error::eq(&self, &vortex_onpair_rs::config::Error) -> bool + +impl core::error::Error for vortex_onpair_rs::config::Error + +impl core::fmt::Debug for vortex_onpair_rs::config::Error + +pub fn vortex_onpair_rs::config::Error::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_onpair_rs::config::Error + +pub fn vortex_onpair_rs::config::Error::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::config::Error + +impl 
core::marker::StructuralPartialEq for vortex_onpair_rs::config::Error + +pub enum vortex_onpair_rs::ThresholdSpec + +pub vortex_onpair_rs::ThresholdSpec::Dynamic(vortex_onpair_rs::config::DynamicThreshold) + +pub vortex_onpair_rs::ThresholdSpec::Fixed(vortex_onpair_rs::config::FixedThreshold) + +impl core::clone::Clone for vortex_onpair_rs::config::ThresholdSpec + +pub fn vortex_onpair_rs::config::ThresholdSpec::clone(&self) -> vortex_onpair_rs::config::ThresholdSpec + +impl core::default::Default for vortex_onpair_rs::config::ThresholdSpec + +pub fn vortex_onpair_rs::config::ThresholdSpec::default() -> Self + +impl core::fmt::Debug for vortex_onpair_rs::config::ThresholdSpec + +pub fn vortex_onpair_rs::config::ThresholdSpec::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::config::ThresholdSpec + +pub struct vortex_onpair_rs::AhoCorasickAutomaton + +impl vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::from_trie(&vortex_onpair_rs::aho_corasick::AhoCorasickTrie, &vortex_onpair_rs::dict::Dictionary) -> Self + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::new(&[&[u8]], &vortex_onpair_rs::dict::Dictionary) -> Self + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +pub struct vortex_onpair_rs::AhoCorasickTrie + +impl vortex_onpair_rs::aho_corasick::AhoCorasickTrie + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::advance(&self, vortex_onpair_rs::aho_corasick::AcState, u8) -> 
vortex_onpair_rs::aho_corasick::AcState + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::edge_labels(&self, vortex_onpair_rs::aho_corasick::AcState) -> &[u8] + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::fail_link(&self, vortex_onpair_rs::aho_corasick::AcState) -> vortex_onpair_rs::aho_corasick::AcState + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::is_accepting(&self, vortex_onpair_rs::aho_corasick::AcState) -> bool + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::new(&[&[u8]]) -> Self + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::num_patterns(&self) -> usize + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickTrie::num_states(&self) -> usize + +pub struct vortex_onpair_rs::And(pub A, pub B) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::And + +pub fn vortex_onpair_rs::automaton::And::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::And::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::And::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::And::step(&mut self, vortex_onpair_rs::types::Token) + +pub struct vortex_onpair_rs::ByteSpan + +pub vortex_onpair_rs::ByteSpan::begin: u32 + +pub vortex_onpair_rs::ByteSpan::end: u32 + +impl vortex_onpair_rs::types::ByteSpan + +pub const fn vortex_onpair_rs::types::ByteSpan::size(self) -> u32 + +impl core::clone::Clone for vortex_onpair_rs::types::ByteSpan + +pub fn vortex_onpair_rs::types::ByteSpan::clone(&self) -> vortex_onpair_rs::types::ByteSpan + +impl core::cmp::Eq for vortex_onpair_rs::types::ByteSpan + +impl core::cmp::PartialEq for vortex_onpair_rs::types::ByteSpan + +pub fn vortex_onpair_rs::types::ByteSpan::eq(&self, &vortex_onpair_rs::types::ByteSpan) -> bool + +impl core::fmt::Debug for vortex_onpair_rs::types::ByteSpan + +pub fn vortex_onpair_rs::types::ByteSpan::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for 
vortex_onpair_rs::types::ByteSpan + +impl core::marker::StructuralPartialEq for vortex_onpair_rs::types::ByteSpan + +pub struct vortex_onpair_rs::Column + +impl vortex_onpair_rs::column::Column + +pub fn vortex_onpair_rs::column::Column::bits(&self) -> u32 + +pub fn vortex_onpair_rs::column::Column::compress(&[u8], &[u64], vortex_onpair_rs::config::OnPairTrainingConfig) -> core::result::Result + +pub fn vortex_onpair_rs::column::Column::contains_bitmap(&self, &[u8]) -> alloc::vec::Vec + +pub fn vortex_onpair_rs::column::Column::decode_all(&self) -> (alloc::vec::Vec, alloc::vec::Vec) + +pub fn vortex_onpair_rs::column::Column::decompress_row(&self, usize, &mut alloc::vec::Vec) -> core::result::Result<(), vortex_onpair_rs::config::Error> + +pub fn vortex_onpair_rs::column::Column::dict_size(&self) -> usize + +pub fn vortex_onpair_rs::column::Column::dictionary(&self) -> &vortex_onpair_rs::dict::Dictionary + +pub fn vortex_onpair_rs::column::Column::equals_bitmap(&self, &[u8]) -> alloc::vec::Vec + +pub fn vortex_onpair_rs::column::Column::is_empty(&self) -> bool + +pub fn vortex_onpair_rs::column::Column::len(&self) -> usize + +pub fn vortex_onpair_rs::column::Column::multi_pattern_bitmap(&self, &[&[u8]]) -> alloc::vec::Vec + +pub fn vortex_onpair_rs::column::Column::parts(&self) -> core::result::Result, vortex_onpair_rs::config::Error> + +pub fn vortex_onpair_rs::column::Column::scan(&self, A) -> alloc::vec::Vec + +pub fn vortex_onpair_rs::column::Column::scan_bitmap(&self, A) -> alloc::vec::Vec + +pub fn vortex_onpair_rs::column::Column::scan_with(&self, A, F) + +pub fn vortex_onpair_rs::column::Column::starts_with_bitmap(&self, &[u8]) -> alloc::vec::Vec + +impl core::clone::Clone for vortex_onpair_rs::column::Column + +pub fn vortex_onpair_rs::column::Column::clone(&self) -> vortex_onpair_rs::column::Column + +impl core::fmt::Debug for vortex_onpair_rs::column::Column + +pub fn vortex_onpair_rs::column::Column::fmt(&self, &mut core::fmt::Formatter<'_>) -> 
core::fmt::Result + +pub struct vortex_onpair_rs::Dictionary + +pub vortex_onpair_rs::Dictionary::bytes: alloc::vec::Vec + +pub vortex_onpair_rs::Dictionary::offsets: alloc::vec::Vec + +impl vortex_onpair_rs::dict::Dictionary + +pub fn vortex_onpair_rs::dict::Dictionary::bytes_used(&self) -> usize + +pub fn vortex_onpair_rs::dict::Dictionary::data(&self, vortex_onpair_rs::types::Token) -> &[u8] + +pub fn vortex_onpair_rs::dict::Dictionary::num_tokens(&self) -> usize + +pub fn vortex_onpair_rs::dict::Dictionary::pad_for_decoder(&mut self) + +pub fn vortex_onpair_rs::dict::Dictionary::prefix_range(&self, &[u8]) -> vortex_onpair_rs::types::TokenRange + +pub fn vortex_onpair_rs::dict::Dictionary::span(&self, vortex_onpair_rs::types::Token) -> vortex_onpair_rs::types::ByteSpan + +pub fn vortex_onpair_rs::dict::Dictionary::token_size(&self, vortex_onpair_rs::types::Token) -> usize + +impl core::clone::Clone for vortex_onpair_rs::dict::Dictionary + +pub fn vortex_onpair_rs::dict::Dictionary::clone(&self) -> vortex_onpair_rs::dict::Dictionary + +impl core::default::Default for vortex_onpair_rs::dict::Dictionary + +pub fn vortex_onpair_rs::dict::Dictionary::default() -> vortex_onpair_rs::dict::Dictionary + +impl core::fmt::Debug for vortex_onpair_rs::dict::Dictionary + +pub fn vortex_onpair_rs::dict::Dictionary::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_onpair_rs::DynamicThreshold + +pub vortex_onpair_rs::DynamicThreshold::sample_fraction: f64 + +impl core::clone::Clone for vortex_onpair_rs::config::DynamicThreshold + +pub fn vortex_onpair_rs::config::DynamicThreshold::clone(&self) -> vortex_onpair_rs::config::DynamicThreshold + +impl core::cmp::PartialEq for vortex_onpair_rs::config::DynamicThreshold + +pub fn vortex_onpair_rs::config::DynamicThreshold::eq(&self, &vortex_onpair_rs::config::DynamicThreshold) -> bool + +impl core::default::Default for vortex_onpair_rs::config::DynamicThreshold + +pub fn 
vortex_onpair_rs::config::DynamicThreshold::default() -> Self + +impl core::fmt::Debug for vortex_onpair_rs::config::DynamicThreshold + +pub fn vortex_onpair_rs::config::DynamicThreshold::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::config::DynamicThreshold + +impl core::marker::StructuralPartialEq for vortex_onpair_rs::config::DynamicThreshold + +pub struct vortex_onpair_rs::EqAutomaton + +impl vortex_onpair_rs::automaton::EqAutomaton + +pub fn vortex_onpair_rs::automaton::EqAutomaton::new(&[u8], &vortex_onpair_rs::dict::Dictionary) -> Self + +pub fn vortex_onpair_rs::automaton::EqAutomaton::query_length(&self) -> usize + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::EqAutomaton + +pub fn vortex_onpair_rs::automaton::EqAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::EqAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::EqAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::EqAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +pub struct vortex_onpair_rs::FixedThreshold + +pub vortex_onpair_rs::FixedThreshold::value: u8 + +impl core::clone::Clone for vortex_onpair_rs::config::FixedThreshold + +pub fn vortex_onpair_rs::config::FixedThreshold::clone(&self) -> vortex_onpair_rs::config::FixedThreshold + +impl core::cmp::Eq for vortex_onpair_rs::config::FixedThreshold + +impl core::cmp::PartialEq for vortex_onpair_rs::config::FixedThreshold + +pub fn vortex_onpair_rs::config::FixedThreshold::eq(&self, &vortex_onpair_rs::config::FixedThreshold) -> bool + +impl core::fmt::Debug for vortex_onpair_rs::config::FixedThreshold + +pub fn vortex_onpair_rs::config::FixedThreshold::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::config::FixedThreshold + +impl core::marker::StructuralPartialEq for vortex_onpair_rs::config::FixedThreshold + +pub 
struct vortex_onpair_rs::KmpAutomaton + +impl vortex_onpair_rs::kmp::KmpAutomaton + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::new(&[u8], &vortex_onpair_rs::dict::Dictionary) -> Self + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::pattern_length(&self) -> usize + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::sparse_range_count(&self) -> usize + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::kmp::KmpAutomaton + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +pub struct vortex_onpair_rs::LongestPrefixMatcher + +impl vortex_onpair_rs::lpm::LongestPrefixMatcher + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::find_longest_match(&self, &[u8]) -> (vortex_onpair_rs::types::Token, usize) + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::from_dictionary(&vortex_onpair_rs::dict::Dictionary) -> Self + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::insert(&mut self, &[u8]) -> vortex_onpair_rs::types::Token + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::new() -> Self + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::size(&self) -> usize + +impl core::clone::Clone for vortex_onpair_rs::lpm::LongestPrefixMatcher + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::clone(&self) -> vortex_onpair_rs::lpm::LongestPrefixMatcher + +impl core::default::Default for vortex_onpair_rs::lpm::LongestPrefixMatcher + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::default() -> vortex_onpair_rs::lpm::LongestPrefixMatcher + +impl core::fmt::Debug for vortex_onpair_rs::lpm::LongestPrefixMatcher + +pub fn vortex_onpair_rs::lpm::LongestPrefixMatcher::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_onpair_rs::Negated(pub A) + +impl 
vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::Negated + +pub fn vortex_onpair_rs::automaton::Negated::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Negated::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Negated::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::Negated::step(&mut self, vortex_onpair_rs::types::Token) + +#[repr(C)] pub struct vortex_onpair_rs::OnPairTrainingConfig + +pub vortex_onpair_rs::OnPairTrainingConfig::bits: u32 + +pub vortex_onpair_rs::OnPairTrainingConfig::seed: u64 + +pub vortex_onpair_rs::OnPairTrainingConfig::threshold: f64 + +impl core::clone::Clone for vortex_onpair_rs::config::OnPairTrainingConfig + +pub fn vortex_onpair_rs::config::OnPairTrainingConfig::clone(&self) -> vortex_onpair_rs::config::OnPairTrainingConfig + +impl core::convert::From for vortex_onpair_rs::config::TrainingConfig + +pub fn vortex_onpair_rs::config::TrainingConfig::from(vortex_onpair_rs::config::OnPairTrainingConfig) -> Self + +impl core::fmt::Debug for vortex_onpair_rs::config::OnPairTrainingConfig + +pub fn vortex_onpair_rs::config::OnPairTrainingConfig::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::config::OnPairTrainingConfig + +pub struct vortex_onpair_rs::Or(pub A, pub B) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::Or + +pub fn vortex_onpair_rs::automaton::Or::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Or::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Or::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::Or::step(&mut self, vortex_onpair_rs::types::Token) + +pub struct vortex_onpair_rs::Parts<'a> + +pub vortex_onpair_rs::Parts::bits: u32 + +pub vortex_onpair_rs::Parts::codes_boundaries: &'a [u32] + +pub vortex_onpair_rs::Parts::codes_packed: &'a [u64] + +pub vortex_onpair_rs::Parts::dict_bytes: &'a [u8] + +pub 
vortex_onpair_rs::Parts::dict_offsets: &'a [u32] + +pub vortex_onpair_rs::Parts::num_rows: usize + +impl<'a> core::clone::Clone for vortex_onpair_rs::column::Parts<'a> + +pub fn vortex_onpair_rs::column::Parts<'a>::clone(&self) -> vortex_onpair_rs::column::Parts<'a> + +impl<'a> core::marker::Copy for vortex_onpair_rs::column::Parts<'a> + +pub struct vortex_onpair_rs::PrefixAutomaton + +impl vortex_onpair_rs::automaton::PrefixAutomaton + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::new(&[u8], &vortex_onpair_rs::dict::Dictionary) -> Self + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::query_length(&self) -> usize + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::PrefixAutomaton + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +pub struct vortex_onpair_rs::Store + +pub vortex_onpair_rs::Store::bit_width: vortex_onpair_rs::types::BitWidth + +pub vortex_onpair_rs::Store::boundaries: alloc::vec::Vec + +pub vortex_onpair_rs::Store::packed: alloc::vec::Vec + +impl vortex_onpair_rs::store::Store + +pub fn vortex_onpair_rs::store::Store::bytes_used(&self) -> usize + +pub fn vortex_onpair_rs::store::Store::num_strings(&self) -> usize + +pub fn vortex_onpair_rs::store::Store::num_tokens(&self) -> usize + +pub fn vortex_onpair_rs::store::Store::string_span(&self, usize) -> vortex_onpair_rs::types::StreamSpan + +impl core::clone::Clone for vortex_onpair_rs::store::Store + +pub fn vortex_onpair_rs::store::Store::clone(&self) -> vortex_onpair_rs::store::Store + +impl core::default::Default for vortex_onpair_rs::store::Store + +pub fn vortex_onpair_rs::store::Store::default() -> vortex_onpair_rs::store::Store + +impl core::fmt::Debug for 
vortex_onpair_rs::store::Store + +pub fn vortex_onpair_rs::store::Store::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_onpair_rs::StreamSpan + +pub vortex_onpair_rs::StreamSpan::begin: u32 + +pub vortex_onpair_rs::StreamSpan::end: u32 + +impl vortex_onpair_rs::types::StreamSpan + +pub const fn vortex_onpair_rs::types::StreamSpan::size(self) -> u32 + +impl core::clone::Clone for vortex_onpair_rs::types::StreamSpan + +pub fn vortex_onpair_rs::types::StreamSpan::clone(&self) -> vortex_onpair_rs::types::StreamSpan + +impl core::cmp::Eq for vortex_onpair_rs::types::StreamSpan + +impl core::cmp::PartialEq for vortex_onpair_rs::types::StreamSpan + +pub fn vortex_onpair_rs::types::StreamSpan::eq(&self, &vortex_onpair_rs::types::StreamSpan) -> bool + +impl core::fmt::Debug for vortex_onpair_rs::types::StreamSpan + +pub fn vortex_onpair_rs::types::StreamSpan::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::types::StreamSpan + +impl core::marker::StructuralPartialEq for vortex_onpair_rs::types::StreamSpan + +pub struct vortex_onpair_rs::TokenRange + +pub vortex_onpair_rs::TokenRange::begin: vortex_onpair_rs::types::Token + +pub vortex_onpair_rs::TokenRange::last: vortex_onpair_rs::types::Token + +impl vortex_onpair_rs::types::TokenRange + +pub const fn vortex_onpair_rs::types::TokenRange::contains(self, vortex_onpair_rs::types::Token) -> bool + +pub const fn vortex_onpair_rs::types::TokenRange::empty(self) -> bool + +pub const fn vortex_onpair_rs::types::TokenRange::size(self) -> u32 + +impl core::clone::Clone for vortex_onpair_rs::types::TokenRange + +pub fn vortex_onpair_rs::types::TokenRange::clone(&self) -> vortex_onpair_rs::types::TokenRange + +impl core::cmp::Eq for vortex_onpair_rs::types::TokenRange + +impl core::cmp::PartialEq for vortex_onpair_rs::types::TokenRange + +pub fn vortex_onpair_rs::types::TokenRange::eq(&self, &vortex_onpair_rs::types::TokenRange) -> bool 
+ +impl core::default::Default for vortex_onpair_rs::types::TokenRange + +pub fn vortex_onpair_rs::types::TokenRange::default() -> Self + +impl core::fmt::Debug for vortex_onpair_rs::types::TokenRange + +pub fn vortex_onpair_rs::types::TokenRange::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_rs::types::TokenRange + +impl core::marker::StructuralPartialEq for vortex_onpair_rs::types::TokenRange + +pub struct vortex_onpair_rs::TrainResult + +pub vortex_onpair_rs::TrainResult::dict: vortex_onpair_rs::dict::Dictionary + +pub vortex_onpair_rs::TrainResult::lpm: vortex_onpair_rs::lpm::LongestPrefixMatcher + +impl core::clone::Clone for vortex_onpair_rs::trainer::TrainResult + +pub fn vortex_onpair_rs::trainer::TrainResult::clone(&self) -> vortex_onpair_rs::trainer::TrainResult + +impl core::fmt::Debug for vortex_onpair_rs::trainer::TrainResult + +pub fn vortex_onpair_rs::trainer::TrainResult::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_onpair_rs::TrainingConfig + +pub vortex_onpair_rs::TrainingConfig::bits: vortex_onpair_rs::types::BitWidth + +pub vortex_onpair_rs::TrainingConfig::seed: core::option::Option + +pub vortex_onpair_rs::TrainingConfig::threshold: vortex_onpair_rs::config::ThresholdSpec + +impl core::clone::Clone for vortex_onpair_rs::config::TrainingConfig + +pub fn vortex_onpair_rs::config::TrainingConfig::clone(&self) -> vortex_onpair_rs::config::TrainingConfig + +impl core::convert::From for vortex_onpair_rs::config::TrainingConfig + +pub fn vortex_onpair_rs::config::TrainingConfig::from(vortex_onpair_rs::config::OnPairTrainingConfig) -> Self + +impl core::default::Default for vortex_onpair_rs::config::TrainingConfig + +pub fn vortex_onpair_rs::config::TrainingConfig::default() -> Self + +impl core::fmt::Debug for vortex_onpair_rs::config::TrainingConfig + +pub fn vortex_onpair_rs::config::TrainingConfig::fmt(&self, &mut core::fmt::Formatter<'_>) -> 
core::fmt::Result + +pub const vortex_onpair_rs::DEFAULT_DICT12_CONFIG: vortex_onpair_rs::config::OnPairTrainingConfig + +pub const vortex_onpair_rs::MAX_TOKEN_SIZE: usize + +pub trait vortex_onpair_rs::TokenAutomaton + +pub fn vortex_onpair_rs::TokenAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::TokenAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::TokenAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::TokenAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::aho_corasick::AhoCorasickAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::EqAutomaton + +pub fn vortex_onpair_rs::automaton::EqAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::EqAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::EqAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::EqAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::PrefixAutomaton + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::PrefixAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::kmp::KmpAutomaton + +pub fn 
vortex_onpair_rs::kmp::KmpAutomaton::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::reset(&mut self) + +pub fn vortex_onpair_rs::kmp::KmpAutomaton::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for &mut A + +pub fn &mut A::is_accepted(&self) -> bool + +pub fn &mut A::is_dead(&self) -> bool + +pub fn &mut A::reset(&mut self) + +pub fn &mut A::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::And + +pub fn vortex_onpair_rs::automaton::And::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::And::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::And::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::And::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::Or + +pub fn vortex_onpair_rs::automaton::Or::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Or::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Or::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::Or::step(&mut self, vortex_onpair_rs::types::Token) + +impl vortex_onpair_rs::automaton::TokenAutomaton for vortex_onpair_rs::automaton::Negated + +pub fn vortex_onpair_rs::automaton::Negated::is_accepted(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Negated::is_dead(&self) -> bool + +pub fn vortex_onpair_rs::automaton::Negated::reset(&mut self) + +pub fn vortex_onpair_rs::automaton::Negated::step(&mut self, vortex_onpair_rs::types::Token) + +pub fn vortex_onpair_rs::and(A, B) -> vortex_onpair_rs::automaton::And + +pub const fn vortex_onpair_rs::is_valid_bits(vortex_onpair_rs::types::BitWidth) -> bool + +pub const fn vortex_onpair_rs::max_dict_size(vortex_onpair_rs::types::BitWidth) -> usize + +pub fn vortex_onpair_rs::not(A) -> 
vortex_onpair_rs::automaton::Negated + +pub fn vortex_onpair_rs::or(A, B) -> vortex_onpair_rs::automaton::Or + +pub fn vortex_onpair_rs::parse(&[u8], &[u32], usize, &vortex_onpair_rs::lpm::LongestPrefixMatcher, vortex_onpair_rs::types::BitWidth, &mut vortex_onpair_rs::store::Store) + +pub fn vortex_onpair_rs::read_bits_lsb(&[u64], usize, u32) -> u16 + +pub fn vortex_onpair_rs::tokenize(&[u8], &vortex_onpair_rs::dict::Dictionary) -> alloc::vec::Vec + +pub fn vortex_onpair_rs::tokenize_with(&[u8], &vortex_onpair_rs::lpm::LongestPrefixMatcher) -> alloc::vec::Vec + +pub fn vortex_onpair_rs::train(&[u8], &[u32], usize, &vortex_onpair_rs::config::TrainingConfig) -> vortex_onpair_rs::trainer::TrainResult + +pub fn vortex_onpair_rs::unpack_codes_to_u16(&[u64], usize, u32) -> alloc::vec::Vec + +pub type vortex_onpair_rs::BitWidth = u8 + +pub type vortex_onpair_rs::Token = u16 diff --git a/encodings/experimental/onpair-rs/src/aho_corasick.rs b/encodings/experimental/onpair-rs/src/aho_corasick.rs new file mode 100644 index 00000000000..d6b2c713d91 --- /dev/null +++ b/encodings/experimental/onpair-rs/src/aho_corasick.rs @@ -0,0 +1,647 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Port of `include/onpair/search/aho_corasick_trie.h` and +// `include/onpair/search/automata/aho_corasick_automaton.h`. +// +// Token-level multi-pattern substring match (SQL `col LIKE '%a%' OR '%b%' +// OR ...`). The byte-level Aho-Corasick trie is built first; an eager +// dual-traversal then projects it onto the dictionary's token alphabet, +// giving the same base/sparse decomposition as `KmpAutomaton`. +// +// `step` is a single sparse-range scan + table lookup. `is_dead` becomes +// true the moment any pattern matches. 
+ +use crate::automaton::TokenAutomaton; +use crate::dict::Dictionary; +use crate::types::Token; +use crate::types::TokenRange; + +// ───────────────────────────────────────────────────────────────────────────── +// AhoCorasickTrie — byte-level +// ───────────────────────────────────────────────────────────────────────────── + +pub type AcState = u16; + +const NULL_STATE: AcState = u16::MAX; +const ROOT_STATE: AcState = 0; + +pub struct AhoCorasickTrie { + /// Edge labels concatenated by parent state; sorted within each state's + /// child group (SoA Arrow layout). + edge_labels: Vec, + edge_targets: Vec, + child_offsets: Vec, + /// Failure link per state. + fail: Vec, + /// Marks state as "matches at least one pattern". + accepting: Vec, + num_states: usize, + num_patterns: usize, +} + +impl AhoCorasickTrie { + pub fn new(patterns: &[&[u8]]) -> Self { + let num_patterns = patterns.len(); + + // ── 1. Build trie as first-child / next-sibling temporary nodes ──── + struct Node { + c: u8, + first_child: AcState, + next_sibling: AcState, + } + let mut nodes: Vec = vec![Node { + c: 0, + first_child: NULL_STATE, + next_sibling: NULL_STATE, + }]; + let mut accepting = vec![false]; + + for pat in patterns { + if pat.is_empty() { + accepting[ROOT_STATE as usize] = true; + continue; + } + let mut cur = ROOT_STATE; + for &b in *pat { + // Walk the (sorted) sibling list. 
+ let mut child = nodes[cur as usize].first_child; + let mut prev = NULL_STATE; + while child != NULL_STATE && nodes[child as usize].c < b { + prev = child; + child = nodes[child as usize].next_sibling; + } + if child == NULL_STATE || nodes[child as usize].c != b { + let new_node = nodes.len() as AcState; + nodes.push(Node { + c: b, + first_child: NULL_STATE, + next_sibling: child, + }); + accepting.push(false); + if prev == NULL_STATE { + nodes[cur as usize].first_child = new_node; + } else { + nodes[prev as usize].next_sibling = new_node; + } + child = new_node; + } + cur = child; + } + accepting[cur as usize] = true; + } + + let num_states = nodes.len(); + assert!( + num_states < NULL_STATE as usize, + "AhoCorasickTrie: too many states" + ); + + // ── 2. Compact into SoA ─────────────────────────────────────────── + let mut edge_labels: Vec = Vec::with_capacity(num_states); + let mut edge_targets: Vec = Vec::with_capacity(num_states); + let mut child_offsets: Vec = Vec::with_capacity(num_states + 1); + + for i in 0..num_states { + child_offsets.push(edge_labels.len() as u16); + let mut child = nodes[i].first_child; + while child != NULL_STATE { + edge_labels.push(nodes[child as usize].c); + edge_targets.push(child); + child = nodes[child as usize].next_sibling; + } + } + child_offsets.push(edge_labels.len() as u16); + + // ── 3. 
Failure links via BFS ────────────────────────────────────── + let mut fail = vec![ROOT_STATE; num_states]; + let mut bfs: Vec = Vec::with_capacity(num_states); + + let root_start = child_offsets[ROOT_STATE as usize] as usize; + let root_end = child_offsets[ROOT_STATE as usize + 1] as usize; + for i in root_start..root_end { + fail[edge_targets[i] as usize] = ROOT_STATE; + bfs.push(edge_targets[i]); + } + + let mut trie = Self { + edge_labels, + edge_targets, + child_offsets, + fail, + accepting, + num_states, + num_patterns, + }; + + let mut qi = 0; + while qi < bfs.len() { + let u = bfs[qi]; + qi += 1; + // Propagate accepting through fail chain. + if trie.accepting[trie.fail[u as usize] as usize] { + trie.accepting[u as usize] = true; + } + let lo = trie.child_offsets[u as usize] as usize; + let hi = trie.child_offsets[u as usize + 1] as usize; + for i in lo..hi { + let target = trie.edge_targets[i]; + let label = trie.edge_labels[i]; + trie.fail[target as usize] = trie.advance(trie.fail[u as usize], label); + bfs.push(target); + } + } + + trie + } + + /// Advance state `u` by byte `c`, resolving failure links as needed. + pub fn advance(&self, mut u: AcState, c: u8) -> AcState { + loop { + let lo = self.child_offsets[u as usize] as usize; + let hi = self.child_offsets[u as usize + 1] as usize; + // Edges are sorted by label. 
+ for i in lo..hi { + let label = self.edge_labels[i]; + if label == c { + return self.edge_targets[i]; + } + if label > c { + break; + } + } + if u == ROOT_STATE { + return ROOT_STATE; + } + u = self.fail[u as usize]; + } + } + + pub fn is_accepting(&self, s: AcState) -> bool { + self.accepting[s as usize] + } + + pub fn num_states(&self) -> usize { + self.num_states + } + + pub fn num_patterns(&self) -> usize { + self.num_patterns + } + + pub fn fail_link(&self, s: AcState) -> AcState { + self.fail[s as usize] + } + + pub fn edge_labels(&self, s: AcState) -> &[u8] { + let lo = self.child_offsets[s as usize] as usize; + let hi = self.child_offsets[s as usize + 1] as usize; + &self.edge_labels[lo..hi] + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// AhoCorasickAutomaton — token-level +// ───────────────────────────────────────────────────────────────────────────── + +const HIT_STATE: AcState = NULL_STATE; + +pub struct AhoCorasickAutomaton { + base: Vec, + sparse_offsets: Vec, + sparse_ranges: Vec, + sparse_targets: Vec, + state: AcState, + hit: bool, + all_match: bool, +} + +impl AhoCorasickAutomaton { + /// Convenience: build the trie and project it in one go. + pub fn new(patterns: &[&[u8]], dict: &Dictionary) -> Self { + let trie = AhoCorasickTrie::new(patterns); + Self::from_trie(&trie, dict) + } + + /// Project an existing trie onto the token alphabet of `dict`. + pub fn from_trie(trie: &AhoCorasickTrie, dict: &Dictionary) -> Self { + let all_match = trie.is_accepting(ROOT_STATE); + let mut me = Self { + base: Vec::new(), + sparse_offsets: Vec::new(), + sparse_ranges: Vec::new(), + sparse_targets: Vec::new(), + state: ROOT_STATE, + hit: all_match, + all_match, + }; + if all_match { + return me; + } + + let num_states = trie.num_states(); + let num_tokens = dict.num_tokens(); + + // Evolve one state through one byte; collapse to HIT if accepting. 
+ let evolve = |state: AcState, c: u8| -> AcState { + if state == HIT_STATE { + return HIT_STATE; + } + let next = trie.advance(state, c); + if trie.is_accepting(next) { + HIT_STATE + } else { + next + } + }; + + // ── 1. Base pass ────────────────────────────────────────────────── + me.base = Vec::with_capacity(num_tokens); + for t in 0..num_tokens { + let bytes = dict.data(t as Token); + let mut s = ROOT_STATE; + for &c in bytes { + s = trie.advance(s, c); + if trie.is_accepting(s) { + s = HIT_STATE; + break; + } + } + me.base.push(s); + } + + // ── 2. Sparse pass ──────────────────────────────────────────────── + me.sparse_offsets = vec![0u32; num_states + 1]; + + let mut builder = AcSparseBuilder { + dict, + base: &me.base, + ranges: &mut me.sparse_ranges, + targets: &mut me.sparse_targets, + range_start: 0, + }; + + let mut relevant_chars: Vec = Vec::new(); + + for j in 1..(num_states as AcState) { + builder.range_start = builder.ranges.len(); + me.sparse_offsets[j as usize] = builder.range_start as u32; + + // Collect labels along the failure chain. + relevant_chars.clear(); + let mut u = j; + while u != ROOT_STATE { + for &c in trie.edge_labels(u) { + relevant_chars.push(c); + } + u = trie.fail_link(u); + } + relevant_chars.sort_unstable(); + relevant_chars.dedup(); + + for &byte in &relevant_chars { + // Identical transition from j and root → no exception needed. 
/// Scratch state used while building the sparse (per-state exception) table
/// of `AhoCorasickAutomaton`.
///
/// `ranges` and `targets` are parallel output vectors: entry `k` says "for
/// tokens in `ranges[k]` the next state is `targets[k]`". Anything not
/// covered by an exception falls back to `base` at lookup time.
struct AcSparseBuilder<'a> {
    /// Dictionary whose tokens are being projected onto the trie.
    dict: &'a Dictionary,
    /// Per-token exit state when the token is consumed from the root.
    base: &'a [AcState],
    /// Output: token ranges of the exceptions, grouped by source state.
    ranges: &'a mut Vec<TokenRange>,
    /// Output: target state of each exception, parallel to `ranges`.
    targets: &'a mut Vec<AcState>,
    /// Index in `ranges` where the exceptions of the state currently being
    /// processed begin; entries before it belong to earlier states.
    range_start: usize,
}

impl AcSparseBuilder<'_> {
    /// Appends the exception `range -> target`.
    ///
    /// If the previous entry belongs to the current state, has the same
    /// target and ends immediately before `range.begin`, it is extended
    /// instead of pushing a new entry.
    fn emit(&mut self, range: TokenRange, target: AcState) {
        // The adjacency test is done in u32 so that `last + 1` cannot
        // overflow the token type.
        if self.ranges.len() > self.range_start
            && *self.targets.last().unwrap() == target
            && (self.ranges.last().unwrap().last as u32) + 1 == range.begin as u32
        {
            self.ranges.last_mut().unwrap().last = range.last;
            return;
        }
        self.ranges.push(range);
        self.targets.push(target);
    }

    /// Recursively walks the tokens in `tr` byte by byte and emits an
    /// exception wherever consuming a token from the current state gives a
    /// different result than consuming it from the root.
    ///
    /// * `tr` — token range under consideration.
    /// * `depth` — number of leading bytes already fed to both states;
    ///   NOTE(review): callers appear to guarantee that all tokens in `tr`
    ///   share those bytes — confirm against `Dictionary::prefix_range`.
    /// * `state_j` — state reached from the current state after those bytes.
    /// * `state_0` — state reached from the root after the same bytes.
    /// * `evolve` — advances a state by one byte.
    fn traverse<F>(
        &mut self,
        tr: TokenRange,
        depth: usize,
        state_j: AcState,
        state_0: AcState,
        evolve: &F,
    ) where
        F: Fn(AcState, u8) -> AcState,
    {
        // Same state on both paths: every longer token behaves identically
        // too, so the base table already has the right answer.
        if state_j == state_0 || tr.empty() {
            return;
        }
        if state_j == HIT_STATE {
            // Every token in `tr` hits from the current state. Only tokens
            // whose base entry is not already a hit need an exception; emit
            // one range per maximal run of such tokens.
            let mut i = tr.begin;
            while i <= tr.last {
                if self.base[i as usize] != HIT_STATE {
                    let start = i;
                    while i <= tr.last && self.base[i as usize] != HIT_STATE {
                        // Finish here rather than stepping `i` past
                        // `tr.last`.
                        if i == tr.last {
                            self.emit(
                                TokenRange {
                                    begin: start,
                                    last: i,
                                },
                                HIT_STATE,
                            );
                            return;
                        }
                        i += 1;
                    }
                    // The run ended at a token that is already a hit.
                    self.emit(
                        TokenRange {
                            begin: start,
                            last: i - 1,
                        },
                        HIT_STATE,
                    );
                } else {
                    if i == tr.last {
                        break;
                    }
                    i += 1;
                }
            }
            return;
        }

        // Leaf tokens of length == depth share exit state state_j.
        // NOTE(review): relies on such tokens forming the front of `tr` —
        // presumably the dictionary is sorted; confirm.
        let mut cur = tr.begin;
        while cur <= tr.last && self.dict.token_size(cur) == depth {
            if cur == tr.last {
                // The whole range consists of leaf tokens.
                self.emit(
                    TokenRange {
                        begin: tr.begin,
                        last: cur,
                    },
                    state_j,
                );
                return;
            }
            cur += 1;
        }
        if cur > tr.begin {
            self.emit(
                TokenRange {
                    begin: tr.begin,
                    last: cur - 1,
                },
                state_j,
            );
        }
        if cur > tr.last {
            return;
        }

        // Remaining tokens are longer than `depth`: split them into groups
        // that share the byte at index `depth` and recurse into each group
        // with both states advanced by that byte.
        while cur <= tr.last {
            let c = self.dict.data(cur)[depth];
            let mut sub_hi = cur;
            while sub_hi < tr.last && self.dict.data(sub_hi + 1)[depth] == c {
                sub_hi += 1;
            }
            self.traverse(
                TokenRange {
                    begin: cur,
                    last: sub_hi,
                },
                depth + 1,
                evolve(state_j, c),
                evolve(state_0, c),
                evolve,
            );
            // Stop before `sub_hi + 1` could step past `tr.last`.
            if sub_hi == tr.last {
                break;
            }
            cur = sub_hi + 1;
        }
    }
}
──────────────────────────────────────────────────────── + + #[test] + fn trie_basic_advance() { + let patterns: Vec<&[u8]> = vec![b"he", b"she", b"his", b"hers"]; + let t = AhoCorasickTrie::new(&patterns); + // root --h--> ? --e--> accepting "he" + let s_h = t.advance(ROOT_STATE, b'h'); + assert!(!t.is_accepting(s_h)); + let s_he = t.advance(s_h, b'e'); + assert!(t.is_accepting(s_he)); + // Walking "ushers" should reach an accepting state via failure links. + let mut s = ROOT_STATE; + for &c in b"ushers" { + s = t.advance(s, c); + } + assert!(t.is_accepting(s)); + } + + #[test] + fn trie_empty_pattern_marks_root_accepting() { + let patterns: Vec<&[u8]> = vec![b""]; + let t = AhoCorasickTrie::new(&patterns); + assert!(t.is_accepting(ROOT_STATE)); + } + + // ── Basic multi-pattern search ──────────────────────────────────────── + + #[test] + fn basic_multi_pattern() { + let data = [ + "error: disk full", + "warning: low memory", + "info: all ok", + "fatal: kernel panic", + "debug: trace", + ]; + let col = make_column(&data); + let result = contains_any(&col, &[b"error", b"fatal"]); + assert_eq!(result, vec![0, 3]); + } + + #[test] + fn single_pattern() { + let data = ["abc", "def", "abc_xyz"]; + let col = make_column(&data); + assert_eq!(contains_any(&col, &[b"abc"]), vec![0, 2]); + } + + #[test] + fn no_matches() { + let data = ["abc", "def", "ghi"]; + let col = make_column(&data); + assert!(contains_any(&col, &[b"xyz", b"uvw"]).is_empty()); + } + + #[test] + fn all_strings_match() { + let data = ["abc_def", "def_ghi", "ghi_abc"]; + let col = make_column(&data); + let r = contains_any(&col, &[b"abc", b"def", b"ghi"]); + assert_eq!(r.len(), 3); + } + + // ── Empty patterns ──────────────────────────────────────────────────── + + #[test] + fn empty_pattern_matches_all() { + let data = ["abc", "def"]; + let col = make_column(&data); + assert_eq!(contains_any(&col, &[b""]).len(), 2); + } + + #[test] + fn empty_pattern_set_matches_none() { + let data = ["abc", "def"]; + 
let col = make_column(&data); + assert!(contains_any(&col, &[]).is_empty()); + } + + // ── Overlapping / prefix patterns ───────────────────────────────────── + + #[test] + fn overlapping_patterns() { + let data = ["abcdef", "bcde", "xyz"]; + let col = make_column(&data); + assert_eq!(contains_any(&col, &[b"abc", b"bcd"]), vec![0, 1]); + } + + #[test] + fn prefix_patterns() { + let data = ["abc", "ab", "xyz"]; + let col = make_column(&data); + assert_eq!(contains_any(&col, &[b"ab", b"abc"]), vec![0, 1]); + } + + // ── All bit widths ──────────────────────────────────────────────────── + + #[test] + fn works_across_bit_widths() { + let data = ["error log", "warning log", "info log"]; + for bw in 9u32..=16 { + let col = make_column_bits(&data, bw); + let r = contains_any(&col, &[b"error", b"warning"]); + assert_eq!(r.len(), 2, "bw={bw}"); + } + } + + // ── Consistency with single-needle KMP ──────────────────────────────── + + #[test] + fn single_pattern_matches_kmp() { + use crate::kmp::KmpAutomaton; + let data = user_strings(50); + let col = make_column(&data); + let dict = col.dictionary().clone(); + let kmp = KmpAutomaton::new(b"https", &dict); + let kmp_result = col.scan(kmp); + let ac = AhoCorasickAutomaton::new(&[b"https"], &dict); + let ac_result = col.scan(ac); + assert_eq!(kmp_result, ac_result); + } + + // ── Empty column ────────────────────────────────────────────────────── + + #[test] + fn empty_column_returns_empty() { + let strings: Vec<&[u8]> = vec![]; + let raw = make_raw(&strings); + let col = Column::compress(&raw.data, &raw.offsets_u64, DEFAULT_DICT12_CONFIG).unwrap(); + let r = contains_any(&col, &[b"abc", b"def"]); + assert!(r.is_empty()); + } +} diff --git a/encodings/experimental/onpair-rs/src/automaton.rs b/encodings/experimental/onpair-rs/src/automaton.rs new file mode 100644 index 00000000000..0512d942c0f --- /dev/null +++ b/encodings/experimental/onpair-rs/src/automaton.rs @@ -0,0 +1,768 @@ +// SPDX-License-Identifier: Apache-2.0 +// 
SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Port of `include/onpair/search/automata/token_automaton.h`. +// +// Token-level automata that consume the bit-packed token stream of an +// `vortex_onpair_rs::Column` directly. The scan loop in [`Column::scan`] resets +// the automaton at each row, feeds every token, and inspects +// `is_accepted()` once after the last token (or after `is_dead()` becomes +// true, whichever is first). +// +// Build composite predicates via [`and`], [`or`], [`not`]; the wrappers +// also implement [`TokenAutomaton`], so they nest. Every concrete +// automaton must implement `step` / `is_accepted` / `reset`; the default +// `is_dead` returns `false`, which is correct for any automaton that +// never finalises before the end of the row. + +use crate::types::Token; + +/// Token-by-token streaming predicate. Reset once per row, stepped on every +/// token, read for the final verdict. +pub trait TokenAutomaton { + fn step(&mut self, t: Token); + fn is_accepted(&self) -> bool; + fn reset(&mut self); + /// `true` once the verdict cannot change regardless of remaining + /// tokens. The scan loop uses this to skip the rest of a row. + fn is_dead(&self) -> bool { + false + } +} + +impl TokenAutomaton for &mut A { + #[inline] + fn step(&mut self, t: Token) { + (**self).step(t); + } + #[inline] + fn is_accepted(&self) -> bool { + (**self).is_accepted() + } + #[inline] + fn reset(&mut self) { + (**self).reset(); + } + #[inline] + fn is_dead(&self) -> bool { + (**self).is_dead() + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Combinators — Negated / And / Or. +// ───────────────────────────────────────────────────────────────────────────── + +/// `!A` — flips `is_accepted`. `is_dead` is forwarded unchanged. 
+pub struct Negated(pub A); + +impl TokenAutomaton for Negated { + #[inline] + fn step(&mut self, t: Token) { + self.0.step(t); + } + #[inline] + fn is_accepted(&self) -> bool { + !self.0.is_accepted() + } + #[inline] + fn reset(&mut self) { + self.0.reset(); + } + #[inline] + fn is_dead(&self) -> bool { + self.0.is_dead() + } +} + +/// `A AND B` — both must accept. Both step on every token. Early-exits +/// when either inner becomes dead in a state that proves rejection. +pub struct And(pub A, pub B); + +impl TokenAutomaton for And { + #[inline] + fn step(&mut self, t: Token) { + self.0.step(t); + self.1.step(t); + } + #[inline] + fn is_accepted(&self) -> bool { + self.0.is_accepted() && self.1.is_accepted() + } + #[inline] + fn reset(&mut self) { + self.0.reset(); + self.1.reset(); + } + #[inline] + fn is_dead(&self) -> bool { + (self.0.is_dead() && !self.0.is_accepted()) || (self.1.is_dead() && !self.1.is_accepted()) + } +} + +/// `A OR B` — either may accept. Both step on every token. Early-exits +/// when either inner becomes dead in a state that proves acceptance. +pub struct Or(pub A, pub B); + +impl TokenAutomaton for Or { + #[inline] + fn step(&mut self, t: Token) { + self.0.step(t); + self.1.step(t); + } + #[inline] + fn is_accepted(&self) -> bool { + self.0.is_accepted() || self.1.is_accepted() + } + #[inline] + fn reset(&mut self) { + self.0.reset(); + self.1.reset(); + } + #[inline] + fn is_dead(&self) -> bool { + (self.0.is_dead() && self.0.is_accepted()) || (self.1.is_dead() && self.1.is_accepted()) + } +} + +/// `not(a)` constructs a [`Negated`] wrapper. +pub fn not(a: A) -> Negated { + Negated(a) +} + +/// `and(a, b)` constructs an [`And`] wrapper. +pub fn and(a: A, b: B) -> And { + And(a, b) +} + +/// `or(a, b)` constructs an [`Or`] wrapper. +pub fn or(a: A, b: B) -> Or { + Or(a, b) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Tiny test automaton that accepts a fixed token-id once seen. 
+ struct AcceptsToken { + target: Token, + seen: bool, + } + impl AcceptsToken { + fn new(t: Token) -> Self { + Self { + target: t, + seen: false, + } + } + } + impl TokenAutomaton for AcceptsToken { + fn step(&mut self, t: Token) { + if t == self.target { + self.seen = true; + } + } + fn is_accepted(&self) -> bool { + self.seen + } + fn reset(&mut self) { + self.seen = false; + } + fn is_dead(&self) -> bool { + self.seen + } + } + + fn drive(mut a: A, tokens: &[Token]) -> bool { + a.reset(); + for &t in tokens { + a.step(t); + if a.is_dead() { + break; + } + } + a.is_accepted() + } + + #[test] + fn accepts_token_basic() { + assert!(drive(AcceptsToken::new(7), &[1, 2, 3, 7, 9])); + assert!(!drive(AcceptsToken::new(7), &[1, 2, 3, 8])); + } + + #[test] + fn negation_inverts() { + assert!(!drive(not(AcceptsToken::new(7)), &[1, 7, 2])); + assert!(drive(not(AcceptsToken::new(7)), &[1, 8, 2])); + } + + #[test] + fn and_requires_both() { + let a = AcceptsToken::new(1); + let b = AcceptsToken::new(2); + assert!(drive(and(a, b), &[1, 2, 3])); + let a = AcceptsToken::new(1); + let b = AcceptsToken::new(2); + assert!(!drive(and(a, b), &[1, 3])); + } + + #[test] + fn or_requires_either() { + let a = AcceptsToken::new(1); + let b = AcceptsToken::new(2); + assert!(drive(or(a, b), &[1, 9])); + let a = AcceptsToken::new(1); + let b = AcceptsToken::new(2); + assert!(drive(or(a, b), &[9, 2])); + let a = AcceptsToken::new(1); + let b = AcceptsToken::new(2); + assert!(!drive(or(a, b), &[3, 4, 5])); + } + + #[test] + fn nested_and_not() { + // A AND NOT B + let a = AcceptsToken::new(1); + let b = AcceptsToken::new(2); + assert!(drive(and(a, not(b)), &[1, 3])); + let a = AcceptsToken::new(1); + let b = AcceptsToken::new(2); + assert!(!drive(and(a, not(b)), &[1, 2, 3])); + } + + #[test] + fn references_implement_trait() { + let mut a = AcceptsToken::new(7); + let result = drive(&mut a, &[1, 7]); + assert!(result); + // Inner state remains accepting after. 
+ assert!(a.is_accepted()); + } + + #[test] + fn or_dead_when_accepted() { + // Once an Or component accepts and is dead, the combinator is dead. + let a = AcceptsToken::new(1); + let b = AcceptsToken::new(2); + let mut comb = or(a, b); + comb.reset(); + comb.step(1); + assert!(comb.is_dead()); + assert!(comb.is_accepted()); + } +} +// Port of `include/onpair/search/automata/eq_automaton.h`. +// +// Token-level automaton for SQL `col = value`. Tokenises the query once +// against the column's dictionary, then a step is a single bounds check + +// `u16` compare. `is_dead()` becomes true the moment a token diverges from +// the query — the scan loop skips the rest of the row. + +use crate::dict::Dictionary; +use crate::tokenize::tokenize; + +pub struct EqAutomaton { + query: Vec, + pos: usize, + failed: bool, +} + +impl EqAutomaton { + /// Build the automaton against `dict`. Empty `value` matches only rows + /// with zero tokens (empty strings). + pub fn new(value: &[u8], dict: &Dictionary) -> Self { + Self { + query: tokenize(value, dict), + pos: 0, + failed: false, + } + } + + /// Number of tokens the query produced. 
+ pub fn query_length(&self) -> usize { + self.query.len() + } +} + +impl TokenAutomaton for EqAutomaton { + #[inline] + fn step(&mut self, t: Token) { + // failed |= (pos >= len) || (t != query[pos]) + self.failed |= self.pos >= self.query.len() || t != self.query[self.pos]; + self.pos += 1; + } + + #[inline] + fn is_accepted(&self) -> bool { + !self.failed && self.pos == self.query.len() + } + + #[inline] + fn reset(&mut self) { + self.pos = 0; + self.failed = false; + } + + #[inline] + fn is_dead(&self) -> bool { + self.failed + } +} + +#[cfg(test)] +mod tests2 { + use super::*; + use crate::column::Column; + use crate::config::DEFAULT_DICT12_CONFIG; + use crate::config::OnPairTrainingConfig; + use crate::test_corpus::make_raw; + use crate::test_corpus::random_ascii_strings; + use crate::test_corpus::user_strings; + + fn make_column>(strings: &[S]) -> Column { + make_column_bits(strings, 14) + } + + fn make_column_bits>(strings: &[S], bits: u32) -> Column { + let raw = make_raw(strings); + let cfg = OnPairTrainingConfig { + bits, + threshold: 0.5, + seed: 42, + }; + Column::compress(&raw.data, &raw.offsets_u64, cfg).unwrap() + } + + fn brute_eq>(strings: &[S], needle: &[u8]) -> Vec { + strings + .iter() + .enumerate() + .filter(|(_, s)| s.as_ref() == needle) + .map(|(i, _)| i) + .collect() + } + + // ── Basic correctness ───────────────────────────────────────────────── + + #[test] + fn single_match() { + let data = ["abc", "def", "ghi"]; + let col = make_column(&data); + let eq = EqAutomaton::new(b"def", &col.dictionary().clone()); + assert_eq!(col.scan(eq), vec![1]); + } + + #[test] + fn no_match() { + let data = ["abc", "def", "ghi"]; + let col = make_column(&data); + let eq = EqAutomaton::new(b"xyz", &col.dictionary().clone()); + assert!(col.scan(eq).is_empty()); + } + + #[test] + fn multiple_identical_strings() { + let data = ["abc", "abc", "def", "abc"]; + let col = make_column(&data); + let eq = EqAutomaton::new(b"abc", &col.dictionary().clone()); + 
assert_eq!(col.scan(eq), vec![0, 1, 3]); + } + + #[test] + fn empty_value_matches_only_empty_strings() { + let data: Vec<&[u8]> = vec![b"", b"abc", b"", b"def", b""]; + let col = make_column(&data); + let eq = EqAutomaton::new(b"", &col.dictionary().clone()); + assert_eq!(col.scan(eq), vec![0, 2, 4]); + } + + #[test] + fn empty_value_no_empty_strings() { + let data = ["abc", "def"]; + let col = make_column(&data); + let eq = EqAutomaton::new(b"", &col.dictionary().clone()); + assert!(col.scan(eq).is_empty()); + } + + #[test] + fn prefix_of_value_does_not_match() { + let data = ["abc", "abcd", "abcde"]; + let col = make_column(&data); + let eq = EqAutomaton::new(b"abc", &col.dictionary().clone()); + assert_eq!(col.scan(eq), vec![0]); + } + + #[test] + fn suffix_of_value_does_not_match() { + let data = ["bc", "abc", "xabc"]; + let col = make_column(&data); + let eq = EqAutomaton::new(b"abc", &col.dictionary().clone()); + assert_eq!(col.scan(eq), vec![1]); + } + + #[test] + fn value_longer_than_all_strings() { + let data = ["a", "b", "c"]; + let col = make_column(&data); + let eq = EqAutomaton::new(b"abcdefgh", &col.dictionary().clone()); + assert!(col.scan(eq).is_empty()); + } + + // ── Rescannable ─────────────────────────────────────────────────────── + + #[test] + fn rescannable_same_column() { + let data = ["abc", "def", "abc"]; + let col = make_column(&data); + let mut eq = EqAutomaton::new(b"abc", &col.dictionary().clone()); + let r1 = col.scan(&mut eq); + let r2 = col.scan(&mut eq); + assert_eq!(r1, r2); + } + + // ── All bit widths ──────────────────────────────────────────────────── + + #[test] + fn works_across_bit_widths() { + let data = ["abc", "def", "abc", "ghi"]; + for bw in 9u32..=16 { + let col = make_column_bits(&data, bw); + let eq = EqAutomaton::new(b"abc", &col.dictionary().clone()); + assert_eq!(col.scan(eq), vec![0, 2], "bw={bw}"); + } + } + + // ── Large corpus cross-validation ───────────────────────────────────── + + #[test] + fn 
large_corpus_cross_validation() { + let data = random_ascii_strings(200, 30, 123); + let col = make_column(&data); + let dict = col.dictionary().clone(); + for qi in (0..data.len()).step_by(40) { + let q = &data[qi]; + let eq = EqAutomaton::new(q, &dict); + assert_eq!(col.scan(eq), brute_eq(&data, q)); + } + } + + // ── Empty column ────────────────────────────────────────────────────── + + #[test] + fn empty_column_returns_empty() { + let strings: Vec<&[u8]> = vec![]; + let raw = make_raw(&strings); + let col = Column::compress(&raw.data, &raw.offsets_u64, DEFAULT_DICT12_CONFIG).unwrap(); + let eq = EqAutomaton::new(b"abc", &col.dictionary().clone()); + assert!(col.scan(eq).is_empty()); + } + + // ── Cross-validation against brute force on user strings ────────────── + + #[test] + fn consistency_with_brute_force() { + let data = user_strings(50); + let col = make_column(&data); + let dict = col.dictionary().clone(); + let q = "https://www.example.com/page"; + let eq = EqAutomaton::new(q.as_bytes(), &dict); + assert_eq!(col.scan(eq), brute_eq(&data, q.as_bytes())); + let eq2 = EqAutomaton::new(b"missing-needle", &dict); + assert!(col.scan(eq2).is_empty()); + } +} +// Port of `include/onpair/search/automata/prefix_automaton.h`. +// +// Token-level automaton for SQL `col LIKE 'prefix%'`. +// +// At each query position `i` we record two things: +// query[i] — the expected next token +// range[i] — the dictionary range of tokens whose bytes begin with the +// remaining suffix of the prefix at that position +// +// On `step(t)`: +// * if `t == query[i]`, advance. +// * else if `t` is inside `range[i]`, the row's i-th token diverges from +// the query but still extends the prefix legally — accept. +// * else reject. +// +// `is_dead` becomes true the moment we accept or reject; the row never has +// to be decompressed. 
+ +use crate::types::TokenRange; + +#[derive(Clone, Copy, PartialEq, Eq)] +enum Status { + Matching, + Accepted, + Rejected, +} + +pub struct PrefixAutomaton { + query: Vec, + intervals: Vec, + pos: usize, + status: Status, +} + +impl PrefixAutomaton { + pub fn new(prefix: &[u8], dict: &Dictionary) -> Self { + let query = tokenize(prefix, dict); + let q_len = query.len(); + let mut intervals = vec![TokenRange::default(); q_len]; + + if q_len == 0 { + return Self { + query, + intervals, + pos: 0, + status: Status::Accepted, + }; + } + + // For each token position, precompute the dictionary prefix range + // that any *divergent* token would have to lie in. Walk the prefix + // string and at each step ask "which tokens have these remaining + // bytes as a prefix of their bytes?". + let mut cur = 0usize; + for (i, &tok) in query.iter().enumerate() { + intervals[i] = dict.prefix_range(&prefix[cur..]); + cur += dict.token_size(tok); + } + + Self { + query, + intervals, + pos: 0, + status: Status::Matching, + } + } + + pub fn query_length(&self) -> usize { + self.query.len() + } +} + +impl TokenAutomaton for PrefixAutomaton { + #[inline] + fn step(&mut self, t: Token) { + if self.status != Status::Matching { + return; + } + if t != self.query[self.pos] { + self.status = if self.intervals[self.pos].contains(t) { + Status::Accepted + } else { + Status::Rejected + }; + return; + } + self.pos += 1; + if self.pos == self.query.len() { + self.status = Status::Accepted; + } + } + + #[inline] + fn is_accepted(&self) -> bool { + self.status == Status::Accepted + } + + #[inline] + fn reset(&mut self) { + self.pos = 0; + self.status = if self.query.is_empty() { + Status::Accepted + } else { + Status::Matching + }; + } + + #[inline] + fn is_dead(&self) -> bool { + self.status != Status::Matching + } +} + +#[cfg(test)] +mod tests3 { + use super::*; + use crate::column::Column; + use crate::config::DEFAULT_DICT12_CONFIG; + use crate::config::OnPairTrainingConfig; + use 
crate::test_corpus::make_raw; + use crate::test_corpus::random_ascii_strings; + use crate::test_corpus::user_strings; + + fn make_column>(strings: &[S]) -> Column { + make_column_bits(strings, 14) + } + + fn make_column_bits>(strings: &[S], bits: u32) -> Column { + let raw = make_raw(strings); + let cfg = OnPairTrainingConfig { + bits, + threshold: 0.5, + seed: 42, + }; + Column::compress(&raw.data, &raw.offsets_u64, cfg).unwrap() + } + + fn brute_prefix>(strings: &[S], prefix: &[u8]) -> Vec { + strings + .iter() + .enumerate() + .filter(|(_, s)| s.as_ref().starts_with(prefix)) + .map(|(i, _)| i) + .collect() + } + + // ── Basic ───────────────────────────────────────────────────────────── + + #[test] + fn basic_prefix_match() { + let data = [ + "user_000001", + "user_000002", + "admin_001", + "user_000003", + "guest_001", + "admin_002", + ]; + let col = make_column(&data); + let pa = PrefixAutomaton::new(b"user_", &col.dictionary().clone()); + assert_eq!(col.scan(pa), vec![0, 1, 3]); + } + + #[test] + fn admin_prefix() { + let data = ["user_000001", "admin_001", "admin_002", "guest_001"]; + let col = make_column(&data); + let pa = PrefixAutomaton::new(b"admin", &col.dictionary().clone()); + assert_eq!(col.scan(pa), vec![1, 2]); + } + + #[test] + fn no_matches() { + let data = ["abc", "def", "ghi"]; + let col = make_column(&data); + let pa = PrefixAutomaton::new(b"xyz", &col.dictionary().clone()); + assert!(col.scan(pa).is_empty()); + } + + #[test] + fn exact_match() { + let data = ["abc", "abcd", "abcde"]; + let col = make_column(&data); + let pa = PrefixAutomaton::new(b"abc", &col.dictionary().clone()); + assert_eq!(col.scan(pa).len(), 3); + } + + #[test] + fn prefix_longer_than_string() { + let data = ["ab", "abc", "abcd"]; + let col = make_column(&data); + let pa = PrefixAutomaton::new(b"abcde", &col.dictionary().clone()); + assert!(col.scan(pa).is_empty()); + } + + #[test] + fn single_char_prefix() { + let data = ["abc", "axe", "bcd", "apple"]; + let col = 
make_column(&data); + let pa = PrefixAutomaton::new(b"a", &col.dictionary().clone()); + assert_eq!(col.scan(pa).len(), 3); + } + + // ── Empty prefix ────────────────────────────────────────────────────── + + #[test] + fn empty_prefix_matches_all() { + let data = ["abc", "def", "ghi"]; + let col = make_column(&data); + let pa = PrefixAutomaton::new(b"", &col.dictionary().clone()); + assert_eq!(col.scan(pa).len(), 3); + } + + // ── User strings ────────────────────────────────────────────────────── + + #[test] + fn user_strings_prefix() { + let data = user_strings(50); + let col = make_column(&data); + let pa = PrefixAutomaton::new(b"https://", &col.dictionary().clone()); + // user_strings corpus rotates through six bases; five start with https://. + let expected = brute_prefix(&data, b"https://"); + assert_eq!(col.scan(pa), expected); + } + + // ── Rescannable ─────────────────────────────────────────────────────── + + #[test] + fn rescannable_same_column() { + let data = ["user_001", "admin_001", "user_002"]; + let col = make_column(&data); + let mut pa = PrefixAutomaton::new(b"user_", &col.dictionary().clone()); + let r1 = col.scan(&mut pa); + let r2 = col.scan(&mut pa); + assert_eq!(r1, r2); + } + + // ── All bit widths ──────────────────────────────────────────────────── + + #[test] + fn works_across_bit_widths() { + let data = ["user_001", "admin_001", "user_002", "user_003"]; + for bw in 9u32..=16 { + let col = make_column_bits(&data, bw); + let pa = PrefixAutomaton::new(b"user_", &col.dictionary().clone()); + assert_eq!(col.scan(pa).len(), 3, "bw={bw}"); + } + } + + // ── Divergence-inside-token ─────────────────────────────────────────── + + #[test] + fn prefix_boundary_within_token() { + let data = ["user_001", "useful", "umbrella"]; + let col = make_column(&data); + let pa = PrefixAutomaton::new(b"use", &col.dictionary().clone()); + assert_eq!(col.scan(pa).len(), 2); + } + + // ── Cross-validation ────────────────────────────────────────────────── + + 
#[test] + fn consistency_with_brute_force() { + let data = random_ascii_strings(200, 30, 123); + let col = make_column(&data); + for prefix in [b"a" as &[u8], b"ab", b"z", b"xx"] { + let pa = PrefixAutomaton::new(prefix, &col.dictionary().clone()); + assert_eq!( + col.scan(pa), + brute_prefix(&data, prefix), + "prefix={prefix:?}" + ); + } + } + + // ── Empty column ────────────────────────────────────────────────────── + + #[test] + fn empty_column_returns_empty() { + let strings: Vec<&[u8]> = vec![]; + let raw = make_raw(&strings); + let col = Column::compress(&raw.data, &raw.offsets_u64, DEFAULT_DICT12_CONFIG).unwrap(); + let pa = PrefixAutomaton::new(b"abc", &col.dictionary().clone()); + assert!(col.scan(pa).is_empty()); + } + + #[test] + fn empty_string_matches_empty_prefix() { + let data: Vec<&[u8]> = vec![b"", b"abc", b""]; + let col = make_column(&data); + let pa = PrefixAutomaton::new(b"", &col.dictionary().clone()); + assert_eq!(col.scan(pa).len(), 3); + } +} diff --git a/encodings/experimental/onpair-rs/src/bits.rs b/encodings/experimental/onpair-rs/src/bits.rs new file mode 100644 index 00000000000..4617df8fc76 --- /dev/null +++ b/encodings/experimental/onpair-rs/src/bits.rs @@ -0,0 +1,572 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Bit-packing primitives: writer, reader, monomorphic cursor. +// +// Ports of: +// * `encoding/parsing/bit_writer.h` -> [`BitWriter`] +// * (sys helper) -> [`read_bits_lsb`], [`unpack_codes_to_u16`] +// * `decoding/token_cursor.h` -> [`TokenCursor`] + [`dispatch_bits!`] +// +// All tokens are packed LSB-first across consecutive u64 words. `BitWriter` +// always appends one zero sentinel word after flushing so readers can safely +// over-read 8 bytes past the last real token. 
+ +use std::marker::PhantomData; +use std::ptr; + +use crate::store::Store; +use crate::types::StreamSpan; +use crate::types::Token; + +// ───────────────────────────────────────────────────────────────────────────── +// Bit reader — runtime bit width. +// ───────────────────────────────────────────────────────────────────────────── + +/// Read `bits` (1..=16) bits from `packed` starting at LSB-first bit position +/// `bit_pos`. Matches OnPair's `BitWriter` layout exactly. +#[inline] +pub fn read_bits_lsb(packed: &[u64], bit_pos: usize, bits: u32) -> u16 { + debug_assert!((1..=16).contains(&bits)); + let word_idx = bit_pos / 64; + let bit_off = (bit_pos % 64) as u32; + let mask: u64 = (1u64 << bits) - 1; + let low = packed[word_idx] >> bit_off; + let combined = if bit_off + bits <= 64 { + low & mask + } else { + let high = packed[word_idx + 1] << (64 - bit_off); + (low | high) & mask + }; + combined as u16 +} + +/// Decompress an LSB-first bit-packed token stream into a flat `Vec`. +pub fn unpack_codes_to_u16(packed: &[u64], total_tokens: usize, bits: u32) -> Vec { + assert!((9..=16).contains(&bits), "bits must be in [9, 16]"); + let mut out = Vec::with_capacity(total_tokens); + for t in 0..total_tokens { + out.push(read_bits_lsb(packed, t * bits as usize, bits)); + } + out +} + +// ───────────────────────────────────────────────────────────────────────────── +// Monomorphic cursor — compile-time bit width. +// ───────────────────────────────────────────────────────────────────────────── + +/// Pull-model cursor over a bit-packed token stream, monomorphised on `BITS` +/// so every shift / mask folds to a literal. Use [`dispatch_bits!`] to lift +/// a runtime `BitWidth` (9..=16) into the const-generic parameter. +pub struct TokenCursor<'a, const BITS: u32> { + base: *const u8, + bit_pos: u32, + bit_end: u32, + _marker: PhantomData<&'a [u64]>, +} + +impl<'a, const BITS: u32> TokenCursor<'a, BITS> { + /// Bind to `packed` and select `span`. 
+ #[inline] + pub fn new(packed: &'a [u64], span: StreamSpan) -> Self { + Self { + base: packed.as_ptr() as *const u8, + bit_pos: span.begin * BITS, + bit_end: span.end * BITS, + _marker: PhantomData, + } + } + + /// Bind without selecting a span yet; call [`Self::reset_to`] before reading. + #[inline] + pub fn new_unbound(packed: &'a [u64]) -> Self { + Self { + base: packed.as_ptr() as *const u8, + bit_pos: 0, + bit_end: 0, + _marker: PhantomData, + } + } + + /// Reset to a new span inside the same packed buffer. + #[inline] + pub fn reset_to(&mut self, span: StreamSpan) { + self.bit_pos = span.begin * BITS; + self.bit_end = span.end * BITS; + } + + #[inline] + pub fn has_more(&self) -> bool { + self.bit_pos < self.bit_end + } + + #[inline] + pub fn remaining(&self) -> u32 { + (self.bit_end - self.bit_pos) / BITS + } + + /// Decode and return the next token, advancing the cursor. + /// + /// # Safety + /// Caller must guarantee `has_more()` and that `packed` has the + /// `BitWriter`-emitted trailing zero-sentinel (8 bytes of safe + /// over-read). + #[inline] + #[allow(clippy::should_implement_trait)] + pub fn next(&mut self) -> Token { + // SAFETY: byte offset is bit_pos / 8 < bit_end / 8; the 4-byte load + // extends at most 3 bytes past the last real token byte, within the + // BitWriter sentinel pad. + unsafe { + let off = (self.bit_pos >> 3) as usize; + let raw = ptr::read_unaligned(self.base.add(off) as *const u32); + let mask: u32 = (1u32 << BITS) - 1; + let t = ((raw >> (self.bit_pos & 7)) & mask) as Token; + self.bit_pos += BITS; + t + } + } +} + +/// Dispatch a runtime `BitWidth` (9..=16) to a body where a `const BITS: u32` +/// is in scope. Equivalent to the C++ `dispatch_bits(bw, fn)` template. +/// +/// # Safety +/// The default arm is `unreachable_unchecked`. `Column::compress` validates +/// bits ∈ 9..=16, so any column-derived dispatch is sound. +#[macro_export] +macro_rules! 
dispatch_bits { + ($bits:expr, | $bits_const:ident | $body:expr) => { + match $bits { + 9 => { + const $bits_const: u32 = 9; + $body + } + 10 => { + const $bits_const: u32 = 10; + $body + } + 11 => { + const $bits_const: u32 = 11; + $body + } + 12 => { + const $bits_const: u32 = 12; + $body + } + 13 => { + const $bits_const: u32 = 13; + $body + } + 14 => { + const $bits_const: u32 = 14; + $body + } + 15 => { + const $bits_const: u32 = 15; + $body + } + 16 => { + const $bits_const: u32 = 16; + $body + } + _ => unsafe { std::hint::unreachable_unchecked() }, + } + }; +} + +// ───────────────────────────────────────────────────────────────────────────── +// BitWriter — runtime bit width, RAII flush. +// ───────────────────────────────────────────────────────────────────────────── + +/// LSB-first bit-packing into a [`Store`]. Tokens straddling a 64-bit word +/// boundary are split across the next word. The destructor flushes any +/// partial word and appends a zero sentinel. +pub struct BitWriter<'a> { + store: &'a mut Store, + bits: u8, + mask: u64, + buf: u64, + shift: u32, + count: usize, + flushed: bool, +} + +impl<'a> BitWriter<'a> { + pub fn new(store: &'a mut Store) -> Self { + let bits = store.bit_width; + let mask = (1u64 << bits) - 1; + store.packed.clear(); + store.packed.reserve(256); + Self { + store, + bits, + mask, + buf: 0, + shift: 0, + count: 0, + flushed: false, + } + } + + /// Append one token to the packed stream. + #[inline] + pub fn write(&mut self, token: Token) { + let value = (token as u64) & self.mask; + self.buf |= value << self.shift; + self.shift += self.bits as u32; + if self.shift >= 64 { + self.store.packed.push(self.buf); + self.shift -= 64; + // When the token exactly fills the word the spill must be zero; + // an unconditional right-shift by `bits` would only be a no-op + // at `bits == 64`. 
+ self.buf = if self.shift == 0 { + 0 + } else { + value >> (self.bits as u32 - self.shift) + }; + } + self.count += 1; + } + + /// Flush the partial word and append the zero sentinel. Idempotent. + /// Called automatically on drop. + pub fn flush(&mut self) { + if self.flushed { + return; + } + if self.shift > 0 { + self.store.packed.push(self.buf); + self.buf = 0; + self.shift = 0; + } + if self.count > 0 { + self.store.packed.push(0); + } + self.flushed = true; + } + + #[inline] + pub fn tokens_written(&self) -> usize { + self.count + } +} + +impl Drop for BitWriter<'_> { + fn drop(&mut self) { + self.flush(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::BitWidth; + + // ── read_bits_lsb / unpack_codes_to_u16 ──────────────────────────────── + + #[test] + fn unpack_roundtrips_simple_pattern() { + let bits = 12u32; + let a = 0xABC_u64; + let b = 0xDEF_u64; + let c = 0x123_u64; + let word = a | (b << 12) | (c << 24); + let packed = vec![word, 0]; + assert_eq!(read_bits_lsb(&packed, 0, bits), 0xABC); + assert_eq!(read_bits_lsb(&packed, 12, bits), 0xDEF); + assert_eq!(read_bits_lsb(&packed, 24, bits), 0x123); + assert_eq!( + unpack_codes_to_u16(&packed, 3, bits), + vec![0xABC, 0xDEF, 0x123] + ); + } + + // ── BitWriter ────────────────────────────────────────────────────────── + + fn roundtrip(bits: BitWidth, tokens: &[Token]) -> Vec { + let mut store = Store { + bit_width: bits, + ..Default::default() + }; + { + let mut w = BitWriter::new(&mut store); + for &t in tokens { + w.write(t); + } + } + let mut out = Vec::with_capacity(tokens.len()); + for i in 0..tokens.len() { + out.push(read_bits_lsb(&store.packed, i * bits as usize, bits as u32)); + } + out + } + + fn max_token(bits: BitWidth) -> Token { + ((1u32 << bits) - 1) as Token + } + + fn group_size(bits: u32) -> usize { + let mut lcm = bits; + while !lcm.is_multiple_of(64) { + lcm += bits; + } + (lcm / bits) as usize + } + + fn expected_packed_words(n: usize, bits: BitWidth) -> usize 
{ + (n * bits as usize).div_ceil(64) + } + + const WIDTHS: &[BitWidth] = &[9, 10, 11, 12, 13, 14, 15, 16]; + + #[test] + fn zero_tokens_produce_empty_packed() { + for &bw in WIDTHS { + let mut s = Store { + bit_width: bw, + ..Default::default() + }; + { + let w = BitWriter::new(&mut s); + assert_eq!(w.tokens_written(), 0); + } + assert!(s.packed.is_empty(), "bits={bw}"); + } + } + + #[test] + fn packed_size_consistent_with_token_count() { + for &bw in WIDTHS { + let n = group_size(bw as u32) + 3; + let mut s = Store { + bit_width: bw, + ..Default::default() + }; + { + let mut w = BitWriter::new(&mut s); + for _ in 0..n { + w.write(1); + } + } + assert_eq!( + s.packed.len(), + expected_packed_words(n, bw) + 1, + "bits={bw}" + ); + } + } + + #[test] + fn tokens_written_count_increases_per_write() { + for &bw in WIDTHS { + let mut s = Store { + bit_width: bw, + ..Default::default() + }; + let mut w = BitWriter::new(&mut s); + for i in 0u16..10 { + assert_eq!(w.tokens_written(), i as usize); + w.write(i); + } + assert_eq!(w.tokens_written(), 10); + w.flush(); + assert_eq!(w.tokens_written(), 10); + } + } + + #[test] + fn single_zero_token_roundtrip() { + for &bw in WIDTHS { + let r = roundtrip(bw, &[0]); + assert_eq!(r, vec![0]); + } + } + + #[test] + fn single_max_token_roundtrip() { + for &bw in WIDTHS { + let mx = max_token(bw); + assert_eq!(roundtrip(bw, &[mx]), vec![mx]); + } + } + + #[test] + fn mixed_zero_and_max_roundtrip() { + for &bw in WIDTHS { + let mx = max_token(bw); + let tokens: Vec = (0..30).map(|i| if i % 2 == 0 { 0 } else { mx }).collect(); + assert_eq!(roundtrip(bw, &tokens), tokens, "bits={bw}"); + } + } + + #[test] + fn incrementing_tokens_roundtrip() { + for &bw in WIDTHS { + let range = (max_token(bw) as u32) + 1; + let tokens: Vec = (0..200u32).map(|i| (i % range) as Token).collect(); + assert_eq!(roundtrip(bw, &tokens), tokens, "bits={bw}"); + } + } + + #[test] + fn group_boundary_token_counts() { + for &bw in WIDTHS { + let gs = group_size(bw 
as u32); + for &count in &[gs - 1, gs, gs + 1] { + let tokens = vec![bw as Token; count]; + assert_eq!(roundtrip(bw, &tokens), tokens, "bits={bw} count={count}"); + } + } + } + + #[test] + fn implicit_flush_via_drop() { + let mut s = Store { + bit_width: 16, + ..Default::default() + }; + { + let mut w = BitWriter::new(&mut s); + w.write(0xABCD); + } + assert_eq!(s.packed.len(), 2); + assert_eq!(s.packed[0] & 0xFFFF, 0xABCD); + } + + #[test] + fn explicit_flush_is_idempotent() { + let mut s = Store { + bit_width: 16, + ..Default::default() + }; + { + let mut w = BitWriter::new(&mut s); + w.write(0xABCD); + w.flush(); + } + assert_eq!(s.packed.len(), 2); + assert_eq!(s.packed[0] & 0xFFFF, 0xABCD); + } + + #[test] + fn constructor_clears_previous_data() { + let mut s = Store { + bit_width: 16, + ..Default::default() + }; + { + let mut w = BitWriter::new(&mut s); + w.write(0xAAAA); + } + assert_eq!(s.packed.len(), 2); + { + let mut w = BitWriter::new(&mut s); + w.write(0xBBBB); + } + assert_eq!(s.packed.len(), 2); + assert_eq!(s.packed[0] & 0xFFFF, 0xBBBB); + } + + #[test] + fn straddling_bit_layout_at_12_bits() { + let tokens: [Token; 6] = [0xABC, 0, 0, 0, 0, 0x123]; + let mut s = Store { + bit_width: 12, + ..Default::default() + }; + { + let mut w = BitWriter::new(&mut s); + for &t in &tokens { + w.write(t); + } + } + assert_eq!(s.packed.len(), 3); + assert_eq!(s.packed[0], 0x3000000000000ABC); + assert_eq!(s.packed[1], 0x0000000000000012); + assert_eq!(roundtrip(12, &tokens), tokens); + } + + // ── TokenCursor ──────────────────────────────────────────────────────── + + fn collect_with(s: &Store, n: usize) -> Vec { + let mut c = TokenCursor::::new( + &s.packed, + StreamSpan { + begin: 0, + end: n as u32, + }, + ); + let mut out = Vec::with_capacity(n); + while c.has_more() { + out.push(c.next()); + } + out + } + + fn pack(bits: BitWidth, tokens: &[Token]) -> Store { + let mut s = Store { + bit_width: bits, + ..Default::default() + }; + { + let mut w = 
BitWriter::new(&mut s); + for &t in tokens { + w.write(t); + } + } + s + } + + #[test] + fn cursor_roundtrip_all_widths() { + for bits in 9u8..=16 { + let max = ((1u32 << bits) - 1) as Token; + let tokens: Vec = (0..100u32) + .map(|i| ((i as Token).wrapping_mul(7)) & max) + .collect(); + let s = pack(bits, &tokens); + let out = match bits { + 9 => collect_with::<9>(&s, tokens.len()), + 10 => collect_with::<10>(&s, tokens.len()), + 11 => collect_with::<11>(&s, tokens.len()), + 12 => collect_with::<12>(&s, tokens.len()), + 13 => collect_with::<13>(&s, tokens.len()), + 14 => collect_with::<14>(&s, tokens.len()), + 15 => collect_with::<15>(&s, tokens.len()), + 16 => collect_with::<16>(&s, tokens.len()), + _ => unreachable!(), + }; + assert_eq!(out, tokens, "bits={bits}"); + } + } + + #[test] + fn cursor_remaining_decrements() { + let s = pack(12, &[1, 2, 3, 4, 5]); + let mut c = TokenCursor::<12>::new(&s.packed, StreamSpan { begin: 0, end: 5 }); + assert_eq!(c.remaining(), 5); + c.next(); + assert_eq!(c.remaining(), 4); + } + + #[test] + fn cursor_reset_to_works() { + let s = pack(12, &[10, 20, 30, 40, 50]); + let mut c = TokenCursor::<12>::new_unbound(&s.packed); + c.reset_to(StreamSpan { begin: 2, end: 5 }); + assert_eq!(c.next(), 30); + assert_eq!(c.next(), 40); + assert_eq!(c.next(), 50); + assert!(!c.has_more()); + } + + #[test] + fn dispatch_bits_routes_to_correct_arm() { + for bits in 9u8..=16 { + let result = dispatch_bits!(bits, |B| B); + assert_eq!(result, bits as u32); + } + } +} diff --git a/encodings/experimental/onpair-rs/src/column.rs b/encodings/experimental/onpair-rs/src/column.rs new file mode 100644 index 00000000000..e3cadfd1187 --- /dev/null +++ b/encodings/experimental/onpair-rs/src/column.rs @@ -0,0 +1,630 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Owning compressed column. 
API-compatible with the subset of +// `vortex-onpair-sys::Column` that `vortex-onpair` actually consumes: +// `compress`, `len`, `bits`, `dict_size`, `parts`. The shim accepts +// `&[u64]` row offsets so callers don't need to truncate to u32; internally +// we sanity-check and downcast. + +use aho_corasick::AhoCorasick; + +use crate::automaton::TokenAutomaton; +use crate::bits::TokenCursor; +use crate::config::Error; +use crate::config::OnPairTrainingConfig; +use crate::dict::Dictionary; +use crate::dispatch_bits; +use crate::parser::parse; +use crate::store::Store; +use crate::trainer::TrainResult; +use crate::trainer::train; +use crate::types::StreamSpan; +use crate::types::is_valid_bits; + +// ───────────────────────────────────────────────────────────────────────────── +// Bitmap helpers (private — `Column::*_bitmap` are the public entry points). +// ───────────────────────────────────────────────────────────────────────────── + +#[inline] +fn empty_bitmap(n: usize) -> Vec { + vec![0u8; n.div_ceil(8)] +} + +#[inline] +fn set_bit(bits: &mut [u8], i: usize) { + bits[i / 8] |= 1u8 << (i % 8); +} + +#[inline] +fn fill_bitmap(n: usize) -> Vec { + let mut bits = empty_bitmap(n); + for i in 0..n { + set_bit(&mut bits, i); + } + bits +} + +// ───────────────────────────────────────────────────────────────────────────── +// Const-generic decode and scan inner loops. Each is monomorphised per +// `BITS ∈ 9..=16` by `dispatch_bits!`, which lets the compiler fold every +// shift / mask in the cursor to a literal — same effect as the C++ +// `scan_impl` template after `dispatch_bits()` resolves it. +// ───────────────────────────────────────────────────────────────────────────── + +/// Sum decoded-byte lengths for a token-id span in one pass over the cursor. 
+#[inline] +fn span_decoded_len( + dict_table: &[u64], + packed: &[u64], + span: StreamSpan, +) -> usize { + let mut total = 0usize; + let mut cursor = TokenCursor::::new(packed, span); + while cursor.has_more() { + let code = cursor.next() as usize; + // SAFETY: every code is < dict_size, validated at compress time. + total += unsafe { (*dict_table.get_unchecked(code) & 0xffff) as usize }; + } + total +} + +/// Decode a token span into `out`. Uses a fixed 16-byte over-copy per token +/// (the trainer pads `dict_bytes` with MAX_TOKEN_SIZE trailing zeros so this +/// never reads past the end) and advances the cursor by the token's true +/// length. The compiler lowers `copy_nonoverlapping` of MAX_TOKEN_SIZE to a +/// single unaligned SIMD store on x86_64 / aarch64. +/// +/// `out` must have at least `decoded_len + MAX_TOKEN_SIZE` reserved +/// capacity at call time; we always set the final length to the *true* total +/// (no over-copy bytes are visible). +#[inline] +unsafe fn decode_span_unchecked( + dict_bytes: *const u8, + dict_table: &[u64], + packed: &[u64], + span: StreamSpan, + dst: *mut u8, +) -> usize { + let mut cursor = TokenCursor::::new(packed, span); + let mut cur = dst; + // SAFETY: caller invariants — dict_table indices are bounded by + // dict_size; dst has decoded_len + MAX_TOKEN_SIZE capacity. 
+ unsafe { + while cursor.has_more() { + let code = cursor.next() as usize; + let entry = *dict_table.get_unchecked(code); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + std::ptr::copy_nonoverlapping(dict_bytes.add(off), cur, crate::MAX_TOKEN_SIZE); + cur = cur.add(len); + } + cur.offset_from(dst) as usize + } +} + +#[inline] +fn scan_with_bits( + packed: &[u64], + boundaries: &[u32], + num_rows: usize, + aut: &mut A, + on_match: &mut F, +) where + A: TokenAutomaton, + F: FnMut(usize), +{ + let mut cursor = TokenCursor::::new_unbound(packed); + for row in 0..num_rows { + aut.reset(); + cursor.reset_to(StreamSpan { + begin: boundaries[row], + end: boundaries[row + 1], + }); + while cursor.has_more() { + let t = cursor.next(); + aut.step(t); + if aut.is_dead() { + break; + } + } + if aut.is_accepted() { + on_match(row); + } + } +} + +/// Pack `Dictionary` into a per-token `(offset << 16) | length` table. +/// Token length is bounded by `MAX_TOKEN_SIZE = 16`, so 16 bits suffice. +fn build_dict_table(dict: &Dictionary) -> Vec { + let n = dict.num_tokens(); + let mut table = Vec::with_capacity(n); + for i in 0..n { + let off = dict.offsets[i] as u64; + let len = (dict.offsets[i + 1] - dict.offsets[i]) as u64; + debug_assert!(len <= crate::MAX_TOKEN_SIZE as u64); + table.push((off << 16) | len); + } + table +} + +/// Owning compressed column. Built by [`Column::compress`]. +#[derive(Debug, Clone)] +pub struct Column { + dict: Dictionary, + store: Store, + num_rows: usize, + /// Per-token `(offset << 16) | length` packed into a `u64`. Built once + /// at compress / from_parts time so the decode and predicate hot loops + /// do one indexed load per token instead of two indexed offset reads. + /// Length = `dict.num_tokens()`. + dict_table: Vec, +} + +/// Borrowed raw arrays of a column. Mirrors `vortex-onpair-sys::Parts`. +#[derive(Copy, Clone)] +pub struct Parts<'a> { + /// Concatenated dictionary entry bytes. 
The C++ shim's caller pads this + /// with `MAX_TOKEN_SIZE` zeros before handing to the decoder; we expose + /// the unpadded logical slice (length `dict_offsets.last()`). + pub dict_bytes: &'a [u8], + /// Length `dict_size + 1`. + pub dict_offsets: &'a [u32], + /// LSB-first bit-packed token stream. + pub codes_packed: &'a [u64], + /// Length `num_rows + 1`. + pub codes_boundaries: &'a [u32], + /// Bits per token (9..=16). + pub bits: u32, + pub num_rows: usize, +} + +impl Column { + /// Compress `n` byte strings described by a flat `bytes` blob and an + /// `offsets` array of length `n + 1`. Matches + /// `vortex-onpair-sys::Column::compress`. + pub fn compress( + bytes: &[u8], + offsets: &[u64], + config: OnPairTrainingConfig, + ) -> Result { + if offsets.is_empty() { + return Err(Error::InvalidArg); + } + if !is_valid_bits(config.bits as u8) { + return Err(Error::InvalidArg); + } + let n = offsets.len() - 1; + + // Downcast u64 offsets to u32. Bail on overflow rather than wrap. + let mut off32 = Vec::with_capacity(offsets.len()); + for &o in offsets { + if o > u32::MAX as u64 { + return Err(Error::InvalidArg); + } + off32.push(o as u32); + } + if (off32[n] as usize) > bytes.len() { + return Err(Error::InvalidArg); + } + + let cfg = config.into(); + let TrainResult { dict, lpm } = train(bytes, &off32, n, &cfg); + let mut store = Store::default(); + parse(bytes, &off32, n, &lpm, config.bits as u8, &mut store); + let dict_table = build_dict_table(&dict); + + Ok(Self { + dict, + store, + num_rows: n, + dict_table, + }) + } + + #[inline] + pub fn len(&self) -> usize { + self.num_rows + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.num_rows == 0 + } + + #[inline] + pub fn bits(&self) -> u32 { + self.store.bit_width as u32 + } + + #[inline] + pub fn dict_size(&self) -> usize { + self.dict.num_tokens() + } + + /// Decompress row `row_id` into `out`, clearing it first. 
The hot path + /// uses the const-generic [`TokenCursor`] dispatched on `bits` plus a + /// fixed-width 16-byte over-copy per token (`MAX_TOKEN_SIZE`). LLVM + /// lowers each copy to one unaligned 128-bit SIMD store on x86_64 and + /// aarch64. + pub fn decompress_row(&self, row_id: usize, out: &mut Vec) -> Result<(), Error> { + if row_id >= self.num_rows { + return Err(Error::OutOfRange); + } + out.clear(); + let span = self.store.string_span(row_id); + // SAFETY of dispatch: bit_width is validated to 9..=16 in compress(). + dispatch_bits!(self.store.bit_width as u32, |B| { + self.decode_one_span_into::(span, out); + }); + Ok(()) + } + + /// Decompress every row into a flat byte buffer + `n + 1` offsets. + pub fn decode_all(&self) -> (Vec, Vec) { + let n = self.num_rows; + let mut offsets = Vec::with_capacity(n + 1); + offsets.push(0u32); + if n == 0 { + return (Vec::new(), offsets); + } + let bytes = dispatch_bits!(self.store.bit_width as u32, |B| { + self.decode_all_inner::(&mut offsets) + }); + (bytes, offsets) + } + + /// Inner monomorphic body of [`Self::decode_all`]. + #[inline] + fn decode_all_inner(&self, offsets: &mut Vec) -> Vec { + // Pre-compute the total decoded length: one cursor pass over every + // row in token-id space, summing each token's length from + // `dict_table[code] & 0xffff`. Lets us reserve `bytes` once and + // skip per-row capacity grow checks in the hot loop. 
+ let last = *self.store.boundaries.last().unwrap_or(&0); + let total = span_decoded_len::( + &self.dict_table, + &self.store.packed, + StreamSpan { + begin: 0, + end: last, + }, + ); + let mut bytes: Vec = Vec::with_capacity(total + crate::MAX_TOKEN_SIZE); + let dst = bytes.as_mut_ptr(); + let dict_bytes = self.dict.bytes.as_ptr(); + let dict_table = self.dict_table.as_slice(); + let mut cur_off = 0usize; + for row in 0..self.num_rows { + let span = self.store.string_span(row); + // SAFETY: `dst` has `total + MAX_TOKEN_SIZE` reserved + // capacity; each token's over-copy stays within `dict_bytes` + // (padded by `pad_for_decoder`). + let written = unsafe { + decode_span_unchecked::( + dict_bytes, + dict_table, + &self.store.packed, + span, + dst.add(cur_off), + ) + }; + cur_off += written; + offsets.push(cur_off as u32); + } + // SAFETY: cur_off == total ≤ reserved capacity. + unsafe { + bytes.set_len(cur_off); + } + bytes + } + + /// Internal helper: decode one span into `out` with the SIMD-friendly + /// over-copy loop. Used by `decompress_row` and `run_byte_predicate`. + #[inline] + fn decode_one_span_into(&self, span: StreamSpan, out: &mut Vec) { + let len = span_decoded_len::(&self.dict_table, &self.store.packed, span); + let start = out.len(); + out.reserve(len + crate::MAX_TOKEN_SIZE); + // SAFETY: capacity reserved above; over-copy stays within dict pad. + unsafe { + let written = decode_span_unchecked::( + self.dict.bytes.as_ptr(), + &self.dict_table, + &self.store.packed, + span, + out.as_mut_ptr().add(start), + ); + debug_assert_eq!(written, len); + out.set_len(start + written); + } + } + + /// `WHERE col = needle` as an LSB-first packed bitmap of length `(n + 7) / 8`. + /// + /// Decompress-then-match implementation. For very large columns prefer the + /// compressed-domain [`crate::EqAutomaton`] via [`Self::scan_bitmap`]. 
+ pub fn equals_bitmap(&self, needle: &[u8]) -> Vec { + self.run_byte_predicate(|row| row == needle) + } + + /// `col LIKE 'needle%'` as an LSB-first packed bitmap. + pub fn starts_with_bitmap(&self, needle: &[u8]) -> Vec { + self.run_byte_predicate(|row| row.starts_with(needle)) + } + + /// `col LIKE '%needle%'` as an LSB-first packed bitmap. Uses `memchr::memmem`. + pub fn contains_bitmap(&self, needle: &[u8]) -> Vec { + if needle.is_empty() { + return fill_bitmap(self.num_rows); + } + let finder = memchr::memmem::Finder::new(needle); + self.run_byte_predicate(|row| finder.find(row).is_some()) + } + + /// `LIKE '%a%' OR '%b%' OR ...` via Aho-Corasick. Empty `needles` → + /// all-zero bitmap. + pub fn multi_pattern_bitmap(&self, needles: &[&[u8]]) -> Vec { + if needles.is_empty() { + return empty_bitmap(self.num_rows); + } + let ac = AhoCorasick::new(needles).expect("aho-corasick: build"); + self.run_byte_predicate(|row| ac.is_match(row)) + } + + /// Decompress every row and apply `pred`. Shared backend for the + /// `*_bitmap` methods. + fn run_byte_predicate bool>(&self, mut pred: F) -> Vec { + let mut bits = empty_bitmap(self.num_rows); + // One reusable scratch buffer; the over-copy in + // `decode_one_span_into` extends the spare capacity by + // MAX_TOKEN_SIZE each call. + let mut buf: Vec = Vec::with_capacity(128); + // SAFETY of dispatch: bit_width validated 9..=16 in compress(). + dispatch_bits!(self.store.bit_width as u32, |B| { + for i in 0..self.num_rows { + buf.clear(); + let span = self.store.string_span(i); + self.decode_one_span_into::(span, &mut buf); + if pred(&buf) { + set_bit(&mut bits, i); + } + } + }); + bits + } + + /// Run a [`TokenAutomaton`] over every row's compressed token stream + /// and collect matching row ids. The automaton is reset at the start of + /// each row and stepped on every token; the loop breaks early when + /// `is_dead()` returns true. 
+ pub fn scan(&self, mut aut: A) -> Vec { + let mut out = Vec::new(); + self.scan_with(&mut aut, |i| out.push(i)); + out + } + + /// Callback form of [`Self::scan`] — no `Vec` allocation. + /// Hot path runs through a monomorphic [`TokenCursor`] selected + /// once via [`dispatch_bits!`], identical structure to the C++ + /// `scan_impl` template. + pub fn scan_with(&self, mut aut: A, mut on_match: F) { + // SAFETY of dispatch: bit_width is validated to 9..=16 in compress(). + dispatch_bits!(self.store.bit_width as u32, |B| { + scan_with_bits::( + &self.store.packed, + &self.store.boundaries, + self.num_rows, + &mut aut, + &mut on_match, + ); + }); + } + + /// Run a [`TokenAutomaton`] and collect matches as an LSB-first packed + /// bitmap. Same shape as the byte-level `*_bitmap` APIs. + pub fn scan_bitmap(&self, aut: A) -> Vec { + let mut bits = empty_bitmap(self.num_rows); + self.scan_with(aut, |i| bits[i / 8] |= 1u8 << (i % 8)); + bits + } + + /// Access the column's dictionary. Required to construct any + /// `*Automaton` (they take `&Dictionary`). + pub fn dictionary(&self) -> &Dictionary { + &self.dict + } + + /// Borrow the column's raw arrays for downstream consumers (decode loop, + /// predicate kernels). Mirrors `vortex-onpair-sys::Column::parts`. + pub fn parts(&self) -> Result, Error> { + // dict_bytes: logical-size slice, not including decoder padding. The + // C++ shim returns the same thing — `dict_bytes_len` is the byte + // count from offsets.back(), and `vortex-onpair`'s compress.rs adds + // MAX_TOKEN_SIZE of trailing zero padding itself. + let true_dict_bytes = *self.dict.offsets.last().unwrap_or(&0) as usize; + // Skip the trailing zero sentinel `BitWriter::flush` appended. + let codes_packed = if self.store.packed.is_empty() { + &self.store.packed[..] 
/// Test helper: flatten `strings` into one contiguous byte buffer plus an
/// offsets array of length `strings.len() + 1`.
///
/// `offsets[i]..offsets[i + 1]` is the byte range of `strings[i]`; the first
/// offset is always `0`.
fn pack_inputs(strings: &[&str]) -> (Vec<u8>, Vec<u64>) {
    let bytes = strings.concat().into_bytes();
    // Running end position of each string, prefixed by the leading 0.
    let offsets = std::iter::once(0u64)
        .chain(strings.iter().scan(0u64, |end, s| {
            *end += s.len() as u64;
            Some(*end)
        }))
        .collect();
    (bytes, offsets)
}
= [ + "user_000001", + "user_000002", + "admin_001", + "user_000003", + "guest_001", + ]; + let (bytes, offsets) = pack_inputs(&strings); + let cfg = OnPairTrainingConfig { + bits: 12, + threshold: 0.5, + seed: 7, + }; + let col = Column::compress(&bytes, &offsets, cfg).unwrap(); + assert_eq!(col.len(), 5); + assert_eq!(col.bits(), 12); + let parts = col.parts().unwrap(); + let dict_bytes_padded = { + let mut v = parts.dict_bytes.to_vec(); + v.extend(std::iter::repeat_n(0u8, crate::MAX_TOKEN_SIZE)); + v + }; + for (i, &s) in strings.iter().enumerate() { + let decoded = decode_row(&parts, &dict_bytes_padded, i); + assert_eq!(decoded, s.as_bytes(), "row {i}"); + } + } + + #[test] + fn roundtrip_with_binary_data_and_all_bit_widths() { + let strings: Vec> = (0..30u8) + .map(|i| { + let mut v = Vec::with_capacity(20); + for j in 0..20u8 { + v.push(i.wrapping_add(j)); + } + v + }) + .collect(); + let mut bytes = Vec::new(); + let mut offsets = Vec::with_capacity(strings.len() + 1); + offsets.push(0u64); + for s in &strings { + bytes.extend_from_slice(s); + offsets.push(bytes.len() as u64); + } + for bw in 9u32..=16 { + let cfg = OnPairTrainingConfig { + bits: bw, + threshold: 0.5, + seed: 99, + }; + let col = Column::compress(&bytes, &offsets, cfg).unwrap(); + assert_eq!(col.bits(), bw); + let parts = col.parts().unwrap(); + let dict_bytes_padded = { + let mut v = parts.dict_bytes.to_vec(); + v.extend(std::iter::repeat_n(0u8, crate::MAX_TOKEN_SIZE)); + v + }; + for (i, s) in strings.iter().enumerate() { + let decoded = decode_row(&parts, &dict_bytes_padded, i); + assert_eq!(decoded, *s, "bits={bw} row={i}"); + } + } + } + + #[test] + fn dict_first_256_tokens_cover_all_bytes_after_sort() { + let strings = ["hello world", "another row"]; + let (bytes, offsets) = pack_inputs(&strings); + let col = Column::compress(&bytes, &offsets, DEFAULT_DICT12_CONFIG).unwrap(); + let parts = col.parts().unwrap(); + // Every byte value 0..=255 must appear as a single-byte token somewhere 
+ // in the dictionary. + let mut found = [false; 256]; + for i in 0..parts.dict_offsets.len() - 1 { + let s = parts.dict_offsets[i] as usize; + let e = parts.dict_offsets[i + 1] as usize; + if e - s == 1 { + found[parts.dict_bytes[s] as usize] = true; + } + } + for (i, &f) in found.iter().enumerate() { + assert!(f, "byte {i} missing"); + } + } + + #[test] + fn parts_codes_packed_excludes_sentinel() { + let strings = ["x", "y", "z"]; + let (bytes, offsets) = pack_inputs(&strings); + let col = Column::compress(&bytes, &offsets, DEFAULT_DICT12_CONFIG).unwrap(); + let parts = col.parts().unwrap(); + // Number of token bits actually used. + let total_tokens = *parts.codes_boundaries.last().unwrap() as usize; + let needed_words = (total_tokens * parts.bits as usize).div_ceil(64); + assert_eq!(parts.codes_packed.len(), needed_words); + // unpack should still yield exactly total_tokens valid codes. + let codes = unpack_codes_to_u16(parts.codes_packed, total_tokens, parts.bits); + assert_eq!(codes.len(), total_tokens); + // All codes must be valid dict indices. + let dict_size = col.dict_size() as Token; + for &c in &codes { + assert!(c < dict_size); + } + } +} diff --git a/encodings/experimental/onpair-rs/src/config.rs b/encodings/experimental/onpair-rs/src/config.rs new file mode 100644 index 00000000000..8752b98bf82 --- /dev/null +++ b/encodings/experimental/onpair-rs/src/config.rs @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Port of `include/onpair/encoding/training/config.h` plus the +// FFI-shaped `OnPairTrainingConfig` from `vortex-onpair-sys`. + +use crate::types::BitWidth; + +/// Merge a token pair as soon as its frequency reaches `value`. +/// +/// Range: `[2, 255]`. The frequency counter is `u8` so larger values can +/// never trigger a merge. 
/// Adaptive merge threshold: training tunes the threshold so that the
/// dictionary fills to capacity within `sample_fraction` of the total input
/// bytes. Values in `(0.0, 1.0]`.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct DynamicThreshold {
    pub sample_fraction: f64,
}

impl Default for DynamicThreshold {
    /// Default to a sample fraction of `0.15`.
    fn default() -> Self {
        let sample_fraction = 0.15;
        DynamicThreshold { sample_fraction }
    }
}
+pub const DEFAULT_DICT12_CONFIG: OnPairTrainingConfig = OnPairTrainingConfig { + bits: 12, + threshold: 0.5, + seed: 0, +}; + +impl From for TrainingConfig { + fn from(c: OnPairTrainingConfig) -> Self { + Self { + bits: c.bits as BitWidth, + threshold: ThresholdSpec::Dynamic(DynamicThreshold { + sample_fraction: c.threshold, + }), + seed: (c.seed != 0).then_some(c.seed), + } + } +} + +/// Errors mirroring `vortex-onpair-sys::Error`. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum Error { + InvalidArg, + BadFormat, + OutOfRange, + Oom, + Internal, +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let msg = match self { + Error::InvalidArg => "OnPair: invalid argument", + Error::BadFormat => "OnPair: bad serialized format", + Error::OutOfRange => "OnPair: row index out of range", + Error::Oom => "OnPair: out of memory or buffer too small", + Error::Internal => "OnPair: internal error", + }; + f.write_str(msg) + } +} + +impl std::error::Error for Error {} diff --git a/encodings/experimental/onpair-rs/src/dict.rs b/encodings/experimental/onpair-rs/src/dict.rs new file mode 100644 index 00000000000..b413ca44d6e --- /dev/null +++ b/encodings/experimental/onpair-rs/src/dict.rs @@ -0,0 +1,560 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Port of `include/onpair/core/dictionary.h` and `dictionary_view.h`. +// +// The dictionary maps `Token` -> byte sequence with the Arrow binary layout: +// a flat `bytes` buffer plus an `offsets` array of length `num_tokens + 1`. +// Tokens are stored in lexicographic order so that prefix range lookups can +// use binary search. + +use crate::types::ByteSpan; +use crate::types::MAX_TOKEN_SIZE; +use crate::types::Token; +use crate::types::TokenRange; + +#[derive(Default, Debug, Clone)] +pub struct Dictionary { + /// Flat concatenation of token bytes. 
+ /// + /// `pad_for_decoder()` may extend this past `offsets.back()` with zeros + /// so the decoder can issue a fixed `MAX_TOKEN_SIZE` byte over-copy from + /// any token offset without reading out of bounds. `bytes_used()` always + /// reports the logical size derived from `offsets`. + pub bytes: Vec, + + /// `offsets[i]..offsets[i+1]` = byte range of token `i` in `bytes`. + /// Invariants: `offsets[0] == 0`, `offsets.len() == num_tokens + 1`. + pub offsets: Vec, +} + +impl Dictionary { + #[inline] + pub fn num_tokens(&self) -> usize { + if self.offsets.is_empty() { + 0 + } else { + self.offsets.len() - 1 + } + } + + /// Logical byte cost (true token bytes + offsets array). Unaffected by + /// any padding `pad_for_decoder()` appended. + #[inline] + pub fn bytes_used(&self) -> usize { + let true_bytes = self.offsets.last().copied().unwrap_or(0) as usize; + let offsets_bytes = self.offsets.len() * size_of::(); + true_bytes + offsets_bytes + } + + /// Append zero bytes so the decoder may safely over-copy `MAX_TOKEN_SIZE` + /// bytes from any token offset. Idempotent. 
+ pub fn pad_for_decoder(&mut self) { + if self.offsets.len() < 2 { + return; + } + let last_off = *self.offsets.last().unwrap() as usize; + if self.bytes.len() > last_off { + return; // already padded + } + let last_start = self.offsets[self.offsets.len() - 2] as usize; + let last_len = last_off - last_start; + self.bytes + .resize(self.bytes.len() + (MAX_TOKEN_SIZE - last_len), 0); + } + + // ── DictionaryView equivalents (free fns on Dictionary; no view type) ── + + #[inline] + pub fn span(&self, id: Token) -> ByteSpan { + ByteSpan { + begin: self.offsets[id as usize], + end: self.offsets[id as usize + 1], + } + } + + #[inline] + pub fn data(&self, id: Token) -> &[u8] { + let s = self.span(id); + &self.bytes[s.begin as usize..s.end as usize] + } + + #[inline] + pub fn token_size(&self, id: Token) -> usize { + let s = self.span(id); + s.size() as usize + } + + /// Inclusive `[lo, hi]` token-id range whose byte sequences begin with + /// `prefix`. Mirrors `DictionaryView::prefix_range` in onpair_cpp. + pub fn prefix_range(&self, prefix: &[u8]) -> TokenRange { + if prefix.len() > MAX_TOKEN_SIZE { + return TokenRange::default(); + } + + let n = self.num_tokens() as u32; + + // Find the first token whose bytes >= `target`, starting at `start`. 
+ let lower_bound = |target: &[u8], start: u32| -> u32 { + let mut lo = start; + let mut hi = n; + while lo < hi { + let mid = lo + ((hi - lo) >> 1); + let m_off = self.offsets[mid as usize] as usize; + let m_end = self.offsets[mid as usize + 1] as usize; + let m_len = m_end - m_off; + let cmp_len = m_len.min(target.len()); + let cmp = self.bytes[m_off..m_off + cmp_len].cmp(&target[..cmp_len]); + // token[mid] < target iff cmp == Less, or cmp == Equal AND token shorter + if cmp == std::cmp::Ordering::Less + || (cmp == std::cmp::Ordering::Equal && m_len < target.len()) + { + lo = mid + 1; + } else { + hi = mid; + } + } + lo + }; + + let lo = lower_bound(prefix, 0); + + // Compute the next lexicographic prefix by incrementing the last + // non-0xFF byte, trimming trailing 0xFF bytes first. + let mut buf = [0u8; MAX_TOKEN_SIZE]; + let mut ulen = prefix.len(); + let mut overflow = true; + while ulen > 0 { + if prefix[ulen - 1] < 0xFF { + buf[..ulen].copy_from_slice(&prefix[..ulen]); + buf[ulen - 1] = buf[ulen - 1].wrapping_add(1); + overflow = false; + break; + } + ulen -= 1; + } + + let hi = if overflow { + n + } else { + lower_bound(&buf[..ulen], lo) + }; + + if lo < hi { + TokenRange { + begin: lo as Token, + last: (hi - 1) as Token, + } + } else { + TokenRange::default() + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // ─── Dictionary basics ────────────────────────────────────────────────── + + #[test] + fn num_tokens_zero_when_offsets_empty() { + let d = Dictionary::default(); + assert_eq!(d.num_tokens(), 0); + } + + #[test] + fn num_tokens_is_offsets_len_minus_one() { + let d = Dictionary { + bytes: vec![], + offsets: vec![0, 3, 5, 8], + }; + assert_eq!(d.num_tokens(), 3); + } + + #[test] + fn num_tokens_single_entry_minus_one() { + let d = Dictionary { + bytes: vec![], + offsets: vec![0, 7], + }; + assert_eq!(d.num_tokens(), 1); + } + + #[test] + fn bytes_used_accounts_for_both_vectors() { + let d = Dictionary { + bytes: vec![0x00, 0x01, 0x02], + 
offsets: vec![0, 1, 2, 3], + }; + let expected = 3 + 4 * size_of::(); + assert_eq!(d.bytes_used(), expected); + } + + #[test] + fn bytes_used_zero_when_empty() { + let d = Dictionary::default(); + assert_eq!(d.bytes_used(), 0); + } + + // ─── pad_for_decoder ──────────────────────────────────────────────────── + + #[test] + fn pad_for_decoder_adds_trailing_zeros() { + let mut d = Dictionary { + bytes: b"hello".to_vec(), + offsets: vec![0, 5], + }; + d.pad_for_decoder(); + assert_eq!(d.bytes.len(), MAX_TOKEN_SIZE); + } + + #[test] + fn pad_for_decoder_is_idempotent() { + let mut d = Dictionary { + bytes: b"ab".to_vec(), + offsets: vec![0, 2], + }; + d.pad_for_decoder(); + let size1 = d.bytes.len(); + d.pad_for_decoder(); + assert_eq!(d.bytes.len(), size1); + } + + #[test] + fn pad_for_decoder_noop_for_max_token_size() { + let mut d = Dictionary { + bytes: vec![b'x'; MAX_TOKEN_SIZE], + offsets: vec![0, MAX_TOKEN_SIZE as u32], + }; + d.pad_for_decoder(); + assert_eq!(d.bytes.len(), MAX_TOKEN_SIZE); + } + + #[test] + fn pad_for_decoder_uses_last_token_only() { + // tokens: "ab" (2 bytes), "cde" (3 bytes); pad based on last (3). 
+ let mut d = Dictionary { + bytes: b"abcde".to_vec(), + offsets: vec![0, 2, 5], + }; + d.pad_for_decoder(); + assert_eq!(d.bytes.len(), 5 + (MAX_TOKEN_SIZE - 3)); + } + + #[test] + fn bytes_used_unchanged_after_padding() { + let mut d = Dictionary { + bytes: b"hello".to_vec(), + offsets: vec![0, 5], + }; + let before = d.bytes_used(); + d.pad_for_decoder(); + assert_eq!(d.bytes_used(), before); + assert!(d.bytes.len() > *d.offsets.last().unwrap() as usize); + } + + #[test] + fn pad_for_decoder_noop_when_fewer_than_two_offsets() { + let mut d = Dictionary { + bytes: vec![], + offsets: vec![0], + }; + d.pad_for_decoder(); + assert!(d.bytes.is_empty()); + } + + #[test] + fn pad_for_decoder_padding_bytes_are_zero() { + let mut d = Dictionary { + bytes: b"xy".to_vec(), + offsets: vec![0, 2], + }; + d.pad_for_decoder(); + for (i, b) in d.bytes.iter().enumerate().skip(2) { + assert_eq!(*b, 0, "non-zero padding at index {i}"); + } + } + + // ─── DictionaryView equivalents ───────────────────────────────────────── + + fn make_abc() -> Dictionary { + Dictionary { + bytes: vec![b'a', b'b', b'c'], + offsets: vec![0, 1, 2, 3], + } + } + + fn make_varying() -> Dictionary { + // "a", "bc", "def" + Dictionary { + bytes: b"abcdef".to_vec(), + offsets: vec![0, 1, 3, 6], + } + } + + fn make_prefix_dict() -> Dictionary { + // sorted: "a", "ab", "b" + Dictionary { + bytes: vec![b'a', b'a', b'b', b'b'], + offsets: vec![0, 1, 3, 4], + } + } + + #[test] + fn span_returns_correct_range() { + let d = make_varying(); + assert_eq!(d.span(0), ByteSpan { begin: 0, end: 1 }); + assert_eq!(d.span(1), ByteSpan { begin: 1, end: 3 }); + assert_eq!(d.span(2), ByteSpan { begin: 3, end: 6 }); + } + + #[test] + fn data_points_to_correct_byte() { + let d = make_varying(); + assert_eq!(d.data(0)[0], b'a'); + assert_eq!(d.data(1)[0], b'b'); + assert_eq!(d.data(2)[0], b'd'); + } + + #[test] + fn token_size_consistent_with_span() { + let d = make_varying(); + for t in 0u16..3 { + assert_eq!(d.token_size(t), 
d.span(t).size() as usize); + } + } + + #[test] + fn token_sizes() { + let d = make_varying(); + assert_eq!(d.token_size(0), 1); + assert_eq!(d.token_size(1), 2); + assert_eq!(d.token_size(2), 3); + } + + #[test] + fn num_tokens_matches_storage() { + let d = make_varying(); + assert_eq!(d.num_tokens(), 3); + } + + #[test] + fn accepts_empty_dictionary() { + let d = Dictionary::default(); + assert_eq!(d.num_tokens(), 0); + } + + #[test] + fn bytes_used_unaffected_by_padding() { + let mut d = make_varying(); + let before_pad = d.bytes_used(); + d.pad_for_decoder(); + assert_eq!(d.bytes_used(), before_pad); + } + + // ─── prefix_range ─────────────────────────────────────────────────────── + + #[test] + fn empty_dict_returns_empty_range() { + let d = Dictionary::default(); + assert!(d.prefix_range(b"a").empty()); + } + + #[test] + fn exact_single_token_match() { + let d = make_abc(); + let r = d.prefix_range(b"b"); + assert!(!r.empty()); + assert_eq!(r.size(), 1); + assert_eq!(r.begin, 1); + assert_eq!(r.last, 1); + } + + #[test] + fn prefix_matches_multiple_tokens() { + let d = make_prefix_dict(); + let r = d.prefix_range(b"a"); + assert!(!r.empty()); + assert_eq!(r.begin, 0); + assert_eq!(r.last, 1); + assert_eq!(r.size(), 2); + } + + #[test] + fn no_match_returns_empty() { + let d = make_abc(); + assert!(d.prefix_range(b"z").empty()); + } + + #[test] + fn prefix_longer_than_max_returns_empty() { + let d = make_abc(); + let buf = [0u8; MAX_TOKEN_SIZE + 1]; + assert!(d.prefix_range(&buf).empty()); + } + + #[test] + fn all_ff_bytes_prefix() { + // tokens {0xFF}, {0xFF, 0xFF} + let d = Dictionary { + bytes: vec![0xFF, 0xFF, 0xFF], + offsets: vec![0, 1, 3], + }; + let r = d.prefix_range(&[0xFF]); + assert_eq!(r.begin, 0); + assert_eq!(r.last, 1); + } + + #[test] + fn exact_length_match_first_and_only_token() { + let d = Dictionary { + bytes: b"hello".to_vec(), + offsets: vec![0, 5], + }; + let r = d.prefix_range(b"hello"); + assert!(!r.empty()); + assert_eq!(r.begin, 
0); + assert_eq!(r.last, 0); + } + + #[test] + fn contains_returns_true_for_all_in_range() { + let d = make_prefix_dict(); + let r = d.prefix_range(b"a"); + assert!(r.contains(0)); + assert!(r.contains(1)); + assert!(!r.contains(2)); + } + + #[test] + fn empty_pattern_matches_all_tokens() { + let d = make_abc(); + let r = d.prefix_range(&[]); + assert_eq!(r.size(), 3); + assert_eq!(r.begin, 0); + assert_eq!(r.last, 2); + } + + #[test] + fn empty_pattern_on_single_token_dict() { + let d = Dictionary { + bytes: vec![b'x'], + offsets: vec![0, 1], + }; + let r = d.prefix_range(&[]); + assert!(!r.empty()); + assert_eq!(r.size(), 1); + } + + #[test] + fn pattern_exactly_max_token_size_can_match() { + let d = Dictionary { + bytes: vec![b'z'; MAX_TOKEN_SIZE], + offsets: vec![0, MAX_TOKEN_SIZE as u32], + }; + let needle = vec![b'z'; MAX_TOKEN_SIZE]; + let r = d.prefix_range(&needle); + assert!(!r.empty()); + assert_eq!(r.size(), 1); + assert_eq!(r.begin, 0); + } + + #[test] + fn all_ff_multi_byte_prefix() { + // tokens: {0xFF} and {0xFF, 0xFF}; prefix {0xFF, 0xFF} matches only the + // second. 
+ let d = Dictionary { + bytes: vec![0xFF, 0xFF, 0xFF], + offsets: vec![0, 1, 3], + }; + let r = d.prefix_range(&[0xFF, 0xFF]); + assert!(!r.empty()); + assert_eq!(r.size(), 1); + assert_eq!(r.begin, 1); + assert_eq!(r.last, 1); + } + + #[test] + fn all_ff_prefix_beyond_all_tokens() { + let d = Dictionary { + bytes: vec![0xFE], + offsets: vec![0, 1], + }; + let r = d.prefix_range(&[0xFF, 0xFF]); + assert!(r.empty()); + } + + #[test] + fn single_token_dict_matching_prefix() { + let d = Dictionary { + bytes: b"hello".to_vec(), + offsets: vec![0, 5], + }; + let r = d.prefix_range(b"he"); + assert!(!r.empty()); + assert_eq!(r.size(), 1); + assert_eq!(r.begin, 0); + } + + #[test] + fn single_token_dict_non_matching_prefix() { + let d = Dictionary { + bytes: b"hello".to_vec(), + offsets: vec![0, 5], + }; + assert!(d.prefix_range(b"x").empty()); + } + + #[test] + fn overlapping_prefixes_deep_nesting() { + // "a", "aa", "aaa", "b" + let d = Dictionary { + bytes: vec![b'a', b'a', b'a', b'a', b'a', b'a', b'b'], + offsets: vec![0, 1, 3, 6, 7], + }; + let r_a = d.prefix_range(b"a"); + assert_eq!(r_a.size(), 3); + assert_eq!(r_a.begin, 0); + assert_eq!(r_a.last, 2); + + let r_aa = d.prefix_range(b"aa"); + assert_eq!(r_aa.size(), 2); + assert_eq!(r_aa.begin, 1); + assert_eq!(r_aa.last, 2); + + let r_aaa = d.prefix_range(b"aaa"); + assert_eq!(r_aaa.size(), 1); + assert_eq!(r_aaa.begin, 2); + + let r_b = d.prefix_range(b"b"); + assert_eq!(r_b.size(), 1); + assert_eq!(r_b.begin, 3); + } + + #[test] + fn contiguous_range_bounds_no_spillover() { + // "apple", "apt", "b" + let d = Dictionary { + bytes: b"appleaptb".to_vec(), + offsets: vec![0, 5, 8, 9], + }; + let r = d.prefix_range(b"ap"); + assert_eq!(r.size(), 2); + assert!(!r.contains(2)); + } + + #[test] + fn prefix_equals_full_token_content() { + // "ab", "abc", "abd", "b" + let d = Dictionary { + bytes: b"ababcabdb".to_vec(), + offsets: vec![0, 2, 5, 8, 9], + }; + let r = d.prefix_range(b"ab"); + assert_eq!(r.size(), 3); + 
assert_eq!(r.begin, 0); + assert_eq!(r.last, 2); + assert!(!r.contains(3)); + } +} diff --git a/encodings/experimental/onpair-rs/src/kmp.rs b/encodings/experimental/onpair-rs/src/kmp.rs new file mode 100644 index 00000000000..8263e9f5ae2 --- /dev/null +++ b/encodings/experimental/onpair-rs/src/kmp.rs @@ -0,0 +1,554 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Port of `include/onpair/search/automata/kmp_automaton.h`. +// +// Token-level KMP automaton for SQL `col LIKE '%pattern%'`. +// +// Construction (mirrors the C++ original): +// 1. Byte-level KMP failure table over the pattern. +// 2. Base pass — for every dictionary token `t`, evolve KMP from state 0 +// through `t`'s bytes and record the exit state in `base[t]`. +// 3. Sparse pass — for every non-zero entry state `j`, find dictionary +// tokens whose exit state from `j` differs from `base[t]`. These are +// stored as sorted-by-token sparse range tables grouped per entry +// state. +// +// `step(t)`: +// * if `state > 0`, scan the sparse ranges for the current state. If `t` +// lies inside one of them, jump to its target state. +// * otherwise the transition is `base[t]`. +// +// `is_dead()` becomes true once `state == pattern_length` (a match has been +// observed). Pattern length is capped at 255 bytes because states are +// stored as `u8`. + +use crate::automaton::TokenAutomaton; +use crate::dict::Dictionary; +use crate::types::Token; +use crate::types::TokenRange; + +type State = u8; + +#[derive(Clone, Copy, Debug)] +struct SparseTransition { + range: TokenRange, + target: State, +} + +pub struct KmpAutomaton { + match_state: State, + state: State, + /// `base[token] = KMP exit state after consuming token's bytes from state 0`. + base: Vec, + /// Flattened sparse transitions: transitions for entry state `s` live + /// at `sparse[offsets[s]..offsets[s+1]]`. 
+ sparse: Vec, + offsets: Vec, +} + +impl KmpAutomaton { + pub fn new(pattern: &[u8], dict: &Dictionary) -> Self { + let m = pattern.len(); + assert!( + m <= u8::MAX as usize, + "KmpAutomaton pattern must be at most 255 bytes" + ); + let num_tokens = dict.num_tokens(); + + if m == 0 { + return Self { + match_state: 0, + state: 0, + base: vec![0; num_tokens], + sparse: Vec::new(), + offsets: vec![0u16; 2], + }; + } + let match_state = m as State; + + // ── 1. KMP failure table ─────────────────────────────────────────── + let fail = build_failure(pattern); + + // Step the KMP DFA over a byte string starting from `s`. Once we + // reach `m` (match), subsequent bytes are absorbed and we stay at m. + let step_bytes = |mut s: State, data: &[u8]| -> State { + for &c in data { + if s as usize == m { + return match_state; + } + while s > 0 && pattern[s as usize] != c { + s = fail[(s - 1) as usize]; + } + if pattern[s as usize] == c { + s += 1; + } + } + s + }; + + // ── 2. Base pass ────────────────────────────────────────────────── + let mut base = vec![0u8; num_tokens]; + let p0 = pattern[0]; + for t in 0..num_tokens { + let tok = dict.data(t as Token); + if !tok.contains(&p0) { + base[t] = 0; + continue; + } + base[t] = step_bytes(0, tok); + } + + // ── 3. Sparse pass — dual-KMP trie traversal ─────────────────────── + let mut sparse: Vec = Vec::new(); + let mut offsets = vec![0u16; m + 1]; + + let mut work = SparseBuilder { + dict, + base: &base, + sparse: &mut sparse, + range_start: 0, + }; + + let mut relevant_chars: Vec = Vec::with_capacity(m); + + for j in 1..(m as State) { + work.range_start = work.sparse.len(); + offsets[j as usize] = work.range_start as u16; + + // Bytes that could cause a different KMP transition from j vs + // from 0: exactly the pattern bytes along the failure chain + // j → fail[j-1] → ... → 0. 
+ relevant_chars.clear(); + let mut s = j; + while s > 0 { + relevant_chars.push(pattern[s as usize]); + s = fail[(s - 1) as usize]; + } + relevant_chars.sort_unstable(); + relevant_chars.dedup(); + + for &byte in &relevant_chars { + let range = dict.prefix_range(&[byte]); + if range.empty() { + continue; + } + let kj = step_bytes(j, &[byte]); + let k0 = step_bytes(0, &[byte]); + work.traverse(range, 1, kj, k0, match_state, &step_bytes); + } + } + + offsets[m] = sparse.len() as u16; + + Self { + match_state, + state: 0, + base, + sparse, + offsets, + } + } + + /// Pattern length (== matching state, capped at 255). + pub fn pattern_length(&self) -> usize { + self.match_state as usize + } + + /// Total number of sparse transitions emitted at construction. + pub fn sparse_range_count(&self) -> usize { + self.sparse.len() + } +} + +impl TokenAutomaton for KmpAutomaton { + #[inline] + fn step(&mut self, t: Token) { + if self.state == self.match_state { + return; + } + if self.state > 0 { + let lo = self.offsets[self.state as usize] as usize; + let hi = self.offsets[(self.state as usize) + 1] as usize; + // Sparse table is sorted by range.begin and ranges don't overlap. + for r in &self.sparse[lo..hi] { + if t < r.range.begin { + break; + } + if t <= r.range.last { + self.state = r.target; + return; + } + } + } + self.state = self.base[t as usize]; + } + + #[inline] + fn is_accepted(&self) -> bool { + self.state == self.match_state + } + + #[inline] + fn reset(&mut self) { + self.state = 0; + } + + #[inline] + fn is_dead(&self) -> bool { + self.state == self.match_state + } +} + +/// Internal builder split out so we can recurse while holding a `&mut +/// Vec`. The closure-based recursion in the C++ original +/// translates poorly to Rust's borrow checker; an explicit struct works +/// fine. +struct SparseBuilder<'a> { + dict: &'a Dictionary, + base: &'a [State], + sparse: &'a mut Vec, + /// Index in `sparse` where the current entry state's ranges began. 
+ /// Used to merge adjacent same-target ranges within one state group. + range_start: usize, +} + +impl SparseBuilder<'_> { + fn emit(&mut self, range: TokenRange, target: State) { + if self.sparse.len() > self.range_start { + let last = self.sparse.last_mut().unwrap(); + if last.target == target && (last.range.last as u32) + 1 == range.begin as u32 { + last.range.last = range.last; + return; + } + } + self.sparse.push(SparseTransition { range, target }); + } + + fn traverse( + &mut self, + tr: TokenRange, + depth: usize, + kmp_j: State, + kmp_0: State, + match_state: State, + step_bytes: &F, + ) where + F: Fn(State, &[u8]) -> State, + { + if kmp_j == kmp_0 || tr.empty() { + return; + } + + // Full match from kmp_j: override tokens whose base != match_state. + if kmp_j == match_state { + let mut i = tr.begin; + while i <= tr.last { + if self.base[i as usize] != match_state { + let start = i; + while i <= tr.last && self.base[i as usize] != match_state { + i = i.wrapping_add(1); + if i == 0 { + // overflow guard for u16 + break; + } + } + self.emit( + TokenRange { + begin: start, + last: i.wrapping_sub(1), + }, + match_state, + ); + } else { + if i == tr.last { + break; + } + i = i.wrapping_add(1); + } + } + return; + } + + // Leaf tokens (token length == depth) all share exit state kmp_j. + let mut cur = tr.begin; + while cur <= tr.last && self.dict.token_size(cur) == depth { + if cur == tr.last { + self.emit( + TokenRange { + begin: tr.begin, + last: cur, + }, + kmp_j, + ); + return; + } + cur = cur.wrapping_add(1); + } + if cur > tr.begin { + self.emit( + TokenRange { + begin: tr.begin, + last: cur - 1, + }, + kmp_j, + ); + } + if cur > tr.last { + return; + } + + // Recurse into subtrees partitioned by byte at `depth`. 
+ while cur <= tr.last { + let c = self.dict.data(cur)[depth]; + let mut sub_hi = cur; + while sub_hi < tr.last && self.dict.data(sub_hi + 1)[depth] == c { + sub_hi += 1; + } + self.traverse( + TokenRange { + begin: cur, + last: sub_hi, + }, + depth + 1, + step_bytes(kmp_j, &[c]), + step_bytes(kmp_0, &[c]), + match_state, + step_bytes, + ); + if sub_hi == tr.last { + break; + } + cur = sub_hi + 1; + } + } +} + +/// Standard KMP failure function over `pattern`. +fn build_failure(pattern: &[u8]) -> Vec { + let m = pattern.len(); + let mut fail = vec![0u8; m]; + let mut len: State = 0; + let mut i = 1usize; + while i < m { + if pattern[i] == pattern[len as usize] { + len += 1; + fail[i] = len; + i += 1; + } else if len > 0 { + len = fail[(len - 1) as usize]; + } else { + fail[i] = 0; + i += 1; + } + } + fail +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::column::Column; + use crate::config::DEFAULT_DICT12_CONFIG; + use crate::config::OnPairTrainingConfig; + use crate::test_corpus::make_raw; + use crate::test_corpus::random_ascii_strings; + use crate::test_corpus::user_strings; + + fn make_column>(strings: &[S]) -> Column { + make_column_bits(strings, 14) + } + + fn make_column_bits>(strings: &[S], bits: u32) -> Column { + let raw = make_raw(strings); + let cfg = OnPairTrainingConfig { + bits, + threshold: 0.5, + seed: 42, + }; + Column::compress(&raw.data, &raw.offsets_u64, cfg).unwrap() + } + + fn brute_contains>(strings: &[S], needle: &[u8]) -> Vec { + if needle.is_empty() { + return (0..strings.len()).collect(); + } + strings + .iter() + .enumerate() + .filter(|(_, s)| s.as_ref().windows(needle.len()).any(|w| w == needle)) + .map(|(i, _)| i) + .collect() + } + + // ── Empty pattern ───────────────────────────────────────────────────── + + #[test] + fn empty_pattern_matches_all() { + let data = ["abc", "def", "ghi"]; + let col = make_column(&data); + let kmp = KmpAutomaton::new(b"", &col.dictionary().clone()); + assert_eq!(col.scan(kmp).len(), 3); + } 
+ + // ── Basic substring search ──────────────────────────────────────────── + + #[test] + fn basic_substring_match() { + let data = [ + "hello world", + "foo bar", + "hello there", + "world hello", + "xyz", + ]; + let col = make_column(&data); + let kmp = KmpAutomaton::new(b"hello", &col.dictionary().clone()); + assert_eq!(col.scan(kmp), vec![0, 2, 3]); + } + + #[test] + fn pattern_at_beginning() { + let data = ["abc_def", "xyz_abc", "abc"]; + let col = make_column(&data); + let kmp = KmpAutomaton::new(b"abc", &col.dictionary().clone()); + assert_eq!(col.scan(kmp).len(), 3); + } + + #[test] + fn pattern_at_end() { + let data = ["hello_xyz", "abc_xyz", "no_match"]; + let col = make_column(&data); + let kmp = KmpAutomaton::new(b"xyz", &col.dictionary().clone()); + assert_eq!(col.scan(kmp), vec![0, 1]); + } + + #[test] + fn no_matches() { + let data = ["abc", "def", "ghi"]; + let col = make_column(&data); + let kmp = KmpAutomaton::new(b"xyz", &col.dictionary().clone()); + assert!(col.scan(kmp).is_empty()); + } + + #[test] + fn exact_string_match() { + let data = ["abc", "abcd", "ab"]; + let col = make_column(&data); + let kmp = KmpAutomaton::new(b"abc", &col.dictionary().clone()); + assert_eq!(col.scan(kmp), vec![0, 1]); + } + + #[test] + fn single_char_pattern() { + let data = ["abc", "def", "axe"]; + let col = make_column(&data); + let kmp = KmpAutomaton::new(b"a", &col.dictionary().clone()); + assert_eq!(col.scan(kmp), vec![0, 2]); + } + + // ── KMP failure-function stress ─────────────────────────────────────── + + #[test] + fn overlapping_pattern_in_string() { + let data = ["aaaa", "ab", "ba"]; + let col = make_column(&data); + let kmp = KmpAutomaton::new(b"aa", &col.dictionary().clone()); + assert_eq!(col.scan(kmp), vec![0]); + } + + #[test] + fn kmp_failure_function_stress() { + // Pattern "abab" has LPS [0,0,1,2]. 
+ let data = ["ababab", "abab", "abba", "baba"]; + let col = make_column(&data); + let kmp = KmpAutomaton::new(b"abab", &col.dictionary().clone()); + assert_eq!(col.scan(kmp), vec![0, 1]); + } + + // ── Cross-validation against brute force ────────────────────────────── + + #[test] + fn cross_validation_with_brute_force() { + let data = random_ascii_strings(100, 30, 42); + let col = make_column(&data); + let kmp = KmpAutomaton::new(b"ab", &col.dictionary().clone()); + assert_eq!(col.scan(kmp), brute_contains(&data, b"ab")); + } + + #[test] + fn cross_validation_url_corpus() { + let data = user_strings(60); + let col = make_column(&data); + for needle in [&b"example"[..], b"https", b"docs", b"missing", b"://"] { + let kmp = KmpAutomaton::new(needle, &col.dictionary().clone()); + assert_eq!( + col.scan(kmp), + brute_contains(&data, needle), + "needle={needle:?}" + ); + } + } + + // ── All bit widths ──────────────────────────────────────────────────── + + #[test] + fn works_across_bit_widths() { + let data = ["the quick brown fox", "lazy dog", "quick fox"]; + for bw in 9u32..=16 { + let col = make_column_bits(&data, bw); + let kmp = KmpAutomaton::new(b"quick", &col.dictionary().clone()); + assert_eq!(col.scan(kmp), vec![0, 2], "bw={bw}"); + } + } + + // ── Pattern longer than any string ──────────────────────────────────── + + #[test] + fn pattern_longer_than_strings() { + let data = ["ab", "cd"]; + let col = make_column(&data); + let kmp = KmpAutomaton::new(b"abcdefghij", &col.dictionary().clone()); + assert!(col.scan(kmp).is_empty()); + } + + // ── Empty column ────────────────────────────────────────────────────── + + #[test] + fn empty_column_returns_empty() { + let strings: Vec<&[u8]> = vec![]; + let raw = make_raw(&strings); + let col = Column::compress(&raw.data, &raw.offsets_u64, DEFAULT_DICT12_CONFIG).unwrap(); + let kmp = KmpAutomaton::new(b"abc", &col.dictionary().clone()); + assert!(col.scan(kmp).is_empty()); + } + + // ── Rescannable 
─────────────────────────────────────────────────────── + + #[test] + fn rescannable() { + let data = ["abc", "def", "abc_xyz"]; + let col = make_column(&data); + let mut kmp = KmpAutomaton::new(b"abc", &col.dictionary().clone()); + let r1 = col.scan(&mut kmp); + let r2 = col.scan(&mut kmp); + assert_eq!(r1, r2); + } + + // ── Equivalence with byte-level contains_bitmap ─────────────────────── + + #[test] + fn equivalent_to_contains_bitmap() { + let data = user_strings(80); + let col = make_column(&data); + for needle in [&b"example"[..], b"https", b"docs"] { + let kmp = KmpAutomaton::new(needle, &col.dictionary().clone()); + let token_result = col.scan(kmp); + let bitmap = col.contains_bitmap(needle); + let bitmap_result: Vec = (0..data.len()) + .filter(|&i| (bitmap[i / 8] >> (i % 8)) & 1 == 1) + .collect(); + assert_eq!(token_result, bitmap_result, "needle={needle:?}"); + } + } +} diff --git a/encodings/experimental/onpair-rs/src/lib.rs b/encodings/experimental/onpair-rs/src/lib.rs new file mode 100644 index 00000000000..d2b53786d0c --- /dev/null +++ b/encodings/experimental/onpair-rs/src/lib.rs @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +#![allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::cast_sign_loss, + clippy::expect_used, + clippy::many_single_char_names, + clippy::panic, + clippy::unwrap_used +)] + +//! Pure-Rust port of [`onpair_cpp`](https://github.com/gargiulofrancesco/onpair_cpp). +//! +//! ## Quick start +//! +//! ```ignore +//! use vortex_onpair_rs::{Column, KmpAutomaton, OnPairTrainingConfig, and, not}; +//! +//! let col = Column::compress(&bytes, &offsets, OnPairTrainingConfig { +//! bits: 12, threshold: 0.5, seed: 42, +//! })?; +//! +//! // Compressed-domain predicates (single pass over the token stream): +//! let mut user = KmpAutomaton::new(b"user", col.dictionary()); +//! 
let mut admin = KmpAutomaton::new(b"admin", col.dictionary()); +//! let row_ids = col.scan(and(&mut user, not(&mut admin))); +//! ``` +//! +//! ## Module map +//! +//! - [`Column`] — the entry point: train + compress, decompress, scan +//! - [`TokenAutomaton`] + [`EqAutomaton`] / [`PrefixAutomaton`] / +//! [`KmpAutomaton`] / [`AhoCorasickAutomaton`] — compressed-domain predicates +//! - [`and`], [`or`], [`not`] — combinators +//! - [`Parts`] — borrow the raw `(dict, codes, boundaries, bits)` for +//! downstream consumers (drop-in for `vortex-onpair-sys::Parts`) + +pub mod aho_corasick; +pub mod automaton; +pub mod bits; +pub mod column; +pub mod config; +pub mod dict; +pub mod kmp; +pub mod lpm; +pub mod parser; +pub mod store; +pub mod tokenize; +pub mod trainer; +pub mod types; + +#[cfg(test)] +mod test_corpus; + +pub use aho_corasick::AhoCorasickAutomaton; +pub use aho_corasick::AhoCorasickTrie; +pub use automaton::And; +pub use automaton::EqAutomaton; +pub use automaton::Negated; +pub use automaton::Or; +pub use automaton::PrefixAutomaton; +pub use automaton::TokenAutomaton; +pub use automaton::and; +pub use automaton::not; +pub use automaton::or; +pub use bits::read_bits_lsb; +pub use bits::unpack_codes_to_u16; +pub use column::Column; +pub use column::Parts; +pub use config::DEFAULT_DICT12_CONFIG; +pub use config::DynamicThreshold; +pub use config::Error; +pub use config::FixedThreshold; +pub use config::OnPairTrainingConfig; +pub use config::ThresholdSpec; +pub use config::TrainingConfig; +pub use dict::Dictionary; +pub use kmp::KmpAutomaton; +pub use lpm::LongestPrefixMatcher; +pub use parser::parse; +pub use store::Store; +pub use tokenize::tokenize; +pub use tokenize::tokenize_with; +pub use trainer::TrainResult; +pub use trainer::train; +pub use types::BitWidth; +pub use types::ByteSpan; +pub use types::MAX_TOKEN_SIZE; +pub use types::StreamSpan; +pub use types::Token; +pub use types::TokenRange; +pub use types::is_valid_bits; +pub use 
/// Packs the first (at most) 16 bytes of `data` into a little-endian `u128`.
///
/// Inputs shorter than 16 bytes are zero-extended; bytes past the 16th are
/// ignored.
#[inline]
fn load_le_u128(data: &[u8]) -> u128 {
    let head = &data[..data.len().min(16)];
    let mut bytes = [0u8; 16];
    bytes[..head.len()].copy_from_slice(head);
    u128::from_le_bytes(bytes)
}

/// Returns a `u128` whose low `len * 8` bits are set; saturates to all ones
/// for `len >= 16`.
#[inline]
fn mask_u128(len: usize) -> u128 {
    match len {
        0..=15 => (1u128 << (8 * len)) - 1,
        _ => u128::MAX,
    }
}

/// Returns a `u64` whose low `len * 8` bits are set; saturates to all ones
/// for `len >= 8`.
#[inline]
fn mask_u64(len: usize) -> u64 {
    match len {
        0..=7 => !(u64::MAX << (8 * len)),
        _ => u64::MAX,
    }
}
Always +/// holds the 256 single-byte tokens after construction so +/// `find_longest_match` is total. +#[derive(Default, Debug, Clone)] +pub struct LongestPrefixMatcher { + /// Length 1..=8 tokens keyed by (low-8-byte u64, length). + short_map: HashMap<(u64, u8), Token>, + /// Length 9..=16 tokens keyed by (full u128, length). + long_map: HashMap<(u128, u8), Token>, + /// Next ID to assign. Stored as u32 so we can represent the full 16-bit + /// token space (65 536 entries) without overflow. + next_id: u32, +} + +impl LongestPrefixMatcher { + /// Pre-inserts the 256 single-byte tokens with IDs 0..=255. + pub fn new() -> Self { + let mut short_map = HashMap::with_capacity(256); + for i in 0u16..=255 { + short_map.insert((i as u64, 1u8), i); + } + Self { + short_map, + long_map: HashMap::new(), + next_id: 256, + } + } + + /// Build a matcher from a complete dictionary: token at index `i` + /// receives ID `i`. Caller guarantees the dictionary contains every + /// single-byte token so `find_longest_match` remains total. + pub fn from_dictionary(dict: &Dictionary) -> Self { + let n = dict.num_tokens(); + let mut me = Self { + short_map: HashMap::with_capacity(n.min(SHORT_LEN * 256)), + long_map: HashMap::new(), + next_id: n as u32, + }; + for i in 0..n { + let id = i as Token; + me.insert_internal(dict.data(id), id); + } + me + } + + /// Insert `data` and assign it the next available token id. + /// + /// Precondition: `1 <= data.len() <= MAX_TOKEN_SIZE` and + /// `size() < 65_536`. 
+ pub fn insert(&mut self, data: &[u8]) -> Token { + let id = self.next_id as Token; + self.next_id += 1; + self.insert_internal(data, id); + id + } + + #[inline] + fn insert_internal(&mut self, data: &[u8], id: Token) { + debug_assert!(!data.is_empty() && data.len() <= MAX_TOKEN_SIZE); + let len = data.len(); + if len <= SHORT_LEN { + let key = (load_le_u128(data) as u64) & mask_u64(len); + self.short_map.insert((key, len as u8), id); + } else { + let key = load_le_u128(data) & mask_u128(len); + self.long_map.insert((key, len as u8), id); + } + } + + /// Longest token whose bytes are a prefix of `data`, together with that + /// prefix's length. + /// + /// Precondition: `!data.is_empty()` and the matcher contains every + /// single-byte token (always true after [`new`] or [`from_dictionary`] + /// with a complete dictionary). + #[inline] + pub fn find_longest_match(&self, data: &[u8]) -> (Token, usize) { + let max_len = data.len().min(MAX_TOKEN_SIZE); + let packed = load_le_u128(data); + // Long map: only relevant when at least 9 bytes of input are available. + if max_len > SHORT_LEN && !self.long_map.is_empty() { + for len in (SHORT_LEN + 1..=max_len).rev() { + let key = packed & mask_u128(len); + if let Some(&t) = self.long_map.get(&(key, len as u8)) { + return (t, len); + } + } + } + // Short map: lengths min(max_len, 8) down to 1. + let short_max = max_len.min(SHORT_LEN); + let low64 = packed as u64; + for len in (1..=short_max).rev() { + let key = low64 & mask_u64(len); + if let Some(&t) = self.short_map.get(&(key, len as u8)) { + return (t, len); + } + } + unreachable!("LPM precondition: every single-byte token must be present") + } + + /// Number of tokens currently in the matcher. 
+ #[inline] + pub fn size(&self) -> usize { + self.next_id as usize + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn insert_str(lpm: &mut LongestPrefixMatcher, s: &str) -> Token { + lpm.insert(s.as_bytes()) + } + + fn find_str(lpm: &LongestPrefixMatcher, s: &str) -> (Token, usize) { + lpm.find_longest_match(s.as_bytes()) + } + + fn make_test_dictionary(extra: &[&str]) -> Dictionary { + let mut d = Dictionary::default(); + d.offsets.push(0); + for i in 0u16..=255 { + d.bytes.push(i as u8); + d.offsets.push(d.bytes.len() as u32); + } + for &s in extra { + d.bytes.extend_from_slice(s.as_bytes()); + d.offsets.push(d.bytes.len() as u32); + } + d + } + + // ── Construction ───────────────────────────────────────────────────────── + + #[test] + fn default_constructor_size_is_256() { + let lpm = LongestPrefixMatcher::new(); + assert_eq!(lpm.size(), 256); + } + + #[test] + fn all_single_bytes_found_after_construction() { + let lpm = LongestPrefixMatcher::new(); + for i in 0u16..=255 { + let b = [i as u8]; + let (tok, len) = lpm.find_longest_match(&b); + assert_eq!(tok, i, "wrong token for byte {i}"); + assert_eq!(len, 1, "wrong length for byte {i}"); + } + } + + #[test] + fn zero_byte_is_token_0() { + let lpm = LongestPrefixMatcher::new(); + let (tok, len) = lpm.find_longest_match(&[0x00]); + assert_eq!(tok, 0); + assert_eq!(len, 1); + } + + #[test] + fn max_byte_is_token_255() { + let lpm = LongestPrefixMatcher::new(); + let (tok, len) = lpm.find_longest_match(&[0xFF]); + assert_eq!(tok, 255); + assert_eq!(len, 1); + } + + // ── Insert ─────────────────────────────────────────────────────────────── + + #[test] + fn first_insert_returns_id_256() { + let mut lpm = LongestPrefixMatcher::new(); + assert_eq!(insert_str(&mut lpm, "ab"), 256); + } + + #[test] + fn subsequent_inserts_increment_id() { + let mut lpm = LongestPrefixMatcher::new(); + assert_eq!(insert_str(&mut lpm, "ab"), 256); + assert_eq!(insert_str(&mut lpm, "cd"), 257); + assert_eq!(insert_str(&mut lpm, 
"ef"), 258); + } + + #[test] + fn size_grows_with_each_insert() { + let mut lpm = LongestPrefixMatcher::new(); + assert_eq!(lpm.size(), 256); + insert_str(&mut lpm, "ab"); + assert_eq!(lpm.size(), 257); + insert_str(&mut lpm, "cd"); + assert_eq!(lpm.size(), 258); + } + + #[test] + fn exactly_eight_bytes_short_store() { + let mut lpm = LongestPrefixMatcher::new(); + let id = insert_str(&mut lpm, "12345678"); + let (tok, len) = find_str(&lpm, "12345678"); + assert_eq!(tok, id); + assert_eq!(len, 8); + } + + #[test] + fn exactly_nine_bytes_long_store() { + let mut lpm = LongestPrefixMatcher::new(); + let id = insert_str(&mut lpm, "123456789"); + let (tok, len) = find_str(&lpm, "123456789X"); + assert_eq!(tok, id); + assert_eq!(len, 9); + } + + #[test] + fn max_token_size_insert_and_find() { + let mut lpm = LongestPrefixMatcher::new(); + let pat = "0123456789abcdef"; + assert_eq!(pat.len(), MAX_TOKEN_SIZE); + let id = lpm.insert(pat.as_bytes()); + let (tok, len) = lpm.find_longest_match(pat.as_bytes()); + assert_eq!(tok, id); + assert_eq!(len, MAX_TOKEN_SIZE); + } + + #[test] + fn sequence_with_embedded_zero_bytes() { + let mut lpm = LongestPrefixMatcher::new(); + let data = [0x00u8, 0x01, 0x02]; + let id = lpm.insert(&data); + let (tok, len) = lpm.find_longest_match(&data); + assert_eq!(tok, id); + assert_eq!(len, 3); + } + + // ── find_longest_match ─────────────────────────────────────────────────── + + #[test] + fn single_byte_found_with_correct_id() { + let lpm = LongestPrefixMatcher::new(); + let (tok, len) = lpm.find_longest_match(&[0x42]); + assert_eq!(tok, 0x42); + assert_eq!(len, 1); + } + + #[test] + fn longest_match_wins_over_shorter() { + let mut lpm = LongestPrefixMatcher::new(); + insert_str(&mut lpm, "abc"); + let long_id = insert_str(&mut lpm, "abcdefghi"); + let (tok, len) = find_str(&lpm, "abcdefghi"); + assert_eq!(tok, long_id); + assert_eq!(len, 9); + } + + #[test] + fn falls_back_to_shorter_if_long_not_present() { + let mut lpm = 
LongestPrefixMatcher::new(); + let short_id = insert_str(&mut lpm, "abc"); + let (tok, len) = find_str(&lpm, "abcdef"); + assert_eq!(tok, short_id); + assert_eq!(len, 3); + } + + #[test] + fn falls_back_to_single_byte() { + let mut lpm = LongestPrefixMatcher::new(); + insert_str(&mut lpm, "XY"); + let (tok, len) = find_str(&lpm, "XZ"); + assert_eq!(tok, b'X' as Token); + assert_eq!(len, 1); + } + + #[test] + fn exact_match_no_trailing_bytes() { + let mut lpm = LongestPrefixMatcher::new(); + let id = insert_str(&mut lpm, "hello"); + let (tok, len) = find_str(&lpm, "hello"); + assert_eq!(tok, id); + assert_eq!(len, 5); + } + + #[test] + fn input_shorter_than_stored_pattern_falls_to_single_byte() { + let mut lpm = LongestPrefixMatcher::new(); + insert_str(&mut lpm, "abcde"); + let (tok, len) = find_str(&lpm, "ab"); + assert_eq!(tok, b'a' as Token); + assert_eq!(len, 1); + } + + #[test] + fn input_shorter_than_longest_matches_shorter_token() { + let mut lpm = LongestPrefixMatcher::new(); + let id2 = insert_str(&mut lpm, "ab"); + insert_str(&mut lpm, "abcde"); + let (tok, len) = find_str(&lpm, "ab"); + assert_eq!(tok, id2); + assert_eq!(len, 2); + } + + #[test] + fn eight_byte_token_with_longer_input() { + let mut lpm = LongestPrefixMatcher::new(); + let id = insert_str(&mut lpm, "ABCDEFGH"); + let (tok, len) = find_str(&lpm, "ABCDEFGHIJ"); + assert_eq!(tok, id); + assert_eq!(len, 8); + } + + #[test] + fn nine_byte_beats_eight_byte() { + let mut lpm = LongestPrefixMatcher::new(); + insert_str(&mut lpm, "ABCDEFGH"); + let id9 = insert_str(&mut lpm, "ABCDEFGHI"); + let (tok, len) = find_str(&lpm, "ABCDEFGHIJ"); + assert_eq!(tok, id9); + assert_eq!(len, 9); + } + + #[test] + fn short_long_token_is_prefix_of_longer_long_token() { + let mut lpm = LongestPrefixMatcher::new(); + let id_short = insert_str(&mut lpm, "ABCDEFGHI"); // 9 bytes + let id_long = insert_str(&mut lpm, "ABCDEFGHIJK"); // 11 bytes + + let (t_s, l_s) = find_str(&lpm, "ABCDEFGHIJx"); + assert_eq!(t_s, 
id_short); + assert_eq!(l_s, 9); + + let (t_l, l_l) = find_str(&lpm, "ABCDEFGHIJKx"); + assert_eq!(t_l, id_long); + assert_eq!(l_l, 11); + } + + #[test] + fn multiple_tokens_same_long_prefix() { + let mut lpm = LongestPrefixMatcher::new(); + let id1 = insert_str(&mut lpm, "ABCDEFGHX"); + let id2 = insert_str(&mut lpm, "ABCDEFGHYZ"); + + let (t1, l1) = find_str(&lpm, "ABCDEFGHX__"); + assert_eq!(t1, id1); + assert_eq!(l1, 9); + + let (t2, l2) = find_str(&lpm, "ABCDEFGHYZ_"); + assert_eq!(t2, id2); + assert_eq!(l2, 10); + } + + #[test] + fn max_token_size_pattern_found() { + let mut lpm = LongestPrefixMatcher::new(); + let pat = "0123456789abcdef"; + let id = insert_str(&mut lpm, pat); + let (tok, len) = find_str(&lpm, pat); + assert_eq!(tok, id); + assert_eq!(len, MAX_TOKEN_SIZE); + } + + #[test] + fn binary_all_zeros_long_sequence() { + let mut lpm = LongestPrefixMatcher::new(); + let data = [0u8; 10]; + let id = lpm.insert(&data); + let (tok, len) = lpm.find_longest_match(&data); + assert_eq!(tok, id); + assert_eq!(len, 10); + } + + #[test] + fn binary_all_ff_long_sequence() { + let mut lpm = LongestPrefixMatcher::new(); + let data = [0xFFu8; 10]; + let id = lpm.insert(&data); + let (tok, len) = lpm.find_longest_match(&data); + assert_eq!(tok, id); + assert_eq!(len, 10); + } + + // ── Behavioural equivalent of LinearBucket→TrieBucket promotion tests ── + + #[test] + fn all_tokens_findable_with_shared_long_prefix() { + let mut lpm = LongestPrefixMatcher::new(); + let prefix = vec![b'X'; 8]; + let mut inserted = Vec::with_capacity(130); + for i in 0..130u32 { + let mut buf = prefix.clone(); + buf.push(i as u8); + inserted.push(lpm.insert(&buf)); + } + for i in 0..130u32 { + let mut buf = prefix.clone(); + buf.push(i as u8); + buf.push(0xFF); + let (tok, len) = lpm.find_longest_match(&buf); + assert_eq!(tok, inserted[i as usize], "token index {i}"); + assert_eq!(len, 9, "token index {i}"); + } + } + + #[test] + fn size_correct_after_many_long_inserts() { + let mut 
lpm = LongestPrefixMatcher::new(); + let prefix = vec![b'Y'; 8]; + for i in 0..130u32 { + let mut buf = prefix.clone(); + buf.push(i as u8); + lpm.insert(&buf); + } + assert_eq!(lpm.size(), 256 + 130); + } + + #[test] + fn deep_trie_multi_level_suffix() { + let mut lpm = LongestPrefixMatcher::new(); + let prefix = vec![b'Z'; 8]; + let mut inserted = Vec::with_capacity(130); + for i in 0..130u32 { + let mut buf = prefix.clone(); + buf.push(0x00); + buf.push(i as u8); + inserted.push(lpm.insert(&buf)); + } + for i in 0..130u32 { + let mut buf = prefix.clone(); + buf.push(0x00); + buf.push(i as u8); + buf.push(0xFF); + let (tok, len) = lpm.find_longest_match(&buf); + assert_eq!(tok, inserted[i as usize], "token index {i}"); + assert_eq!(len, 10, "token index {i}"); + } + } + + // ── from_dictionary ────────────────────────────────────────────────────── + + #[test] + fn from_dict_size_matches_base_only() { + let d = make_test_dictionary(&[]); + let lpm = LongestPrefixMatcher::from_dictionary(&d); + assert_eq!(lpm.size(), 256); + } + + #[test] + fn from_dict_size_matches_extra_tokens() { + let d = make_test_dictionary(&["ab", "abcde"]); + let lpm = LongestPrefixMatcher::from_dictionary(&d); + assert_eq!(lpm.size(), 258); + } + + #[test] + fn from_dict_all_single_bytes_found() { + let d = make_test_dictionary(&[]); + let lpm = LongestPrefixMatcher::from_dictionary(&d); + for i in 0u16..=255 { + let (tok, len) = lpm.find_longest_match(&[i as u8]); + assert_eq!(tok, i, "byte {i}"); + assert_eq!(len, 1, "byte {i}"); + } + } + + #[test] + fn from_dict_single_byte_uses_positional_id() { + let d = make_test_dictionary(&[]); + let lpm = LongestPrefixMatcher::from_dictionary(&d); + let (tok, len) = lpm.find_longest_match(&[0x41]); + assert_eq!(tok, 0x41); + assert_eq!(len, 1); + } + + #[test] + fn from_dict_multi_byte_token_found_with_correct_id() { + let d = make_test_dictionary(&["ab", "abcde"]); + let lpm = LongestPrefixMatcher::from_dictionary(&d); + let (tok, len) = 
find_str(&lpm, "abcde"); + assert_eq!(tok, 257); + assert_eq!(len, 5); + } + + #[test] + fn from_dict_shorter_multi_byte_token_fallback() { + let d = make_test_dictionary(&["ab", "abcde"]); + let lpm = LongestPrefixMatcher::from_dictionary(&d); + let (tok, len) = find_str(&lpm, "abc"); + assert_eq!(tok, 256); + assert_eq!(len, 2); + } + + #[test] + fn from_dict_long_token_from_dictionary() { + let d = make_test_dictionary(&["ABCDEFGHI"]); + let lpm = LongestPrefixMatcher::from_dictionary(&d); + let (tok, len) = find_str(&lpm, "ABCDEFGHIX"); + assert_eq!(tok, 256); + assert_eq!(len, 9); + } + + #[test] + fn from_dict_max_size_token_from_dictionary() { + let pat = "0123456789abcdef"; + let d = make_test_dictionary(&[pat]); + let lpm = LongestPrefixMatcher::from_dictionary(&d); + let (tok, len) = find_str(&lpm, pat); + assert_eq!(tok, 256); + assert_eq!(len, MAX_TOKEN_SIZE); + } + + #[test] + fn from_dict_insert_continues_id() { + let d = make_test_dictionary(&["ab", "cd"]); + let mut lpm = LongestPrefixMatcher::from_dictionary(&d); + let new_id = insert_str(&mut lpm, "ef"); + assert_eq!(new_id, 258); + assert_eq!(lpm.size(), 259); + } + + #[test] + fn from_dict_inserted_token_is_searchable() { + let d = make_test_dictionary(&["ab"]); + let mut lpm = LongestPrefixMatcher::from_dictionary(&d); + let id = insert_str(&mut lpm, "xyz"); + let (tok, len) = find_str(&lpm, "xyzW"); + assert_eq!(tok, id); + assert_eq!(len, 3); + } +} diff --git a/encodings/experimental/onpair-rs/src/parser.rs b/encodings/experimental/onpair-rs/src/parser.rs new file mode 100644 index 00000000000..74c26c6b3c9 --- /dev/null +++ b/encodings/experimental/onpair-rs/src/parser.rs @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Port of `include/onpair/encoding/parsing/parser.h` and `parser.cpp`. 
+// +// Drives the LongestPrefixMatcher over every input string, writes the +// resulting token IDs into a `Store` via `BitWriter`, and records per-string +// token-count boundaries. + +use crate::bits::BitWriter; +use crate::lpm::LongestPrefixMatcher; +use crate::store::Store; +use crate::types::BitWidth; + +/// Encode all `n` strings into `store` using `lpm`. +/// +/// `offsets` has length `n + 1`; string `i` occupies +/// `data[offsets[i]..offsets[i + 1]]`. On entry `store` is reset. +pub fn parse( + data: &[u8], + offsets: &[u32], + n: usize, + lpm: &LongestPrefixMatcher, + bits: BitWidth, + store: &mut Store, +) { + store.bit_width = bits; + store.packed.clear(); + store.boundaries.clear(); + store.boundaries.reserve(n + 1); + store.boundaries.push(0); + + let mut writer = BitWriter::new(store); + let mut boundaries = Vec::with_capacity(n + 1); + boundaries.push(0u32); + + for i in 0..n { + let s = offsets[i] as usize; + let e = offsets[i + 1] as usize; + let mut pos = s; + while pos < e { + let (tok, mlen) = lpm.find_longest_match(&data[pos..e]); + writer.write(tok); + pos += mlen; + } + boundaries.push(writer.tokens_written() as u32); + } + drop(writer); // flush packed words + sentinel + store.boundaries = boundaries; +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::bits::unpack_codes_to_u16; + use crate::config::FixedThreshold; + use crate::config::ThresholdSpec; + use crate::config::TrainingConfig; + use crate::dict::Dictionary; + use crate::test_corpus::alternating_strings as make_alternating_strings; + use crate::test_corpus::binary_strings as make_binary_strings; + use crate::test_corpus::homogeneous_strings as make_homogeneous_strings; + use crate::test_corpus::make_raw; + use crate::test_corpus::mixed_length_strings as make_mixed_length_strings; + use crate::test_corpus::random_ascii_strings as make_random_strings; + use crate::test_corpus::user_strings as make_user_strings; + use crate::trainer::TrainResult; + use crate::trainer::train; 
+ use crate::types::Token; + + fn make_base_dict() -> Dictionary { + let mut d = Dictionary::default(); + d.offsets.push(0); + for i in 0u16..=255 { + d.bytes.push(i as u8); + d.offsets.push(d.bytes.len() as u32); + } + d + } + + /// Decode all tokens for row `idx` against `dict`. Equivalent of the + /// C++ `decode_tokens` helper in test_parser.cpp. + fn decode_tokens(store: &Store, dict: &Dictionary, idx: usize) -> Vec { + let begin = store.boundaries[idx] as usize; + let end = store.boundaries[idx + 1] as usize; + let codes = unpack_codes_to_u16(&store.packed, end, store.bit_width as u32); + let mut out = Vec::new(); + for &c in &codes[begin..end] { + out.extend_from_slice(dict.data(c as Token)); + } + out + } + + fn expected_packed_words(n: usize, bits: BitWidth) -> usize { + (n * bits as usize).div_ceil(64) + } + + fn roundtrip_all>(strings: &[S], bits: BitWidth, seed: u64) -> bool { + if strings.is_empty() { + return true; + } + let raw = make_raw(strings); + let cfg = TrainingConfig { + bits, + threshold: ThresholdSpec::Fixed(FixedThreshold { value: 2 }), + seed: Some(seed), + }; + let TrainResult { dict, lpm } = train(&raw.data, &raw.offsets, raw.n, &cfg); + let mut store = Store::default(); + parse(&raw.data, &raw.offsets, raw.n, &lpm, bits, &mut store); + for i in 0..strings.len() { + let decoded = decode_tokens(&store, &dict, i); + if decoded != strings[i].as_ref() { + return false; + } + } + true + } + + const WIDTHS: &[BitWidth] = &[9, 10, 11, 12, 13, 14, 15, 16]; + + // ── Degenerate inputs ────────────────────────────────────────────────── + + #[test] + fn zero_strings_produces_one_boundary() { + let lpm = LongestPrefixMatcher::new(); + let mut store = Store::default(); + parse(&[], &[0], 0, &lpm, 16, &mut store); + assert_eq!(store.boundaries, vec![0u32]); + assert!(store.packed.is_empty()); + assert_eq!(store.bit_width, 16); + } + + #[test] + fn single_empty_string_produces_two_zero_boundaries() { + let lpm = LongestPrefixMatcher::new(); + let mut 
store = Store::default(); + parse(&[], &[0, 0], 1, &lpm, 16, &mut store); + assert_eq!(store.boundaries, vec![0u32, 0]); + assert_eq!(store.num_tokens(), 0); + assert!(store.packed.is_empty()); + } + + #[test] + fn many_empty_strings_all_boundaries_are_zero() { + let lpm = LongestPrefixMatcher::new(); + let offsets = vec![0u32; 51]; + let mut store = Store::default(); + parse(&[], &offsets, 50, &lpm, 16, &mut store); + assert_eq!(store.boundaries.len(), 51); + for b in &store.boundaries { + assert_eq!(*b, 0); + } + assert_eq!(store.num_tokens(), 0); + assert!(store.packed.is_empty()); + } + + // ── Structural invariants over all bit widths ────────────────────────── + + #[test] + fn boundary_count_is_n_plus_one() { + for &bits in WIDTHS { + let lpm = LongestPrefixMatcher::new(); + let raw = make_raw(&make_user_strings(20)); + let mut store = Store::default(); + parse(&raw.data, &raw.offsets, raw.n, &lpm, bits, &mut store); + assert_eq!(store.boundaries.len(), raw.n + 1); + assert_eq!(store.bit_width, bits); + } + } + + #[test] + fn boundaries_are_monotonic() { + for &bits in WIDTHS { + let lpm = LongestPrefixMatcher::new(); + let raw = make_raw(&make_random_strings(25, 40, 7)); + let mut store = Store::default(); + parse(&raw.data, &raw.offsets, raw.n, &lpm, bits, &mut store); + for i in 1..store.boundaries.len() { + assert!( + store.boundaries[i] >= store.boundaries[i - 1], + "non-monotonic at index {i}" + ); + } + } + } + + #[test] + fn last_boundary_equals_total_token_count() { + for &bits in WIDTHS { + let lpm = LongestPrefixMatcher::new(); + let raw = make_raw(&make_random_strings(15, 30, 99)); + let mut store = Store::default(); + parse(&raw.data, &raw.offsets, raw.n, &lpm, bits, &mut store); + assert_eq!( + *store.boundaries.last().unwrap() as usize, + store.num_tokens() + ); + } + } + + #[test] + fn packed_size_consistent_with_token_count() { + for &bits in WIDTHS { + let lpm = LongestPrefixMatcher::new(); + let raw = make_raw(&make_user_strings(20)); + let 
mut store = Store::default(); + parse(&raw.data, &raw.offsets, raw.n, &lpm, bits, &mut store); + assert_eq!( + store.packed.len(), + expected_packed_words(store.num_tokens(), bits) + 1 + ); + } + } + + // ── Round-trip with base-tokens LPM ──────────────────────────────────── + + #[test] + fn base_tokens_single_known_string() { + let lpm = LongestPrefixMatcher::new(); + let d = make_base_dict(); + let expected = "Hello, World!"; + let raw = make_raw(&[expected]); + let mut store = Store::default(); + parse(&raw.data, &raw.offsets, raw.n, &lpm, 16, &mut store); + assert_eq!(decode_tokens(&store, &d, 0), expected.as_bytes()); + } + + #[test] + fn base_tokens_all_single_byte_values() { + let lpm = LongestPrefixMatcher::new(); + let d = make_base_dict(); + let strings: Vec> = (0u16..=255).map(|i| vec![i as u8]).collect(); + let raw = make_raw(&strings); + let mut store = Store::default(); + parse(&raw.data, &raw.offsets, raw.n, &lpm, 16, &mut store); + for (i, s) in strings.iter().enumerate() { + assert_eq!( + decode_tokens(&store, &d, i), + *s, + "mismatch for byte value {i}" + ); + } + } + + #[test] + fn base_tokens_multiple_strings() { + let lpm = LongestPrefixMatcher::new(); + let d = make_base_dict(); + let strings = make_random_strings(30, 20, 2024); + let raw = make_raw(&strings); + let mut store = Store::default(); + parse(&raw.data, &raw.offsets, raw.n, &lpm, 16, &mut store); + for (i, s) in strings.iter().enumerate() { + assert_eq!( + decode_tokens(&store, &d, i), + *s, + "decode mismatch at string {i}" + ); + } + } + + // ── Trained LPM produces multi-byte tokens ───────────────────────────── + + #[test] + fn trained_lpm_produces_multi_byte_tokens() { + let strings = make_homogeneous_strings(50, 40, b'a'); + let raw = make_raw(&strings); + let cfg = TrainingConfig { + bits: 16, + threshold: ThresholdSpec::Fixed(FixedThreshold { value: 2 }), + seed: Some(42), + }; + let TrainResult { dict: _, lpm } = train(&raw.data, &raw.offsets, raw.n, &cfg); + let mut 
store = Store::default(); + parse(&raw.data, &raw.offsets, raw.n, &lpm, 16, &mut store); + let tokens_0 = store.boundaries[1] - store.boundaries[0]; + assert!(tokens_0 < 40, "parser did not use any multi-byte tokens"); + } + + // ── Round-trip with trained LPM across all bit widths ────────────────── + + #[test] + fn roundtrip_user_strings() { + for &bits in WIDTHS { + assert!(roundtrip_all(&make_user_strings(50), bits, 42)); + } + } + + #[test] + fn roundtrip_random_ascii_strings() { + for &bits in WIDTHS { + assert!(roundtrip_all(&make_random_strings(60, 50, 1337), bits, 42)); + } + } + + #[test] + fn roundtrip_binary_strings_with_nul_bytes() { + for &bits in WIDTHS { + assert!(roundtrip_all(&make_binary_strings(40, 30, 777), bits, 42)); + } + } + + #[test] + fn roundtrip_homogeneous_strings() { + for &bits in WIDTHS { + assert!(roundtrip_all( + &make_homogeneous_strings(30, 40, b'a'), + bits, + 42 + )); + } + } + + #[test] + fn roundtrip_alternating_strings() { + for &bits in WIDTHS { + assert!(roundtrip_all(&make_alternating_strings(30, 40), bits, 42)); + } + } + + #[test] + fn roundtrip_mixed_length_strings() { + for &bits in WIDTHS { + assert!(roundtrip_all( + &make_mixed_length_strings(80, 100, 31415), + bits, + 42 + )); + } + } +} diff --git a/encodings/experimental/onpair-rs/src/store.rs b/encodings/experimental/onpair-rs/src/store.rs new file mode 100644 index 00000000000..e2b80469223 --- /dev/null +++ b/encodings/experimental/onpair-rs/src/store.rs @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Port of `include/onpair/core/store.h` and `store_view.h`. +// +// The packed token store holds the bit-packed token stream produced by the +// parser. All tokens in a column share the same fixed bit width (9..=16). 
+// +// Per-string boundaries are stored in token-stream indices (Arrow-style): +// `boundaries.len() == num_strings + 1`, with `boundaries[i]` the +// inclusive start and `boundaries[num_strings]` the total token count. + +use crate::types::BitWidth; +use crate::types::StreamSpan; + +#[derive(Default, Debug, Clone)] +pub struct Store { + /// Immutable after first write (9..=16). + pub bit_width: BitWidth, + /// LSB-first bit-packed token stream. + pub packed: Vec, + /// `boundaries[i]` is the token-index start of string `i`; + /// `boundaries.last()` is the total token count. + pub boundaries: Vec, +} + +impl Store { + #[inline] + pub fn num_strings(&self) -> usize { + if self.boundaries.is_empty() { + 0 + } else { + self.boundaries.len() - 1 + } + } + + #[inline] + pub fn num_tokens(&self) -> usize { + self.boundaries.last().copied().unwrap_or(0) as usize + } + + #[inline] + pub fn bytes_used(&self) -> usize { + if self.boundaries.is_empty() { + return 0; + } + let total_bits = self.num_tokens() * self.bit_width as usize; + let packed_bytes = total_bits.div_ceil(8); + packed_bytes + self.boundaries.len() * size_of::() + } + + /// Token-stream range `[begin, end)` for string at position `idx`. + /// Precondition: `idx < num_strings()`. 
+ #[inline] + pub fn string_span(&self, idx: usize) -> StreamSpan { + StreamSpan { + begin: self.boundaries[idx], + end: self.boundaries[idx + 1], + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // ── Store ─────────────────────────────────────────────────────────────── + + #[test] + fn num_strings_zero_when_boundaries_empty() { + let s = Store { + bit_width: 16, + ..Default::default() + }; + assert_eq!(s.num_strings(), 0); + } + + #[test] + fn num_strings_zero_when_only_sentinel() { + let s = Store { + bit_width: 16, + boundaries: vec![0], + ..Default::default() + }; + assert_eq!(s.num_strings(), 0); + } + + #[test] + fn num_strings_is_correct() { + let s = Store { + bit_width: 16, + boundaries: vec![0, 3, 5, 8], + ..Default::default() + }; + assert_eq!(s.num_strings(), 3); + } + + #[test] + fn num_tokens_zero_when_boundaries_empty() { + let s = Store { + bit_width: 16, + ..Default::default() + }; + assert_eq!(s.num_tokens(), 0); + } + + #[test] + fn num_tokens_is_last_boundary() { + let s = Store { + bit_width: 16, + boundaries: vec![0, 4, 7], + ..Default::default() + }; + assert_eq!(s.num_tokens(), 7); + } + + #[test] + fn bytes_used_counts_packed_bits_and_boundaries() { + let s = Store { + bit_width: 16, + packed: vec![0xDEAD, 0xBEEF], + boundaries: vec![0, 2, 4], + }; + // total_bits = 4*16 = 64; packed_bytes = 8; boundaries = 3 * 4 = 12. + assert_eq!(s.bytes_used(), 8 + 3 * size_of::()); + } + + #[test] + fn bytes_used_with_different_bit_width() { + let s = Store { + bit_width: 13, + packed: vec![0xDEAD, 0xBEEF], + boundaries: vec![0, 2, 4], + }; + // total_bits = 4*13 = 52; packed_bytes = 7. 
+ assert_eq!(s.bytes_used(), 7 + 3 * size_of::()); + } + + // ── StoreView equivalents (string_span + raw access) ──────────────────── + + #[test] + fn inherits_metadata() { + let s = Store { + bit_width: 14, + packed: vec![1, 2], + boundaries: vec![0, 5, 10], + }; + assert_eq!(s.bit_width, 14); + assert_eq!(s.num_strings(), 2); + assert_eq!(s.num_tokens(), 10); + assert_eq!(s.bytes_used(), s.bytes_used()); + } + + #[test] + fn string_span_returns_correct_range() { + let s = Store { + bit_width: 16, + boundaries: vec![0, 3, 7, 10], + ..Default::default() + }; + assert_eq!(s.string_span(0), StreamSpan { begin: 0, end: 3 }); + assert_eq!(s.string_span(1), StreamSpan { begin: 3, end: 7 }); + assert_eq!(s.string_span(2), StreamSpan { begin: 7, end: 10 }); + } + + #[test] + fn empty_store_view() { + let s = Store { + bit_width: 12, + ..Default::default() + }; + assert_eq!(s.num_strings(), 0); + assert_eq!(s.num_tokens(), 0); + assert_eq!(s.bytes_used(), 0); + } +} diff --git a/encodings/experimental/onpair-rs/src/test_corpus.rs b/encodings/experimental/onpair-rs/src/test_corpus.rs new file mode 100644 index 00000000000..5846e8328fa --- /dev/null +++ b/encodings/experimental/onpair-rs/src/test_corpus.rs @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Shared test corpus generators. These replicate the helpers in the C++ +// `tests/helpers/corpus.h` so the unit tests in `trainer`, `parser`, +// `decoder`, and `search` all draw from the same data and can be compared +// head-to-head against the upstream test suite. + +#![allow(dead_code)] + +use rand::RngExt; +use rand::SeedableRng; + +use crate::types::MAX_TOKEN_SIZE; + +/// Arrow-style flat representation of a list of byte strings. 
+pub(crate) struct Raw { + pub data: Vec, + pub offsets: Vec, + pub offsets_u64: Vec, + pub n: usize, +} + +pub(crate) fn make_raw>(strings: &[S]) -> Raw { + let mut data = Vec::new(); + let mut offsets = Vec::with_capacity(strings.len() + 1); + let mut offsets_u64 = Vec::with_capacity(strings.len() + 1); + offsets.push(0u32); + offsets_u64.push(0u64); + for s in strings { + data.extend_from_slice(s.as_ref()); + offsets.push(data.len() as u32); + offsets_u64.push(data.len() as u64); + } + Raw { + data, + offsets, + offsets_u64, + n: strings.len(), + } +} + +/// URL-shaped repetitive corpus — easy BPE merge targets. +pub(crate) fn user_strings(n: usize) -> Vec { + const BASES: &[&str] = &[ + "https://www.example.com/page", + "https://www.example.com/data", + "https://www.test.org/page", + "ftp://files.example.com/x", + "https://docs.example.com/spec", + "https://api.example.net/v1", + ]; + (0..n).map(|i| BASES[i % BASES.len()].to_string()).collect() +} + +/// All copies of the same single-character string of length `len`. +pub(crate) fn homogeneous_strings(n: usize, len: usize, ch: u8) -> Vec> { + (0..n).map(|_| vec![ch; len]).collect() +} + +/// "abab..." period-2 strings of length `len`. +pub(crate) fn alternating_strings(n: usize, len: usize) -> Vec> { + (0..n) + .map(|_| { + (0..len) + .map(|i| if i.is_multiple_of(2) { b'a' } else { b'b' }) + .collect() + }) + .collect() +} + +/// Random ascii lowercase strings, length 1..=max_len. +pub(crate) fn random_ascii_strings(n: usize, max_len: usize, seed: u64) -> Vec> { + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + (0..n) + .map(|_| { + let l = rng.random_range(1..=max_len); + (0..l).map(|_| rng.random_range(b'a'..=b'z')).collect() + }) + .collect() +} + +/// Random bytes over the full 0..=255 range. 
+pub(crate) fn binary_strings(n: usize, max_len: usize, seed: u64) -> Vec> { + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + (0..n) + .map(|_| { + let l = rng.random_range(1..=max_len); + (0..l).map(|_| rng.random_range(0..=255u32) as u8).collect() + }) + .collect() +} + +/// `n` fixed-length strings of length `len`, content rotating through the +/// lowercase alphabet so adjacent strings differ. +pub(crate) fn fixed_length_strings(n: usize, len: usize) -> Vec> { + (0..n) + .map(|i| (0..len).map(|j| b'a' + ((i + j) as u8 % 26)).collect()) + .collect() +} + +/// Random strings with length 0..=max_len — exercises empty + max-len paths. +pub(crate) fn mixed_length_strings(n: usize, max_len: usize, seed: u64) -> Vec> { + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + (0..n) + .map(|_| { + let l = rng.random_range(0..=max_len); + (0..l).map(|_| rng.random_range(b'a'..=b'z')).collect() + }) + .collect() +} + +/// One row per byte value 0..=255 — exercises the base dictionary. +pub(crate) fn single_byte_strings() -> Vec> { + (0u16..=255).map(|i| vec![i as u8]).collect() +} + +/// `n` empty strings. +pub(crate) fn empty_strings(n: usize) -> Vec> { + vec![vec![]; n] +} + +/// A corpus designed to trigger longer-token discovery: `n` copies of a +/// long fixed string that is exactly `MAX_TOKEN_SIZE` bytes. +pub(crate) fn max_token_strings(n: usize) -> Vec> { + let pat: Vec = (0..MAX_TOKEN_SIZE as u8).collect(); + vec![pat; n] +} diff --git a/encodings/experimental/onpair-rs/src/tokenize.rs b/encodings/experimental/onpair-rs/src/tokenize.rs new file mode 100644 index 00000000000..74188dcdaaf --- /dev/null +++ b/encodings/experimental/onpair-rs/src/tokenize.rs @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Port of `include/onpair/search/detail/tokenize.h`. +// +// Greedy longest-match tokenisation of a byte string against a sorted +// dictionary. 
The C++ original drives a binary search at each prefix length +// directly over the sorted dictionary. We re-use [`LongestPrefixMatcher`] +// built from the dictionary instead — it gives the same result (greedy +// longest match) and is O(MAX_TOKEN_SIZE) per token regardless of dict size. +// +// Precondition: the dictionary must contain every single-byte token +// (guaranteed by [`crate::train`]). This makes tokenisation total. + +use crate::dict::Dictionary; +use crate::lpm::LongestPrefixMatcher; +use crate::types::Token; + +/// Tokenise `text` against `dict` via greedy longest match. +pub fn tokenize(text: &[u8], dict: &Dictionary) -> Vec { + if text.is_empty() { + return Vec::new(); + } + let lpm = LongestPrefixMatcher::from_dictionary(dict); + tokenize_with(text, &lpm) +} + +/// Tokenise using a pre-built LPM (skip the build cost when tokenising +/// many strings against the same dictionary). +pub fn tokenize_with(text: &[u8], lpm: &LongestPrefixMatcher) -> Vec { + let mut out = Vec::with_capacity(text.len()); + let mut pos = 0; + while pos < text.len() { + let (t, len) = lpm.find_longest_match(&text[pos..]); + out.push(t); + pos += len; + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::FixedThreshold; + use crate::config::ThresholdSpec; + use crate::config::TrainingConfig; + use crate::test_corpus::binary_strings; + use crate::test_corpus::make_raw; + use crate::test_corpus::random_ascii_strings; + use crate::test_corpus::single_byte_strings; + use crate::test_corpus::user_strings; + use crate::trainer::TrainResult; + use crate::trainer::train; + + fn reconstruct(tokens: &[Token], dict: &Dictionary) -> Vec { + let mut out = Vec::new(); + for &t in tokens { + out.extend_from_slice(dict.data(t)); + } + out + } + + fn train_dict>(corpus: &[S]) -> Dictionary { + let raw = make_raw(corpus); + let cfg = TrainingConfig { + seed: Some(42), + ..Default::default() + }; + let TrainResult { dict, .. 
} = train(&raw.data, &raw.offsets, raw.n, &cfg); + dict + } + + // ── Empty input ─────────────────────────────────────────────────────── + + #[test] + fn empty_string_returns_no_tokens() { + let d = train_dict(&user_strings(10)); + assert!(tokenize(b"", &d).is_empty()); + } + + // ── Single byte ─────────────────────────────────────────────────────── + + #[test] + fn single_byte_produces_one_token() { + let d = train_dict(&user_strings(10)); + let t = tokenize(b"x", &d); + assert_eq!(t.len(), 1); + assert_eq!(reconstruct(&t, &d), b"x"); + } + + // ── Round-trip ───────────────────────────────────────────────────────── + + #[test] + fn reconstruction_matches_input() { + let corpus = user_strings(50); + let d = train_dict(&corpus); + for s in &corpus { + let t = tokenize(s.as_bytes(), &d); + assert_eq!(reconstruct(&t, &d), s.as_bytes(), "row {s}"); + } + } + + #[test] + fn reconstruction_with_random_strings() { + let corpus = random_ascii_strings(100, 50, 77); + let d = train_dict(&corpus); + let unseen = random_ascii_strings(20, 40, 999); + for s in &unseen { + let t = tokenize(s, &d); + assert_eq!(reconstruct(&t, &d), *s); + } + } + + #[test] + fn reconstruction_with_binary_strings() { + let corpus = binary_strings(50, 30, 13); + let d = train_dict(&corpus); + for s in &corpus { + let t = tokenize(s, &d); + assert_eq!(reconstruct(&t, &d), *s); + } + } + + // ── Greedy longest match ────────────────────────────────────────────── + + #[test] + fn greedy_longest_match_compresses() { + let corpus: Vec<&str> = (0..100).map(|_| "aabb").collect(); + let raw = make_raw(&corpus); + let cfg = TrainingConfig { + threshold: ThresholdSpec::Fixed(FixedThreshold { value: 2 }), + seed: Some(42), + ..Default::default() + }; + let TrainResult { dict, .. 
} = train(&raw.data, &raw.offsets, raw.n, &cfg); + let tokens = tokenize(b"aabb", &dict); + assert!(tokens.len() < 4, "greedy tokenisation should compress"); + assert_eq!(reconstruct(&tokens, &dict), b"aabb"); + } + + // ── All 256 base tokens ─────────────────────────────────────────────── + + #[test] + fn all_256_bytes_tokenisable_via_base_tokens() { + let d = train_dict(&single_byte_strings()); + for b in 0u16..=255 { + let s = [b as u8]; + let t = tokenize(&s, &d); + assert_eq!(t.len(), 1, "byte {b} not tokenised"); + assert_eq!(reconstruct(&t, &d), &s[..], "byte {b} mismatch"); + } + } + + // ── Bound: token count <= byte count ────────────────────────────────── + + #[test] + fn token_count_never_exceeds_string_length() { + let corpus = user_strings(50); + let d = train_dict(&corpus); + for s in &corpus { + let t = tokenize(s.as_bytes(), &d); + assert!(t.len() <= s.len(), "more tokens than bytes for {s}"); + } + } + + // ── Consistency with parser ─────────────────────────────────────────── + + #[test] + fn tokenize_matches_parser_output() { + let corpus = user_strings(50); + let raw = make_raw(&corpus); + let cfg = TrainingConfig { + seed: Some(42), + ..Default::default() + }; + let TrainResult { dict, lpm } = train(&raw.data, &raw.offsets, raw.n, &cfg); + for s in &corpus { + let tokens_a = tokenize(s.as_bytes(), &dict); + // Tokenise directly via the trained LPM. 
+ let tokens_b = tokenize_with(s.as_bytes(), &lpm); + assert_eq!(tokens_a, tokens_b, "disagreement on {s}"); + } + } +} diff --git a/encodings/experimental/onpair-rs/src/trainer.rs b/encodings/experimental/onpair-rs/src/trainer.rs new file mode 100644 index 00000000000..f209d6c3768 --- /dev/null +++ b/encodings/experimental/onpair-rs/src/trainer.rs @@ -0,0 +1,663 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Port of `src/onpair/encoding/training/trainer.cpp` plus the +// `DynamicThresholdController` from +// `include/onpair/encoding/training/dynamic_threshold.h`. + +use hashbrown::HashMap; +use rand::SeedableRng; +use rand::seq::SliceRandom; + +use crate::config::ThresholdSpec; +use crate::config::TrainingConfig; +use crate::dict::Dictionary; +use crate::lpm::LongestPrefixMatcher; +use crate::types::MAX_TOKEN_SIZE; +use crate::types::Token; +use crate::types::max_dict_size; + +/// Result of [`train`]: a sorted dictionary and a matching LPM whose token +/// IDs correspond to the dictionary's sorted order. +#[derive(Debug, Clone)] +pub struct TrainResult { + pub dict: Dictionary, + pub lpm: LongestPrefixMatcher, +} + +// ───────────────────────────────────────────────────────────────────────────── +// DynamicThresholdController — adaptive merge threshold. 
+// ───────────────────────────────────────────────────────────────────────────── + +struct DynamicThresholdController { + capacity: usize, + scan_budget: usize, + check_interval: usize, + threshold: u8, + entries_created: usize, + bytes_scanned: usize, + entries_at_check: usize, + bytes_at_check: usize, + next_checkpoint: usize, +} + +impl DynamicThresholdController { + fn new(capacity: usize, total_bytes: usize, scan_fraction: f64) -> Self { + let scan_budget = (total_bytes as f64 * scan_fraction) as usize; + let check_interval = (capacity / 128).max(64); + Self { + capacity, + scan_budget, + check_interval, + threshold: 2, + entries_created: 0, + bytes_scanned: 0, + entries_at_check: 0, + bytes_at_check: 0, + next_checkpoint: check_interval, + } + } + + #[inline] + fn get(&self) -> u8 { + self.threshold + } + + #[inline] + fn budget_exhausted(&self) -> bool { + self.bytes_scanned > self.scan_budget + } + + #[inline] + fn on_bytes_scanned(&mut self, n: usize) { + self.bytes_scanned += n; + } + + fn on_entry_created(&mut self) { + self.entries_created += 1; + if self.entries_created >= self.next_checkpoint { + self.rebalance(); + } + } + + fn rebalance(&mut self) { + let delta_e = self.entries_created - self.entries_at_check; + let delta_b = self.bytes_scanned - self.bytes_at_check; + + let recent_rate = if delta_b > 0 { + delta_e as f64 / delta_b as f64 + } else { + 1e9 + }; + + let e_rem = if self.capacity > self.entries_created { + self.capacity - self.entries_created + } else { + 1 + }; + let b_rem = if self.scan_budget > self.bytes_scanned { + self.scan_budget - self.bytes_scanned + } else { + 1 + }; + + let target_rate = e_rem as f64 / b_rem as f64; + let ratio = if target_rate > 0.0 { + recent_rate / target_rate + } else { + 1e9 + }; + + if ratio > 2.0 && self.threshold < 255 { + self.threshold += 1; + } else if ratio < 0.5 { + self.threshold = if self.threshold > 2 { + self.threshold - 1 + } else { + 2 + }; + } + + self.entries_at_check = 
self.entries_created; + self.bytes_at_check = self.bytes_scanned; + self.next_checkpoint = self.entries_created + self.check_interval; + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// train() +// ───────────────────────────────────────────────────────────────────────────── + +/// Discover merge tokens via frequency-threshold scanning, then sort the +/// dictionary lexicographically and pad it for decoder over-copy. +/// +/// `offsets` has length `n + 1`; string `i` occupies +/// `data[offsets[i]..offsets[i + 1]]`. +pub fn train(data: &[u8], offsets: &[u32], n: usize, cfg: &TrainingConfig) -> TrainResult { + let dict_capacity = max_dict_size(cfg.bits); + + // ── Initialise with the 256 single-byte base tokens ──────────────────── + let mut dict = Dictionary::default(); + dict.offsets.reserve(dict_capacity + 1); + dict.bytes.reserve(dict_capacity * MAX_TOKEN_SIZE); + dict.offsets.push(0); + for i in 0u16..=255 { + dict.bytes.push(i as u8); + dict.offsets.push(dict.bytes.len() as u32); + } + let mut lpm = LongestPrefixMatcher::new(); + + // ── Threshold setup ──────────────────────────────────────────────────── + let mut threshold: u8; + let mut dyn_ctrl: Option = None; + match cfg.threshold { + ThresholdSpec::Fixed(ft) => { + threshold = ft.value; + } + ThresholdSpec::Dynamic(dt) => { + let total_bytes = if n == 0 { 0 } else { offsets[n] as usize }; + let capacity = dict_capacity - 256; + let ctrl = DynamicThresholdController::new(capacity, total_bytes, dt.sample_fraction); + threshold = ctrl.get(); + dyn_ctrl = Some(ctrl); + } + } + + // ── Shuffle training order ───────────────────────────────────────────── + // The C++ trainer uses `std::mt19937_64` with `std::shuffle`. Pure-Rust + // bit-exact compatibility would require reimplementing both. 
We use the + // default Rng (with deterministic seed) and document this as a known + // divergence — cross-impl comparison tests assert structural equality + // (decompression equivalence, predicate equivalence), not bit-exact + // dictionary equality. + let mut order: Vec = (0..n as u32).collect(); + let seed = cfg.seed.unwrap_or_else(|| { + use rand::RngExt; + rand::rng().random() + }); + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + order.shuffle(&mut rng); + + // ── Pair frequency map. Key packs two Token values into a u32. ───────── + let mut freq: HashMap = HashMap::new(); + + let mut full_dictionary = false; + let mut budget_exhausted = false; + + for idx in order { + if full_dictionary || budget_exhausted { + break; + } + + let s_start = offsets[idx as usize] as usize; + let s_end = offsets[idx as usize + 1] as usize; + if s_end == s_start { + continue; + } + let str_bytes = &data[s_start..s_end]; + let len = str_bytes.len(); + + // First match. + let (mut prev_id, mut prev_len) = lpm.find_longest_match(str_bytes); + let mut pos = prev_len; + + if let Some(ref mut dyn_) = dyn_ctrl { + dyn_.on_bytes_scanned(prev_len); + budget_exhausted = dyn_.budget_exhausted(); + if budget_exhausted { + break; + } + } + + while pos < len { + let (curr_id, curr_len) = lpm.find_longest_match(&str_bytes[pos..]); + + if let Some(ref mut dyn_) = dyn_ctrl { + dyn_.on_bytes_scanned(curr_len); + budget_exhausted = dyn_.budget_exhausted(); + if budget_exhausted { + break; + } + } + + let pair_len = prev_len + curr_len; + + if pair_len <= MAX_TOKEN_SIZE { + let key = ((prev_id as u32) << 16) | (curr_id as u32); + // Saturating increment branchless of C++: f += (f < 255). + let f_slot = freq.entry(key).or_insert(0); + *f_slot = f_slot.saturating_add(1); + if *f_slot >= threshold { + // Merge: create new token for this pair. 
+ let pair_start = pos - prev_len; + let pair_end = pos + curr_len; + let new_id = lpm.insert(&str_bytes[pair_start..pair_end]); + dict.bytes + .extend_from_slice(&str_bytes[pair_start..pair_end]); + dict.offsets.push(dict.bytes.len() as u32); + + if lpm.size() == dict_capacity { + full_dictionary = true; + break; + } + + if let Some(ref mut dyn_) = dyn_ctrl { + dyn_.on_entry_created(); + threshold = dyn_.get(); + } + + freq.remove(&key); + prev_id = new_id; + prev_len = pair_len; + pos += curr_len; + continue; + } + } + + prev_id = curr_id; + prev_len = curr_len; + pos += curr_len; + } + } + + let mut result = TrainResult { dict, lpm }; + sort_dictionary(&mut result); + result.dict.pad_for_decoder(); + result +} + +// ───────────────────────────────────────────────────────────────────────────── +// sort_dictionary — internal helper. +// +// Sorts the dictionary lexicographically and rebuilds the LPM so token IDs +// match the new positions. Mirrors the same-named function in trainer.cpp. +// ───────────────────────────────────────────────────────────────────────────── + +fn sort_dictionary(result: &mut TrainResult) { + let n = result.dict.num_tokens(); + + let mut perm: Vec = (0..n as Token).collect(); + perm.sort_by(|&a, &b| { + let pa = result.dict.data(a); + let pb = result.dict.data(b); + pa.cmp(pb) + }); + + let mut sorted = Dictionary::default(); + sorted.bytes.reserve(result.dict.bytes_used()); + sorted.offsets.reserve(n + 1); + sorted.offsets.push(0); + + for &old_id in &perm { + let s = result.dict.span(old_id); + sorted + .bytes + .extend_from_slice(&result.dict.bytes[s.begin as usize..s.end as usize]); + sorted.offsets.push(sorted.bytes.len() as u32); + } + + result.dict = sorted; + result.lpm = LongestPrefixMatcher::from_dictionary(&result.dict); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Tests — ported from `tests/encoding/test_trainer.cpp`. 
+// ───────────────────────────────────────────────────────────────────────────── + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + use crate::config::DynamicThreshold; + use crate::config::FixedThreshold; + use crate::test_corpus::alternating_strings as make_alternating_strings; + use crate::test_corpus::binary_strings as make_binary_strings; + use crate::test_corpus::fixed_length_strings as make_fixed_length_strings; + use crate::test_corpus::homogeneous_strings as make_homogeneous_strings; + use crate::test_corpus::make_raw; + use crate::test_corpus::mixed_length_strings as make_mixed_length_strings; + use crate::test_corpus::random_ascii_strings as make_random_strings; + use crate::test_corpus::user_strings as make_user_strings; + + fn train_strings>(strings: &[S], cfg: &TrainingConfig) -> TrainResult { + let raw = make_raw(strings); + train(&raw.data, &raw.offsets, raw.n, cfg) + } + + fn check_base_tokens(d: &Dictionary) { + assert!(d.num_tokens() >= 256); + let mut found = [false; 256]; + for i in 0..d.num_tokens() { + let s = d.span(i as Token); + if s.size() == 1 { + found[d.bytes[s.begin as usize] as usize] = true; + } + } + for (i, &f) in found.iter().enumerate() { + assert!(f, "base token for byte {i} not found in dictionary"); + } + } + + fn is_lex_sorted(d: &Dictionary) -> bool { + let n = d.num_tokens(); + for i in 1..n { + let a = d.data((i - 1) as Token); + let b = d.data(i as Token); + if a > b { + return false; + } + } + true + } + + // ── Baseline invariant ───────────────────────────────────────────────── + + #[test] + fn base_tokens_always_single_bytes() { + let result = train_strings(&make_user_strings(50), &TrainingConfig::default()); + check_base_tokens(&result.dict); + } + + #[test] + fn base_tokens_on_empty_input() { + let data: Vec = vec![]; + let offsets = vec![0u32]; + let result = train(&data, &offsets, 0, &TrainingConfig::default()); + check_base_tokens(&result.dict); + assert_eq!(result.dict.num_tokens(), 256); + } + + 
#[test] + fn base_tokens_on_single_empty_string() { + let data: Vec = vec![]; + let offsets = vec![0u32, 0]; + let result = train(&data, &offsets, 1, &TrainingConfig::default()); + check_base_tokens(&result.dict); + assert_eq!(result.dict.num_tokens(), 256); + } + + // ── Dictionary size bounds ───────────────────────────────────────────── + + #[test] + fn dictionary_size_does_not_exceed_capacity() { + let cfg = TrainingConfig { + bits: 12, + threshold: ThresholdSpec::Fixed(FixedThreshold { value: 2 }), + seed: Some(42), + }; + let result = train_strings(&make_user_strings(500), &cfg); + assert!(result.dict.num_tokens() <= max_dict_size(cfg.bits)); + } + + // ── FixedThreshold ───────────────────────────────────────────────────── + + #[test] + fn threshold_gates_merges() { + // 100 copies of "ab": pair (a,b) appears exactly 100 times. + let corpus: Vec<&str> = (0..100).map(|_| "ab").collect(); + + let cfg_low = TrainingConfig { + threshold: ThresholdSpec::Fixed(FixedThreshold { value: 2 }), + seed: Some(42), + ..Default::default() + }; + assert!(train_strings(&corpus, &cfg_low).dict.num_tokens() > 256); + + let cfg_high = TrainingConfig { + threshold: ThresholdSpec::Fixed(FixedThreshold { value: 101 }), + seed: Some(42), + ..Default::default() + }; + assert_eq!(train_strings(&corpus, &cfg_high).dict.num_tokens(), 256); + } + + #[test] + fn fixed_threshold_2_merges_frequent_pairs() { + let corpus: Vec<&str> = (0..50).map(|_| "aabb").collect(); + let cfg = TrainingConfig { + threshold: ThresholdSpec::Fixed(FixedThreshold { value: 2 }), + seed: Some(42), + ..Default::default() + }; + assert!(train_strings(&corpus, &cfg).dict.num_tokens() > 256); + } + + #[test] + fn merged_token_content_is_correct() { + let corpus: Vec<&str> = (0..50).map(|_| "ab").collect(); + let cfg = TrainingConfig { + threshold: ThresholdSpec::Fixed(FixedThreshold { value: 2 }), + seed: Some(42), + ..Default::default() + }; + let result = train_strings(&corpus, &cfg); + let found = 
(0..result.dict.num_tokens()).any(|i| { + let s = result.dict.data(i as Token); + s == b"ab" + }); + assert!(found, "merged token \"ab\" not found in dictionary"); + } + + // ── Seed reproducibility ─────────────────────────────────────────────── + + #[test] + fn same_seed_produces_identical_dictionaries() { + let corpus = make_random_strings(100, 40, 12345); + let cfg = TrainingConfig { + seed: Some(42), + ..Default::default() + }; + let r1 = train_strings(&corpus, &cfg); + let r2 = train_strings(&corpus, &cfg); + assert_eq!(r1.dict.num_tokens(), r2.dict.num_tokens()); + assert_eq!(r1.dict.bytes, r2.dict.bytes); + assert_eq!(r1.dict.offsets, r2.dict.offsets); + } + + // ── sort_dictionary / LPM remap ──────────────────────────────────────── + + #[test] + fn dictionary_is_always_sorted() { + let result = train_strings(&make_user_strings(100), &TrainingConfig::default()); + assert!(is_lex_sorted(&result.dict)); + } + + #[test] + fn lpm_remaps_correctly() { + let strings = make_user_strings(30); + let result = train_strings(&strings, &TrainingConfig::default()); + let n = result.dict.num_tokens(); + for id in 0..n { + let bytes = result.dict.data(id as Token); + let (tok, len) = result.lpm.find_longest_match(bytes); + assert_eq!(tok, id as Token, "ID mismatch for token {id}"); + assert_eq!(len, bytes.len(), "length mismatch for token {id}"); + } + } + + // ── Token byte length ────────────────────────────────────────────────── + + #[test] + fn no_token_exceeds_max_token_size() { + let strings = make_random_strings(100, 50, 99); + let result = train_strings(&strings, &TrainingConfig::default()); + for i in 0..result.dict.num_tokens() { + let len = result.dict.token_size(i as Token); + assert!(len <= MAX_TOKEN_SIZE, "token {i} exceeds MAX_TOKEN_SIZE"); + } + } + + #[test] + fn no_token_has_zero_length() { + let cfg = TrainingConfig { + threshold: ThresholdSpec::Fixed(FixedThreshold { value: 2 }), + seed: Some(42), + ..Default::default() + }; + let corpora: Vec<(&str, 
Vec>)> = vec![ + ("random", make_random_strings(100, 50, 77)), + ( + "user", + make_user_strings(50) + .into_iter() + .map(|s| s.into_bytes()) + .collect(), + ), + ("binary", make_binary_strings(50, 30, 13)), + ("fixed_len", make_fixed_length_strings(20, MAX_TOKEN_SIZE)), + ]; + for (name, c) in &corpora { + let result = train_strings(c, &cfg); + for i in 0..result.dict.num_tokens() { + let len = result.dict.token_size(i as Token); + assert!(len > 0, "corpus={name} token {i} has zero length"); + } + } + } + + // ── DynamicThreshold ─────────────────────────────────────────────────── + + #[test] + fn dynamic_threshold_produces_merged_tokens() { + let cfg = TrainingConfig { + threshold: ThresholdSpec::Dynamic(DynamicThreshold { + sample_fraction: 0.5, + }), + seed: Some(42), + ..Default::default() + }; + let result = train_strings(&make_user_strings(200), &cfg); + assert!(result.dict.num_tokens() > 256); + } + + #[test] + fn dynamic_threshold_does_not_exceed_capacity() { + let cfg = TrainingConfig { + bits: 12, + threshold: ThresholdSpec::Dynamic(DynamicThreshold { + sample_fraction: 1.0, + }), + seed: Some(42), + }; + let result = train_strings(&make_user_strings(500), &cfg); + assert!(result.dict.num_tokens() <= max_dict_size(cfg.bits)); + } + + #[test] + fn dynamic_threshold_smaller_fraction_produces_fewer_tokens() { + let corpus = make_user_strings(500); + + let cfg_small = TrainingConfig { + bits: 14, + threshold: ThresholdSpec::Dynamic(DynamicThreshold { + sample_fraction: 0.05, + }), + seed: Some(42), + }; + let cfg_large = TrainingConfig { + bits: 14, + threshold: ThresholdSpec::Dynamic(DynamicThreshold { + sample_fraction: 1.0, + }), + seed: Some(42), + }; + + let r_small = train_strings(&corpus, &cfg_small); + let r_large = train_strings(&corpus, &cfg_large); + assert!(r_small.dict.num_tokens() <= r_large.dict.num_tokens()); + } + + #[test] + fn dynamic_threshold_dictionary_is_sorted() { + let cfg = TrainingConfig { + threshold: 
ThresholdSpec::Dynamic(DynamicThreshold { + sample_fraction: 0.3, + }), + seed: Some(42), + ..Default::default() + }; + let result = train_strings(&make_user_strings(100), &cfg); + assert!(is_lex_sorted(&result.dict)); + } + + // ── Dictionary is padded for decoder ─────────────────────────────────── + + #[test] + fn dictionary_is_padded_for_decoder() { + let result = train_strings(&make_user_strings(50), &TrainingConfig::default()); + let last_start = result.dict.offsets[result.dict.offsets.len() - 2] as usize; + assert!(result.dict.bytes.len() >= last_start + MAX_TOKEN_SIZE); + } + + // ── No duplicate tokens ──────────────────────────────────────────────── + + #[test] + fn no_duplicate_tokens_in_dictionary() { + let result = train_strings(&make_user_strings(100), &TrainingConfig::default()); + let n = result.dict.num_tokens(); + for i in 1..n { + let a = result.dict.data((i - 1) as Token); + let b = result.dict.data(i as Token); + assert!(a != b, "duplicate token at positions {} and {}", i - 1, i); + } + } + + // ── Corpus type coverage ─────────────────────────────────────────────── + + #[test] + fn homogeneous_corpus_produces_merges() { + let cfg = TrainingConfig { + threshold: ThresholdSpec::Fixed(FixedThreshold { value: 2 }), + seed: Some(42), + ..Default::default() + }; + let result = train_strings(&make_homogeneous_strings(50, 16, b'a'), &cfg); + assert!(result.dict.num_tokens() > 256); + check_base_tokens(&result.dict); + } + + #[test] + fn alternating_corpus_produces_merges() { + let cfg = TrainingConfig { + threshold: ThresholdSpec::Fixed(FixedThreshold { value: 2 }), + seed: Some(42), + ..Default::default() + }; + let result = train_strings(&make_alternating_strings(50, 16), &cfg); + assert!(result.dict.num_tokens() > 256); + check_base_tokens(&result.dict); + } + + #[test] + fn mixed_length_corpus_produces_valid_dictionary() { + let cfg = TrainingConfig { + threshold: ThresholdSpec::Fixed(FixedThreshold { value: 2 }), + seed: Some(42), + 
..Default::default() + }; + let result = train_strings(&make_mixed_length_strings(200, 64, 7), &cfg); + check_base_tokens(&result.dict); + assert!(is_lex_sorted(&result.dict)); + assert!(result.dict.num_tokens() <= max_dict_size(cfg.bits)); + } + + // ── All bit widths produce valid dictionary ──────────────────────────── + + #[test] + fn all_bit_widths_produce_valid_dictionary() { + let corpus = make_user_strings(50); + for b in 9u8..=16 { + let cfg = TrainingConfig { + bits: b, + seed: Some(42), + ..Default::default() + }; + let result = train_strings(&corpus, &cfg); + check_base_tokens(&result.dict); + assert!(is_lex_sorted(&result.dict), "not sorted for bits={b}"); + assert!( + result.dict.num_tokens() <= max_dict_size(b), + "overflow for bits={b}" + ); + } + } +} diff --git a/encodings/experimental/onpair-rs/src/types.rs b/encodings/experimental/onpair-rs/src/types.rs new file mode 100644 index 00000000000..166c63e3adf --- /dev/null +++ b/encodings/experimental/onpair-rs/src/types.rs @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Port of `include/onpair/core/types.h`. + +/// Number of bits per packed token. Legal range: 9..=16. +pub type BitWidth = u8; + +/// Token identifier within a dictionary. Capped at `2^bits` per column. +pub type Token = u16; + +/// Maximum byte length of any dictionary token. +pub const MAX_TOKEN_SIZE: usize = 16; + +/// Byte range `[begin, end)` inside the dictionary buffer. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct ByteSpan { + pub begin: u32, + pub end: u32, +} + +impl ByteSpan { + #[inline] + pub const fn size(self) -> u32 { + self.end - self.begin + } +} + +/// Token-stream index range `[begin, end)` inside the packed store. 
+#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct StreamSpan { + pub begin: u32, + pub end: u32, +} + +impl StreamSpan { + #[inline] + pub const fn size(self) -> u32 { + self.end - self.begin + } +} + +/// Closed range `[begin, last]` of token IDs. `begin > last` denotes empty. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct TokenRange { + pub begin: Token, + pub last: Token, +} + +impl Default for TokenRange { + fn default() -> Self { + // Canonical empty range: begin > last. + Self { begin: 1, last: 0 } + } +} + +impl TokenRange { + #[inline] + pub const fn empty(self) -> bool { + self.begin > self.last + } + + #[inline] + pub const fn size(self) -> u32 { + if self.empty() { + 0 + } else { + (self.last as u32) - (self.begin as u32) + 1 + } + } + + #[inline] + pub const fn contains(self, t: Token) -> bool { + t >= self.begin && t <= self.last + } +} + +/// Maximum dictionary size given a bit width. +#[inline] +pub const fn max_dict_size(bits: BitWidth) -> usize { + 1usize << bits +} + +/// Whether `bits` is in the legal range 9..=16. 
+#[inline] +pub const fn is_valid_bits(bits: BitWidth) -> bool { + bits >= 9 && bits <= 16 +} + +#[cfg(test)] +mod tests { + use super::*; + + // ── ByteSpan ──────────────────────────────────────────────────────────── + #[test] + fn byte_span_size_is_end_minus_begin() { + assert_eq!(ByteSpan { begin: 0, end: 0 }.size(), 0); + assert_eq!(ByteSpan { begin: 0, end: 1 }.size(), 1); + assert_eq!(ByteSpan { begin: 5, end: 10 }.size(), 5); + assert_eq!( + ByteSpan { + begin: 100, + end: 100 + } + .size(), + 0 + ); + } + + // ── StreamSpan ────────────────────────────────────────────────────────── + #[test] + fn stream_span_size_is_end_minus_begin() { + assert_eq!(StreamSpan { begin: 0, end: 0 }.size(), 0); + assert_eq!(StreamSpan { begin: 0, end: 1 }.size(), 1); + assert_eq!(StreamSpan { begin: 3, end: 7 }.size(), 4); + } + + // ── TokenRange ────────────────────────────────────────────────────────── + #[test] + fn token_range_default_is_empty() { + let r = TokenRange::default(); + assert!(r.empty()); + assert_eq!(r.size(), 0); + } + + #[test] + fn token_range_empty_when_begin_gt_last() { + assert!(TokenRange { begin: 5, last: 4 }.empty()); + assert!(TokenRange { begin: 1, last: 0 }.empty()); + } + + #[test] + fn token_range_not_empty_when_begin_leq_last() { + assert!(!TokenRange { begin: 0, last: 0 }.empty()); + assert!(!TokenRange { begin: 5, last: 5 }.empty()); + assert!( + !TokenRange { + begin: 0, + last: 100 + } + .empty() + ); + } + + #[test] + fn token_range_size_zero_for_empty() { + assert_eq!(TokenRange { begin: 10, last: 5 }.size(), 0); + } + + #[test] + fn token_range_size_one_for_single() { + assert_eq!(TokenRange { begin: 5, last: 5 }.size(), 1); + assert_eq!(TokenRange { begin: 0, last: 0 }.size(), 1); + } + + #[test] + fn token_range_size_last_minus_begin_plus_one() { + assert_eq!(TokenRange { begin: 3, last: 7 }.size(), 5); + assert_eq!( + TokenRange { + begin: 0, + last: 255 + } + .size(), + 256 + ); + } + + #[test] + fn 
token_range_contains_boundary_tokens() { + let r = TokenRange { + begin: 10, + last: 20, + }; + assert!(r.contains(10)); + assert!(r.contains(20)); + assert!(r.contains(15)); + } + + #[test] + fn token_range_contains_false_outside() { + let r = TokenRange { + begin: 10, + last: 20, + }; + assert!(!r.contains(9)); + assert!(!r.contains(21)); + } + + #[test] + fn token_range_contains_false_for_empty() { + let r = TokenRange::default(); + assert!(!r.contains(0)); + assert!(!r.contains(1)); + } + + // ── max_dict_size ─────────────────────────────────────────────────────── + #[test] + fn max_dict_size_12_is_4096() { + assert_eq!(max_dict_size(12), 4096); + } + + #[test] + fn max_dict_size_16_is_65536() { + assert_eq!(max_dict_size(16), 65536); + } + + #[test] + fn max_dict_size_is_pow2_for_all_valid_widths() { + for b in 9u8..=16 { + assert_eq!(max_dict_size(b), 1usize << b); + } + } + + // ── is_valid_bits ─────────────────────────────────────────────────────── + #[test] + fn is_valid_bits_accepts_9_to_16() { + for b in 9u8..=16 { + assert!(is_valid_bits(b), "expected true for bits={b}"); + } + } + + #[test] + fn is_valid_bits_rejects_8_and_17() { + assert!(!is_valid_bits(8)); + assert!(!is_valid_bits(17)); + } + + #[test] + fn is_valid_bits_rejects_0_and_255() { + assert!(!is_valid_bits(0)); + assert!(!is_valid_bits(255)); + } + + // ── MAX_TOKEN_SIZE ────────────────────────────────────────────────────── + #[test] + fn max_token_size_is_16() { + assert_eq!(MAX_TOKEN_SIZE, 16); + } +} diff --git a/encodings/experimental/onpair-rs/tests/cross_impl.rs b/encodings/experimental/onpair-rs/tests/cross_impl.rs new file mode 100644 index 00000000000..cdcfdb1f8c6 --- /dev/null +++ b/encodings/experimental/onpair-rs/tests/cross_impl.rs @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +#![allow(clippy::tests_outside_test_module)] +// +// Cross-implementation comparison tests: train + encode the same input 
with +// both the pure-Rust `vortex_onpair_rs::Column` and the C++-FFI +// `vortex_onpair_sys::Column`, then assert that downstream operations +// (decompression by row id, equality search, prefix search, substring +// search) agree. +// +// Bit-exact dictionary equality is NOT asserted: the two implementations +// use different RNGs (`std::mt19937_64` vs `rand`'s `StdRng`), so the +// merge-order of the BPE trainer differs. Equivalence is asserted on +// observable outputs: decode equality, predicate equality on the same +// queries, and the structural invariants the FFI guarantees +// (`bits == cfg.bits`, `len == n`, `dict_size <= 2^bits`, +// `codes_boundaries.len() == n + 1`). +// +// The pure-Rust crate exposes `Column::compress` and `Column::parts` with +// the same shape as `vortex-onpair-sys`. We materialise both columns' +// parts and compare what every downstream Vortex consumer (decode loop, +// predicate kernels) would see. + +use vortex_onpair_rs::Column as RustColumn; +use vortex_onpair_rs::OnPairTrainingConfig as RustConfig; +use vortex_onpair_rs::Parts as RustParts; +use vortex_onpair_rs::unpack_codes_to_u16; +use vortex_onpair_sys::Column as CppColumn; +use vortex_onpair_sys::OnPairTrainingConfig as CppConfig; +use vortex_onpair_sys::Parts as CppParts; + +// ───────────────────────────────────────────────────────────────────────────── +// Common helpers. 
+// ───────────────────────────────────────────────────────────────────────────── + +fn pack>(strings: &[S]) -> (Vec, Vec) { + let mut bytes = Vec::new(); + let mut offsets = Vec::with_capacity(strings.len() + 1); + offsets.push(0u64); + for s in strings { + bytes.extend_from_slice(s.as_ref()); + offsets.push(bytes.len() as u64); + } + (bytes, offsets) +} + +fn rust_cfg(bits: u32, threshold: f64, seed: u64) -> RustConfig { + RustConfig { + bits, + threshold, + seed, + } +} + +fn cpp_cfg(bits: u32, threshold: f64, seed: u64) -> CppConfig { + CppConfig { + bits, + threshold, + seed, + } +} + +/// Decompress row `row` using the pure-Rust decode loop applied to +/// arbitrary `(dict_bytes, dict_offsets, codes_packed, codes_boundaries, +/// bits)`. This is the same logic `vortex-onpair`'s `DecodeView` runs on +/// the materialised children, so when this decoder agrees on both +/// implementations' parts it proves that `vortex-onpair` downstream +/// (decode, LIKE, EQ) would also agree. +fn decode_row( + bits: u32, + dict_bytes: &[u8], + dict_offsets: F, + codes_packed: &[u64], + codes_boundaries: &[u32], + row: usize, +) -> Vec +where + F: Fn(usize) -> (usize, usize), +{ + let begin = codes_boundaries[row] as usize; + let end = codes_boundaries[row + 1] as usize; + let codes = unpack_codes_to_u16(codes_packed, end, bits); + let mut out = Vec::new(); + for &c in &codes[begin..end] { + let (s, e) = dict_offsets(c as usize); + out.extend_from_slice(&dict_bytes[s..e]); + } + out +} + +fn decode_rust(parts: &RustParts<'_>, row: usize) -> Vec { + decode_row( + parts.bits, + parts.dict_bytes, + |i| { + ( + parts.dict_offsets[i] as usize, + parts.dict_offsets[i + 1] as usize, + ) + }, + parts.codes_packed, + parts.codes_boundaries, + row, + ) +} + +fn decode_cpp(parts: &CppParts<'_>, row: usize) -> Vec { + decode_row( + parts.bits, + parts.dict_bytes, + |i| { + ( + parts.dict_offsets[i] as usize, + parts.dict_offsets[i + 1] as usize, + ) + }, + parts.codes_packed, + 
parts.codes_boundaries, + row, + ) +} + +/// Naive predicate over decoded strings, used as the source of truth for +/// equality / prefix / substring comparisons. +fn predicate_truth(strings: &[&[u8]], f: F) -> Vec +where + F: Fn(&[u8]) -> bool, +{ + strings.iter().map(|s| f(s)).collect() +} + +fn rust_predicate bool>(parts: &RustParts<'_>, f: F) -> Vec { + (0..parts.num_rows) + .map(|i| f(&decode_rust(parts, i))) + .collect() +} + +fn cpp_predicate bool>(parts: &CppParts<'_>, f: F) -> Vec { + (0..parts.num_rows) + .map(|i| f(&decode_cpp(parts, i))) + .collect() +} + +/// Corpus that produces lots of repetition so BPE merges fire. +fn corpus_urls() -> Vec<&'static [u8]> { + vec![ + b"https://www.example.com/page", + b"https://www.example.com/data", + b"https://www.example.com/page", + b"https://www.test.org/page", + b"ftp://files.example.com/x", + b"https://docs.example.com/spec", + b"https://api.example.net/v1", + b"https://www.example.com/data", + b"https://docs.example.com/spec", + b"https://www.example.com/page", + b"another_unique_row", + b"yet_another_row", + b"https://api.example.net/v1", + b"prefix_admin_001", + b"prefix_admin_002", + b"prefix_guest_001", + b"prefix_user_001", + b"prefix_user_002", + b"prefix_user_003", + ] +} + +fn corpus_binary() -> Vec> { + let mut out = Vec::with_capacity(40); + for i in 0u8..40 { + let mut row = Vec::with_capacity(24); + for j in 0u8..24 { + row.push(i.wrapping_add(j)); + } + out.push(row); + } + out +} + +// ───────────────────────────────────────────────────────────────────────────── +// Structural parity. 
// ─────────────────────────────────────────────────────────────────────────────

/// Both implementations must report the same row count, bit width,
/// dictionary cap and boundary-array length for the URL corpus.
#[test]
fn structural_parity_url_corpus() {
    let strings = corpus_urls();
    let row_count = strings.len();
    let (bytes, offsets) = pack(&strings);

    let cpp_col =
        CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 42)).expect("cpp compress");
    let rs_col =
        RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs compress");

    // Row counts.
    assert_eq!(cpp_col.len(), row_count);
    assert_eq!(rs_col.len(), row_count);

    // Bit width echoes the configuration.
    assert_eq!(cpp_col.bits(), 12);
    assert_eq!(rs_col.bits(), 12);

    // A 12-bit dictionary holds at most 4096 entries.
    assert!(cpp_col.dict_size() <= 4096);
    assert!(rs_col.dict_size() <= 4096);

    let cpp_parts = cpp_col.parts().expect("cpp parts");
    let rs_parts = rs_col.parts().expect("rs parts");

    // One boundary per row plus the terminating entry.
    let boundary_count = row_count + 1;
    assert_eq!(cpp_parts.codes_boundaries.len(), boundary_count);
    assert_eq!(rs_parts.codes_boundaries.len(), boundary_count);
}

// ─────────────────────────────────────────────────────────────────────────────
// Decompression equivalence.
//
// For every row in the corpus, both columns must decode back to the original
// bytes, regardless of dictionary divergence.
// ─────────────────────────────────────────────────────────────────────────────

/// Round-trips the URL corpus through both implementations at every bit
/// width from 9 to 16 and checks each decoded row against the input.
#[test]
fn decompress_equivalence_url_corpus() {
    let strings = corpus_urls();
    let (bytes, offsets) = pack(&strings);

    for bits in 9u32..=16 {
        let cpp_col =
            CppColumn::compress(&bytes, &offsets, cpp_cfg(bits, 0.5, 42)).expect("cpp compress");
        let rs_col =
            RustColumn::compress(&bytes, &offsets, rust_cfg(bits, 0.5, 42)).expect("rs compress");
        let cpp_parts = cpp_col.parts().expect("cpp parts");
        let rs_parts = rs_col.parts().expect("rs parts");

        for (i, expected) in strings.iter().copied().enumerate() {
            assert_eq!(
                decode_cpp(&cpp_parts, i),
                expected,
                "C++ decode bits={bits} row={i}"
            );
            assert_eq!(
                decode_rust(&rs_parts, i),
                expected,
                "Rust decode bits={bits} row={i}"
            );
        }
    }
}

/// Round-trips the binary corpus through both implementations at 14 bits.
#[test]
fn decompress_equivalence_binary_corpus() {
    let owned_rows = corpus_binary();
    let rows: Vec<&[u8]> = owned_rows.iter().map(Vec::as_slice).collect();
    let (bytes, offsets) = pack(&rows);

    let cpp_col = CppColumn::compress(&bytes, &offsets, cpp_cfg(14, 0.5, 7)).expect("cpp");
    let rs_col = RustColumn::compress(&bytes, &offsets, rust_cfg(14, 0.5, 7)).expect("rs");
    let cpp_parts = cpp_col.parts().expect("cpp parts");
    let rs_parts = rs_col.parts().expect("rs parts");

    for (i, expected) in rows.iter().copied().enumerate() {
        assert_eq!(decode_cpp(&cpp_parts, i), expected, "cpp binary row {i}");
        assert_eq!(decode_rust(&rs_parts, i), expected, "rust binary row {i}");
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Predicate equivalence (eq / starts_with / contains).
//
// Run the predicate against the decoded value of every row produced by each
// implementation and confirm both implementations agree with the
// naive-string ground truth.
// ─────────────────────────────────────────────────────────────────────────────

/// Exact-match selection must agree between both columns and the naive truth.
#[test]
fn equals_equivalence() {
    let strings = corpus_urls();
    let (bytes, offsets) = pack(&strings);
    let cpp_col = CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 42)).expect("cpp");
    let rs_col = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs");
    let cpp_parts = cpp_col.parts().expect("cpp parts");
    let rs_parts = rs_col.parts().expect("rs parts");

    let needles: Vec<&[u8]> = vec![
        b"https://www.example.com/page",
        b"prefix_user_002",
        b"definitely-not-in-corpus",
        b"another_unique_row",
    ];
    for needle in needles {
        // One predicate, evaluated against the raw input and both decoders.
        let is_equal = |s: &[u8]| s == needle;
        let truth = predicate_truth(&strings, &is_equal);
        let cpp_sel = cpp_predicate(&cpp_parts, &is_equal);
        let rs_sel = rust_predicate(&rs_parts, &is_equal);
        assert_eq!(cpp_sel, truth, "cpp eq for {needle:?}");
        assert_eq!(rs_sel, truth, "rust eq for {needle:?}");
        assert_eq!(cpp_sel, rs_sel, "cpp vs rust eq for {needle:?}");
    }
}

/// Prefix selection must agree between both columns and the naive truth.
#[test]
fn starts_with_equivalence() {
    let strings = corpus_urls();
    let (bytes, offsets) = pack(&strings);
    let cpp_col = CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 42)).expect("cpp");
    let rs_col = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs");
    let cpp_parts = cpp_col.parts().expect("cpp parts");
    let rs_parts = rs_col.parts().expect("rs parts");

    let needles: Vec<&[u8]> = vec![
        b"https://",
        b"prefix_user_",
        b"prefix_",
        b"ftp://",
        b"zzz_not_present",
    ];
    for needle in needles {
        let has_prefix = |s: &[u8]| s.starts_with(needle);
        let truth = predicate_truth(&strings, &has_prefix);
        let cpp_sel = cpp_predicate(&cpp_parts, &has_prefix);
        let rs_sel = rust_predicate(&rs_parts, &has_prefix);
        assert_eq!(cpp_sel, truth, "cpp starts_with for {needle:?}");
        assert_eq!(rs_sel, truth, "rust starts_with for {needle:?}");
        assert_eq!(cpp_sel, rs_sel);
    }
}

/// Substring selection must agree between both columns and the naive truth.
#[test]
fn contains_equivalence() {
    let strings = corpus_urls();
    let (bytes, offsets) = pack(&strings);
    let cpp_col = CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 42)).expect("cpp");
    let rs_col = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs");
    let cpp_parts = cpp_col.parts().expect("cpp parts");
    let rs_parts = rs_col.parts().expect("rs parts");

    let needles: Vec<&[u8]> = vec![b"example", b"admin", b"docs", b"_user_", b"never_appears"];
    for needle in needles {
        // Byte-level substring test via a sliding window of the needle's length.
        let has_substring = |s: &[u8]| s.windows(needle.len()).any(|w| w == needle);
        let truth = predicate_truth(&strings, &has_substring);
        let cpp_sel = cpp_predicate(&cpp_parts, &has_substring);
        let rs_sel = rust_predicate(&rs_parts, &has_substring);
        assert_eq!(cpp_sel, truth, "cpp contains for {needle:?}");
        assert_eq!(rs_sel, truth, "rust contains for {needle:?}");
        assert_eq!(cpp_sel, rs_sel);
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Dictionary structural invariants.
//
// Both implementations must produce a dictionary that:
//   * begins with all 256 single-byte tokens (in some order) — sufficient to
//     parse every possible byte;
//   * is lexicographically sorted by token byte sequence (required for
//     binary-search prefix lookups in downstream predicates).
// ─────────────────────────────────────────────────────────────────────────────

/// Returns `true` when every byte value `0..=255` appears as a one-byte
/// token in the dictionary.
///
/// `dict_offsets` holds cumulative token boundaries into `dict_bytes`: token
/// `i` spans `dict_offsets[i]..dict_offsets[i + 1]`. An empty or
/// single-entry offsets slice describes no tokens and yields `false`.
fn dict_contains_all_single_bytes(dict_bytes: &[u8], dict_offsets: &[u32]) -> bool {
    let mut found = [false; 256];
    // `windows(2)` walks adjacent boundary pairs and yields nothing for
    // fewer than two offsets, so `len() - 1` can no longer underflow.
    for bounds in dict_offsets.windows(2) {
        let s = bounds[0] as usize;
        let e = bounds[1] as usize;
        if e - s == 1 {
            found[dict_bytes[s] as usize] = true;
        }
    }
    found.iter().all(|&f| f)
}

/// Returns `true` when consecutive dictionary tokens are in non-decreasing
/// lexicographic byte order.
///
/// Dictionaries with fewer than two tokens (including an empty offsets
/// slice) are trivially sorted.
fn dict_is_sorted(dict_bytes: &[u8], dict_offsets: &[u32]) -> bool {
    // Each window of three offsets delimits two adjacent tokens.
    dict_offsets.windows(3).all(|bounds| {
        let a_s = bounds[0] as usize;
        let mid = bounds[1] as usize;
        let b_e = bounds[2] as usize;
        dict_bytes[a_s..mid] <= dict_bytes[mid..b_e]
    })
}

/// Both dictionaries must contain every single-byte token.
#[test]
fn both_dicts_cover_all_single_bytes() {
    let strings = corpus_urls();
    let (bytes, offsets) = pack(&strings);
    let cpp = CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 42)).expect("cpp");
    let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs");
    let cpp_parts = cpp.parts().expect("cpp parts");
    let rs_parts = rs.parts().expect("rs parts");
    assert!(dict_contains_all_single_bytes(
        cpp_parts.dict_bytes,
        cpp_parts.dict_offsets
    ));
    assert!(dict_contains_all_single_bytes(
        rs_parts.dict_bytes,
        rs_parts.dict_offsets
    ));
}

/// Both dictionaries must be lexicographically sorted.
#[test]
fn both_dicts_are_lex_sorted() {
    let strings = corpus_urls();
    let (bytes, offsets) = pack(&strings);
    let cpp = CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 42)).expect("cpp");
    let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs");
    let cpp_parts = cpp.parts().expect("cpp parts");
    let rs_parts = rs.parts().expect("rs parts");
    assert!(dict_is_sorted(cpp_parts.dict_bytes, cpp_parts.dict_offsets));
    assert!(dict_is_sorted(rs_parts.dict_bytes, rs_parts.dict_offsets));
}

// ─────────────────────────────────────────────────────────────────────────────
// Bitmap-API
equivalence. +// +// `onpair-lib`'s predicate API returns the same LSB-first packed bitmap +// layout as `vortex-onpair-sys`'s `*_bitmap` family. Compare the two +// directly against the C++-FFI implementation on identical inputs. +// ───────────────────────────────────────────────────────────────────────────── + +mod bitmap_parity { + use super::*; + + fn corpus_for_predicates() -> Vec<&'static [u8]> { + vec![ + b"https://www.example.com/page", + b"https://www.example.com/data", + b"https://www.test.org/page", + b"ftp://files.example.com/x", + b"https://www.example.com/page", + b"admin_001", + b"admin_002", + b"guest_001", + b"user_007", + b"no_match_row", + b"another_no_match", + ] + } + + #[test] + fn equals_bitmap_matches_cpp() { + let strings = corpus_for_predicates(); + let (bytes, offsets) = pack(&strings); + let cpp = CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 42)).expect("cpp"); + let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs"); + for needle in [&b"admin_001"[..], b"missing", b"user_007", b""] { + let cpp_bits = cpp.equals_bitmap(needle).expect("cpp eq"); + let rs_bits = rs.equals_bitmap(needle); + assert_eq!(cpp_bits, rs_bits, "needle={needle:?}"); + } + } + + #[test] + fn starts_with_bitmap_matches_cpp() { + let strings = corpus_for_predicates(); + let (bytes, offsets) = pack(&strings); + let cpp = CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 42)).expect("cpp"); + let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs"); + for needle in [&b"https://"[..], b"admin_", b"ftp://", b"zzz", b""] { + let cpp_bits = cpp.starts_with_bitmap(needle).expect("cpp sw"); + let rs_bits = rs.starts_with_bitmap(needle); + assert_eq!(cpp_bits, rs_bits, "needle={needle:?}"); + } + } + + #[test] + fn contains_bitmap_matches_cpp() { + let strings = corpus_for_predicates(); + let (bytes, offsets) = pack(&strings); + let cpp = CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 
42)).expect("cpp"); + let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs"); + for needle in [&b"example"[..], b"admin", b"never", b"_00", b""] { + let cpp_bits = cpp.contains_bitmap(needle).expect("cpp ct"); + let rs_bits = rs.contains_bitmap(needle); + assert_eq!(cpp_bits, rs_bits, "needle={needle:?}"); + } + } + + #[test] + fn decompress_row_matches_cpp() { + let strings = corpus_for_predicates(); + let (bytes, offsets) = pack(&strings); + let cpp = CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 42)).expect("cpp"); + let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs"); + let mut cpp_buf = Vec::new(); + let mut rs_buf = Vec::new(); + for (i, s) in strings.iter().enumerate() { + cpp.decompress_row(i, &mut cpp_buf).expect("cpp dec"); + rs.decompress_row(i, &mut rs_buf).expect("rs dec"); + assert_eq!(cpp_buf, *s, "cpp row {i}"); + assert_eq!(rs_buf, *s, "rs row {i}"); + } + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Multi-pattern (Aho-Corasick) equivalence against naive truth. +// `vortex-onpair-sys` exposes single-needle predicates only, so we compare +// against the byte-level union-of-substrings predicate. +// ───────────────────────────────────────────────────────────────────────────── + +mod multi_pattern { + use super::*; + + #[test] + fn multi_pattern_matches_naive_union() { + let strings: Vec<&[u8]> = vec![ + b"admin_001", + b"guest_999", + b"user_007", + b"no_pattern_here", + b"some_admin_text", + b"guest_in_middle", + ]; + let (bytes, offsets) = pack(&strings); + let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs"); + let needles: &[&[u8]] = &[b"admin", b"guest"]; + let rs_bits = rs.multi_pattern_bitmap(needles); + // Naive truth: row matches iff any needle is a substring of it. 
+ let mut expected = vec![0u8; strings.len().div_ceil(8)]; + for (i, s) in strings.iter().enumerate() { + if needles.iter().any(|n| s.windows(n.len()).any(|w| &w == n)) { + expected[i / 8] |= 1u8 << (i % 8); + } + } + assert_eq!(rs_bits, expected); + } +} + +// Bitmap-level algebra (`bitmap_and`/`bitmap_or`/`bitmap_not`) was removed +// from the public API — the token-automaton combinators (`and`, `or`, `not`) +// cover the same use cases in a single compressed-domain pass and are +// covered by the `token_automata` test module below. + +// ───────────────────────────────────────────────────────────────────────────── +// Token-automaton equivalence. +// +// Verify the compressed-domain automata (EqAutomaton, PrefixAutomaton, +// KmpAutomaton, AhoCorasickAutomaton) produce results identical to the +// C++-FFI bitmap implementations and to the in-crate byte-level bitmaps. +// ───────────────────────────────────────────────────────────────────────────── + +mod token_automata { + use vortex_onpair_rs::AhoCorasickAutomaton; + use vortex_onpair_rs::EqAutomaton; + use vortex_onpair_rs::KmpAutomaton; + use vortex_onpair_rs::PrefixAutomaton; + use vortex_onpair_rs::and; + use vortex_onpair_rs::not; + + use super::*; + + fn row_ids_from_bitmap(bits: &[u8], n: usize) -> Vec { + (0..n) + .filter(|&i| (bits[i / 8] >> (i % 8)) & 1 == 1) + .collect() + } + + fn corpus() -> Vec<&'static [u8]> { + vec![ + b"user_admin_001", + b"user_001", + b"user_002", + b"admin_001", + b"admin_002", + b"guest_001", + b"https://www.example.com/page", + b"https://docs.example.com/spec", + b"ftp://files.example.com/x", + b"prefix_user_777", + ] + } + + #[test] + fn eq_automaton_matches_cpp_equals_bitmap() { + let strings = corpus(); + let (bytes, offsets) = pack(&strings); + let cpp = CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 42)).expect("cpp"); + let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs"); + let dict = rs.dictionary().clone(); + for needle in 
[&b"admin_001"[..], b"user_001", b"missing"] { + let cpp_bitmap = cpp.equals_bitmap(needle).expect("cpp eq"); + let cpp_ids = row_ids_from_bitmap(&cpp_bitmap, strings.len()); + + let eq = EqAutomaton::new(needle, &dict); + let rs_ids = rs.scan(eq); + assert_eq!(rs_ids, cpp_ids, "needle={needle:?}"); + } + } + + #[test] + fn prefix_automaton_matches_cpp_starts_with_bitmap() { + let strings = corpus(); + let (bytes, offsets) = pack(&strings); + let cpp = CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 42)).expect("cpp"); + let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs"); + let dict = rs.dictionary().clone(); + for needle in [&b"user_"[..], b"admin", b"https://", b"zzz"] { + let cpp_bitmap = cpp.starts_with_bitmap(needle).expect("cpp sw"); + let cpp_ids = row_ids_from_bitmap(&cpp_bitmap, strings.len()); + + let pa = PrefixAutomaton::new(needle, &dict); + let rs_ids = rs.scan(pa); + assert_eq!(rs_ids, cpp_ids, "needle={needle:?}"); + } + } + + #[test] + fn kmp_automaton_matches_cpp_contains_bitmap() { + let strings = corpus(); + let (bytes, offsets) = pack(&strings); + let cpp = CppColumn::compress(&bytes, &offsets, cpp_cfg(12, 0.5, 42)).expect("cpp"); + let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs"); + let dict = rs.dictionary().clone(); + for needle in [&b"admin"[..], b"example", b"_001", b"missing", b"://"] { + let cpp_bitmap = cpp.contains_bitmap(needle).expect("cpp ct"); + let cpp_ids = row_ids_from_bitmap(&cpp_bitmap, strings.len()); + + let kmp = KmpAutomaton::new(needle, &dict); + let rs_ids = rs.scan(kmp); + assert_eq!(rs_ids, cpp_ids, "needle={needle:?}"); + } + } + + #[test] + fn aho_corasick_automaton_matches_naive_union() { + let strings = corpus(); + let (bytes, offsets) = pack(&strings); + let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs"); + let dict = rs.dictionary().clone(); + let needles: &[&[u8]] = &[b"admin", b"guest"]; + let ac = 
AhoCorasickAutomaton::new(needles, &dict); + let rs_ids = rs.scan(ac); + + let expected: Vec = strings + .iter() + .enumerate() + .filter(|(_, s)| needles.iter().any(|n| s.windows(n.len()).any(|w| &w == n))) + .map(|(i, _)| i) + .collect(); + assert_eq!(rs_ids, expected); + } + + #[test] + fn combinator_and_not_compressed_domain_matches_bitmap_algebra() { + // Single-pass `KMP(user) && !KMP(admin)` over the compressed token + // stream — compare with the bitmap-level equivalent. + let strings = corpus(); + let (bytes, offsets) = pack(&strings); + let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs"); + let dict = rs.dictionary().clone(); + + let mut kmp_user = KmpAutomaton::new(b"user", &dict); + let mut kmp_admin = KmpAutomaton::new(b"admin", &dict); + let token_ids = rs.scan(and(&mut kmp_user, not(&mut kmp_admin))); + + // Bitmap equivalent. + let users = rs.contains_bitmap(b"user"); + let admins = rs.contains_bitmap(b"admin"); + let combined: Vec = users + .iter() + .zip(admins.iter()) + .map(|(u, a)| u & !a) + .collect(); + let bitmap_ids = row_ids_from_bitmap(&combined, strings.len()); + + assert_eq!(token_ids, bitmap_ids); + } + + #[test] + fn combinator_eq_or_kmp_matches_bitmap_algebra() { + let strings = corpus(); + let (bytes, offsets) = pack(&strings); + let rs = RustColumn::compress(&bytes, &offsets, rust_cfg(12, 0.5, 42)).expect("rs"); + let dict = rs.dictionary().clone(); + + let mut eq = EqAutomaton::new(b"guest_001", &dict); + let mut kmp = KmpAutomaton::new(b"admin", &dict); + let token_ids = rs.scan(vortex_onpair_rs::or(&mut eq, &mut kmp)); + + let eq_bits = rs.equals_bitmap(b"guest_001"); + let kmp_bits = rs.contains_bitmap(b"admin"); + let combined: Vec = eq_bits + .iter() + .zip(kmp_bits.iter()) + .map(|(a, b)| a | b) + .collect(); + let bitmap_ids = row_ids_from_bitmap(&combined, strings.len()); + + assert_eq!(token_ids, bitmap_ids); + } +} diff --git a/encodings/experimental/onpair-sys/Cargo.toml 
b/encodings/experimental/onpair-sys/Cargo.toml new file mode 100644 index 00000000000..ec58d82841a --- /dev/null +++ b/encodings/experimental/onpair-sys/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "vortex-onpair-sys" +authors = { workspace = true } +categories = { workspace = true } +description = "Native FFI bindings to the OnPair short-string compression library" +edition = { workspace = true } +homepage = { workspace = true } +include = [ + "build.rs", + "src/**/*.rs", + "cxx/**/*", + "cmake/**/*", + "Cargo.toml", + "README.md", +] +keywords = { workspace = true } +license = { workspace = true } +links = "onpair_shim" +readme = "README.md" +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[build-dependencies] +cmake = "0.1" diff --git a/encodings/experimental/onpair-sys/README.md b/encodings/experimental/onpair-sys/README.md new file mode 100644 index 00000000000..d90be5475ef --- /dev/null +++ b/encodings/experimental/onpair-sys/README.md @@ -0,0 +1,31 @@ +# vortex-onpair-sys + +Low-level FFI bindings to the [OnPair][onpair] short-string compression library. + +OnPair is a dictionary-based compressor with **random access** and +**compressed-domain predicate evaluation** (substring, prefix, exact-match), +making it a natural fit for column scans with filter pushdown. + +This crate is the unsafe `*-sys` layer used by [`vortex-onpair`][onpair-rs]. +End users should depend on `vortex-onpair`, not this crate. + +## Build + +The build script uses CMake's `FetchContent` to pull +`gargiulofrancesco/onpair_cpp` at the pin recorded in `cmake/onpair_pin.cmake`, +applies a small patch that replaces `boost::unordered_flat_map` with +`std::unordered_map` to avoid the Boost dependency, and compiles both OnPair +and a thin C ABI shim (`cxx/onpair_shim.{h,cpp}`) into a single static archive +that is linked into the Rust crate. 
+ +### Requirements + +- CMake >= 3.21 +- A C++20-capable compiler (GCC >= 11, Clang >= 13, MSVC >= 19.29) +- Network access on first build (for `FetchContent`) + +After the first build the source tree is cached under +`$OUT_DIR/onpair-build/_deps`, so subsequent builds are offline. + +[onpair]: https://arxiv.org/abs/2508.02280 +[onpair-rs]: ../onpair diff --git a/encodings/experimental/onpair-sys/build.rs b/encodings/experimental/onpair-sys/build.rs new file mode 100644 index 00000000000..5d0bc69a39e --- /dev/null +++ b/encodings/experimental/onpair-sys/build.rs @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Builds the OnPair C++ library plus a thin C-ABI shim into a static archive +// that gets linked into this crate. The CMake configuration lives in +// `cmake/CMakeLists.txt` and fetches `gargiulofrancesco/onpair_cpp` via +// `FetchContent`. + +fn main() { + let cmake_dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("cmake"); + + println!("cargo:rerun-if-changed={}", cmake_dir.display()); + println!( + "cargo:rerun-if-changed={}", + std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("cxx") + .display() + ); + println!("cargo:rerun-if-env-changed=VORTEX_ONPAIR_FORCE_REBUILD"); + + let dst = cmake::Config::new(&cmake_dir) + .profile("Release") + .define("CMAKE_POLICY_DEFAULT_CMP0077", "NEW") + .define("CMAKE_POSITION_INDEPENDENT_CODE", "ON") + .define("ONPAIR_BUILD_TESTS", "OFF") + .define("ONPAIR_BUILD_EXAMPLES", "OFF") + .build(); + + println!("cargo:rustc-link-search=native={}/lib", dst.display()); + // The shim depends on onpair; both are static archives. + println!("cargo:rustc-link-lib=static=onpair_shim"); + println!("cargo:rustc-link-lib=static=onpair"); + + // C++ standard library, picked by host platform. 
+ let target = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); + match target.as_str() { + "macos" | "ios" => println!("cargo:rustc-link-lib=c++"), + "windows" => {} // MSVC links the runtime automatically. + _ => println!("cargo:rustc-link-lib=stdc++"), + } +} diff --git a/encodings/experimental/onpair-sys/cmake/CMakeLists.txt b/encodings/experimental/onpair-sys/cmake/CMakeLists.txt new file mode 100644 index 00000000000..c0ed6e29293 --- /dev/null +++ b/encodings/experimental/onpair-sys/cmake/CMakeLists.txt @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +cmake_minimum_required(VERSION 3.21) +project(onpair_shim CXX) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +include(FetchContent) +include("${CMAKE_CURRENT_LIST_DIR}/onpair_pin.cmake") + +# Skip onpair_cpp's own tests/examples and tell it not to fetch Boost. +set(ONPAIR_BUILD_TESTS OFF CACHE BOOL "" FORCE) +set(ONPAIR_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ONPAIR_ENABLE_LTO OFF CACHE BOOL "" FORCE) +set(ONPAIR_NATIVE_ARCH OFF CACHE BOOL "" FORCE) + +FetchContent_Declare( + onpair_cpp + GIT_REPOSITORY ${ONPAIR_CPP_REPO} + GIT_TAG ${ONPAIR_CPP_TAG} + PATCH_COMMAND ${CMAKE_COMMAND} + -DSRC_DIR= + -P "${CMAKE_CURRENT_LIST_DIR}/strip_boost.cmake" +) +FetchContent_MakeAvailable(onpair_cpp) + +add_library(onpair_shim STATIC + "${CMAKE_CURRENT_LIST_DIR}/../cxx/onpair_shim.cpp" +) +target_include_directories(onpair_shim + PUBLIC "${CMAKE_CURRENT_LIST_DIR}/../cxx" +) +target_link_libraries(onpair_shim PUBLIC OnPair::onpair) +set_target_properties(onpair_shim PROPERTIES POSITION_INDEPENDENT_CODE ON) + +install(TARGETS onpair_shim onpair + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib) diff --git a/encodings/experimental/onpair-sys/cmake/onpair_pin.cmake b/encodings/experimental/onpair-sys/cmake/onpair_pin.cmake new file mode 100644 index 
00000000000..9c02447e3ba --- /dev/null +++ b/encodings/experimental/onpair-sys/cmake/onpair_pin.cmake @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# Pin of gargiulofrancesco/onpair_cpp consumed by FetchContent. +# Bump `ONPAIR_CPP_TAG` to a full commit SHA when updating — never use a +# branch name in CI, otherwise builds become non-reproducible. +set(ONPAIR_CPP_REPO "https://github.com/gargiulofrancesco/onpair_cpp.git") +set(ONPAIR_CPP_TAG "ae590713515c7bb7893e14a757b484545e5339c3") diff --git a/encodings/experimental/onpair-sys/cmake/strip_boost.cmake b/encodings/experimental/onpair-sys/cmake/strip_boost.cmake new file mode 100644 index 00000000000..4bd1ad31253 --- /dev/null +++ b/encodings/experimental/onpair-sys/cmake/strip_boost.cmake @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# Replaces boost::unordered_flat_{map,set} with std::unordered_{map,set} +# in the fetched onpair_cpp source tree. Idempotent. +# +# Invoked by FetchContent_Declare(PATCH_COMMAND ...). +# +# We rewrite `#include ` to `#include ` +# and substitute the qualified types. OnPair only uses the public, std-compatible +# subset of boost::unordered_flat_map (operator[], find, emplace, size, iterators), +# so this is a sound substitution. 
+ +if(NOT DEFINED SRC_DIR) + message(FATAL_ERROR "strip_boost.cmake: SRC_DIR not set") +endif() + +file(GLOB_RECURSE ONPAIR_SOURCES + "${SRC_DIR}/include/onpair/*.h" + "${SRC_DIR}/include/onpair/*.hpp" + "${SRC_DIR}/src/onpair/*.cpp" + "${SRC_DIR}/src/onpair/*.h" + "${SRC_DIR}/src/onpair/*.hpp" +) + +set(_PAIR_HASH_BLOCK +"// strip_boost.cmake: std::hash> for unordered_map keys\n#include \n#include \n#include \nnamespace std {\ntemplate<> struct hash> {\n size_t operator()(const std::pair& p) const noexcept {\n return std::hash{}(p.first) ^ (std::hash{}(p.second) << 1);\n }\n};\n} // namespace std\n") + +foreach(F ${ONPAIR_SOURCES}) + file(READ "${F}" CONTENT) + string(REGEX REPLACE + "#include[ \t]+" + "#include " CONTENT "${CONTENT}") + string(REGEX REPLACE + "#include[ \t]+" + "#include " CONTENT "${CONTENT}") + string(REGEX REPLACE + "#include[ \t]+" + "#include \n#include " CONTENT "${CONTENT}") + string(REPLACE "boost::unordered_flat_map" "std::unordered_map" CONTENT "${CONTENT}") + string(REPLACE "boost::unordered_flat_set" "std::unordered_set" CONTENT "${CONTENT}") + string(REPLACE "boost::unordered::unordered_flat_map" "std::unordered_map" CONTENT "${CONTENT}") + string(REPLACE "boost::unordered::unordered_flat_set" "std::unordered_set" CONTENT "${CONTENT}") + # Inject the pair-hash specialization once, at the top of any file that + # keys an unordered_map by std::pair. std::hash> does not + # exist by default; boost::unordered_flat_map shipped its own. 
+ string(FIND "${CONTENT}" "unordered_map + +#include +#include +#include +#include +#include +#include +#include +#include + +using onpair::DECOMPRESS_BUFFER_PADDING; +using onpair::DictionaryView; +using onpair::OnPairColumn; +using onpair::OnPairColumnView; +using onpair::StoreView; +using onpair::encoding::DynamicThreshold; +using onpair::encoding::TrainingConfig; + +namespace { + +struct ColumnHandle { + OnPairColumn column; + std::optional view; + + const OnPairColumnView& get_view() { + if (!view) { + view.emplace(column.view()); + } + return *view; + } +}; + +void clear_bitmap(uint8_t* out, size_t n) noexcept { + std::memset(out, 0, (n + 7) / 8); +} + +inline void set_bit(uint8_t* out, size_t i) noexcept { + out[i / 8] |= static_cast(1u << (i % 8)); +} + +// Upper bound for the size of a single decompressed row. We don't have a +// per-row decoder capacity API, so we conservatively use total bytes_used() +// + padding, which is always at least as large as any single row. +size_t row_decompress_capacity(const OnPairColumnView& view) noexcept { + return view.bytes_used() + DECOMPRESS_BUFFER_PADDING + 1; +} + +// uint64 → uint32 offset copy. The C++ API takes uint32_t offsets; our FFI +// stays uint64 so Rust callers don't have to truncate. We bail out on +// overflow rather than silently wrapping. 
+bool offsets_fit_u32(const uint64_t* offsets, size_t n_plus_one) noexcept { + for (size_t i = 0; i < n_plus_one; ++i) { + if (offsets[i] > static_cast(UINT32_MAX)) { + return false; + } + } + return true; +} + +} // namespace + +extern "C" { + +OnPairStatus onpair_column_compress( + const uint8_t* bytes, + const uint64_t* offsets, + size_t n, + OnPairTrainingConfig config, + OnPairColumnHandle** out_handle) { + if (out_handle == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + *out_handle = nullptr; + if ((bytes == nullptr && n > 0) || offsets == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + if (config.bits < 9 || config.bits > 16) { + return ONPAIR_ERR_INVALID_ARG; + } + if (!offsets_fit_u32(offsets, n + 1)) { + return ONPAIR_ERR_INVALID_ARG; + } + try { + TrainingConfig tc{}; + tc.bits = static_cast(config.bits); + tc.threshold = DynamicThreshold{config.threshold}; + if (config.seed != 0) { + tc.seed = config.seed; + } + + // Re-pack uint64 → uint32 in a temporary so we can call the + // (data, offsets, n, cfg) overload that takes uint32 offsets. + std::vector off32(n + 1); + for (size_t i = 0; i < n + 1; ++i) { + off32[i] = static_cast(offsets[i]); + } + + auto column = OnPairColumn::compress( + reinterpret_cast(bytes), + off32.data(), + n, + tc); + auto handle = std::make_unique(); + handle->column = std::move(column); + *out_handle = reinterpret_cast(handle.release()); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) 
{ + return ONPAIR_ERR_INTERNAL; + } +} + +OnPairStatus onpair_column_deserialize( + const uint8_t* data, + size_t len, + OnPairColumnHandle** out_handle) { + if (out_handle == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + *out_handle = nullptr; + if (data == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + try { + std::stringstream ss; + ss.write(reinterpret_cast(data), static_cast(len)); + auto column = OnPairColumn::read_from(ss); + auto handle = std::make_unique(); + handle->column = std::move(column); + *out_handle = reinterpret_cast(handle.release()); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) { + return ONPAIR_ERR_BAD_FORMAT; + } +} + +OnPairStatus onpair_column_serialize( + const OnPairColumnHandle* handle, + uint8_t** out_data, + size_t* out_len) { + if (handle == nullptr || out_data == nullptr || out_len == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + *out_data = nullptr; + *out_len = 0; + try { + const auto* h = reinterpret_cast(handle); + std::stringstream ss; + h->column.write_to(ss); + const std::string s = ss.str(); + auto* buf = static_cast(std::malloc(s.size() == 0 ? 1 : s.size())); + if (buf == nullptr) { + return ONPAIR_ERR_OOM; + } + std::memcpy(buf, s.data(), s.size()); + *out_data = buf; + *out_len = s.size(); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) 
{ + return ONPAIR_ERR_INTERNAL; + } +} + +void onpair_column_free(OnPairColumnHandle* handle) { + delete reinterpret_cast(handle); +} + +void onpair_buffer_free(uint8_t* data, size_t /*len*/) { + std::free(data); +} + +size_t onpair_column_len(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return 0; + } + auto* h = const_cast(reinterpret_cast(handle)); + return h->get_view().num_strings(); +} + +uint32_t onpair_column_bits(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return 0; + } + auto* h = const_cast(reinterpret_cast(handle)); + return static_cast(h->get_view().bits()); +} + +size_t onpair_column_dict_size(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return 0; + } + auto* h = const_cast(reinterpret_cast(handle)); + return h->get_view().dictionary().num_tokens(); +} + +OnPairStatus onpair_column_decompress( + const OnPairColumnHandle* handle, + size_t row_id, + uint8_t* out_buf, + size_t out_capacity, + size_t* out_len) { + if (handle == nullptr || out_buf == nullptr || out_len == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + *out_len = 0; + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + if (row_id >= view.num_strings()) { + return ONPAIR_ERR_OUT_OF_RANGE; + } + // The decoder over-copies by DECOMPRESS_BUFFER_PADDING bytes per token, + // so the caller's buffer must include that headroom. + const size_t needed = row_decompress_capacity(view); + if (needed > out_capacity) { + return ONPAIR_ERR_OOM; + } + *out_len = view.decompress(row_id, reinterpret_cast(out_buf)); + return ONPAIR_OK; + } catch (...) 
{ + return ONPAIR_ERR_INTERNAL; + } +} + +size_t onpair_column_decompress_capacity(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return DECOMPRESS_BUFFER_PADDING; + } + auto* h = const_cast(reinterpret_cast(handle)); + return row_decompress_capacity(h->get_view()); +} + +OnPairStatus onpair_column_equals_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits) { + if (handle == nullptr || out_bits == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + clear_bitmap(out_bits, view.num_strings()); + view.equals( + std::string_view(reinterpret_cast(needle), needle_len), + [out_bits](size_t idx) { set_bit(out_bits, idx); }); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) { + return ONPAIR_ERR_INTERNAL; + } +} + +OnPairStatus onpair_column_starts_with_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits) { + if (handle == nullptr || out_bits == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + clear_bitmap(out_bits, view.num_strings()); + view.starts_with( + std::string_view(reinterpret_cast(needle), needle_len), + [out_bits](size_t idx) { set_bit(out_bits, idx); }); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) 
{ + return ONPAIR_ERR_INTERNAL; + } +} + +OnPairStatus onpair_column_contains_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits) { + if (handle == nullptr || out_bits == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + clear_bitmap(out_bits, view.num_strings()); + view.contains( + std::string_view(reinterpret_cast(needle), needle_len), + [out_bits](size_t idx) { set_bit(out_bits, idx); }); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) { + return ONPAIR_ERR_INTERNAL; + } +} + +OnPairStatus onpair_column_dict_copy( + const OnPairColumnHandle* handle, + uint8_t* out_bytes, + size_t bytes_capacity, + uint64_t* out_offsets) { + if (handle == nullptr || out_offsets == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& dv = h->get_view().dictionary(); + const size_t n = dv.num_tokens(); + const auto* raw_off = dv.raw_offsets(); + const auto* raw_bytes_ptr = dv.raw_bytes(); + const size_t total = raw_off[n]; + if (total > bytes_capacity) { + return ONPAIR_ERR_OOM; + } + if (total > 0 && out_bytes != nullptr) { + std::memcpy(out_bytes, raw_bytes_ptr, total); + } + for (size_t i = 0; i <= n; ++i) { + out_offsets[i] = static_cast(raw_off[i]); + } + return ONPAIR_OK; + } catch (...) { + return ONPAIR_ERR_INTERNAL; + } +} + +size_t onpair_column_dict_bytes(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return 0; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& dv = h->get_view().dictionary(); + return dv.bytes_used(); + } catch (...) 
{ + return 0; + } +} + +OnPairStatus onpair_column_parts( + const OnPairColumnHandle* handle, + OnPairColumnParts* out_parts) { + if (handle == nullptr || out_parts == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + const DictionaryView& dv = view.dictionary(); + const StoreView& sv = view.store(); + + const size_t dict_size = dv.num_tokens(); + const uint32_t* dict_off = dv.raw_offsets(); + const size_t dict_bytes = dict_size == 0 ? 0 : dict_off[dict_size]; + + const size_t num_rows = sv.num_strings(); + const uint32_t bw = static_cast(sv.bits()); + const size_t tokens = sv.num_tokens(); + // The packed stream is laid out by BitWriter as a vector; + // round-up-to-u64 of (tokens * bits) bits. + const size_t packed_u64 = (tokens * bw + 63) / 64; + + out_parts->dict_bytes = dv.raw_bytes(); + out_parts->dict_bytes_len = dict_bytes; + out_parts->dict_offsets = dict_off; + out_parts->dict_offsets_len = dict_size + 1; + out_parts->codes_packed = sv.packed_data(); + out_parts->codes_packed_u64_len = packed_u64; + out_parts->codes_boundaries = sv.boundaries(); + out_parts->codes_boundaries_len = num_rows + 1; + out_parts->bits = bw; + out_parts->num_rows = num_rows; + return ONPAIR_OK; + } catch (...) { + return ONPAIR_ERR_INTERNAL; + } +} + +} // extern "C" diff --git a/encodings/experimental/onpair-sys/cxx/onpair_shim.h b/encodings/experimental/onpair-sys/cxx/onpair_shim.h new file mode 100644 index 00000000000..f3ef47d06c7 --- /dev/null +++ b/encodings/experimental/onpair-sys/cxx/onpair_shim.h @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// C ABI over the OnPair C++ library. All functions are nothrow; failures are +// signalled by a non-zero return code, with the caller responsible for any +// out-parameter allocations. 
// Training configuration passed by value to onpair_column_compress.
//
// `bits` must be in [9, 16]; `dict_12` corresponds to bits = 12. `threshold`
// is the dynamic frequency threshold (smaller values produce larger
// dictionaries).
typedef struct OnPairTrainingConfig {
    // Bits per token. onpair_column_compress rejects values outside [9, 16]
    // with ONPAIR_ERR_INVALID_ARG.
    uint32_t bits;
    // Forwarded to the library as its DynamicThreshold.
    double threshold;
    // Training seed. 0 is treated as "unset": the shim then leaves the
    // library's default seed in place instead of overriding it.
    uint64_t seed;
} OnPairTrainingConfig;
// Decompress the row at `row_id` into `out_buf`, which must hold at least
// `out_capacity` bytes. On success `*out_len` holds the number of bytes
// written; it is set to 0 on entry, so it reads 0 after a failure.
//
// `out_capacity` must be at least onpair_column_decompress_capacity(handle).
// That is a column-wide bound which includes the decoder's over-copy padding,
// so a buffer sized to just the requested row is NOT sufficient. The check is
// made before any byte is written.
//
// Returns:
//   ONPAIR_ERR_INVALID_ARG   `handle`, `out_buf` or `out_len` is null
//   ONPAIR_ERR_OUT_OF_RANGE  `row_id` is not smaller than the row count
//   ONPAIR_ERR_OOM           `out_capacity` is below the required capacity
//   ONPAIR_ERR_INTERNAL      the decoder raised an exception
OnPairStatus onpair_column_decompress(
    const OnPairColumnHandle* handle,
    size_t row_id,
    uint8_t* out_buf,
    size_t out_capacity,
    size_t* out_len);
+size_t onpair_column_dict_bytes(const OnPairColumnHandle* handle); + +// --- Decomposition into raw arrays (Vortex layout) ------------------------ +// +// Borrows pointers to the column's underlying Dictionary + Store vectors. +// The pointers remain valid until `handle` is freed; the caller is expected +// to copy them out into Vortex buffers/children and then drop the column. + +typedef struct OnPairColumnParts { + const uint8_t* dict_bytes; + size_t dict_bytes_len; // = dict_offsets[dict_size] (true, unpadded) + const uint32_t* dict_offsets; + size_t dict_offsets_len; // = dict_size + 1 + const uint64_t* codes_packed; // LSB-first bit-packed token stream + size_t codes_packed_u64_len; // u64 word count + const uint32_t* codes_boundaries; // per-row token index + size_t codes_boundaries_len; // = num_rows + 1 + uint32_t bits; // 9..=16 + size_t num_rows; +} OnPairColumnParts; + +OnPairStatus onpair_column_parts( + const OnPairColumnHandle* handle, + OnPairColumnParts* out_parts); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VORTEX_ONPAIR_SHIM_H diff --git a/encodings/experimental/onpair-sys/public-api.lock b/encodings/experimental/onpair-sys/public-api.lock new file mode 100644 index 00000000000..0480e8b6f81 --- /dev/null +++ b/encodings/experimental/onpair-sys/public-api.lock @@ -0,0 +1,351 @@ +pub mod vortex_onpair_sys + +pub mod vortex_onpair_sys::ffi + +#[repr(u32)] pub enum vortex_onpair_sys::ffi::OnPairStatus + +pub vortex_onpair_sys::ffi::OnPairStatus::BadFormat = 2 + +pub vortex_onpair_sys::ffi::OnPairStatus::Internal = 99 + +pub vortex_onpair_sys::ffi::OnPairStatus::InvalidArg = 1 + +pub vortex_onpair_sys::ffi::OnPairStatus::Ok = 0 + +pub vortex_onpair_sys::ffi::OnPairStatus::Oom = 4 + +pub vortex_onpair_sys::ffi::OnPairStatus::OutOfRange = 3 + +impl vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::from_raw(u32) -> Self + +impl core::clone::Clone for vortex_onpair_sys::OnPairStatus + +pub fn 
vortex_onpair_sys::OnPairStatus::clone(&self) -> vortex_onpair_sys::OnPairStatus + +impl core::cmp::Eq for vortex_onpair_sys::OnPairStatus + +impl core::cmp::PartialEq for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::eq(&self, &vortex_onpair_sys::OnPairStatus) -> bool + +impl core::fmt::Debug for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairStatus + +impl core::marker::StructuralPartialEq for vortex_onpair_sys::OnPairStatus + +#[repr(C)] pub struct vortex_onpair_sys::ffi::OnPairColumnHandle + +#[repr(C)] pub struct vortex_onpair_sys::ffi::OnPairColumnParts + +pub vortex_onpair_sys::ffi::OnPairColumnParts::bits: u32 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::codes_boundaries: *const u32 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::codes_boundaries_len: usize + +pub vortex_onpair_sys::ffi::OnPairColumnParts::codes_packed: *const u64 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::codes_packed_u64_len: usize + +pub vortex_onpair_sys::ffi::OnPairColumnParts::dict_bytes: *const u8 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::dict_bytes_len: usize + +pub vortex_onpair_sys::ffi::OnPairColumnParts::dict_offsets: *const u32 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::dict_offsets_len: usize + +pub vortex_onpair_sys::ffi::OnPairColumnParts::num_rows: usize + +impl core::clone::Clone for vortex_onpair_sys::OnPairColumnParts + +pub fn vortex_onpair_sys::OnPairColumnParts::clone(&self) -> vortex_onpair_sys::OnPairColumnParts + +impl core::fmt::Debug for vortex_onpair_sys::OnPairColumnParts + +pub fn vortex_onpair_sys::OnPairColumnParts::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairColumnParts + +#[repr(C)] pub struct vortex_onpair_sys::ffi::OnPairTrainingConfig + +pub 
vortex_onpair_sys::ffi::OnPairTrainingConfig::bits: u32 + +pub vortex_onpair_sys::ffi::OnPairTrainingConfig::seed: u64 + +pub vortex_onpair_sys::ffi::OnPairTrainingConfig::threshold: f64 + +impl core::clone::Clone for vortex_onpair_sys::OnPairTrainingConfig + +pub fn vortex_onpair_sys::OnPairTrainingConfig::clone(&self) -> vortex_onpair_sys::OnPairTrainingConfig + +impl core::fmt::Debug for vortex_onpair_sys::OnPairTrainingConfig + +pub fn vortex_onpair_sys::OnPairTrainingConfig::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairTrainingConfig + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_buffer_free(*mut u8, usize) + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_bits(*const vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_compress(*const u8, *const u64, usize, vortex_onpair_sys::OnPairTrainingConfig, *mut *mut vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_contains_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_decompress(*const vortex_onpair_sys::OnPairColumnHandle, usize, *mut u8, usize, *mut usize) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_decompress_capacity(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_deserialize(*const u8, usize, *mut *mut vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_dict_bytes(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_dict_copy(*const vortex_onpair_sys::OnPairColumnHandle, *mut u8, usize, *mut u64) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_dict_size(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn 
vortex_onpair_sys::ffi::onpair_column_equals_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_free(*mut vortex_onpair_sys::OnPairColumnHandle) + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_len(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_parts(*const vortex_onpair_sys::OnPairColumnHandle, *mut vortex_onpair_sys::OnPairColumnParts) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_serialize(*const vortex_onpair_sys::OnPairColumnHandle, *mut *mut u8, *mut usize) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_starts_with_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub enum vortex_onpair_sys::Error + +pub vortex_onpair_sys::Error::BadFormat + +pub vortex_onpair_sys::Error::Internal + +pub vortex_onpair_sys::Error::InvalidArg + +pub vortex_onpair_sys::Error::Oom + +pub vortex_onpair_sys::Error::OutOfRange + +impl core::clone::Clone for vortex_onpair_sys::Error + +pub fn vortex_onpair_sys::Error::clone(&self) -> vortex_onpair_sys::Error + +impl core::cmp::Eq for vortex_onpair_sys::Error + +impl core::cmp::PartialEq for vortex_onpair_sys::Error + +pub fn vortex_onpair_sys::Error::eq(&self, &vortex_onpair_sys::Error) -> bool + +impl core::error::Error for vortex_onpair_sys::Error + +impl core::fmt::Debug for vortex_onpair_sys::Error + +pub fn vortex_onpair_sys::Error::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_onpair_sys::Error + +pub fn vortex_onpair_sys::Error::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::Error + +impl core::marker::StructuralPartialEq for vortex_onpair_sys::Error + +#[repr(u32)] pub enum vortex_onpair_sys::OnPairStatus + +pub vortex_onpair_sys::OnPairStatus::BadFormat = 2 + +pub 
vortex_onpair_sys::OnPairStatus::Internal = 99 + +pub vortex_onpair_sys::OnPairStatus::InvalidArg = 1 + +pub vortex_onpair_sys::OnPairStatus::Ok = 0 + +pub vortex_onpair_sys::OnPairStatus::Oom = 4 + +pub vortex_onpair_sys::OnPairStatus::OutOfRange = 3 + +impl vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::from_raw(u32) -> Self + +impl core::clone::Clone for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::clone(&self) -> vortex_onpair_sys::OnPairStatus + +impl core::cmp::Eq for vortex_onpair_sys::OnPairStatus + +impl core::cmp::PartialEq for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::eq(&self, &vortex_onpair_sys::OnPairStatus) -> bool + +impl core::fmt::Debug for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairStatus + +impl core::marker::StructuralPartialEq for vortex_onpair_sys::OnPairStatus + +pub struct vortex_onpair_sys::Column + +impl vortex_onpair_sys::Column + +pub fn vortex_onpair_sys::Column::bits(&self) -> u32 + +pub fn vortex_onpair_sys::Column::compress(&[u8], &[u64], vortex_onpair_sys::OnPairTrainingConfig) -> core::result::Result + +pub fn vortex_onpair_sys::Column::contains_bitmap(&self, &[u8]) -> core::result::Result, vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::decompress_row(&self, usize, &mut alloc::vec::Vec) -> core::result::Result<(), vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::dict(&self) -> core::result::Result<(alloc::vec::Vec, alloc::vec::Vec), vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::dict_bytes(&self) -> usize + +pub fn vortex_onpair_sys::Column::dict_size(&self) -> usize + +pub fn vortex_onpair_sys::Column::equals_bitmap(&self, &[u8]) -> core::result::Result, vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::from_bytes(&[u8]) -> 
core::result::Result + +pub fn vortex_onpair_sys::Column::is_empty(&self) -> bool + +pub fn vortex_onpair_sys::Column::len(&self) -> usize + +pub fn vortex_onpair_sys::Column::max_decompress_capacity(&self) -> usize + +pub unsafe fn vortex_onpair_sys::Column::raw(&self) -> *const core::ffi::c_void + +pub fn vortex_onpair_sys::Column::starts_with_bitmap(&self, &[u8]) -> core::result::Result, vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::to_bytes(&self) -> core::result::Result, vortex_onpair_sys::Error> + +impl vortex_onpair_sys::Column + +pub fn vortex_onpair_sys::Column::parts(&self) -> core::result::Result, vortex_onpair_sys::Error> + +impl core::marker::Send for vortex_onpair_sys::Column + +impl core::marker::Sync for vortex_onpair_sys::Column + +impl core::ops::drop::Drop for vortex_onpair_sys::Column + +pub fn vortex_onpair_sys::Column::drop(&mut self) + +#[repr(C)] pub struct vortex_onpair_sys::OnPairColumnHandle + +#[repr(C)] pub struct vortex_onpair_sys::OnPairColumnParts + +pub vortex_onpair_sys::OnPairColumnParts::bits: u32 + +pub vortex_onpair_sys::OnPairColumnParts::codes_boundaries: *const u32 + +pub vortex_onpair_sys::OnPairColumnParts::codes_boundaries_len: usize + +pub vortex_onpair_sys::OnPairColumnParts::codes_packed: *const u64 + +pub vortex_onpair_sys::OnPairColumnParts::codes_packed_u64_len: usize + +pub vortex_onpair_sys::OnPairColumnParts::dict_bytes: *const u8 + +pub vortex_onpair_sys::OnPairColumnParts::dict_bytes_len: usize + +pub vortex_onpair_sys::OnPairColumnParts::dict_offsets: *const u32 + +pub vortex_onpair_sys::OnPairColumnParts::dict_offsets_len: usize + +pub vortex_onpair_sys::OnPairColumnParts::num_rows: usize + +impl core::clone::Clone for vortex_onpair_sys::OnPairColumnParts + +pub fn vortex_onpair_sys::OnPairColumnParts::clone(&self) -> vortex_onpair_sys::OnPairColumnParts + +impl core::fmt::Debug for vortex_onpair_sys::OnPairColumnParts + +pub fn vortex_onpair_sys::OnPairColumnParts::fmt(&self, &mut 
core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairColumnParts + +#[repr(C)] pub struct vortex_onpair_sys::OnPairTrainingConfig + +pub vortex_onpair_sys::OnPairTrainingConfig::bits: u32 + +pub vortex_onpair_sys::OnPairTrainingConfig::seed: u64 + +pub vortex_onpair_sys::OnPairTrainingConfig::threshold: f64 + +impl core::clone::Clone for vortex_onpair_sys::OnPairTrainingConfig + +pub fn vortex_onpair_sys::OnPairTrainingConfig::clone(&self) -> vortex_onpair_sys::OnPairTrainingConfig + +impl core::fmt::Debug for vortex_onpair_sys::OnPairTrainingConfig + +pub fn vortex_onpair_sys::OnPairTrainingConfig::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairTrainingConfig + +pub struct vortex_onpair_sys::Parts<'a> + +pub vortex_onpair_sys::Parts::bits: u32 + +pub vortex_onpair_sys::Parts::codes_boundaries: &'a [u32] + +pub vortex_onpair_sys::Parts::codes_packed: &'a [u64] + +pub vortex_onpair_sys::Parts::dict_bytes: &'a [u8] + +pub vortex_onpair_sys::Parts::dict_offsets: &'a [u32] + +pub vortex_onpair_sys::Parts::num_rows: usize + +impl<'a> core::clone::Clone for vortex_onpair_sys::Parts<'a> + +pub fn vortex_onpair_sys::Parts<'a>::clone(&self) -> vortex_onpair_sys::Parts<'a> + +impl<'a> core::marker::Copy for vortex_onpair_sys::Parts<'a> + +pub const vortex_onpair_sys::DEFAULT_DICT12_CONFIG: vortex_onpair_sys::OnPairTrainingConfig + +pub unsafe c fn vortex_onpair_sys::onpair_buffer_free(*mut u8, usize) + +pub unsafe c fn vortex_onpair_sys::onpair_column_bits(*const vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_compress(*const u8, *const u64, usize, vortex_onpair_sys::OnPairTrainingConfig, *mut *mut vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_contains_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub unsafe c fn 
vortex_onpair_sys::onpair_column_decompress(*const vortex_onpair_sys::OnPairColumnHandle, usize, *mut u8, usize, *mut usize) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_decompress_capacity(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::onpair_column_deserialize(*const u8, usize, *mut *mut vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_dict_bytes(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::onpair_column_dict_copy(*const vortex_onpair_sys::OnPairColumnHandle, *mut u8, usize, *mut u64) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_dict_size(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::onpair_column_equals_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_free(*mut vortex_onpair_sys::OnPairColumnHandle) + +pub unsafe c fn vortex_onpair_sys::onpair_column_len(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::onpair_column_parts(*const vortex_onpair_sys::OnPairColumnHandle, *mut vortex_onpair_sys::OnPairColumnParts) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_serialize(*const vortex_onpair_sys::OnPairColumnHandle, *mut *mut u8, *mut usize) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_starts_with_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub fn vortex_onpair_sys::read_bits_lsb(&[u64], usize, u32) -> u16 + +pub fn vortex_onpair_sys::unpack_codes_to_u16(&[u64], usize, u32) -> alloc::vec::Vec diff --git a/encodings/experimental/onpair-sys/src/lib.rs b/encodings/experimental/onpair-sys/src/lib.rs new file mode 100644 index 00000000000..a6804eb4c21 --- /dev/null +++ b/encodings/experimental/onpair-sys/src/lib.rs @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: 
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors
//
//! Unsafe FFI bindings to the OnPair C++ compression library.
//!
//! The public surface is intentionally minimal: a [`Column`] owning handle
//! plus the C-ABI functions defined in `cxx/onpair_shim.h`. Safe wrappers and
//! the Vortex array implementation live in the `vortex-onpair` crate.

#![allow(non_camel_case_types)]

use std::ffi::c_void;
use std::ptr::NonNull;

/// Raw C-ABI declarations. Everything in here is re-exported at the crate
/// root via `pub use ffi::*`.
pub mod ffi {
    /// Opaque column object living on the C++ side. The type is zero-sized
    /// and is only ever handled through raw pointers.
    #[repr(C)]
    pub struct OnPairColumnHandle {
        _opaque: [u8; 0],
    }

    /// Status codes. The shim functions return them as a raw `u32`; use
    /// [`OnPairStatus::from_raw`] to decode one.
    #[repr(u32)]
    #[derive(Debug, Copy, Clone, Eq, PartialEq)]
    pub enum OnPairStatus {
        Ok = 0,
        InvalidArg = 1,
        BadFormat = 2,
        OutOfRange = 3,
        Oom = 4,
        Internal = 99,
    }

    impl OnPairStatus {
        /// Decode a raw status word. Any value that is not one of the known
        /// codes is reported as [`OnPairStatus::Internal`].
        pub fn from_raw(raw: u32) -> Self {
            match raw {
                0 => OnPairStatus::Ok,
                1 => OnPairStatus::InvalidArg,
                2 => OnPairStatus::BadFormat,
                3 => OnPairStatus::OutOfRange,
                4 => OnPairStatus::Oom,
                _ => OnPairStatus::Internal,
            }
        }
    }

    /// Training parameters passed by value to `onpair_column_compress`.
    ///
    /// Field order and types are part of the C ABI (`repr(C)`); do not
    /// reorder them.
    #[repr(C)]
    #[derive(Debug, Copy, Clone)]
    pub struct OnPairTrainingConfig {
        /// Bits per packed token code (12 in the "dict-12" preset).
        pub bits: u32,
        // NOTE(review): the meaning of `threshold` and `seed` is defined by
        // the C++ trainer and is not visible from this file.
        pub threshold: f64,
        pub seed: u64,
    }

    unsafe extern "C" {
        // --- Construction / (de)serialisation. Each writes a handle or a
        // buffer through an out-pointer and returns a raw status code.
        pub fn onpair_column_compress(
            bytes: *const u8,
            offsets: *const u64,
            n: usize,
            config: OnPairTrainingConfig,
            out_handle: *mut *mut OnPairColumnHandle,
        ) -> u32;

        pub fn onpair_column_deserialize(
            data: *const u8,
            len: usize,
            out_handle: *mut *mut OnPairColumnHandle,
        ) -> u32;

        pub fn onpair_column_serialize(
            handle: *const OnPairColumnHandle,
            out_data: *mut *mut u8,
            out_len: *mut usize,
        ) -> u32;

        // --- Release functions for handles and for buffers handed out by
        // `onpair_column_serialize`.
        pub fn onpair_column_free(handle: *mut OnPairColumnHandle);
        pub fn onpair_buffer_free(data: *mut u8, len: usize);

        // --- Plain accessors; these return the value directly rather than
        // a status code.
        pub fn onpair_column_len(handle: *const OnPairColumnHandle) -> usize;
        pub fn onpair_column_bits(handle: *const OnPairColumnHandle) -> u32;
        pub fn onpair_column_dict_size(handle: *const OnPairColumnHandle) -> usize;
        pub fn onpair_column_decompress_capacity(handle: *const OnPairColumnHandle) -> usize;
        pub fn onpair_column_dict_bytes(handle: *const OnPairColumnHandle) -> usize;

        // --- Per-row decode into a caller-provided buffer of
        // `out_capacity` bytes; the produced length goes to `out_len`.
        pub fn onpair_column_decompress(
            handle: *const OnPairColumnHandle,
            row_id: usize,
            out_buf: *mut u8,
            out_capacity: usize,
            out_len: *mut usize,
        ) -> u32;

        // --- Predicates. All three share one signature so the safe wrapper
        // can dispatch through a single function pointer. `out_bits` is a
        // caller-allocated bitmap (the wrapper sizes it at one bit per row).
        pub fn onpair_column_equals_into(
            handle: *const OnPairColumnHandle,
            needle: *const u8,
            needle_len: usize,
            out_bits: *mut u8,
        ) -> u32;

        pub fn onpair_column_starts_with_into(
            handle: *const OnPairColumnHandle,
            needle: *const u8,
            needle_len: usize,
            out_bits: *mut u8,
        ) -> u32;

        pub fn onpair_column_contains_into(
            handle: *const OnPairColumnHandle,
            needle: *const u8,
            needle_len: usize,
            out_bits: *mut u8,
        ) -> u32;

        // --- Dictionary / raw-layout access.
        pub fn onpair_column_dict_copy(
            handle: *const OnPairColumnHandle,
            out_bytes: *mut u8,
            bytes_capacity: usize,
            out_offsets: *mut u64,
        ) -> u32;

        pub fn onpair_column_parts(
            handle: *const OnPairColumnHandle,
            out_parts: *mut OnPairColumnParts,
        ) -> u32;
    }

    /// Pointer/length pairs filled in by `onpair_column_parts`. The safe
    /// wrapper turns each pair into a borrowed slice.
    ///
    /// Field order and types are part of the C ABI (`repr(C)`).
    #[repr(C)]
    #[derive(Debug, Copy, Clone)]
    pub struct OnPairColumnParts {
        pub dict_bytes: *const u8,
        pub dict_bytes_len: usize,
        pub dict_offsets: *const u32,
        pub dict_offsets_len: usize,
        pub codes_packed: *const u64,
        /// Length of `codes_packed` counted in `u64` words, not bytes.
        pub codes_packed_u64_len: usize,
        pub codes_boundaries: *const u32,
        pub codes_boundaries_len: usize,
        pub bits: u32,
        pub num_rows: usize,
    }
}

pub use ffi::*;

/// The "dict-12" preset: 12-bit packed token codes.
pub const DEFAULT_DICT12_CONFIG: OnPairTrainingConfig = OnPairTrainingConfig {
    bits: 12,
    threshold: 0.5,
    seed: 0,
};

/// Error type returned by the safe wrappers.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum Error { + InvalidArg, + BadFormat, + OutOfRange, + Oom, + Internal, +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let msg = match self { + Error::InvalidArg => "OnPair: invalid argument", + Error::BadFormat => "OnPair: bad serialized format", + Error::OutOfRange => "OnPair: row index out of range", + Error::Oom => "OnPair: out of memory or buffer too small", + Error::Internal => "OnPair: internal error", + }; + f.write_str(msg) + } +} + +impl std::error::Error for Error {} + +impl Error { + fn check(status: u32) -> Result<(), Self> { + match OnPairStatus::from_raw(status) { + OnPairStatus::Ok => Ok(()), + OnPairStatus::InvalidArg => Err(Error::InvalidArg), + OnPairStatus::BadFormat => Err(Error::BadFormat), + OnPairStatus::OutOfRange => Err(Error::OutOfRange), + OnPairStatus::Oom => Err(Error::Oom), + OnPairStatus::Internal => Err(Error::Internal), + } + } +} + +/// Owning handle around a `OnPairColumn`. Send + Sync because the C++ object +/// is immutable once constructed and the predicate methods are read-only. +pub struct Column { + handle: NonNull, +} + +unsafe impl Send for Column {} +unsafe impl Sync for Column {} + +impl Column { + /// Compress `n` byte strings described by a flat `bytes` blob and an + /// `offsets` array of length `n + 1`. + pub fn compress( + bytes: &[u8], + offsets: &[u64], + config: OnPairTrainingConfig, + ) -> Result { + if offsets.is_empty() || offsets.len() - 1 > offsets.len() { + return Err(Error::InvalidArg); + } + let n = offsets.len() - 1; + let mut out: *mut OnPairColumnHandle = std::ptr::null_mut(); + let status = unsafe { + onpair_column_compress(bytes.as_ptr(), offsets.as_ptr(), n, config, &raw mut out) + }; + Error::check(status)?; + let handle = NonNull::new(out).ok_or(Error::Internal)?; + Ok(Self { handle }) + } + + /// Reconstruct a column from a previously-serialised byte blob. 
+ pub fn from_bytes(data: &[u8]) -> Result { + let mut out: *mut OnPairColumnHandle = std::ptr::null_mut(); + let status = unsafe { onpair_column_deserialize(data.as_ptr(), data.len(), &raw mut out) }; + Error::check(status)?; + let handle = NonNull::new(out).ok_or(Error::Internal)?; + Ok(Self { handle }) + } + + pub fn to_bytes(&self) -> Result, Error> { + let mut data: *mut u8 = std::ptr::null_mut(); + let mut len: usize = 0; + let status = + unsafe { onpair_column_serialize(self.handle.as_ptr(), &raw mut data, &raw mut len) }; + Error::check(status)?; + let out = unsafe { std::slice::from_raw_parts(data, len) }.to_vec(); + unsafe { onpair_buffer_free(data, len) }; + Ok(out) + } + + pub fn len(&self) -> usize { + unsafe { onpair_column_len(self.handle.as_ptr()) } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn bits(&self) -> u32 { + unsafe { onpair_column_bits(self.handle.as_ptr()) } + } + + pub fn dict_size(&self) -> usize { + unsafe { onpair_column_dict_size(self.handle.as_ptr()) } + } + + pub fn max_decompress_capacity(&self) -> usize { + unsafe { onpair_column_decompress_capacity(self.handle.as_ptr()) } + } + + /// Decompress a single row, growing `out` as needed. + pub fn decompress_row(&self, row_id: usize, out: &mut Vec) -> Result<(), Error> { + let capacity = self.max_decompress_capacity().max(64); + out.clear(); + out.reserve(capacity); + let mut written: usize = 0; + let status = unsafe { + onpair_column_decompress( + self.handle.as_ptr(), + row_id, + out.as_mut_ptr(), + out.capacity(), + &raw mut written, + ) + }; + Error::check(status)?; + unsafe { out.set_len(written) }; + Ok(()) + } + + pub fn dict_bytes(&self) -> usize { + unsafe { onpair_column_dict_bytes(self.handle.as_ptr()) } + } + + /// Materialise the dictionary as `(bytes, offsets)`. `offsets` has length + /// `dict_size + 1`. 
+ pub fn dict(&self) -> Result<(Vec, Vec), Error> { + let dict_size = self.dict_size(); + let bytes_len = self.dict_bytes(); + let mut bytes = vec![0u8; bytes_len]; + let mut offsets = vec![0u64; dict_size + 1]; + let status = unsafe { + onpair_column_dict_copy( + self.handle.as_ptr(), + bytes.as_mut_ptr(), + bytes.len(), + offsets.as_mut_ptr(), + ) + }; + Error::check(status)?; + Ok((bytes, offsets)) + } + + fn run_predicate( + &self, + f: unsafe extern "C" fn(*const OnPairColumnHandle, *const u8, usize, *mut u8) -> u32, + needle: &[u8], + ) -> Result, Error> { + let n = self.len(); + let mut bits = vec![0u8; n.div_ceil(8)]; + let status = unsafe { + f( + self.handle.as_ptr(), + needle.as_ptr(), + needle.len(), + bits.as_mut_ptr(), + ) + }; + Error::check(status)?; + Ok(bits) + } + + pub fn equals_bitmap(&self, needle: &[u8]) -> Result, Error> { + self.run_predicate(onpair_column_equals_into, needle) + } + + pub fn starts_with_bitmap(&self, needle: &[u8]) -> Result, Error> { + self.run_predicate(onpair_column_starts_with_into, needle) + } + + pub fn contains_bitmap(&self, needle: &[u8]) -> Result, Error> { + self.run_predicate(onpair_column_contains_into, needle) + } + + /// Raw handle exposed for higher-level wrappers that need to pass the + /// pointer to their own FFI calls. + /// + /// # Safety + /// + /// The returned pointer is owned by `self`; callers must not free it, + /// must not dereference it through any FFI other than the `onpair_*` + /// functions, and must not let it outlive this [`Column`]. + pub unsafe fn raw(&self) -> *const c_void { + self.handle.as_ptr() as *const c_void + } +} + +impl Column { + /// Borrow the column's raw decomposition: dictionary, bit-packed token + /// stream, and per-row boundaries. The returned pointers reference memory + /// owned by `self` and remain valid for as long as the column does. 
+ pub fn parts(&self) -> Result, Error> { + let mut raw = OnPairColumnParts { + dict_bytes: std::ptr::null(), + dict_bytes_len: 0, + dict_offsets: std::ptr::null(), + dict_offsets_len: 0, + codes_packed: std::ptr::null(), + codes_packed_u64_len: 0, + codes_boundaries: std::ptr::null(), + codes_boundaries_len: 0, + bits: 0, + num_rows: 0, + }; + let status = unsafe { onpair_column_parts(self.handle.as_ptr(), &raw mut raw) }; + Error::check(status)?; + // SAFETY: the C side returns pointers into vectors owned by `self` + // (the underlying `OnPairColumn`); they remain valid for `&self`. + Ok(unsafe { Parts::from_raw(raw) }) + } +} + +impl Drop for Column { + fn drop(&mut self) { + unsafe { onpair_column_free(self.handle.as_ptr()) } + } +} + +/// Borrowed view over a column's raw arrays. See [`Column::parts`]. +#[derive(Copy, Clone)] +pub struct Parts<'a> { + /// Concatenated dictionary entry bytes (unpadded). + pub dict_bytes: &'a [u8], + /// Length `dict_size + 1`; entry `i` spans `dict_bytes[dict_offsets[i]..dict_offsets[i + 1]]`. + pub dict_offsets: &'a [u32], + /// LSB-first bit-packed token stream, packed `bits` bits per token. + pub codes_packed: &'a [u64], + /// Length `num_rows + 1`; row `r` spans tokens `codes_boundaries[r]..codes_boundaries[r + 1]`. + pub codes_boundaries: &'a [u32], + /// Bits per token (9..=16). + pub bits: u32, + pub num_rows: usize, +} + +impl<'a> Parts<'a> { + /// # Safety + /// Caller must guarantee the pointers in `raw` are valid for `'a`. 
+ unsafe fn from_raw(raw: OnPairColumnParts) -> Self { + unsafe { + Self { + dict_bytes: slice_or_empty(raw.dict_bytes, raw.dict_bytes_len), + dict_offsets: slice_or_empty(raw.dict_offsets, raw.dict_offsets_len), + codes_packed: slice_or_empty(raw.codes_packed, raw.codes_packed_u64_len), + codes_boundaries: slice_or_empty(raw.codes_boundaries, raw.codes_boundaries_len), + bits: raw.bits, + num_rows: raw.num_rows, + } + } + } +} + +#[inline] +unsafe fn slice_or_empty<'a, T>(ptr: *const T, len: usize) -> &'a [T] { + if ptr.is_null() || len == 0 { + &[] + } else { + unsafe { std::slice::from_raw_parts(ptr, len) } + } +} + +/// Read `bits` (1..=16) bits from `packed` starting at LSB-first bit position +/// `bit_pos`. Matches OnPair's `BitWriter` layout. +#[inline] +pub fn read_bits_lsb(packed: &[u64], bit_pos: usize, bits: u32) -> u16 { + debug_assert!((1..=16).contains(&bits)); + let word_idx = bit_pos / 64; + // SAFETY of cast: `bit_pos % 64` is always in `0..64`, which fits in u32. + #[allow(clippy::cast_possible_truncation)] + let bit_off = (bit_pos % 64) as u32; + let mask: u64 = (1u64 << bits) - 1; + let low = packed[word_idx] >> bit_off; + let combined = if bit_off + bits <= 64 { + low & mask + } else { + let high = packed[word_idx + 1] << (64 - bit_off); + (low | high) & mask + }; + // SAFETY of cast: `combined` has been masked to at most `bits` (<=16) bits. + #[allow(clippy::cast_possible_truncation)] + let value = combined as u16; + value +} + +/// Decompress an LSB-first bit-packed token stream into a flat `Vec`, +/// one element per token. Each `u16` only uses its low `bits` bits. 
+pub fn unpack_codes_to_u16(packed: &[u64], total_tokens: usize, bits: u32) -> Vec { + assert!((9..=16).contains(&bits), "bits must be in [9, 16]"); + let mut out = Vec::with_capacity(total_tokens); + for t in 0..total_tokens { + out.push(read_bits_lsb(packed, t * bits as usize, bits)); + } + out +} diff --git a/encodings/experimental/onpair/Cargo.toml b/encodings/experimental/onpair/Cargo.toml new file mode 100644 index 00000000000..65eb4671291 --- /dev/null +++ b/encodings/experimental/onpair/Cargo.toml @@ -0,0 +1,39 @@ +[package] +name = "vortex-onpair" +authors = { workspace = true } +categories = { workspace = true } +description = "Vortex OnPair string array encoding (dict-12, pushdown predicates)" +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +readme = "README.md" +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] +memchr = { workspace = true } +prost = { workspace = true } +vortex-array = { workspace = true } +vortex-buffer = { workspace = true } +vortex-error = { workspace = true } +vortex-mask = { workspace = true } +vortex-onpair-sys = { workspace = true } +vortex-session = { workspace = true } + +[features] +_test-harness = ["vortex-array/_test-harness"] + +[dev-dependencies] +divan = { workspace = true } +rstest = { workspace = true } +vortex-array = { workspace = true, features = ["_test-harness"] } + +[[bench]] +name = "decode" +harness = false diff --git a/encodings/experimental/onpair/README.md b/encodings/experimental/onpair/README.md new file mode 100644 index 00000000000..21b92a0ae5b --- /dev/null +++ b/encodings/experimental/onpair/README.md @@ -0,0 +1,47 @@ +# Vortex OnPair + +A Vortex Encoding for Binary and Utf8 data that uses the +[OnPair][onpair] short-string compression algorithm. 
OnPair is a +dictionary-based encoder with fast per-row random access and +compressed-domain predicate evaluation. + +The C++ trainer / encoder lives in `vortex-onpair-sys`; this crate wraps +the resulting column as a Vortex array with cascading-compressor support +on every integer child. + +## LIKE Pushdown + +The OnPair encoding has a specialized LIKE fast path for a narrow subset +of patterns: + +- `prefix%` — matched in the compressed domain via a token-level + `PrefixAutomaton`. +- `%needle%` — pre-filtered by a per-token bloom (`dict_contains` / + `dict_could_extend`) before falling back to `memmem` on decoded rows. + +Unsupported shapes (`_`, `%suffix`, interior wildcards) fall back to +ordinary decompression-based LIKE evaluation. + +## Default Configuration + +The default training preset is **dict-12**: 12 bits per token, +dictionary capped at 4 096 entries. Token codes are stored as a +`PrimitiveArray<u16>`; downstream `FastLanes::BitPacking` losslessly +narrows the child to exactly `bits`-bit codes on disk. + +## Layout + +- Buffer 0 — `dict_bytes`: dictionary blob built by the C++ trainer, + padded with `MAX_TOKEN_SIZE` trailing zero bytes so the over-copy + decoder can read 16 bytes past the last token. +- Slot 0 — `dict_offsets`: `PrimitiveArray<u32>`, length `dict_size + 1`. +- Slot 1 — `codes`: `PrimitiveArray<u16>`, length `total_tokens`. +- Slot 2 — `codes_offsets`: `PrimitiveArray<u32>`, length `num_rows + 1`. +- Slot 3 — `uncompressed_lengths`: integer `PrimitiveArray`, length + `num_rows`. +- Slot 4 — optional validity child. + +All four integer slot children flow through the standard cascading +compressor pipeline (FoR / BitPacking / RunEnd / etc.).
+ +[onpair]: https://arxiv.org/abs/2508.02280 diff --git a/encodings/experimental/onpair/benches/decode.rs b/encodings/experimental/onpair/benches/decode.rs new file mode 100644 index 00000000000..2b2d766b276 --- /dev/null +++ b/encodings/experimental/onpair/benches/decode.rs @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Decode-path microbenchmarks for the OnPair Vortex array. +//! +//! * `decode_rows_unchecked` — the production decoder hot loop (combined +//! `(offset << 16) | length` table, fixed 16-byte over-copy, 4× unrolled). +//! Measured by hand-driving `DecodeView::decode_rows_unchecked` straight +//! into a `Vec<u8>` so the time reflects the inner loop only. +//! * `canonicalize_to_varbinview` — the full Vortex +//! `OnPair → VarBinViewArray` path callers actually hit. Includes +//! `OwnedDecodeInputs::collect`, the build_views step, allocation, etc. +//! +//! The decode benches sweep four corpus shapes at 100 000 rows, plus the +//! URL-log shape at 1 000 000 rows, to surface cache-pressure cliffs and +//! per-row decode cost. The compute-kernel benches use the URL-log shape at +//! both row counts.
+ +#![allow( + clippy::cast_possible_truncation, + clippy::cast_lossless, + clippy::panic, + clippy::tests_outside_test_module, + clippy::redundant_clone, + clippy::missing_safety_doc, + clippy::unwrap_used, + clippy::expect_used +)] + +use std::sync::LazyLock; + +use divan::Bencher; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar_fn::fns::binary::CompareKernel; +use vortex_array::scalar_fn::fns::like::LikeKernel; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::scalar_fn::fns::operators::CompareOperator; +use vortex_array::session::ArraySession; +use vortex_mask::Mask; +use vortex_onpair::DEFAULT_DICT12_CONFIG; +use vortex_onpair::MAX_TOKEN_SIZE; +use vortex_onpair::OnPair; +use vortex_onpair::OnPairArray; +use vortex_onpair::decode::OwnedDecodeInputs; +use vortex_onpair::onpair_compress; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +#[derive(Copy, Clone, Debug)] +enum Shape { + /// URL / HTTP-log shaped — high lexical overlap, ~35–45 bytes per row. + UrlLog, + /// Short uniform strings — 4–8 bytes per row, very low cardinality. + Short, + /// Long log-line shaped — ~120 bytes per row, more tokens per row. + Long, + /// High cardinality — every row unique. 
+ HighCard, +} + +fn corpus(n: usize, shape: Shape) -> Vec { + let mut state = 0x9e37_79b9_7f4a_7c15_u64; + let mut next = || { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + state + }; + let mut out = Vec::with_capacity(n); + match shape { + Shape::UrlLog => { + let templates: &[&str] = &[ + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "https://www.example.com/users/{id}/profile", + "INFO request_id={id} status=200 method=GET", + "WARN request_id={id} status=429 method=POST", + "ERROR request_id={id} status=500 method=PUT", + ]; + for _ in 0..n { + let s = next(); + let pick = (s as usize) % templates.len(); + let id = s as u32; + out.push(templates[pick].replace("{id}", &format!("{id:08x}"))); + } + } + Shape::Short => { + let templates: &[&str] = &["alpha", "beta", "gamma", "delta", "eps", "zeta", "eta"]; + for _ in 0..n { + let s = next(); + out.push(templates[(s as usize) % templates.len()].to_string()); + } + } + Shape::Long => { + let templates: &[&str] = &[ + "2026-05-14T12:34:56.789012Z INFO request_id={id} method=GET path=/api/v1/users/{id}/profile status=200", + "2026-05-14T12:34:56.789012Z WARN request_id={id} method=POST path=/api/v1/users/{id}/sessions status=429", + "2026-05-14T12:34:56.789012Z ERROR request_id={id} method=PUT path=/api/v1/users/{id}/settings status=500", + ]; + for _ in 0..n { + let s = next(); + let pick = (s as usize) % templates.len(); + let id = s as u32; + out.push(templates[pick].replace("{id}", &format!("{id:08x}"))); + } + } + Shape::HighCard => { + for i in 0..n { + out.push(format!("row-{i:010x}-{rand:016x}", rand = next())); + } + } + } + out +} + +fn compress(n: usize, shape: Shape) -> OnPairArray { + let strings = corpus(n, shape); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + onpair_compress(&varbin, 
varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG) + .unwrap_or_else(|e| panic!("onpair_compress failed: {e}")) +} + +fn materialise(arr: &OnPairArray) -> (OwnedDecodeInputs, usize, usize) { + let mut ctx = SESSION.create_execution_ctx(); + let inputs = OwnedDecodeInputs::collect(arr.as_view(), &mut ctx) + .unwrap_or_else(|e| panic!("collect: {e}")); + let n = arr.len(); + let total: usize = inputs + .codes + .as_slice() + .iter() + .map(|&c| (inputs.dict_table.as_slice()[c as usize] & 0xffff) as usize) + .sum(); + (inputs, n, total) +} + +const CASES: &[(Shape, usize)] = &[ + (Shape::UrlLog, 100_000), + (Shape::UrlLog, 1_000_000), + (Shape::Short, 100_000), + (Shape::Long, 100_000), + (Shape::HighCard, 100_000), +]; + +/// Raw decode loop time, excluding `OwnedDecodeInputs::collect` and the +/// output allocation. Hits `DecodeView::decode_rows_unchecked` directly. +#[divan::bench(args = CASES)] +fn decode_rows_unchecked(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + let (inputs, n_rows, total) = materialise(&arr); + bencher.bench_local(|| { + let mut out: Vec = Vec::with_capacity(total + MAX_TOKEN_SIZE); + let dv = inputs.view(); + unsafe { + let written = dv.decode_rows_unchecked(0, n_rows, out.as_mut_ptr()); + out.set_len(written); + } + divan::black_box(out); + }); +} + +/// Full Vortex canonicalisation, including `execute<>` on every child, +/// building the view buffer + `BinaryView` list, etc. 
+#[divan::bench(args = CASES)] +fn canonicalize_to_varbinview(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + bencher + .with_inputs(|| arr.clone().into_array()) + .bench_local_values(|arr| { + let mut ctx = SESSION.create_execution_ctx(); + divan::black_box( + arr.execute::(&mut ctx) + .unwrap_or_else(|e| panic!("canonicalize failed: {e}")), + ) + }); +} + +// ─── Compute kernels ───────────────────────────────────────────────────── + +const COMPUTE_CASES: &[(Shape, usize)] = &[(Shape::UrlLog, 100_000), (Shape::UrlLog, 1_000_000)]; + +/// `Eq` against a literal (token-aware fast path: no row decode, just +/// `&[u16]` comparison). +#[divan::bench(args = COMPUTE_CASES)] +fn eq_constant(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + let strings = corpus(n, shape); + // Pick the very first row's value as the needle so we always hit at + // least one match. + let needle = strings[0].clone(); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let result = ::compare( + arr.as_view(), + &ConstantArray::new(needle.as_str(), n).into_array(), + CompareOperator::Eq, + &mut ctx, + ) + .unwrap() + .unwrap(); + divan::black_box(result); + }); +} + +/// `LIKE 'prefix%'` — byte-streaming row prefix check. +#[divan::bench(args = COMPUTE_CASES)] +fn like_prefix(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let pattern = ConstantArray::new("https://www.%", n).into_array(); + let result = + ::like(arr.as_view(), &pattern, LikeOptions::default(), &mut ctx) + .unwrap() + .unwrap(); + divan::black_box(result); + }); +} + +/// `LIKE '%substring%'` — `memchr::memmem::Finder` over decoded rows. 
+#[divan::bench(args = COMPUTE_CASES)] +fn like_contains(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let pattern = ConstantArray::new("%example.com%", n).into_array(); + let result = + ::like(arr.as_view(), &pattern, LikeOptions::default(), &mut ctx) + .unwrap() + .unwrap(); + divan::black_box(result); + }); +} + +/// Filter — share-dict path. Builds a 1-in-7 mask so we keep ~14 % of +/// rows; the cost is dominated by the `codes` segment copy + offsets. +#[divan::bench(args = COMPUTE_CASES)] +fn filter_share_dict(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + let mask = Mask::from_iter((0..n).map(|i| i % 7 == 0)); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let result = ::filter(arr.as_view(), &mask, &mut ctx) + .unwrap() + .unwrap(); + divan::black_box(result); + }); +} + +fn main() { + divan::main(); +} diff --git a/encodings/experimental/onpair/goldenfiles/onpair.metadata b/encodings/experimental/onpair/goldenfiles/onpair.metadata new file mode 100644 index 00000000000..e96baf1a0ab --- /dev/null +++ b/encodings/experimental/onpair/goldenfiles/onpair.metadata @@ -0,0 +1 @@ + € €è(08 \ No newline at end of file diff --git a/encodings/experimental/onpair/public-api.lock b/encodings/experimental/onpair/public-api.lock new file mode 100644 index 00000000000..a97a759cba9 --- /dev/null +++ b/encodings/experimental/onpair/public-api.lock @@ -0,0 +1,263 @@ +pub mod vortex_onpair + +pub mod vortex_onpair::decode + +pub struct vortex_onpair::decode::DecodeView<'a> + +pub vortex_onpair::decode::DecodeView::codes: &'a [u16] + +pub vortex_onpair::decode::DecodeView::codes_offsets: &'a [u32] + +pub vortex_onpair::decode::DecodeView::dict_bytes: &'a [u8] + +pub vortex_onpair::decode::DecodeView::dict_table: &'a [u64] + +impl<'a> 
vortex_onpair::decode::DecodeView<'a> + +pub fn vortex_onpair::decode::DecodeView<'a>::decode_row_into(&self, usize, &mut alloc::vec::Vec) + +pub fn vortex_onpair::decode::DecodeView<'a>::decode_rows_into(&self, usize, usize, &mut alloc::vec::Vec) + +pub unsafe fn vortex_onpair::decode::DecodeView<'a>::decode_rows_into_with_size(&self, usize, usize, usize, &mut alloc::vec::Vec) + +pub unsafe fn vortex_onpair::decode::DecodeView<'a>::decode_rows_unchecked(&self, usize, usize, *mut u8) -> usize + +pub fn vortex_onpair::decode::DecodeView<'a>::decoded_len(&self, usize) -> usize + +pub fn vortex_onpair::decode::DecodeView<'a>::decoded_len_rows(&self, usize, usize) -> usize + +pub fn vortex_onpair::decode::DecodeView<'a>::for_each_dict_slice bool>(&self, usize, F) -> bool + +impl<'a> core::clone::Clone for vortex_onpair::decode::DecodeView<'a> + +pub fn vortex_onpair::decode::DecodeView<'a>::clone(&self) -> vortex_onpair::decode::DecodeView<'a> + +impl<'a> core::marker::Copy for vortex_onpair::decode::DecodeView<'a> + +pub struct vortex_onpair::decode::OwnedDecodeInputs + +pub vortex_onpair::decode::OwnedDecodeInputs::codes: vortex_buffer::buffer::Buffer + +pub vortex_onpair::decode::OwnedDecodeInputs::codes_offsets: vortex_buffer::buffer::Buffer + +pub vortex_onpair::decode::OwnedDecodeInputs::dict_bytes: vortex_buffer::ByteBuffer + +pub vortex_onpair::decode::OwnedDecodeInputs::dict_table: vortex_buffer::buffer::Buffer + +impl vortex_onpair::decode::OwnedDecodeInputs + +pub fn vortex_onpair::decode::OwnedDecodeInputs::collect(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_onpair::decode::OwnedDecodeInputs::view(&self) -> vortex_onpair::decode::DecodeView<'_> + +pub struct vortex_onpair::OnPair + +impl vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::try_new(vortex_array::dtype::DType, vortex_array::buffer::BufferHandle, 
vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::validity::Validity, u32) -> vortex_error::VortexResult + +impl core::clone::Clone for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::clone(&self) -> vortex_onpair::OnPair + +impl core::fmt::Debug for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl vortex_array::array::vtable::VTable for vortex_onpair::OnPair + +pub type vortex_onpair::OnPair::OperationsVTable = vortex_onpair::OnPair + +pub type vortex_onpair::OnPair::TypedArrayData = vortex_onpair::OnPairData + +pub type vortex_onpair::OnPair::ValidityVTable = vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::append_to_builder(vortex_array::array::view::ArrayView<'_, Self>, &mut dyn vortex_array::builders::ArrayBuilder, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> + +pub fn vortex_onpair::OnPair::buffer(vortex_array::array::view::ArrayView<'_, Self>, usize) -> vortex_array::buffer::BufferHandle + +pub fn vortex_onpair::OnPair::buffer_name(vortex_array::array::view::ArrayView<'_, Self>, usize) -> core::option::Option + +pub fn vortex_onpair::OnPair::deserialize(&self, &vortex_array::dtype::DType, usize, &[u8], &[vortex_array::buffer::BufferHandle], &dyn vortex_array::serde::ArrayChildren, &vortex_session::VortexSession) -> vortex_error::VortexResult> + +pub fn vortex_onpair::OnPair::execute(vortex_array::array::typed::Array, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_onpair::OnPair::execute_parent(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, usize, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +pub fn vortex_onpair::OnPair::id(&self) -> vortex_array::array::ArrayId + +pub fn 
vortex_onpair::OnPair::nbuffers(vortex_array::array::view::ArrayView<'_, Self>) -> usize + +pub fn vortex_onpair::OnPair::reduce_parent(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, usize) -> vortex_error::VortexResult> + +pub fn vortex_onpair::OnPair::serialize(vortex_array::array::view::ArrayView<'_, Self>, &vortex_session::VortexSession) -> vortex_error::VortexResult>> + +pub fn vortex_onpair::OnPair::slot_name(vortex_array::array::view::ArrayView<'_, Self>, usize) -> alloc::string::String + +pub fn vortex_onpair::OnPair::validate(&self, &Self::TypedArrayData, &vortex_array::dtype::DType, usize, &[core::option::Option]) -> vortex_error::VortexResult<()> + +impl vortex_array::array::vtable::operations::OperationsVTable for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::scalar_at(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>, usize, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +impl vortex_array::array::vtable::validity::ValidityVTable for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::validity(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>) -> vortex_error::VortexResult + +impl vortex_array::arrays::filter::kernel::FilterKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::filter(vortex_array::array::view::ArrayView<'_, Self>, &vortex_mask::Mask, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::arrays::slice::SliceReduce for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::slice(vortex_array::array::view::ArrayView<'_, Self>, core::ops::range::Range) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::binary::compare::CompareKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::compare(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::operators::CompareOperator, &mut 
vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::cast::kernel::CastKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::cast::kernel::CastReduce for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::like::kernel::LikeKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::like(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::like::LikeOptions, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +pub struct vortex_onpair::OnPairData + +impl vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::bits(&self) -> u32 + +pub fn vortex_onpair::OnPairData::dict_bytes(&self) -> &vortex_buffer::ByteBuffer + +pub fn vortex_onpair::OnPairData::dict_bytes_handle(&self) -> &vortex_array::buffer::BufferHandle + +pub fn vortex_onpair::OnPairData::is_empty(&self) -> bool + +pub fn vortex_onpair::OnPairData::len(&self) -> usize + +pub fn vortex_onpair::OnPairData::new(vortex_array::buffer::BufferHandle, u32, usize) -> Self + +impl core::clone::Clone for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::clone(&self) -> vortex_onpair::OnPairData + +impl core::fmt::Debug for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl vortex_array::hash::ArrayEq for vortex_onpair::OnPairData + +pub fn 
vortex_onpair::OnPairData::array_eq(&self, &Self, vortex_array::hash::Precision) -> bool + +impl vortex_array::hash::ArrayHash for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::array_hash(&self, &mut H, vortex_array::hash::Precision) + +pub struct vortex_onpair::OnPairMetadata + +pub vortex_onpair::OnPairMetadata::bits: u32 + +pub vortex_onpair::OnPairMetadata::codes_offsets_ptype: i32 + +pub vortex_onpair::OnPairMetadata::codes_ptype: i32 + +pub vortex_onpair::OnPairMetadata::dict_offsets_ptype: i32 + +pub vortex_onpair::OnPairMetadata::dict_size: u64 + +pub vortex_onpair::OnPairMetadata::total_tokens: u64 + +pub vortex_onpair::OnPairMetadata::uncompressed_lengths_ptype: i32 + +impl vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::codes_offsets_ptype(&self) -> vortex_array::dtype::ptype::PType + +pub fn vortex_onpair::OnPairMetadata::codes_ptype(&self) -> vortex_array::dtype::ptype::PType + +pub fn vortex_onpair::OnPairMetadata::dict_offsets_ptype(&self) -> vortex_array::dtype::ptype::PType + +pub fn vortex_onpair::OnPairMetadata::set_codes_offsets_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::set_codes_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::set_dict_offsets_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::set_uncompressed_lengths_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::uncompressed_lengths_ptype(&self) -> vortex_array::dtype::ptype::PType + +impl vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::get_uncompressed_lengths_ptype(&self) -> vortex_error::VortexResult + +impl core::clone::Clone for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::clone(&self) -> vortex_onpair::OnPairMetadata + +impl core::default::Default for vortex_onpair::OnPairMetadata + +pub fn 
vortex_onpair::OnPairMetadata::default() -> Self + +impl core::fmt::Debug for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl prost::message::Message for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::clear(&mut self) + +pub fn vortex_onpair::OnPairMetadata::encoded_len(&self) -> usize + +pub const vortex_onpair::DEFAULT_BITS: u32 + +pub const vortex_onpair::DEFAULT_DICT12_CONFIG: vortex_onpair_sys::ffi::OnPairTrainingConfig + +pub const vortex_onpair::MAX_TOKEN_SIZE: usize + +pub trait vortex_onpair::OnPairArrayExt: vortex_array::array::typed::TypedArrayRef + +pub fn vortex_onpair::OnPairArrayExt::array_validity(&self) -> vortex_array::validity::Validity + +pub fn vortex_onpair::OnPairArrayExt::codes(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::OnPairArrayExt::codes_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::OnPairArrayExt::dict_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::OnPairArrayExt::uncompressed_lengths(&self) -> &vortex_array::array::erased::ArrayRef + +impl> vortex_onpair::OnPairArrayExt for T + +pub fn T::array_validity(&self) -> vortex_array::validity::Validity + +pub fn T::codes(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn T::codes_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn T::dict_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn T::uncompressed_lengths(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::config_with_bits(u32) -> vortex_onpair_sys::ffi::OnPairTrainingConfig + +pub fn vortex_onpair::onpair_compress>(A, usize, &vortex_array::dtype::DType, vortex_onpair_sys::ffi::OnPairTrainingConfig) -> vortex_error::VortexResult + +pub fn vortex_onpair::onpair_compress_array(&vortex_array::array::erased::ArrayRef, vortex_onpair_sys::ffi::OnPairTrainingConfig, 
&mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_onpair::onpair_compress_array_default(&vortex_array::array::erased::ArrayRef, vortex_onpair_sys::ffi::OnPairTrainingConfig) -> vortex_error::VortexResult + +pub fn vortex_onpair::onpair_compress_iter<'a, I>(I, usize, vortex_array::dtype::DType, vortex_onpair_sys::ffi::OnPairTrainingConfig) -> vortex_error::VortexResult where I: core::iter::traits::iterator::Iterator> + +pub type vortex_onpair::OnPairArray = vortex_array::array::typed::Array diff --git a/encodings/experimental/onpair/src/array.rs b/encodings/experimental/onpair/src/array.rs new file mode 100644 index 00000000000..e95c0e15b3c --- /dev/null +++ b/encodings/experimental/onpair/src/array.rs @@ -0,0 +1,565 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt::Debug; +use std::fmt::Display; +use std::fmt::Formatter; +use std::hash::Hasher; + +use prost::Message as _; +use vortex_array::Array; +use vortex_array::ArrayEq; +use vortex_array::ArrayHash; +use vortex_array::ArrayId; +use vortex_array::ArrayParts; +use vortex_array::ArrayRef; +use vortex_array::ArraySlots; +use vortex_array::ArrayView; +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::ExecutionResult; +use vortex_array::IntoArray; +use vortex_array::Precision; +use vortex_array::TypedArrayRef; +use vortex_array::buffer::BufferHandle; +use vortex_array::builders::ArrayBuilder; +use vortex_array::builders::VarBinViewBuilder; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::serde::ArrayChildren; +use vortex_array::smallvec::smallvec; +use vortex_array::validity::Validity; +use vortex_array::vtable::VTable; +use vortex_array::vtable::ValidityVTable; +use vortex_array::vtable::child_to_validity; +use vortex_array::vtable::validity_to_child; +use vortex_buffer::ByteBuffer; +use 
vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; +use vortex_error::vortex_panic; +use vortex_session::VortexSession; +use vortex_session::registry::CachedId; + +use crate::canonical::canonicalize_onpair; +use crate::canonical::onpair_decode_views; +use crate::kernel::PARENT_KERNELS; +use crate::rules::RULES; + +/// An [`OnPair`]-encoded Vortex array. +pub type OnPairArray = Array; + +/// Default bits-per-token preset used by [`crate::onpair_compress`]: 12-bit +/// codes, dictionary capped at 4 096 entries. +pub const DEFAULT_BITS: u32 = 12; + +/// Wire-format metadata persisted alongside the OnPair buffer + slot children. +/// +/// On disk the layout is FSST-shape: +/// +/// * Buffer 0 — `dict_bytes`: the dictionary blob built by the C++ trainer, +/// padded with [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE] trailing zero +/// bytes so the over-copy decoder can read 16 bytes past the last token. +/// * Slot 0 — `dict_offsets`: `PrimitiveArray`, len `dict_size + 1`. +/// * Slot 1 — `codes`: `PrimitiveArray`. Each value only uses its low +/// `bits` bits; downstream `FastLanes::BitPacking` losslessly shrinks +/// the child to exactly `bits`-bit codes on disk. +/// * Slot 2 — `codes_offsets`: `PrimitiveArray`, len `num_rows + 1`. +/// FoR / RunEnd / etc. apply naturally via the cascading compressor. +/// * Slot 3 — `uncompressed_lengths`: integer `PrimitiveArray`, len +/// `num_rows`. Used to size the canonical output buffer. +/// * Slot 4 — optional validity child. +/// +/// All three integer slot children flow through the standard +/// `compress_child` pipeline (see `vortex-btrblocks::schemes::string:: +/// OnPairScheme`), so any encoding registered with the compressor can +/// re-encode them — exactly the same shape as FSST's `codes` `VarBinArray`. +#[derive(Clone, prost::Message)] +pub struct OnPairMetadata { + /// Width of the per-row primitive `uncompressed_lengths` child. 
+ #[prost(enumeration = "PType", tag = "1")] + pub uncompressed_lengths_ptype: i32, + /// Bits-per-token the column was compressed with (9..=16). Every value + /// in the `codes` child only uses its low `bits` bits. + #[prost(uint32, tag = "2")] + pub bits: u32, + /// Number of dictionary tokens. `dict_offsets` has length `dict_size + 1`. + #[prost(uint64, tag = "3")] + pub dict_size: u64, + /// Total number of tokens across all rows. `codes` has this length; + /// `codes_offsets.last() == total_tokens`. + #[prost(uint64, tag = "4")] + pub total_tokens: u64, + /// PType of the `dict_offsets` slot child (defaults to U32, may be + /// narrowed to U16/U8 by the cascading compressor when values fit). + #[prost(enumeration = "PType", tag = "5")] + pub dict_offsets_ptype: i32, + /// PType of the `codes` slot child (typically U16, may be narrowed to U8 + /// when `bits <= 8`). + #[prost(enumeration = "PType", tag = "6")] + pub codes_ptype: i32, + /// PType of the `codes_offsets` slot child. + #[prost(enumeration = "PType", tag = "7")] + pub codes_offsets_ptype: i32, +} + +impl OnPairMetadata { + pub fn get_uncompressed_lengths_ptype(&self) -> VortexResult { + PType::try_from(self.uncompressed_lengths_ptype) + .map_err(|_| vortex_err!("Invalid PType {}", self.uncompressed_lengths_ptype)) + } +} + +/// Slot indices on the outer [`Array`]. +pub(crate) const DICT_OFFSETS_SLOT: usize = 0; +pub(crate) const CODES_SLOT: usize = 1; +pub(crate) const CODES_OFFSETS_SLOT: usize = 2; +pub(crate) const UNCOMPRESSED_LENGTHS_SLOT: usize = 3; +pub(crate) const VALIDITY_SLOT: usize = 4; +pub(crate) const NUM_SLOTS: usize = 5; +pub(crate) const SLOT_NAMES: [&str; NUM_SLOTS] = [ + "dict_offsets", + "codes", + "codes_offsets", + "uncompressed_lengths", + "validity", +]; + +/// Inner data for an OnPair-encoded array. +/// +/// Holds only the dictionary blob (buffer 0). 
Every other piece — +/// `dict_offsets`, the per-token `codes`, the per-row `codes_offsets`, the +/// per-row `uncompressed_lengths`, and the optional validity child — is a +/// Vortex slot child so it can be re-encoded by the cascading compressor. +#[derive(Clone)] +pub struct OnPairData { + dict_bytes: BufferHandle, + bits: u32, + len: usize, +} + +impl OnPairData { + pub fn new(dict_bytes: BufferHandle, bits: u32, len: usize) -> Self { + Self { + dict_bytes, + bits, + len, + } + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + pub fn bits(&self) -> u32 { + self.bits + } + + pub fn dict_bytes(&self) -> &ByteBuffer { + self.dict_bytes.as_host() + } + + pub fn dict_bytes_handle(&self) -> &BufferHandle { + &self.dict_bytes + } +} + +impl Display for OnPairData { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "len: {}, bits: {}, dict_bytes_len: {}", + self.len, + self.bits, + self.dict_bytes.len() + ) + } +} + +impl Debug for OnPairData { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OnPairData") + .field("len", &self.len) + .field("bits", &self.bits) + .field("dict_bytes_len", &self.dict_bytes.len()) + .finish() + } +} + +impl ArrayHash for OnPairData { + fn array_hash(&self, state: &mut H, precision: Precision) { + self.dict_bytes.as_host().array_hash(state, precision); + state.write_u32(self.bits); + } +} + +impl ArrayEq for OnPairData { + fn array_eq(&self, other: &Self, precision: Precision) -> bool { + self.bits == other.bits + && self + .dict_bytes + .as_host() + .array_eq(other.dict_bytes.as_host(), precision) + } +} + +/// Zero-sized VTable marker for the OnPair encoding. +#[derive(Clone, Debug)] +pub struct OnPair; + +impl OnPair { + /// Build an [`OnPairArray`] from already-materialised parts. 
+ #[expect(clippy::too_many_arguments, reason = "every child is a real input")] + pub fn try_new( + dtype: DType, + dict_bytes: BufferHandle, + dict_offsets: ArrayRef, + codes: ArrayRef, + codes_offsets: ArrayRef, + uncompressed_lengths: ArrayRef, + validity: Validity, + bits: u32, + ) -> VortexResult { + validate_parts( + &dtype, + &dict_offsets, + &codes, + &codes_offsets, + &uncompressed_lengths, + bits, + )?; + let len = uncompressed_lengths.len(); + let data = OnPairData::new(dict_bytes, bits, len); + let slots: ArraySlots = smallvec![ + Some(dict_offsets), + Some(codes), + Some(codes_offsets), + Some(uncompressed_lengths), + validity_to_child(&validity, len), + ]; + Ok(unsafe { + Array::from_parts_unchecked(ArrayParts::new(OnPair, dtype, len, data).with_slots(slots)) + }) + } + + #[expect(clippy::too_many_arguments, reason = "every child is a real input")] + pub(crate) unsafe fn new_unchecked( + dtype: DType, + dict_bytes: BufferHandle, + dict_offsets: ArrayRef, + codes: ArrayRef, + codes_offsets: ArrayRef, + uncompressed_lengths: ArrayRef, + validity: Validity, + bits: u32, + ) -> OnPairArray { + let len = uncompressed_lengths.len(); + let data = OnPairData::new(dict_bytes, bits, len); + let slots: ArraySlots = smallvec![ + Some(dict_offsets), + Some(codes), + Some(codes_offsets), + Some(uncompressed_lengths), + validity_to_child(&validity, len), + ]; + unsafe { + Array::from_parts_unchecked(ArrayParts::new(OnPair, dtype, len, data).with_slots(slots)) + } + } +} + +fn validate_parts( + dtype: &DType, + dict_offsets: &ArrayRef, + codes: &ArrayRef, + codes_offsets: &ArrayRef, + uncompressed_lengths: &ArrayRef, + bits: u32, +) -> VortexResult<()> { + vortex_ensure!( + matches!(dtype, DType::Binary(_) | DType::Utf8(_)), + "OnPair arrays must be Binary or Utf8, found {dtype}" + ); + vortex_ensure!((9..=16).contains(&bits), "bits {bits} out of range [9, 16]"); + + if !dict_offsets.dtype().is_int() || dict_offsets.dtype().is_nullable() { + 
vortex_bail!(InvalidArgument: "dict_offsets must be non-nullable integer"); + } + if !codes.dtype().is_int() || codes.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "codes must be non-nullable integer"); + } + if !codes_offsets.dtype().is_int() || codes_offsets.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "codes_offsets must be non-nullable integer"); + } + if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "uncompressed_lengths must be non-nullable integer"); + } + if codes_offsets.len() != uncompressed_lengths.len() + 1 { + vortex_bail!(InvalidArgument: + "codes_offsets.len ({}) != uncompressed_lengths.len + 1 ({})", + codes_offsets.len(), + uncompressed_lengths.len() + 1 + ); + } + Ok(()) +} + +impl VTable for OnPair { + type TypedArrayData = OnPairData; + type OperationsVTable = Self; + type ValidityVTable = Self; + + fn id(&self) -> ArrayId { + static ID: CachedId = CachedId::new("vortex.onpair"); + *ID + } + + fn validate( + &self, + data: &Self::TypedArrayData, + dtype: &DType, + len: usize, + slots: &[Option], + ) -> VortexResult<()> { + let dict_offsets = slots[DICT_OFFSETS_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray dict_offsets slot missing"))?; + let codes = slots[CODES_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray codes slot missing"))?; + let codes_offsets = slots[CODES_OFFSETS_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray codes_offsets slot missing"))?; + let uncompressed_lengths = slots[UNCOMPRESSED_LENGTHS_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray uncompressed_lengths slot missing"))?; + validate_parts( + dtype, + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + data.bits, + )?; + if uncompressed_lengths.len() != len { + vortex_bail!(InvalidArgument: "uncompressed_lengths must have same len as outer array"); + } + if data.len != len { + vortex_bail!(InvalidArgument: "OnPairData len 
{} != outer len {}", data.len, len); + } + Ok(()) + } + + fn nbuffers(_array: ArrayView<'_, Self>) -> usize { + 1 + } + + fn buffer(array: ArrayView<'_, Self>, idx: usize) -> BufferHandle { + match idx { + 0 => array.dict_bytes_handle().clone(), + _ => vortex_panic!("OnPairArray buffer index {idx} out of bounds"), + } + } + + fn buffer_name(_array: ArrayView<'_, Self>, idx: usize) -> Option { + match idx { + 0 => Some("dict_bytes".to_string()), + _ => vortex_panic!("OnPairArray buffer_name index {idx} out of bounds"), + } + } + + fn serialize( + array: ArrayView<'_, Self>, + _session: &VortexSession, + ) -> VortexResult>> { + let dict_size = array.dict_offsets().len().saturating_sub(1) as u64; + let total_tokens = array.codes().len() as u64; + Ok(Some( + OnPairMetadata { + uncompressed_lengths_ptype: array.uncompressed_lengths().dtype().as_ptype().into(), + bits: array.bits(), + dict_size, + total_tokens, + dict_offsets_ptype: array.dict_offsets().dtype().as_ptype().into(), + codes_ptype: array.codes().dtype().as_ptype().into(), + codes_offsets_ptype: array.codes_offsets().dtype().as_ptype().into(), + } + .encode_to_vec(), + )) + } + + fn deserialize( + &self, + dtype: &DType, + len: usize, + metadata: &[u8], + buffers: &[BufferHandle], + children: &dyn ArrayChildren, + _session: &VortexSession, + ) -> VortexResult> { + if buffers.len() != 1 { + vortex_bail!(InvalidArgument: "Expected 1 buffer, got {}", buffers.len()); + } + let metadata = OnPairMetadata::decode(metadata)?; + let uncompressed_ptype = metadata.get_uncompressed_lengths_ptype()?; + + // Slot children. We pass `usize::MAX` for slots whose length we + // don't know up front (`dict_offsets` and `codes`). `codes_offsets` + // has known length `len + 1`. 
+ let dict_offsets_len = usize::try_from(metadata.dict_size + 1) + .map_err(|_| vortex_err!("dict_size {} overflows usize", metadata.dict_size))?; + let total_tokens = usize::try_from(metadata.total_tokens) + .map_err(|_| vortex_err!("total_tokens {} overflows usize", metadata.total_tokens))?; + // The cascading compressor may have narrowed any of these integer + // children to a tighter ptype; the recorded ptype tells the framework + // exactly which dtype to materialise as. + let dict_offsets_ptype = PType::try_from(metadata.dict_offsets_ptype).map_err(|_| { + vortex_err!("invalid dict_offsets_ptype {}", metadata.dict_offsets_ptype) + })?; + let codes_ptype = PType::try_from(metadata.codes_ptype) + .map_err(|_| vortex_err!("invalid codes_ptype {}", metadata.codes_ptype))?; + let codes_offsets_ptype = PType::try_from(metadata.codes_offsets_ptype).map_err(|_| { + vortex_err!( + "invalid codes_offsets_ptype {}", + metadata.codes_offsets_ptype + ) + })?; + let dict_offsets = children.get( + 0, + &DType::Primitive(dict_offsets_ptype, Nullability::NonNullable), + dict_offsets_len, + )?; + let codes = children.get( + 1, + &DType::Primitive(codes_ptype, Nullability::NonNullable), + total_tokens, + )?; + let codes_offsets = children.get( + 2, + &DType::Primitive(codes_offsets_ptype, Nullability::NonNullable), + len + 1, + )?; + let uncompressed_lengths = children.get( + 3, + &DType::Primitive(uncompressed_ptype, Nullability::NonNullable), + len, + )?; + let validity = match children.len() { + 4 => Validity::from(dtype.nullability()), + 5 => Validity::Array(children.get(4, &Validity::DTYPE, len)?), + other => vortex_bail!(InvalidArgument: "Expected 4 or 5 children, got {other}"), + }; + + let data = OnPairData::new(buffers[0].clone(), metadata.bits, len); + let slots: ArraySlots = smallvec![ + Some(dict_offsets), + Some(codes), + Some(codes_offsets), + Some(uncompressed_lengths), + validity_to_child(&validity, len), + ]; + Ok(ArrayParts::new(self.clone(), dtype.clone(), 
len, data).with_slots(slots)) + } + + fn slot_name(_array: ArrayView<'_, Self>, idx: usize) -> String { + SLOT_NAMES[idx].to_string() + } + + fn execute(array: Array, ctx: &mut ExecutionCtx) -> VortexResult { + canonicalize_onpair(array.as_view(), ctx).map(ExecutionResult::done) + } + + fn append_to_builder( + array: ArrayView<'_, Self>, + builder: &mut dyn ArrayBuilder, + ctx: &mut ExecutionCtx, + ) -> VortexResult<()> { + let Some(builder) = builder.as_any_mut().downcast_mut::() else { + builder.extend_from_array( + &array + .array() + .clone() + .execute::(ctx)? + .into_array(), + ); + return Ok(()); + }; + + let next_buffer_index = builder.completed_block_count() + u32::from(builder.in_progress()); + let (buffers, views) = onpair_decode_views(array, next_buffer_index, ctx)?; + builder.push_buffer_and_adjusted_views( + &buffers, + &views, + array + .array() + .validity()? + .execute_mask(array.array().len(), ctx)?, + ); + Ok(()) + } + + fn execute_parent( + array: ArrayView<'_, Self>, + parent: &ArrayRef, + child_idx: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + PARENT_KERNELS.execute(array, parent, child_idx, ctx) + } + + fn reduce_parent( + array: ArrayView<'_, Self>, + parent: &ArrayRef, + child_idx: usize, + ) -> VortexResult> { + RULES.evaluate(array, parent, child_idx) + } +} + +impl ValidityVTable for OnPair { + fn validity(array: ArrayView<'_, OnPair>) -> VortexResult { + Ok(child_to_validity( + array.slots()[VALIDITY_SLOT].as_ref(), + array.dtype().nullability(), + )) + } +} + +/// Convenience extension trait. Slot accessors live here; methods reachable +/// through `OnPairData` flow via the `ArrayView -> Deref` chain. 
+pub trait OnPairArrayExt: TypedArrayRef { + fn dict_offsets(&self) -> &ArrayRef { + self.as_ref().slots()[DICT_OFFSETS_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray dict_offsets slot missing")) + } + fn codes(&self) -> &ArrayRef { + self.as_ref().slots()[CODES_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray codes slot missing")) + } + fn codes_offsets(&self) -> &ArrayRef { + self.as_ref().slots()[CODES_OFFSETS_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray codes_offsets slot missing")) + } + fn uncompressed_lengths(&self) -> &ArrayRef { + self.as_ref().slots()[UNCOMPRESSED_LENGTHS_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray uncompressed_lengths slot missing")) + } + fn array_validity(&self) -> Validity { + child_to_validity( + self.as_ref().slots()[VALIDITY_SLOT].as_ref(), + self.as_ref().dtype().nullability(), + ) + } +} + +impl> OnPairArrayExt for T {} diff --git a/encodings/experimental/onpair/src/canonical.rs b/encodings/experimental/onpair/src/canonical.rs new file mode 100644 index 00000000000..368c5ab0b7a --- /dev/null +++ b/encodings/experimental/onpair/src/canonical.rs @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Convert an [`OnPairArray`] to its canonical `VarBinViewArray` by running +//! the pure-Rust dictionary-lookup decoder over every row. 
+ +use std::sync::Arc; + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::varbinview::build_views::BinaryView; +use vortex_array::arrays::varbinview::build_views::MAX_BUFFER_LEN; +use vortex_array::arrays::varbinview::build_views::build_views; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::Buffer; +use vortex_buffer::ByteBuffer; +use vortex_buffer::ByteBufferMut; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; +use crate::decode::OwnedDecodeInputs; + +pub(super) fn canonicalize_onpair( + array: ArrayView<'_, OnPair>, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let (buffers, views) = onpair_decode_views(array, 0, ctx)?; + let validity = array.array().validity()?; + Ok(unsafe { + VarBinViewArray::new_unchecked(views, Arc::from(buffers), array.dtype().clone(), validity) + .into_array() + }) +} + +pub(crate) fn onpair_decode_views( + array: ArrayView<'_, OnPair>, + start_buf_index: u32, + ctx: &mut ExecutionCtx, +) -> VortexResult<(Vec, Buffer)> { + let n = array.array().len(); + let lengths = array + .uncompressed_lengths() + .clone() + .execute::(ctx)?; + + #[expect(clippy::cast_possible_truncation)] + let total_size: usize = match_each_integer_ptype!(lengths.ptype(), |P| { + lengths.as_slice::

().iter().map(|x| *x as usize).sum() + }); + + let inputs = OwnedDecodeInputs::collect(array, ctx)?; + let dv = inputs.view(); + // Decode directly into the canonical output buffer's spare capacity — + // no temporary `Vec` + `extend_from_slice` round-trip. Total size + // is already known from `uncompressed_lengths`, so we can size the + // buffer once with the over-copy slack and call into the unchecked + // single-pass decoder. + let mut out_bytes = ByteBufferMut::with_capacity(total_size + crate::MAX_TOKEN_SIZE); + // SAFETY: + // * `out_bytes` reserved at least `total_size + MAX_TOKEN_SIZE` bytes + // above; `decode_rows_unchecked` may over-copy up to MAX_TOKEN_SIZE + // bytes past the true end, all within reserved capacity. + // * Caller has verified the array's invariants in `OnPair::try_new`, + // so every code is a valid index and `dict_bytes` is padded. + unsafe { + let dst = out_bytes.spare_capacity_mut().as_mut_ptr().cast::(); + let written = dv.decode_rows_unchecked(0, n, dst); + debug_assert_eq!(written, total_size); + out_bytes.set_len(written); + } + + match_each_integer_ptype!(lengths.ptype(), |P| { + Ok(build_views( + start_buf_index, + MAX_BUFFER_LEN, + out_bytes, + lengths.as_slice::

(), + )) + }) +} diff --git a/encodings/experimental/onpair/src/compress.rs b/encodings/experimental/onpair/src/compress.rs new file mode 100644 index 00000000000..1f9c876265a --- /dev/null +++ b/encodings/experimental/onpair/src/compress.rs @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Train + compress entry points for the OnPair encoding. + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::buffer::BufferHandle; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_onpair_sys::Column; +use vortex_onpair_sys::OnPairTrainingConfig; +use vortex_onpair_sys::unpack_codes_to_u16; + +use crate::OnPair; +use crate::OnPairArray; + +/// Default OnPair training configuration: 12-bit codes ("dict-12"). +pub const DEFAULT_DICT12_CONFIG: OnPairTrainingConfig = vortex_onpair_sys::DEFAULT_DICT12_CONFIG; + +/// Build a training config with a custom bit width. +pub fn config_with_bits(bits: u32) -> OnPairTrainingConfig { + OnPairTrainingConfig { + bits, + threshold: 0.5, + seed: 0, + } +} + +/// Compress an iterable of optional byte strings via the OnPair C++ library. 
+pub fn onpair_compress_iter<'a, I>( + iter: I, + len: usize, + dtype: DType, + config: OnPairTrainingConfig, +) -> VortexResult +where + I: Iterator>, +{ + let mut flat: Vec = Vec::with_capacity(len * 16); + let mut offsets: Vec = Vec::with_capacity(len + 1); + let mut uncompressed_lengths: BufferMut = BufferMut::with_capacity(len); + let mut validity_bits: Vec = Vec::with_capacity(len); + offsets.push(0); + + for item in iter { + match item { + Some(bytes) => { + flat.extend_from_slice(bytes); + offsets.push(flat.len() as u64); + uncompressed_lengths.push( + i32::try_from(bytes.len()).vortex_expect("string length must fit in i32"), + ); + validity_bits.push(true); + } + None => { + offsets.push(flat.len() as u64); + uncompressed_lengths.push(0); + validity_bits.push(false); + } + } + } + + let column = Column::compress(&flat, &offsets, config) + .map_err(|e| vortex_err!("OnPair compress failed: {e}"))?; + let (bits, dict_bytes, dict_offsets, codes, codes_offsets) = parts_to_children(&column)?; + drop(column); + + let uncompressed_lengths = uncompressed_lengths.into_array(); + let validity = match dtype.nullability() { + Nullability::NonNullable => Validity::NonNullable, + Nullability::Nullable => Validity::from_iter(validity_bits), + }; + + OnPair::try_new( + dtype, + dict_bytes, + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + validity, + bits, + ) +} + +/// Borrow the raw C++ parts and lift them into Vortex children + the dict buffer. +/// Returns `(bits, dict_bytes_buffer, dict_offsets_child, codes_child, codes_offsets_child)`. +fn parts_to_children( + column: &Column, +) -> VortexResult<(u32, BufferHandle, ArrayRef, ArrayRef, ArrayRef)> { + let parts = column + .parts() + .map_err(|e| vortex_err!("OnPair parts failed: {e}"))?; + let bits = parts.bits; + // Pad the dictionary blob with MAX_TOKEN_SIZE zero bytes so the + // over-copy decoder can issue a fixed 16-byte load for every token + // without risking an OOB read on the last entry. 
+ let mut padded = Vec::with_capacity(parts.dict_bytes.len() + crate::MAX_TOKEN_SIZE); + padded.extend_from_slice(parts.dict_bytes); + padded.resize(parts.dict_bytes.len() + crate::MAX_TOKEN_SIZE, 0); + // Align dict_bytes to 8 bytes so the segment that ultimately holds the + // OnPair tree starts at an 8-aligned in-memory address. Without this + // anchor, the per-buffer padding the serializer inserts is only + // *relative* to the segment start; if the segment lands at a u8-aligned + // heap address, downstream `PrimitiveArray::deserialize` panics + // with `Misaligned buffer cannot be used to build PrimitiveArray of u32`. + let dict_bytes = + BufferHandle::new_host(ByteBuffer::from(padded).aligned(vortex_buffer::Alignment::new(8))); + + let dict_offsets = Buffer::::copy_from(parts.dict_offsets).into_array(); + let total_tokens = usize::try_from( + *parts + .codes_boundaries + .last() + .ok_or_else(|| vortex_err!("OnPair: missing codes_boundaries"))?, + ) + .map_err(|_| vortex_err!("OnPair: total_tokens does not fit in usize"))?; + let codes_vec = unpack_codes_to_u16(parts.codes_packed, total_tokens, bits); + let codes = Buffer::::copy_from(codes_vec).into_array(); + let codes_offsets = Buffer::::copy_from(parts.codes_boundaries).into_array(); + Ok((bits, dict_bytes, dict_offsets, codes, codes_offsets)) +} + +/// Compress a byte-string accessor (typically a `VarBinArray` or +/// `VarBinViewArray`). +pub fn onpair_compress>( + array: A, + len: usize, + dtype: &DType, + config: OnPairTrainingConfig, +) -> VortexResult { + array.with_iterator(|iter| onpair_compress_iter(iter, len, dtype.clone(), config)) +} + +/// Compress any [`ArrayRef`] whose canonical form is a string array, by first +/// canonicalising to `VarBinViewArray`. 
+pub fn onpair_compress_array( + array: &ArrayRef, + config: OnPairTrainingConfig, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let view = array.clone().execute::(ctx)?; + let len = view.len(); + let dtype = view.dtype().clone(); + onpair_compress(&view, len, &dtype, config) +} + +/// Convenience: build a default `ExecutionCtx` from `LEGACY_SESSION`. +pub fn onpair_compress_array_default( + array: &ArrayRef, + config: OnPairTrainingConfig, +) -> VortexResult { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + onpair_compress_array(array, config, &mut ctx) +} diff --git a/encodings/experimental/onpair/src/compute/cast.rs b/encodings/experimental/onpair/src/compute/cast.rs new file mode 100644 index 00000000000..27b4ad378c7 --- /dev/null +++ b/encodings/experimental/onpair/src/compute/cast.rs @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::dtype::DType; +use vortex_array::scalar_fn::fns::cast::CastKernel; +use vortex_array::scalar_fn::fns::cast::CastReduce; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; + +/// Cast between `Utf8` and `Binary` (or adjust nullability) without touching +/// any of the encoded payload — we only rewrap into a new outer DType. +impl CastReduce for OnPair { + fn cast(array: ArrayView<'_, Self>, dtype: &DType) -> VortexResult> { + if !array.dtype().eq_ignore_nullability(dtype) { + return Ok(None); + } + let validity = array.array().validity()?; + let Some(new_validity) = + validity.trivially_cast_nullability(dtype.nullability(), array.array().len())? 
+ else { + return Ok(None); + }; + Ok(Some( + unsafe { + OnPair::new_unchecked( + dtype.clone(), + array.dict_bytes_handle().clone(), + array.dict_offsets().clone(), + array.codes().clone(), + array.codes_offsets().clone(), + array.uncompressed_lengths().clone(), + new_validity, + array.bits(), + ) + } + .into_array(), + )) + } +} + +impl CastKernel for OnPair { + fn cast( + array: ArrayView<'_, Self>, + dtype: &DType, + _ctx: &mut ExecutionCtx, + ) -> VortexResult> { + ::cast(array, dtype) + } +} diff --git a/encodings/experimental/onpair/src/compute/compare.rs b/encodings/experimental/onpair/src/compute/compare.rs new file mode 100644 index 00000000000..3cce3384256 --- /dev/null +++ b/encodings/experimental/onpair/src/compute/compare.rs @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! `Eq` / `NotEq` against a constant via **token-aware** comparison. +//! +//! OnPair's compressor encodes every byte string deterministically via +//! greedy LPM against the same dictionary, so two byte strings are +//! equal **iff** their LPM token sequences are equal. We tokenise the +//! needle once and then compare the row's `codes[lo..hi]` slice +//! directly against the tokenised needle as `&[u16]` — no row decode. +//! +//! Edge case: if the needle contains a byte that has no dict entry at +//! all (degenerate dict; OnPair training normally guarantees every +//! single-byte token), no row can possibly equal the needle, since +//! every row was compressed against the same dict. We return an +//! all-zeros bitmap (or all-ones for `NotEq`). 
+ +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::dtype::DType; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::binary::CompareKernel; +use vortex_array::scalar_fn::fns::operators::CompareOperator; +use vortex_buffer::BitBuffer; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::decode::OwnedDecodeInputs; +use crate::lpm::DictIndex; +use crate::lpm::tokenize_needle; + +impl CompareKernel for OnPair { + fn compare( + lhs: ArrayView<'_, Self>, + rhs: &ArrayRef, + operator: CompareOperator, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + if !matches!(operator, CompareOperator::Eq | CompareOperator::NotEq) { + return Ok(None); + } + let Some(constant) = rhs.as_constant() else { + return Ok(None); + }; + let Some(needle) = needle_bytes(&constant) else { + return Ok(None); + }; + + let inputs = OwnedDecodeInputs::collect(lhs, ctx)?; + let dv = inputs.view(); + let n = lhs.array().len(); + let mut bytes = vec![0u8; n.div_ceil(8)]; + + let index = DictIndex::build(&dv); + if let Some(needle_toks) = tokenize_needle(&dv, &index, &needle) { + let codes = dv.codes; + let codes_offsets = dv.codes_offsets; + for r in 0..n { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction time. + let row_toks = unsafe { codes.get_unchecked(lo..hi) }; + if row_toks == needle_toks.as_slice() { + bytes[r / 8] |= 1u8 << (r % 8); + } + } + } + // If `tokenize_needle` returned None, no row can equal the + // needle (every row was compressed against the same dict, so + // any byte not in the dict can't appear in any row either). + // Leave the bitmap zeroed. 
+ + let mut bool_buf = BitBuffer::new(ByteBuffer::from(bytes), n); + if operator == CompareOperator::NotEq { + bool_buf = !bool_buf; + } + let validity = lhs + .array() + .validity()? + .union_nullability(constant.dtype().nullability()); + Ok(Some(BoolArray::new(bool_buf, validity).into_array())) + } +} + +fn needle_bytes(scalar: &Scalar) -> Option> { + match scalar.dtype() { + DType::Utf8(_) => scalar.as_utf8().value().map(|s| s.as_bytes().to_vec()), + DType::Binary(_) => scalar.as_binary().value().map(|b| b.to_vec()), + _ => None, + } +} diff --git a/encodings/experimental/onpair/src/compute/filter.rs b/encodings/experimental/onpair/src/compute/filter.rs new file mode 100644 index 00000000000..fbece54c4bb --- /dev/null +++ b/encodings/experimental/onpair/src/compute/filter.rs @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Filter that **shares the dictionary**. The previous implementation +//! decoded the whole array, filtered the canonical bytes, and re-trained +//! a brand-new OnPair dictionary on the surviving rows — order-of- +//! magnitude regressions on TPC-H Q22 at SF=10 traced back to that cost +//! (the customer table's `c_phone` column gets two consecutive filters, +//! each of which was paying full `Column::compress` training overhead). +//! +//! FSST-shape filter: keep `dict_bytes` + `dict_offsets` **identical** +//! to the input; rebuild only `codes`, `codes_offsets`, +//! `uncompressed_lengths`, and validity by walking the mask. No decode, +//! no retrain, no C++ call on the read path. 
+ +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::BufferMut; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_mask::Mask; + +use crate::OnPair; +use crate::OnPairArrayExt; + +impl FilterKernel for OnPair { + #[expect(clippy::cognitive_complexity, clippy::cast_possible_truncation)] + fn filter( + array: ArrayView<'_, Self>, + mask: &Mask, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + let n_in = array.array().len(); + let n_out = mask.true_count(); + + // Materialise the per-row offset arrays we walk during filtering. + // The codes themselves we read through whatever ptype the + // cascading compressor narrowed to — match_each_integer_ptype + // dispatches on it below. + let codes_offsets_arr = array + .codes_offsets() + .clone() + .execute::(ctx)?; + let codes_arr = array.codes().clone().execute::(ctx)?; + + let mut new_codes_offsets = BufferMut::::with_capacity(n_out + 1); + + // The cascading compressor may have narrowed `codes_offsets` + // (e.g. u32 → u16 if every row's token count is small). Read + // through whatever ptype it lives at — the values still fit in + // `usize` when widened. Likewise for `codes`. + let new_codes: ArrayRef = match_each_integer_ptype!(codes_offsets_arr.ptype(), |OP| { + let codes_offsets = codes_offsets_arr.as_slice::(); + + // First pass: sum the surviving token count so we reserve once. + let mut new_codes_len: usize = 0; + for r in 0..n_in { + if mask.value(r) { + new_codes_len += (codes_offsets[r + 1] as usize) - (codes_offsets[r] as usize); + } + } + + // SAFETY: capacity reserved. + unsafe { new_codes_offsets.push_unchecked(0u32) }; + + match_each_integer_ptype!(codes_arr.ptype(), |P| { + let codes = codes_arr.as_slice::

(); + let mut out = BufferMut::

::with_capacity(new_codes_len); + let mut cursor: u32 = 0; + for r in 0..n_in { + if mask.value(r) { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction. + let segment = unsafe { codes.get_unchecked(lo..hi) }; + out.extend_from_slice(segment); + let segment_len = u32::try_from(hi - lo) + .map_err(|_| vortex_err!("token segment overflows u32"))?; + cursor = cursor + .checked_add(segment_len) + .ok_or_else(|| vortex_err!("codes_offsets overflow u32"))?; + // SAFETY: capacity reserved (n_out + 1 entries). + unsafe { new_codes_offsets.push_unchecked(cursor) }; + } + } + out.freeze().into_array() + }) + }); + + // uncompressed_lengths + validity flow through the standard + // primitive filter — these are short integer arrays so the cost + // is negligible compared to the (avoided) recompress. + let uncompressed_lengths = array.uncompressed_lengths().clone().filter(mask.clone())?; + let validity = array.array_validity().filter(mask)?; + + Ok(Some( + unsafe { + OnPair::new_unchecked( + array.dtype().clone(), + array.dict_bytes_handle().clone(), + array.dict_offsets().clone(), + new_codes, + new_codes_offsets.freeze().into_array(), + uncompressed_lengths, + validity, + array.bits(), + ) + } + .into_array(), + )) + } +} diff --git a/encodings/experimental/onpair/src/compute/like.rs b/encodings/experimental/onpair/src/compute/like.rs new file mode 100644 index 00000000000..6d9dcd79513 --- /dev/null +++ b/encodings/experimental/onpair/src/compute/like.rs @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! `LIKE` pushdown for OnPair. Three pattern shapes are accelerated; +//! everything else returns `None` so the caller decompresses + runs the +//! scalar `LIKE` on the canonical bytes. +//! +//! * `'literal'` — token-aware equality (LPM-tokenise the literal once +//! 
and compare the row's `codes[lo..hi]` against the tokenised needle +//! as `&[u16]`). No row decode. +//! * `'prefix%'` — OnPair-style [`PrefixAutomaton`][crate::dfa::PrefixAutomaton]: +//! tokenise the prefix and precompute valid-divergence intervals for +//! each query position. Per-row scan is `≤ q + 1` `u16` comparisons +//! plus one interval check; no decode at all in the hot path. +//! * `'%substring%'` — dict-bloom skip + `memchr::memmem` over the +//! decoded row only when needed. +//! [`ContainsBloom`][crate::dfa::ContainsBloom] precomputes "this +//! dict entry contains the substring" and "some suffix of this entry +//! could start a cross-token match". Most rows resolve via the bloom +//! without touching `dict_bytes`; the rest fall through to a +//! scratch-buffer decode + memmem. +//! +//! Escapes (`\\`), single-character wildcards (`_`), mid-pattern +//! wildcards, and `case_insensitive: true` all bail out with `None`. + +use memchr::memmem; +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::scalar_fn::fns::like::LikeKernel; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_buffer::BitBuffer; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::decode::DecodeView; +use crate::decode::OwnedDecodeInputs; +use crate::dfa::ContainsBloom; +use crate::dfa::PrefixAutomaton; +use crate::lpm::DictIndex; +use crate::lpm::tokenize_needle; + +#[derive(Debug)] +enum PatternShape<'a> { + Equals(&'a [u8]), + StartsWith(&'a [u8]), + Contains(&'a [u8]), +} + +fn classify(pattern: &[u8]) -> Option> { + if pattern.contains(&b'_') || pattern.contains(&b'\\') { + return None; + } + let first_pct = pattern.iter().position(|&b| b == b'%'); + let last_pct = pattern.iter().rposition(|&b| b == b'%'); + match (first_pct, last_pct) { + (None, None) => Some(PatternShape::Equals(pattern)), + 
(Some(0), Some(end)) if end == pattern.len() - 1 && pattern.len() >= 2 => { + let inner = &pattern[1..pattern.len() - 1]; + if inner.contains(&b'%') { + None + } else { + Some(PatternShape::Contains(inner)) + } + } + (Some(p), Some(q)) if p == q && q == pattern.len() - 1 => { + Some(PatternShape::StartsWith(&pattern[..pattern.len() - 1])) + } + _ => None, + } +} + +impl LikeKernel for OnPair { + fn like( + array: ArrayView<'_, Self>, + pattern: &ArrayRef, + options: LikeOptions, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + if options.case_insensitive { + return Ok(None); + } + let Some(scalar) = pattern.as_constant() else { + return Ok(None); + }; + let pattern_bytes: Vec = if let Some(s) = scalar.as_utf8_opt() { + let Some(v) = s.value() else { return Ok(None) }; + v.as_bytes().to_vec() + } else if let Some(b) = scalar.as_binary_opt() { + let Some(v) = b.value() else { return Ok(None) }; + v.to_vec() + } else { + return Ok(None); + }; + let Some(shape) = classify(&pattern_bytes) else { + return Ok(None); + }; + + let inputs = OwnedDecodeInputs::collect(array, ctx)?; + let dv = inputs.view(); + let n = array.array().len(); + + let mut bytes = vec![0u8; n.div_ceil(8)]; + match shape { + PatternShape::Equals(needle) => { + let index = DictIndex::build(&dv); + if let Some(needle_toks) = tokenize_needle(&dv, &index, needle) { + let codes = dv.codes; + let codes_offsets = dv.codes_offsets; + let needle_slice = needle_toks.as_slice(); + for r in 0..n { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction. + let row_toks = unsafe { codes.get_unchecked(lo..hi) }; + if row_toks == needle_slice { + bytes[r / 8] |= 1u8 << (r % 8); + } + } + } + // Else: needle has a byte not in the dict ⇒ no row matches. 
+ } + PatternShape::StartsWith(prefix) => { + if prefix.is_empty() { + fill_all(&mut bytes, n); + } else if let Some(automaton) = PrefixAutomaton::build(&dv, prefix) { + let codes = dv.codes; + let codes_offsets = dv.codes_offsets; + for r in 0..n { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction. + let row_toks = unsafe { codes.get_unchecked(lo..hi) }; + if automaton.matches(row_toks) { + bytes[r / 8] |= 1u8 << (r % 8); + } + } + } + // Else: prefix has a byte not in the dict ⇒ no row matches. + } + PatternShape::Contains(sub) => { + if sub.is_empty() { + fill_all(&mut bytes, n); + } else { + contains_into_bitmap(&dv, sub, n, &mut bytes); + } + } + } + + let mut bool_buf = BitBuffer::new(ByteBuffer::from(bytes), n); + if options.negated { + bool_buf = !bool_buf; + } + let validity = array + .array() + .validity()? + .union_nullability(scalar.dtype().nullability()); + Ok(Some(BoolArray::new(bool_buf, validity).into_array())) + } +} + +/// `%substring%` pushdown: dict-bloom skip + per-row decode + memmem. +fn contains_into_bitmap(dv: &DecodeView<'_>, sub: &[u8], n: usize, out: &mut [u8]) { + let bloom = ContainsBloom::build(dv, sub); + let finder = memmem::Finder::new(sub); + let mut scratch: Vec = Vec::with_capacity(64); + let codes = dv.codes; + let codes_offsets = dv.codes_offsets; + for r in 0..n { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction. 
+ let row_toks = unsafe { codes.get_unchecked(lo..hi) }; + let hit = match bloom.classify(row_toks) { + Some(b) => b, + None => { + scratch.clear(); + dv.decode_row_into(r, &mut scratch); + finder.find(&scratch).is_some() + } + }; + if hit { + out[r / 8] |= 1u8 << (r % 8); + } + } +} + +fn fill_all(bytes: &mut [u8], n: usize) { + bytes.fill(0xff); + if !n.is_multiple_of(8) { + let last = n / 8; + bytes[last] = (1u8 << (n % 8)) - 1; + } +} diff --git a/encodings/experimental/onpair/src/compute/mod.rs b/encodings/experimental/onpair/src/compute/mod.rs new file mode 100644 index 00000000000..54779d5e3fb --- /dev/null +++ b/encodings/experimental/onpair/src/compute/mod.rs @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +mod cast; +mod compare; +mod filter; +mod like; diff --git a/encodings/experimental/onpair/src/decode.rs b/encodings/experimental/onpair/src/decode.rs new file mode 100644 index 00000000000..dd434811d06 --- /dev/null +++ b/encodings/experimental/onpair/src/decode.rs @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Pure-Rust decoder for an [`OnPair`][crate::OnPair] array. +//! +//! The decode loop is intentionally simple — one `u16` code load, one +//! `u64` table load, one fixed 16-byte over-copy `memcpy` — so the +//! autovectoriser keeps the hot path SIMD-friendly. We materialise the +//! children once into native-aligned `Buffer`s (and pack the dict +//! offsets + lengths into a single `Buffer` lookup table) so the +//! inner loop indexes straight into raw slices with no branches. 
+ +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::PType; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; + +/// Materialised, host-resident copies of every read path's input. +/// +/// Each integer child (`dict_offsets`, `codes`, `codes_offsets`) is a slot +/// on the outer `OnPair` array, possibly wrapped in a non-canonical +/// encoding the cascading compressor chose (e.g. FastLanes-bit-packed +/// `codes`, `narrow`-ed dict offsets). `execute::` may +/// hand us back a narrower ptype than the decode loop wants. `collect` +/// widens each child to the decoder's native width (`u32` for both offset +/// arrays, `u16` for codes) once so the inner loop is branch-free pointer +/// arithmetic. +/// +/// Construction also packs `dict_offsets` into the combined +/// `(offset << 16) | length` `dict_table` so the decode hot loop loads a +/// single `u64` per token instead of two adjacent `u32`s. +pub struct OwnedDecodeInputs { + pub dict_bytes: ByteBuffer, + /// `(dict_offset << 16) | dict_len` per token. `dict_len` ≤ + /// `MAX_TOKEN_SIZE = 16` so 16 bits suffice. 
+ pub dict_table: Buffer, + pub codes: Buffer, + pub codes_offsets: Buffer, +} + +impl OwnedDecodeInputs { + pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult { + let dict_offsets_arr = to_primitive(array.dict_offsets(), ctx)?; + let dict_table = build_dict_table(&dict_offsets_arr); + Ok(Self { + dict_bytes: array.dict_bytes().clone(), + dict_table, + codes: widen_to_u16(&to_primitive(array.codes(), ctx)?), + codes_offsets: widen_to_u32(&to_primitive(array.codes_offsets(), ctx)?), + }) + } + + pub fn view(&self) -> DecodeView<'_> { + DecodeView { + dict_bytes: self.dict_bytes.as_slice(), + dict_table: self.dict_table.as_slice(), + codes: self.codes.as_slice(), + codes_offsets: self.codes_offsets.as_slice(), + } + } +} + +/// Pack `dict_offsets` directly into `(offset << 16) | length` per token. +/// Reads through the integer-ptype macro once so we don't have to widen +/// the offsets buffer first — saves one `Vec` allocation in the common +/// (non-narrowed) case. +#[allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::unnecessary_cast +)] +fn build_dict_table(arr: &PrimitiveArray) -> Buffer { + match_each_integer_ptype!(arr.ptype(), |P| { + let slice = arr.as_slice::

(); + if slice.is_empty() { + return Buffer::::copy_from(Vec::::new()); + } + let dict_size = slice.len() - 1; + let mut table = BufferMut::::with_capacity(dict_size); + for i in 0..dict_size { + let off = slice[i] as u64; + let len = (slice[i + 1] - slice[i]) as u64; + // SAFETY: capacity reserved above; we push exactly dict_size times. + unsafe { table.push_unchecked((off << 16) | len) }; + } + table.freeze() + }) +} + +fn to_primitive(arr: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + arr.clone().execute::(ctx) +} + +/// Widen any integer-typed `PrimitiveArray` to `Buffer`. When the +/// underlying ptype already matches we transmute the buffer instead of +/// allocating a new one. Used when the cascading compressor narrowed an +/// offset array (e.g. `u32` → `u16`). +#[allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::unnecessary_cast +)] +fn widen_to_u32(arr: &PrimitiveArray) -> Buffer { + if arr.ptype() == PType::U32 { + // Cheap: PrimitiveArray's underlying buffer is Arc-shared, so + // `into_buffer` on a clone is effectively a refcount bump. + return arr.clone().into_buffer::(); + } + match_each_integer_ptype!(arr.ptype(), |P| { + let slice = arr.as_slice::

(); + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + // SAFETY: capacity reserved above. + unsafe { out.push_unchecked(v as u32) }; + } + out.freeze() + }) +} + +/// As `widen_to_u32` but for `Buffer`. +#[allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::unnecessary_cast +)] +fn widen_to_u16(arr: &PrimitiveArray) -> Buffer { + if arr.ptype() == PType::U16 { + return arr.clone().into_buffer::(); + } + match_each_integer_ptype!(arr.ptype(), |P| { + let slice = arr.as_slice::

(); + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + // SAFETY: capacity reserved above. + unsafe { out.push_unchecked(v as u16) }; + } + out.freeze() + }) +} + +/// Borrowed slices for the decode loop. +#[derive(Copy, Clone)] +pub struct DecodeView<'a> { + pub dict_bytes: &'a [u8], + pub dict_table: &'a [u64], + pub codes: &'a [u16], + pub codes_offsets: &'a [u32], +} + +impl<'a> DecodeView<'a> { + /// Decode row `row` into `out` (appended). Thin wrapper around + /// [`Self::decode_rows_into`]. + #[inline] + pub fn decode_row_into(&self, row: usize, out: &mut Vec) { + self.decode_rows_into(row, 1, out); + } + + /// Bulk decode rows `[start, start + count)` contiguously into `out`. + /// Pre-computes the decoded length, reserves once, then delegates to + /// the unrolled fast path. Callers that already know the size (e.g. + /// canonicalize from `uncompressed_lengths`) should call + /// [`Self::decode_rows_into_with_size`] to skip the size pre-pass. + pub fn decode_rows_into(&self, start: usize, count: usize, out: &mut Vec) { + if count == 0 { + return; + } + let decoded_len = self.decoded_len_rows(start, count); + let written_start = out.len(); + out.reserve(decoded_len + crate::MAX_TOKEN_SIZE); + // SAFETY: capacity reserved above; `decode_rows_unchecked`'s + // invariants are upheld by the [`OnPair::try_new`] validation. + unsafe { + let written = + self.decode_rows_unchecked(start, count, out.as_mut_ptr().add(written_start)); + debug_assert_eq!(written, decoded_len); + out.set_len(written_start + written); + } + } + + /// Single-pass over-copy decode of a token window into raw `dst`. 
+ /// + /// Mirrors OnPair C++ `decode_all` (and `decompress`) + /// exactly: each iteration loads one `u16` code, one `u64` dict-table + /// entry, issues a fixed [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE] + /// `copy_nonoverlapping` (which LLVM lowers to a single unaligned + /// 128-bit SIMD store on x86_64 / aarch64), and advances the cursor by + /// the *true* token length. The body is hand-unrolled four times so + /// the CPU can keep four independent stores in flight, matching the + /// `ONPAIR_EMIT4` block of the upstream `decode_all.h`. + /// + /// Returns the number of *true* bytes written. + /// + /// # Safety + /// * `dst` must point into a region with at least + /// `decoded_byte_length + MAX_TOKEN_SIZE` bytes of writable + /// uninitialised capacity. + /// * `self.dict_bytes` must have at least `MAX_TOKEN_SIZE` trailing + /// pad bytes past the last real token byte (`compress.rs` enforces + /// this). + /// * Every `code` in the window must be `< self.dict_table.len()`. + #[inline] + pub unsafe fn decode_rows_unchecked(&self, start: usize, count: usize, dst: *mut u8) -> usize { + if count == 0 { + return 0; + } + // SAFETY: caller invariants. + let lo = unsafe { *self.codes_offsets.get_unchecked(start) } as usize; + let hi = unsafe { *self.codes_offsets.get_unchecked(start + count) } as usize; + + let codes_ptr = self.codes.as_ptr(); + let table_ptr = self.dict_table.as_ptr(); + let dict_ptr = self.dict_bytes.as_ptr(); + + let mut cursor = dst; + let unroll_end = lo + ((hi - lo) & !3); + let mut i = lo; + // SAFETY: indices derived from validated offsets; the 16-byte + // over-copy reads stay within `dict_bytes`'s trailing pad; writes + // stay within the caller-promised capacity. + unsafe { + while i < unroll_end { + macro_rules! 
emit { + ($k:expr) => {{ + let c = *codes_ptr.add(i + $k) as usize; + let entry = *table_ptr.add(c); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + std::ptr::copy_nonoverlapping( + dict_ptr.add(off), + cursor, + crate::MAX_TOKEN_SIZE, + ); + cursor = cursor.add(len); + }}; + } + emit!(0); + emit!(1); + emit!(2); + emit!(3); + i += 4; + } + while i < hi { + let c = *codes_ptr.add(i) as usize; + let entry = *table_ptr.add(c); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + std::ptr::copy_nonoverlapping(dict_ptr.add(off), cursor, crate::MAX_TOKEN_SIZE); + cursor = cursor.add(len); + i += 1; + } + cursor.offset_from(dst) as usize + } + } + + /// Single-pass decode when the caller already knows the total decoded + /// byte length (e.g. from summing `uncompressed_lengths`). Skips the + /// size-precomputation pass. + /// + /// # Safety + /// `out.capacity() - out.len() >= total_size + MAX_TOKEN_SIZE` and + /// `total_size` equals the true decoded length. + #[inline] + pub unsafe fn decode_rows_into_with_size( + &self, + start: usize, + count: usize, + total_size: usize, + out: &mut Vec, + ) { + let written_start = out.len(); + debug_assert!(out.capacity() - written_start >= total_size + crate::MAX_TOKEN_SIZE); + // SAFETY: caller's invariants. + let written = unsafe { + self.decode_rows_unchecked(start, count, out.as_mut_ptr().add(written_start)) + }; + debug_assert_eq!(written, total_size); + // SAFETY: `written` ≤ reserved capacity (caller invariants). + unsafe { out.set_len(written_start + written) }; + } + + /// Decoded byte length of row `row` without copying any bytes. + #[inline] + pub fn decoded_len(&self, row: usize) -> usize { + self.decoded_len_rows(row, 1) + } + + /// Decoded byte length of rows `[start, start + count)`. Uses the + /// combined `dict_table` — one `u64` load per token. 
+ #[inline] + pub fn decoded_len_rows(&self, start: usize, count: usize) -> usize { + if count == 0 { + return 0; + } + let lo = self.codes_offsets[start] as usize; + let hi = self.codes_offsets[start + count] as usize; + let mut total = 0usize; + // SAFETY: bounds checked by indexing above. + unsafe { + for i in lo..hi { + let c = *self.codes.get_unchecked(i) as usize; + total += (*self.dict_table.get_unchecked(c) & 0xffff) as usize; + } + } + total + } + + /// Iterate the decoded bytes of `row` without materialising the full + /// row, calling `f` on each contiguous dict slice. Returns + /// + /// * `true` if every slice was visited (i.e. `f` always returned + /// `true`), + /// * `false` if `f` short-circuited with `false`. + /// + /// Useful for predicates that can short-circuit, e.g. `equals` and + /// `starts_with`. + #[inline] + pub fn for_each_dict_slice bool>(&self, row: usize, mut f: F) -> bool { + let lo = self.codes_offsets[row] as usize; + let hi = self.codes_offsets[row + 1] as usize; + let codes = &self.codes[lo..hi]; + // SAFETY: codes were validated at construction time. + unsafe { + for &c in codes { + let entry = *self.dict_table.get_unchecked(c as usize); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + let slice = self.dict_bytes.get_unchecked(off..off + len); + if !f(slice) { + return false; + } + } + } + true + } +} diff --git a/encodings/experimental/onpair/src/dfa.rs b/encodings/experimental/onpair/src/dfa.rs new file mode 100644 index 00000000000..9bb7245a402 --- /dev/null +++ b/encodings/experimental/onpair/src/dfa.rs @@ -0,0 +1,372 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Token-level matchers for `LIKE 'prefix%'` and `LIKE '%needle%'` over +//! OnPair-compressed `codes: &[u16]` — no row decode at all in the hot +//! path (prefix), and a dict-bloom skip + bounded per-row decode for +//! contains. +//! +//! 
Mirrors `onpair_cpp/include/onpair/search/automata/prefix_automaton.h` +//! and `…/aho_corasick_automaton.h`. The trick that makes both work is +//! the dictionary's lexicographic ordering: the set of dict ids whose +//! tokens start with byte sequence `S` is always a contiguous +//! `[lo, hi)` range — found in O(|S| · log dict) by binary search. +//! +//! ## PrefixAutomaton +//! +//! 1. LPM-tokenise the prefix into `query[0..q]`. +//! 2. For each `i ∈ 0..q`, precompute `intervals[i] = prefix_range( +//! remaining_prefix_suffix_at_i)` — the dict token range whose bytes +//! start with the prefix's remaining bytes from position `i` onward. +//! 3. Walk the row's tokens. If token `j` equals `query[j]` advance. +//! If it differs but is within `intervals[j]` the token must cover +//! the whole remaining prefix → accept. Otherwise reject. If we run +//! out of query tokens → accept (rest of row is irrelevant). +//! +//! Per-row cost: at most `q + 1` `u16` comparisons + 1 interval check. +//! For URL-shape data with `q ≈ 5–10` this is ~10 ns / row. +//! +//! ## Contains (dict-bloom + bounded decode) +//! +//! `LIKE '%needle%'` doesn't have a token-level shortcut as clean as +//! prefix because the LPM of "…[bytes]…needle…[bytes]…" tokenises +//! differently depending on the surrounding context. We do: +//! +//! 1. Per-token bloom: precompute `dict_contains[c] = true` iff dict +//! entry `c` contains `needle` as a byte substring. If any code in +//! the row has the bit set, the row matches with no decode. +//! 2. Per-token "could be left of a cross-boundary match" bloom: +//! `dict_could_extend[c] = true` iff some non-empty suffix of dict +//! entry `c` is a non-empty prefix of `needle`. Rows where no code +//! has this bit can't match across boundaries either, so we skip +//! them entirely. +//! 3. Otherwise, decode the row and run `memchr::memmem`. +//! +//! For URL/log shapes the bloom resolves the vast majority of rows +//! without touching `dict_bytes` at all. 
+ +use crate::decode::DecodeView; + +// ─── prefix_range helper ──────────────────────────────────────────── + +/// Returns the half-open `[lo, hi)` range of dict ids whose bytes start +/// with `prefix`. The dict is sorted lexicographically (per OnPair +/// `core/dictionary.h`) so the answer is contiguous. +/// +/// Empty range if no dict entry starts with `prefix`. +fn prefix_range(dv: &DecodeView<'_>, prefix: &[u8]) -> std::ops::Range { + let n = dv.dict_table.len(); + if prefix.is_empty() { + return 0..n; + } + let lo = lower_bound(dv, prefix); + if lo == n { + return n..n; + } + // Check the actual entry at lo starts with `prefix`; if not, range + // is empty (lower_bound only guarantees ≥). + if !dict_starts_with(dv, lo, prefix) { + return n..n; + } + let hi = upper_bound_with_prefix(dv, prefix, lo); + lo..hi +} + +#[inline] +fn dict_token_bytes<'a>(dv: &DecodeView<'a>, id: usize) -> &'a [u8] { + let entry = dv.dict_table[id]; + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + &dv.dict_bytes[off..off + len] +} + +#[inline] +fn dict_starts_with(dv: &DecodeView<'_>, id: usize, prefix: &[u8]) -> bool { + let bytes = dict_token_bytes(dv, id); + bytes.starts_with(prefix) +} + +/// First dict id whose bytes are `>= prefix` lexicographically. +fn lower_bound(dv: &DecodeView<'_>, prefix: &[u8]) -> usize { + let mut lo = 0usize; + let mut hi = dv.dict_table.len(); + while lo < hi { + let mid = lo + (hi - lo) / 2; + if dict_token_bytes(dv, mid) < prefix { + lo = mid + 1; + } else { + hi = mid; + } + } + lo +} + +/// First dict id `>= start` whose bytes do **not** start with `prefix`. 
fn upper_bound_with_prefix(dv: &DecodeView<'_>, prefix: &[u8], start: usize) -> usize {
    // Binary search over `[start, dict_len)`. In a sorted dictionary the
    // entries that start with `prefix` are contiguous, so "starts with
    // `prefix`" is true for a leading run and false afterwards.
    let mut lo = start;
    let mut hi = dv.dict_table.len();
    while lo < hi {
        let mid = lo + (hi - lo) / 2;
        if dict_starts_with(dv, mid, prefix) {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }
    lo
}

// ─── PrefixAutomaton ────────────────────────────────────────────────

pub(crate) struct PrefixAutomaton {
    /// Greedy longest-prefix-match tokenisation of the literal prefix.
    query: Vec<u16>,
    /// `intervals[i]` is the dict range whose bytes start with the
    /// prefix's remaining suffix at position `i`. The row's `i`-th token
    /// "covers" the rest of the prefix iff it falls in this range.
    ///
    /// Always the same length as `query`.
    intervals: Vec<std::ops::Range<u32>>,
}

impl PrefixAutomaton {
    /// Build the automaton. Returns `None` if the prefix has a byte
    /// missing from the dict (no row can match) — caller emits an
    /// all-false result.
    #[expect(clippy::cast_possible_truncation)]
    pub(crate) fn build(dv: &DecodeView<'_>, prefix: &[u8]) -> Option<Self> {
        if prefix.is_empty() {
            // Empty prefix matches everything — caller short-circuits
            // before calling us.
            return Some(Self {
                query: Vec::new(),
                intervals: Vec::new(),
            });
        }

        let index = crate::lpm::DictIndex::build(dv);
        let query = crate::lpm::tokenize_needle(dv, &index, prefix)?;

        // For the query token at position `i`, the part of the prefix
        // still to be matched is `prefix[byte_pos..]`; a row token that
        // diverges there is acceptable iff it lies in
        // `prefix_range(prefix[byte_pos..])`.
        let mut intervals = Vec::with_capacity(query.len());
        let mut byte_pos = 0usize;
        for &tok in &query {
            let range = prefix_range(dv, &prefix[byte_pos..]);
            intervals.push(range.start as u32..range.end as u32);
            // Advance by the token's true length (low 16 bits of its
            // `dict_table` entry).
            let entry = dv.dict_table[usize::from(tok)];
            byte_pos += (entry & 0xffff) as usize;
        }
        debug_assert_eq!(byte_pos, prefix.len());
        Some(Self { query, intervals })
    }

    /// Returns `true` iff some prefix of the decoded row equals the
    /// literal prefix.
    ///
    /// `codes` is the token sequence of one row. An empty query accepts
    /// every row, including rows with no tokens.
    #[inline]
    pub(crate) fn matches(&self, codes: &[u16]) -> bool {
        debug_assert_eq!(self.query.len(), self.intervals.len());
        // Row and query advance in lock-step until the first divergence,
        // so `codes[i]` is only ever compared against `query[i]` and
        // `intervals[i]`. Zipping expresses that without unchecked
        // indexing.
        let steps = codes.iter().zip(self.query.iter().zip(&self.intervals));
        for (&code, (&want, interval)) in steps {
            if code != want {
                // The first differing token decides the row: it matches
                // only if that token starts with the rest of the prefix.
                return interval.contains(&u32::from(code));
            }
        }
        // No divergence: the row matches iff it supplied every query
        // token. A shorter row ran out of tokens before the prefix ended.
        codes.len() >= self.query.len()
    }
}

// ─── Contains: dict-bloom + memmem ──────────────────────────────────

pub(crate) struct ContainsBloom {
    /// `dict_contains[c]` — dict entry `c` contains `needle` as a
    /// substring.
    dict_contains: Vec<bool>,
    /// `dict_could_extend[c]` — some non-empty suffix of `c`'s bytes
    /// is a non-empty prefix of `needle`. Only computed for entries
    /// that do not already contain `needle`.
    dict_could_extend: Vec<bool>,
}

impl ContainsBloom {
    /// Precompute both per-entry flags for `needle`, one slot per dict
    /// entry.
    pub(crate) fn build(dv: &DecodeView<'_>, needle: &[u8]) -> Self {
        let n = dv.dict_table.len();
        let mut dict_contains = vec![false; n];
        let mut dict_could_extend = vec![false; n];
        for id in 0..n {
            let bytes = dict_token_bytes(dv, id);
            // The length test is a cheap reject before the substring scan.
            if bytes.len() >= needle.len() && memchr::memmem::find(bytes, needle).is_some() {
                dict_contains[id] = true;
                continue;
            }
            // Suffix-of-token is a prefix-of-needle: try every overlap
            // length up to min(len, needle.len() - 1). `saturating_sub`
            // keeps an empty needle from underflowing.
            let max_overlap = bytes.len().min(needle.len().saturating_sub(1));
            dict_could_extend[id] = (1..=max_overlap).any(|k| bytes.ends_with(&needle[..k]));
        }
        Self {
            dict_contains,
            dict_could_extend,
        }
    }

    /// Quick row-level pre-filter:
    /// * `Some(true)` — at least one code is in `dict_contains`,
    ///   row matches without decoding.
    /// * `Some(false)` — no codes are in `dict_could_extend` either,
    ///   row cannot match, no decode needed.
    /// * `None` — uncertain; caller must decode + memmem.
+ #[inline] + pub(crate) fn classify(&self, codes: &[u16]) -> Option { + let mut any_extend = false; + // SAFETY: codes are validated `< dict_table.len()` at array + // construction, and the bloom vectors have that length. + unsafe { + for &c in codes { + if *self.dict_contains.get_unchecked(c as usize) { + return Some(true); + } + any_extend |= *self.dict_could_extend.get_unchecked(c as usize); + } + } + if any_extend { None } else { Some(false) } + } +} + +#[cfg(test)] +mod tests { + use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::VarBinArray; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + + use super::*; + use crate::DEFAULT_DICT12_CONFIG; + use crate::decode::OwnedDecodeInputs; + use crate::onpair_compress; + + fn build_inputs(strings: &[&str]) -> OwnedDecodeInputs { + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + OwnedDecodeInputs::collect(arr.as_view(), &mut ctx).unwrap() + } + + fn row_codes(inputs: &OwnedDecodeInputs, r: usize) -> &[u16] { + let lo = inputs.codes_offsets[r] as usize; + let hi = inputs.codes_offsets[r + 1] as usize; + &inputs.codes[lo..hi] + } + + #[test] + fn prefix_matches_decoded_truth() { + let strings: &[&str] = &[ + "https://example.com/items/0001", + "https://example.com/items/0002", + "https://example.com/users/abc", + "ftp://other.example.com/x", + "http", + "https", + "h", + "", + ]; + let inputs = build_inputs(strings); + let dv = inputs.view(); + + for &prefix in &[ + &b"https://"[..], + b"https://example.com/items/", + b"ftp://", + b"https", + b"https:", + b"missing", + b"h", + b"http", + b"e", + ] { + let dfa = PrefixAutomaton::build(&dv, prefix); + for (r, s) in strings.iter().enumerate() { + let want = 
s.as_bytes().starts_with(prefix); + let got = match dfa.as_ref() { + Some(d) => d.matches(row_codes(&inputs, r)), + None => false, + }; + assert_eq!( + got, + want, + "prefix={:?} row={s:?}", + std::str::from_utf8(prefix) + ); + } + } + } + + #[test] + fn contains_bloom_classifies_correctly() { + let strings: &[&str] = &[ + "https://example.com/items/0001", + "https://example.com/users/abc", + "ftp://other.example.com/x", + "no overlap", + "googlegoogle", + "preg", + ]; + let inputs = build_inputs(strings); + let dv = inputs.view(); + + for &needle in &[&b"example"[..], b"google", b"reg", b"://", b"missing", b"e"] { + let bloom = ContainsBloom::build(&dv, needle); + for (r, s) in strings.iter().enumerate() { + let want = memchr::memmem::find(s.as_bytes(), needle).is_some(); + let codes = row_codes(&inputs, r); + let mut row_bytes = Vec::new(); + dv.decode_row_into(r, &mut row_bytes); + match bloom.classify(codes) { + Some(true) => { + assert!( + want, + "false +ve: needle={:?} row={s:?}", + std::str::from_utf8(needle) + ); + } + Some(false) => { + assert!( + !want, + "false -ve: needle={:?} row={s:?}", + std::str::from_utf8(needle) + ); + } + None => { + // Unknown — that's fine; just check the decoded + // memmem agrees with `want`. 
+ assert_eq!(memchr::memmem::find(&row_bytes, needle).is_some(), want); + } + } + } + } + } +} diff --git a/encodings/experimental/onpair/src/kernel.rs b/encodings/experimental/onpair/src/kernel.rs new file mode 100644 index 00000000000..947383d58ba --- /dev/null +++ b/encodings/experimental/onpair/src/kernel.rs @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::arrays::filter::FilterExecuteAdaptor; +use vortex_array::kernel::ParentKernelSet; +use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor; +use vortex_array::scalar_fn::fns::cast::CastExecuteAdaptor; +use vortex_array::scalar_fn::fns::like::LikeExecuteAdaptor; + +use crate::OnPair; + +// TODO: implement TakeExecute for OnPair to add a TakeExecuteAdaptor here +// (matches the FSST pattern; would dispatch take on the codes child + reuse +// the dictionary, mirroring the slice path). +pub(super) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ + ParentKernelSet::lift(&CastExecuteAdaptor(OnPair)), + ParentKernelSet::lift(&CompareExecuteAdaptor(OnPair)), + ParentKernelSet::lift(&FilterExecuteAdaptor(OnPair)), + ParentKernelSet::lift(&LikeExecuteAdaptor(OnPair)), +]); diff --git a/encodings/experimental/onpair/src/lib.rs b/encodings/experimental/onpair/src/lib.rs new file mode 100644 index 00000000000..44761bfcd6d --- /dev/null +++ b/encodings/experimental/onpair/src/lib.rs @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Vortex string array backed by the [OnPair][onpair] short-string +//! compression library, with compressed-domain predicate pushdown. +//! +//! The default training preset is `dict-12` (12 bits per token, dictionary +//! capped at 4 096 entries). See [`onpair_compress`] for the entry point and +//! [`OnPairArray`] for the resulting array type. +//! +//! 
[onpair]: https://arxiv.org/abs/2508.02280 + +mod array; +mod canonical; +mod compress; +mod compute; +pub mod decode; +mod dfa; +mod kernel; +mod lpm; +mod ops; +mod rules; +mod slice; +#[cfg(test)] +mod tests; + +pub use array::*; +pub use compress::*; + +/// Fixed token-byte over-copy width. Matches OnPair C++'s `MAX_TOKEN_SIZE`: +/// the decoder copies exactly this many bytes per token and advances the +/// output cursor by the *true* token length. Lets the compiler emit a single +/// 128-bit SIMD store per token on x86_64 / aarch64 instead of a +/// variable-length memcpy. +pub const MAX_TOKEN_SIZE: usize = 16; diff --git a/encodings/experimental/onpair/src/lpm.rs b/encodings/experimental/onpair/src/lpm.rs new file mode 100644 index 00000000000..9f19e7793e1 --- /dev/null +++ b/encodings/experimental/onpair/src/lpm.rs @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Greedy longest-prefix-match tokeniser for OnPair predicate kernels. +//! +//! OnPair's dictionary is stored in **lexicographic order** (per +//! `onpair_cpp/include/onpair/core/dictionary.h`). For any byte `b` the +//! dict ids whose first byte equals `b` form a contiguous range we can +//! find in O(1) via a 257-entry first-byte index. The tokeniser walks +//! `needle` left-to-right and at each position picks the *longest* dict +//! entry that's a prefix of `needle[pos..]` — exactly the same strategy +//! `EQSearch` / `PrefixAutomaton` use on the C++ side. +//! +//! Returns: +//! * `Some(Vec)` — the unique LPM token sequence for `needle`. Two +//! strings with the same byte content compress to the same token +//! sequence under the same dict, so token-sequence equality on the +//! `codes` child is exactly equivalent to byte equality on the +//! decoded rows. **No decoding required** in the predicate hot loop. +//! * `None` — `needle` contains a byte that's not the start of any dict +//! 
entry (degenerate dict; OnPair training normally guarantees the +//! 256 single-byte entries exist). Callers should fall back to byte +//! matching. + +use vortex_error::vortex_panic; + +use crate::decode::DecodeView; + +/// Per-byte index into the dictionary: `range_for(b) = lo..hi` is the +/// half-open range of dict ids whose first byte equals `b`. Empty if +/// no such dict entry exists. +/// +/// Stored as 257 `u32` so `range_for(b) = lo..hi` reads two adjacent +/// entries with no branch. +pub(crate) struct DictIndex { + by_first_byte: [u32; 257], +} + +impl DictIndex { + pub fn build(dv: &DecodeView<'_>) -> Self { + let mut by_first_byte = [0u32; 257]; + // OnPair training caps dict_size at 2^bits ≤ 65 536, well within u32. + let dict_size: u32 = u32::try_from(dv.dict_table.len()) + .unwrap_or_else(|_| vortex_panic!("OnPair dict_size > u32::MAX")); + // The dict is sorted lexicographically, so the first dict id + // whose first byte is `b` is the lowest `i` with that property. + // Fill `by_first_byte[0..=first]` with `i` lazily and tail-fill + // with `dict_size`. + let mut last_first: usize = 0; + for (i, &entry) in dv.dict_table.iter().enumerate() { + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + if len == 0 { + continue; // defensive: OnPair dicts have len >= 1 + } + let first = dv.dict_bytes[off] as usize; + let i_u32 = + u32::try_from(i).unwrap_or_else(|_| vortex_panic!("OnPair dict id > u32::MAX")); + while last_first <= first { + by_first_byte[last_first] = i_u32; + last_first += 1; + } + } + while last_first <= 256 { + by_first_byte[last_first] = dict_size; + last_first += 1; + } + Self { by_first_byte } + } + + /// Range of dict ids whose first byte is `b`. Empty if none. 
+ #[inline] + pub fn range_for(&self, b: u8) -> std::ops::Range { + let lo = self.by_first_byte[b as usize] as usize; + let hi = self.by_first_byte[b as usize + 1] as usize; + lo..hi + } +} + +/// Tokenise `needle` via greedy longest-prefix-match against the +/// OnPair dict. Returns `None` if any byte of the needle has no +/// matching dict entry. +pub(crate) fn tokenize_needle( + dv: &DecodeView<'_>, + index: &DictIndex, + needle: &[u8], +) -> Option> { + let mut tokens = Vec::with_capacity(needle.len()); + let mut pos = 0usize; + while pos < needle.len() { + let candidates = index.range_for(needle[pos]); + if candidates.is_empty() { + return None; + } + let remaining = &needle[pos..]; + let mut best_len: usize = 0; + let mut best_id: u16 = 0; + for id in candidates { + // SAFETY: `id < dict_table.len()` (range from index). + let entry = unsafe { *dv.dict_table.get_unchecked(id) }; + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + if len <= best_len || len > remaining.len() { + continue; + } + // SAFETY: dict_bytes was validated; off + len ≤ dict_bytes.len(). + let entry_bytes = unsafe { dv.dict_bytes.get_unchecked(off..off + len) }; + if remaining.starts_with(entry_bytes) { + best_len = len; + // OnPair caps `bits ≤ 16`, so dict ids fit in u16. + best_id = u16::try_from(id) + .unwrap_or_else(|_| vortex_panic!("OnPair dict id > u16::MAX")); + } + } + if best_len == 0 { + return None; + } + tokens.push(best_id); + pos += best_len; + } + Some(tokens) +} + +// `LIKE 'prefix%'` could *not* use a token-prefix shortcut: the LPM of +// the row's leading bytes may merge what would otherwise be two prefix +// tokens into a single longer token whose end extends past the literal +// prefix. The byte-streaming check in `compute/like.rs::row_starts_with` +// is the correct minimum-work option. 
+ +#[cfg(test)] +mod tests { + use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::VarBinArray; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + + use super::*; + use crate::DEFAULT_DICT12_CONFIG; + use crate::decode::OwnedDecodeInputs; + use crate::onpair_compress; + + fn build_array(strings: &[&str]) -> OwnedDecodeInputs { + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + OwnedDecodeInputs::collect(arr.as_view(), &mut ctx).unwrap() + } + + #[test] + fn tokenise_round_trip() { + let strings: Vec = (0..200).map(|i| format!("row-{i:04}-tail")).collect(); + let str_refs: Vec<&str> = strings.iter().map(String::as_str).collect(); + let inputs = build_array(&str_refs); + let dv = inputs.view(); + let index = DictIndex::build(&dv); + + for s in &strings { + let needle = s.as_bytes(); + let toks = tokenize_needle(&dv, &index, needle).expect("LPM must tokenise"); + // Round-trip: decode the token sequence back to bytes. + let mut decoded = Vec::with_capacity(needle.len()); + for &t in &toks { + let entry = dv.dict_table[t as usize]; + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + decoded.extend_from_slice(&dv.dict_bytes[off..off + len]); + } + assert_eq!(decoded, needle, "LPM didn't reconstruct {s:?}"); + } + } + + #[test] + fn tokenise_prefix_matches_row_prefix() { + let strings: &[&str] = &[ + "https://example.com/items/0001", + "https://example.com/items/0002", + "https://example.com/users/abc", + "ftp://other.example.com/x", + ]; + let inputs = build_array(strings); + let dv = inputs.view(); + let index = DictIndex::build(&dv); + + // Prefixes that should tokenise and match the right rows. 
+ let pfx = b"https://example.com/items/"; + let pfx_toks = tokenize_needle(&dv, &index, pfx).expect("prefix must tokenise"); + // For each row, check whether its codes start with pfx_toks. + let codes_offsets = dv.codes_offsets; + let codes = dv.codes; + for (r, s) in strings.iter().enumerate() { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + let row_toks = &codes[lo..hi]; + let token_match = + row_toks.len() >= pfx_toks.len() && row_toks[..pfx_toks.len()] == pfx_toks[..]; + assert_eq!( + token_match, + s.as_bytes().starts_with(pfx), + "row {r} ({s:?}) prefix mismatch" + ); + } + } +} diff --git a/encodings/experimental/onpair/src/ops.rs b/encodings/experimental/onpair/src/ops.rs new file mode 100644 index 00000000000..55e6c77b1e0 --- /dev/null +++ b/encodings/experimental/onpair/src/ops.rs @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::varbin::varbin_scalar; +use vortex_array::scalar::Scalar; +use vortex_array::vtable::OperationsVTable; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::decode::OwnedDecodeInputs; + +impl OperationsVTable for OnPair { + fn scalar_at( + array: ArrayView<'_, OnPair>, + index: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let inputs = OwnedDecodeInputs::collect(array, ctx)?; + let dv = inputs.view(); + let mut buf: Vec = Vec::with_capacity(dv.decoded_len(index)); + dv.decode_row_into(index, &mut buf); + Ok(varbin_scalar(ByteBuffer::from(buf), array.dtype())) + } +} diff --git a/encodings/experimental/onpair/src/rules.rs b/encodings/experimental/onpair/src/rules.rs new file mode 100644 index 00000000000..279c160c1eb --- /dev/null +++ b/encodings/experimental/onpair/src/rules.rs @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex 
contributors + +use vortex_array::arrays::slice::SliceReduceAdaptor; +use vortex_array::optimizer::rules::ParentRuleSet; +use vortex_array::scalar_fn::fns::cast::CastReduceAdaptor; + +use crate::OnPair; + +pub(crate) static RULES: ParentRuleSet = ParentRuleSet::new(&[ + ParentRuleSet::lift(&SliceReduceAdaptor(OnPair)), + ParentRuleSet::lift(&CastReduceAdaptor(OnPair)), +]); diff --git a/encodings/experimental/onpair/src/slice.rs b/encodings/experimental/onpair/src/slice.rs new file mode 100644 index 00000000000..48f3d6b8d16 --- /dev/null +++ b/encodings/experimental/onpair/src/slice.rs @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Slicing an `OnPairArray` reuses the same dictionary blob, the full +//! `codes` child, and the full `dict_offsets` child. Only the +//! `codes_offsets` child (narrowed to `[start, end + 1)`), the +//! `uncompressed_lengths` child (narrowed to `[start, end)`) and the +//! optional validity child change. No decode, no re-training. 
+ +use std::ops::Range; + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::IntoArray; +use vortex_array::arrays::slice::SliceReduce; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; + +impl SliceReduce for OnPair { + fn slice(array: ArrayView<'_, Self>, range: Range) -> VortexResult> { + let codes_offsets = array.codes_offsets().slice(range.start..range.end + 1)?; + let uncompressed_lengths = array.uncompressed_lengths().slice(range.clone())?; + let validity = array.array_validity().slice(range)?; + Ok(Some( + unsafe { + OnPair::new_unchecked( + array.dtype().clone(), + array.dict_bytes_handle().clone(), + array.dict_offsets().clone(), + array.codes().clone(), + codes_offsets, + uncompressed_lengths, + validity, + array.bits(), + ) + } + .into_array(), + )) + } +} diff --git a/encodings/experimental/onpair/src/tests.rs b/encodings/experimental/onpair/src/tests.rs new file mode 100644 index 00000000000..02370b9d810 --- /dev/null +++ b/encodings/experimental/onpair/src/tests.rs @@ -0,0 +1,452 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::LazyLock; + +use prost::Message; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::match_each_integer_ptype; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::scalar_fn::fns::operators::Operator; +use 
vortex_array::session::ArraySession; +use vortex_array::test_harness::check_metadata; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_session::VortexSession; + +use crate::OnPair; +use crate::OnPairArrayExt; +use crate::OnPairMetadata; +use crate::compress::DEFAULT_DICT12_CONFIG; +use crate::compress::onpair_compress; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +fn sample_input() -> VarBinArray { + VarBinArray::from_iter( + [ + Some("https://www.example.com/page"), + Some("https://www.example.com/data"), + Some("https://www.test.org/page"), + Some("ftp://files.example.com/x"), + Some("https://www.example.com/page"), + ], + DType::Utf8(Nullability::NonNullable), + ) +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_metadata_golden() { + check_metadata( + "onpair.metadata", + &OnPairMetadata { + uncompressed_lengths_ptype: PType::I32 as i32, + bits: 12, + dict_size: 4096, + total_tokens: 128_000, + dict_offsets_ptype: PType::U32 as i32, + codes_ptype: PType::U16 as i32, + codes_offsets_ptype: PType::U32 as i32, + } + .encode_to_vec(), + ); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_roundtrip() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + + let compressed = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).expect("compress"); + assert!(compressed.clone().into_array().is::()); + + let mut ctx = SESSION.create_execution_ctx(); + let decoded = compressed + .into_array() + .execute::(&mut ctx) + .expect("canonicalize"); + + decoded + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), 5); + assert_eq!( + got[0].as_deref(), + Some(b"https://www.example.com/page".as_ref()) + ); + assert_eq!( + got[3].as_deref(), + Some(b"ftp://files.example.com/x".as_ref()) + ); + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[cfg_attr(miri, ignore)] 
+#[test] +fn test_onpair_nullable_canonicalize() { + let input = VarBinArray::from_iter( + [Some("a"), None, Some("bbb"), None, Some("ccccc")], + DType::Utf8(Nullability::Nullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let canonical = arr + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got[1], None); + assert_eq!(got[3], None); + assert_eq!(got[4].as_deref(), Some(b"ccccc".as_ref())); + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_scalar_at() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let s = arr.into_array().execute_scalar(2, &mut ctx).unwrap(); + let v = s.as_utf8().value().unwrap(); + assert_eq!(v.as_bytes(), b"https://www.test.org/page"); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_eq_pushdown() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let mut ctx = SESSION.create_execution_ctx(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG) + .unwrap() + .into_array(); + + let rhs = ConstantArray::new("https://www.example.com/page", arr.len()).into_array(); + let eq = arr + .binary(rhs, Operator::Eq) + .unwrap() + .execute::(&mut ctx) + .unwrap() + .into_array(); + assert_eq!(eq.as_bool_typed().true_count().unwrap(), 2); +} + +fn run_like(arr: &vortex_array::ArrayRef, pattern: &str) -> vortex_array::ArrayRef { + let n = arr.len(); + let pat = ConstantArray::new(pattern, n).into_array(); + let mut ctx = SESSION.create_execution_ctx(); + Like.try_new_array(n, 
LikeOptions::default(), [arr.clone(), pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_like_prefix() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG) + .unwrap() + .into_array(); + let result = run_like(&arr, "https://www.%"); + assert_eq!(result.as_bool_typed().true_count().unwrap(), 4); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_like_contains() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG) + .unwrap() + .into_array(); + let result = run_like(&arr, "%example.com%"); + assert_eq!(result.as_bool_typed().true_count().unwrap(), 4); +} + +/// The hot decode loop is 4×-unrolled with a scalar tail. Anything that +/// lands in the tail (1-3 leftover tokens, or zero total tokens) must +/// produce the same bytes as the unrolled body. Hit every row-count +/// near the boundary. 
+#[cfg_attr(miri, ignore)] +#[rstest::rstest] +#[case::n_1(1)] +#[case::n_2(2)] +#[case::n_3(3)] +#[case::n_4(4)] +#[case::n_5(5)] +#[case::n_7(7)] +#[case::n_8(8)] +#[case::n_9(9)] +fn test_onpair_unroll_tail_boundaries(#[case] n: usize) { + let words: &[&str] = &["a", "bb", "ccc", "https://www.example.com/x"]; + let strings: Vec<&str> = (0..n).map(|i| words[i % words.len()]).collect(); + let input = VarBinArray::from_iter( + strings.iter().map(|s| Some(*s)), + DType::Utf8(Nullability::NonNullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let canonical = arr + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), n); + for (i, expected) in strings.iter().enumerate() { + assert_eq!(got[i].as_deref(), Some(expected.as_bytes()), "n={n}, i={i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +/// Empty array — the unroll path must short-circuit cleanly. +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_empty() { + let input = VarBinArray::from_iter( + std::iter::empty::>(), + DType::Utf8(Nullability::NonNullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + assert_eq!(arr.len(), 0); + let mut ctx = SESSION.create_execution_ctx(); + let canonical = arr + .into_array() + .execute::(&mut ctx) + .unwrap(); + assert_eq!(canonical.len(), 0); +} + +/// Filter must share the dictionary — never recompress (this is the +/// regression cause on TPC-H Q22 SF=10). Exercise both selectivities +/// and check that the result is bit-exact and still an OnPairArray. 
+#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_filter_shares_dict() { + let n = 5_000usize; + let strings: Vec = (0..n) + .map(|i| format!("https://www.example.com/items/{i:08}")) + .collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let dict_bytes_before = arr.dict_bytes().clone(); + let dict_offsets_len_before = arr.dict_offsets().len(); + + // Keep every 7th row. + let keep: Vec = (0..n).map(|i| i % 7 == 0).collect(); + let mask = vortex_mask::Mask::from_iter(keep.iter().copied()); + let expected: Vec<&str> = strings + .iter() + .enumerate() + .filter_map(|(i, s)| keep[i].then_some(s.as_str())) + .collect(); + + let mut filter_ctx = SESSION.create_execution_ctx(); + let filtered = ::filter(arr.as_view(), &mask, &mut filter_ctx) + .unwrap() + .expect("OnPair filter must return Some"); + assert!( + filtered.is::(), + "filter dropped OnPair encoding: got {}", + filtered.encoding_id() + ); + let typed = filtered.try_downcast::().expect("OnPair"); + // Dict must be byte-identical with the input — no retrain, no copy. + assert_eq!(typed.dict_bytes().as_slice(), dict_bytes_before.as_slice()); + assert_eq!(typed.dict_offsets().len(), dict_offsets_len_before); + assert_eq!(typed.len(), expected.len()); + + let mut ctx = SESSION.create_execution_ctx(); + let canonical = typed + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), expected.len()); + for (i, want) in expected.iter().enumerate() { + assert_eq!(got[i].as_deref(), Some(want.as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +/// Rebuild an OnPair array, swapping `codes_offsets` for a narrowed +/// (smaller-ptype) primitive copy. 
Used by the narrowed-child +/// regression tests below. +#[expect(clippy::cognitive_complexity)] +fn narrow_codes_offsets(arr: &crate::OnPairArray, target: PType) -> crate::OnPairArray { + let view = arr.as_view(); + let mut ctx = SESSION.create_execution_ctx(); + let original = view + .codes_offsets() + .clone() + .execute::(&mut ctx) + .unwrap(); + + let narrowed_array = match_each_integer_ptype!(original.ptype(), |SRC| { + let src = original.as_slice::(); + match_each_integer_ptype!(target, |DST| { + let mut buf = BufferMut::::with_capacity(src.len()); + for &v in src { + #[allow( + clippy::unnecessary_cast, + reason = "macro-generated SRC may already be u64" + )] + buf.push(DST::try_from(v as u64).expect("value must fit in target ptype")); + } + PrimitiveArray::new(buf.freeze(), Validity::NonNullable).into_array() + }) + }); + + unsafe { + OnPair::new_unchecked( + view.dtype().clone(), + view.dict_bytes_handle().clone(), + view.dict_offsets().clone(), + view.codes().clone(), + narrowed_array, + view.uncompressed_lengths().clone(), + view.array_validity(), + view.bits(), + ) + } +} + +/// Regression: the cascading compressor can narrow `codes_offsets` +/// from u32 → u16 when every row's token count is small. The previous +/// `filter` impl read the child as `as_slice::()` and panicked +/// with `Other error: Attempted to get slice of type u32 from array +/// of type u16`. The fix dispatches via `match_each_integer_ptype!`. +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_filter_with_narrowed_codes_offsets_u16() { + let n = 200usize; + // Short rows so per-row token counts stay small and codes_offsets + // values fit in u16. (We narrow manually below regardless — this + // matches the shape the cascading compressor produces in the + // wild.) 
+ let strings: Vec = (0..n).map(|i| format!("r{:03}", i)).collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + + // Force `codes_offsets` to u16 so the panicking pre-fix + // `as_slice::()` would fire. + let arr = narrow_codes_offsets(&arr, PType::U16); + assert_eq!( + arr.as_view().codes_offsets().dtype().as_ptype(), + PType::U16, + "codes_offsets must be u16 to exercise the regression path" + ); + + let keep: Vec = (0..n).map(|i| i % 3 == 0).collect(); + let mask = vortex_mask::Mask::from_iter(keep.iter().copied()); + let expected: Vec<&str> = strings + .iter() + .enumerate() + .filter_map(|(i, s)| keep[i].then_some(s.as_str())) + .collect(); + + let mut filter_ctx = SESSION.create_execution_ctx(); + // Pre-fix: this call panics with "Attempted to get slice of type + // u32 from array of type u16". Post-fix: succeeds. + let filtered = ::filter(arr.as_view(), &mask, &mut filter_ctx) + .unwrap() + .expect("OnPair filter must return Some"); + let typed = filtered.try_downcast::().expect("OnPair"); + assert_eq!(typed.len(), expected.len()); + + let mut ctx = SESSION.create_execution_ctx(); + let canonical = typed + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), expected.len()); + for (i, want) in expected.iter().enumerate() { + assert_eq!(got[i].as_deref(), Some(want.as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +/// Same regression, narrowed to u8 (smallest possible ptype) — extra +/// coverage that the macro dispatch handles every integer ptype the +/// cascading compressor might pick. 
+#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_filter_with_narrowed_codes_offsets_u8() { + let n = 100usize; + let strings: Vec = (0..n).map(|i| format!("{i}")).collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let arr = narrow_codes_offsets(&arr, PType::U8); + assert_eq!(arr.as_view().codes_offsets().dtype().as_ptype(), PType::U8); + + let mask = vortex_mask::Mask::from_iter((0..n).map(|i| i % 2 == 0)); + + let mut filter_ctx = SESSION.create_execution_ctx(); + let filtered = ::filter(arr.as_view(), &mask, &mut filter_ctx) + .unwrap() + .expect("OnPair filter must return Some"); + assert_eq!(filtered.len(), n / 2); +} diff --git a/encodings/experimental/onpair/tests/big_data.rs b/encodings/experimental/onpair/tests/big_data.rs new file mode 100644 index 00000000000..0be025dcfc5 --- /dev/null +++ b/encodings/experimental/onpair/tests/big_data.rs @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! End-to-end smoke test on a realistically-sized input. Validates the +//! pure-Rust decode path and pushdown predicates end-to-end through the new +//! u16-codes layout. 
+ +#![allow( + clippy::cast_possible_truncation, + clippy::redundant_clone, + clippy::tests_outside_test_module, + clippy::use_debug +)] + +use std::sync::LazyLock; +use std::time::Instant; + +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_array::session::ArraySession; +use vortex_onpair::DEFAULT_DICT12_CONFIG; +use vortex_onpair::onpair_compress; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +fn corpus(n: usize) -> Vec { + let templates: &[&str] = &[ + "GET /api/v1/users/{id}/profile HTTP/1.1", + "POST /api/v1/users/{id}/sessions HTTP/1.1", + "GET /static/js/app.{id}.js HTTP/1.1", + "GET /static/css/app.{id}.css HTTP/1.1", + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "ftp://files.example.com/dump/{id}.tar.gz", + "ssh://deploy@build-{id}.internal:22", + "redis://cache-{id}.svc.cluster.local:6379", + "INFO request_id={id} method=GET status=200", + "WARN request_id={id} method=POST status=429", + "ERROR request_id={id} method=PUT status=500", + ]; + let mut out = Vec::with_capacity(n); + let mut state = 0x9e37_79b9_7f4a_7c15_u64; + for _ in 0..n { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let pick = (state as usize) % templates.len(); + let id = state as u32; + out.push(templates[pick].replace("{id}", &format!("{:08x}", id))); + } + 
out +} + +#[test] +#[cfg_attr(miri, ignore)] +fn smoke_100k_rows() { + let n = 100_000; + let strings = corpus(n); + let raw_bytes: usize = strings.iter().map(|s| s.len()).sum(); + + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + + let t0 = Instant::now(); + let arr = onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG) + .expect("compress"); + let compress_elapsed = t0.elapsed(); + let bits = arr.bits(); + eprintln!( + "compressed {} rows ({} raw bytes) in {:?}, bits={}", + n, raw_bytes, compress_elapsed, bits + ); + + let arr_ref = arr.into_array(); + let mut ctx = SESSION.create_execution_ctx(); + + // Full canonical round-trip via the pure-Rust decoder. + let t0 = Instant::now(); + let decoded = arr_ref + .clone() + .execute::(&mut ctx) + .expect("canonicalize"); + eprintln!("canonicalized in {:?}", t0.elapsed()); + + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + let want = strings[i].as_bytes(); + assert_eq!(got, Some(want), "row {} mismatch", i); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + eprintln!("roundtrip OK on all {} rows", n); + + // Equality pushdown: pick a specific row's value and ensure the kernel + // finds all occurrences. + let needle_row = 42; + let needle = strings[needle_row].clone(); + let want_eq = strings.iter().filter(|s| **s == needle).count(); + let eq = arr_ref + .binary( + ConstantArray::new(needle.as_str(), n).into_array(), + Operator::Eq, + ) + .unwrap() + .execute::(&mut ctx) + .unwrap() + .into_array(); + assert_eq!(eq.as_bool_typed().true_count().unwrap(), want_eq); + eprintln!("eq pushdown matches reference count ({})", want_eq); + + // Prefix pushdown. 
+ let prefix = "https://www."; + let want_prefix = strings.iter().filter(|s| s.starts_with(prefix)).count(); + let pat = ConstantArray::new(format!("{prefix}%").as_str(), n).into_array(); + let got_prefix = Like + .try_new_array(n, LikeOptions::default(), [arr_ref.clone(), pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() + .as_bool_typed() + .true_count() + .unwrap(); + assert_eq!(got_prefix, want_prefix); + eprintln!("starts_with pushdown matches reference ({})", want_prefix); + + // Contains pushdown. + let sub = "status=500"; + let want_sub = strings.iter().filter(|s| s.contains(sub)).count(); + let pat = ConstantArray::new(format!("%{sub}%").as_str(), n).into_array(); + let got_sub = Like + .try_new_array(n, LikeOptions::default(), [arr_ref.clone(), pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() + .as_bool_typed() + .true_count() + .unwrap(); + assert_eq!(got_sub, want_sub); + eprintln!("contains pushdown matches reference ({})", want_sub); +} diff --git a/vortex-btrblocks/Cargo.toml b/vortex-btrblocks/Cargo.toml index 40b0ae52aae..493c1684318 100644 --- a/vortex-btrblocks/Cargo.toml +++ b/vortex-btrblocks/Cargo.toml @@ -30,6 +30,7 @@ vortex-error = { workspace = true } vortex-fastlanes = { workspace = true } vortex-fsst = { workspace = true } vortex-mask = { workspace = true } +vortex-onpair = { workspace = true, optional = true } vortex-pco = { workspace = true, optional = true } vortex-runend = { workspace = true } vortex-sequence = { workspace = true } @@ -48,7 +49,7 @@ vortex-session = { workspace = true } [features] # This feature enabled unstable encodings for which we don't guarantee stability. 
-unstable_encodings = ["dep:vortex-tensor", "vortex-zstd?/unstable_encodings"] +unstable_encodings = ["dep:vortex-tensor", "dep:vortex-onpair", "vortex-zstd?/unstable_encodings"] pco = ["dep:pco", "dep:vortex-pco"] zstd = ["dep:vortex-zstd"] diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index ab77f625764..5344b9e94eb 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -53,7 +53,8 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[ // String schemes. //////////////////////////////////////////////////////////////////////////////////////////////// &string::StringDictScheme, - &string::FSSTScheme, + #[cfg(feature = "unstable_encodings")] + &string::OnPairScheme, &string::StringConstantScheme, &string::NullDominatedSparseScheme, // Decimal schemes. @@ -168,14 +169,21 @@ impl BtrBlocksCompressorBuilder { /// preserves the array buffer layout for zero-conversion GPU decompression. Without it, /// interleaved Zstd compression is used. pub fn only_cuda_compatible(self) -> Self { - let builder = self.exclude_schemes([ + // String fragmentation schemes (OnPair, FSST) require host-side + // dictionary expansion at decode time, which is incompatible with + // pure-GPU decompression paths. Strip whichever string-fragment + // scheme is enabled by feature. 
+ #[cfg_attr(not(feature = "unstable_encodings"), allow(unused_mut))] + let mut excluded: Vec = vec![ integer::SparseScheme.id(), integer::IntRLEScheme.id(), float::FloatRLEScheme.id(), float::NullDominatedSparseScheme.id(), string::StringDictScheme.id(), - string::FSSTScheme.id(), - ]); + ]; + #[cfg(feature = "unstable_encodings")] + excluded.push(string::OnPairScheme.id()); + let builder = self.exclude_schemes(excluded); #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] let builder = builder.with_new_scheme(&string::ZstdBuffersScheme); diff --git a/vortex-btrblocks/src/schemes/string.rs b/vortex-btrblocks/src/schemes/string.rs index ade42f88668..071c4c398e3 100644 --- a/vortex-btrblocks/src/schemes/string.rs +++ b/vortex-btrblocks/src/schemes/string.rs @@ -21,6 +21,14 @@ use vortex_fsst::FSST; use vortex_fsst::FSSTArrayExt; use vortex_fsst::fsst_compress; use vortex_fsst::fsst_train_compressor; +#[cfg(feature = "unstable_encodings")] +use vortex_onpair::DEFAULT_DICT12_CONFIG; +#[cfg(feature = "unstable_encodings")] +use vortex_onpair::OnPair; +#[cfg(feature = "unstable_encodings")] +use vortex_onpair::OnPairArrayExt; +#[cfg(feature = "unstable_encodings")] +use vortex_onpair::onpair_compress; use vortex_sparse::Sparse; use vortex_sparse::SparseExt as _; @@ -33,9 +41,26 @@ use crate::Scheme; use crate::SchemeExt; /// FSST (Fast Static Symbol Table) compression. +/// +/// Retained for callers that want to opt back in via +/// [`BtrBlocksCompressorBuilder::with_new_scheme`]; it is **not** part of the +/// default [`ALL_SCHEMES`] anymore — the default string-fragmentation slot is +/// filled by [`OnPairScheme`]. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct FSSTScheme; +/// OnPair short-string compression (dict-12). +/// +/// The default string-fragmentation scheme — targets large columns of +/// short-to-medium strings with high lexical overlap, like URLs or log lines. 
+/// Uses a learned dictionary of frequent adjacent substrings (built by the +/// OnPair C++ trainer at compress time) and 12-bit token codes stored as a +/// u16 child, with offsets / uncompressed-lengths flowing through the +/// cascading compressor like any other primitive children. +#[cfg(feature = "unstable_encodings")] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct OnPairScheme; + /// Sparse encoding for null-dominated arrays. /// /// This is the same as the integer `SparseScheme`, but we only use this for null-dominated arrays. @@ -138,6 +163,114 @@ impl Scheme for FSSTScheme { } } +#[cfg(feature = "unstable_encodings")] +impl Scheme for OnPairScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.onpair" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + /// Four slot children. The dictionary blob lives as a raw byte buffer + /// on the OnPair array — it is not a primitive slot child, so the + /// cascading compressor doesn't recompress it. + /// + /// 4 primitive slot children flow through the cascading compressor: + /// `dict_offsets` (u32 → typically `FoR`/`BitPacked`), `codes` (u16 → + /// `FastLanes::BitPacked` to exactly `bits` = 12 by default), + /// `codes_offsets` (u32 → `FoR`), `uncompressed_lengths` (i32 → narrow + /// + `FoR`). Validity stays untouched. 
+ fn num_children(&self) -> usize { + 4 + } + + fn expected_compression_ratio( + &self, + _data: &ArrayAndStats, + _compress_ctx: CompressorContext, + _exec_ctx: &mut ExecutionCtx, + ) -> CompressionEstimate { + CompressionEstimate::Deferred(DeferredEstimate::Sample) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &ArrayAndStats, + compress_ctx: CompressorContext, + exec_ctx: &mut ExecutionCtx, + ) -> VortexResult { + let utf8 = data.array_as_utf8().into_owned(); + let onpair_array = onpair_compress(&utf8, utf8.len(), utf8.dtype(), DEFAULT_DICT12_CONFIG)?; + + let dict_offsets = compress_primitive_child( + compressor, + onpair_array.dict_offsets(), + &compress_ctx, + self.id(), + 0, + exec_ctx, + )?; + let codes = compress_primitive_child( + compressor, + onpair_array.codes(), + &compress_ctx, + self.id(), + 1, + exec_ctx, + )?; + let codes_offsets = compress_primitive_child( + compressor, + onpair_array.codes_offsets(), + &compress_ctx, + self.id(), + 2, + exec_ctx, + )?; + let uncompressed_lengths = compress_primitive_child( + compressor, + onpair_array.uncompressed_lengths(), + &compress_ctx, + self.id(), + 3, + exec_ctx, + )?; + + Ok(OnPair::try_new( + onpair_array.dtype().clone(), + onpair_array.dict_bytes_handle().clone(), + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + onpair_array.array_validity(), + onpair_array.bits(), + )? + .into_array()) + } +} + +/// Narrow a primitive child to its tightest int type, then forward it to +/// the cascading compressor. +#[cfg(feature = "unstable_encodings")] +fn compress_primitive_child( + compressor: &CascadingCompressor, + child: &ArrayRef, + compress_ctx: &CompressorContext, + scheme_id: vortex_compressor::scheme::SchemeId, + child_idx: usize, + exec_ctx: &mut ExecutionCtx, +) -> VortexResult { + let narrowed = child + .clone() + .execute::(exec_ctx)? + .narrow()? 
+ .into_array(); + compressor.compress_child(&narrowed, compress_ctx, scheme_id, child_idx, exec_ctx) +} + impl Scheme for NullDominatedSparseScheme { fn scheme_name(&self) -> &'static str { "vortex.string.sparse" @@ -411,8 +544,25 @@ mod scheme_selection_tests { Ok(()) } + #[cfg(feature = "unstable_encodings")] + #[test] + fn test_onpair_in_default_scheme_list() { + use crate::SchemeExt; + use crate::schemes::string::OnPairScheme; + + let ids: Vec<_> = crate::ALL_SCHEMES.iter().map(|s| s.id()).collect(); + assert!( + ids.contains(&OnPairScheme.id()), + "OnPairScheme not registered in ALL_SCHEMES" + ); + } + + #[cfg(feature = "unstable_encodings")] #[test] - fn test_fsst_compressed() -> VortexResult<()> { + fn test_onpair_compressed() -> VortexResult<()> { + // Dictionary-style string corpus: high lexical overlap, short rows. + // OnPair is the only string-fragmentation scheme in the default + // builder, so it should win the sample-based comparison. let mut strings = Vec::with_capacity(1000); for i in 0..1000 { strings.push(Some(format!( @@ -423,7 +573,49 @@ mod scheme_selection_tests { let array_ref = array.into_array(); let compressed = BtrBlocksCompressor::default() .compress(&array_ref, &mut SESSION.create_execution_ctx())?; - assert!(compressed.is::()); + assert!( + compressed.is::(), + "expected OnPair, got {}", + compressed.encoding_id() + ); + Ok(()) + } + + /// FSST is no longer in the default scheme list, but `with_new_scheme` + /// still lets callers opt it back in. + #[test] + fn test_fsst_opt_in_still_works() -> VortexResult<()> { + use crate::BtrBlocksCompressorBuilder; + use crate::SchemeExt; + use crate::schemes::string::FSSTScheme; + + // FSST must not be registered by default. + assert!( + !crate::ALL_SCHEMES.iter().any(|s| s.id() == FSSTScheme.id()), + "FSSTScheme should not be in ALL_SCHEMES anymore", + ); + + // ...but explicitly adding it back should still produce a compressor + // that returns an FSST array for FSST-favourable input. 
Start from an + // empty builder so the sample-based comparison can't pick OnPair. + let mut strings = Vec::with_capacity(1000); + for i in 0..1000 { + strings.push(Some(format!( + "this_is_a_common_prefix_with_some_variation_{i}_and_a_common_suffix_pattern" + ))); + } + let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); + let array_ref = array.into_array(); + + let compressor = BtrBlocksCompressorBuilder::empty() + .with_new_scheme(&FSSTScheme) + .build(); + let compressed = compressor.compress(&array_ref, &mut SESSION.create_execution_ctx())?; + assert!( + compressed.is::(), + "expected FSST when only FSSTScheme is registered, got {}", + compressed.encoding_id() + ); Ok(()) } } diff --git a/vortex-btrblocks/tests/onpair_roundtrip.rs b/vortex-btrblocks/tests/onpair_roundtrip.rs new file mode 100644 index 00000000000..63b47355b4c --- /dev/null +++ b/vortex-btrblocks/tests/onpair_roundtrip.rs @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! End-to-end round-trip through the full Vortex compressor + decompressor +//! on string arrays. Lives in `vortex-btrblocks` (gated on `unstable_encodings`) +//! so it exercises the same code path the file writer uses, not just the +//! OnPair crate in isolation. 
+ +#![cfg(feature = "unstable_encodings")] +#![allow( + clippy::cast_possible_truncation, + clippy::tests_outside_test_module, + clippy::use_debug +)] + +use std::sync::LazyLock; + +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::session::ArraySession; +use vortex_btrblocks::BtrBlocksCompressor; +use vortex_onpair::OnPair; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +/// Helper: synthetic short-string corpus that the cascading compressor should +/// route through OnPair. +fn corpus(n: usize) -> Vec { + let templates: &[&str] = &[ + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "https://www.example.com/users/{id}/profile", + "INFO request_id={id} status=200 method=GET", + "WARN request_id={id} status=429 method=POST", + "ERROR request_id={id} status=500 method=PUT", + ]; + let mut out = Vec::with_capacity(n); + let mut state = 0x9e37_79b9_7f4a_7c15_u64; + for _ in 0..n { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let pick = (state as usize) % templates.len(); + let id = state as u32; + out.push(templates[pick].replace("{id}", &format!("{:08x}", id))); + } + out +} + +#[test] +fn nonnullable_roundtrip_via_default_compressor() { + let n = 4096; + let strings = corpus(n); + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + assert!( + compressed.is::(), + "expected OnPair, got {}", + compressed.encoding_id() + ); + + let decoded = 
compressed + .execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + assert_eq!( + got, + Some(strings[i].as_bytes()), + "mismatch at row {i}: got {:?}", + got.map(|b| String::from_utf8_lossy(b).into_owned()), + ); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[test] +fn nullable_roundtrip_via_default_compressor() { + let n = 2048; + let strings: Vec> = corpus(n) + .into_iter() + .enumerate() + .map(|(i, s)| (i % 7 != 0).then_some(s)) + .collect(); + + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| s.as_deref()), + DType::Utf8(Nullability::Nullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + // Don't assert OnPair specifically here — the sample-based selector may + // pick a different scheme on tiny inputs. What matters is the round-trip. + + let decoded = compressed + .execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + let want = strings[i].as_deref().map(str::as_bytes); + assert_eq!(got, want, "mismatch at row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[test] +fn empty_and_short_string_roundtrip() { + // Edge cases: empty strings interleaved with short ones. 
+ let strings = vec!["", "a", "", "bb", "ccc", "", "dddd", "eeeee", ""]; + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(*s)), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + let decoded = compressed + .execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + decoded + .with_iterator(|iter| { + let got: Vec<_> = iter.collect(); + for (i, want) in strings.iter().enumerate() { + assert_eq!(got[i], Some(want.as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} diff --git a/vortex-file/Cargo.toml b/vortex-file/Cargo.toml index 77d664a12cb..c4bf980d683 100644 --- a/vortex-file/Cargo.toml +++ b/vortex-file/Cargo.toml @@ -46,6 +46,7 @@ vortex-io = { workspace = true } vortex-layout = { workspace = true } vortex-mask = { workspace = true } vortex-metrics = { workspace = true } +vortex-onpair = { workspace = true, optional = true } vortex-pco = { workspace = true } vortex-runend = { workspace = true } vortex-scan = { workspace = true } @@ -78,6 +79,7 @@ tokio = [ zstd = ["dep:vortex-zstd", "vortex-btrblocks/zstd", "vortex-btrblocks/pco"] # This feature enables unstable encodings for which we don't guarantee stability. 
unstable_encodings = [ + "dep:vortex-onpair", "dep:vortex-tensor", "vortex-zstd?/unstable_encodings", "vortex-btrblocks/unstable_encodings", diff --git a/vortex-file/src/lib.rs b/vortex-file/src/lib.rs index ce6598173a6..da38b7944f4 100644 --- a/vortex-file/src/lib.rs +++ b/vortex-file/src/lib.rs @@ -115,6 +115,8 @@ use vortex_array::arrays::patched::use_experimental_patches; use vortex_array::session::ArraySessionExt; use vortex_bytebool::ByteBool; use vortex_fsst::FSST; +#[cfg(feature = "unstable_encodings")] +use vortex_onpair::OnPair; use vortex_pco::Pco; use vortex_session::VortexSession; use vortex_sparse::Sparse; @@ -163,6 +165,8 @@ pub fn register_default_encodings(session: &VortexSession) { arrays.register(ByteBool); arrays.register(Dict); arrays.register(FSST); + #[cfg(feature = "unstable_encodings")] + arrays.register(OnPair); arrays.register(Pco); arrays.register(Sparse); arrays.register(ZigZag); diff --git a/vortex-file/src/strategy.rs b/vortex-file/src/strategy.rs index 71c72ffc904..6f120629e8c 100644 --- a/vortex-file/src/strategy.rs +++ b/vortex-file/src/strategy.rs @@ -52,6 +52,8 @@ use vortex_layout::layouts::repartition::RepartitionWriterOptions; use vortex_layout::layouts::table::TableStrategy; use vortex_layout::layouts::zoned::writer::ZonedLayoutOptions; use vortex_layout::layouts::zoned::writer::ZonedStrategy; +#[cfg(feature = "unstable_encodings")] +use vortex_onpair::OnPair; use vortex_pco::Pco; use vortex_runend::RunEnd; use vortex_sequence::Sequence; @@ -100,6 +102,8 @@ pub static ALLOWED_ENCODINGS: LazyLock> = LazyLock::new(|| { allowed.insert(Delta.id()); allowed.insert(FoR.id()); allowed.insert(FSST.id()); + #[cfg(feature = "unstable_encodings")] + allowed.insert(OnPair.id()); allowed.insert(Pco.id()); allowed.insert(RLE.id()); allowed.insert(RunEnd.id()); diff --git a/vortex-file/tests/test_onpair_string_roundtrip.rs b/vortex-file/tests/test_onpair_string_roundtrip.rs new file mode 100644 index 00000000000..803c6869e46 --- /dev/null 
+++ b/vortex-file/tests/test_onpair_string_roundtrip.rs @@ -0,0 +1,404 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Round-trip stress tests for OnPair through the full Vortex file writer + +//! reader. Mirrors the call shape `vortex-bench/src/conversions.rs` uses and +//! the multi-column, many-chunk pattern of TPC-H tables (`supplier_0.vortex` +//! is the file from which CI surfaced +//! `Misaligned buffer cannot be used to build PrimitiveArray of u32`). + +#![cfg(feature = "unstable_encodings")] +#![expect( + clippy::cast_possible_truncation, + clippy::tests_outside_test_module, + clippy::redundant_clone +)] + +use std::sync::LazyLock; + +use futures::StreamExt; +use futures::pin_mut; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::aggregate_fn::session::AggregateFnSession; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::struct_::StructArrayExt; +use vortex_array::dtype::DType; +use vortex_array::dtype::FieldNames; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::session::DTypeSession; +use vortex_array::optimizer::kernels::ArrayKernels; +use vortex_array::scalar_fn::session::ScalarFnSession; +use vortex_array::session::ArraySession; +use vortex_array::validity::Validity; +use vortex_buffer::ByteBuffer; +use vortex_file::OpenOptionsSessionExt; +use vortex_file::WriteOptionsSessionExt; +use vortex_io::session::RuntimeSession; +use vortex_layout::session::LayoutSession; +use vortex_session::VortexSession; + +/// Full default Vortex session — the same set of sub-sessions +/// `vortex::VortexSession::default()` would install, plus +/// `register_default_encodings`. Built inline here because `vortex-file` +/// can't depend on the umbrella `vortex` crate (it's the other way round). 
+static SESSION: LazyLock = LazyLock::new(|| {
+    // NOTE(review): in this copy of the patch `LazyLock` has no type parameter
+    // and every turbofish below is empty (`.with::()`); the type arguments
+    // look like they were lost in extraction — confirm against the real file.
+    let session = VortexSession::empty()
+        .with::()
+        .with::()
+        .with::()
+        .with::()
+        .with::()
+        .with::()
+        .with::();
+    // Encodings are registered on the session before it is handed out.
+    vortex_file::register_default_encodings(&session);
+    session
+});
+
+/// Build `n` deterministic test strings.
+///
+/// Each string is one of seven URL / log-line templates with `{id}` replaced
+/// by a zero-padded, lowercase hex id of at least eight digits. There is no
+/// external randomness: the same `(n, offset)` always yields the same output,
+/// and `offset` only shifts the starting state.
+fn corpus(n: usize, offset: u64) -> Vec {
+    let templates: &[&str] = &[
+        "https://www.example.com/products/{id}",
+        "https://cdn.example.com/img/{id}.webp",
+        "https://api.example.com/v2/orders/{id}",
+        "https://www.example.com/users/{id}/profile",
+        "INFO request_id={id} status=200 method=GET",
+        "WARN request_id={id} status=429 method=POST",
+        "ERROR request_id={id} status=500 method=PUT",
+    ];
+    let mut out = Vec::with_capacity(n);
+    // Fixed seed; `offset` is added with wrap-around so any `u64` is accepted.
+    let mut state = 0x9e37_79b9_7f4a_7c15_u64.wrapping_add(offset);
+    for _ in 0..n {
+        // One wrapping multiply-add step of the generator per output string.
+        state = state
+            .wrapping_mul(6364136223846793005)
+            .wrapping_add(1442695040888963407);
+        // The same state picks the template (modulo the template count) and,
+        // truncated to 32 bits, supplies the id.
+        let pick = (state as usize) % templates.len();
+        let id = state as u32;
+        out.push(templates[pick].replace("{id}", &format!("{id:08x}")));
+    }
+    out
+}
+
+/// Write `data` to an in-memory `Vec` using the **full default Vortex
+/// compressor** (`WriteStrategyBuilder::default()` =
+/// `BtrBlocksCompressor::default()` cascading through every registered
+/// scheme, including OnPair), then open the resulting bytes via
+/// `OpenOptions::open_buffer` and stream every chunk back.
+///
+/// The returned chunks are in the order the scan stream yielded them.
+///
+/// # Panics
+///
+/// Panics (via `expect`) if the write, the open, the scan setup, or any
+/// streamed chunk returns an error.
+async fn write_and_read_back(data: vortex_array::ArrayRef) -> Vec {
+    // `write_options()` builds a `VortexWriteOptions` whose `strategy` is
+    // `WriteStrategyBuilder::default().build()` — the same path `vortex-bench`
+    // uses for Parquet → Vortex conversion. No custom strategy injected.
+    let mut bytes = Vec::new();
+    SESSION
+        .write_options()
+        .write(&mut bytes, data.to_array_stream())
+        .await
+        .expect("write Vortex file");
+
+    // Read back from the in-memory byte buffer; no disk, no FS.
+    let bytes = ByteBuffer::from(bytes);
+    let vxf = SESSION.open_options().open_buffer(bytes).expect("open");
+
+    let stream = vxf
+        .scan()
+        .expect("scan")
+        .into_stream()
+        .expect("into_stream");
+    // Pinned in place so `stream.next()` can be called in the loop below.
+    pin_mut!(stream);
+
+    let mut chunks = Vec::new();
+    while let Some(chunk) = stream.next().await {
+        chunks.push(chunk.expect("chunk"));
+    }
+    chunks
+}
+
+/// Single string column, single chunk. The simplest case.
+#[tokio::test]
+async fn single_column_single_chunk() {
+    let n = 4096usize;
+    let strings = corpus(n, 0);
+    let str_array = VarBinViewArray::from_iter(
+        strings.iter().map(|s| Some(s.as_str())),
+        DType::Utf8(Nullability::NonNullable),
+    )
+    .into_array();
+    let data = StructArray::new(
+        FieldNames::from(["url"]),
+        vec![str_array],
+        n,
+        Validity::NonNullable,
+    )
+    .into_array();
+
+    let chunks = write_and_read_back(data).await;
+    // `row` keeps counting across chunks, so values, order and (at the end)
+    // the total row count are all checked against `strings`.
+    let mut row = 0;
+    for chunk in chunks {
+        // NOTE(review): the type arguments of `try_downcast` / `execute` are
+        // missing in this copy of the patch — confirm against the real file.
+        let strct = chunk
+            .try_downcast::()
+            .expect("Struct");
+        // Field 0 is the only column written above (`url`).
+        let url = strct.unmasked_field(0).clone();
+        let mut ctx = SESSION.create_execution_ctx();
+        let url = url.execute::(&mut ctx).expect("canon");
+        url.with_iterator(|iter| {
+            for b in iter {
+                assert_eq!(b, Some(strings[row].as_bytes()), "row {row}");
+                row += 1;
+            }
+            Ok::<_, vortex_error::VortexError>(())
+        })
+        .unwrap();
+    }
+    assert_eq!(row, n);
+}
+
+/// Many rows → many chunks via the writer's default row_block_size.
+#[tokio::test]
+async fn single_column_many_chunks() {
+    // Same shape as the single-chunk test, only with 50,000 rows.
+    let n = 50_000usize;
+    let strings = corpus(n, 0);
+    let str_array = VarBinViewArray::from_iter(
+        strings.iter().map(|s| Some(s.as_str())),
+        DType::Utf8(Nullability::NonNullable),
+    )
+    .into_array();
+    let data = StructArray::new(
+        FieldNames::from(["url"]),
+        vec![str_array],
+        n,
+        Validity::NonNullable,
+    )
+    .into_array();
+
+    let chunks = write_and_read_back(data).await;
+    // `row` keeps counting across chunks, so the chunks must come back in
+    // order and together cover exactly `n` rows.
+    let mut row = 0;
+    for chunk in chunks {
+        // NOTE(review): the type arguments of `try_downcast` / `execute` are
+        // missing in this copy of the patch — confirm against the real file.
+        let strct = chunk
+            .try_downcast::()
+            .expect("Struct");
+        // Field 0 is the only column written above (`url`).
+        let url = strct.unmasked_field(0).clone();
+        let mut ctx = SESSION.create_execution_ctx();
+        let url = url.execute::(&mut ctx).expect("canon");
+        url.with_iterator(|iter| {
+            for b in iter {
+                assert_eq!(b, Some(strings[row].as_bytes()), "row {row}");
+                row += 1;
+            }
+            Ok::<_, vortex_error::VortexError>(())
+        })
+        .unwrap();
+    }
+    assert_eq!(row, n);
+}
+
+/// TPC-H supplier-shaped table: 5 string columns + a primary key + a
+/// foreign key + a decimal/integer, with the row count large enough to
+/// exercise multiple chunks. This is the configuration that surfaced the
+/// `Misaligned buffer` error in CI.
+#[tokio::test]
+async fn tpch_supplier_shape() {
+    let n = 32_000usize;
+    // A different `corpus` offset per column, so each column starts from a
+    // different generator state.
+    let names = corpus(n, 1);
+    let addresses = corpus(n, 2);
+    let phones = corpus(n, 3);
+    let comments = corpus(n, 4);
+    let cities = corpus(n, 5);
+
+    // NOTE(review): the element types of these `Vec`s (and the turbofish
+    // arguments further down) are missing in this copy of the patch — confirm
+    // against the real file.
+    let suppkey: Vec = (0..n as i64).collect();
+    let nationkey: Vec = (0..n as i32).map(|i| i % 25).collect();
+    let acctbal: Vec = (0..n as i64).map(|i| (i * 13) % 1_000_000).collect();
+
+    // Builds a non-nullable UTF-8 column from a slice of owned strings.
+    let mk_str = |v: &[String]| -> vortex_array::ArrayRef {
+        VarBinViewArray::from_iter(
+            v.iter().map(|s| Some(s.as_str())),
+            DType::Utf8(Nullability::NonNullable),
+        )
+        .into_array()
+    };
+
+    // Field order matters: the read-back below addresses columns by index.
+    let data = StructArray::new(
+        FieldNames::from([
+            "s_suppkey",
+            "s_name",
+            "s_address",
+            "s_nationkey",
+            "s_phone",
+            "s_acctbal",
+            "s_comment",
+            "s_city",
+        ]),
+        vec![
+            PrimitiveArray::from_iter(suppkey.iter().copied()).into_array(),
+            mk_str(&names),
+            mk_str(&addresses),
+            PrimitiveArray::from_iter(nationkey.iter().copied()).into_array(),
+            mk_str(&phones),
+            PrimitiveArray::from_iter(acctbal.iter().copied()).into_array(),
+            mk_str(&comments),
+            mk_str(&cities),
+        ],
+        n,
+        Validity::NonNullable,
+    )
+    .into_array();
+
+    let chunks = write_and_read_back(data).await;
+
+    let mut row = 0;
+    for chunk in chunks {
+        let strct = chunk
+            .try_downcast::()
+            .expect("Struct");
+        let chunk_len = strct.as_ref().len();
+        let mut ctx = SESSION.create_execution_ctx();
+
+        // Only the five string columns (indices 1, 2, 4, 6, 7) are read back
+        // and compared; the numeric columns at 0, 3 and 5 are written but not
+        // checked here.
+        let name = strct
+            .unmasked_field(1)
+            .clone()
+            .execute::(&mut ctx)
+            .unwrap();
+        let address = strct
+            .unmasked_field(2)
+            .clone()
+            .execute::(&mut ctx)
+            .unwrap();
+        let phone = strct
+            .unmasked_field(4)
+            .clone()
+            .execute::(&mut ctx)
+            .unwrap();
+        let comment = strct
+            .unmasked_field(6)
+            .clone()
+            .execute::(&mut ctx)
+            .unwrap();
+        let city = strct
+            .unmasked_field(7)
+            .clone()
+            .execute::(&mut ctx)
+            .unwrap();
+
+        for (s, want) in [
+            (&name, &names),
+            (&address, &addresses),
+            (&phone, &phones),
+            (&comment, &comments),
+            (&city, &cities),
+        ] {
+            // Every column of this chunk starts at the same global row, so
+            // `row` is advanced once per chunk (below), not once per column.
+            let base = row;
+            s.with_iterator(|iter| {
+                for (i, b) in iter.enumerate() {
+                    assert_eq!(b, Some(want[base + i].as_bytes()), "row {}", base + i);
+                }
+                Ok::<_, vortex_error::VortexError>(())
+            })
+            .unwrap();
+        }
+        row += chunk_len;
+    }
+    assert_eq!(row, n);
+}
+
+/// Ten short fixed words (1–7 bytes each) cycled over 20,000 rows, where the
+/// dictionary blob length is unlikely to be a multiple of 4. Earlier buffer
+/// orderings (dict_bytes first) tripped the segment writer's
+/// first-buffer-only alignment, surfacing
+/// `Misaligned buffer cannot be used to build PrimitiveArray of u32` on
+/// read.
+#[tokio::test]
+async fn odd_dict_length_alignment() {
+    let words: &[&str] = &[
+        "a", "bb", "ccc", "dddd", "eeeee", "fffff", "ggggggg", "h", "ii", "jjj",
+    ];
+    let n = 20_000usize;
+    // Row `i` holds `words[i % words.len()]`, i.e. the word list repeated.
+    let strings: Vec<&str> = (0..n).map(|i| words[i % words.len()]).collect();
+    let str_array = VarBinViewArray::from_iter(
+        strings.iter().map(|s| Some(*s)),
+        DType::Utf8(Nullability::NonNullable),
+    )
+    .into_array();
+    let data = StructArray::new(
+        FieldNames::from(["w"]),
+        vec![str_array],
+        n,
+        Validity::NonNullable,
+    )
+    .into_array();
+
+    let chunks = write_and_read_back(data).await;
+    // `row` keeps counting across chunks, so values, order and the total row
+    // count are all checked against `strings`.
+    let mut row = 0;
+    for chunk in chunks {
+        let strct = chunk
+            .try_downcast::()
+            .expect("Struct");
+        let mut ctx = SESSION.create_execution_ctx();
+        let s = strct
+            .unmasked_field(0)
+            .clone()
+            .execute::(&mut ctx)
+            .unwrap();
+        s.with_iterator(|iter| {
+            for b in iter {
+                assert_eq!(b, Some(strings[row].as_bytes()), "row {row}");
+                row += 1;
+            }
+            Ok::<_, vortex_error::VortexError>(())
+        })
+        .unwrap();
+    }
+    assert_eq!(row, n);
+}
+
+/// Mixed-shape strings: empty, short, very long, with a fair chunk of nulls
+/// — exercising the validity child + edge offsets.
+#[tokio::test]
+async fn nullable_and_extreme_shapes() {
+    let n = 16_000usize;
+    // NOTE(review): `Vec>` is not valid Rust; the element type (and the
+    // turbofish arguments further down) are missing in this copy of the
+    // patch — confirm against the real file.
+    let mut strings: Vec> = Vec::with_capacity(n);
+    // Row shape repeats every 11 rows: null, empty string, 1024 × 'a',
+    // "row-{i}", then seven rows each holding one `corpus` string generated
+    // with the row index as offset.
+    for i in 0..n {
+        match i % 11 {
+            0 => strings.push(None),
+            1 => strings.push(Some(String::new())),
+            2 => strings.push(Some("a".repeat(1024))),
+            3 => strings.push(Some(format!("row-{i}"))),
+            _ => strings.push(Some(corpus(1, i as u64).pop().unwrap())),
+        }
+    }
+    let str_array = VarBinViewArray::from_iter(
+        strings.iter().map(|s| s.as_deref()),
+        DType::Utf8(Nullability::Nullable),
+    )
+    .into_array();
+    // The column is nullable; the enclosing struct itself is not.
+    let data = StructArray::new(
+        FieldNames::from(["s"]),
+        vec![str_array],
+        n,
+        Validity::NonNullable,
+    )
+    .into_array();
+
+    let chunks = write_and_read_back(data).await;
+    // `row` keeps counting across chunks, so values, nulls, order and the
+    // total row count are all checked against `strings`.
+    let mut row = 0;
+    for chunk in chunks {
+        let strct = chunk
+            .try_downcast::()
+            .expect("Struct");
+        let mut ctx = SESSION.create_execution_ctx();
+        let s = strct
+            .unmasked_field(0)
+            .clone()
+            .execute::(&mut ctx)
+            .unwrap();
+        s.with_iterator(|iter| {
+            for b in iter {
+                // `None` stays `None`; `Some(s)` is compared as raw bytes.
+                let want = strings[row].as_deref().map(str::as_bytes);
+                assert_eq!(b, want, "row {row}");
+                row += 1;
+            }
+            Ok::<_, vortex_error::VortexError>(())
+        })
+        .unwrap();
+    }
+    assert_eq!(row, n);
+}