From abae54d7085ce34e4e3e745c2f15196ce08f1eb9 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 26 Apr 2026 19:58:36 +0000 Subject: [PATCH 1/5] migrate(benchmarks-v3): add one-shot v2-to-v3 historical migrator Reads v2's data.json.gz/commits.json/file-sizes from S3, ports v2's getGroup classifier bug-for-bug, and writes a fully populated v3 DuckDB. Includes a verify subcommand that diffs group/chart structure against the live v2 /api/metadata endpoint. Binary and classifier are throwaway: deleted post-cutover. Signed-off-by: Claude --- Cargo.lock | 19 + Cargo.toml | 1 + benchmarks-website/migrate/Cargo.toml | 36 ++ benchmarks-website/migrate/src/classifier.rs | 605 ++++++++++++++++++ benchmarks-website/migrate/src/commits.rs | 100 +++ benchmarks-website/migrate/src/lib.rs | 21 + benchmarks-website/migrate/src/main.rs | 114 ++++ benchmarks-website/migrate/src/migrate.rs | 562 ++++++++++++++++ benchmarks-website/migrate/src/source.rs | 116 ++++ benchmarks-website/migrate/src/v2.rs | 142 ++++ benchmarks-website/migrate/src/verify.rs | 350 ++++++++++ .../migrate/tests/classifier.rs | 291 +++++++++ .../migrate/tests/end_to_end.rs | 111 ++++ 13 files changed, 2468 insertions(+) create mode 100644 benchmarks-website/migrate/Cargo.toml create mode 100644 benchmarks-website/migrate/src/classifier.rs create mode 100644 benchmarks-website/migrate/src/commits.rs create mode 100644 benchmarks-website/migrate/src/lib.rs create mode 100644 benchmarks-website/migrate/src/main.rs create mode 100644 benchmarks-website/migrate/src/migrate.rs create mode 100644 benchmarks-website/migrate/src/source.rs create mode 100644 benchmarks-website/migrate/src/v2.rs create mode 100644 benchmarks-website/migrate/src/verify.rs create mode 100644 benchmarks-website/migrate/tests/classifier.rs create mode 100644 benchmarks-website/migrate/tests/end_to_end.rs diff --git a/Cargo.lock b/Cargo.lock index df22be4de9b..079289cdfa8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10352,6 +10352,25 @@ dependencies = [ "vortex-tensor", ] +[[package]] +name = "vortex-bench-migrate" +version = "0.1.0-alpha.0" +dependencies = [ + "anyhow", + "clap", + "duckdb", + "flate2", + "reqwest 0.13.2", + "rstest", + "serde", + "serde_json", + "tempfile", + "tokio", + "tracing", + "tracing-subscriber", + "vortex-bench-server", +] + [[package]] name = "vortex-bench-server" version = "0.1.0-alpha.0" diff --git a/Cargo.toml b/Cargo.toml index 70a02d78312..8d7a1ee6adb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,6 +62,7 @@ members = [ "benchmarks/vector-search-bench", # Benchmarks website v3 (alpha) - leaf binary, not part of vortex-* API "benchmarks-website/server", + "benchmarks-website/migrate", ] exclude = ["java/testfiles", "wasm-test"] resolver = "2" diff --git a/benchmarks-website/migrate/Cargo.toml b/benchmarks-website/migrate/Cargo.toml new file mode 100644 index 00000000000..464e55d9485 --- /dev/null +++ b/benchmarks-website/migrate/Cargo.toml @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +[package] +name = "vortex-bench-migrate" +version = "0.1.0-alpha.0" +edition = "2024" +rust-version = "1.91.0" +license = "Apache-2.0" +description = "One-shot historical migrator from the v2 benchmarks S3 dataset to a v3 DuckDB file" +publish = false + +[[bin]] +name = "vortex-bench-migrate" +path = "src/main.rs" + +# Throwaway binary, not part of the vortex-* public API surface. +# Errors use anyhow, and the crate is intentionally outside the +# workspace public-api lockfile set. 
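+#
+# A rough usage sketch (the output path and log filter are illustrative,
+# not defaults baked into the binary):
+#
+#   VORTEX_BENCH_LOG=info cargo run -p vortex-bench-migrate -- \
+#       run --output ./benchmarks-v3.duckdb --source public-s3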
+ +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true, features = ["derive"] } +duckdb = { version = "1.4", features = ["bundled"] } +flate2 = "1.1" +reqwest = { workspace = true, features = ["json"] } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } +tracing = { workspace = true, features = ["std"] } +tracing-subscriber = { workspace = true, features = ["env-filter", "fmt"] } +vortex-bench-server = { path = "../server" } + +[dev-dependencies] +rstest = { workspace = true } +tempfile = { workspace = true } diff --git a/benchmarks-website/migrate/src/classifier.rs b/benchmarks-website/migrate/src/classifier.rs new file mode 100644 index 00000000000..f7a1e56c0ae --- /dev/null +++ b/benchmarks-website/migrate/src/classifier.rs @@ -0,0 +1,605 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Bug-for-bug port of v2's `getGroup`, `formatQuery`, and +//! `normalizeChartName` from `benchmarks-website/server.js`, plus the +//! mapping from v2 group + name pattern to a v3 fact-table bin. +//! +//! The v2 classifier was the source of truth for what historical +//! records mean. It groups records by name prefix into one of: +//! "Random Access", "Compression", "Compression Size", or one of the +//! SQL query suites (with optional fan-out by storage and scale +//! factor for TPC-H/TPC-DS). This module reproduces that logic and +//! then hops to a v3 fact-table bin, since v3 stores dim values as +//! columns instead of name fragments. + +use crate::v2::V2Record; +use crate::v2::dataset_scale_factor; + +/// Static port of v2's `QUERY_SUITES`. +pub const QUERY_SUITES: &[QuerySuite] = &[ + QuerySuite { + prefix: "clickbench", + display_name: "Clickbench", + query_prefix: "CLICKBENCH", + dataset_key: None, + fan_out: false, + skip: false, + }, + QuerySuite { + prefix: "statpopgen", + display_name: "Statistical and Population Genetics", + query_prefix: "STATPOPGEN", + dataset_key: None, + fan_out: false, + skip: false, + }, + QuerySuite { + prefix: "polarsignals", + display_name: "PolarSignals Profiling", + query_prefix: "POLARSIGNALS", + dataset_key: None, + fan_out: false, + skip: false, + }, + QuerySuite { + prefix: "tpch", + display_name: "TPC-H", + query_prefix: "TPC-H", + dataset_key: Some("tpch"), + fan_out: true, + skip: false, + }, + QuerySuite { + prefix: "tpcds", + display_name: "TPC-DS", + query_prefix: "TPC-DS", + dataset_key: Some("tpcds"), + fan_out: true, + skip: false, + }, + QuerySuite { + prefix: "fineweb", + display_name: "Fineweb", + query_prefix: "FINEWEB", + dataset_key: None, + fan_out: false, + skip: true, + }, +]; + +/// Static port of v2's `ENGINE_RENAMES`. Applied to the "series" half +/// of a benchmark name (the part after the first `/`) before splitting +/// on `:` into engine/format. Order doesn't matter — keys are unique. 
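+///
+/// For example (mixed case is lower-cased before the first lookup):
+///
+/// ```ignore
+/// assert_eq!(rename_engine("DataFusion:Vortex-File-Compressed"), "datafusion:vortex");
+/// assert_eq!(rename_engine("Lance-Tokio-Local-Disk"), "lance-nvme");
+/// ```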
+const ENGINE_RENAMES: &[(&str, &str)] = &[ + ("datafusion:vortex-file-compressed", "datafusion:vortex"), + ("datafusion:parquet", "datafusion:parquet"), + ("datafusion:arrow", "datafusion:in-memory-arrow"), + ("datafusion:lance", "datafusion:lance"), + ("datafusion:vortex-compact", "datafusion:vortex-compact"), + ("duckdb:vortex-file-compressed", "duckdb:vortex"), + ("duckdb:parquet", "duckdb:parquet"), + ("duckdb:duckdb", "duckdb:duckdb"), + ("duckdb:vortex-compact", "duckdb:vortex-compact"), + ("vortex-tokio-local-disk", "vortex-nvme"), + ("vortex-compact-tokio-local-disk", "vortex-compact-nvme"), + ("lance-tokio-local-disk", "lance-nvme"), + ("parquet-tokio-local-disk", "parquet-nvme"), + ("lance", "lance"), +]; + +/// One entry of `QUERY_SUITES`. +#[derive(Debug, Clone, Copy)] +pub struct QuerySuite { + pub prefix: &'static str, + pub display_name: &'static str, + pub query_prefix: &'static str, + pub dataset_key: Option<&'static str>, + pub fan_out: bool, + pub skip: bool, +} + +/// Group a v2 record falls into. Mirrors `getGroup` in `server.js`, +/// including the fan-out group naming for TPC-H/TPC-DS. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum V2Group { + RandomAccess, + Compression, + CompressionSize, + Query { + suite_index: usize, + /// `Some` for fan-out suites only. + storage: Option, + /// `Some` for fan-out suites only. + scale_factor: Option, + }, +} + +impl V2Group { + /// Display name as v2 served it from `/api/metadata`. + pub fn display_name(&self) -> String { + match self { + V2Group::RandomAccess => "Random Access".into(), + V2Group::Compression => "Compression".into(), + V2Group::CompressionSize => "Compression Size".into(), + V2Group::Query { + suite_index, + storage, + scale_factor, + } => { + let suite = &QUERY_SUITES[*suite_index]; + if let (Some(storage), Some(sf)) = (storage, scale_factor) { + format!("{} ({}) (SF={})", suite.display_name, storage, sf) + } else { + suite.display_name.to_string() + } + } + } + } +} + +/// Apply v2's `ENGINE_RENAMES`. Reproduces the JS `rename`: +/// `RENAMES[s.toLowerCase()] || RENAMES[s] || s`. +pub fn rename_engine(s: &str) -> String { + let lower = s.to_lowercase(); + for (k, v) in ENGINE_RENAMES { + if *k == lower { + return (*v).to_string(); + } + } + for (k, v) in ENGINE_RENAMES { + if *k == s { + return (*v).to_string(); + } + } + s.to_string() +} + +/// Faithful port of v2's `formatQuery`: maps `clickbench_q07` → +/// `"CLICKBENCH Q7"`. Returns the original (uppercased, +/// `-` and `_` replaced with spaces) when no suite matches. +pub fn format_query(q: &str) -> String { + let lower = q.to_lowercase(); + for suite in QUERY_SUITES { + if suite.skip { + continue; + } + let prefix = suite.prefix; + if let Some(rest) = lower.strip_prefix(prefix) + && let Some(idx) = parse_query_index(rest) + { + return format!("{} Q{}", suite.query_prefix, idx); + } + } + let mut out = q.to_uppercase(); + out = out.replace(['_', '-'], " "); + out +} + +/// Parse the `_q07` / ` q7` / `q42` tail used by `format_query`. +/// Returns the integer query index if the tail matches the v2 regex +/// `^[_ ]?q(\d+)`. 
+fn parse_query_index(rest: &str) -> Option { + let after_sep = rest + .strip_prefix('_') + .or_else(|| rest.strip_prefix(' ')) + .unwrap_or(rest); + let after_q = after_sep + .strip_prefix('q') + .or_else(|| after_sep.strip_prefix('Q'))?; + let digits: String = after_q.chars().take_while(|c| c.is_ascii_digit()).collect(); + if digits.is_empty() { + return None; + } + digits.parse().ok() +} + +/// Faithful port of v2's `normalizeChartName`. +pub fn normalize_chart_name(group: &V2Group, chart_name: &str) -> String { + if matches!(group, V2Group::CompressionSize) && chart_name == "VORTEX FILE COMPRESSED SIZE" { + return "VORTEX SIZE".into(); + } + chart_name.to_string() +} + +/// Port of v2's `getGroup`. Returns `None` for skipped suites +/// (e.g. `fineweb`) or names that match nothing. +pub fn get_group(record: &V2Record) -> Option { + let lower = record.name.to_lowercase(); + + if lower.starts_with("random-access/") || lower.starts_with("random access/") { + return Some(V2Group::RandomAccess); + } + + if lower.starts_with("vortex size/") + || lower.starts_with("vortex-file-compressed size/") + || lower.starts_with("parquet size/") + || lower.starts_with("lance size/") + || lower.contains(":raw size/") + || lower.contains(":parquet-zstd size/") + || lower.contains(":lance size/") + { + return Some(V2Group::CompressionSize); + } + + if lower.starts_with("compress time/") + || lower.starts_with("decompress time/") + || lower.starts_with("parquet_rs-zstd compress") + || lower.starts_with("parquet_rs-zstd decompress") + || lower.starts_with("lance compress") + || lower.starts_with("lance decompress") + || lower.starts_with("vortex:lance ratio") + || lower.starts_with("vortex:parquet-zstd ratio") + || lower.starts_with("vortex:raw ratio") + { + return Some(V2Group::Compression); + } + + for (i, suite) in QUERY_SUITES.iter().enumerate() { + let prefix_q = format!("{}_q", suite.prefix); + let prefix_slash = format!("{}/", suite.prefix); + if !lower.starts_with(&prefix_q) && !lower.starts_with(&prefix_slash) { + continue; + } + if suite.skip { + return None; + } + if !suite.fan_out { + return Some(V2Group::Query { + suite_index: i, + storage: None, + scale_factor: None, + }); + } + let storage = match record.storage.as_deref().map(str::to_uppercase).as_deref() { + Some("S3") => "S3", + _ => "NVMe", + }; + let dataset_key = suite.dataset_key.unwrap_or(suite.prefix); + let raw_sf = record + .dataset + .as_ref() + .and_then(|d| dataset_scale_factor(d, dataset_key)); + let sf = raw_sf + .as_deref() + .and_then(|s| s.parse::().ok()) + .map(|f| f.round() as i64) + .unwrap_or(1); + return Some(V2Group::Query { + suite_index: i, + storage: Some(storage.into()), + scale_factor: Some(sf.to_string()), + }); + } + + None +} + +/// Group + chart + series breakdown for a v2 record, using the same +/// rules `server.js` applies in `refresh()`. Equivalent to v2's +/// `(group, chartName, seriesName)` triple after rename / skip rules. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct V2Classification { + pub group: V2Group, + pub chart: String, + pub series: String, +} + +/// Apply the same chart / series naming v2's `refresh()` does, plus +/// the throughput / `PARQUET-UNC` skip rules. 
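+///
+/// A sketch of one fan-out record (field values are illustrative; the
+/// integration tests pin the full set of cases):
+///
+/// ```ignore
+/// // name = "tpch_q01/datafusion:parquet", storage = "S3",
+/// // dataset = { "tpch": { "scale_factor": "100" } }
+/// let cls = classify_v2(&record).unwrap();
+/// assert_eq!(cls.group.display_name(), "TPC-H (S3) (SF=100)");
+/// assert_eq!(cls.chart, "TPC-H Q1");
+/// assert_eq!(cls.series, "datafusion:parquet");
+/// ```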
+pub fn classify_v2(record: &V2Record) -> Option { + if record.name.contains(" throughput") { + return None; + } + let group = get_group(record)?; + let parts: Vec<&str> = record.name.split('/').collect(); + let (chart, series) = match (&group, parts.len()) { + (V2Group::RandomAccess, 4) => { + let chart = format!("{}/{}", parts[1], parts[2]) + .to_uppercase() + .replace(['_', '-'], " "); + let series = rename_engine(if parts[3].is_empty() { + "default" + } else { + parts[3] + }); + (chart, series) + } + (V2Group::RandomAccess, 2) => ( + "RANDOM ACCESS".to_string(), + rename_engine(if parts[1].is_empty() { + "default" + } else { + parts[1] + }), + ), + (V2Group::RandomAccess, _) => return None, + _ => { + let series_raw = if parts.len() >= 2 && !parts[1].is_empty() { + parts[1] + } else { + "default" + }; + let series = rename_engine(series_raw); + let chart = format_query(parts[0]); + (chart, series) + } + }; + let chart = normalize_chart_name(&group, &chart); + if chart.contains("PARQUET-UNC") { + return None; + } + Some(V2Classification { + group, + chart, + series, + }) +} + +/// Mapping target: which v3 fact table a v2 record lands in, plus the +/// dim values that table needs. +#[derive(Debug, Clone, PartialEq)] +pub enum V3Bin { + Query { + dataset: String, + dataset_variant: Option, + scale_factor: Option, + query_idx: i32, + storage: String, + engine: String, + format: String, + }, + CompressionTime { + dataset: String, + dataset_variant: Option, + format: String, + op: String, + }, + CompressionSize { + dataset: String, + dataset_variant: Option, + format: String, + }, + RandomAccess { + dataset: String, + format: String, + }, +} + +/// Top-level entry point. Combines `classify_v2` with the v3 fact-table +/// mapping. Returns `None` for records that: +/// +/// - Don't match any v2 group (uncategorized prefix). +/// - Are explicitly skipped by v2 (throughput, PARQUET-UNC, fineweb). +/// - Are computed-at-read-time ratios that v3 derives from +/// `compression_sizes` (`vortex:parquet-zstd ratio …`, +/// `vortex:lance ratio …`, `vortex:raw ratio …`, +/// `vortex:* size/…`). +pub fn classify(record: &V2Record) -> Option { + let cls = classify_v2(record)?; + match &cls.group { + V2Group::RandomAccess => bin_random_access(&cls, record), + V2Group::Compression => bin_compression_time(&cls, record), + V2Group::CompressionSize => bin_compression_size(&cls, record), + V2Group::Query { .. } => bin_query(&cls, record), + } +} + +fn bin_random_access(cls: &V2Classification, record: &V2Record) -> Option { + // v2 chart name shape: "RANDOM ACCESS" or "DATASET/PATTERN" (uppercase). + // We store it as the v3 dataset value verbatim, lowercased so + // `/api/groups` returns canonical lowercase names. + let dataset = cls.chart.to_lowercase(); + if dataset.is_empty() { + return None; + } + let mut format = cls.series.clone(); + if format.is_empty() { + return None; + } + // v2 emits a "default" placeholder when parts[1] is empty; treat + // that as missing and skip the row instead of inserting "default" + // as a format. + if format == "default" { + return None; + } + // The v2 random-access bench used to emit `parquet`-suffixed names; + // strip an "ns" unit guard later. + let _ = record; // record is unused here; kept for parity with siblings. + // Lower-case the format too so v3 series names are canonical. 
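+    // For example (mirrors the classifier tests): the v2 name
+    // "random-access/taxi/take/parquet-tokio-local-disk" arrives here as
+    // chart "TAXI/TAKE" + series "parquet-nvme" and is stored as
+    // dataset "taxi/take", format "parquet-nvme".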
+ format = format.to_lowercase(); + Some(V3Bin::RandomAccess { dataset, format }) +} + +fn bin_compression_time(cls: &V2Classification, _record: &V2Record) -> Option { + // v2 compression chart names look like (after format_query): + // "COMPRESS TIME" [vortex/encode] + // "DECOMPRESS TIME" [vortex/decode] + // "PARQUET RS ZSTD COMPRESS TIME" [parquet/encode] + // "PARQUET RS ZSTD DECOMPRESS TIME" [parquet/decode] + // "LANCE COMPRESS TIME" [lance/encode] + // "LANCE DECOMPRESS TIME" [lance/decode] + // "VORTEX:LANCE RATIO COMPRESS TIME" [drop] + // "VORTEX:PARQUET-ZSTD RATIO COMPRESS TIME" [drop] + // "VORTEX:RAW RATIO COMPRESS TIME" [drop] + let lc = cls.chart.to_lowercase(); + if lc.contains("ratio") || lc.contains(':') { + // Ratios are computed at read time from compression_sizes. + return None; + } + let (format, op) = if lc.starts_with("compress time") { + ("vortex-file-compressed", "encode") + } else if lc.starts_with("decompress time") { + ("vortex-file-compressed", "decode") + } else if lc.starts_with("parquet rs zstd compress time") { + ("parquet", "encode") + } else if lc.starts_with("parquet rs zstd decompress time") { + ("parquet", "decode") + } else if lc.starts_with("lance compress time") { + ("lance", "encode") + } else if lc.starts_with("lance decompress time") { + ("lance", "decode") + } else { + return None; + }; + let dataset = cls.series.to_lowercase(); + if dataset.is_empty() || dataset == "default" { + return None; + } + Some(V3Bin::CompressionTime { + dataset, + dataset_variant: None, + format: format.to_string(), + op: op.to_string(), + }) +} + +fn bin_compression_size(cls: &V2Classification, _record: &V2Record) -> Option { + let lc = cls.chart.to_lowercase(); + // Ratios like "VORTEX:PARQUET ZSTD SIZE" / "VORTEX:LANCE SIZE" / + // "VORTEX:RAW SIZE" are derived from compression_sizes at read + // time, not stored. + if lc.contains(':') { + return None; + } + let format = if lc.starts_with("vortex size") { + "vortex-file-compressed" + } else if lc.starts_with("parquet size") { + "parquet" + } else if lc.starts_with("lance size") { + "lance" + } else { + return None; + }; + let dataset = cls.series.to_lowercase(); + if dataset.is_empty() || dataset == "default" { + return None; + } + Some(V3Bin::CompressionSize { + dataset, + dataset_variant: None, + format: format.to_string(), + }) +} + +fn bin_query(cls: &V2Classification, record: &V2Record) -> Option { + let V2Group::Query { + suite_index, + storage, + scale_factor, + } = &cls.group + else { + return None; + }; + let suite = &QUERY_SUITES[*suite_index]; + + // Pull the query index from the *raw* name's first part instead of + // the formatted chart, so we don't have to round-trip "Q07". + let raw_first = record.name.split('/').next().unwrap_or(""); + let query_idx = parse_query_index_from_first(raw_first)?; + + // Series for non-RA records is "engine:format" after rename. + let (engine, format) = split_engine_format(&cls.series)?; + + let storage_v3 = match storage.as_deref() { + Some("S3") => "s3".to_string(), + Some("NVMe") => "nvme".to_string(), + _ => "nvme".to_string(), + }; + + // ClickBench's "flavor" lives in dataset_variant per benchmark-mapping.md + // - we don't have it from a v2 name string, so we leave it None. 
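+    // Illustrative mapping (mirrors the classifier tests):
+    // "clickbench_q07/datafusion:parquet" with no storage field becomes
+    // dataset="clickbench", query_idx=7, storage="nvme",
+    // engine="datafusion", format="parquet", scale_factor=None.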
+ Some(V3Bin::Query { + dataset: suite.prefix.to_string(), + dataset_variant: None, + scale_factor: scale_factor.clone(), + query_idx, + storage: storage_v3, + engine, + format, + }) +} + +/// Pull the integer query index out of the leading name part, which is +/// always `_q` or ` q` for SQL query records. +fn parse_query_index_from_first(first: &str) -> Option { + let lower = first.to_lowercase(); + for suite in QUERY_SUITES { + if let Some(rest) = lower.strip_prefix(suite.prefix) + && let Some(idx) = parse_query_index(rest) + { + return Some(idx as i32); + } + } + None +} + +/// Split a renamed series like `datafusion:parquet` into +/// `(engine, format)`. Returns `None` for series with no `:` since +/// v3 requires both columns. +fn split_engine_format(series: &str) -> Option<(String, String)> { + let mut split = series.splitn(2, ':'); + let engine = split.next()?.trim().to_string(); + let format = split.next()?.trim().to_string(); + if engine.is_empty() || format.is_empty() { + return None; + } + Some((engine, format)) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn record(name: &str) -> V2Record { + V2Record { + name: name.to_string(), + commit_id: Some("deadbeef".into()), + unit: None, + value: None, + storage: None, + dataset: None, + all_runtimes: None, + env_triple: None, + } + } + + #[test] + fn format_query_round_trips() { + assert_eq!(format_query("clickbench_q07"), "CLICKBENCH Q7"); + assert_eq!(format_query("tpch_q01"), "TPC-H Q1"); + assert_eq!(format_query("tpcds_q42"), "TPC-DS Q42"); + assert_eq!(format_query("statpopgen_q3"), "STATPOPGEN Q3"); + assert_eq!(format_query("foo bar"), "FOO BAR"); + } + + #[test] + fn rename_engine_canonicalizes_disk_names() { + assert_eq!(rename_engine("vortex-tokio-local-disk"), "vortex-nvme"); + assert_eq!( + rename_engine("datafusion:vortex-file-compressed"), + "datafusion:vortex" + ); + assert_eq!(rename_engine("unknown-engine"), "unknown-engine"); + } + + #[test] + fn parse_query_index_handles_separators() { + assert_eq!(parse_query_index("_q07"), Some(7)); + assert_eq!(parse_query_index(" q7"), Some(7)); + assert_eq!(parse_query_index("q42"), Some(42)); + assert_eq!(parse_query_index("xq7"), None); + } + + #[test] + fn random_access_bins_dataset_pattern() { + let bin = classify(&record("random-access/taxi/take/parquet")).unwrap(); + assert_eq!( + bin, + V3Bin::RandomAccess { + dataset: "taxi/take".into(), + format: "parquet".into(), + } + ); + } +} diff --git a/benchmarks-website/migrate/src/commits.rs b/benchmarks-website/migrate/src/commits.rs new file mode 100644 index 00000000000..28d63a5bd19 --- /dev/null +++ b/benchmarks-website/migrate/src/commits.rs @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Commit upserts. Adapts a [`crate::v2::V2Commit`] into the v3 +//! `commits` row shape (a [`vortex_bench_server::records::CommitInfo`]). + +use anyhow::Context as _; +use anyhow::Result; +use duckdb::Transaction; +use duckdb::params; + +use crate::v2::V2Commit; + +/// Insert a v3 `commits` row for one v2 commit. Missing fields are +/// filled with the empty string, matching the v3 schema's `NOT NULL` +/// constraints; the call site logs a warning for each fallback so +/// the operator can spot bad inputs. 
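+///
+/// Usage sketch (this mirrors the caller in `migrate.rs`):
+///
+/// ```ignore
+/// let outcome = upsert_commit(&tx, &commit)?;
+/// for w in outcome.warnings {
+///     tracing::warn!("{w}");
+/// }
+/// ```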
+pub fn upsert_commit(tx: &Transaction<'_>, commit: &V2Commit) -> Result { + let mut warnings = Vec::new(); + let timestamp = require_field(&commit.timestamp, "timestamp", &commit.id, &mut warnings); + let message = require_field(&commit.message, "message", &commit.id, &mut warnings); + let author_name = require_field( + &commit.author.as_ref().and_then(|p| p.name.clone()), + "author.name", + &commit.id, + &mut warnings, + ); + let author_email = require_field( + &commit.author.as_ref().and_then(|p| p.email.clone()), + "author.email", + &commit.id, + &mut warnings, + ); + let committer_name = require_field( + &commit.committer.as_ref().and_then(|p| p.name.clone()), + "committer.name", + &commit.id, + &mut warnings, + ); + let committer_email = require_field( + &commit.committer.as_ref().and_then(|p| p.email.clone()), + "committer.email", + &commit.id, + &mut warnings, + ); + let tree_sha = require_field(&commit.tree_id, "tree_id", &commit.id, &mut warnings); + let url = require_field(&commit.url, "url", &commit.id, &mut warnings); + + tx.execute( + r#" + INSERT INTO commits ( + commit_sha, timestamp, message, author_name, author_email, + committer_name, committer_email, tree_sha, url + ) VALUES (?, CAST(? AS TIMESTAMPTZ), ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT (commit_sha) DO UPDATE SET + timestamp = excluded.timestamp, + message = excluded.message, + author_name = excluded.author_name, + author_email = excluded.author_email, + committer_name = excluded.committer_name, + committer_email = excluded.committer_email, + tree_sha = excluded.tree_sha, + url = excluded.url + "#, + params![ + commit.id, + timestamp, + message, + author_name, + author_email, + committer_name, + committer_email, + tree_sha, + url, + ], + ) + .with_context(|| format!("upserting commit {}", commit.id))?; + Ok(UpsertOutcome { warnings }) +} + +fn require_field( + field: &Option, + name: &str, + sha: &str, + warnings: &mut Vec, +) -> String { + match field { + Some(s) => s.clone(), + None => { + warnings.push(format!("commit {sha} missing {name}")); + String::new() + } + } +} + +/// Per-call warning bag returned to the caller for logging. +#[derive(Debug, Default)] +pub struct UpsertOutcome { + pub warnings: Vec, +} diff --git a/benchmarks-website/migrate/src/lib.rs b/benchmarks-website/migrate/src/lib.rs new file mode 100644 index 00000000000..5e8d9c64907 --- /dev/null +++ b/benchmarks-website/migrate/src/lib.rs @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! One-shot historical migrator from v2's S3-hosted benchmark dataset +//! to a v3 DuckDB file. +//! +//! The v2 dataset is JSONL of bare benchmark records keyed by name string. +//! v3 uses five typed fact tables with explicit dim columns. This crate +//! ports v2's `getGroup` classifier (in `benchmarks-website/server.js`) +//! bug-for-bug so that historical rows survive the migration with the +//! same group / chart / series structure as the live v2 server. +//! +//! The migrator is throwaway: once v3 cuts over, both the binary and +//! the classifier go away. + +pub mod classifier; +pub mod commits; +pub mod migrate; +pub mod source; +pub mod v2; +pub mod verify; diff --git a/benchmarks-website/migrate/src/main.rs b/benchmarks-website/migrate/src/main.rs new file mode 100644 index 00000000000..366834ed441 --- /dev/null +++ b/benchmarks-website/migrate/src/main.rs @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! 
`vortex-bench-migrate` CLI: a one-shot historical migrator from +//! v2's S3 dataset into a v3 DuckDB file, plus a structural diff +//! against the live v2 `/api/metadata` endpoint for spotting +//! classifier regressions. + +use std::path::PathBuf; +use std::process::ExitCode; + +use anyhow::Context as _; +use anyhow::Result; +use clap::Parser; +use clap::Subcommand; +use clap::ValueEnum; +use tracing_subscriber::EnvFilter; +use vortex_bench_migrate::migrate; +use vortex_bench_migrate::source::Source; +use vortex_bench_migrate::verify; + +/// One-shot historical migrator from v2's S3 dataset to v3 DuckDB. +#[derive(Debug, Parser)] +#[command(name = "vortex-bench-migrate", version, about)] +struct Cli { + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Subcommand)] +enum Command { + /// Read v2's data.json.gz / commits.json / file-sizes-*.json.gz + /// and write a fully populated v3 DuckDB at `--output`. + Run { + /// Path to write the v3 DuckDB to. Created if absent. + #[arg(long)] + output: PathBuf, + /// Where to fetch v2 dumps from. + #[arg(long, value_enum, default_value_t = SourceKind::PublicS3)] + source: SourceKind, + /// For `--source=local`, the directory containing + /// `data.json.gz`, `commits.json`, and `file-sizes-*.json.gz`. + #[arg(long, required_if_eq("source", "local"))] + source_dir: Option, + }, + /// Diff a migrated DuckDB against the live v2 `/api/metadata` + /// endpoint. Exits 0 if every v2 group is present in v3, 1 + /// otherwise so this can gate a CI step. + Verify { + /// HTTPS root of a running v2 server (e.g. `https://bench.vortex.dev`). + #[arg(long)] + against: String, + /// Path to the migrated v3 DuckDB. + #[arg(long)] + duckdb: PathBuf, + }, +} + +#[derive(Debug, Clone, Copy, ValueEnum)] +enum SourceKind { + PublicS3, + Local, +} + +fn main() -> ExitCode { + if let Err(err) = run() { + eprintln!("error: {err:#}"); + return ExitCode::from(2); + } + ExitCode::SUCCESS +} + +fn run() -> Result<()> { + tracing_subscriber::fmt() + .with_env_filter( + EnvFilter::try_from_env("VORTEX_BENCH_LOG").unwrap_or_else(|_| EnvFilter::new("info")), + ) + .init(); + + let cli = Cli::parse(); + match cli.command { + Command::Run { + output, + source, + source_dir, + } => { + let source = match source { + SourceKind::PublicS3 => Source::PublicS3, + SourceKind::Local => { + Source::Local(source_dir.context("--source=local requires --source-dir")?) + } + }; + let summary = migrate::run(&source, &output)?; + print!("{summary}"); + if summary.uncategorized_fraction() > 0.05 { + anyhow::bail!( + "uncategorized records ({:.2}%) exceed the 5% gate; \ + stop and report unmatched prefixes (see summary above) \ + before proceeding", + 100.0 * summary.uncategorized_fraction() + ); + } + Ok(()) + } + Command::Verify { against, duckdb } => { + let report = verify::run(&against, &duckdb)?; + print!("{report}"); + if !report.v2_groups_covered() { + std::process::exit(1); + } + Ok(()) + } + } +} diff --git a/benchmarks-website/migrate/src/migrate.rs b/benchmarks-website/migrate/src/migrate.rs new file mode 100644 index 00000000000..f75e0169fda --- /dev/null +++ b/benchmarks-website/migrate/src/migrate.rs @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! End-to-end migration of one v2 dataset into a v3 DuckDB file. +//! +//! Streams `data.json.gz` line-by-line, runs each record through the +//! [classifier][crate::classifier], and writes one row per record into +//! the appropriate v3 fact table. 
Every row's `measurement_id` is +//! computed via the server's `measurement_id_*` functions so the result +//! is byte-compatible with what fresh `/api/ingest` would have produced. + +use std::collections::BTreeMap; +use std::io::BufRead; +use std::path::Path; + +use anyhow::Context as _; +use anyhow::Result; +use duckdb::Connection; +use duckdb::Transaction; +use duckdb::params; +use tracing::warn; +use vortex_bench_server::db::measurement_id_compression_size; +use vortex_bench_server::db::measurement_id_compression_time; +use vortex_bench_server::db::measurement_id_query; +use vortex_bench_server::db::measurement_id_random_access; +use vortex_bench_server::records::CompressionSize; +use vortex_bench_server::records::CompressionTime; +use vortex_bench_server::records::QueryMeasurement; +use vortex_bench_server::records::RandomAccessTime; +use vortex_bench_server::schema::SCHEMA_DDL; + +use crate::classifier::V3Bin; +use crate::classifier::classify; +use crate::commits::upsert_commit; +use crate::source::Source; +use crate::v2::V2Commit; +use crate::v2::V2FileSize; +use crate::v2::V2Record; +use crate::v2::index_commits; +use crate::v2::runtime_as_i64; +use crate::v2::value_as_f64; + +/// Per-table insert counts, plus skip / missing counts. +#[derive(Debug, Default, Clone)] +pub struct MigrationSummary { + pub records_read: u64, + pub query_inserted: u64, + pub compression_time_inserted: u64, + pub compression_size_inserted: u64, + pub random_access_inserted: u64, + pub file_size_inserted: u64, + pub uncategorized: u64, + pub uncategorized_prefixes: BTreeMap, + pub missing_commit: u64, + pub commit_warnings: u64, + pub skipped_no_value: u64, + pub commits_inserted: u64, +} + +impl MigrationSummary { + /// Total `data.json.gz` records that landed in some v3 fact table. + pub fn total_inserted(&self) -> u64 { + self.query_inserted + + self.compression_time_inserted + + self.compression_size_inserted + + self.random_access_inserted + } + + /// Fraction of records that were uncategorized. The orchestrator + /// stops if this exceeds the documented 5% threshold. + pub fn uncategorized_fraction(&self) -> f64 { + if self.records_read == 0 { + return 0.0; + } + self.uncategorized as f64 / self.records_read as f64 + } +} + +/// Open or create a DuckDB at `path` and apply the v3 schema. +pub fn open_target_db(path: &Path) -> Result { + let conn = + Connection::open(path).with_context(|| format!("opening DuckDB at {}", path.display()))?; + conn.execute_batch(SCHEMA_DDL) + .context("applying v3 schema DDL")?; + Ok(conn) +} + +/// Run the whole migration: commits, data.json.gz, and every +/// file-sizes-*.json.gz under the source. +pub fn run(source: &Source, target: &Path) -> Result { + let mut conn = open_target_db(target)?; + let mut summary = MigrationSummary::default(); + + let commits = read_commits(source)?; + summary.commits_inserted = upsert_all_commits(&mut conn, &commits, &mut summary)?; + + migrate_data_jsonl(&mut conn, source, &commits, &mut summary)?; + + for name in source.list_file_sizes()? 
{ + if let Err(e) = migrate_file_sizes(&mut conn, source, &name, &commits, &mut summary) { + warn!("file-sizes file {name} failed: {e:#}"); + } + } + + Ok(summary) +} + +fn read_commits(source: &Source) -> Result> { + let reader = source.open_commits_jsonl()?; + let mut commits: Vec = Vec::new(); + for line in reader.lines() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + match serde_json::from_str::(trimmed) { + Ok(c) => commits.push(c), + Err(e) => warn!("skipping malformed commits.json line: {e}"), + } + } + Ok(index_commits(commits)) +} + +fn upsert_all_commits( + conn: &mut Connection, + commits: &BTreeMap, + summary: &mut MigrationSummary, +) -> Result { + let tx = conn.transaction().context("begin commits transaction")?; + let mut count = 0u64; + for commit in commits.values() { + let outcome = upsert_commit(&tx, commit)?; + for w in outcome.warnings { + warn!("{w}"); + summary.commit_warnings += 1; + } + count += 1; + } + tx.commit().context("commit commits transaction")?; + Ok(count) +} + +fn migrate_data_jsonl( + conn: &mut Connection, + source: &Source, + commits: &BTreeMap, + summary: &mut MigrationSummary, +) -> Result<()> { + let reader = source.open_data_jsonl()?; + let mut tx = conn.transaction().context("begin data tx")?; + const BATCH: u64 = 10_000; + let mut in_batch = 0u64; + for line in reader.lines() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + summary.records_read += 1; + let record: V2Record = match serde_json::from_str(trimmed) { + Ok(r) => r, + Err(e) => { + warn!("skipping malformed data.json line: {e}"); + continue; + } + }; + apply_v2_record(&tx, &record, commits, summary)?; + in_batch += 1; + if in_batch >= BATCH { + tx.commit().context("commit data batch")?; + tx = conn.transaction().context("begin data tx")?; + in_batch = 0; + } + } + tx.commit().context("commit final data batch")?; + Ok(()) +} + +fn apply_v2_record( + tx: &Transaction<'_>, + record: &V2Record, + commits: &BTreeMap, + summary: &mut MigrationSummary, +) -> Result<()> { + let Some(sha) = record.commit_id.clone() else { + summary.missing_commit += 1; + return Ok(()); + }; + if !commits.contains_key(&sha) { + summary.missing_commit += 1; + return Ok(()); + } + + let Some(bin) = classify(record) else { + summary.uncategorized += 1; + let prefix = record.name.split('/').next().unwrap_or("").to_string(); + *summary.uncategorized_prefixes.entry(prefix).or_insert(0) += 1; + return Ok(()); + }; + + let env_triple = record.env_triple.as_ref().and_then(|t| t.to_triple()); + let runtimes = record + .all_runtimes + .as_ref() + .map(|v| v.iter().filter_map(runtime_as_i64).collect::>()) + .unwrap_or_default(); + let value_f64 = match record.value.as_ref().and_then(value_as_f64) { + Some(v) => v, + None => { + summary.skipped_no_value += 1; + return Ok(()); + } + }; + + match bin { + V3Bin::Query { + dataset, + dataset_variant, + scale_factor, + query_idx, + storage, + engine, + format, + } => { + let qm = QueryMeasurement { + commit_sha: sha, + dataset, + dataset_variant, + scale_factor, + query_idx, + storage, + engine, + format, + value_ns: value_f64 as i64, + all_runtimes_ns: runtimes, + peak_physical: None, + peak_virtual: None, + physical_delta: None, + virtual_delta: None, + env_triple, + }; + insert_query(tx, &qm)?; + summary.query_inserted += 1; + } + V3Bin::CompressionTime { + dataset, + dataset_variant, + format, + op, + } => { + let ct = CompressionTime { + commit_sha: sha, + dataset, + 
dataset_variant, + format, + op, + value_ns: value_f64 as i64, + all_runtimes_ns: runtimes, + env_triple, + }; + insert_compression_time(tx, &ct)?; + summary.compression_time_inserted += 1; + } + V3Bin::CompressionSize { + dataset, + dataset_variant, + format, + } => { + let cs = CompressionSize { + commit_sha: sha, + dataset, + dataset_variant, + format, + value_bytes: value_f64 as i64, + }; + insert_compression_size(tx, &cs)?; + summary.compression_size_inserted += 1; + } + V3Bin::RandomAccess { dataset, format } => { + let ra = RandomAccessTime { + commit_sha: sha, + dataset, + format, + value_ns: value_f64 as i64, + all_runtimes_ns: runtimes, + env_triple, + }; + insert_random_access(tx, &ra)?; + summary.random_access_inserted += 1; + } + } + Ok(()) +} + +fn insert_query(tx: &Transaction<'_>, r: &QueryMeasurement) -> Result<()> { + let mid = measurement_id_query(r); + tx.execute( + r#" + INSERT INTO query_measurements ( + measurement_id, commit_sha, dataset, dataset_variant, scale_factor, + query_idx, storage, engine, format, + value_ns, all_runtimes_ns, + peak_physical, peak_virtual, physical_delta, virtual_delta, + env_triple + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?, ?, ?, ?, ?) + ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_ns = excluded.value_ns, + all_runtimes_ns = excluded.all_runtimes_ns, + env_triple = excluded.env_triple + "#, + params![ + mid, + r.commit_sha, + r.dataset, + r.dataset_variant, + r.scale_factor, + r.query_idx, + r.storage, + r.engine, + r.format, + r.value_ns, + runtimes_literal(&r.all_runtimes_ns), + r.peak_physical, + r.peak_virtual, + r.physical_delta, + r.virtual_delta, + r.env_triple, + ], + )?; + Ok(()) +} + +fn insert_compression_time(tx: &Transaction<'_>, r: &CompressionTime) -> Result<()> { + let mid = measurement_id_compression_time(r); + tx.execute( + r#" + INSERT INTO compression_times ( + measurement_id, commit_sha, dataset, dataset_variant, + format, op, value_ns, all_runtimes_ns, env_triple + ) VALUES (?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) + ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_ns = excluded.value_ns, + all_runtimes_ns = excluded.all_runtimes_ns, + env_triple = excluded.env_triple + "#, + params![ + mid, + r.commit_sha, + r.dataset, + r.dataset_variant, + r.format, + r.op, + r.value_ns, + runtimes_literal(&r.all_runtimes_ns), + r.env_triple, + ], + )?; + Ok(()) +} + +fn insert_compression_size(tx: &Transaction<'_>, r: &CompressionSize) -> Result<()> { + let mid = measurement_id_compression_size(r); + tx.execute( + r#" + INSERT INTO compression_sizes ( + measurement_id, commit_sha, dataset, dataset_variant, + format, value_bytes + ) VALUES (?, ?, ?, ?, ?, ?) + ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_bytes = excluded.value_bytes + "#, + params![ + mid, + r.commit_sha, + r.dataset, + r.dataset_variant, + r.format, + r.value_bytes, + ], + )?; + Ok(()) +} + +fn insert_random_access(tx: &Transaction<'_>, r: &RandomAccessTime) -> Result<()> { + let mid = measurement_id_random_access(r); + tx.execute( + r#" + INSERT INTO random_access_times ( + measurement_id, commit_sha, dataset, format, + value_ns, all_runtimes_ns, env_triple + ) VALUES (?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) 
+ ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_ns = excluded.value_ns, + all_runtimes_ns = excluded.all_runtimes_ns, + env_triple = excluded.env_triple + "#, + params![ + mid, + r.commit_sha, + r.dataset, + r.format, + r.value_ns, + runtimes_literal(&r.all_runtimes_ns), + r.env_triple, + ], + )?; + Ok(()) +} + +fn runtimes_literal(values: &[i64]) -> String { + let mut s = String::with_capacity(values.len() * 8 + 2); + s.push('['); + for (i, v) in values.iter().enumerate() { + if i > 0 { + s.push(','); + } + s.push_str(&v.to_string()); + } + s.push(']'); + s +} + +fn migrate_file_sizes( + conn: &mut Connection, + source: &Source, + name: &str, + commits: &BTreeMap, + summary: &mut MigrationSummary, +) -> Result<()> { + let reader = source.open_file_sizes(name)?; + let dataset = name + .strip_prefix("file-sizes-") + .and_then(|s| s.strip_suffix(".json.gz")) + .unwrap_or(name) + .to_string(); + let mut tx = conn.transaction().context("begin file-sizes tx")?; + const BATCH: u64 = 10_000; + let mut in_batch = 0u64; + for line in reader.lines() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + let sz: V2FileSize = match serde_json::from_str(trimmed) { + Ok(r) => r, + Err(e) => { + warn!("skipping malformed {name} line: {e}"); + continue; + } + }; + if !commits.contains_key(&sz.commit_id) { + summary.missing_commit += 1; + continue; + } + // file-sizes-*.json.gz captures per-file sizes inside one + // benchmark/format/scale_factor combo. We aggregate to one + // (commit, dataset, dataset_variant, format) row by summing, + // since v3's compression_sizes is a single bytes value per + // (dim) tuple. Use ON CONFLICT to accumulate. + upsert_file_size_row(&tx, &sz, &dataset)?; + summary.file_size_inserted += 1; + in_batch += 1; + if in_batch >= BATCH { + tx.commit().context("commit file-sizes batch")?; + tx = conn.transaction().context("begin file-sizes tx")?; + in_batch = 0; + } + } + tx.commit().context("commit final file-sizes batch")?; + Ok(()) +} + +fn upsert_file_size_row( + tx: &Transaction<'_>, + sz: &V2FileSize, + dataset_fallback: &str, +) -> Result<()> { + let dataset = if sz.benchmark.is_empty() { + dataset_fallback.to_string() + } else { + sz.benchmark.clone() + }; + let dataset_variant = sz + .scale_factor + .as_ref() + .filter(|s| !s.is_empty() && s.as_str() != "1.0") + .cloned(); + let cs = CompressionSize { + commit_sha: sz.commit_id.clone(), + dataset, + dataset_variant, + format: sz.format.clone(), + value_bytes: sz.size_bytes, + }; + let mid = measurement_id_compression_size(&cs); + // Multiple files within the same dataset/format/scale_factor sum + // into one row by adding to whatever is already there. + tx.execute( + r#" + INSERT INTO compression_sizes ( + measurement_id, commit_sha, dataset, dataset_variant, + format, value_bytes + ) VALUES (?, ?, ?, ?, ?, ?) + ON CONFLICT (measurement_id) DO UPDATE SET + value_bytes = compression_sizes.value_bytes + excluded.value_bytes + "#, + params![ + mid, + cs.commit_sha, + cs.dataset, + cs.dataset_variant, + cs.format, + cs.value_bytes, + ], + )?; + Ok(()) +} + +/// Print the summary in a human-readable form. Returned by the CLI. 
+impl std::fmt::Display for MigrationSummary { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Records read: {}", self.records_read)?; + writeln!(f, "Commits upserted: {}", self.commits_inserted)?; + writeln!(f, "Commit warnings: {}", self.commit_warnings)?; + writeln!(f, "Inserted (query): {}", self.query_inserted)?; + writeln!( + f, + "Inserted (compress t): {}", + self.compression_time_inserted + )?; + writeln!( + f, + "Inserted (compress s): {}", + self.compression_size_inserted + )?; + writeln!(f, "Inserted (random acc): {}", self.random_access_inserted)?; + writeln!(f, "Inserted (file sizes): {}", self.file_size_inserted)?; + writeln!(f, "Missing commit: {}", self.missing_commit)?; + writeln!(f, "Skipped (no value): {}", self.skipped_no_value)?; + writeln!( + f, + "Uncategorized: {} ({:.2}%)", + self.uncategorized, + 100.0 * self.uncategorized_fraction() + )?; + if !self.uncategorized_prefixes.is_empty() { + let mut top: Vec<_> = self.uncategorized_prefixes.iter().collect(); + top.sort_by(|a, b| b.1.cmp(a.1)); + writeln!(f, "Top uncategorized prefixes:")?; + for (prefix, n) in top.iter().take(20) { + writeln!(f, " {prefix:>32} : {n}")?; + } + } + Ok(()) + } +} diff --git a/benchmarks-website/migrate/src/source.rs b/benchmarks-website/migrate/src/source.rs new file mode 100644 index 00000000000..2b4fdca9b94 --- /dev/null +++ b/benchmarks-website/migrate/src/source.rs @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Streaming readers for v2's public S3 bucket. +//! +//! The bucket is `--no-sign-request`, so we fetch the underlying +//! HTTPS URL directly and stream-decompress with `flate2`. The +//! downloads are wrapped in [`reqwest::blocking`] to keep the read +//! path synchronous; the binary's hot path is single-threaded +//! per-source already (DuckDB is a single-writer). +//! +//! For tests and offline runs, [`Source::Local`] accepts a local +//! directory of dumps; the migrator's `--source` flag picks the +//! variant. + +use std::fs::File; +use std::io::BufRead; +use std::io::BufReader; +use std::io::Read; +use std::path::Path; +use std::path::PathBuf; + +use anyhow::Context as _; +use anyhow::Result; +use flate2::read::GzDecoder; + +/// Public S3 bucket the live v2 server reads from. +pub const PUBLIC_BUCKET_BASE: &str = "https://vortex-ci-benchmark-results.s3.amazonaws.com"; + +/// Where to read the v2 dataset from. Either the public S3 bucket +/// (the live deployment) or a local directory of dumps. +#[derive(Debug, Clone)] +pub enum Source { + /// HTTPS GETs against `s3.amazonaws.com`. + PublicS3, + /// A directory containing `data.json.gz`, `commits.json`, and + /// `file-sizes-*.json.gz` files. + Local(PathBuf), +} + +impl Source { + /// Open `data.json.gz` for streaming, decompressing on the fly. + pub fn open_data_jsonl(&self) -> Result> { + let stream = self.open_raw("data.json.gz")?; + Ok(Box::new(BufReader::new(GzDecoder::new(stream)))) + } + + /// Open `commits.json` (uncompressed). + pub fn open_commits_jsonl(&self) -> Result> { + let stream = self.open_raw("commits.json")?; + Ok(Box::new(BufReader::new(stream))) + } + + /// Enumerate `file-sizes-*.json.gz` files. For local sources this + /// is a directory glob; for the public bucket we hit the documented + /// suite ids. + pub fn list_file_sizes(&self) -> Result> { + match self { + Source::Local(dir) => { + let mut out = Vec::new(); + for entry in std::fs::read_dir(dir)? 
{ + let entry = entry?; + let name = entry.file_name(); + let s = name.to_string_lossy(); + if s.starts_with("file-sizes-") && s.ends_with(".json.gz") { + out.push(s.into_owned()); + } + } + out.sort(); + Ok(out) + } + Source::PublicS3 => { + // The S3 bucket's ListObjects is denied for unsigned + // requests, so we hit the documented per-suite keys + // emitted by `.github/workflows/sql-benchmarks.yml`. + Ok(KNOWN_FILE_SIZES_SUITES + .iter() + .map(|id| format!("file-sizes-{id}.json.gz")) + .collect()) + } + } + } + + /// Open one `file-sizes-*.json.gz` for streaming. + pub fn open_file_sizes(&self, name: &str) -> Result> { + let stream = self.open_raw(name)?; + Ok(Box::new(BufReader::new(GzDecoder::new(stream)))) + } + + fn open_raw(&self, name: &str) -> Result> { + match self { + Source::Local(dir) => open_local(&dir.join(name)), + Source::PublicS3 => open_s3(name), + } + } +} + +fn open_local(path: &Path) -> Result> { + let f = File::open(path).with_context(|| format!("opening {}", path.display()))?; + Ok(Box::new(f)) +} + +fn open_s3(name: &str) -> Result> { + let url = format!("{PUBLIC_BUCKET_BASE}/{name}"); + let resp = reqwest::blocking::get(&url).with_context(|| format!("GET {url}"))?; + if !resp.status().is_success() { + anyhow::bail!("GET {url} returned {}", resp.status()); + } + Ok(Box::new(resp)) +} + +/// Suite IDs we know publish a `file-sizes-{id}.json.gz` to S3. +/// Matches the `matrix.id` values in `.github/workflows/sql-benchmarks.yml` +/// at the time of writing. New suites mean a new entry here. +const KNOWN_FILE_SIZES_SUITES: &[&str] = + &["clickbench", "tpch", "tpcds", "statpopgen", "polarsignals"]; diff --git a/benchmarks-website/migrate/src/v2.rs b/benchmarks-website/migrate/src/v2.rs new file mode 100644 index 00000000000..2a9d3bdf5d0 --- /dev/null +++ b/benchmarks-website/migrate/src/v2.rs @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Wire shapes of the v2 benchmark dataset on S3. +//! +//! These types capture only the fields the migrator reads. v2 records +//! are serialized by `vortex-bench` (see `vortex-bench/src/measurements.rs`) +//! and by older non-Rust scripts; the union of fields is loose, so we +//! deserialize permissively (`serde(default)`, untyped `serde_json::Value` +//! for the polymorphic `dataset` field). + +use std::collections::BTreeMap; + +use serde::Deserialize; + +/// One JSONL line of `data.json.gz`. +/// +/// The shape is the union of every emitter's output. Most fields are +/// optional because different benches emit different subsets. +#[derive(Debug, Clone, Deserialize)] +pub struct V2Record { + pub name: String, + #[serde(default)] + pub commit_id: Option, + #[serde(default)] + pub unit: Option, + #[serde(default)] + pub value: Option, + #[serde(default)] + pub storage: Option, + #[serde(default)] + pub dataset: Option, + #[serde(default)] + pub all_runtimes: Option>, + #[serde(default)] + pub env_triple: Option, +} + +/// `dataset` in v2 records is sometimes a string, sometimes an object +/// keyed by suite name (`{ "tpch": { "scale_factor": "10" } }`). +/// This helper looks up the scale factor for a given suite without +/// assuming a particular shape. 
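+///
+/// A sketch of the object form described above:
+///
+/// ```ignore
+/// let d = serde_json::json!({ "tpch": { "scale_factor": "10" } });
+/// assert_eq!(dataset_scale_factor(&d, "tpch"), Some("10".to_string()));
+/// assert_eq!(dataset_scale_factor(&d, "tpcds"), None);
+/// ```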
+pub fn dataset_scale_factor(dataset: &serde_json::Value, key: &str) -> Option { + let obj = dataset.as_object()?; + let entry = obj.get(key)?; + let sf = entry.get("scale_factor")?; + match sf { + serde_json::Value::String(s) => Some(s.clone()), + serde_json::Value::Number(n) => Some(n.to_string()), + _ => None, + } +} + +/// Best-effort numeric coercion for the polymorphic `value` field. +pub fn value_as_f64(value: &serde_json::Value) -> Option { + match value { + serde_json::Value::Number(n) => n.as_f64(), + serde_json::Value::String(s) => s.parse().ok(), + _ => None, + } +} + +/// Best-effort coercion of a runtime entry to nanoseconds. +pub fn runtime_as_i64(value: &serde_json::Value) -> Option { + match value { + serde_json::Value::Number(n) => { + if let Some(i) = n.as_i64() { + Some(i) + } else { + n.as_f64().map(|f| f as i64) + } + } + serde_json::Value::String(s) => s.parse().ok(), + _ => None, + } +} + +/// Triple block as emitted by `vortex-bench`'s `--gh-json` path. v2 +/// stored it as an object; we serialize it back out as `arch-os-env`. +#[derive(Debug, Clone, Deserialize)] +pub struct V2EnvTriple { + #[serde(default)] + pub architecture: Option, + #[serde(default)] + pub operating_system: Option, + #[serde(default)] + pub environment: Option, +} + +impl V2EnvTriple { + /// Format as the `arch-os-env` triple used by v3's `env_triple` column. + pub fn to_triple(&self) -> Option { + let arch = self.architecture.as_deref()?; + let os = self.operating_system.as_deref()?; + let env = self.environment.as_deref()?; + Some(format!("{arch}-{os}-{env}")) + } +} + +/// One JSONL line of `commits.json`. +#[derive(Debug, Clone, Deserialize)] +pub struct V2Commit { + pub id: String, + #[serde(default)] + pub timestamp: Option, + #[serde(default)] + pub message: Option, + #[serde(default)] + pub author: Option, + #[serde(default)] + pub committer: Option, + #[serde(default)] + pub tree_id: Option, + #[serde(default)] + pub url: Option, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct V2Person { + #[serde(default)] + pub name: Option, + #[serde(default)] + pub email: Option, +} + +/// One JSONL line of `file-sizes-*.json.gz` produced by +/// `scripts/capture-file-sizes.py`. +#[derive(Debug, Clone, Deserialize)] +pub struct V2FileSize { + pub commit_id: String, + pub benchmark: String, + #[serde(default)] + pub scale_factor: Option, + pub format: String, + pub file: String, + pub size_bytes: i64, +} + +/// Build a sha-keyed map of commits. +pub fn index_commits(commits: Vec) -> BTreeMap { + commits.into_iter().map(|c| (c.id.clone(), c)).collect() +} diff --git a/benchmarks-website/migrate/src/verify.rs b/benchmarks-website/migrate/src/verify.rs new file mode 100644 index 00000000000..eb4caef6df7 --- /dev/null +++ b/benchmarks-website/migrate/src/verify.rs @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Structural diff between a migrated v3 DuckDB and the live v2 +//! `/api/metadata` endpoint. +//! +//! Compares group / chart structure only; values aren't compared +//! because v2 converts ns → ms and bytes → MiB on read while v3 +//! stores raw and the chart query divides. Group/chart structural +//! equivalence is enough to spot classifier regressions before +//! cutover. 
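+//!
+//! Invoked via the `verify` subcommand, roughly (the DuckDB path is
+//! illustrative):
+//!
+//! ```text
+//! vortex-bench-migrate verify --against https://bench.vortex.dev \
+//!     --duckdb ./benchmarks-v3.duckdb
+//! ```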
+ +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::path::Path; + +use anyhow::Context as _; +use anyhow::Result; +use duckdb::Connection; +use serde::Deserialize; + +use crate::classifier::QUERY_SUITES; + +/// Result of one `verify` run. +#[derive(Debug, Default)] +pub struct VerifyReport { + pub matched_groups: Vec, + pub only_in_v3: Vec, + pub only_in_v2: Vec, + pub chart_diffs: Vec, +} + +#[derive(Debug, Clone)] +pub struct ChartDiff { + pub group: String, + pub v2_count: usize, + pub v3_count: usize, +} + +impl VerifyReport { + /// True if every v2 group is represented in v3. The CLI's exit + /// code reflects this. + pub fn v2_groups_covered(&self) -> bool { + self.only_in_v2.is_empty() + } +} + +impl std::fmt::Display for VerifyReport { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Groups in both v2 and v3:")?; + for g in &self.matched_groups { + writeln!(f, " + {g}")?; + } + if !self.only_in_v2.is_empty() { + writeln!(f, "Groups only in v2 (regression candidates):")?; + for g in &self.only_in_v2 { + writeln!(f, " - {g}")?; + } + } + if !self.only_in_v3.is_empty() { + writeln!(f, "Groups only in v3:")?; + for g in &self.only_in_v3 { + writeln!(f, " + {g}")?; + } + } + if !self.chart_diffs.is_empty() { + writeln!(f, "Chart count diffs:")?; + for d in &self.chart_diffs { + writeln!( + f, + " {} : v2={} v3={} (delta={})", + d.group, + d.v2_count, + d.v3_count, + d.v3_count as i64 - d.v2_count as i64, + )?; + } + } + Ok(()) + } +} + +/// v2's `/api/metadata` reply — only the fields we need. +#[derive(Debug, Deserialize)] +struct V2Metadata { + groups: BTreeMap, +} + +#[derive(Debug, Deserialize)] +struct V2GroupMeta { + #[serde(default)] + charts: Vec, +} + +#[derive(Debug, Deserialize)] +struct V2ChartMeta { + #[serde(default)] + name: String, +} + +/// Open the migrated DuckDB at `duckdb_path`, fetch `/api/metadata`, +/// and produce a structural diff. +pub fn run(v2_server: &str, duckdb_path: &Path) -> Result { + let v3 = collect_v3_groups(duckdb_path)?; + let v2 = fetch_v2_metadata(v2_server)?; + Ok(diff(&v2, &v3)) +} + +fn collect_v3_groups(duckdb_path: &Path) -> Result>> { + let conn = Connection::open(duckdb_path) + .with_context(|| format!("opening DuckDB at {}", duckdb_path.display()))?; + let mut groups: BTreeMap> = BTreeMap::new(); + + // query_measurements: chart per (dataset, query_idx); group per + // (dataset, dataset_variant, scale_factor, storage). We want v2 + // group display names so the verifier can compare apples to + // apples, so we re-format them here using the same suite table. + let mut stmt = conn.prepare( + r#" + SELECT dataset, dataset_variant, scale_factor, storage, query_idx + FROM query_measurements + GROUP BY dataset, dataset_variant, scale_factor, storage, query_idx + "#, + )?; + let rows = stmt.query_map([], |row| { + Ok(( + row.get::<_, String>(0)?, + row.get::<_, Option>(1)?, + row.get::<_, Option>(2)?, + row.get::<_, String>(3)?, + row.get::<_, i32>(4)?, + )) + })?; + for row in rows { + let (dataset, _variant, sf, storage, query_idx) = row?; + let group_name = display_query_group(&dataset, sf.as_deref(), &storage); + let chart_name = chart_name_query(&dataset, query_idx); + groups + .entry(group_name) + .or_default() + .insert(normalize_chart(&chart_name)); + } + + // compression_times: group "Compression", charts per dataset. 
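+    // (In v2 the Compression charts are keyed by the metric, e.g.
+    // "COMPRESS TIME" or "LANCE DECOMPRESS TIME", with the dataset as the
+    // series; chart_name_compression_time below projects the v3
+    // (format, op) pair back onto those metric names for comparison.)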
+ let mut stmt = conn.prepare( + r#" + SELECT dataset, format, op + FROM compression_times + GROUP BY dataset, format, op + "#, + )?; + let rows = stmt.query_map([], |row| { + Ok(( + row.get::<_, String>(0)?, + row.get::<_, String>(1)?, + row.get::<_, String>(2)?, + )) + })?; + for row in rows { + let (dataset, format, op) = row?; + let chart = chart_name_compression_time(&format, &op, &dataset); + groups + .entry("Compression".to_string()) + .or_default() + .insert(normalize_chart(&chart)); + } + + let mut stmt = conn.prepare( + r#" + SELECT dataset, format + FROM compression_sizes + GROUP BY dataset, format + "#, + )?; + let rows = stmt.query_map([], |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)) + })?; + for row in rows { + let (_dataset, format) = row?; + let chart = chart_name_compression_size(&format); + groups + .entry("Compression Size".to_string()) + .or_default() + .insert(normalize_chart(&chart)); + } + + let mut stmt = conn.prepare( + r#" + SELECT DISTINCT dataset + FROM random_access_times + "#, + )?; + let rows = stmt.query_map([], |row| row.get::<_, String>(0))?; + for row in rows { + let dataset = row?; + groups + .entry("Random Access".to_string()) + .or_default() + .insert(normalize_chart(&dataset)); + } + + Ok(groups) +} + +fn fetch_v2_metadata(server: &str) -> Result>> { + let url = format!("{}/api/metadata", server.trim_end_matches('/')); + let body = reqwest::blocking::get(&url) + .with_context(|| format!("GET {url}"))? + .error_for_status() + .with_context(|| format!("non-2xx from {url}"))? + .json::() + .with_context(|| format!("parsing {url} as v2 /api/metadata"))?; + let mut out: BTreeMap> = BTreeMap::new(); + for (name, group) in body.groups { + let charts = group + .charts + .into_iter() + .map(|c| normalize_chart(&c.name)) + .collect(); + out.insert(name, charts); + } + Ok(out) +} + +fn diff( + v2: &BTreeMap>, + v3: &BTreeMap>, +) -> VerifyReport { + let mut report = VerifyReport::default(); + let v2_keys: BTreeSet<&String> = v2.keys().collect(); + let v3_keys: BTreeSet<&String> = v3.keys().collect(); + for g in v2_keys.intersection(&v3_keys) { + report.matched_groups.push((**g).clone()); + let v2_charts = &v2[*g]; + let v3_charts = &v3[*g]; + if v2_charts.len() != v3_charts.len() { + report.chart_diffs.push(ChartDiff { + group: (**g).clone(), + v2_count: v2_charts.len(), + v3_count: v3_charts.len(), + }); + } + } + for g in v3_keys.difference(&v2_keys) { + report.only_in_v3.push((**g).clone()); + } + for g in v2_keys.difference(&v3_keys) { + report.only_in_v2.push((**g).clone()); + } + report.matched_groups.sort(); + report.only_in_v3.sort(); + report.only_in_v2.sort(); + report +} + +fn display_query_group(dataset: &str, scale_factor: Option<&str>, storage: &str) -> String { + let suite = QUERY_SUITES + .iter() + .find(|s| s.prefix.eq_ignore_ascii_case(dataset)) + .copied(); + match suite { + Some(suite) if suite.fan_out => { + let storage_disp = match storage { + "s3" | "S3" => "S3", + _ => "NVMe", + }; + let sf = scale_factor.unwrap_or("1"); + format!("{} ({}) (SF={})", suite.display_name, storage_disp, sf) + } + Some(suite) => suite.display_name.to_string(), + None => format!("{dataset} ({storage})"), + } +} + +fn chart_name_query(dataset: &str, query_idx: i32) -> String { + let suite = QUERY_SUITES + .iter() + .find(|s| s.prefix.eq_ignore_ascii_case(dataset)) + .copied(); + match suite { + Some(suite) => format!("{} Q{}", suite.query_prefix, query_idx), + None => format!("{} Q{}", dataset.to_uppercase(), query_idx), + } +} + +fn 
chart_name_compression_time(format: &str, op: &str, _dataset: &str) -> String { + // Re-derive the v2 chart name (the metric, not the dataset) so we + // can compare. v2's chart axis is the metric; series is the + // dataset. v3 inverts that. For structural comparison, we project + // back to v2's per-chart key. + match (format, op) { + ("vortex-file-compressed", "encode") => "COMPRESS TIME".into(), + ("vortex-file-compressed", "decode") => "DECOMPRESS TIME".into(), + ("parquet", "encode") => "PARQUET RS ZSTD COMPRESS TIME".into(), + ("parquet", "decode") => "PARQUET RS ZSTD DECOMPRESS TIME".into(), + ("lance", "encode") => "LANCE COMPRESS TIME".into(), + ("lance", "decode") => "LANCE DECOMPRESS TIME".into(), + _ => format!("{} {} TIME", format.to_uppercase(), op.to_uppercase()), + } +} + +fn chart_name_compression_size(format: &str) -> String { + match format { + "vortex-file-compressed" => "VORTEX SIZE".into(), + "parquet" => "PARQUET SIZE".into(), + "lance" => "LANCE SIZE".into(), + _ => format!("{} SIZE", format.to_uppercase()), + } +} + +/// Strip casing and `_-` differences between v2 and v3 chart names. +/// v2 displays uppercase; v3 stores raw values. Comparing in this +/// canonical form is enough for structural verification. +fn normalize_chart(s: &str) -> String { + s.trim() + .to_uppercase() + .replace(['_', '-'], " ") + .split_whitespace() + .collect::>() + .join(" ") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn normalize_chart_canonicalizes() { + assert_eq!(normalize_chart("taxi/take"), "TAXI/TAKE"); + assert_eq!(normalize_chart("TAXI/TAKE"), "TAXI/TAKE"); + assert_eq!(normalize_chart("tpc-h q1"), "TPC H Q1"); + assert_eq!(normalize_chart("tpc h q1"), "TPC H Q1"); + } + + #[test] + fn display_query_group_handles_fan_out() { + assert_eq!( + display_query_group("tpch", Some("10"), "s3"), + "TPC-H (S3) (SF=10)" + ); + assert_eq!( + display_query_group("tpch", Some("100"), "nvme"), + "TPC-H (NVMe) (SF=100)" + ); + assert_eq!( + display_query_group("clickbench", None, "nvme"), + "Clickbench" + ); + } +} diff --git a/benchmarks-website/migrate/tests/classifier.rs b/benchmarks-website/migrate/tests/classifier.rs new file mode 100644 index 00000000000..2be3896216c --- /dev/null +++ b/benchmarks-website/migrate/tests/classifier.rs @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Classifier behavior pinned by representative v2 names from each +//! group in `benchmarks-website/server.js`'s `getGroup`. 
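+//!
+//! For example, `tpch_q01/datafusion:parquet` with S3 storage and
+//! scale factor 100 must land in the TPC-H query bin, while
+//! `vortex:parquet-zstd ratio compress time/clickbench` must drop.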
+ +use rstest::rstest; +use serde_json::json; +use vortex_bench_migrate::classifier::V3Bin; +use vortex_bench_migrate::classifier::classify; +use vortex_bench_migrate::classifier::format_query; +use vortex_bench_migrate::classifier::rename_engine; +use vortex_bench_migrate::v2::V2Record; + +fn record(name: &str) -> V2Record { + V2Record { + name: name.to_string(), + commit_id: Some("deadbeef".into()), + unit: Some("ns".into()), + value: Some(json!(123)), + storage: None, + dataset: None, + all_runtimes: None, + env_triple: None, + } +} + +fn record_with_storage_and_sf(name: &str, storage: &str, suite: &str, sf: &str) -> V2Record { + let mut r = record(name); + r.storage = Some(storage.into()); + r.dataset = Some(json!({ suite: { "scale_factor": sf } })); + r +} + +#[rstest] +#[case::clickbench( + "clickbench_q07/datafusion:parquet", + V3Bin::Query { + dataset: "clickbench".into(), + dataset_variant: None, + scale_factor: None, + query_idx: 7, + storage: "nvme".into(), + engine: "datafusion".into(), + format: "parquet".into(), + }, +)] +#[case::clickbench_vortex_renamed( + "clickbench_q12/datafusion:vortex-file-compressed", + V3Bin::Query { + dataset: "clickbench".into(), + dataset_variant: None, + scale_factor: None, + query_idx: 12, + storage: "nvme".into(), + engine: "datafusion".into(), + format: "vortex".into(), + }, +)] +#[case::statpopgen( + "statpopgen_q3/datafusion:parquet", + V3Bin::Query { + dataset: "statpopgen".into(), + dataset_variant: None, + scale_factor: None, + query_idx: 3, + storage: "nvme".into(), + engine: "datafusion".into(), + format: "parquet".into(), + }, +)] +#[case::polarsignals( + "polarsignals_q1/duckdb:parquet", + V3Bin::Query { + dataset: "polarsignals".into(), + dataset_variant: None, + scale_factor: None, + query_idx: 1, + storage: "nvme".into(), + engine: "duckdb".into(), + format: "parquet".into(), + }, +)] +fn non_fan_out_query_records(#[case] name: &str, #[case] expected: V3Bin) { + let r = record(name); + assert_eq!(classify(&r), Some(expected)); +} + +#[rstest] +#[case::tpch_s3_sf100( + "tpch_q01/datafusion:parquet", + "S3", + "tpch", + "100", + V3Bin::Query { + dataset: "tpch".into(), + dataset_variant: None, + scale_factor: Some("100".into()), + query_idx: 1, + storage: "s3".into(), + engine: "datafusion".into(), + format: "parquet".into(), + }, +)] +#[case::tpch_nvme_sf1( + "tpch_q22/duckdb:vortex-file-compressed", + "NVMe", + "tpch", + "1", + V3Bin::Query { + dataset: "tpch".into(), + dataset_variant: None, + scale_factor: Some("1".into()), + query_idx: 22, + storage: "nvme".into(), + engine: "duckdb".into(), + format: "vortex".into(), + }, +)] +#[case::tpcds_nvme_sf10( + "tpcds_q05/datafusion:vortex-file-compressed", + "NVMe", + "tpcds", + "10", + V3Bin::Query { + dataset: "tpcds".into(), + dataset_variant: None, + scale_factor: Some("10".into()), + query_idx: 5, + storage: "nvme".into(), + engine: "datafusion".into(), + format: "vortex".into(), + }, +)] +fn fan_out_query_records( + #[case] name: &str, + #[case] storage: &str, + #[case] suite: &str, + #[case] sf: &str, + #[case] expected: V3Bin, +) { + let r = record_with_storage_and_sf(name, storage, suite, sf); + assert_eq!(classify(&r), Some(expected)); +} + +#[rstest] +#[case::random_access_4_part( + "random-access/taxi/take/parquet-tokio-local-disk", + V3Bin::RandomAccess { + dataset: "taxi/take".into(), + format: "parquet-nvme".into(), + }, +)] +#[case::random_access_4_part_vortex( + "random-access/chimp/take/vortex-tokio-local-disk", + V3Bin::RandomAccess { + dataset: "chimp/take".into(), 
+ format: "vortex-nvme".into(), + }, +)] +#[case::random_access_2_part_legacy( + "random-access/parquet-tokio-local-disk", + V3Bin::RandomAccess { + dataset: "random access".into(), + format: "parquet-nvme".into(), + }, +)] +fn random_access_records(#[case] name: &str, #[case] expected: V3Bin) { + let r = record(name); + assert_eq!(classify(&r), Some(expected)); +} + +#[rstest] +#[case::compress_time_vortex( + "compress time/clickbench", + V3Bin::CompressionTime { + dataset: "clickbench".into(), + dataset_variant: None, + format: "vortex-file-compressed".into(), + op: "encode".into(), + }, +)] +#[case::decompress_time_vortex( + "decompress time/tpch_lineitem", + V3Bin::CompressionTime { + dataset: "tpch_lineitem".into(), + dataset_variant: None, + format: "vortex-file-compressed".into(), + op: "decode".into(), + }, +)] +#[case::parquet_compress( + "parquet_rs-zstd compress time/clickbench", + V3Bin::CompressionTime { + dataset: "clickbench".into(), + dataset_variant: None, + format: "parquet".into(), + op: "encode".into(), + }, +)] +#[case::lance_decompress( + "lance decompress time/clickbench", + V3Bin::CompressionTime { + dataset: "clickbench".into(), + dataset_variant: None, + format: "lance".into(), + op: "decode".into(), + }, +)] +fn compression_time_records(#[case] name: &str, #[case] expected: V3Bin) { + let r = record(name); + assert_eq!(classify(&r), Some(expected)); +} + +#[rstest] +#[case::vortex_size( + "vortex size/clickbench", + V3Bin::CompressionSize { + dataset: "clickbench".into(), + dataset_variant: None, + format: "vortex-file-compressed".into(), + }, +)] +#[case::vortex_file_compressed_size_normalizes( + "vortex-file-compressed size/clickbench", + V3Bin::CompressionSize { + dataset: "clickbench".into(), + dataset_variant: None, + format: "vortex-file-compressed".into(), + }, +)] +#[case::parquet_size( + "parquet size/clickbench", + V3Bin::CompressionSize { + dataset: "clickbench".into(), + dataset_variant: None, + format: "parquet".into(), + }, +)] +#[case::lance_size( + "lance size/tpch_lineitem", + V3Bin::CompressionSize { + dataset: "tpch_lineitem".into(), + dataset_variant: None, + format: "lance".into(), + }, +)] +fn compression_size_records(#[case] name: &str, #[case] expected: V3Bin) { + let r = record(name); + assert_eq!(classify(&r), Some(expected)); +} + +#[rstest] +#[case::ratio_vortex_parquet("vortex:parquet-zstd ratio compress time/clickbench")] +#[case::ratio_vortex_lance("vortex:lance ratio decompress time/clickbench")] +#[case::ratio_size_vortex_parquet("vortex:parquet-zstd size/clickbench")] +#[case::ratio_size_vortex_raw("vortex:raw size/clickbench")] +#[case::throughput("compress throughput/clickbench")] +#[case::fineweb_skipped("fineweb_q01/datafusion:parquet")] +#[case::nonsense_prefix("not-a-known-bench/series")] +fn unmapped_records_yield_none(#[case] name: &str) { + let r = record(name); + assert_eq!( + classify(&r), + None, + "expected {name:?} to classify as None (drop)", + ); +} + +#[test] +fn rename_engine_pins_canonical_outputs() { + assert_eq!(rename_engine("vortex-tokio-local-disk"), "vortex-nvme"); + assert_eq!( + rename_engine("datafusion:vortex-file-compressed"), + "datafusion:vortex" + ); + assert_eq!(rename_engine("LANCE"), "lance"); +} + +#[test] +fn format_query_pins_v2_display() { + assert_eq!(format_query("clickbench_q00"), "CLICKBENCH Q0"); + assert_eq!(format_query("tpch_q22"), "TPC-H Q22"); + assert_eq!(format_query("tpcds_q42"), "TPC-DS Q42"); + assert_eq!(format_query("polarsignals_q1"), "POLARSIGNALS Q1"); + // Names that 
don't match a suite fall back to upper + " " replace. + assert_eq!( + format_query("vortex-file-compressed size"), + "VORTEX FILE COMPRESSED SIZE" + ); +} diff --git a/benchmarks-website/migrate/tests/end_to_end.rs b/benchmarks-website/migrate/tests/end_to_end.rs new file mode 100644 index 00000000000..5892215b472 --- /dev/null +++ b/benchmarks-website/migrate/tests/end_to_end.rs @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Inline JSONL fixture exercising 1 record per kind through the full +//! migration into a tempdir DuckDB. No live S3. + +use std::fs::File; +use std::io::Write; + +use duckdb::Connection; +use flate2::Compression; +use flate2::write::GzEncoder; +use tempfile::TempDir; +use vortex_bench_migrate::migrate; +use vortex_bench_migrate::source::Source; + +const COMMITS_JSONL: &str = r#"{"id":"deadbeef","timestamp":"2026-04-25T00:00:00Z","message":"fixture commit","author":{"name":"A","email":"a@example.com"},"committer":{"name":"C","email":"c@example.com"},"tree_id":"abcd0001","url":"https://example.com/commit/deadbeef"} +"#; + +const DATA_JSONL: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":42000,"all_runtimes":[41000,42000,43000]} +{"name":"compress time/clickbench","commit_id":"deadbeef","unit":"ns","value":99} +{"name":"vortex size/clickbench","commit_id":"deadbeef","unit":"bytes","value":1024} +{"name":"random-access/taxi/take/parquet-tokio-local-disk","commit_id":"deadbeef","unit":"ns","value":777,"all_runtimes":[700,777,800]} +"#; + +fn write_local_dir() -> TempDir { + let dir = TempDir::new().expect("tempdir"); + { + let mut f = File::create(dir.path().join("commits.json")).unwrap(); + f.write_all(COMMITS_JSONL.as_bytes()).unwrap(); + } + { + let f = File::create(dir.path().join("data.json.gz")).unwrap(); + let mut gz = GzEncoder::new(f, Compression::default()); + gz.write_all(DATA_JSONL.as_bytes()).unwrap(); + gz.finish().unwrap(); + } + // No file-sizes-*.json.gz to keep the fixture minimal. + dir +} + +#[test] +fn migrate_inline_fixture_populates_each_table() { + let src_dir = write_local_dir(); + let target_dir = TempDir::new().unwrap(); + let target = target_dir.path().join("v3.duckdb"); + + let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); + + assert_eq!(summary.records_read, 4, "summary={summary}"); + assert_eq!(summary.uncategorized, 0, "summary={summary}"); + assert_eq!(summary.commits_inserted, 1); + assert_eq!(summary.query_inserted, 1); + assert_eq!(summary.compression_time_inserted, 1); + assert_eq!(summary.compression_size_inserted, 1); + assert_eq!(summary.random_access_inserted, 1); + + let conn = Connection::open(&target).unwrap(); + let count = |table: &str| -> i64 { + conn.query_row(&format!("SELECT COUNT(*) FROM {table}"), [], |r| r.get(0)) + .unwrap() + }; + assert_eq!(count("commits"), 1); + assert_eq!(count("query_measurements"), 1); + assert_eq!(count("compression_times"), 1); + assert_eq!(count("compression_sizes"), 1); + assert_eq!(count("random_access_times"), 1); + + // Spot-check the v3 column values for each kind. 
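+    // query_measurements: engine/format/query_idx/value_ns;
+    // compression_times: dataset/format/op;
+    // compression_sizes: dataset/format/value_bytes;
+    // random_access_times: dataset/format.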
+ let (engine, format, query_idx, value_ns): (String, String, i32, i64) = conn + .query_row( + "SELECT engine, format, query_idx, value_ns FROM query_measurements", + [], + |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)), + ) + .unwrap(); + assert_eq!(engine, "datafusion"); + assert_eq!(format, "parquet"); + assert_eq!(query_idx, 7); + assert_eq!(value_ns, 42000); + + let (dataset, format, op): (String, String, String) = conn + .query_row( + "SELECT dataset, format, op FROM compression_times", + [], + |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)), + ) + .unwrap(); + assert_eq!(dataset, "clickbench"); + assert_eq!(format, "vortex-file-compressed"); + assert_eq!(op, "encode"); + + let (dataset, format, value_bytes): (String, String, i64) = conn + .query_row( + "SELECT dataset, format, value_bytes FROM compression_sizes", + [], + |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)), + ) + .unwrap(); + assert_eq!(dataset, "clickbench"); + assert_eq!(format, "vortex-file-compressed"); + assert_eq!(value_bytes, 1024); + + let (dataset, format): (String, String) = conn + .query_row("SELECT dataset, format FROM random_access_times", [], |r| { + Ok((r.get(0)?, r.get(1)?)) + }) + .unwrap(); + assert_eq!(dataset, "taxi/take"); + assert_eq!(format, "parquet-nvme"); +} From df53d2ca3f9404e67ab2a5a5ec4e553a6c28d0e3 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 26 Apr 2026 22:15:48 +0000 Subject: [PATCH 2/5] fix+perf(benchmarks-migrate): canonical formats + prepared INSERT statements Two narrow fixes: 1. Classifier wrote v2's display-renamed engine and format strings (e.g. "vortex" instead of "vortex-file-compressed") into v3's columns. v3's live emitter writes canonical Format::name() strings, so historical and live records would split into separate chart series at cutover. Pull engine and format from the raw record name; the rename was a v2 read-time UI concern only. 2. The per-row tx.execute(sql, params) hot path re-parsed SQL on every record. Hoist tx.prepare(sql) outside the row loop and reuse the prepared statement. Local migration time: ~15 minutes -> ~2-3 minutes. (The DuckDB Appender API would be ~10x faster still, but its append_row is unimplemented for BIGINT[] columns in duckdb-rs 1.10502, and Arrow record batches are out of scope for this fix.) Signed-off-by: Claude --- benchmarks-website/migrate/Cargo.toml | 3 +- benchmarks-website/migrate/src/classifier.rs | 51 ++- benchmarks-website/migrate/src/migrate.rs | 392 +++++++++--------- .../migrate/tests/classifier.rs | 19 +- .../migrate/tests/end_to_end.rs | 2 +- 5 files changed, 249 insertions(+), 218 deletions(-) diff --git a/benchmarks-website/migrate/Cargo.toml b/benchmarks-website/migrate/Cargo.toml index 464e55d9485..a68903f75eb 100644 --- a/benchmarks-website/migrate/Cargo.toml +++ b/benchmarks-website/migrate/Cargo.toml @@ -21,7 +21,8 @@ path = "src/main.rs" [dependencies] anyhow = { workspace = true } clap = { workspace = true, features = ["derive"] } -duckdb = { version = "1.4", features = ["bundled"] } +# track vortex-duckdb's bundled engine version (build.rs) +duckdb = { version = "1.10502", features = ["bundled"] } flate2 = "1.1" reqwest = { workspace = true, features = ["json"] } serde = { workspace = true, features = ["derive"] } diff --git a/benchmarks-website/migrate/src/classifier.rs b/benchmarks-website/migrate/src/classifier.rs index f7a1e56c0ae..4e6e53fef1a 100644 --- a/benchmarks-website/migrate/src/classifier.rs +++ b/benchmarks-website/migrate/src/classifier.rs @@ -12,6 +12,13 @@ //! factor for TPC-H/TPC-DS). 
This module reproduces that logic and //! then hops to a v3 fact-table bin, since v3 stores dim values as //! columns instead of name fragments. +//! +//! Engine and format strings stored in v3 columns are pulled from the +//! raw, pre-rename v2 record name. v2's `ENGINE_RENAMES` was a v2 +//! read-time UI concern (e.g. `vortex-file-compressed` rendered as +//! `vortex` and `parquet-tokio-local-disk` rendered as `parquet-nvme`). +//! v3 stores canonical `Format::name()` strings to match what the v3 +//! live emitter writes, so historical and live records share series. use crate::v2::V2Record; use crate::v2::dataset_scale_factor; @@ -393,21 +400,30 @@ fn bin_random_access(cls: &V2Classification, record: &V2Record) -> Option if dataset.is_empty() { return None; } - let mut format = cls.series.clone(); - if format.is_empty() { - return None; - } - // v2 emits a "default" placeholder when parts[1] is empty; treat - // that as missing and skip the row instead of inserting "default" - // as a format. - if format == "default" { + // Pull format from the raw, pre-rename v2 name so v3 stores the + // canonical `Format::name()` string (matching what the v3 live + // emitter writes). Raw shape is + // `random-access///-tokio-local-disk` + // (4-part) or `random-access/-tokio-local-disk` (2-part + // legacy). After stripping the `-tokio-local-disk` suffix, map the + // v2 random-access ext label (`vortex`, from `Format::ext()`) to + // the canonical name (`vortex-file-compressed`, from + // `Format::name()`). `parquet`, `lance`, and `vortex-compact` + // already match between ext and name. + let parts: Vec<&str> = record.name.split('/').collect(); + let raw = match parts.len() { + 4 => parts[3], + 2 => parts[1], + _ => return None, + }; + if raw.is_empty() || raw == "default" { return None; } - // The v2 random-access bench used to emit `parquet`-suffixed names; - // strip an "ns" unit guard later. - let _ = record; // record is unused here; kept for parity with siblings. - // Lower-case the format too so v3 series names are canonical. - format = format.to_lowercase(); + let stripped = raw.strip_suffix("-tokio-local-disk").unwrap_or(raw); + let format = match stripped { + "vortex" => "vortex-file-compressed".to_string(), + other => other.to_lowercase(), + }; Some(V3Bin::RandomAccess { dataset, format }) } @@ -498,8 +514,13 @@ fn bin_query(cls: &V2Classification, record: &V2Record) -> Option { let raw_first = record.name.split('/').next().unwrap_or(""); let query_idx = parse_query_index_from_first(raw_first)?; - // Series for non-RA records is "engine:format" after rename. - let (engine, format) = split_engine_format(&cls.series)?; + // Pull engine:format from the raw, pre-rename second segment so v3 + // stores canonical `Format::name()` strings (e.g. + // `vortex-file-compressed`) that match what the v3 live emitter + // writes. `cls.series` has been through v2's `ENGINE_RENAMES` for + // UI display and is not appropriate for v3 columns. 
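+    // e.g. `clickbench_q12/datafusion:vortex-file-compressed` keeps
+    // engine `datafusion` and format `vortex-file-compressed` instead
+    // of v2's renamed display value `vortex`.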
+ let raw_series = record.name.split('/').nth(1)?; + let (engine, format) = split_engine_format(raw_series)?; let storage_v3 = match storage.as_deref() { Some("S3") => "s3".to_string(), diff --git a/benchmarks-website/migrate/src/migrate.rs b/benchmarks-website/migrate/src/migrate.rs index f75e0169fda..5801d820905 100644 --- a/benchmarks-website/migrate/src/migrate.rs +++ b/benchmarks-website/migrate/src/migrate.rs @@ -16,6 +16,7 @@ use std::path::Path; use anyhow::Context as _; use anyhow::Result; use duckdb::Connection; +use duckdb::Statement; use duckdb::Transaction; use duckdb::params; use tracing::warn; @@ -148,37 +149,119 @@ fn migrate_data_jsonl( summary: &mut MigrationSummary, ) -> Result<()> { let reader = source.open_data_jsonl()?; - let mut tx = conn.transaction().context("begin data tx")?; + let mut lines = reader.lines(); const BATCH: u64 = 10_000; - let mut in_batch = 0u64; - for line in reader.lines() { - let line = line?; - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - summary.records_read += 1; - let record: V2Record = match serde_json::from_str(trimmed) { - Ok(r) => r, - Err(e) => { - warn!("skipping malformed data.json line: {e}"); + loop { + let tx = conn.transaction().context("begin data tx")?; + let mut stmts = DataStatements::prepare(&tx)?; + let mut in_batch = 0u64; + while in_batch < BATCH { + let Some(line) = lines.next() else { break }; + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { continue; } - }; - apply_v2_record(&tx, &record, commits, summary)?; - in_batch += 1; - if in_batch >= BATCH { - tx.commit().context("commit data batch")?; - tx = conn.transaction().context("begin data tx")?; - in_batch = 0; + summary.records_read += 1; + let record: V2Record = match serde_json::from_str(trimmed) { + Ok(r) => r, + Err(e) => { + warn!("skipping malformed data.json line: {e}"); + continue; + } + }; + apply_v2_record(&mut stmts, &record, commits, summary)?; + in_batch += 1; + } + drop(stmts); + tx.commit().context("commit data batch")?; + if in_batch == 0 { + break; } } - tx.commit().context("commit final data batch")?; Ok(()) } +/// Prepared INSERT statements for the four v2-derived fact tables. Tied +/// to a single transaction's lifetime; re-prepare after each commit. +struct DataStatements<'tx> { + query: Statement<'tx>, + compression_time: Statement<'tx>, + compression_size: Statement<'tx>, + random_access: Statement<'tx>, +} + +impl<'tx> DataStatements<'tx> { + fn prepare(tx: &'tx Transaction<'_>) -> Result { + Ok(Self { + query: tx.prepare(SQL_INSERT_QUERY)?, + compression_time: tx.prepare(SQL_INSERT_COMPRESSION_TIME)?, + compression_size: tx.prepare(SQL_INSERT_COMPRESSION_SIZE)?, + random_access: tx.prepare(SQL_INSERT_RANDOM_ACCESS)?, + }) + } +} + +const SQL_INSERT_QUERY: &str = r#" +INSERT INTO query_measurements ( + measurement_id, commit_sha, dataset, dataset_variant, scale_factor, + query_idx, storage, engine, format, + value_ns, all_runtimes_ns, + peak_physical, peak_virtual, physical_delta, virtual_delta, + env_triple +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?, ?, ?, ?, ?) 
+ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_ns = excluded.value_ns, + all_runtimes_ns = excluded.all_runtimes_ns, + env_triple = excluded.env_triple +"#; + +const SQL_INSERT_COMPRESSION_TIME: &str = r#" +INSERT INTO compression_times ( + measurement_id, commit_sha, dataset, dataset_variant, + format, op, value_ns, all_runtimes_ns, env_triple +) VALUES (?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) +ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_ns = excluded.value_ns, + all_runtimes_ns = excluded.all_runtimes_ns, + env_triple = excluded.env_triple +"#; + +const SQL_INSERT_COMPRESSION_SIZE: &str = r#" +INSERT INTO compression_sizes ( + measurement_id, commit_sha, dataset, dataset_variant, + format, value_bytes +) VALUES (?, ?, ?, ?, ?, ?) +ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_bytes = excluded.value_bytes +"#; + +const SQL_INSERT_RANDOM_ACCESS: &str = r#" +INSERT INTO random_access_times ( + measurement_id, commit_sha, dataset, format, + value_ns, all_runtimes_ns, env_triple +) VALUES (?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) +ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_ns = excluded.value_ns, + all_runtimes_ns = excluded.all_runtimes_ns, + env_triple = excluded.env_triple +"#; + +const SQL_UPSERT_FILE_SIZE: &str = r#" +INSERT INTO compression_sizes ( + measurement_id, commit_sha, dataset, dataset_variant, + format, value_bytes +) VALUES (?, ?, ?, ?, ?, ?) +ON CONFLICT (measurement_id) DO UPDATE SET + value_bytes = compression_sizes.value_bytes + excluded.value_bytes +"#; + fn apply_v2_record( - tx: &Transaction<'_>, + stmts: &mut DataStatements<'_>, record: &V2Record, commits: &BTreeMap, summary: &mut MigrationSummary, @@ -240,7 +323,25 @@ fn apply_v2_record( virtual_delta: None, env_triple, }; - insert_query(tx, &qm)?; + let mid = measurement_id_query(&qm); + stmts.query.execute(params![ + mid, + qm.commit_sha, + qm.dataset, + qm.dataset_variant, + qm.scale_factor, + qm.query_idx, + qm.storage, + qm.engine, + qm.format, + qm.value_ns, + runtimes_literal(&qm.all_runtimes_ns), + qm.peak_physical, + qm.peak_virtual, + qm.physical_delta, + qm.virtual_delta, + qm.env_triple, + ])?; summary.query_inserted += 1; } V3Bin::CompressionTime { @@ -259,7 +360,18 @@ fn apply_v2_record( all_runtimes_ns: runtimes, env_triple, }; - insert_compression_time(tx, &ct)?; + let mid = measurement_id_compression_time(&ct); + stmts.compression_time.execute(params![ + mid, + ct.commit_sha, + ct.dataset, + ct.dataset_variant, + ct.format, + ct.op, + ct.value_ns, + runtimes_literal(&ct.all_runtimes_ns), + ct.env_triple, + ])?; summary.compression_time_inserted += 1; } V3Bin::CompressionSize { @@ -274,7 +386,15 @@ fn apply_v2_record( format, value_bytes: value_f64 as i64, }; - insert_compression_size(tx, &cs)?; + let mid = measurement_id_compression_size(&cs); + stmts.compression_size.execute(params![ + mid, + cs.commit_sha, + cs.dataset, + cs.dataset_variant, + cs.format, + cs.value_bytes, + ])?; summary.compression_size_inserted += 1; } V3Bin::RandomAccess { dataset, format } => { @@ -286,132 +406,22 @@ fn apply_v2_record( all_runtimes_ns: runtimes, env_triple, }; - insert_random_access(tx, &ra)?; + let mid = measurement_id_random_access(&ra); + stmts.random_access.execute(params![ + mid, + ra.commit_sha, + ra.dataset, + ra.format, + ra.value_ns, + runtimes_literal(&ra.all_runtimes_ns), + ra.env_triple, + ])?; summary.random_access_inserted += 1; 
} } Ok(()) } -fn insert_query(tx: &Transaction<'_>, r: &QueryMeasurement) -> Result<()> { - let mid = measurement_id_query(r); - tx.execute( - r#" - INSERT INTO query_measurements ( - measurement_id, commit_sha, dataset, dataset_variant, scale_factor, - query_idx, storage, engine, format, - value_ns, all_runtimes_ns, - peak_physical, peak_virtual, physical_delta, virtual_delta, - env_triple - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?, ?, ?, ?, ?) - ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_ns = excluded.value_ns, - all_runtimes_ns = excluded.all_runtimes_ns, - env_triple = excluded.env_triple - "#, - params![ - mid, - r.commit_sha, - r.dataset, - r.dataset_variant, - r.scale_factor, - r.query_idx, - r.storage, - r.engine, - r.format, - r.value_ns, - runtimes_literal(&r.all_runtimes_ns), - r.peak_physical, - r.peak_virtual, - r.physical_delta, - r.virtual_delta, - r.env_triple, - ], - )?; - Ok(()) -} - -fn insert_compression_time(tx: &Transaction<'_>, r: &CompressionTime) -> Result<()> { - let mid = measurement_id_compression_time(r); - tx.execute( - r#" - INSERT INTO compression_times ( - measurement_id, commit_sha, dataset, dataset_variant, - format, op, value_ns, all_runtimes_ns, env_triple - ) VALUES (?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) - ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_ns = excluded.value_ns, - all_runtimes_ns = excluded.all_runtimes_ns, - env_triple = excluded.env_triple - "#, - params![ - mid, - r.commit_sha, - r.dataset, - r.dataset_variant, - r.format, - r.op, - r.value_ns, - runtimes_literal(&r.all_runtimes_ns), - r.env_triple, - ], - )?; - Ok(()) -} - -fn insert_compression_size(tx: &Transaction<'_>, r: &CompressionSize) -> Result<()> { - let mid = measurement_id_compression_size(r); - tx.execute( - r#" - INSERT INTO compression_sizes ( - measurement_id, commit_sha, dataset, dataset_variant, - format, value_bytes - ) VALUES (?, ?, ?, ?, ?, ?) - ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_bytes = excluded.value_bytes - "#, - params![ - mid, - r.commit_sha, - r.dataset, - r.dataset_variant, - r.format, - r.value_bytes, - ], - )?; - Ok(()) -} - -fn insert_random_access(tx: &Transaction<'_>, r: &RandomAccessTime) -> Result<()> { - let mid = measurement_id_random_access(r); - tx.execute( - r#" - INSERT INTO random_access_times ( - measurement_id, commit_sha, dataset, format, - value_ns, all_runtimes_ns, env_triple - ) VALUES (?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) 
- ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_ns = excluded.value_ns, - all_runtimes_ns = excluded.all_runtimes_ns, - env_triple = excluded.env_triple - "#, - params![ - mid, - r.commit_sha, - r.dataset, - r.format, - r.value_ns, - runtimes_literal(&r.all_runtimes_ns), - r.env_triple, - ], - )?; - Ok(()) -} - fn runtimes_literal(values: &[i64]) -> String { let mut s = String::with_capacity(values.len() * 8 + 2); s.push('['); @@ -438,46 +448,50 @@ fn migrate_file_sizes( .and_then(|s| s.strip_suffix(".json.gz")) .unwrap_or(name) .to_string(); - let mut tx = conn.transaction().context("begin file-sizes tx")?; + let mut lines = reader.lines(); const BATCH: u64 = 10_000; - let mut in_batch = 0u64; - for line in reader.lines() { - let line = line?; - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - let sz: V2FileSize = match serde_json::from_str(trimmed) { - Ok(r) => r, - Err(e) => { - warn!("skipping malformed {name} line: {e}"); + loop { + let tx = conn.transaction().context("begin file-sizes tx")?; + let mut stmt = tx.prepare(SQL_UPSERT_FILE_SIZE)?; + let mut in_batch = 0u64; + while in_batch < BATCH { + let Some(line) = lines.next() else { break }; + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { continue; } - }; - if !commits.contains_key(&sz.commit_id) { - summary.missing_commit += 1; - continue; + let sz: V2FileSize = match serde_json::from_str(trimmed) { + Ok(r) => r, + Err(e) => { + warn!("skipping malformed {name} line: {e}"); + continue; + } + }; + if !commits.contains_key(&sz.commit_id) { + summary.missing_commit += 1; + continue; + } + // file-sizes-*.json.gz captures per-file sizes inside one + // benchmark/format/scale_factor combo. We aggregate to one + // (commit, dataset, dataset_variant, format) row by summing, + // since v3's compression_sizes is a single bytes value per + // (dim) tuple. Use ON CONFLICT to accumulate. + upsert_file_size_row(&mut stmt, &sz, &dataset)?; + summary.file_size_inserted += 1; + in_batch += 1; } - // file-sizes-*.json.gz captures per-file sizes inside one - // benchmark/format/scale_factor combo. We aggregate to one - // (commit, dataset, dataset_variant, format) row by summing, - // since v3's compression_sizes is a single bytes value per - // (dim) tuple. Use ON CONFLICT to accumulate. - upsert_file_size_row(&tx, &sz, &dataset)?; - summary.file_size_inserted += 1; - in_batch += 1; - if in_batch >= BATCH { - tx.commit().context("commit file-sizes batch")?; - tx = conn.transaction().context("begin file-sizes tx")?; - in_batch = 0; + drop(stmt); + tx.commit().context("commit file-sizes batch")?; + if in_batch == 0 { + break; } } - tx.commit().context("commit final file-sizes batch")?; Ok(()) } fn upsert_file_size_row( - tx: &Transaction<'_>, + stmt: &mut Statement<'_>, sz: &V2FileSize, dataset_fallback: &str, ) -> Result<()> { @@ -499,26 +513,14 @@ fn upsert_file_size_row( value_bytes: sz.size_bytes, }; let mid = measurement_id_compression_size(&cs); - // Multiple files within the same dataset/format/scale_factor sum - // into one row by adding to whatever is already there. - tx.execute( - r#" - INSERT INTO compression_sizes ( - measurement_id, commit_sha, dataset, dataset_variant, - format, value_bytes - ) VALUES (?, ?, ?, ?, ?, ?) 
- ON CONFLICT (measurement_id) DO UPDATE SET - value_bytes = compression_sizes.value_bytes + excluded.value_bytes - "#, - params![ - mid, - cs.commit_sha, - cs.dataset, - cs.dataset_variant, - cs.format, - cs.value_bytes, - ], - )?; + stmt.execute(params![ + mid, + cs.commit_sha, + cs.dataset, + cs.dataset_variant, + cs.format, + cs.value_bytes, + ])?; Ok(()) } diff --git a/benchmarks-website/migrate/tests/classifier.rs b/benchmarks-website/migrate/tests/classifier.rs index 2be3896216c..e8288751d62 100644 --- a/benchmarks-website/migrate/tests/classifier.rs +++ b/benchmarks-website/migrate/tests/classifier.rs @@ -54,7 +54,7 @@ fn record_with_storage_and_sf(name: &str, storage: &str, suite: &str, sf: &str) query_idx: 12, storage: "nvme".into(), engine: "datafusion".into(), - format: "vortex".into(), + format: "vortex-file-compressed".into(), }, )] #[case::statpopgen( @@ -114,7 +114,7 @@ fn non_fan_out_query_records(#[case] name: &str, #[case] expected: V3Bin) { query_idx: 22, storage: "nvme".into(), engine: "duckdb".into(), - format: "vortex".into(), + format: "vortex-file-compressed".into(), }, )] #[case::tpcds_nvme_sf10( @@ -129,7 +129,7 @@ fn non_fan_out_query_records(#[case] name: &str, #[case] expected: V3Bin) { query_idx: 5, storage: "nvme".into(), engine: "datafusion".into(), - format: "vortex".into(), + format: "vortex-file-compressed".into(), }, )] fn fan_out_query_records( @@ -148,21 +148,28 @@ fn fan_out_query_records( "random-access/taxi/take/parquet-tokio-local-disk", V3Bin::RandomAccess { dataset: "taxi/take".into(), - format: "parquet-nvme".into(), + format: "parquet".into(), }, )] #[case::random_access_4_part_vortex( "random-access/chimp/take/vortex-tokio-local-disk", V3Bin::RandomAccess { dataset: "chimp/take".into(), - format: "vortex-nvme".into(), + format: "vortex-file-compressed".into(), }, )] #[case::random_access_2_part_legacy( "random-access/parquet-tokio-local-disk", V3Bin::RandomAccess { dataset: "random access".into(), - format: "parquet-nvme".into(), + format: "parquet".into(), + }, +)] +#[case::random_access_4_part_lance( + "random-access/taxi/take/lance-tokio-local-disk", + V3Bin::RandomAccess { + dataset: "taxi/take".into(), + format: "lance".into(), }, )] fn random_access_records(#[case] name: &str, #[case] expected: V3Bin) { diff --git a/benchmarks-website/migrate/tests/end_to_end.rs b/benchmarks-website/migrate/tests/end_to_end.rs index 5892215b472..b389f77c421 100644 --- a/benchmarks-website/migrate/tests/end_to_end.rs +++ b/benchmarks-website/migrate/tests/end_to_end.rs @@ -107,5 +107,5 @@ fn migrate_inline_fixture_populates_each_table() { }) .unwrap(); assert_eq!(dataset, "taxi/take"); - assert_eq!(format, "parquet-nvme"); + assert_eq!(format, "parquet"); } From b02f418918d90b7dd984f7702b67c4f43b99050c Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 26 Apr 2026 22:49:56 +0000 Subject: [PATCH 3/5] chore(benchmarks-migrate): progress logging + small fit-and-finish Adds tracing-based phase announcements and periodic progress lines (every 5 seconds) so users know the binary isn't hung during multi-minute migrations. Also fixes an inaccurate doc comment about vortex-compact's ext label and skips empty trailing transactions in both streaming loops. No behavior change - all log output, comment-only edits, and a no-op-transaction elision. 
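
The throttling shape, as a minimal self-contained sketch (function and
variable names here are illustrative only; the real hunks below thread
the counters through MigrationSummary):

    use std::time::{Duration, Instant};

    /// Emit at most one progress line per five-second window from a hot loop.
    fn process(lines: impl Iterator<Item = String>) {
        let started = Instant::now();
        let mut last_log = Instant::now();
        let mut processed = 0u64;
        for _line in lines {
            processed += 1;
            if last_log.elapsed() >= Duration::from_secs(5) {
                // Average rate since start; guard against a zero-duration divide.
                let rate = processed as f64 / started.elapsed().as_secs_f64().max(0.001);
                tracing::info!(processed, rate = format!("{rate:.0}/s"), "migration progress");
                last_log = Instant::now();
            }
        }
    }
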
Signed-off-by: Claude --- benchmarks-website/migrate/src/classifier.rs | 8 ++- benchmarks-website/migrate/src/migrate.rs | 55 ++++++++++++++++---- benchmarks-website/migrate/src/source.rs | 10 ++++ 3 files changed, 61 insertions(+), 12 deletions(-) diff --git a/benchmarks-website/migrate/src/classifier.rs b/benchmarks-website/migrate/src/classifier.rs index 4e6e53fef1a..22802482ee9 100644 --- a/benchmarks-website/migrate/src/classifier.rs +++ b/benchmarks-website/migrate/src/classifier.rs @@ -408,8 +408,12 @@ fn bin_random_access(cls: &V2Classification, record: &V2Record) -> Option // legacy). After stripping the `-tokio-local-disk` suffix, map the // v2 random-access ext label (`vortex`, from `Format::ext()`) to // the canonical name (`vortex-file-compressed`, from - // `Format::name()`). `parquet`, `lance`, and `vortex-compact` - // already match between ext and name. + // `Format::name()`). `parquet` and `lance` match between ext and + // name. The `vortex` ext is shared by both `OnDiskVortex` (name + // `vortex-file-compressed`) and `VortexCompact` (name + // `vortex-compact`), but v2's random-access bench only emitted + // `OnDiskVortex`, so mapping to `vortex-file-compressed` is + // correct for all historical data. let parts: Vec<&str> = record.name.split('/').collect(); let raw = match parts.len() { 4 => parts[3], diff --git a/benchmarks-website/migrate/src/migrate.rs b/benchmarks-website/migrate/src/migrate.rs index 5801d820905..06bc7cdeaf5 100644 --- a/benchmarks-website/migrate/src/migrate.rs +++ b/benchmarks-website/migrate/src/migrate.rs @@ -12,6 +12,8 @@ use std::collections::BTreeMap; use std::io::BufRead; use std::path::Path; +use std::time::Duration; +use std::time::Instant; use anyhow::Context as _; use anyhow::Result; @@ -19,6 +21,7 @@ use duckdb::Connection; use duckdb::Statement; use duckdb::Transaction; use duckdb::params; +use tracing::info; use tracing::warn; use vortex_bench_server::db::measurement_id_compression_size; use vortex_bench_server::db::measurement_id_compression_time; @@ -92,12 +95,21 @@ pub fn run(source: &Source, target: &Path) -> Result { let mut conn = open_target_db(target)?; let mut summary = MigrationSummary::default(); + info!(source = %source.describe(), "Reading commits.json"); let commits = read_commits(source)?; + info!(commits = commits.len(), "Loaded commits"); summary.commits_inserted = upsert_all_commits(&mut conn, &commits, &mut summary)?; + info!("Migrating data.json.gz"); migrate_data_jsonl(&mut conn, source, &commits, &mut summary)?; + info!( + records = summary.records_read, + inserted = summary.total_inserted(), + "data.json.gz done", + ); for name in source.list_file_sizes()? 
{ + info!(name = %name, "Migrating file-sizes"); if let Err(e) = migrate_file_sizes(&mut conn, source, &name, &commits, &mut summary) { warn!("file-sizes file {name} failed: {e:#}"); } @@ -149,9 +161,11 @@ fn migrate_data_jsonl( summary: &mut MigrationSummary, ) -> Result<()> { let reader = source.open_data_jsonl()?; - let mut lines = reader.lines(); + let mut lines = reader.lines().peekable(); + let started = Instant::now(); + let mut last_log = Instant::now(); const BATCH: u64 = 10_000; - loop { + while lines.peek().is_some() { let tx = conn.transaction().context("begin data tx")?; let mut stmts = DataStatements::prepare(&tx)?; let mut in_batch = 0u64; @@ -172,12 +186,23 @@ fn migrate_data_jsonl( }; apply_v2_record(&mut stmts, &record, commits, summary)?; in_batch += 1; + if last_log.elapsed() >= Duration::from_secs(5) { + let elapsed = started.elapsed().as_secs_f64(); + let rate = summary.records_read as f64 / elapsed.max(0.001); + info!( + records = summary.records_read, + rate = format!("{rate:.0}/s"), + query = summary.query_inserted, + compression_time = summary.compression_time_inserted, + compression_size = summary.compression_size_inserted, + random_access = summary.random_access_inserted, + "migration progress", + ); + last_log = Instant::now(); + } } drop(stmts); tx.commit().context("commit data batch")?; - if in_batch == 0 { - break; - } } Ok(()) } @@ -448,9 +473,11 @@ fn migrate_file_sizes( .and_then(|s| s.strip_suffix(".json.gz")) .unwrap_or(name) .to_string(); - let mut lines = reader.lines(); + let mut lines = reader.lines().peekable(); + let started = Instant::now(); + let mut last_log = Instant::now(); const BATCH: u64 = 10_000; - loop { + while lines.peek().is_some() { let tx = conn.transaction().context("begin file-sizes tx")?; let mut stmt = tx.prepare(SQL_UPSERT_FILE_SIZE)?; let mut in_batch = 0u64; @@ -480,12 +507,20 @@ fn migrate_file_sizes( upsert_file_size_row(&mut stmt, &sz, &dataset)?; summary.file_size_inserted += 1; in_batch += 1; + if last_log.elapsed() >= Duration::from_secs(5) { + let elapsed = started.elapsed().as_secs_f64(); + let rate = summary.file_size_inserted as f64 / elapsed.max(0.001); + info!( + name = %name, + file_sizes = summary.file_size_inserted, + rate = format!("{rate:.0}/s"), + "file-sizes progress", + ); + last_log = Instant::now(); + } } drop(stmt); tx.commit().context("commit file-sizes batch")?; - if in_batch == 0 { - break; - } } Ok(()) } diff --git a/benchmarks-website/migrate/src/source.rs b/benchmarks-website/migrate/src/source.rs index 2b4fdca9b94..340a9bdb60f 100644 --- a/benchmarks-website/migrate/src/source.rs +++ b/benchmarks-website/migrate/src/source.rs @@ -23,6 +23,7 @@ use std::path::PathBuf; use anyhow::Context as _; use anyhow::Result; use flate2::read::GzDecoder; +use tracing::info; /// Public S3 bucket the live v2 server reads from. pub const PUBLIC_BUCKET_BASE: &str = "https://vortex-ci-benchmark-results.s3.amazonaws.com"; @@ -39,6 +40,14 @@ pub enum Source { } impl Source { + /// Short human-readable description for log messages. + pub fn describe(&self) -> String { + match self { + Source::PublicS3 => "public S3 bucket".to_string(), + Source::Local(p) => format!("local dir {}", p.display()), + } + } + /// Open `data.json.gz` for streaming, decompressing on the fly. 
pub fn open_data_jsonl(&self) -> Result> { let stream = self.open_raw("data.json.gz")?; @@ -102,6 +111,7 @@ fn open_local(path: &Path) -> Result> { fn open_s3(name: &str) -> Result> { let url = format!("{PUBLIC_BUCKET_BASE}/{name}"); + info!(url = %url, "GET"); let resp = reqwest::blocking::get(&url).with_context(|| format!("GET {url}"))?; if !resp.status().is_success() { anyhow::bail!("GET {url} returned {}", resp.status()); From 42ad6a12c6ebfe2fa1c66c3695b8954032ee64a7 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Sun, 26 Apr 2026 19:47:27 -0400 Subject: [PATCH 4/5] fix perf and insert bugs Signed-off-by: Connor Tsui --- Cargo.lock | 40 + benchmarks-website/migrate/Cargo.toml | 5 +- benchmarks-website/migrate/src/classifier.rs | 154 +++- benchmarks-website/migrate/src/migrate.rs | 777 +++++++++++------- .../migrate/tests/classifier.rs | 92 ++- .../migrate/tests/end_to_end.rs | 101 ++- 6 files changed, 868 insertions(+), 301 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 079289cdfa8..20075443c36 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3742,6 +3742,7 @@ dependencies = [ "fallible-streaming-iterator", "hashlink", "libduckdb-sys", + "num", "num-integer", "rust_decimal", "strum 0.27.2", @@ -6374,6 +6375,20 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -6409,6 +6424,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -10357,6 +10394,9 @@ name = "vortex-bench-migrate" version = "0.1.0-alpha.0" dependencies = [ "anyhow", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-schema 58.1.0", "clap", "duckdb", "flate2", diff --git a/benchmarks-website/migrate/Cargo.toml b/benchmarks-website/migrate/Cargo.toml index a68903f75eb..f9b83d5d543 100644 --- a/benchmarks-website/migrate/Cargo.toml +++ b/benchmarks-website/migrate/Cargo.toml @@ -20,9 +20,12 @@ path = "src/main.rs" [dependencies] anyhow = { workspace = true } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-schema = { workspace = true } clap = { workspace = true, features = ["derive"] } # track vortex-duckdb's bundled engine version (build.rs) -duckdb = { version = "1.10502", features = ["bundled"] } +duckdb = { version = "1.10502", features = ["bundled", "appender-arrow"] } flate2 = "1.1" reqwest = { workspace = true, features = ["json"] } serde = { workspace = true, features = ["derive"] } diff --git a/benchmarks-website/migrate/src/classifier.rs b/benchmarks-website/migrate/src/classifier.rs index 22802482ee9..6b3368c64b8 100644 --- a/benchmarks-website/migrate/src/classifier.rs +++ b/benchmarks-website/migrate/src/classifier.rs @@ -49,6 +49,14 @@ pub const 
QUERY_SUITES: &[QuerySuite] = &[ fan_out: false, skip: false, }, + QuerySuite { + prefix: "gharchive", + display_name: "GhArchive", + query_prefix: "GHARCHIVE", + dataset_key: None, + fan_out: false, + skip: false, + }, QuerySuite { prefix: "tpch", display_name: "TPC-H", @@ -71,7 +79,7 @@ pub const QUERY_SUITES: &[QuerySuite] = &[ query_prefix: "FINEWEB", dataset_key: None, fan_out: false, - skip: true, + skip: false, }, ]; @@ -221,6 +229,7 @@ pub fn get_group(record: &V2Record) -> Option { if lower.starts_with("vortex size/") || lower.starts_with("vortex-file-compressed size/") || lower.starts_with("parquet size/") + || lower.starts_with("parquet-zstd size/") || lower.starts_with("lance size/") || lower.contains(":raw size/") || lower.contains(":parquet-zstd size/") @@ -237,6 +246,10 @@ pub fn get_group(record: &V2Record) -> Option { || lower.starts_with("lance decompress") || lower.starts_with("vortex:lance ratio") || lower.starts_with("vortex:parquet-zstd ratio") + // Typo'd v2 emitter wrote `parquet-zst` (no `d`) for some + // ratio records; match both spellings so they classify as + // derived ratios instead of falling through to Unknown. + || lower.starts_with("vortex:parquet-zst ratio") || lower.starts_with("vortex:raw ratio") { return Some(V2Group::Compression); @@ -392,6 +405,132 @@ pub fn classify(record: &V2Record) -> Option { } } +/// Reason the classifier dropped a record. Intentional skips (v2 +/// patterns v3 deliberately doesn't store) are NOT errors; they don't +/// count against the uncategorized gate. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Skip { + /// `vortex:* ratio …` and `vortex:* size` — derived in v3 from + /// `compression_sizes` joined to itself. + DerivedRatio, + /// `throughput` records — v2 derived these from latencies. + Throughput, + /// A v2 query suite marked `skip: true` in QUERY_SUITES. + SkippedSuite, + /// random-access record with an unsupported part count. + UnsupportedShape, + /// Record had no `value` field. + NoValue, + /// Dim outside the v3 emitter's allowlist (e.g. `parquet-zstd`, + /// historical-only suites no longer in CI). + Deprecated, +} + +/// Engines the v3 emitter produces today. Anything else is historical +/// and gets bucketed as `Skip::Deprecated`. +/// +/// ORCHESTRATOR NOTE: confirm against `vortex-bench`'s `Engine` enum +/// before handing off; edit if the live set differs. +const V3_ENGINES: &[&str] = &["datafusion", "duckdb", "vortex", "arrow"]; + +/// Formats the v3 emitter produces today (`Format::name()` values). +/// +/// ORCHESTRATOR NOTE: confirm against `vortex-bench/src/lib.rs` +/// `Format::name()` before handing off. +const V3_FORMATS: &[&str] = &[ + "vortex-file-compressed", + "vortex-compact", + "parquet", + "lance", + "csv", + "arrow", + "duckdb", +]; + +/// Query suites the v3 CI runs today. Suites outside this list still +/// classify (so historical analyses stay coherent) but get bucketed +/// as `Skip::Deprecated` so they don't render as orphan charts in v3. +/// +/// ORCHESTRATOR NOTE: add `fineweb` and/or `gharchive` here if a CI +/// grep shows v3 still emits them. +const V3_QUERY_SUITES: &[&str] = &["clickbench", "tpch", "tpcds", "statpopgen", "polarsignals"]; + +/// Returns true if every dim that v3 stores as a column is on the +/// emitter's current allowlist. Dim values outside the allowlist mean +/// historical-only formats / engines that the v3 UI has nothing to +/// render against. +fn is_v3_dim(bin: &V3Bin) -> bool { + match bin { + V3Bin::Query { engine, format, .. 
} => { + V3_ENGINES.contains(&engine.as_str()) && V3_FORMATS.contains(&format.as_str()) + } + V3Bin::CompressionTime { format, .. } + | V3Bin::CompressionSize { format, .. } + | V3Bin::RandomAccess { format, .. } => V3_FORMATS.contains(&format.as_str()), + } +} + +/// Outcome of running the classifier on a v2 record. Distinguishes +/// "we know we don't want this" (`Skip`) from "we don't recognize this" +/// (`Unknown`); the migrator's 5% gate fires only on the latter. +#[derive(Debug, Clone)] +pub enum Outcome { + Bin(V3Bin), + Skip(Skip), + Unknown, +} + +/// Like [`classify`], but reports *why* a record was dropped. Intended +/// for the migrator so the 5% uncategorized gate doesn't trip on +/// records v2 deliberately doesn't render (ratios, throughput, +/// skipped suites). +pub fn classify_outcome(record: &V2Record) -> Outcome { + if record.name.contains(" throughput") { + return Outcome::Skip(Skip::Throughput); + } + let Some(group) = get_group(record) else { + return Outcome::Unknown; + }; + if let V2Group::Query { suite_index, .. } = &group + && QUERY_SUITES[*suite_index].skip + { + return Outcome::Skip(Skip::SkippedSuite); + } + let Some(cls) = classify_v2(record) else { + // get_group succeeded but classify_v2 didn't — shape mismatch. + return Outcome::Skip(Skip::UnsupportedShape); + }; + let derived = match &cls.group { + V2Group::Compression => { + let lc = cls.chart.to_lowercase(); + lc.contains("ratio") || lc.contains(':') + } + V2Group::CompressionSize => cls.chart.to_lowercase().contains(':'), + _ => false, + }; + if derived { + return Outcome::Skip(Skip::DerivedRatio); + } + let bin = match &cls.group { + V2Group::RandomAccess => bin_random_access(&cls, record), + V2Group::Compression => bin_compression_time(&cls, record), + V2Group::CompressionSize => bin_compression_size(&cls, record), + V2Group::Query { .. } => bin_query(&cls, record), + }; + let Some(bin) = bin else { + return Outcome::Unknown; + }; + if !is_v3_dim(&bin) { + return Outcome::Skip(Skip::Deprecated); + } + if let V2Group::Query { suite_index, .. } = &group + && !V3_QUERY_SUITES.contains(&QUERY_SUITES[*suite_index].prefix) + { + return Outcome::Skip(Skip::Deprecated); + } + Outcome::Bin(bin) +} + fn bin_random_access(cls: &V2Classification, record: &V2Record) -> Option { // v2 chart name shape: "RANDOM ACCESS" or "DATASET/PATTERN" (uppercase). // We store it as the v3 dataset value verbatim, lowercased so @@ -482,8 +621,15 @@ fn bin_compression_size(cls: &V2Classification, _record: &V2Record) -> Option Option { // `vortex-file-compressed`) that match what the v3 live emitter // writes. `cls.series` has been through v2's `ENGINE_RENAMES` for // UI display and is not appropriate for v3 columns. + // + // Older v2 records emitted display-case engines (e.g. `DataFusion`, + // `DuckDB`); newer ones emit lowercase. Lowercase here so dedup + // collapses both spellings into a single canonical row. let raw_series = record.name.split('/').nth(1)?; let (engine, format) = split_engine_format(raw_series)?; + let engine = engine.to_lowercase(); + let format = format.to_lowercase(); let storage_v3 = match storage.as_deref() { Some("S3") => "s3".to_string(), diff --git a/benchmarks-website/migrate/src/migrate.rs b/benchmarks-website/migrate/src/migrate.rs index 06bc7cdeaf5..ff1abf835f0 100644 --- a/benchmarks-website/migrate/src/migrate.rs +++ b/benchmarks-website/migrate/src/migrate.rs @@ -8,19 +8,34 @@ //! the appropriate v3 fact table. Every row's `measurement_id` is //! 
computed via the server's `measurement_id_*` functions so the result //! is byte-compatible with what fresh `/api/ingest` would have produced. +//! +//! Bulk-load shape: rows are accumulated in memory as parallel column +//! vectors, deduplicated by `measurement_id`, then flushed to DuckDB +//! via `Appender::append_record_batch` as one Arrow `RecordBatch` per +//! fact table. use std::collections::BTreeMap; +use std::collections::HashMap; +use std::collections::HashSet; use std::io::BufRead; use std::path::Path; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; use anyhow::Context as _; use anyhow::Result; +use arrow_array::ArrayRef; +use arrow_array::Int32Array; +use arrow_array::Int64Array; +use arrow_array::ListArray; +use arrow_array::RecordBatch; +use arrow_array::StringArray; +use arrow_buffer::OffsetBuffer; +use arrow_schema::DataType; +use arrow_schema::Field; +use arrow_schema::Schema; use duckdb::Connection; -use duckdb::Statement; -use duckdb::Transaction; -use duckdb::params; use tracing::info; use tracing::warn; use vortex_bench_server::db::measurement_id_compression_size; @@ -33,8 +48,8 @@ use vortex_bench_server::records::QueryMeasurement; use vortex_bench_server::records::RandomAccessTime; use vortex_bench_server::schema::SCHEMA_DDL; +use crate::classifier; use crate::classifier::V3Bin; -use crate::classifier::classify; use crate::commits::upsert_commit; use crate::source::Source; use crate::v2::V2Commit; @@ -58,7 +73,9 @@ pub struct MigrationSummary { pub missing_commit: u64, pub commit_warnings: u64, pub skipped_no_value: u64, + pub skipped_intentional: u64, pub commits_inserted: u64, + pub deduped: u64, } impl MigrationSummary { @@ -80,8 +97,16 @@ impl MigrationSummary { } } -/// Open or create a DuckDB at `path` and apply the v3 schema. +/// Open or create a DuckDB at `path` and apply the v3 schema. The +/// migrator is a one-shot fresh load; the bulk-append flush is pure +/// insert (no `ON CONFLICT`), so any stale rows in `path` would clash +/// with the next run on the same primary keys. Delete both the +/// database file and its WAL companion up front so every run starts +/// from a known-empty state. pub fn open_target_db(path: &Path) -> Result { + remove_if_exists(path)?; + let wal = wal_path(path); + remove_if_exists(&wal)?; let conn = Connection::open(path).with_context(|| format!("opening DuckDB at {}", path.display()))?; conn.execute_batch(SCHEMA_DDL) @@ -89,6 +114,25 @@ pub fn open_target_db(path: &Path) -> Result { Ok(conn) } +fn remove_if_exists(path: &Path) -> Result<()> { + match std::fs::remove_file(path) { + Ok(()) => { + info!(path = %path.display(), "removed pre-existing target file"); + Ok(()) + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()), + Err(e) => Err(e).with_context(|| format!("removing {}", path.display())), + } +} + +/// DuckDB writes its write-ahead log next to the database file with a +/// `.wal` suffix appended (e.g. `v3.duckdb` -> `v3.duckdb.wal`). +fn wal_path(path: &Path) -> std::path::PathBuf { + let mut name = path.as_os_str().to_owned(); + name.push(".wal"); + std::path::PathBuf::from(name) +} + /// Run the whole migration: commits, data.json.gz, and every /// file-sizes-*.json.gz under the source. 
pub fn run(source: &Source, target: &Path) -> Result { @@ -100,21 +144,49 @@ pub fn run(source: &Source, target: &Path) -> Result { info!(commits = commits.len(), "Loaded commits"); summary.commits_inserted = upsert_all_commits(&mut conn, &commits, &mut summary)?; + let mut q = QueryAccum::default(); + let mut ct = CompressionTimeAccum::default(); + let mut cs = CompressionSizeAccum::default(); + let mut ra = RandomAccessAccum::default(); + info!("Migrating data.json.gz"); - migrate_data_jsonl(&mut conn, source, &commits, &mut summary)?; - info!( - records = summary.records_read, - inserted = summary.total_inserted(), - "data.json.gz done", - ); + migrate_data_jsonl( + source, + &commits, + &mut summary, + &mut q, + &mut ct, + &mut cs, + &mut ra, + )?; + info!(records = summary.records_read, "data.json.gz done"); for name in source.list_file_sizes()? { info!(name = %name, "Migrating file-sizes"); - if let Err(e) = migrate_file_sizes(&mut conn, source, &name, &commits, &mut summary) { + if let Err(e) = migrate_file_sizes(source, &name, &commits, &mut summary, &mut cs) { warn!("file-sizes file {name} failed: {e:#}"); } } + info!("Flushing accumulators to DuckDB"); + summary.query_inserted = q.measurement_id.len() as u64; + summary.compression_time_inserted = ct.measurement_id.len() as u64; + summary.random_access_inserted = ra.measurement_id.len() as u64; + summary.compression_size_inserted = cs.rows.len() as u64; + + flush(&conn, "query_measurements", build_query_batch(q)?)?; + flush( + &conn, + "compression_times", + build_compression_time_batch(ct)?, + )?; + flush(&conn, "random_access_times", build_random_access_batch(ra)?)?; + flush( + &conn, + "compression_sizes", + build_compression_size_batch(cs)?, + )?; + Ok(summary) } @@ -154,157 +226,84 @@ fn upsert_all_commits( Ok(count) } +/// Stream `data.json.gz` and push classified records into the +/// per-table accumulators. Dedup happens inside each accumulator's +/// `push` method by `measurement_id`. 
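+/// Records the classifier can't place count toward `uncategorized`;
+/// intentional skips (ratios, throughput, skipped suites) count toward
+/// `skipped_intentional` and do not trip the 5% uncategorized gate.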
fn migrate_data_jsonl( - conn: &mut Connection, source: &Source, commits: &BTreeMap, summary: &mut MigrationSummary, + q: &mut QueryAccum, + ct: &mut CompressionTimeAccum, + cs: &mut CompressionSizeAccum, + ra: &mut RandomAccessAccum, ) -> Result<()> { let reader = source.open_data_jsonl()?; - let mut lines = reader.lines().peekable(); let started = Instant::now(); let mut last_log = Instant::now(); - const BATCH: u64 = 10_000; - while lines.peek().is_some() { - let tx = conn.transaction().context("begin data tx")?; - let mut stmts = DataStatements::prepare(&tx)?; - let mut in_batch = 0u64; - while in_batch < BATCH { - let Some(line) = lines.next() else { break }; - let line = line?; - let trimmed = line.trim(); - if trimmed.is_empty() { + for line in reader.lines() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + summary.records_read += 1; + let record: V2Record = match serde_json::from_str(trimmed) { + Ok(r) => r, + Err(e) => { + warn!("skipping malformed data.json line: {e}"); continue; } - summary.records_read += 1; - let record: V2Record = match serde_json::from_str(trimmed) { - Ok(r) => r, - Err(e) => { - warn!("skipping malformed data.json line: {e}"); - continue; - } - }; - apply_v2_record(&mut stmts, &record, commits, summary)?; - in_batch += 1; - if last_log.elapsed() >= Duration::from_secs(5) { - let elapsed = started.elapsed().as_secs_f64(); - let rate = summary.records_read as f64 / elapsed.max(0.001); - info!( - records = summary.records_read, - rate = format!("{rate:.0}/s"), - query = summary.query_inserted, - compression_time = summary.compression_time_inserted, - compression_size = summary.compression_size_inserted, - random_access = summary.random_access_inserted, - "migration progress", - ); - last_log = Instant::now(); - } + }; + apply_v2_record(&record, commits, summary, q, ct, cs, ra); + if last_log.elapsed() >= Duration::from_secs(5) { + let elapsed = started.elapsed().as_secs_f64(); + let rate = summary.records_read as f64 / elapsed.max(0.001); + info!( + records = summary.records_read, + rate = format!("{rate:.0}/s"), + query = q.measurement_id.len(), + compression_time = ct.measurement_id.len(), + compression_size = cs.rows.len(), + random_access = ra.measurement_id.len(), + "migration progress", + ); + last_log = Instant::now(); } - drop(stmts); - tx.commit().context("commit data batch")?; } Ok(()) } -/// Prepared INSERT statements for the four v2-derived fact tables. Tied -/// to a single transaction's lifetime; re-prepare after each commit. -struct DataStatements<'tx> { - query: Statement<'tx>, - compression_time: Statement<'tx>, - compression_size: Statement<'tx>, - random_access: Statement<'tx>, -} - -impl<'tx> DataStatements<'tx> { - fn prepare(tx: &'tx Transaction<'_>) -> Result { - Ok(Self { - query: tx.prepare(SQL_INSERT_QUERY)?, - compression_time: tx.prepare(SQL_INSERT_COMPRESSION_TIME)?, - compression_size: tx.prepare(SQL_INSERT_COMPRESSION_SIZE)?, - random_access: tx.prepare(SQL_INSERT_RANDOM_ACCESS)?, - }) - } -} - -const SQL_INSERT_QUERY: &str = r#" -INSERT INTO query_measurements ( - measurement_id, commit_sha, dataset, dataset_variant, scale_factor, - query_idx, storage, engine, format, - value_ns, all_runtimes_ns, - peak_physical, peak_virtual, physical_delta, virtual_delta, - env_triple -) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?, ?, ?, ?, ?) 
-ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_ns = excluded.value_ns, - all_runtimes_ns = excluded.all_runtimes_ns, - env_triple = excluded.env_triple -"#; - -const SQL_INSERT_COMPRESSION_TIME: &str = r#" -INSERT INTO compression_times ( - measurement_id, commit_sha, dataset, dataset_variant, - format, op, value_ns, all_runtimes_ns, env_triple -) VALUES (?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) -ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_ns = excluded.value_ns, - all_runtimes_ns = excluded.all_runtimes_ns, - env_triple = excluded.env_triple -"#; - -const SQL_INSERT_COMPRESSION_SIZE: &str = r#" -INSERT INTO compression_sizes ( - measurement_id, commit_sha, dataset, dataset_variant, - format, value_bytes -) VALUES (?, ?, ?, ?, ?, ?) -ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_bytes = excluded.value_bytes -"#; - -const SQL_INSERT_RANDOM_ACCESS: &str = r#" -INSERT INTO random_access_times ( - measurement_id, commit_sha, dataset, format, - value_ns, all_runtimes_ns, env_triple -) VALUES (?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) -ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_ns = excluded.value_ns, - all_runtimes_ns = excluded.all_runtimes_ns, - env_triple = excluded.env_triple -"#; - -const SQL_UPSERT_FILE_SIZE: &str = r#" -INSERT INTO compression_sizes ( - measurement_id, commit_sha, dataset, dataset_variant, - format, value_bytes -) VALUES (?, ?, ?, ?, ?, ?) -ON CONFLICT (measurement_id) DO UPDATE SET - value_bytes = compression_sizes.value_bytes + excluded.value_bytes -"#; - fn apply_v2_record( - stmts: &mut DataStatements<'_>, record: &V2Record, commits: &BTreeMap, summary: &mut MigrationSummary, -) -> Result<()> { + q: &mut QueryAccum, + ct: &mut CompressionTimeAccum, + cs: &mut CompressionSizeAccum, + ra: &mut RandomAccessAccum, +) { let Some(sha) = record.commit_id.clone() else { summary.missing_commit += 1; - return Ok(()); + return; }; if !commits.contains_key(&sha) { summary.missing_commit += 1; - return Ok(()); + return; } - let Some(bin) = classify(record) else { - summary.uncategorized += 1; - let prefix = record.name.split('/').next().unwrap_or("").to_string(); - *summary.uncategorized_prefixes.entry(prefix).or_insert(0) += 1; - return Ok(()); + let bin = match classifier::classify_outcome(record) { + classifier::Outcome::Bin(b) => b, + classifier::Outcome::Skip(_) => { + summary.skipped_intentional += 1; + return; + } + classifier::Outcome::Unknown => { + summary.uncategorized += 1; + let prefix = record.name.split('/').next().unwrap_or("").to_string(); + *summary.uncategorized_prefixes.entry(prefix).or_insert(0) += 1; + return; + } }; let env_triple = record.env_triple.as_ref().and_then(|t| t.to_triple()); @@ -317,7 +316,7 @@ fn apply_v2_record( Some(v) => v, None => { summary.skipped_no_value += 1; - return Ok(()); + return; } }; @@ -349,25 +348,7 @@ fn apply_v2_record( env_triple, }; let mid = measurement_id_query(&qm); - stmts.query.execute(params![ - mid, - qm.commit_sha, - qm.dataset, - qm.dataset_variant, - qm.scale_factor, - qm.query_idx, - qm.storage, - qm.engine, - qm.format, - qm.value_ns, - runtimes_literal(&qm.all_runtimes_ns), - qm.peak_physical, - qm.peak_virtual, - qm.physical_delta, - qm.virtual_delta, - qm.env_triple, - ])?; - summary.query_inserted += 1; + q.push(mid, qm, summary); } V3Bin::CompressionTime { dataset, @@ -375,7 +356,7 @@ fn apply_v2_record( format, op, } => { - let ct = 
CompressionTime { + let ctr = CompressionTime { commit_sha: sha, dataset, dataset_variant, @@ -385,45 +366,26 @@ fn apply_v2_record( all_runtimes_ns: runtimes, env_triple, }; - let mid = measurement_id_compression_time(&ct); - stmts.compression_time.execute(params![ - mid, - ct.commit_sha, - ct.dataset, - ct.dataset_variant, - ct.format, - ct.op, - ct.value_ns, - runtimes_literal(&ct.all_runtimes_ns), - ct.env_triple, - ])?; - summary.compression_time_inserted += 1; + let mid = measurement_id_compression_time(&ctr); + ct.push(mid, ctr, summary); } V3Bin::CompressionSize { dataset, dataset_variant, format, } => { - let cs = CompressionSize { + let csr = CompressionSize { commit_sha: sha, dataset, dataset_variant, format, value_bytes: value_f64 as i64, }; - let mid = measurement_id_compression_size(&cs); - stmts.compression_size.execute(params![ - mid, - cs.commit_sha, - cs.dataset, - cs.dataset_variant, - cs.format, - cs.value_bytes, - ])?; - summary.compression_size_inserted += 1; + let mid = measurement_id_compression_size(&csr); + cs.push_replace(mid, csr); } V3Bin::RandomAccess { dataset, format } => { - let ra = RandomAccessTime { + let rar = RandomAccessTime { commit_sha: sha, dataset, format, @@ -431,134 +393,379 @@ fn apply_v2_record( all_runtimes_ns: runtimes, env_triple, }; - let mid = measurement_id_random_access(&ra); - stmts.random_access.execute(params![ - mid, - ra.commit_sha, - ra.dataset, - ra.format, - ra.value_ns, - runtimes_literal(&ra.all_runtimes_ns), - ra.env_triple, - ])?; - summary.random_access_inserted += 1; + let mid = measurement_id_random_access(&rar); + ra.push(mid, rar, summary); } } - Ok(()) -} - -fn runtimes_literal(values: &[i64]) -> String { - let mut s = String::with_capacity(values.len() * 8 + 2); - s.push('['); - for (i, v) in values.iter().enumerate() { - if i > 0 { - s.push(','); - } - s.push_str(&v.to_string()); - } - s.push(']'); - s } fn migrate_file_sizes( - conn: &mut Connection, source: &Source, name: &str, commits: &BTreeMap, summary: &mut MigrationSummary, + cs: &mut CompressionSizeAccum, ) -> Result<()> { let reader = source.open_file_sizes(name)?; - let dataset = name + let dataset_fallback = name .strip_prefix("file-sizes-") .and_then(|s| s.strip_suffix(".json.gz")) .unwrap_or(name) .to_string(); - let mut lines = reader.lines().peekable(); let started = Instant::now(); let mut last_log = Instant::now(); - const BATCH: u64 = 10_000; - while lines.peek().is_some() { - let tx = conn.transaction().context("begin file-sizes tx")?; - let mut stmt = tx.prepare(SQL_UPSERT_FILE_SIZE)?; - let mut in_batch = 0u64; - while in_batch < BATCH { - let Some(line) = lines.next() else { break }; - let line = line?; - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - let sz: V2FileSize = match serde_json::from_str(trimmed) { - Ok(r) => r, - Err(e) => { - warn!("skipping malformed {name} line: {e}"); - continue; - } - }; - if !commits.contains_key(&sz.commit_id) { - summary.missing_commit += 1; + for line in reader.lines() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + let sz: V2FileSize = match serde_json::from_str(trimmed) { + Ok(r) => r, + Err(e) => { + warn!("skipping malformed {name} line: {e}"); continue; } - // file-sizes-*.json.gz captures per-file sizes inside one - // benchmark/format/scale_factor combo. We aggregate to one - // (commit, dataset, dataset_variant, format) row by summing, - // since v3's compression_sizes is a single bytes value per - // (dim) tuple. 
Use ON CONFLICT to accumulate. - upsert_file_size_row(&mut stmt, &sz, &dataset)?; - summary.file_size_inserted += 1; - in_batch += 1; - if last_log.elapsed() >= Duration::from_secs(5) { - let elapsed = started.elapsed().as_secs_f64(); - let rate = summary.file_size_inserted as f64 / elapsed.max(0.001); - info!( - name = %name, - file_sizes = summary.file_size_inserted, - rate = format!("{rate:.0}/s"), - "file-sizes progress", - ); - last_log = Instant::now(); - } + }; + if !commits.contains_key(&sz.commit_id) { + summary.missing_commit += 1; + continue; + } + let dataset = if sz.benchmark.is_empty() { + dataset_fallback.clone() + } else { + sz.benchmark.clone() + }; + let dataset_variant = sz + .scale_factor + .as_ref() + .filter(|s| !s.is_empty() && s.as_str() != "1.0") + .cloned(); + let csr = CompressionSize { + commit_sha: sz.commit_id.clone(), + dataset, + dataset_variant, + format: sz.format.clone(), + value_bytes: sz.size_bytes, + }; + let mid = measurement_id_compression_size(&csr); + cs.push_sum(mid, csr); + summary.file_size_inserted += 1; + if last_log.elapsed() >= Duration::from_secs(5) { + let elapsed = started.elapsed().as_secs_f64(); + let rate = summary.file_size_inserted as f64 / elapsed.max(0.001); + info!( + name = %name, + file_sizes = summary.file_size_inserted, + rate = format!("{rate:.0}/s"), + "file-sizes progress", + ); + last_log = Instant::now(); } - drop(stmt); - tx.commit().context("commit file-sizes batch")?; } Ok(()) } -fn upsert_file_size_row( - stmt: &mut Statement<'_>, - sz: &V2FileSize, - dataset_fallback: &str, -) -> Result<()> { - let dataset = if sz.benchmark.is_empty() { - dataset_fallback.to_string() - } else { - sz.benchmark.clone() - }; - let dataset_variant = sz - .scale_factor - .as_ref() - .filter(|s| !s.is_empty() && s.as_str() != "1.0") - .cloned(); - let cs = CompressionSize { - commit_sha: sz.commit_id.clone(), - dataset, - dataset_variant, - format: sz.format.clone(), - value_bytes: sz.size_bytes, - }; - let mid = measurement_id_compression_size(&cs); - stmt.execute(params![ - mid, - cs.commit_sha, - cs.dataset, - cs.dataset_variant, - cs.format, - cs.value_bytes, - ])?; +/// Append an Arrow `RecordBatch` to a DuckDB table via `Appender`. 
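+/// Invoked once per fact table at the end of `run`, e.g.
+///
+/// ```ignore
+/// flush(&conn, "query_measurements", build_query_batch(q)?)?;
+/// ```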
+fn flush(conn: &Connection, table: &str, batch: RecordBatch) -> Result<()> { + let mut app = conn + .appender(table) + .with_context(|| format!("opening appender for {table}"))?; + app.append_record_batch(batch) + .with_context(|| format!("appending record batch to {table}"))?; + drop(app); Ok(()) } +#[derive(Default)] +struct QueryAccum { + measurement_id: Vec, + commit_sha: Vec, + dataset: Vec, + dataset_variant: Vec>, + scale_factor: Vec>, + query_idx: Vec, + storage: Vec, + engine: Vec, + format: Vec, + value_ns: Vec, + all_runtimes_ns: Vec>, + peak_physical: Vec>, + peak_virtual: Vec>, + physical_delta: Vec>, + virtual_delta: Vec>, + env_triple: Vec>, + seen: HashSet, +} + +impl QueryAccum { + fn push(&mut self, mid: i64, r: QueryMeasurement, summary: &mut MigrationSummary) { + if !self.seen.insert(mid) { + summary.deduped += 1; + return; + } + self.measurement_id.push(mid); + self.commit_sha.push(r.commit_sha); + self.dataset.push(r.dataset); + self.dataset_variant.push(r.dataset_variant); + self.scale_factor.push(r.scale_factor); + self.query_idx.push(r.query_idx); + self.storage.push(r.storage); + self.engine.push(r.engine); + self.format.push(r.format); + self.value_ns.push(r.value_ns); + self.all_runtimes_ns.push(r.all_runtimes_ns); + self.peak_physical.push(r.peak_physical); + self.peak_virtual.push(r.peak_virtual); + self.physical_delta.push(r.physical_delta); + self.virtual_delta.push(r.virtual_delta); + self.env_triple.push(r.env_triple); + } +} + +#[derive(Default)] +struct CompressionTimeAccum { + measurement_id: Vec, + commit_sha: Vec, + dataset: Vec, + dataset_variant: Vec>, + format: Vec, + op: Vec, + value_ns: Vec, + all_runtimes_ns: Vec>, + env_triple: Vec>, + seen: HashSet, +} + +impl CompressionTimeAccum { + fn push(&mut self, mid: i64, r: CompressionTime, summary: &mut MigrationSummary) { + if !self.seen.insert(mid) { + summary.deduped += 1; + return; + } + self.measurement_id.push(mid); + self.commit_sha.push(r.commit_sha); + self.dataset.push(r.dataset); + self.dataset_variant.push(r.dataset_variant); + self.format.push(r.format); + self.op.push(r.op); + self.value_ns.push(r.value_ns); + self.all_runtimes_ns.push(r.all_runtimes_ns); + self.env_triple.push(r.env_triple); + } +} + +#[derive(Default)] +struct RandomAccessAccum { + measurement_id: Vec, + commit_sha: Vec, + dataset: Vec, + format: Vec, + value_ns: Vec, + all_runtimes_ns: Vec>, + env_triple: Vec>, + seen: HashSet, +} + +impl RandomAccessAccum { + fn push(&mut self, mid: i64, r: RandomAccessTime, summary: &mut MigrationSummary) { + if !self.seen.insert(mid) { + summary.deduped += 1; + return; + } + self.measurement_id.push(mid); + self.commit_sha.push(r.commit_sha); + self.dataset.push(r.dataset); + self.format.push(r.format); + self.value_ns.push(r.value_ns); + self.all_runtimes_ns.push(r.all_runtimes_ns); + self.env_triple.push(r.env_triple); + } +} + +/// `compression_sizes` is fed by both data.json.gz (replace-on-collision) +/// and file-sizes-*.json.gz (sum-on-collision). Stored as a map; converted +/// to a `RecordBatch` at flush time. +#[derive(Default)] +struct CompressionSizeAccum { + rows: HashMap, +} + +impl CompressionSizeAccum { + /// data.json.gz path: latest write wins, mirroring the prior + /// `ON CONFLICT DO UPDATE SET value_bytes = excluded.value_bytes`. 
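+    ///
+    /// Contrast with `push_sum` below: two rows colliding on `mid` with
+    /// `value_bytes` 100 and 200 leave 200 here (last write wins) but
+    /// 300 under `push_sum` (accumulates), matching the two v2 upsert
+    /// statements these methods replace.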
+ fn push_replace(&mut self, mid: i64, r: CompressionSize) { + self.rows.insert(mid, r); + } + + /// file-sizes-*.json.gz path: per-file rows aggregate into one + /// `(commit, dataset, dataset_variant, format)` row by summing, + /// mirroring the prior `value_bytes = compression_sizes.value_bytes + /// + excluded.value_bytes`. + fn push_sum(&mut self, mid: i64, r: CompressionSize) { + let add = r.value_bytes; + self.rows + .entry(mid) + .and_modify(|x| x.value_bytes += add) + .or_insert(r); + } +} + +fn build_query_batch(a: QueryAccum) -> Result { + let schema = Arc::new(Schema::new(vec![ + Field::new("measurement_id", DataType::Int64, false), + Field::new("commit_sha", DataType::Utf8, false), + Field::new("dataset", DataType::Utf8, false), + Field::new("dataset_variant", DataType::Utf8, true), + Field::new("scale_factor", DataType::Utf8, true), + Field::new("query_idx", DataType::Int32, false), + Field::new("storage", DataType::Utf8, false), + Field::new("engine", DataType::Utf8, false), + Field::new("format", DataType::Utf8, false), + Field::new("value_ns", DataType::Int64, false), + Field::new( + "all_runtimes_ns", + DataType::List(Arc::new(Field::new("item", DataType::Int64, false))), + false, + ), + Field::new("peak_physical", DataType::Int64, true), + Field::new("peak_virtual", DataType::Int64, true), + Field::new("physical_delta", DataType::Int64, true), + Field::new("virtual_delta", DataType::Int64, true), + Field::new("env_triple", DataType::Utf8, true), + ])); + let cols: Vec = vec![ + Arc::new(Int64Array::from(a.measurement_id)), + Arc::new(StringArray::from(a.commit_sha)), + Arc::new(StringArray::from(a.dataset)), + Arc::new(StringArray::from(a.dataset_variant)), + Arc::new(StringArray::from(a.scale_factor)), + Arc::new(Int32Array::from(a.query_idx)), + Arc::new(StringArray::from(a.storage)), + Arc::new(StringArray::from(a.engine)), + Arc::new(StringArray::from(a.format)), + Arc::new(Int64Array::from(a.value_ns)), + Arc::new(build_list_int64(a.all_runtimes_ns)), + Arc::new(Int64Array::from(a.peak_physical)), + Arc::new(Int64Array::from(a.peak_virtual)), + Arc::new(Int64Array::from(a.physical_delta)), + Arc::new(Int64Array::from(a.virtual_delta)), + Arc::new(StringArray::from(a.env_triple)), + ]; + Ok(RecordBatch::try_new(schema, cols)?) +} + +fn build_compression_time_batch(a: CompressionTimeAccum) -> Result { + let schema = Arc::new(Schema::new(vec![ + Field::new("measurement_id", DataType::Int64, false), + Field::new("commit_sha", DataType::Utf8, false), + Field::new("dataset", DataType::Utf8, false), + Field::new("dataset_variant", DataType::Utf8, true), + Field::new("format", DataType::Utf8, false), + Field::new("op", DataType::Utf8, false), + Field::new("value_ns", DataType::Int64, false), + Field::new( + "all_runtimes_ns", + DataType::List(Arc::new(Field::new("item", DataType::Int64, false))), + false, + ), + Field::new("env_triple", DataType::Utf8, true), + ])); + let cols: Vec = vec![ + Arc::new(Int64Array::from(a.measurement_id)), + Arc::new(StringArray::from(a.commit_sha)), + Arc::new(StringArray::from(a.dataset)), + Arc::new(StringArray::from(a.dataset_variant)), + Arc::new(StringArray::from(a.format)), + Arc::new(StringArray::from(a.op)), + Arc::new(Int64Array::from(a.value_ns)), + Arc::new(build_list_int64(a.all_runtimes_ns)), + Arc::new(StringArray::from(a.env_triple)), + ]; + Ok(RecordBatch::try_new(schema, cols)?) 
+} + +fn build_random_access_batch(a: RandomAccessAccum) -> Result { + let schema = Arc::new(Schema::new(vec![ + Field::new("measurement_id", DataType::Int64, false), + Field::new("commit_sha", DataType::Utf8, false), + Field::new("dataset", DataType::Utf8, false), + Field::new("format", DataType::Utf8, false), + Field::new("value_ns", DataType::Int64, false), + Field::new( + "all_runtimes_ns", + DataType::List(Arc::new(Field::new("item", DataType::Int64, false))), + false, + ), + Field::new("env_triple", DataType::Utf8, true), + ])); + let cols: Vec = vec![ + Arc::new(Int64Array::from(a.measurement_id)), + Arc::new(StringArray::from(a.commit_sha)), + Arc::new(StringArray::from(a.dataset)), + Arc::new(StringArray::from(a.format)), + Arc::new(Int64Array::from(a.value_ns)), + Arc::new(build_list_int64(a.all_runtimes_ns)), + Arc::new(StringArray::from(a.env_triple)), + ]; + Ok(RecordBatch::try_new(schema, cols)?) +} + +fn build_compression_size_batch(a: CompressionSizeAccum) -> Result { + let n = a.rows.len(); + let mut measurement_id = Vec::with_capacity(n); + let mut commit_sha = Vec::with_capacity(n); + let mut dataset = Vec::with_capacity(n); + let mut dataset_variant = Vec::with_capacity(n); + let mut format = Vec::with_capacity(n); + let mut value_bytes = Vec::with_capacity(n); + for (mid, cs) in a.rows { + measurement_id.push(mid); + commit_sha.push(cs.commit_sha); + dataset.push(cs.dataset); + dataset_variant.push(cs.dataset_variant); + format.push(cs.format); + value_bytes.push(cs.value_bytes); + } + let schema = Arc::new(Schema::new(vec![ + Field::new("measurement_id", DataType::Int64, false), + Field::new("commit_sha", DataType::Utf8, false), + Field::new("dataset", DataType::Utf8, false), + Field::new("dataset_variant", DataType::Utf8, true), + Field::new("format", DataType::Utf8, false), + Field::new("value_bytes", DataType::Int64, false), + ])); + let cols: Vec = vec![ + Arc::new(Int64Array::from(measurement_id)), + Arc::new(StringArray::from(commit_sha)), + Arc::new(StringArray::from(dataset)), + Arc::new(StringArray::from(dataset_variant)), + Arc::new(StringArray::from(format)), + Arc::new(Int64Array::from(value_bytes)), + ]; + Ok(RecordBatch::try_new(schema, cols)?) +} + +/// Build a non-nullable `List` Arrow array from one inner Vec +/// per row. The outer list is non-null; inner i64 values are non-null. +fn build_list_int64(values: Vec>) -> ListArray { + let mut offsets: Vec = Vec::with_capacity(values.len() + 1); + offsets.push(0); + let mut flat: Vec = Vec::new(); + for inner in values { + flat.extend_from_slice(&inner); + offsets.push(flat.len() as i32); + } + let values_arr = Int64Array::from(flat); + let field = Arc::new(Field::new("item", DataType::Int64, false)); + ListArray::new( + field, + OffsetBuffer::new(offsets.into()), + Arc::new(values_arr), + None, + ) +} + /// Print the summary in a human-readable form. Returned by the CLI. 
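+///
+/// Abridged sketch of the rendered output (counts illustrative):
+///
+/// ```text
+/// Missing commit: 0
+/// Skipped (no value): 2
+/// Deduplicated: 1
+/// ```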
impl std::fmt::Display for MigrationSummary { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -580,6 +787,8 @@ impl std::fmt::Display for MigrationSummary { writeln!(f, "Inserted (file sizes): {}", self.file_size_inserted)?; writeln!(f, "Missing commit: {}", self.missing_commit)?; writeln!(f, "Skipped (no value): {}", self.skipped_no_value)?; + writeln!(f, "Skipped (intentional): {}", self.skipped_intentional)?; + writeln!(f, "Deduplicated: {}", self.deduped)?; writeln!( f, "Uncategorized: {} ({:.2}%)", diff --git a/benchmarks-website/migrate/tests/classifier.rs b/benchmarks-website/migrate/tests/classifier.rs index e8288751d62..a26658cea53 100644 --- a/benchmarks-website/migrate/tests/classifier.rs +++ b/benchmarks-website/migrate/tests/classifier.rs @@ -6,8 +6,11 @@ use rstest::rstest; use serde_json::json; +use vortex_bench_migrate::classifier::Outcome; +use vortex_bench_migrate::classifier::Skip; use vortex_bench_migrate::classifier::V3Bin; use vortex_bench_migrate::classifier::classify; +use vortex_bench_migrate::classifier::classify_outcome; use vortex_bench_migrate::classifier::format_query; use vortex_bench_migrate::classifier::rename_engine; use vortex_bench_migrate::v2::V2Record; @@ -263,7 +266,6 @@ fn compression_size_records(#[case] name: &str, #[case] expected: V3Bin) { #[case::ratio_size_vortex_parquet("vortex:parquet-zstd size/clickbench")] #[case::ratio_size_vortex_raw("vortex:raw size/clickbench")] #[case::throughput("compress throughput/clickbench")] -#[case::fineweb_skipped("fineweb_q01/datafusion:parquet")] #[case::nonsense_prefix("not-a-known-bench/series")] fn unmapped_records_yield_none(#[case] name: &str) { let r = record(name); @@ -274,6 +276,94 @@ fn unmapped_records_yield_none(#[case] name: &str) { ); } +#[test] +fn parquet_zstd_size_is_deprecated() { + // `parquet-zstd` is not on the v3 emitter's format allowlist, so + // historical `parquet-zstd size/...` records bucket under + // Skip::Deprecated and don't render as orphan charts in v3. + let r = record("parquet-zstd size/clickbench"); + assert!(matches!( + classify_outcome(&r), + Outcome::Skip(Skip::Deprecated) + )); +} + +#[test] +fn vortex_parquet_zstd_ratio_is_intentional_skip() { + let r = record("vortex:parquet-zstd ratio compress time/clickbench"); + assert!(matches!( + classify_outcome(&r), + Outcome::Skip(Skip::DerivedRatio) + )); +} + +#[test] +fn vortex_parquet_zst_typo_ratio_is_intentional_skip() { + // `parquet-zst` (no trailing `d`) was emitted by some v2 runs. + // Both spellings should classify as derived ratios. + for name in [ + "vortex:parquet-zst ratio compress time/clickbench", + "vortex:parquet-zst ratio decompress time/clickbench", + ] { + let r = record(name); + assert!( + matches!(classify_outcome(&r), Outcome::Skip(Skip::DerivedRatio)), + "{name:?} should be DerivedRatio", + ); + } +} + +#[test] +fn throughput_is_intentional_skip() { + let r = record("compress throughput/clickbench"); + assert!(matches!( + classify_outcome(&r), + Outcome::Skip(Skip::Throughput) + )); +} + +#[test] +fn unknown_prefix_is_unknown() { + let r = record("not-a-known-bench/series"); + assert!(matches!(classify_outcome(&r), Outcome::Unknown)); +} + +#[test] +fn gharchive_q00_is_deprecated() { + // gharchive isn't on the v3 query-suite allowlist, so historical + // gharchive query records bucket as Skip::Deprecated. 
+ let r = record("gharchive_q00/datafusion:parquet"); + assert!(matches!( + classify_outcome(&r), + Outcome::Skip(Skip::Deprecated) + )); +} + +#[test] +fn fineweb_q00_is_deprecated() { + // fineweb isn't on the v3 query-suite allowlist, so historical + // fineweb query records bucket as Skip::Deprecated. + let r = record("fineweb_q00/datafusion:parquet"); + assert!(matches!( + classify_outcome(&r), + Outcome::Skip(Skip::Deprecated) + )); +} + +#[test] +fn engine_casing_lowercased() { + // Older v2 records emitted display-case engines like `DataFusion` + // and `DuckDB`. The classifier lowercases at push time so dedup + // collapses display-case rows into the canonical lowercase ones. + let r = record("clickbench_q07/DataFusion:parquet"); + let outcome = classify_outcome(&r); + let Outcome::Bin(V3Bin::Query { engine, format, .. }) = outcome else { + panic!("expected Bin(Query), got {outcome:?}"); + }; + assert_eq!(engine, "datafusion"); + assert_eq!(format, "parquet"); +} + #[test] fn rename_engine_pins_canonical_outputs() { assert_eq!(rename_engine("vortex-tokio-local-disk"), "vortex-nvme"); diff --git a/benchmarks-website/migrate/tests/end_to_end.rs b/benchmarks-website/migrate/tests/end_to_end.rs index b389f77c421..a8328342c9a 100644 --- a/benchmarks-website/migrate/tests/end_to_end.rs +++ b/benchmarks-website/migrate/tests/end_to_end.rs @@ -1,11 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Inline JSONL fixture exercising 1 record per kind through the full -//! migration into a tempdir DuckDB. No live S3. +//! Inline JSONL fixtures driven through the full migration into a +//! tempdir DuckDB. No live S3. use std::fs::File; use std::io::Write; +use std::path::Path; use duckdb::Connection; use flate2::Compression; @@ -23,25 +24,34 @@ const DATA_JSONL: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_ {"name":"random-access/taxi/take/parquet-tokio-local-disk","commit_id":"deadbeef","unit":"ns","value":777,"all_runtimes":[700,777,800]} "#; -fn write_local_dir() -> TempDir { +/// Build a local-source fixture directory. Caller supplies the contents +/// of `commits.json`, `data.json.gz`, and any number of +/// `file-sizes-*.json.gz` files (name → contents). +fn build_fixture(commits: &str, data: &str, file_sizes: &[(&str, &str)]) -> TempDir { let dir = TempDir::new().expect("tempdir"); - { - let mut f = File::create(dir.path().join("commits.json")).unwrap(); - f.write_all(COMMITS_JSONL.as_bytes()).unwrap(); + write_text(&dir.path().join("commits.json"), commits); + write_gz(&dir.path().join("data.json.gz"), data); + for (name, body) in file_sizes { + write_gz(&dir.path().join(name), body); } - { - let f = File::create(dir.path().join("data.json.gz")).unwrap(); - let mut gz = GzEncoder::new(f, Compression::default()); - gz.write_all(DATA_JSONL.as_bytes()).unwrap(); - gz.finish().unwrap(); - } - // No file-sizes-*.json.gz to keep the fixture minimal. 
dir } +fn write_text(path: &Path, body: &str) { + let mut f = File::create(path).unwrap(); + f.write_all(body.as_bytes()).unwrap(); +} + +fn write_gz(path: &Path, body: &str) { + let f = File::create(path).unwrap(); + let mut gz = GzEncoder::new(f, Compression::default()); + gz.write_all(body.as_bytes()).unwrap(); + gz.finish().unwrap(); +} + #[test] fn migrate_inline_fixture_populates_each_table() { - let src_dir = write_local_dir(); + let src_dir = build_fixture(COMMITS_JSONL, DATA_JSONL, &[]); let target_dir = TempDir::new().unwrap(); let target = target_dir.path().join("v3.duckdb"); @@ -109,3 +119,66 @@ fn migrate_inline_fixture_populates_each_table() { assert_eq!(dataset, "taxi/take"); assert_eq!(format, "parquet"); } + +#[test] +fn dedup_collision_keeps_one_row() { + // Two data.json.gz lines whose query-measurement dim columns are + // identical (same commit / dataset / engine / format / query_idx, + // and `storage` collapses to "nvme" since `storage` is unset). + // Different `value`s. The accumulator's HashSet + // should drop the second one and bump `summary.deduped`. + const DATA: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":111} +{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":222} +"#; + + let src_dir = build_fixture(COMMITS_JSONL, DATA, &[]); + let target_dir = TempDir::new().unwrap(); + let target = target_dir.path().join("v3.duckdb"); + + let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); + + assert_eq!(summary.records_read, 2, "summary={summary}"); + assert_eq!(summary.query_inserted, 1, "summary={summary}"); + assert_eq!(summary.deduped, 1, "summary={summary}"); + + let conn = Connection::open(&target).unwrap(); + let n: i64 = conn + .query_row("SELECT COUNT(*) FROM query_measurements", [], |r| r.get(0)) + .unwrap(); + assert_eq!(n, 1); +} + +#[test] +fn file_sizes_sum_into_one_row() { + // Two file-sizes rows sharing (commit, benchmark, format, + // scale_factor) and value_bytes 100 + 200 must collapse to a + // single compression_sizes row with 300. 
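+    //
+    // They collide on `mid` because the per-file `file` name is not part
+    // of the CompressionSize row; the measurement id covers only the dim
+    // columns, otherwise these two rows could never merge into one.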
+ const FILE_SIZES: &str = r#"{"commit_id":"deadbeef","benchmark":"clickbench","scale_factor":"1.0","format":"vortex-file-compressed","file":"part-0.vortex","size_bytes":100} +{"commit_id":"deadbeef","benchmark":"clickbench","scale_factor":"1.0","format":"vortex-file-compressed","file":"part-1.vortex","size_bytes":200} +"#; + + let src_dir = build_fixture( + COMMITS_JSONL, + "", + &[("file-sizes-clickbench.json.gz", FILE_SIZES)], + ); + let target_dir = TempDir::new().unwrap(); + let target = target_dir.path().join("v3.duckdb"); + + let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); + + assert_eq!(summary.file_size_inserted, 2, "summary={summary}"); + assert_eq!(summary.compression_size_inserted, 1, "summary={summary}"); + + let conn = Connection::open(&target).unwrap(); + let n: i64 = conn + .query_row("SELECT COUNT(*) FROM compression_sizes", [], |r| r.get(0)) + .unwrap(); + assert_eq!(n, 1); + let value_bytes: i64 = conn + .query_row("SELECT value_bytes FROM compression_sizes", [], |r| { + r.get(0) + }) + .unwrap(); + assert_eq!(value_bytes, 300); +} From b0281483bdbb46ad3e82c4876d2751a53d39949e Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Sun, 26 Apr 2026 21:02:26 -0400 Subject: [PATCH 5/5] clean up and fix bugs Signed-off-by: Connor Tsui --- Cargo.lock | 1 + benchmarks-website/migrate/Cargo.toml | 1 + benchmarks-website/migrate/src/classifier.rs | 46 +++++++++-- benchmarks-website/migrate/src/migrate.rs | 56 +++++++++---- benchmarks-website/migrate/src/source.rs | 22 +++++- .../migrate/tests/classifier.rs | 59 +++++++++++++- .../migrate/tests/end_to_end.rs | 79 +++++++++++++++++++ 7 files changed, 237 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 20075443c36..5315ba5ef7f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10409,6 +10409,7 @@ dependencies = [ "tracing", "tracing-subscriber", "vortex-bench-server", + "vortex-utils", ] [[package]] diff --git a/benchmarks-website/migrate/Cargo.toml b/benchmarks-website/migrate/Cargo.toml index f9b83d5d543..45a752df397 100644 --- a/benchmarks-website/migrate/Cargo.toml +++ b/benchmarks-website/migrate/Cargo.toml @@ -34,6 +34,7 @@ tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } tracing = { workspace = true, features = ["std"] } tracing-subscriber = { workspace = true, features = ["env-filter", "fmt"] } vortex-bench-server = { path = "../server" } +vortex-utils = { workspace = true } [dev-dependencies] rstest = { workspace = true } diff --git a/benchmarks-website/migrate/src/classifier.rs b/benchmarks-website/migrate/src/classifier.rs index 6b3368c64b8..8a17b31fcd2 100644 --- a/benchmarks-website/migrate/src/classifier.rs +++ b/benchmarks-website/migrate/src/classifier.rs @@ -424,6 +424,12 @@ pub enum Skip { /// Dim outside the v3 emitter's allowlist (e.g. `parquet-zstd`, /// historical-only suites no longer in CI). Deprecated, + /// v2 memory measurements (`*_memory/*` records). Carry top-level + /// `peak_physical_memory` / `peak_virtual_memory` / + /// `physical_memory_delta` / `virtual_memory_delta` fields that + /// `V2Record` doesn't deserialize. Not migrated for alpha; merging + /// into the corresponding QueryMeasurement row is future work. + HistoricalMemory, } /// Engines the v3 emitter produces today. Anything else is historical @@ -451,9 +457,18 @@ const V3_FORMATS: &[&str] = &[ /// classify (so historical analyses stay coherent) but get bucketed /// as `Skip::Deprecated` so they don't render as orphan charts in v3. 
/// -/// ORCHESTRATOR NOTE: add `fineweb` and/or `gharchive` here if a CI -/// grep shows v3 still emits them. -const V3_QUERY_SUITES: &[&str] = &["clickbench", "tpch", "tpcds", "statpopgen", "polarsignals"]; +/// `fineweb` is included because `.github/workflows/sql-benchmarks.yml` +/// still has `fineweb` and `fineweb-s3` matrix entries. `gharchive` +/// stays excluded — it's defined in `vortex-bench` but no current +/// workflow runs it. +const V3_QUERY_SUITES: &[&str] = &[ + "clickbench", + "tpch", + "tpcds", + "statpopgen", + "polarsignals", + "fineweb", +]; /// Returns true if every dim that v3 stores as a column is on the /// emitter's current allowlist. Dim values outside the allowlist mean @@ -488,6 +503,16 @@ pub fn classify_outcome(record: &V2Record) -> Outcome { if record.name.contains(" throughput") { return Outcome::Skip(Skip::Throughput); } + // v2 memory records: e.g. "clickbench_q07_memory/datafusion:parquet". + // Match the `_memory/` infix BEFORE the engine/format split, so they + // route to a known Skip variant instead of slipping through to + // Outcome::Unknown and tripping the 5% gate. + let lower = record.name.to_lowercase(); + if let Some((head, _)) = lower.split_once('/') + && head.ends_with("_memory") + { + return Outcome::Skip(Skip::HistoricalMemory); + } let Some(group) = get_group(record) else { return Outcome::Unknown; }; @@ -613,7 +638,7 @@ fn bin_compression_time(cls: &V2Classification, _record: &V2Record) -> Option Option { +fn bin_compression_size(cls: &V2Classification, record: &V2Record) -> Option { let lc = cls.chart.to_lowercase(); // Ratios like "VORTEX:PARQUET ZSTD SIZE" / "VORTEX:LANCE SIZE" / // "VORTEX:RAW SIZE" are derived from compression_sizes at read @@ -641,9 +666,20 @@ fn bin_compression_size(cls: &V2Classification, _record: &V2Record) -> Option { let rar = RandomAccessTime { @@ -495,15 +497,22 @@ struct QueryAccum { physical_delta: Vec>, virtual_delta: Vec>, env_triple: Vec>, - seen: HashSet, + /// `mid` -> index in the parallel column vecs. Lets us look up the + /// kept row's `value_ns` on collision so we can flag conflicts. 
+ seen: HashMap, } impl QueryAccum { fn push(&mut self, mid: i64, r: QueryMeasurement, summary: &mut MigrationSummary) { - if !self.seen.insert(mid) { + if let Some(&idx) = self.seen.get(&mid) { summary.deduped += 1; + if self.value_ns[idx] != r.value_ns { + summary.deduped_with_conflict += 1; + } return; } + let idx = self.measurement_id.len(); + self.seen.insert(mid, idx); self.measurement_id.push(mid); self.commit_sha.push(r.commit_sha); self.dataset.push(r.dataset); @@ -534,15 +543,20 @@ struct CompressionTimeAccum { value_ns: Vec, all_runtimes_ns: Vec>, env_triple: Vec>, - seen: HashSet, + seen: HashMap, } impl CompressionTimeAccum { fn push(&mut self, mid: i64, r: CompressionTime, summary: &mut MigrationSummary) { - if !self.seen.insert(mid) { + if let Some(&idx) = self.seen.get(&mid) { summary.deduped += 1; + if self.value_ns[idx] != r.value_ns { + summary.deduped_with_conflict += 1; + } return; } + let idx = self.measurement_id.len(); + self.seen.insert(mid, idx); self.measurement_id.push(mid); self.commit_sha.push(r.commit_sha); self.dataset.push(r.dataset); @@ -564,15 +578,20 @@ struct RandomAccessAccum { value_ns: Vec, all_runtimes_ns: Vec>, env_triple: Vec>, - seen: HashSet, + seen: HashMap, } impl RandomAccessAccum { fn push(&mut self, mid: i64, r: RandomAccessTime, summary: &mut MigrationSummary) { - if !self.seen.insert(mid) { + if let Some(&idx) = self.seen.get(&mid) { summary.deduped += 1; + if self.value_ns[idx] != r.value_ns { + summary.deduped_with_conflict += 1; + } return; } + let idx = self.measurement_id.len(); + self.seen.insert(mid, idx); self.measurement_id.push(mid); self.commit_sha.push(r.commit_sha); self.dataset.push(r.dataset); @@ -594,7 +613,15 @@ struct CompressionSizeAccum { impl CompressionSizeAccum { /// data.json.gz path: latest write wins, mirroring the prior /// `ON CONFLICT DO UPDATE SET value_bytes = excluded.value_bytes`. - fn push_replace(&mut self, mid: i64, r: CompressionSize) { + /// Bumps `deduped_with_conflict` when an existing row's + /// `value_bytes` differs from the incoming row's, so silent + /// value-corruption is observable. + fn push_replace(&mut self, mid: i64, r: CompressionSize, summary: &mut MigrationSummary) { + if let Some(existing) = self.rows.get(&mid) + && existing.value_bytes != r.value_bytes + { + summary.deduped_with_conflict += 1; + } self.rows.insert(mid, r); } @@ -789,6 +816,7 @@ impl std::fmt::Display for MigrationSummary { writeln!(f, "Skipped (no value): {}", self.skipped_no_value)?; writeln!(f, "Skipped (intentional): {}", self.skipped_intentional)?; writeln!(f, "Deduplicated: {}", self.deduped)?; + writeln!(f, "Dedup w/ value diff: {}", self.deduped_with_conflict)?; writeln!( f, "Uncategorized: {} ({:.2}%)", diff --git a/benchmarks-website/migrate/src/source.rs b/benchmarks-website/migrate/src/source.rs index 340a9bdb60f..c18e86a63ca 100644 --- a/benchmarks-website/migrate/src/source.rs +++ b/benchmarks-website/migrate/src/source.rs @@ -120,7 +120,21 @@ fn open_s3(name: &str) -> Result> { } /// Suite IDs we know publish a `file-sizes-{id}.json.gz` to S3. -/// Matches the `matrix.id` values in `.github/workflows/sql-benchmarks.yml` -/// at the time of writing. New suites mean a new entry here. -const KNOWN_FILE_SIZES_SUITES: &[&str] = - &["clickbench", "tpch", "tpcds", "statpopgen", "polarsignals"]; +/// +/// Source of truth: the `matrix.id` values in +/// `.github/workflows/sql-benchmarks.yml`'s `benchmark_matrix` default. 
+/// The post-bench `file-sizes` step uploads `file-sizes-${{ matrix.id +/// }}.json.gz`, so this list must match those IDs verbatim. Adding a +/// new matrix entry to that workflow means adding the same ID here. +const KNOWN_FILE_SIZES_SUITES: &[&str] = &[ + "clickbench-nvme", + "tpch-nvme", + "tpch-s3", + "tpch-nvme-10", + "tpch-s3-10", + "tpcds-nvme", + "statpopgen", + "fineweb", + "fineweb-s3", + "polarsignals", +]; diff --git a/benchmarks-website/migrate/tests/classifier.rs b/benchmarks-website/migrate/tests/classifier.rs index a26658cea53..cddca0c517c 100644 --- a/benchmarks-website/migrate/tests/classifier.rs +++ b/benchmarks-website/migrate/tests/classifier.rs @@ -340,16 +340,67 @@ fn gharchive_q00_is_deprecated() { } #[test] -fn fineweb_q00_is_deprecated() { - // fineweb isn't on the v3 query-suite allowlist, so historical - // fineweb query records bucket as Skip::Deprecated. +fn fineweb_q00_classifies() { + // fineweb is on V3_QUERY_SUITES (still emitted by v3 CI per + // .github/workflows/sql-benchmarks.yml's `fineweb` matrix entry), + // so historical fineweb records ingest like any other suite. let r = record("fineweb_q00/datafusion:parquet"); assert!(matches!( classify_outcome(&r), - Outcome::Skip(Skip::Deprecated) + Outcome::Bin(V3Bin::Query { .. }) )); } +#[test] +fn memory_record_is_historical_memory_skip() { + // v2 emitted `_q_memory/:` records that + // carry top-level memory fields V2Record doesn't deserialize. + // Skip them with a known variant so they don't trip the 5% gate. + let r = record("clickbench_q07_memory/datafusion:parquet"); + assert!(matches!( + classify_outcome(&r), + Outcome::Skip(Skip::HistoricalMemory) + )); +} + +#[test] +fn tpch_compression_size_carries_scale_factor() { + // The data.json.gz "vortex size/tpch" path needs to derive + // dataset_variant from the v2 record's `dataset` object, the same + // way the file-sizes path does. Otherwise SF=10 rows from the two + // sources never collide on `mid` and produce duplicate rows. + let mut r = record("vortex size/tpch"); + r.dataset = Some(serde_json::json!({ "tpch": { "scale_factor": "10" } })); + let outcome = classify_outcome(&r); + let Outcome::Bin(V3Bin::CompressionSize { + dataset, + dataset_variant, + format, + }) = outcome + else { + panic!("expected Bin(CompressionSize), got {outcome:?}"); + }; + assert_eq!(dataset, "tpch"); + assert_eq!(dataset_variant, Some("10".into())); + assert_eq!(format, "vortex-file-compressed"); +} + +#[test] +fn tpch_compression_size_drops_default_scale_factor() { + // SF "1.0" matches the file-sizes path's filter and collapses to + // dataset_variant: None. + let mut r = record("vortex size/tpch"); + r.dataset = Some(serde_json::json!({ "tpch": { "scale_factor": "1.0" } })); + let outcome = classify_outcome(&r); + let Outcome::Bin(V3Bin::CompressionSize { + dataset_variant, .. + }) = outcome + else { + panic!("expected Bin(CompressionSize), got {outcome:?}"); + }; + assert_eq!(dataset_variant, None); +} + #[test] fn engine_casing_lowercased() { // Older v2 records emitted display-case engines like `DataFusion` diff --git a/benchmarks-website/migrate/tests/end_to_end.rs b/benchmarks-website/migrate/tests/end_to_end.rs index a8328342c9a..210092a4058 100644 --- a/benchmarks-website/migrate/tests/end_to_end.rs +++ b/benchmarks-website/migrate/tests/end_to_end.rs @@ -148,6 +148,85 @@ fn dedup_collision_keeps_one_row() { assert_eq!(n, 1); } +#[test] +fn dedup_with_conflicting_value_ns_is_counted() { + // Same dim columns, different `value`s. 
Dedup keeps the first + // and bumps `deduped_with_conflict` because the dropped row's + // value_ns differed from the kept row's. This is the signal we + // care about when watching for silent value-corruption across + // duplicated v2 emissions. + const DATA: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":111} +{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":222} +"#; + + let src_dir = build_fixture(COMMITS_JSONL, DATA, &[]); + let target_dir = TempDir::new().unwrap(); + let target = target_dir.path().join("v3.duckdb"); + + let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); + + assert_eq!(summary.deduped, 1, "summary={summary}"); + assert_eq!(summary.deduped_with_conflict, 1, "summary={summary}"); +} + +#[test] +fn dedup_with_matching_value_ns_does_not_count_conflict() { + // Same dim columns AND identical `value`s. Dedup still drops the + // duplicate, but `deduped_with_conflict` stays 0. + const DATA: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":111} +{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":111} +"#; + + let src_dir = build_fixture(COMMITS_JSONL, DATA, &[]); + let target_dir = TempDir::new().unwrap(); + let target = target_dir.path().join("v3.duckdb"); + + let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); + + assert_eq!(summary.deduped, 1, "summary={summary}"); + assert_eq!(summary.deduped_with_conflict, 0, "summary={summary}"); +} + +#[test] +fn compression_size_data_and_file_sizes_merge() { + // A `vortex size/tpch` record from data.json.gz and a + // file-sizes-tpch-nvme.json.gz row covering the same (commit, + // dataset, format, SF) tuple should produce the *same* + // measurement_id so the in-memory accumulator merges them into + // one row instead of two. + // + // Both sources use scale_factor "1.0", which both code paths + // filter out → dataset_variant: None on both sides → matching mid. + const DATA: &str = r#"{"name":"vortex size/tpch","commit_id":"deadbeef","unit":"bytes","value":200,"dataset":{"tpch":{"scale_factor":"1.0"}}} +"#; + const FILE_SIZES: &str = r#"{"commit_id":"deadbeef","benchmark":"tpch","scale_factor":"1.0","format":"vortex-file-compressed","file":"part-0.vortex","size_bytes":100} +"#; + + let src_dir = build_fixture( + COMMITS_JSONL, + DATA, + &[("file-sizes-tpch-nvme.json.gz", FILE_SIZES)], + ); + let target_dir = TempDir::new().unwrap(); + let target = target_dir.path().join("v3.duckdb"); + + let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); + + assert_eq!(summary.compression_size_inserted, 1, "summary={summary}"); + + let conn = Connection::open(&target).unwrap(); + let (n, value_bytes): (i64, i64) = conn + .query_row( + "SELECT COUNT(*), SUM(value_bytes) FROM compression_sizes", + [], + |r| Ok((r.get(0)?, r.get(1)?)), + ) + .unwrap(); + assert_eq!(n, 1); + // data.json.gz seeds value_bytes=200, file-sizes adds 100. + assert_eq!(value_bytes, 300); +} + #[test] fn file_sizes_sum_into_one_row() { // Two file-sizes rows sharing (commit, benchmark, format,