From abae54d7085ce34e4e3e745c2f15196ce08f1eb9 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 26 Apr 2026 19:58:36 +0000 Subject: [PATCH 1/5] migrate(benchmarks-v3): add one-shot v2-to-v3 historical migrator Reads v2's data.json.gz/commits.json/file-sizes from S3, ports v2's getGroup classifier bug-for-bug, and writes a fully populated v3 DuckDB. Includes a verify subcommand that diffs group/chart structure against the live v2 /api/metadata endpoint. Binary and classifier are throwaway: deleted post-cutover. Signed-off-by: Claude --- Cargo.lock | 19 + Cargo.toml | 1 + benchmarks-website/migrate/Cargo.toml | 36 ++ benchmarks-website/migrate/src/classifier.rs | 605 ++++++++++++++++++ benchmarks-website/migrate/src/commits.rs | 100 +++ benchmarks-website/migrate/src/lib.rs | 21 + benchmarks-website/migrate/src/main.rs | 114 ++++ benchmarks-website/migrate/src/migrate.rs | 562 ++++++++++++++++ benchmarks-website/migrate/src/source.rs | 116 ++++ benchmarks-website/migrate/src/v2.rs | 142 ++++ benchmarks-website/migrate/src/verify.rs | 350 ++++++++++ .../migrate/tests/classifier.rs | 291 +++++++++ .../migrate/tests/end_to_end.rs | 111 ++++ 13 files changed, 2468 insertions(+) create mode 100644 benchmarks-website/migrate/Cargo.toml create mode 100644 benchmarks-website/migrate/src/classifier.rs create mode 100644 benchmarks-website/migrate/src/commits.rs create mode 100644 benchmarks-website/migrate/src/lib.rs create mode 100644 benchmarks-website/migrate/src/main.rs create mode 100644 benchmarks-website/migrate/src/migrate.rs create mode 100644 benchmarks-website/migrate/src/source.rs create mode 100644 benchmarks-website/migrate/src/v2.rs create mode 100644 benchmarks-website/migrate/src/verify.rs create mode 100644 benchmarks-website/migrate/tests/classifier.rs create mode 100644 benchmarks-website/migrate/tests/end_to_end.rs diff --git a/Cargo.lock b/Cargo.lock index df22be4de9b..079289cdfa8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10352,6 +10352,25 @@ dependencies = [ "vortex-tensor", ] +[[package]] +name = "vortex-bench-migrate" +version = "0.1.0-alpha.0" +dependencies = [ + "anyhow", + "clap", + "duckdb", + "flate2", + "reqwest 0.13.2", + "rstest", + "serde", + "serde_json", + "tempfile", + "tokio", + "tracing", + "tracing-subscriber", + "vortex-bench-server", +] + [[package]] name = "vortex-bench-server" version = "0.1.0-alpha.0" diff --git a/Cargo.toml b/Cargo.toml index 70a02d78312..8d7a1ee6adb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,6 +62,7 @@ members = [ "benchmarks/vector-search-bench", # Benchmarks website v3 (alpha) - leaf binary, not part of vortex-* API "benchmarks-website/server", + "benchmarks-website/migrate", ] exclude = ["java/testfiles", "wasm-test"] resolver = "2" diff --git a/benchmarks-website/migrate/Cargo.toml b/benchmarks-website/migrate/Cargo.toml new file mode 100644 index 00000000000..464e55d9485 --- /dev/null +++ b/benchmarks-website/migrate/Cargo.toml @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +[package] +name = "vortex-bench-migrate" +version = "0.1.0-alpha.0" +edition = "2024" +rust-version = "1.91.0" +license = "Apache-2.0" +description = "One-shot historical migrator from the v2 benchmarks S3 dataset to a v3 DuckDB file" +publish = false + +[[bin]] +name = "vortex-bench-migrate" +path = "src/main.rs" + +# Throwaway binary, not part of the vortex-* public API surface. +# Errors use anyhow, and the crate is intentionally outside the +# workspace public-api lockfile set. 
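+#
+# A rough usage sketch (the output path and log filter are illustrative,
+# not defaults baked into the binary):
+#
+#   VORTEX_BENCH_LOG=info cargo run -p vortex-bench-migrate -- \
+#       run --output ./benchmarks-v3.duckdb --source public-s3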
+ +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true, features = ["derive"] } +duckdb = { version = "1.4", features = ["bundled"] } +flate2 = "1.1" +reqwest = { workspace = true, features = ["json"] } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } +tracing = { workspace = true, features = ["std"] } +tracing-subscriber = { workspace = true, features = ["env-filter", "fmt"] } +vortex-bench-server = { path = "../server" } + +[dev-dependencies] +rstest = { workspace = true } +tempfile = { workspace = true } diff --git a/benchmarks-website/migrate/src/classifier.rs b/benchmarks-website/migrate/src/classifier.rs new file mode 100644 index 00000000000..f7a1e56c0ae --- /dev/null +++ b/benchmarks-website/migrate/src/classifier.rs @@ -0,0 +1,605 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Bug-for-bug port of v2's `getGroup`, `formatQuery`, and +//! `normalizeChartName` from `benchmarks-website/server.js`, plus the +//! mapping from v2 group + name pattern to a v3 fact-table bin. +//! +//! The v2 classifier was the source of truth for what historical +//! records mean. It groups records by name prefix into one of: +//! "Random Access", "Compression", "Compression Size", or one of the +//! SQL query suites (with optional fan-out by storage and scale +//! factor for TPC-H/TPC-DS). This module reproduces that logic and +//! then hops to a v3 fact-table bin, since v3 stores dim values as +//! columns instead of name fragments. + +use crate::v2::V2Record; +use crate::v2::dataset_scale_factor; + +/// Static port of v2's `QUERY_SUITES`. +pub const QUERY_SUITES: &[QuerySuite] = &[ + QuerySuite { + prefix: "clickbench", + display_name: "Clickbench", + query_prefix: "CLICKBENCH", + dataset_key: None, + fan_out: false, + skip: false, + }, + QuerySuite { + prefix: "statpopgen", + display_name: "Statistical and Population Genetics", + query_prefix: "STATPOPGEN", + dataset_key: None, + fan_out: false, + skip: false, + }, + QuerySuite { + prefix: "polarsignals", + display_name: "PolarSignals Profiling", + query_prefix: "POLARSIGNALS", + dataset_key: None, + fan_out: false, + skip: false, + }, + QuerySuite { + prefix: "tpch", + display_name: "TPC-H", + query_prefix: "TPC-H", + dataset_key: Some("tpch"), + fan_out: true, + skip: false, + }, + QuerySuite { + prefix: "tpcds", + display_name: "TPC-DS", + query_prefix: "TPC-DS", + dataset_key: Some("tpcds"), + fan_out: true, + skip: false, + }, + QuerySuite { + prefix: "fineweb", + display_name: "Fineweb", + query_prefix: "FINEWEB", + dataset_key: None, + fan_out: false, + skip: true, + }, +]; + +/// Static port of v2's `ENGINE_RENAMES`. Applied to the "series" half +/// of a benchmark name (the part after the first `/`) before splitting +/// on `:` into engine/format. Order doesn't matter — keys are unique. 
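+///
+/// For example (mixed case is lower-cased before the first lookup):
+///
+/// ```ignore
+/// assert_eq!(rename_engine("DataFusion:Vortex-File-Compressed"), "datafusion:vortex");
+/// assert_eq!(rename_engine("Lance-Tokio-Local-Disk"), "lance-nvme");
+/// ```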
+const ENGINE_RENAMES: &[(&str, &str)] = &[ + ("datafusion:vortex-file-compressed", "datafusion:vortex"), + ("datafusion:parquet", "datafusion:parquet"), + ("datafusion:arrow", "datafusion:in-memory-arrow"), + ("datafusion:lance", "datafusion:lance"), + ("datafusion:vortex-compact", "datafusion:vortex-compact"), + ("duckdb:vortex-file-compressed", "duckdb:vortex"), + ("duckdb:parquet", "duckdb:parquet"), + ("duckdb:duckdb", "duckdb:duckdb"), + ("duckdb:vortex-compact", "duckdb:vortex-compact"), + ("vortex-tokio-local-disk", "vortex-nvme"), + ("vortex-compact-tokio-local-disk", "vortex-compact-nvme"), + ("lance-tokio-local-disk", "lance-nvme"), + ("parquet-tokio-local-disk", "parquet-nvme"), + ("lance", "lance"), +]; + +/// One entry of `QUERY_SUITES`. +#[derive(Debug, Clone, Copy)] +pub struct QuerySuite { + pub prefix: &'static str, + pub display_name: &'static str, + pub query_prefix: &'static str, + pub dataset_key: Option<&'static str>, + pub fan_out: bool, + pub skip: bool, +} + +/// Group a v2 record falls into. Mirrors `getGroup` in `server.js`, +/// including the fan-out group naming for TPC-H/TPC-DS. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum V2Group { + RandomAccess, + Compression, + CompressionSize, + Query { + suite_index: usize, + /// `Some` for fan-out suites only. + storage: Option, + /// `Some` for fan-out suites only. + scale_factor: Option, + }, +} + +impl V2Group { + /// Display name as v2 served it from `/api/metadata`. + pub fn display_name(&self) -> String { + match self { + V2Group::RandomAccess => "Random Access".into(), + V2Group::Compression => "Compression".into(), + V2Group::CompressionSize => "Compression Size".into(), + V2Group::Query { + suite_index, + storage, + scale_factor, + } => { + let suite = &QUERY_SUITES[*suite_index]; + if let (Some(storage), Some(sf)) = (storage, scale_factor) { + format!("{} ({}) (SF={})", suite.display_name, storage, sf) + } else { + suite.display_name.to_string() + } + } + } + } +} + +/// Apply v2's `ENGINE_RENAMES`. Reproduces the JS `rename`: +/// `RENAMES[s.toLowerCase()] || RENAMES[s] || s`. +pub fn rename_engine(s: &str) -> String { + let lower = s.to_lowercase(); + for (k, v) in ENGINE_RENAMES { + if *k == lower { + return (*v).to_string(); + } + } + for (k, v) in ENGINE_RENAMES { + if *k == s { + return (*v).to_string(); + } + } + s.to_string() +} + +/// Faithful port of v2's `formatQuery`: maps `clickbench_q07` → +/// `"CLICKBENCH Q7"`. Returns the original (uppercased, +/// `-` and `_` replaced with spaces) when no suite matches. +pub fn format_query(q: &str) -> String { + let lower = q.to_lowercase(); + for suite in QUERY_SUITES { + if suite.skip { + continue; + } + let prefix = suite.prefix; + if let Some(rest) = lower.strip_prefix(prefix) + && let Some(idx) = parse_query_index(rest) + { + return format!("{} Q{}", suite.query_prefix, idx); + } + } + let mut out = q.to_uppercase(); + out = out.replace(['_', '-'], " "); + out +} + +/// Parse the `_q07` / ` q7` / `q42` tail used by `format_query`. +/// Returns the integer query index if the tail matches the v2 regex +/// `^[_ ]?q(\d+)`. 
+fn parse_query_index(rest: &str) -> Option { + let after_sep = rest + .strip_prefix('_') + .or_else(|| rest.strip_prefix(' ')) + .unwrap_or(rest); + let after_q = after_sep + .strip_prefix('q') + .or_else(|| after_sep.strip_prefix('Q'))?; + let digits: String = after_q.chars().take_while(|c| c.is_ascii_digit()).collect(); + if digits.is_empty() { + return None; + } + digits.parse().ok() +} + +/// Faithful port of v2's `normalizeChartName`. +pub fn normalize_chart_name(group: &V2Group, chart_name: &str) -> String { + if matches!(group, V2Group::CompressionSize) && chart_name == "VORTEX FILE COMPRESSED SIZE" { + return "VORTEX SIZE".into(); + } + chart_name.to_string() +} + +/// Port of v2's `getGroup`. Returns `None` for skipped suites +/// (e.g. `fineweb`) or names that match nothing. +pub fn get_group(record: &V2Record) -> Option { + let lower = record.name.to_lowercase(); + + if lower.starts_with("random-access/") || lower.starts_with("random access/") { + return Some(V2Group::RandomAccess); + } + + if lower.starts_with("vortex size/") + || lower.starts_with("vortex-file-compressed size/") + || lower.starts_with("parquet size/") + || lower.starts_with("lance size/") + || lower.contains(":raw size/") + || lower.contains(":parquet-zstd size/") + || lower.contains(":lance size/") + { + return Some(V2Group::CompressionSize); + } + + if lower.starts_with("compress time/") + || lower.starts_with("decompress time/") + || lower.starts_with("parquet_rs-zstd compress") + || lower.starts_with("parquet_rs-zstd decompress") + || lower.starts_with("lance compress") + || lower.starts_with("lance decompress") + || lower.starts_with("vortex:lance ratio") + || lower.starts_with("vortex:parquet-zstd ratio") + || lower.starts_with("vortex:raw ratio") + { + return Some(V2Group::Compression); + } + + for (i, suite) in QUERY_SUITES.iter().enumerate() { + let prefix_q = format!("{}_q", suite.prefix); + let prefix_slash = format!("{}/", suite.prefix); + if !lower.starts_with(&prefix_q) && !lower.starts_with(&prefix_slash) { + continue; + } + if suite.skip { + return None; + } + if !suite.fan_out { + return Some(V2Group::Query { + suite_index: i, + storage: None, + scale_factor: None, + }); + } + let storage = match record.storage.as_deref().map(str::to_uppercase).as_deref() { + Some("S3") => "S3", + _ => "NVMe", + }; + let dataset_key = suite.dataset_key.unwrap_or(suite.prefix); + let raw_sf = record + .dataset + .as_ref() + .and_then(|d| dataset_scale_factor(d, dataset_key)); + let sf = raw_sf + .as_deref() + .and_then(|s| s.parse::().ok()) + .map(|f| f.round() as i64) + .unwrap_or(1); + return Some(V2Group::Query { + suite_index: i, + storage: Some(storage.into()), + scale_factor: Some(sf.to_string()), + }); + } + + None +} + +/// Group + chart + series breakdown for a v2 record, using the same +/// rules `server.js` applies in `refresh()`. Equivalent to v2's +/// `(group, chartName, seriesName)` triple after rename / skip rules. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct V2Classification { + pub group: V2Group, + pub chart: String, + pub series: String, +} + +/// Apply the same chart / series naming v2's `refresh()` does, plus +/// the throughput / `PARQUET-UNC` skip rules. 
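+///
+/// A sketch of one fan-out record (field values are illustrative; the
+/// integration tests pin the full set of cases):
+///
+/// ```ignore
+/// // name = "tpch_q01/datafusion:parquet", storage = "S3",
+/// // dataset = { "tpch": { "scale_factor": "100" } }
+/// let cls = classify_v2(&record).unwrap();
+/// assert_eq!(cls.group.display_name(), "TPC-H (S3) (SF=100)");
+/// assert_eq!(cls.chart, "TPC-H Q1");
+/// assert_eq!(cls.series, "datafusion:parquet");
+/// ```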
+pub fn classify_v2(record: &V2Record) -> Option { + if record.name.contains(" throughput") { + return None; + } + let group = get_group(record)?; + let parts: Vec<&str> = record.name.split('/').collect(); + let (chart, series) = match (&group, parts.len()) { + (V2Group::RandomAccess, 4) => { + let chart = format!("{}/{}", parts[1], parts[2]) + .to_uppercase() + .replace(['_', '-'], " "); + let series = rename_engine(if parts[3].is_empty() { + "default" + } else { + parts[3] + }); + (chart, series) + } + (V2Group::RandomAccess, 2) => ( + "RANDOM ACCESS".to_string(), + rename_engine(if parts[1].is_empty() { + "default" + } else { + parts[1] + }), + ), + (V2Group::RandomAccess, _) => return None, + _ => { + let series_raw = if parts.len() >= 2 && !parts[1].is_empty() { + parts[1] + } else { + "default" + }; + let series = rename_engine(series_raw); + let chart = format_query(parts[0]); + (chart, series) + } + }; + let chart = normalize_chart_name(&group, &chart); + if chart.contains("PARQUET-UNC") { + return None; + } + Some(V2Classification { + group, + chart, + series, + }) +} + +/// Mapping target: which v3 fact table a v2 record lands in, plus the +/// dim values that table needs. +#[derive(Debug, Clone, PartialEq)] +pub enum V3Bin { + Query { + dataset: String, + dataset_variant: Option, + scale_factor: Option, + query_idx: i32, + storage: String, + engine: String, + format: String, + }, + CompressionTime { + dataset: String, + dataset_variant: Option, + format: String, + op: String, + }, + CompressionSize { + dataset: String, + dataset_variant: Option, + format: String, + }, + RandomAccess { + dataset: String, + format: String, + }, +} + +/// Top-level entry point. Combines `classify_v2` with the v3 fact-table +/// mapping. Returns `None` for records that: +/// +/// - Don't match any v2 group (uncategorized prefix). +/// - Are explicitly skipped by v2 (throughput, PARQUET-UNC, fineweb). +/// - Are computed-at-read-time ratios that v3 derives from +/// `compression_sizes` (`vortex:parquet-zstd ratio …`, +/// `vortex:lance ratio …`, `vortex:raw ratio …`, +/// `vortex:* size/…`). +pub fn classify(record: &V2Record) -> Option { + let cls = classify_v2(record)?; + match &cls.group { + V2Group::RandomAccess => bin_random_access(&cls, record), + V2Group::Compression => bin_compression_time(&cls, record), + V2Group::CompressionSize => bin_compression_size(&cls, record), + V2Group::Query { .. } => bin_query(&cls, record), + } +} + +fn bin_random_access(cls: &V2Classification, record: &V2Record) -> Option { + // v2 chart name shape: "RANDOM ACCESS" or "DATASET/PATTERN" (uppercase). + // We store it as the v3 dataset value verbatim, lowercased so + // `/api/groups` returns canonical lowercase names. + let dataset = cls.chart.to_lowercase(); + if dataset.is_empty() { + return None; + } + let mut format = cls.series.clone(); + if format.is_empty() { + return None; + } + // v2 emits a "default" placeholder when parts[1] is empty; treat + // that as missing and skip the row instead of inserting "default" + // as a format. + if format == "default" { + return None; + } + // The v2 random-access bench used to emit `parquet`-suffixed names; + // strip an "ns" unit guard later. + let _ = record; // record is unused here; kept for parity with siblings. + // Lower-case the format too so v3 series names are canonical. 
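+    // For example (mirrors the classifier tests): the v2 name
+    // "random-access/taxi/take/parquet-tokio-local-disk" arrives here as
+    // chart "TAXI/TAKE" + series "parquet-nvme" and is stored as
+    // dataset "taxi/take", format "parquet-nvme".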
+ format = format.to_lowercase(); + Some(V3Bin::RandomAccess { dataset, format }) +} + +fn bin_compression_time(cls: &V2Classification, _record: &V2Record) -> Option { + // v2 compression chart names look like (after format_query): + // "COMPRESS TIME" [vortex/encode] + // "DECOMPRESS TIME" [vortex/decode] + // "PARQUET RS ZSTD COMPRESS TIME" [parquet/encode] + // "PARQUET RS ZSTD DECOMPRESS TIME" [parquet/decode] + // "LANCE COMPRESS TIME" [lance/encode] + // "LANCE DECOMPRESS TIME" [lance/decode] + // "VORTEX:LANCE RATIO COMPRESS TIME" [drop] + // "VORTEX:PARQUET-ZSTD RATIO COMPRESS TIME" [drop] + // "VORTEX:RAW RATIO COMPRESS TIME" [drop] + let lc = cls.chart.to_lowercase(); + if lc.contains("ratio") || lc.contains(':') { + // Ratios are computed at read time from compression_sizes. + return None; + } + let (format, op) = if lc.starts_with("compress time") { + ("vortex-file-compressed", "encode") + } else if lc.starts_with("decompress time") { + ("vortex-file-compressed", "decode") + } else if lc.starts_with("parquet rs zstd compress time") { + ("parquet", "encode") + } else if lc.starts_with("parquet rs zstd decompress time") { + ("parquet", "decode") + } else if lc.starts_with("lance compress time") { + ("lance", "encode") + } else if lc.starts_with("lance decompress time") { + ("lance", "decode") + } else { + return None; + }; + let dataset = cls.series.to_lowercase(); + if dataset.is_empty() || dataset == "default" { + return None; + } + Some(V3Bin::CompressionTime { + dataset, + dataset_variant: None, + format: format.to_string(), + op: op.to_string(), + }) +} + +fn bin_compression_size(cls: &V2Classification, _record: &V2Record) -> Option { + let lc = cls.chart.to_lowercase(); + // Ratios like "VORTEX:PARQUET ZSTD SIZE" / "VORTEX:LANCE SIZE" / + // "VORTEX:RAW SIZE" are derived from compression_sizes at read + // time, not stored. + if lc.contains(':') { + return None; + } + let format = if lc.starts_with("vortex size") { + "vortex-file-compressed" + } else if lc.starts_with("parquet size") { + "parquet" + } else if lc.starts_with("lance size") { + "lance" + } else { + return None; + }; + let dataset = cls.series.to_lowercase(); + if dataset.is_empty() || dataset == "default" { + return None; + } + Some(V3Bin::CompressionSize { + dataset, + dataset_variant: None, + format: format.to_string(), + }) +} + +fn bin_query(cls: &V2Classification, record: &V2Record) -> Option { + let V2Group::Query { + suite_index, + storage, + scale_factor, + } = &cls.group + else { + return None; + }; + let suite = &QUERY_SUITES[*suite_index]; + + // Pull the query index from the *raw* name's first part instead of + // the formatted chart, so we don't have to round-trip "Q07". + let raw_first = record.name.split('/').next().unwrap_or(""); + let query_idx = parse_query_index_from_first(raw_first)?; + + // Series for non-RA records is "engine:format" after rename. + let (engine, format) = split_engine_format(&cls.series)?; + + let storage_v3 = match storage.as_deref() { + Some("S3") => "s3".to_string(), + Some("NVMe") => "nvme".to_string(), + _ => "nvme".to_string(), + }; + + // ClickBench's "flavor" lives in dataset_variant per benchmark-mapping.md + // - we don't have it from a v2 name string, so we leave it None. 
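+    // Illustrative mapping (mirrors the classifier tests):
+    // "clickbench_q07/datafusion:parquet" with no storage field becomes
+    // dataset="clickbench", query_idx=7, storage="nvme",
+    // engine="datafusion", format="parquet", scale_factor=None.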
+ Some(V3Bin::Query { + dataset: suite.prefix.to_string(), + dataset_variant: None, + scale_factor: scale_factor.clone(), + query_idx, + storage: storage_v3, + engine, + format, + }) +} + +/// Pull the integer query index out of the leading name part, which is +/// always `_q` or ` q` for SQL query records. +fn parse_query_index_from_first(first: &str) -> Option { + let lower = first.to_lowercase(); + for suite in QUERY_SUITES { + if let Some(rest) = lower.strip_prefix(suite.prefix) + && let Some(idx) = parse_query_index(rest) + { + return Some(idx as i32); + } + } + None +} + +/// Split a renamed series like `datafusion:parquet` into +/// `(engine, format)`. Returns `None` for series with no `:` since +/// v3 requires both columns. +fn split_engine_format(series: &str) -> Option<(String, String)> { + let mut split = series.splitn(2, ':'); + let engine = split.next()?.trim().to_string(); + let format = split.next()?.trim().to_string(); + if engine.is_empty() || format.is_empty() { + return None; + } + Some((engine, format)) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn record(name: &str) -> V2Record { + V2Record { + name: name.to_string(), + commit_id: Some("deadbeef".into()), + unit: None, + value: None, + storage: None, + dataset: None, + all_runtimes: None, + env_triple: None, + } + } + + #[test] + fn format_query_round_trips() { + assert_eq!(format_query("clickbench_q07"), "CLICKBENCH Q7"); + assert_eq!(format_query("tpch_q01"), "TPC-H Q1"); + assert_eq!(format_query("tpcds_q42"), "TPC-DS Q42"); + assert_eq!(format_query("statpopgen_q3"), "STATPOPGEN Q3"); + assert_eq!(format_query("foo bar"), "FOO BAR"); + } + + #[test] + fn rename_engine_canonicalizes_disk_names() { + assert_eq!(rename_engine("vortex-tokio-local-disk"), "vortex-nvme"); + assert_eq!( + rename_engine("datafusion:vortex-file-compressed"), + "datafusion:vortex" + ); + assert_eq!(rename_engine("unknown-engine"), "unknown-engine"); + } + + #[test] + fn parse_query_index_handles_separators() { + assert_eq!(parse_query_index("_q07"), Some(7)); + assert_eq!(parse_query_index(" q7"), Some(7)); + assert_eq!(parse_query_index("q42"), Some(42)); + assert_eq!(parse_query_index("xq7"), None); + } + + #[test] + fn random_access_bins_dataset_pattern() { + let bin = classify(&record("random-access/taxi/take/parquet")).unwrap(); + assert_eq!( + bin, + V3Bin::RandomAccess { + dataset: "taxi/take".into(), + format: "parquet".into(), + } + ); + } +} diff --git a/benchmarks-website/migrate/src/commits.rs b/benchmarks-website/migrate/src/commits.rs new file mode 100644 index 00000000000..28d63a5bd19 --- /dev/null +++ b/benchmarks-website/migrate/src/commits.rs @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Commit upserts. Adapts a [`crate::v2::V2Commit`] into the v3 +//! `commits` row shape (a [`vortex_bench_server::records::CommitInfo`]). + +use anyhow::Context as _; +use anyhow::Result; +use duckdb::Transaction; +use duckdb::params; + +use crate::v2::V2Commit; + +/// Insert a v3 `commits` row for one v2 commit. Missing fields are +/// filled with the empty string, matching the v3 schema's `NOT NULL` +/// constraints; the call site logs a warning for each fallback so +/// the operator can spot bad inputs. 
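+///
+/// Usage sketch (this mirrors the caller in `migrate.rs`):
+///
+/// ```ignore
+/// let outcome = upsert_commit(&tx, &commit)?;
+/// for w in outcome.warnings {
+///     tracing::warn!("{w}");
+/// }
+/// ```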
+pub fn upsert_commit(tx: &Transaction<'_>, commit: &V2Commit) -> Result { + let mut warnings = Vec::new(); + let timestamp = require_field(&commit.timestamp, "timestamp", &commit.id, &mut warnings); + let message = require_field(&commit.message, "message", &commit.id, &mut warnings); + let author_name = require_field( + &commit.author.as_ref().and_then(|p| p.name.clone()), + "author.name", + &commit.id, + &mut warnings, + ); + let author_email = require_field( + &commit.author.as_ref().and_then(|p| p.email.clone()), + "author.email", + &commit.id, + &mut warnings, + ); + let committer_name = require_field( + &commit.committer.as_ref().and_then(|p| p.name.clone()), + "committer.name", + &commit.id, + &mut warnings, + ); + let committer_email = require_field( + &commit.committer.as_ref().and_then(|p| p.email.clone()), + "committer.email", + &commit.id, + &mut warnings, + ); + let tree_sha = require_field(&commit.tree_id, "tree_id", &commit.id, &mut warnings); + let url = require_field(&commit.url, "url", &commit.id, &mut warnings); + + tx.execute( + r#" + INSERT INTO commits ( + commit_sha, timestamp, message, author_name, author_email, + committer_name, committer_email, tree_sha, url + ) VALUES (?, CAST(? AS TIMESTAMPTZ), ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT (commit_sha) DO UPDATE SET + timestamp = excluded.timestamp, + message = excluded.message, + author_name = excluded.author_name, + author_email = excluded.author_email, + committer_name = excluded.committer_name, + committer_email = excluded.committer_email, + tree_sha = excluded.tree_sha, + url = excluded.url + "#, + params![ + commit.id, + timestamp, + message, + author_name, + author_email, + committer_name, + committer_email, + tree_sha, + url, + ], + ) + .with_context(|| format!("upserting commit {}", commit.id))?; + Ok(UpsertOutcome { warnings }) +} + +fn require_field( + field: &Option, + name: &str, + sha: &str, + warnings: &mut Vec, +) -> String { + match field { + Some(s) => s.clone(), + None => { + warnings.push(format!("commit {sha} missing {name}")); + String::new() + } + } +} + +/// Per-call warning bag returned to the caller for logging. +#[derive(Debug, Default)] +pub struct UpsertOutcome { + pub warnings: Vec, +} diff --git a/benchmarks-website/migrate/src/lib.rs b/benchmarks-website/migrate/src/lib.rs new file mode 100644 index 00000000000..5e8d9c64907 --- /dev/null +++ b/benchmarks-website/migrate/src/lib.rs @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! One-shot historical migrator from v2's S3-hosted benchmark dataset +//! to a v3 DuckDB file. +//! +//! The v2 dataset is JSONL of bare benchmark records keyed by name string. +//! v3 uses five typed fact tables with explicit dim columns. This crate +//! ports v2's `getGroup` classifier (in `benchmarks-website/server.js`) +//! bug-for-bug so that historical rows survive the migration with the +//! same group / chart / series structure as the live v2 server. +//! +//! The migrator is throwaway: once v3 cuts over, both the binary and +//! the classifier go away. + +pub mod classifier; +pub mod commits; +pub mod migrate; +pub mod source; +pub mod v2; +pub mod verify; diff --git a/benchmarks-website/migrate/src/main.rs b/benchmarks-website/migrate/src/main.rs new file mode 100644 index 00000000000..366834ed441 --- /dev/null +++ b/benchmarks-website/migrate/src/main.rs @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! 
`vortex-bench-migrate` CLI: a one-shot historical migrator from +//! v2's S3 dataset into a v3 DuckDB file, plus a structural diff +//! against the live v2 `/api/metadata` endpoint for spotting +//! classifier regressions. + +use std::path::PathBuf; +use std::process::ExitCode; + +use anyhow::Context as _; +use anyhow::Result; +use clap::Parser; +use clap::Subcommand; +use clap::ValueEnum; +use tracing_subscriber::EnvFilter; +use vortex_bench_migrate::migrate; +use vortex_bench_migrate::source::Source; +use vortex_bench_migrate::verify; + +/// One-shot historical migrator from v2's S3 dataset to v3 DuckDB. +#[derive(Debug, Parser)] +#[command(name = "vortex-bench-migrate", version, about)] +struct Cli { + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Subcommand)] +enum Command { + /// Read v2's data.json.gz / commits.json / file-sizes-*.json.gz + /// and write a fully populated v3 DuckDB at `--output`. + Run { + /// Path to write the v3 DuckDB to. Created if absent. + #[arg(long)] + output: PathBuf, + /// Where to fetch v2 dumps from. + #[arg(long, value_enum, default_value_t = SourceKind::PublicS3)] + source: SourceKind, + /// For `--source=local`, the directory containing + /// `data.json.gz`, `commits.json`, and `file-sizes-*.json.gz`. + #[arg(long, required_if_eq("source", "local"))] + source_dir: Option, + }, + /// Diff a migrated DuckDB against the live v2 `/api/metadata` + /// endpoint. Exits 0 if every v2 group is present in v3, 1 + /// otherwise so this can gate a CI step. + Verify { + /// HTTPS root of a running v2 server (e.g. `https://bench.vortex.dev`). + #[arg(long)] + against: String, + /// Path to the migrated v3 DuckDB. + #[arg(long)] + duckdb: PathBuf, + }, +} + +#[derive(Debug, Clone, Copy, ValueEnum)] +enum SourceKind { + PublicS3, + Local, +} + +fn main() -> ExitCode { + if let Err(err) = run() { + eprintln!("error: {err:#}"); + return ExitCode::from(2); + } + ExitCode::SUCCESS +} + +fn run() -> Result<()> { + tracing_subscriber::fmt() + .with_env_filter( + EnvFilter::try_from_env("VORTEX_BENCH_LOG").unwrap_or_else(|_| EnvFilter::new("info")), + ) + .init(); + + let cli = Cli::parse(); + match cli.command { + Command::Run { + output, + source, + source_dir, + } => { + let source = match source { + SourceKind::PublicS3 => Source::PublicS3, + SourceKind::Local => { + Source::Local(source_dir.context("--source=local requires --source-dir")?) + } + }; + let summary = migrate::run(&source, &output)?; + print!("{summary}"); + if summary.uncategorized_fraction() > 0.05 { + anyhow::bail!( + "uncategorized records ({:.2}%) exceed the 5% gate; \ + stop and report unmatched prefixes (see summary above) \ + before proceeding", + 100.0 * summary.uncategorized_fraction() + ); + } + Ok(()) + } + Command::Verify { against, duckdb } => { + let report = verify::run(&against, &duckdb)?; + print!("{report}"); + if !report.v2_groups_covered() { + std::process::exit(1); + } + Ok(()) + } + } +} diff --git a/benchmarks-website/migrate/src/migrate.rs b/benchmarks-website/migrate/src/migrate.rs new file mode 100644 index 00000000000..f75e0169fda --- /dev/null +++ b/benchmarks-website/migrate/src/migrate.rs @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! End-to-end migration of one v2 dataset into a v3 DuckDB file. +//! +//! Streams `data.json.gz` line-by-line, runs each record through the +//! [classifier][crate::classifier], and writes one row per record into +//! the appropriate v3 fact table. 
Every row's `measurement_id` is +//! computed via the server's `measurement_id_*` functions so the result +//! is byte-compatible with what fresh `/api/ingest` would have produced. + +use std::collections::BTreeMap; +use std::io::BufRead; +use std::path::Path; + +use anyhow::Context as _; +use anyhow::Result; +use duckdb::Connection; +use duckdb::Transaction; +use duckdb::params; +use tracing::warn; +use vortex_bench_server::db::measurement_id_compression_size; +use vortex_bench_server::db::measurement_id_compression_time; +use vortex_bench_server::db::measurement_id_query; +use vortex_bench_server::db::measurement_id_random_access; +use vortex_bench_server::records::CompressionSize; +use vortex_bench_server::records::CompressionTime; +use vortex_bench_server::records::QueryMeasurement; +use vortex_bench_server::records::RandomAccessTime; +use vortex_bench_server::schema::SCHEMA_DDL; + +use crate::classifier::V3Bin; +use crate::classifier::classify; +use crate::commits::upsert_commit; +use crate::source::Source; +use crate::v2::V2Commit; +use crate::v2::V2FileSize; +use crate::v2::V2Record; +use crate::v2::index_commits; +use crate::v2::runtime_as_i64; +use crate::v2::value_as_f64; + +/// Per-table insert counts, plus skip / missing counts. +#[derive(Debug, Default, Clone)] +pub struct MigrationSummary { + pub records_read: u64, + pub query_inserted: u64, + pub compression_time_inserted: u64, + pub compression_size_inserted: u64, + pub random_access_inserted: u64, + pub file_size_inserted: u64, + pub uncategorized: u64, + pub uncategorized_prefixes: BTreeMap, + pub missing_commit: u64, + pub commit_warnings: u64, + pub skipped_no_value: u64, + pub commits_inserted: u64, +} + +impl MigrationSummary { + /// Total `data.json.gz` records that landed in some v3 fact table. + pub fn total_inserted(&self) -> u64 { + self.query_inserted + + self.compression_time_inserted + + self.compression_size_inserted + + self.random_access_inserted + } + + /// Fraction of records that were uncategorized. The orchestrator + /// stops if this exceeds the documented 5% threshold. + pub fn uncategorized_fraction(&self) -> f64 { + if self.records_read == 0 { + return 0.0; + } + self.uncategorized as f64 / self.records_read as f64 + } +} + +/// Open or create a DuckDB at `path` and apply the v3 schema. +pub fn open_target_db(path: &Path) -> Result { + let conn = + Connection::open(path).with_context(|| format!("opening DuckDB at {}", path.display()))?; + conn.execute_batch(SCHEMA_DDL) + .context("applying v3 schema DDL")?; + Ok(conn) +} + +/// Run the whole migration: commits, data.json.gz, and every +/// file-sizes-*.json.gz under the source. +pub fn run(source: &Source, target: &Path) -> Result { + let mut conn = open_target_db(target)?; + let mut summary = MigrationSummary::default(); + + let commits = read_commits(source)?; + summary.commits_inserted = upsert_all_commits(&mut conn, &commits, &mut summary)?; + + migrate_data_jsonl(&mut conn, source, &commits, &mut summary)?; + + for name in source.list_file_sizes()? 
{ + if let Err(e) = migrate_file_sizes(&mut conn, source, &name, &commits, &mut summary) { + warn!("file-sizes file {name} failed: {e:#}"); + } + } + + Ok(summary) +} + +fn read_commits(source: &Source) -> Result> { + let reader = source.open_commits_jsonl()?; + let mut commits: Vec = Vec::new(); + for line in reader.lines() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + match serde_json::from_str::(trimmed) { + Ok(c) => commits.push(c), + Err(e) => warn!("skipping malformed commits.json line: {e}"), + } + } + Ok(index_commits(commits)) +} + +fn upsert_all_commits( + conn: &mut Connection, + commits: &BTreeMap, + summary: &mut MigrationSummary, +) -> Result { + let tx = conn.transaction().context("begin commits transaction")?; + let mut count = 0u64; + for commit in commits.values() { + let outcome = upsert_commit(&tx, commit)?; + for w in outcome.warnings { + warn!("{w}"); + summary.commit_warnings += 1; + } + count += 1; + } + tx.commit().context("commit commits transaction")?; + Ok(count) +} + +fn migrate_data_jsonl( + conn: &mut Connection, + source: &Source, + commits: &BTreeMap, + summary: &mut MigrationSummary, +) -> Result<()> { + let reader = source.open_data_jsonl()?; + let mut tx = conn.transaction().context("begin data tx")?; + const BATCH: u64 = 10_000; + let mut in_batch = 0u64; + for line in reader.lines() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + summary.records_read += 1; + let record: V2Record = match serde_json::from_str(trimmed) { + Ok(r) => r, + Err(e) => { + warn!("skipping malformed data.json line: {e}"); + continue; + } + }; + apply_v2_record(&tx, &record, commits, summary)?; + in_batch += 1; + if in_batch >= BATCH { + tx.commit().context("commit data batch")?; + tx = conn.transaction().context("begin data tx")?; + in_batch = 0; + } + } + tx.commit().context("commit final data batch")?; + Ok(()) +} + +fn apply_v2_record( + tx: &Transaction<'_>, + record: &V2Record, + commits: &BTreeMap, + summary: &mut MigrationSummary, +) -> Result<()> { + let Some(sha) = record.commit_id.clone() else { + summary.missing_commit += 1; + return Ok(()); + }; + if !commits.contains_key(&sha) { + summary.missing_commit += 1; + return Ok(()); + } + + let Some(bin) = classify(record) else { + summary.uncategorized += 1; + let prefix = record.name.split('/').next().unwrap_or("").to_string(); + *summary.uncategorized_prefixes.entry(prefix).or_insert(0) += 1; + return Ok(()); + }; + + let env_triple = record.env_triple.as_ref().and_then(|t| t.to_triple()); + let runtimes = record + .all_runtimes + .as_ref() + .map(|v| v.iter().filter_map(runtime_as_i64).collect::>()) + .unwrap_or_default(); + let value_f64 = match record.value.as_ref().and_then(value_as_f64) { + Some(v) => v, + None => { + summary.skipped_no_value += 1; + return Ok(()); + } + }; + + match bin { + V3Bin::Query { + dataset, + dataset_variant, + scale_factor, + query_idx, + storage, + engine, + format, + } => { + let qm = QueryMeasurement { + commit_sha: sha, + dataset, + dataset_variant, + scale_factor, + query_idx, + storage, + engine, + format, + value_ns: value_f64 as i64, + all_runtimes_ns: runtimes, + peak_physical: None, + peak_virtual: None, + physical_delta: None, + virtual_delta: None, + env_triple, + }; + insert_query(tx, &qm)?; + summary.query_inserted += 1; + } + V3Bin::CompressionTime { + dataset, + dataset_variant, + format, + op, + } => { + let ct = CompressionTime { + commit_sha: sha, + dataset, + 
dataset_variant, + format, + op, + value_ns: value_f64 as i64, + all_runtimes_ns: runtimes, + env_triple, + }; + insert_compression_time(tx, &ct)?; + summary.compression_time_inserted += 1; + } + V3Bin::CompressionSize { + dataset, + dataset_variant, + format, + } => { + let cs = CompressionSize { + commit_sha: sha, + dataset, + dataset_variant, + format, + value_bytes: value_f64 as i64, + }; + insert_compression_size(tx, &cs)?; + summary.compression_size_inserted += 1; + } + V3Bin::RandomAccess { dataset, format } => { + let ra = RandomAccessTime { + commit_sha: sha, + dataset, + format, + value_ns: value_f64 as i64, + all_runtimes_ns: runtimes, + env_triple, + }; + insert_random_access(tx, &ra)?; + summary.random_access_inserted += 1; + } + } + Ok(()) +} + +fn insert_query(tx: &Transaction<'_>, r: &QueryMeasurement) -> Result<()> { + let mid = measurement_id_query(r); + tx.execute( + r#" + INSERT INTO query_measurements ( + measurement_id, commit_sha, dataset, dataset_variant, scale_factor, + query_idx, storage, engine, format, + value_ns, all_runtimes_ns, + peak_physical, peak_virtual, physical_delta, virtual_delta, + env_triple + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?, ?, ?, ?, ?) + ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_ns = excluded.value_ns, + all_runtimes_ns = excluded.all_runtimes_ns, + env_triple = excluded.env_triple + "#, + params![ + mid, + r.commit_sha, + r.dataset, + r.dataset_variant, + r.scale_factor, + r.query_idx, + r.storage, + r.engine, + r.format, + r.value_ns, + runtimes_literal(&r.all_runtimes_ns), + r.peak_physical, + r.peak_virtual, + r.physical_delta, + r.virtual_delta, + r.env_triple, + ], + )?; + Ok(()) +} + +fn insert_compression_time(tx: &Transaction<'_>, r: &CompressionTime) -> Result<()> { + let mid = measurement_id_compression_time(r); + tx.execute( + r#" + INSERT INTO compression_times ( + measurement_id, commit_sha, dataset, dataset_variant, + format, op, value_ns, all_runtimes_ns, env_triple + ) VALUES (?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) + ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_ns = excluded.value_ns, + all_runtimes_ns = excluded.all_runtimes_ns, + env_triple = excluded.env_triple + "#, + params![ + mid, + r.commit_sha, + r.dataset, + r.dataset_variant, + r.format, + r.op, + r.value_ns, + runtimes_literal(&r.all_runtimes_ns), + r.env_triple, + ], + )?; + Ok(()) +} + +fn insert_compression_size(tx: &Transaction<'_>, r: &CompressionSize) -> Result<()> { + let mid = measurement_id_compression_size(r); + tx.execute( + r#" + INSERT INTO compression_sizes ( + measurement_id, commit_sha, dataset, dataset_variant, + format, value_bytes + ) VALUES (?, ?, ?, ?, ?, ?) + ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_bytes = excluded.value_bytes + "#, + params![ + mid, + r.commit_sha, + r.dataset, + r.dataset_variant, + r.format, + r.value_bytes, + ], + )?; + Ok(()) +} + +fn insert_random_access(tx: &Transaction<'_>, r: &RandomAccessTime) -> Result<()> { + let mid = measurement_id_random_access(r); + tx.execute( + r#" + INSERT INTO random_access_times ( + measurement_id, commit_sha, dataset, format, + value_ns, all_runtimes_ns, env_triple + ) VALUES (?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) 
+ ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_ns = excluded.value_ns, + all_runtimes_ns = excluded.all_runtimes_ns, + env_triple = excluded.env_triple + "#, + params![ + mid, + r.commit_sha, + r.dataset, + r.format, + r.value_ns, + runtimes_literal(&r.all_runtimes_ns), + r.env_triple, + ], + )?; + Ok(()) +} + +fn runtimes_literal(values: &[i64]) -> String { + let mut s = String::with_capacity(values.len() * 8 + 2); + s.push('['); + for (i, v) in values.iter().enumerate() { + if i > 0 { + s.push(','); + } + s.push_str(&v.to_string()); + } + s.push(']'); + s +} + +fn migrate_file_sizes( + conn: &mut Connection, + source: &Source, + name: &str, + commits: &BTreeMap, + summary: &mut MigrationSummary, +) -> Result<()> { + let reader = source.open_file_sizes(name)?; + let dataset = name + .strip_prefix("file-sizes-") + .and_then(|s| s.strip_suffix(".json.gz")) + .unwrap_or(name) + .to_string(); + let mut tx = conn.transaction().context("begin file-sizes tx")?; + const BATCH: u64 = 10_000; + let mut in_batch = 0u64; + for line in reader.lines() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + let sz: V2FileSize = match serde_json::from_str(trimmed) { + Ok(r) => r, + Err(e) => { + warn!("skipping malformed {name} line: {e}"); + continue; + } + }; + if !commits.contains_key(&sz.commit_id) { + summary.missing_commit += 1; + continue; + } + // file-sizes-*.json.gz captures per-file sizes inside one + // benchmark/format/scale_factor combo. We aggregate to one + // (commit, dataset, dataset_variant, format) row by summing, + // since v3's compression_sizes is a single bytes value per + // (dim) tuple. Use ON CONFLICT to accumulate. + upsert_file_size_row(&tx, &sz, &dataset)?; + summary.file_size_inserted += 1; + in_batch += 1; + if in_batch >= BATCH { + tx.commit().context("commit file-sizes batch")?; + tx = conn.transaction().context("begin file-sizes tx")?; + in_batch = 0; + } + } + tx.commit().context("commit final file-sizes batch")?; + Ok(()) +} + +fn upsert_file_size_row( + tx: &Transaction<'_>, + sz: &V2FileSize, + dataset_fallback: &str, +) -> Result<()> { + let dataset = if sz.benchmark.is_empty() { + dataset_fallback.to_string() + } else { + sz.benchmark.clone() + }; + let dataset_variant = sz + .scale_factor + .as_ref() + .filter(|s| !s.is_empty() && s.as_str() != "1.0") + .cloned(); + let cs = CompressionSize { + commit_sha: sz.commit_id.clone(), + dataset, + dataset_variant, + format: sz.format.clone(), + value_bytes: sz.size_bytes, + }; + let mid = measurement_id_compression_size(&cs); + // Multiple files within the same dataset/format/scale_factor sum + // into one row by adding to whatever is already there. + tx.execute( + r#" + INSERT INTO compression_sizes ( + measurement_id, commit_sha, dataset, dataset_variant, + format, value_bytes + ) VALUES (?, ?, ?, ?, ?, ?) + ON CONFLICT (measurement_id) DO UPDATE SET + value_bytes = compression_sizes.value_bytes + excluded.value_bytes + "#, + params![ + mid, + cs.commit_sha, + cs.dataset, + cs.dataset_variant, + cs.format, + cs.value_bytes, + ], + )?; + Ok(()) +} + +/// Print the summary in a human-readable form. Returned by the CLI. 
+impl std::fmt::Display for MigrationSummary { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Records read: {}", self.records_read)?; + writeln!(f, "Commits upserted: {}", self.commits_inserted)?; + writeln!(f, "Commit warnings: {}", self.commit_warnings)?; + writeln!(f, "Inserted (query): {}", self.query_inserted)?; + writeln!( + f, + "Inserted (compress t): {}", + self.compression_time_inserted + )?; + writeln!( + f, + "Inserted (compress s): {}", + self.compression_size_inserted + )?; + writeln!(f, "Inserted (random acc): {}", self.random_access_inserted)?; + writeln!(f, "Inserted (file sizes): {}", self.file_size_inserted)?; + writeln!(f, "Missing commit: {}", self.missing_commit)?; + writeln!(f, "Skipped (no value): {}", self.skipped_no_value)?; + writeln!( + f, + "Uncategorized: {} ({:.2}%)", + self.uncategorized, + 100.0 * self.uncategorized_fraction() + )?; + if !self.uncategorized_prefixes.is_empty() { + let mut top: Vec<_> = self.uncategorized_prefixes.iter().collect(); + top.sort_by(|a, b| b.1.cmp(a.1)); + writeln!(f, "Top uncategorized prefixes:")?; + for (prefix, n) in top.iter().take(20) { + writeln!(f, " {prefix:>32} : {n}")?; + } + } + Ok(()) + } +} diff --git a/benchmarks-website/migrate/src/source.rs b/benchmarks-website/migrate/src/source.rs new file mode 100644 index 00000000000..2b4fdca9b94 --- /dev/null +++ b/benchmarks-website/migrate/src/source.rs @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Streaming readers for v2's public S3 bucket. +//! +//! The bucket is `--no-sign-request`, so we fetch the underlying +//! HTTPS URL directly and stream-decompress with `flate2`. The +//! downloads are wrapped in [`reqwest::blocking`] to keep the read +//! path synchronous; the binary's hot path is single-threaded +//! per-source already (DuckDB is a single-writer). +//! +//! For tests and offline runs, [`Source::Local`] accepts a local +//! directory of dumps; the migrator's `--source` flag picks the +//! variant. + +use std::fs::File; +use std::io::BufRead; +use std::io::BufReader; +use std::io::Read; +use std::path::Path; +use std::path::PathBuf; + +use anyhow::Context as _; +use anyhow::Result; +use flate2::read::GzDecoder; + +/// Public S3 bucket the live v2 server reads from. +pub const PUBLIC_BUCKET_BASE: &str = "https://vortex-ci-benchmark-results.s3.amazonaws.com"; + +/// Where to read the v2 dataset from. Either the public S3 bucket +/// (the live deployment) or a local directory of dumps. +#[derive(Debug, Clone)] +pub enum Source { + /// HTTPS GETs against `s3.amazonaws.com`. + PublicS3, + /// A directory containing `data.json.gz`, `commits.json`, and + /// `file-sizes-*.json.gz` files. + Local(PathBuf), +} + +impl Source { + /// Open `data.json.gz` for streaming, decompressing on the fly. + pub fn open_data_jsonl(&self) -> Result> { + let stream = self.open_raw("data.json.gz")?; + Ok(Box::new(BufReader::new(GzDecoder::new(stream)))) + } + + /// Open `commits.json` (uncompressed). + pub fn open_commits_jsonl(&self) -> Result> { + let stream = self.open_raw("commits.json")?; + Ok(Box::new(BufReader::new(stream))) + } + + /// Enumerate `file-sizes-*.json.gz` files. For local sources this + /// is a directory glob; for the public bucket we hit the documented + /// suite ids. + pub fn list_file_sizes(&self) -> Result> { + match self { + Source::Local(dir) => { + let mut out = Vec::new(); + for entry in std::fs::read_dir(dir)? 
{ + let entry = entry?; + let name = entry.file_name(); + let s = name.to_string_lossy(); + if s.starts_with("file-sizes-") && s.ends_with(".json.gz") { + out.push(s.into_owned()); + } + } + out.sort(); + Ok(out) + } + Source::PublicS3 => { + // The S3 bucket's ListObjects is denied for unsigned + // requests, so we hit the documented per-suite keys + // emitted by `.github/workflows/sql-benchmarks.yml`. + Ok(KNOWN_FILE_SIZES_SUITES + .iter() + .map(|id| format!("file-sizes-{id}.json.gz")) + .collect()) + } + } + } + + /// Open one `file-sizes-*.json.gz` for streaming. + pub fn open_file_sizes(&self, name: &str) -> Result> { + let stream = self.open_raw(name)?; + Ok(Box::new(BufReader::new(GzDecoder::new(stream)))) + } + + fn open_raw(&self, name: &str) -> Result> { + match self { + Source::Local(dir) => open_local(&dir.join(name)), + Source::PublicS3 => open_s3(name), + } + } +} + +fn open_local(path: &Path) -> Result> { + let f = File::open(path).with_context(|| format!("opening {}", path.display()))?; + Ok(Box::new(f)) +} + +fn open_s3(name: &str) -> Result> { + let url = format!("{PUBLIC_BUCKET_BASE}/{name}"); + let resp = reqwest::blocking::get(&url).with_context(|| format!("GET {url}"))?; + if !resp.status().is_success() { + anyhow::bail!("GET {url} returned {}", resp.status()); + } + Ok(Box::new(resp)) +} + +/// Suite IDs we know publish a `file-sizes-{id}.json.gz` to S3. +/// Matches the `matrix.id` values in `.github/workflows/sql-benchmarks.yml` +/// at the time of writing. New suites mean a new entry here. +const KNOWN_FILE_SIZES_SUITES: &[&str] = + &["clickbench", "tpch", "tpcds", "statpopgen", "polarsignals"]; diff --git a/benchmarks-website/migrate/src/v2.rs b/benchmarks-website/migrate/src/v2.rs new file mode 100644 index 00000000000..2a9d3bdf5d0 --- /dev/null +++ b/benchmarks-website/migrate/src/v2.rs @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Wire shapes of the v2 benchmark dataset on S3. +//! +//! These types capture only the fields the migrator reads. v2 records +//! are serialized by `vortex-bench` (see `vortex-bench/src/measurements.rs`) +//! and by older non-Rust scripts; the union of fields is loose, so we +//! deserialize permissively (`serde(default)`, untyped `serde_json::Value` +//! for the polymorphic `dataset` field). + +use std::collections::BTreeMap; + +use serde::Deserialize; + +/// One JSONL line of `data.json.gz`. +/// +/// The shape is the union of every emitter's output. Most fields are +/// optional because different benches emit different subsets. +#[derive(Debug, Clone, Deserialize)] +pub struct V2Record { + pub name: String, + #[serde(default)] + pub commit_id: Option, + #[serde(default)] + pub unit: Option, + #[serde(default)] + pub value: Option, + #[serde(default)] + pub storage: Option, + #[serde(default)] + pub dataset: Option, + #[serde(default)] + pub all_runtimes: Option>, + #[serde(default)] + pub env_triple: Option, +} + +/// `dataset` in v2 records is sometimes a string, sometimes an object +/// keyed by suite name (`{ "tpch": { "scale_factor": "10" } }`). +/// This helper looks up the scale factor for a given suite without +/// assuming a particular shape. 
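+///
+/// A sketch of the object form described above:
+///
+/// ```ignore
+/// let d = serde_json::json!({ "tpch": { "scale_factor": "10" } });
+/// assert_eq!(dataset_scale_factor(&d, "tpch"), Some("10".to_string()));
+/// assert_eq!(dataset_scale_factor(&d, "tpcds"), None);
+/// ```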
+pub fn dataset_scale_factor(dataset: &serde_json::Value, key: &str) -> Option { + let obj = dataset.as_object()?; + let entry = obj.get(key)?; + let sf = entry.get("scale_factor")?; + match sf { + serde_json::Value::String(s) => Some(s.clone()), + serde_json::Value::Number(n) => Some(n.to_string()), + _ => None, + } +} + +/// Best-effort numeric coercion for the polymorphic `value` field. +pub fn value_as_f64(value: &serde_json::Value) -> Option { + match value { + serde_json::Value::Number(n) => n.as_f64(), + serde_json::Value::String(s) => s.parse().ok(), + _ => None, + } +} + +/// Best-effort coercion of a runtime entry to nanoseconds. +pub fn runtime_as_i64(value: &serde_json::Value) -> Option { + match value { + serde_json::Value::Number(n) => { + if let Some(i) = n.as_i64() { + Some(i) + } else { + n.as_f64().map(|f| f as i64) + } + } + serde_json::Value::String(s) => s.parse().ok(), + _ => None, + } +} + +/// Triple block as emitted by `vortex-bench`'s `--gh-json` path. v2 +/// stored it as an object; we serialize it back out as `arch-os-env`. +#[derive(Debug, Clone, Deserialize)] +pub struct V2EnvTriple { + #[serde(default)] + pub architecture: Option, + #[serde(default)] + pub operating_system: Option, + #[serde(default)] + pub environment: Option, +} + +impl V2EnvTriple { + /// Format as the `arch-os-env` triple used by v3's `env_triple` column. + pub fn to_triple(&self) -> Option { + let arch = self.architecture.as_deref()?; + let os = self.operating_system.as_deref()?; + let env = self.environment.as_deref()?; + Some(format!("{arch}-{os}-{env}")) + } +} + +/// One JSONL line of `commits.json`. +#[derive(Debug, Clone, Deserialize)] +pub struct V2Commit { + pub id: String, + #[serde(default)] + pub timestamp: Option, + #[serde(default)] + pub message: Option, + #[serde(default)] + pub author: Option, + #[serde(default)] + pub committer: Option, + #[serde(default)] + pub tree_id: Option, + #[serde(default)] + pub url: Option, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct V2Person { + #[serde(default)] + pub name: Option, + #[serde(default)] + pub email: Option, +} + +/// One JSONL line of `file-sizes-*.json.gz` produced by +/// `scripts/capture-file-sizes.py`. +#[derive(Debug, Clone, Deserialize)] +pub struct V2FileSize { + pub commit_id: String, + pub benchmark: String, + #[serde(default)] + pub scale_factor: Option, + pub format: String, + pub file: String, + pub size_bytes: i64, +} + +/// Build a sha-keyed map of commits. +pub fn index_commits(commits: Vec) -> BTreeMap { + commits.into_iter().map(|c| (c.id.clone(), c)).collect() +} diff --git a/benchmarks-website/migrate/src/verify.rs b/benchmarks-website/migrate/src/verify.rs new file mode 100644 index 00000000000..eb4caef6df7 --- /dev/null +++ b/benchmarks-website/migrate/src/verify.rs @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Structural diff between a migrated v3 DuckDB and the live v2 +//! `/api/metadata` endpoint. +//! +//! Compares group / chart structure only; values aren't compared +//! because v2 converts ns → ms and bytes → MiB on read while v3 +//! stores raw and the chart query divides. Group/chart structural +//! equivalence is enough to spot classifier regressions before +//! cutover. 
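+//!
+//! Invoked via the `verify` subcommand, roughly (the DuckDB path is
+//! illustrative):
+//!
+//! ```text
+//! vortex-bench-migrate verify --against https://bench.vortex.dev \
+//!     --duckdb ./benchmarks-v3.duckdb
+//! ```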
+ +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::path::Path; + +use anyhow::Context as _; +use anyhow::Result; +use duckdb::Connection; +use serde::Deserialize; + +use crate::classifier::QUERY_SUITES; + +/// Result of one `verify` run. +#[derive(Debug, Default)] +pub struct VerifyReport { + pub matched_groups: Vec, + pub only_in_v3: Vec, + pub only_in_v2: Vec, + pub chart_diffs: Vec, +} + +#[derive(Debug, Clone)] +pub struct ChartDiff { + pub group: String, + pub v2_count: usize, + pub v3_count: usize, +} + +impl VerifyReport { + /// True if every v2 group is represented in v3. The CLI's exit + /// code reflects this. + pub fn v2_groups_covered(&self) -> bool { + self.only_in_v2.is_empty() + } +} + +impl std::fmt::Display for VerifyReport { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Groups in both v2 and v3:")?; + for g in &self.matched_groups { + writeln!(f, " + {g}")?; + } + if !self.only_in_v2.is_empty() { + writeln!(f, "Groups only in v2 (regression candidates):")?; + for g in &self.only_in_v2 { + writeln!(f, " - {g}")?; + } + } + if !self.only_in_v3.is_empty() { + writeln!(f, "Groups only in v3:")?; + for g in &self.only_in_v3 { + writeln!(f, " + {g}")?; + } + } + if !self.chart_diffs.is_empty() { + writeln!(f, "Chart count diffs:")?; + for d in &self.chart_diffs { + writeln!( + f, + " {} : v2={} v3={} (delta={})", + d.group, + d.v2_count, + d.v3_count, + d.v3_count as i64 - d.v2_count as i64, + )?; + } + } + Ok(()) + } +} + +/// v2's `/api/metadata` reply — only the fields we need. +#[derive(Debug, Deserialize)] +struct V2Metadata { + groups: BTreeMap, +} + +#[derive(Debug, Deserialize)] +struct V2GroupMeta { + #[serde(default)] + charts: Vec, +} + +#[derive(Debug, Deserialize)] +struct V2ChartMeta { + #[serde(default)] + name: String, +} + +/// Open the migrated DuckDB at `duckdb_path`, fetch `/api/metadata`, +/// and produce a structural diff. +pub fn run(v2_server: &str, duckdb_path: &Path) -> Result { + let v3 = collect_v3_groups(duckdb_path)?; + let v2 = fetch_v2_metadata(v2_server)?; + Ok(diff(&v2, &v3)) +} + +fn collect_v3_groups(duckdb_path: &Path) -> Result>> { + let conn = Connection::open(duckdb_path) + .with_context(|| format!("opening DuckDB at {}", duckdb_path.display()))?; + let mut groups: BTreeMap> = BTreeMap::new(); + + // query_measurements: chart per (dataset, query_idx); group per + // (dataset, dataset_variant, scale_factor, storage). We want v2 + // group display names so the verifier can compare apples to + // apples, so we re-format them here using the same suite table. + let mut stmt = conn.prepare( + r#" + SELECT dataset, dataset_variant, scale_factor, storage, query_idx + FROM query_measurements + GROUP BY dataset, dataset_variant, scale_factor, storage, query_idx + "#, + )?; + let rows = stmt.query_map([], |row| { + Ok(( + row.get::<_, String>(0)?, + row.get::<_, Option>(1)?, + row.get::<_, Option>(2)?, + row.get::<_, String>(3)?, + row.get::<_, i32>(4)?, + )) + })?; + for row in rows { + let (dataset, _variant, sf, storage, query_idx) = row?; + let group_name = display_query_group(&dataset, sf.as_deref(), &storage); + let chart_name = chart_name_query(&dataset, query_idx); + groups + .entry(group_name) + .or_default() + .insert(normalize_chart(&chart_name)); + } + + // compression_times: group "Compression", charts per dataset. 
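+    // (In v2 the Compression charts are keyed by the metric, e.g.
+    // "COMPRESS TIME" or "LANCE DECOMPRESS TIME", with the dataset as the
+    // series; chart_name_compression_time below projects the v3
+    // (format, op) pair back onto those metric names for comparison.)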
+ let mut stmt = conn.prepare( + r#" + SELECT dataset, format, op + FROM compression_times + GROUP BY dataset, format, op + "#, + )?; + let rows = stmt.query_map([], |row| { + Ok(( + row.get::<_, String>(0)?, + row.get::<_, String>(1)?, + row.get::<_, String>(2)?, + )) + })?; + for row in rows { + let (dataset, format, op) = row?; + let chart = chart_name_compression_time(&format, &op, &dataset); + groups + .entry("Compression".to_string()) + .or_default() + .insert(normalize_chart(&chart)); + } + + let mut stmt = conn.prepare( + r#" + SELECT dataset, format + FROM compression_sizes + GROUP BY dataset, format + "#, + )?; + let rows = stmt.query_map([], |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)) + })?; + for row in rows { + let (_dataset, format) = row?; + let chart = chart_name_compression_size(&format); + groups + .entry("Compression Size".to_string()) + .or_default() + .insert(normalize_chart(&chart)); + } + + let mut stmt = conn.prepare( + r#" + SELECT DISTINCT dataset + FROM random_access_times + "#, + )?; + let rows = stmt.query_map([], |row| row.get::<_, String>(0))?; + for row in rows { + let dataset = row?; + groups + .entry("Random Access".to_string()) + .or_default() + .insert(normalize_chart(&dataset)); + } + + Ok(groups) +} + +fn fetch_v2_metadata(server: &str) -> Result>> { + let url = format!("{}/api/metadata", server.trim_end_matches('/')); + let body = reqwest::blocking::get(&url) + .with_context(|| format!("GET {url}"))? + .error_for_status() + .with_context(|| format!("non-2xx from {url}"))? + .json::() + .with_context(|| format!("parsing {url} as v2 /api/metadata"))?; + let mut out: BTreeMap> = BTreeMap::new(); + for (name, group) in body.groups { + let charts = group + .charts + .into_iter() + .map(|c| normalize_chart(&c.name)) + .collect(); + out.insert(name, charts); + } + Ok(out) +} + +fn diff( + v2: &BTreeMap>, + v3: &BTreeMap>, +) -> VerifyReport { + let mut report = VerifyReport::default(); + let v2_keys: BTreeSet<&String> = v2.keys().collect(); + let v3_keys: BTreeSet<&String> = v3.keys().collect(); + for g in v2_keys.intersection(&v3_keys) { + report.matched_groups.push((**g).clone()); + let v2_charts = &v2[*g]; + let v3_charts = &v3[*g]; + if v2_charts.len() != v3_charts.len() { + report.chart_diffs.push(ChartDiff { + group: (**g).clone(), + v2_count: v2_charts.len(), + v3_count: v3_charts.len(), + }); + } + } + for g in v3_keys.difference(&v2_keys) { + report.only_in_v3.push((**g).clone()); + } + for g in v2_keys.difference(&v3_keys) { + report.only_in_v2.push((**g).clone()); + } + report.matched_groups.sort(); + report.only_in_v3.sort(); + report.only_in_v2.sort(); + report +} + +fn display_query_group(dataset: &str, scale_factor: Option<&str>, storage: &str) -> String { + let suite = QUERY_SUITES + .iter() + .find(|s| s.prefix.eq_ignore_ascii_case(dataset)) + .copied(); + match suite { + Some(suite) if suite.fan_out => { + let storage_disp = match storage { + "s3" | "S3" => "S3", + _ => "NVMe", + }; + let sf = scale_factor.unwrap_or("1"); + format!("{} ({}) (SF={})", suite.display_name, storage_disp, sf) + } + Some(suite) => suite.display_name.to_string(), + None => format!("{dataset} ({storage})"), + } +} + +fn chart_name_query(dataset: &str, query_idx: i32) -> String { + let suite = QUERY_SUITES + .iter() + .find(|s| s.prefix.eq_ignore_ascii_case(dataset)) + .copied(); + match suite { + Some(suite) => format!("{} Q{}", suite.query_prefix, query_idx), + None => format!("{} Q{}", dataset.to_uppercase(), query_idx), + } +} + +fn 
chart_name_compression_time(format: &str, op: &str, _dataset: &str) -> String { + // Re-derive the v2 chart name (the metric, not the dataset) so we + // can compare. v2's chart axis is the metric; series is the + // dataset. v3 inverts that. For structural comparison, we project + // back to v2's per-chart key. + match (format, op) { + ("vortex-file-compressed", "encode") => "COMPRESS TIME".into(), + ("vortex-file-compressed", "decode") => "DECOMPRESS TIME".into(), + ("parquet", "encode") => "PARQUET RS ZSTD COMPRESS TIME".into(), + ("parquet", "decode") => "PARQUET RS ZSTD DECOMPRESS TIME".into(), + ("lance", "encode") => "LANCE COMPRESS TIME".into(), + ("lance", "decode") => "LANCE DECOMPRESS TIME".into(), + _ => format!("{} {} TIME", format.to_uppercase(), op.to_uppercase()), + } +} + +fn chart_name_compression_size(format: &str) -> String { + match format { + "vortex-file-compressed" => "VORTEX SIZE".into(), + "parquet" => "PARQUET SIZE".into(), + "lance" => "LANCE SIZE".into(), + _ => format!("{} SIZE", format.to_uppercase()), + } +} + +/// Strip casing and `_-` differences between v2 and v3 chart names. +/// v2 displays uppercase; v3 stores raw values. Comparing in this +/// canonical form is enough for structural verification. +fn normalize_chart(s: &str) -> String { + s.trim() + .to_uppercase() + .replace(['_', '-'], " ") + .split_whitespace() + .collect::>() + .join(" ") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn normalize_chart_canonicalizes() { + assert_eq!(normalize_chart("taxi/take"), "TAXI/TAKE"); + assert_eq!(normalize_chart("TAXI/TAKE"), "TAXI/TAKE"); + assert_eq!(normalize_chart("tpc-h q1"), "TPC H Q1"); + assert_eq!(normalize_chart("tpc h q1"), "TPC H Q1"); + } + + #[test] + fn display_query_group_handles_fan_out() { + assert_eq!( + display_query_group("tpch", Some("10"), "s3"), + "TPC-H (S3) (SF=10)" + ); + assert_eq!( + display_query_group("tpch", Some("100"), "nvme"), + "TPC-H (NVMe) (SF=100)" + ); + assert_eq!( + display_query_group("clickbench", None, "nvme"), + "Clickbench" + ); + } +} diff --git a/benchmarks-website/migrate/tests/classifier.rs b/benchmarks-website/migrate/tests/classifier.rs new file mode 100644 index 00000000000..2be3896216c --- /dev/null +++ b/benchmarks-website/migrate/tests/classifier.rs @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Classifier behavior pinned by representative v2 names from each +//! group in `benchmarks-website/server.js`'s `getGroup`. 
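+//!
+//! For example, `tpch_q01/datafusion:parquet` with S3 storage and
+//! scale factor 100 must land in the TPC-H query bin, while
+//! `vortex:parquet-zstd ratio compress time/clickbench` must drop.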
+ +use rstest::rstest; +use serde_json::json; +use vortex_bench_migrate::classifier::V3Bin; +use vortex_bench_migrate::classifier::classify; +use vortex_bench_migrate::classifier::format_query; +use vortex_bench_migrate::classifier::rename_engine; +use vortex_bench_migrate::v2::V2Record; + +fn record(name: &str) -> V2Record { + V2Record { + name: name.to_string(), + commit_id: Some("deadbeef".into()), + unit: Some("ns".into()), + value: Some(json!(123)), + storage: None, + dataset: None, + all_runtimes: None, + env_triple: None, + } +} + +fn record_with_storage_and_sf(name: &str, storage: &str, suite: &str, sf: &str) -> V2Record { + let mut r = record(name); + r.storage = Some(storage.into()); + r.dataset = Some(json!({ suite: { "scale_factor": sf } })); + r +} + +#[rstest] +#[case::clickbench( + "clickbench_q07/datafusion:parquet", + V3Bin::Query { + dataset: "clickbench".into(), + dataset_variant: None, + scale_factor: None, + query_idx: 7, + storage: "nvme".into(), + engine: "datafusion".into(), + format: "parquet".into(), + }, +)] +#[case::clickbench_vortex_renamed( + "clickbench_q12/datafusion:vortex-file-compressed", + V3Bin::Query { + dataset: "clickbench".into(), + dataset_variant: None, + scale_factor: None, + query_idx: 12, + storage: "nvme".into(), + engine: "datafusion".into(), + format: "vortex".into(), + }, +)] +#[case::statpopgen( + "statpopgen_q3/datafusion:parquet", + V3Bin::Query { + dataset: "statpopgen".into(), + dataset_variant: None, + scale_factor: None, + query_idx: 3, + storage: "nvme".into(), + engine: "datafusion".into(), + format: "parquet".into(), + }, +)] +#[case::polarsignals( + "polarsignals_q1/duckdb:parquet", + V3Bin::Query { + dataset: "polarsignals".into(), + dataset_variant: None, + scale_factor: None, + query_idx: 1, + storage: "nvme".into(), + engine: "duckdb".into(), + format: "parquet".into(), + }, +)] +fn non_fan_out_query_records(#[case] name: &str, #[case] expected: V3Bin) { + let r = record(name); + assert_eq!(classify(&r), Some(expected)); +} + +#[rstest] +#[case::tpch_s3_sf100( + "tpch_q01/datafusion:parquet", + "S3", + "tpch", + "100", + V3Bin::Query { + dataset: "tpch".into(), + dataset_variant: None, + scale_factor: Some("100".into()), + query_idx: 1, + storage: "s3".into(), + engine: "datafusion".into(), + format: "parquet".into(), + }, +)] +#[case::tpch_nvme_sf1( + "tpch_q22/duckdb:vortex-file-compressed", + "NVMe", + "tpch", + "1", + V3Bin::Query { + dataset: "tpch".into(), + dataset_variant: None, + scale_factor: Some("1".into()), + query_idx: 22, + storage: "nvme".into(), + engine: "duckdb".into(), + format: "vortex".into(), + }, +)] +#[case::tpcds_nvme_sf10( + "tpcds_q05/datafusion:vortex-file-compressed", + "NVMe", + "tpcds", + "10", + V3Bin::Query { + dataset: "tpcds".into(), + dataset_variant: None, + scale_factor: Some("10".into()), + query_idx: 5, + storage: "nvme".into(), + engine: "datafusion".into(), + format: "vortex".into(), + }, +)] +fn fan_out_query_records( + #[case] name: &str, + #[case] storage: &str, + #[case] suite: &str, + #[case] sf: &str, + #[case] expected: V3Bin, +) { + let r = record_with_storage_and_sf(name, storage, suite, sf); + assert_eq!(classify(&r), Some(expected)); +} + +#[rstest] +#[case::random_access_4_part( + "random-access/taxi/take/parquet-tokio-local-disk", + V3Bin::RandomAccess { + dataset: "taxi/take".into(), + format: "parquet-nvme".into(), + }, +)] +#[case::random_access_4_part_vortex( + "random-access/chimp/take/vortex-tokio-local-disk", + V3Bin::RandomAccess { + dataset: "chimp/take".into(), 
+ format: "vortex-nvme".into(), + }, +)] +#[case::random_access_2_part_legacy( + "random-access/parquet-tokio-local-disk", + V3Bin::RandomAccess { + dataset: "random access".into(), + format: "parquet-nvme".into(), + }, +)] +fn random_access_records(#[case] name: &str, #[case] expected: V3Bin) { + let r = record(name); + assert_eq!(classify(&r), Some(expected)); +} + +#[rstest] +#[case::compress_time_vortex( + "compress time/clickbench", + V3Bin::CompressionTime { + dataset: "clickbench".into(), + dataset_variant: None, + format: "vortex-file-compressed".into(), + op: "encode".into(), + }, +)] +#[case::decompress_time_vortex( + "decompress time/tpch_lineitem", + V3Bin::CompressionTime { + dataset: "tpch_lineitem".into(), + dataset_variant: None, + format: "vortex-file-compressed".into(), + op: "decode".into(), + }, +)] +#[case::parquet_compress( + "parquet_rs-zstd compress time/clickbench", + V3Bin::CompressionTime { + dataset: "clickbench".into(), + dataset_variant: None, + format: "parquet".into(), + op: "encode".into(), + }, +)] +#[case::lance_decompress( + "lance decompress time/clickbench", + V3Bin::CompressionTime { + dataset: "clickbench".into(), + dataset_variant: None, + format: "lance".into(), + op: "decode".into(), + }, +)] +fn compression_time_records(#[case] name: &str, #[case] expected: V3Bin) { + let r = record(name); + assert_eq!(classify(&r), Some(expected)); +} + +#[rstest] +#[case::vortex_size( + "vortex size/clickbench", + V3Bin::CompressionSize { + dataset: "clickbench".into(), + dataset_variant: None, + format: "vortex-file-compressed".into(), + }, +)] +#[case::vortex_file_compressed_size_normalizes( + "vortex-file-compressed size/clickbench", + V3Bin::CompressionSize { + dataset: "clickbench".into(), + dataset_variant: None, + format: "vortex-file-compressed".into(), + }, +)] +#[case::parquet_size( + "parquet size/clickbench", + V3Bin::CompressionSize { + dataset: "clickbench".into(), + dataset_variant: None, + format: "parquet".into(), + }, +)] +#[case::lance_size( + "lance size/tpch_lineitem", + V3Bin::CompressionSize { + dataset: "tpch_lineitem".into(), + dataset_variant: None, + format: "lance".into(), + }, +)] +fn compression_size_records(#[case] name: &str, #[case] expected: V3Bin) { + let r = record(name); + assert_eq!(classify(&r), Some(expected)); +} + +#[rstest] +#[case::ratio_vortex_parquet("vortex:parquet-zstd ratio compress time/clickbench")] +#[case::ratio_vortex_lance("vortex:lance ratio decompress time/clickbench")] +#[case::ratio_size_vortex_parquet("vortex:parquet-zstd size/clickbench")] +#[case::ratio_size_vortex_raw("vortex:raw size/clickbench")] +#[case::throughput("compress throughput/clickbench")] +#[case::fineweb_skipped("fineweb_q01/datafusion:parquet")] +#[case::nonsense_prefix("not-a-known-bench/series")] +fn unmapped_records_yield_none(#[case] name: &str) { + let r = record(name); + assert_eq!( + classify(&r), + None, + "expected {name:?} to classify as None (drop)", + ); +} + +#[test] +fn rename_engine_pins_canonical_outputs() { + assert_eq!(rename_engine("vortex-tokio-local-disk"), "vortex-nvme"); + assert_eq!( + rename_engine("datafusion:vortex-file-compressed"), + "datafusion:vortex" + ); + assert_eq!(rename_engine("LANCE"), "lance"); +} + +#[test] +fn format_query_pins_v2_display() { + assert_eq!(format_query("clickbench_q00"), "CLICKBENCH Q0"); + assert_eq!(format_query("tpch_q22"), "TPC-H Q22"); + assert_eq!(format_query("tpcds_q42"), "TPC-DS Q42"); + assert_eq!(format_query("polarsignals_q1"), "POLARSIGNALS Q1"); + // Names that 
don't match a suite fall back to upper + " " replace. + assert_eq!( + format_query("vortex-file-compressed size"), + "VORTEX FILE COMPRESSED SIZE" + ); +} diff --git a/benchmarks-website/migrate/tests/end_to_end.rs b/benchmarks-website/migrate/tests/end_to_end.rs new file mode 100644 index 00000000000..5892215b472 --- /dev/null +++ b/benchmarks-website/migrate/tests/end_to_end.rs @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Inline JSONL fixture exercising 1 record per kind through the full +//! migration into a tempdir DuckDB. No live S3. + +use std::fs::File; +use std::io::Write; + +use duckdb::Connection; +use flate2::Compression; +use flate2::write::GzEncoder; +use tempfile::TempDir; +use vortex_bench_migrate::migrate; +use vortex_bench_migrate::source::Source; + +const COMMITS_JSONL: &str = r#"{"id":"deadbeef","timestamp":"2026-04-25T00:00:00Z","message":"fixture commit","author":{"name":"A","email":"a@example.com"},"committer":{"name":"C","email":"c@example.com"},"tree_id":"abcd0001","url":"https://example.com/commit/deadbeef"} +"#; + +const DATA_JSONL: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":42000,"all_runtimes":[41000,42000,43000]} +{"name":"compress time/clickbench","commit_id":"deadbeef","unit":"ns","value":99} +{"name":"vortex size/clickbench","commit_id":"deadbeef","unit":"bytes","value":1024} +{"name":"random-access/taxi/take/parquet-tokio-local-disk","commit_id":"deadbeef","unit":"ns","value":777,"all_runtimes":[700,777,800]} +"#; + +fn write_local_dir() -> TempDir { + let dir = TempDir::new().expect("tempdir"); + { + let mut f = File::create(dir.path().join("commits.json")).unwrap(); + f.write_all(COMMITS_JSONL.as_bytes()).unwrap(); + } + { + let f = File::create(dir.path().join("data.json.gz")).unwrap(); + let mut gz = GzEncoder::new(f, Compression::default()); + gz.write_all(DATA_JSONL.as_bytes()).unwrap(); + gz.finish().unwrap(); + } + // No file-sizes-*.json.gz to keep the fixture minimal. + dir +} + +#[test] +fn migrate_inline_fixture_populates_each_table() { + let src_dir = write_local_dir(); + let target_dir = TempDir::new().unwrap(); + let target = target_dir.path().join("v3.duckdb"); + + let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); + + assert_eq!(summary.records_read, 4, "summary={summary}"); + assert_eq!(summary.uncategorized, 0, "summary={summary}"); + assert_eq!(summary.commits_inserted, 1); + assert_eq!(summary.query_inserted, 1); + assert_eq!(summary.compression_time_inserted, 1); + assert_eq!(summary.compression_size_inserted, 1); + assert_eq!(summary.random_access_inserted, 1); + + let conn = Connection::open(&target).unwrap(); + let count = |table: &str| -> i64 { + conn.query_row(&format!("SELECT COUNT(*) FROM {table}"), [], |r| r.get(0)) + .unwrap() + }; + assert_eq!(count("commits"), 1); + assert_eq!(count("query_measurements"), 1); + assert_eq!(count("compression_times"), 1); + assert_eq!(count("compression_sizes"), 1); + assert_eq!(count("random_access_times"), 1); + + // Spot-check the v3 column values for each kind. 
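+    // query_measurements: engine/format/query_idx/value_ns;
+    // compression_times: dataset/format/op;
+    // compression_sizes: dataset/format/value_bytes;
+    // random_access_times: dataset/format.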
+ let (engine, format, query_idx, value_ns): (String, String, i32, i64) = conn + .query_row( + "SELECT engine, format, query_idx, value_ns FROM query_measurements", + [], + |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)), + ) + .unwrap(); + assert_eq!(engine, "datafusion"); + assert_eq!(format, "parquet"); + assert_eq!(query_idx, 7); + assert_eq!(value_ns, 42000); + + let (dataset, format, op): (String, String, String) = conn + .query_row( + "SELECT dataset, format, op FROM compression_times", + [], + |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)), + ) + .unwrap(); + assert_eq!(dataset, "clickbench"); + assert_eq!(format, "vortex-file-compressed"); + assert_eq!(op, "encode"); + + let (dataset, format, value_bytes): (String, String, i64) = conn + .query_row( + "SELECT dataset, format, value_bytes FROM compression_sizes", + [], + |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)), + ) + .unwrap(); + assert_eq!(dataset, "clickbench"); + assert_eq!(format, "vortex-file-compressed"); + assert_eq!(value_bytes, 1024); + + let (dataset, format): (String, String) = conn + .query_row("SELECT dataset, format FROM random_access_times", [], |r| { + Ok((r.get(0)?, r.get(1)?)) + }) + .unwrap(); + assert_eq!(dataset, "taxi/take"); + assert_eq!(format, "parquet-nvme"); +} From df53d2ca3f9404e67ab2a5a5ec4e553a6c28d0e3 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 26 Apr 2026 22:15:48 +0000 Subject: [PATCH 2/5] fix+perf(benchmarks-migrate): canonical formats + prepared INSERT statements Two narrow fixes: 1. Classifier wrote v2's display-renamed engine and format strings (e.g. "vortex" instead of "vortex-file-compressed") into v3's columns. v3's live emitter writes canonical Format::name() strings, so historical and live records would split into separate chart series at cutover. Pull engine and format from the raw record name; the rename was a v2 read-time UI concern only. 2. The per-row tx.execute(sql, params) hot path re-parsed SQL on every record. Hoist tx.prepare(sql) outside the row loop and reuse the prepared statement. Local migration time: ~15 minutes -> ~2-3 minutes. (The DuckDB Appender API would be ~10x faster still, but its append_row is unimplemented for BIGINT[] columns in duckdb-rs 1.10502, and Arrow record batches are out of scope for this fix.) Signed-off-by: Claude --- benchmarks-website/migrate/Cargo.toml | 3 +- benchmarks-website/migrate/src/classifier.rs | 51 ++- benchmarks-website/migrate/src/migrate.rs | 392 +++++++++--------- .../migrate/tests/classifier.rs | 19 +- .../migrate/tests/end_to_end.rs | 2 +- 5 files changed, 249 insertions(+), 218 deletions(-) diff --git a/benchmarks-website/migrate/Cargo.toml b/benchmarks-website/migrate/Cargo.toml index 464e55d9485..a68903f75eb 100644 --- a/benchmarks-website/migrate/Cargo.toml +++ b/benchmarks-website/migrate/Cargo.toml @@ -21,7 +21,8 @@ path = "src/main.rs" [dependencies] anyhow = { workspace = true } clap = { workspace = true, features = ["derive"] } -duckdb = { version = "1.4", features = ["bundled"] } +# track vortex-duckdb's bundled engine version (build.rs) +duckdb = { version = "1.10502", features = ["bundled"] } flate2 = "1.1" reqwest = { workspace = true, features = ["json"] } serde = { workspace = true, features = ["derive"] } diff --git a/benchmarks-website/migrate/src/classifier.rs b/benchmarks-website/migrate/src/classifier.rs index f7a1e56c0ae..4e6e53fef1a 100644 --- a/benchmarks-website/migrate/src/classifier.rs +++ b/benchmarks-website/migrate/src/classifier.rs @@ -12,6 +12,13 @@ //! factor for TPC-H/TPC-DS). 
This module reproduces that logic and //! then hops to a v3 fact-table bin, since v3 stores dim values as //! columns instead of name fragments. +//! +//! Engine and format strings stored in v3 columns are pulled from the +//! raw, pre-rename v2 record name. v2's `ENGINE_RENAMES` was a v2 +//! read-time UI concern (e.g. `vortex-file-compressed` rendered as +//! `vortex` and `parquet-tokio-local-disk` rendered as `parquet-nvme`). +//! v3 stores canonical `Format::name()` strings to match what the v3 +//! live emitter writes, so historical and live records share series. use crate::v2::V2Record; use crate::v2::dataset_scale_factor; @@ -393,21 +400,30 @@ fn bin_random_access(cls: &V2Classification, record: &V2Record) -> Option if dataset.is_empty() { return None; } - let mut format = cls.series.clone(); - if format.is_empty() { - return None; - } - // v2 emits a "default" placeholder when parts[1] is empty; treat - // that as missing and skip the row instead of inserting "default" - // as a format. - if format == "default" { + // Pull format from the raw, pre-rename v2 name so v3 stores the + // canonical `Format::name()` string (matching what the v3 live + // emitter writes). Raw shape is + // `random-access///-tokio-local-disk` + // (4-part) or `random-access/-tokio-local-disk` (2-part + // legacy). After stripping the `-tokio-local-disk` suffix, map the + // v2 random-access ext label (`vortex`, from `Format::ext()`) to + // the canonical name (`vortex-file-compressed`, from + // `Format::name()`). `parquet`, `lance`, and `vortex-compact` + // already match between ext and name. + let parts: Vec<&str> = record.name.split('/').collect(); + let raw = match parts.len() { + 4 => parts[3], + 2 => parts[1], + _ => return None, + }; + if raw.is_empty() || raw == "default" { return None; } - // The v2 random-access bench used to emit `parquet`-suffixed names; - // strip an "ns" unit guard later. - let _ = record; // record is unused here; kept for parity with siblings. - // Lower-case the format too so v3 series names are canonical. - format = format.to_lowercase(); + let stripped = raw.strip_suffix("-tokio-local-disk").unwrap_or(raw); + let format = match stripped { + "vortex" => "vortex-file-compressed".to_string(), + other => other.to_lowercase(), + }; Some(V3Bin::RandomAccess { dataset, format }) } @@ -498,8 +514,13 @@ fn bin_query(cls: &V2Classification, record: &V2Record) -> Option { let raw_first = record.name.split('/').next().unwrap_or(""); let query_idx = parse_query_index_from_first(raw_first)?; - // Series for non-RA records is "engine:format" after rename. - let (engine, format) = split_engine_format(&cls.series)?; + // Pull engine:format from the raw, pre-rename second segment so v3 + // stores canonical `Format::name()` strings (e.g. + // `vortex-file-compressed`) that match what the v3 live emitter + // writes. `cls.series` has been through v2's `ENGINE_RENAMES` for + // UI display and is not appropriate for v3 columns. 
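+    // e.g. `clickbench_q12/datafusion:vortex-file-compressed` keeps
+    // engine `datafusion` and format `vortex-file-compressed` instead
+    // of v2's renamed display value `vortex`.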
+ let raw_series = record.name.split('/').nth(1)?; + let (engine, format) = split_engine_format(raw_series)?; let storage_v3 = match storage.as_deref() { Some("S3") => "s3".to_string(), diff --git a/benchmarks-website/migrate/src/migrate.rs b/benchmarks-website/migrate/src/migrate.rs index f75e0169fda..5801d820905 100644 --- a/benchmarks-website/migrate/src/migrate.rs +++ b/benchmarks-website/migrate/src/migrate.rs @@ -16,6 +16,7 @@ use std::path::Path; use anyhow::Context as _; use anyhow::Result; use duckdb::Connection; +use duckdb::Statement; use duckdb::Transaction; use duckdb::params; use tracing::warn; @@ -148,37 +149,119 @@ fn migrate_data_jsonl( summary: &mut MigrationSummary, ) -> Result<()> { let reader = source.open_data_jsonl()?; - let mut tx = conn.transaction().context("begin data tx")?; + let mut lines = reader.lines(); const BATCH: u64 = 10_000; - let mut in_batch = 0u64; - for line in reader.lines() { - let line = line?; - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - summary.records_read += 1; - let record: V2Record = match serde_json::from_str(trimmed) { - Ok(r) => r, - Err(e) => { - warn!("skipping malformed data.json line: {e}"); + loop { + let tx = conn.transaction().context("begin data tx")?; + let mut stmts = DataStatements::prepare(&tx)?; + let mut in_batch = 0u64; + while in_batch < BATCH { + let Some(line) = lines.next() else { break }; + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { continue; } - }; - apply_v2_record(&tx, &record, commits, summary)?; - in_batch += 1; - if in_batch >= BATCH { - tx.commit().context("commit data batch")?; - tx = conn.transaction().context("begin data tx")?; - in_batch = 0; + summary.records_read += 1; + let record: V2Record = match serde_json::from_str(trimmed) { + Ok(r) => r, + Err(e) => { + warn!("skipping malformed data.json line: {e}"); + continue; + } + }; + apply_v2_record(&mut stmts, &record, commits, summary)?; + in_batch += 1; + } + drop(stmts); + tx.commit().context("commit data batch")?; + if in_batch == 0 { + break; } } - tx.commit().context("commit final data batch")?; Ok(()) } +/// Prepared INSERT statements for the four v2-derived fact tables. Tied +/// to a single transaction's lifetime; re-prepare after each commit. +struct DataStatements<'tx> { + query: Statement<'tx>, + compression_time: Statement<'tx>, + compression_size: Statement<'tx>, + random_access: Statement<'tx>, +} + +impl<'tx> DataStatements<'tx> { + fn prepare(tx: &'tx Transaction<'_>) -> Result { + Ok(Self { + query: tx.prepare(SQL_INSERT_QUERY)?, + compression_time: tx.prepare(SQL_INSERT_COMPRESSION_TIME)?, + compression_size: tx.prepare(SQL_INSERT_COMPRESSION_SIZE)?, + random_access: tx.prepare(SQL_INSERT_RANDOM_ACCESS)?, + }) + } +} + +const SQL_INSERT_QUERY: &str = r#" +INSERT INTO query_measurements ( + measurement_id, commit_sha, dataset, dataset_variant, scale_factor, + query_idx, storage, engine, format, + value_ns, all_runtimes_ns, + peak_physical, peak_virtual, physical_delta, virtual_delta, + env_triple +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?, ?, ?, ?, ?) 
+ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_ns = excluded.value_ns, + all_runtimes_ns = excluded.all_runtimes_ns, + env_triple = excluded.env_triple +"#; + +const SQL_INSERT_COMPRESSION_TIME: &str = r#" +INSERT INTO compression_times ( + measurement_id, commit_sha, dataset, dataset_variant, + format, op, value_ns, all_runtimes_ns, env_triple +) VALUES (?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) +ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_ns = excluded.value_ns, + all_runtimes_ns = excluded.all_runtimes_ns, + env_triple = excluded.env_triple +"#; + +const SQL_INSERT_COMPRESSION_SIZE: &str = r#" +INSERT INTO compression_sizes ( + measurement_id, commit_sha, dataset, dataset_variant, + format, value_bytes +) VALUES (?, ?, ?, ?, ?, ?) +ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_bytes = excluded.value_bytes +"#; + +const SQL_INSERT_RANDOM_ACCESS: &str = r#" +INSERT INTO random_access_times ( + measurement_id, commit_sha, dataset, format, + value_ns, all_runtimes_ns, env_triple +) VALUES (?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) +ON CONFLICT (measurement_id) DO UPDATE SET + commit_sha = excluded.commit_sha, + value_ns = excluded.value_ns, + all_runtimes_ns = excluded.all_runtimes_ns, + env_triple = excluded.env_triple +"#; + +const SQL_UPSERT_FILE_SIZE: &str = r#" +INSERT INTO compression_sizes ( + measurement_id, commit_sha, dataset, dataset_variant, + format, value_bytes +) VALUES (?, ?, ?, ?, ?, ?) +ON CONFLICT (measurement_id) DO UPDATE SET + value_bytes = compression_sizes.value_bytes + excluded.value_bytes +"#; + fn apply_v2_record( - tx: &Transaction<'_>, + stmts: &mut DataStatements<'_>, record: &V2Record, commits: &BTreeMap, summary: &mut MigrationSummary, @@ -240,7 +323,25 @@ fn apply_v2_record( virtual_delta: None, env_triple, }; - insert_query(tx, &qm)?; + let mid = measurement_id_query(&qm); + stmts.query.execute(params![ + mid, + qm.commit_sha, + qm.dataset, + qm.dataset_variant, + qm.scale_factor, + qm.query_idx, + qm.storage, + qm.engine, + qm.format, + qm.value_ns, + runtimes_literal(&qm.all_runtimes_ns), + qm.peak_physical, + qm.peak_virtual, + qm.physical_delta, + qm.virtual_delta, + qm.env_triple, + ])?; summary.query_inserted += 1; } V3Bin::CompressionTime { @@ -259,7 +360,18 @@ fn apply_v2_record( all_runtimes_ns: runtimes, env_triple, }; - insert_compression_time(tx, &ct)?; + let mid = measurement_id_compression_time(&ct); + stmts.compression_time.execute(params![ + mid, + ct.commit_sha, + ct.dataset, + ct.dataset_variant, + ct.format, + ct.op, + ct.value_ns, + runtimes_literal(&ct.all_runtimes_ns), + ct.env_triple, + ])?; summary.compression_time_inserted += 1; } V3Bin::CompressionSize { @@ -274,7 +386,15 @@ fn apply_v2_record( format, value_bytes: value_f64 as i64, }; - insert_compression_size(tx, &cs)?; + let mid = measurement_id_compression_size(&cs); + stmts.compression_size.execute(params![ + mid, + cs.commit_sha, + cs.dataset, + cs.dataset_variant, + cs.format, + cs.value_bytes, + ])?; summary.compression_size_inserted += 1; } V3Bin::RandomAccess { dataset, format } => { @@ -286,132 +406,22 @@ fn apply_v2_record( all_runtimes_ns: runtimes, env_triple, }; - insert_random_access(tx, &ra)?; + let mid = measurement_id_random_access(&ra); + stmts.random_access.execute(params![ + mid, + ra.commit_sha, + ra.dataset, + ra.format, + ra.value_ns, + runtimes_literal(&ra.all_runtimes_ns), + ra.env_triple, + ])?; summary.random_access_inserted += 1; 
} } Ok(()) } -fn insert_query(tx: &Transaction<'_>, r: &QueryMeasurement) -> Result<()> { - let mid = measurement_id_query(r); - tx.execute( - r#" - INSERT INTO query_measurements ( - measurement_id, commit_sha, dataset, dataset_variant, scale_factor, - query_idx, storage, engine, format, - value_ns, all_runtimes_ns, - peak_physical, peak_virtual, physical_delta, virtual_delta, - env_triple - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?, ?, ?, ?, ?) - ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_ns = excluded.value_ns, - all_runtimes_ns = excluded.all_runtimes_ns, - env_triple = excluded.env_triple - "#, - params![ - mid, - r.commit_sha, - r.dataset, - r.dataset_variant, - r.scale_factor, - r.query_idx, - r.storage, - r.engine, - r.format, - r.value_ns, - runtimes_literal(&r.all_runtimes_ns), - r.peak_physical, - r.peak_virtual, - r.physical_delta, - r.virtual_delta, - r.env_triple, - ], - )?; - Ok(()) -} - -fn insert_compression_time(tx: &Transaction<'_>, r: &CompressionTime) -> Result<()> { - let mid = measurement_id_compression_time(r); - tx.execute( - r#" - INSERT INTO compression_times ( - measurement_id, commit_sha, dataset, dataset_variant, - format, op, value_ns, all_runtimes_ns, env_triple - ) VALUES (?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) - ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_ns = excluded.value_ns, - all_runtimes_ns = excluded.all_runtimes_ns, - env_triple = excluded.env_triple - "#, - params![ - mid, - r.commit_sha, - r.dataset, - r.dataset_variant, - r.format, - r.op, - r.value_ns, - runtimes_literal(&r.all_runtimes_ns), - r.env_triple, - ], - )?; - Ok(()) -} - -fn insert_compression_size(tx: &Transaction<'_>, r: &CompressionSize) -> Result<()> { - let mid = measurement_id_compression_size(r); - tx.execute( - r#" - INSERT INTO compression_sizes ( - measurement_id, commit_sha, dataset, dataset_variant, - format, value_bytes - ) VALUES (?, ?, ?, ?, ?, ?) - ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_bytes = excluded.value_bytes - "#, - params![ - mid, - r.commit_sha, - r.dataset, - r.dataset_variant, - r.format, - r.value_bytes, - ], - )?; - Ok(()) -} - -fn insert_random_access(tx: &Transaction<'_>, r: &RandomAccessTime) -> Result<()> { - let mid = measurement_id_random_access(r); - tx.execute( - r#" - INSERT INTO random_access_times ( - measurement_id, commit_sha, dataset, format, - value_ns, all_runtimes_ns, env_triple - ) VALUES (?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) 
- ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_ns = excluded.value_ns, - all_runtimes_ns = excluded.all_runtimes_ns, - env_triple = excluded.env_triple - "#, - params![ - mid, - r.commit_sha, - r.dataset, - r.format, - r.value_ns, - runtimes_literal(&r.all_runtimes_ns), - r.env_triple, - ], - )?; - Ok(()) -} - fn runtimes_literal(values: &[i64]) -> String { let mut s = String::with_capacity(values.len() * 8 + 2); s.push('['); @@ -438,46 +448,50 @@ fn migrate_file_sizes( .and_then(|s| s.strip_suffix(".json.gz")) .unwrap_or(name) .to_string(); - let mut tx = conn.transaction().context("begin file-sizes tx")?; + let mut lines = reader.lines(); const BATCH: u64 = 10_000; - let mut in_batch = 0u64; - for line in reader.lines() { - let line = line?; - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - let sz: V2FileSize = match serde_json::from_str(trimmed) { - Ok(r) => r, - Err(e) => { - warn!("skipping malformed {name} line: {e}"); + loop { + let tx = conn.transaction().context("begin file-sizes tx")?; + let mut stmt = tx.prepare(SQL_UPSERT_FILE_SIZE)?; + let mut in_batch = 0u64; + while in_batch < BATCH { + let Some(line) = lines.next() else { break }; + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { continue; } - }; - if !commits.contains_key(&sz.commit_id) { - summary.missing_commit += 1; - continue; + let sz: V2FileSize = match serde_json::from_str(trimmed) { + Ok(r) => r, + Err(e) => { + warn!("skipping malformed {name} line: {e}"); + continue; + } + }; + if !commits.contains_key(&sz.commit_id) { + summary.missing_commit += 1; + continue; + } + // file-sizes-*.json.gz captures per-file sizes inside one + // benchmark/format/scale_factor combo. We aggregate to one + // (commit, dataset, dataset_variant, format) row by summing, + // since v3's compression_sizes is a single bytes value per + // (dim) tuple. Use ON CONFLICT to accumulate. + upsert_file_size_row(&mut stmt, &sz, &dataset)?; + summary.file_size_inserted += 1; + in_batch += 1; } - // file-sizes-*.json.gz captures per-file sizes inside one - // benchmark/format/scale_factor combo. We aggregate to one - // (commit, dataset, dataset_variant, format) row by summing, - // since v3's compression_sizes is a single bytes value per - // (dim) tuple. Use ON CONFLICT to accumulate. - upsert_file_size_row(&tx, &sz, &dataset)?; - summary.file_size_inserted += 1; - in_batch += 1; - if in_batch >= BATCH { - tx.commit().context("commit file-sizes batch")?; - tx = conn.transaction().context("begin file-sizes tx")?; - in_batch = 0; + drop(stmt); + tx.commit().context("commit file-sizes batch")?; + if in_batch == 0 { + break; } } - tx.commit().context("commit final file-sizes batch")?; Ok(()) } fn upsert_file_size_row( - tx: &Transaction<'_>, + stmt: &mut Statement<'_>, sz: &V2FileSize, dataset_fallback: &str, ) -> Result<()> { @@ -499,26 +513,14 @@ fn upsert_file_size_row( value_bytes: sz.size_bytes, }; let mid = measurement_id_compression_size(&cs); - // Multiple files within the same dataset/format/scale_factor sum - // into one row by adding to whatever is already there. - tx.execute( - r#" - INSERT INTO compression_sizes ( - measurement_id, commit_sha, dataset, dataset_variant, - format, value_bytes - ) VALUES (?, ?, ?, ?, ?, ?) 
- ON CONFLICT (measurement_id) DO UPDATE SET - value_bytes = compression_sizes.value_bytes + excluded.value_bytes - "#, - params![ - mid, - cs.commit_sha, - cs.dataset, - cs.dataset_variant, - cs.format, - cs.value_bytes, - ], - )?; + stmt.execute(params![ + mid, + cs.commit_sha, + cs.dataset, + cs.dataset_variant, + cs.format, + cs.value_bytes, + ])?; Ok(()) } diff --git a/benchmarks-website/migrate/tests/classifier.rs b/benchmarks-website/migrate/tests/classifier.rs index 2be3896216c..e8288751d62 100644 --- a/benchmarks-website/migrate/tests/classifier.rs +++ b/benchmarks-website/migrate/tests/classifier.rs @@ -54,7 +54,7 @@ fn record_with_storage_and_sf(name: &str, storage: &str, suite: &str, sf: &str) query_idx: 12, storage: "nvme".into(), engine: "datafusion".into(), - format: "vortex".into(), + format: "vortex-file-compressed".into(), }, )] #[case::statpopgen( @@ -114,7 +114,7 @@ fn non_fan_out_query_records(#[case] name: &str, #[case] expected: V3Bin) { query_idx: 22, storage: "nvme".into(), engine: "duckdb".into(), - format: "vortex".into(), + format: "vortex-file-compressed".into(), }, )] #[case::tpcds_nvme_sf10( @@ -129,7 +129,7 @@ fn non_fan_out_query_records(#[case] name: &str, #[case] expected: V3Bin) { query_idx: 5, storage: "nvme".into(), engine: "datafusion".into(), - format: "vortex".into(), + format: "vortex-file-compressed".into(), }, )] fn fan_out_query_records( @@ -148,21 +148,28 @@ fn fan_out_query_records( "random-access/taxi/take/parquet-tokio-local-disk", V3Bin::RandomAccess { dataset: "taxi/take".into(), - format: "parquet-nvme".into(), + format: "parquet".into(), }, )] #[case::random_access_4_part_vortex( "random-access/chimp/take/vortex-tokio-local-disk", V3Bin::RandomAccess { dataset: "chimp/take".into(), - format: "vortex-nvme".into(), + format: "vortex-file-compressed".into(), }, )] #[case::random_access_2_part_legacy( "random-access/parquet-tokio-local-disk", V3Bin::RandomAccess { dataset: "random access".into(), - format: "parquet-nvme".into(), + format: "parquet".into(), + }, +)] +#[case::random_access_4_part_lance( + "random-access/taxi/take/lance-tokio-local-disk", + V3Bin::RandomAccess { + dataset: "taxi/take".into(), + format: "lance".into(), }, )] fn random_access_records(#[case] name: &str, #[case] expected: V3Bin) { diff --git a/benchmarks-website/migrate/tests/end_to_end.rs b/benchmarks-website/migrate/tests/end_to_end.rs index 5892215b472..b389f77c421 100644 --- a/benchmarks-website/migrate/tests/end_to_end.rs +++ b/benchmarks-website/migrate/tests/end_to_end.rs @@ -107,5 +107,5 @@ fn migrate_inline_fixture_populates_each_table() { }) .unwrap(); assert_eq!(dataset, "taxi/take"); - assert_eq!(format, "parquet-nvme"); + assert_eq!(format, "parquet"); } From b02f418918d90b7dd984f7702b67c4f43b99050c Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 26 Apr 2026 22:49:56 +0000 Subject: [PATCH 3/5] chore(benchmarks-migrate): progress logging + small fit-and-finish Adds tracing-based phase announcements and periodic progress lines (every 5 seconds) so users know the binary isn't hung during multi-minute migrations. Also fixes an inaccurate doc comment about vortex-compact's ext label and skips empty trailing transactions in both streaming loops. No behavior change - all log output, comment-only edits, and a no-op-transaction elision. 
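
The throttling shape, as a minimal self-contained sketch (function and
variable names here are illustrative only; the real hunks below thread
the counters through MigrationSummary):

    use std::time::{Duration, Instant};

    /// Emit at most one progress line per five-second window from a hot loop.
    fn process(lines: impl Iterator<Item = String>) {
        let started = Instant::now();
        let mut last_log = Instant::now();
        let mut processed = 0u64;
        for _line in lines {
            processed += 1;
            if last_log.elapsed() >= Duration::from_secs(5) {
                // Average rate since start; guard against a zero-duration divide.
                let rate = processed as f64 / started.elapsed().as_secs_f64().max(0.001);
                tracing::info!(processed, rate = format!("{rate:.0}/s"), "migration progress");
                last_log = Instant::now();
            }
        }
    }
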
Signed-off-by: Claude --- benchmarks-website/migrate/src/classifier.rs | 8 ++- benchmarks-website/migrate/src/migrate.rs | 55 ++++++++++++++++---- benchmarks-website/migrate/src/source.rs | 10 ++++ 3 files changed, 61 insertions(+), 12 deletions(-) diff --git a/benchmarks-website/migrate/src/classifier.rs b/benchmarks-website/migrate/src/classifier.rs index 4e6e53fef1a..22802482ee9 100644 --- a/benchmarks-website/migrate/src/classifier.rs +++ b/benchmarks-website/migrate/src/classifier.rs @@ -408,8 +408,12 @@ fn bin_random_access(cls: &V2Classification, record: &V2Record) -> Option // legacy). After stripping the `-tokio-local-disk` suffix, map the // v2 random-access ext label (`vortex`, from `Format::ext()`) to // the canonical name (`vortex-file-compressed`, from - // `Format::name()`). `parquet`, `lance`, and `vortex-compact` - // already match between ext and name. + // `Format::name()`). `parquet` and `lance` match between ext and + // name. The `vortex` ext is shared by both `OnDiskVortex` (name + // `vortex-file-compressed`) and `VortexCompact` (name + // `vortex-compact`), but v2's random-access bench only emitted + // `OnDiskVortex`, so mapping to `vortex-file-compressed` is + // correct for all historical data. let parts: Vec<&str> = record.name.split('/').collect(); let raw = match parts.len() { 4 => parts[3], diff --git a/benchmarks-website/migrate/src/migrate.rs b/benchmarks-website/migrate/src/migrate.rs index 5801d820905..06bc7cdeaf5 100644 --- a/benchmarks-website/migrate/src/migrate.rs +++ b/benchmarks-website/migrate/src/migrate.rs @@ -12,6 +12,8 @@ use std::collections::BTreeMap; use std::io::BufRead; use std::path::Path; +use std::time::Duration; +use std::time::Instant; use anyhow::Context as _; use anyhow::Result; @@ -19,6 +21,7 @@ use duckdb::Connection; use duckdb::Statement; use duckdb::Transaction; use duckdb::params; +use tracing::info; use tracing::warn; use vortex_bench_server::db::measurement_id_compression_size; use vortex_bench_server::db::measurement_id_compression_time; @@ -92,12 +95,21 @@ pub fn run(source: &Source, target: &Path) -> Result { let mut conn = open_target_db(target)?; let mut summary = MigrationSummary::default(); + info!(source = %source.describe(), "Reading commits.json"); let commits = read_commits(source)?; + info!(commits = commits.len(), "Loaded commits"); summary.commits_inserted = upsert_all_commits(&mut conn, &commits, &mut summary)?; + info!("Migrating data.json.gz"); migrate_data_jsonl(&mut conn, source, &commits, &mut summary)?; + info!( + records = summary.records_read, + inserted = summary.total_inserted(), + "data.json.gz done", + ); for name in source.list_file_sizes()? 
{ + info!(name = %name, "Migrating file-sizes"); if let Err(e) = migrate_file_sizes(&mut conn, source, &name, &commits, &mut summary) { warn!("file-sizes file {name} failed: {e:#}"); } @@ -149,9 +161,11 @@ fn migrate_data_jsonl( summary: &mut MigrationSummary, ) -> Result<()> { let reader = source.open_data_jsonl()?; - let mut lines = reader.lines(); + let mut lines = reader.lines().peekable(); + let started = Instant::now(); + let mut last_log = Instant::now(); const BATCH: u64 = 10_000; - loop { + while lines.peek().is_some() { let tx = conn.transaction().context("begin data tx")?; let mut stmts = DataStatements::prepare(&tx)?; let mut in_batch = 0u64; @@ -172,12 +186,23 @@ fn migrate_data_jsonl( }; apply_v2_record(&mut stmts, &record, commits, summary)?; in_batch += 1; + if last_log.elapsed() >= Duration::from_secs(5) { + let elapsed = started.elapsed().as_secs_f64(); + let rate = summary.records_read as f64 / elapsed.max(0.001); + info!( + records = summary.records_read, + rate = format!("{rate:.0}/s"), + query = summary.query_inserted, + compression_time = summary.compression_time_inserted, + compression_size = summary.compression_size_inserted, + random_access = summary.random_access_inserted, + "migration progress", + ); + last_log = Instant::now(); + } } drop(stmts); tx.commit().context("commit data batch")?; - if in_batch == 0 { - break; - } } Ok(()) } @@ -448,9 +473,11 @@ fn migrate_file_sizes( .and_then(|s| s.strip_suffix(".json.gz")) .unwrap_or(name) .to_string(); - let mut lines = reader.lines(); + let mut lines = reader.lines().peekable(); + let started = Instant::now(); + let mut last_log = Instant::now(); const BATCH: u64 = 10_000; - loop { + while lines.peek().is_some() { let tx = conn.transaction().context("begin file-sizes tx")?; let mut stmt = tx.prepare(SQL_UPSERT_FILE_SIZE)?; let mut in_batch = 0u64; @@ -480,12 +507,20 @@ fn migrate_file_sizes( upsert_file_size_row(&mut stmt, &sz, &dataset)?; summary.file_size_inserted += 1; in_batch += 1; + if last_log.elapsed() >= Duration::from_secs(5) { + let elapsed = started.elapsed().as_secs_f64(); + let rate = summary.file_size_inserted as f64 / elapsed.max(0.001); + info!( + name = %name, + file_sizes = summary.file_size_inserted, + rate = format!("{rate:.0}/s"), + "file-sizes progress", + ); + last_log = Instant::now(); + } } drop(stmt); tx.commit().context("commit file-sizes batch")?; - if in_batch == 0 { - break; - } } Ok(()) } diff --git a/benchmarks-website/migrate/src/source.rs b/benchmarks-website/migrate/src/source.rs index 2b4fdca9b94..340a9bdb60f 100644 --- a/benchmarks-website/migrate/src/source.rs +++ b/benchmarks-website/migrate/src/source.rs @@ -23,6 +23,7 @@ use std::path::PathBuf; use anyhow::Context as _; use anyhow::Result; use flate2::read::GzDecoder; +use tracing::info; /// Public S3 bucket the live v2 server reads from. pub const PUBLIC_BUCKET_BASE: &str = "https://vortex-ci-benchmark-results.s3.amazonaws.com"; @@ -39,6 +40,14 @@ pub enum Source { } impl Source { + /// Short human-readable description for log messages. + pub fn describe(&self) -> String { + match self { + Source::PublicS3 => "public S3 bucket".to_string(), + Source::Local(p) => format!("local dir {}", p.display()), + } + } + /// Open `data.json.gz` for streaming, decompressing on the fly. 
pub fn open_data_jsonl(&self) -> Result> { let stream = self.open_raw("data.json.gz")?; @@ -102,6 +111,7 @@ fn open_local(path: &Path) -> Result> { fn open_s3(name: &str) -> Result> { let url = format!("{PUBLIC_BUCKET_BASE}/{name}"); + info!(url = %url, "GET"); let resp = reqwest::blocking::get(&url).with_context(|| format!("GET {url}"))?; if !resp.status().is_success() { anyhow::bail!("GET {url} returned {}", resp.status()); From 42ad6a12c6ebfe2fa1c66c3695b8954032ee64a7 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Sun, 26 Apr 2026 19:47:27 -0400 Subject: [PATCH 4/5] fix perf and insert bugs Signed-off-by: Connor Tsui --- Cargo.lock | 40 + benchmarks-website/migrate/Cargo.toml | 5 +- benchmarks-website/migrate/src/classifier.rs | 154 +++- benchmarks-website/migrate/src/migrate.rs | 777 +++++++++++------- .../migrate/tests/classifier.rs | 92 ++- .../migrate/tests/end_to_end.rs | 101 ++- 6 files changed, 868 insertions(+), 301 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 079289cdfa8..20075443c36 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3742,6 +3742,7 @@ dependencies = [ "fallible-streaming-iterator", "hashlink", "libduckdb-sys", + "num", "num-integer", "rust_decimal", "strum 0.27.2", @@ -6374,6 +6375,20 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -6409,6 +6424,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -10357,6 +10394,9 @@ name = "vortex-bench-migrate" version = "0.1.0-alpha.0" dependencies = [ "anyhow", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-schema 58.1.0", "clap", "duckdb", "flate2", diff --git a/benchmarks-website/migrate/Cargo.toml b/benchmarks-website/migrate/Cargo.toml index a68903f75eb..f9b83d5d543 100644 --- a/benchmarks-website/migrate/Cargo.toml +++ b/benchmarks-website/migrate/Cargo.toml @@ -20,9 +20,12 @@ path = "src/main.rs" [dependencies] anyhow = { workspace = true } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-schema = { workspace = true } clap = { workspace = true, features = ["derive"] } # track vortex-duckdb's bundled engine version (build.rs) -duckdb = { version = "1.10502", features = ["bundled"] } +duckdb = { version = "1.10502", features = ["bundled", "appender-arrow"] } flate2 = "1.1" reqwest = { workspace = true, features = ["json"] } serde = { workspace = true, features = ["derive"] } diff --git a/benchmarks-website/migrate/src/classifier.rs b/benchmarks-website/migrate/src/classifier.rs index 22802482ee9..6b3368c64b8 100644 --- a/benchmarks-website/migrate/src/classifier.rs +++ b/benchmarks-website/migrate/src/classifier.rs @@ -49,6 +49,14 @@ pub const 
QUERY_SUITES: &[QuerySuite] = &[ fan_out: false, skip: false, }, + QuerySuite { + prefix: "gharchive", + display_name: "GhArchive", + query_prefix: "GHARCHIVE", + dataset_key: None, + fan_out: false, + skip: false, + }, QuerySuite { prefix: "tpch", display_name: "TPC-H", @@ -71,7 +79,7 @@ pub const QUERY_SUITES: &[QuerySuite] = &[ query_prefix: "FINEWEB", dataset_key: None, fan_out: false, - skip: true, + skip: false, }, ]; @@ -221,6 +229,7 @@ pub fn get_group(record: &V2Record) -> Option { if lower.starts_with("vortex size/") || lower.starts_with("vortex-file-compressed size/") || lower.starts_with("parquet size/") + || lower.starts_with("parquet-zstd size/") || lower.starts_with("lance size/") || lower.contains(":raw size/") || lower.contains(":parquet-zstd size/") @@ -237,6 +246,10 @@ pub fn get_group(record: &V2Record) -> Option { || lower.starts_with("lance decompress") || lower.starts_with("vortex:lance ratio") || lower.starts_with("vortex:parquet-zstd ratio") + // Typo'd v2 emitter wrote `parquet-zst` (no `d`) for some + // ratio records; match both spellings so they classify as + // derived ratios instead of falling through to Unknown. + || lower.starts_with("vortex:parquet-zst ratio") || lower.starts_with("vortex:raw ratio") { return Some(V2Group::Compression); @@ -392,6 +405,132 @@ pub fn classify(record: &V2Record) -> Option { } } +/// Reason the classifier dropped a record. Intentional skips (v2 +/// patterns v3 deliberately doesn't store) are NOT errors; they don't +/// count against the uncategorized gate. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Skip { + /// `vortex:* ratio …` and `vortex:* size` — derived in v3 from + /// `compression_sizes` joined to itself. + DerivedRatio, + /// `throughput` records — v2 derived these from latencies. + Throughput, + /// A v2 query suite marked `skip: true` in QUERY_SUITES. + SkippedSuite, + /// random-access record with an unsupported part count. + UnsupportedShape, + /// Record had no `value` field. + NoValue, + /// Dim outside the v3 emitter's allowlist (e.g. `parquet-zstd`, + /// historical-only suites no longer in CI). + Deprecated, +} + +/// Engines the v3 emitter produces today. Anything else is historical +/// and gets bucketed as `Skip::Deprecated`. +/// +/// ORCHESTRATOR NOTE: confirm against `vortex-bench`'s `Engine` enum +/// before handing off; edit if the live set differs. +const V3_ENGINES: &[&str] = &["datafusion", "duckdb", "vortex", "arrow"]; + +/// Formats the v3 emitter produces today (`Format::name()` values). +/// +/// ORCHESTRATOR NOTE: confirm against `vortex-bench/src/lib.rs` +/// `Format::name()` before handing off. +const V3_FORMATS: &[&str] = &[ + "vortex-file-compressed", + "vortex-compact", + "parquet", + "lance", + "csv", + "arrow", + "duckdb", +]; + +/// Query suites the v3 CI runs today. Suites outside this list still +/// classify (so historical analyses stay coherent) but get bucketed +/// as `Skip::Deprecated` so they don't render as orphan charts in v3. +/// +/// ORCHESTRATOR NOTE: add `fineweb` and/or `gharchive` here if a CI +/// grep shows v3 still emits them. +const V3_QUERY_SUITES: &[&str] = &["clickbench", "tpch", "tpcds", "statpopgen", "polarsignals"]; + +/// Returns true if every dim that v3 stores as a column is on the +/// emitter's current allowlist. Dim values outside the allowlist mean +/// historical-only formats / engines that the v3 UI has nothing to +/// render against. +fn is_v3_dim(bin: &V3Bin) -> bool { + match bin { + V3Bin::Query { engine, format, .. 
} => { + V3_ENGINES.contains(&engine.as_str()) && V3_FORMATS.contains(&format.as_str()) + } + V3Bin::CompressionTime { format, .. } + | V3Bin::CompressionSize { format, .. } + | V3Bin::RandomAccess { format, .. } => V3_FORMATS.contains(&format.as_str()), + } +} + +/// Outcome of running the classifier on a v2 record. Distinguishes +/// "we know we don't want this" (`Skip`) from "we don't recognize this" +/// (`Unknown`); the migrator's 5% gate fires only on the latter. +#[derive(Debug, Clone)] +pub enum Outcome { + Bin(V3Bin), + Skip(Skip), + Unknown, +} + +/// Like [`classify`], but reports *why* a record was dropped. Intended +/// for the migrator so the 5% uncategorized gate doesn't trip on +/// records v2 deliberately doesn't render (ratios, throughput, +/// skipped suites). +pub fn classify_outcome(record: &V2Record) -> Outcome { + if record.name.contains(" throughput") { + return Outcome::Skip(Skip::Throughput); + } + let Some(group) = get_group(record) else { + return Outcome::Unknown; + }; + if let V2Group::Query { suite_index, .. } = &group + && QUERY_SUITES[*suite_index].skip + { + return Outcome::Skip(Skip::SkippedSuite); + } + let Some(cls) = classify_v2(record) else { + // get_group succeeded but classify_v2 didn't — shape mismatch. + return Outcome::Skip(Skip::UnsupportedShape); + }; + let derived = match &cls.group { + V2Group::Compression => { + let lc = cls.chart.to_lowercase(); + lc.contains("ratio") || lc.contains(':') + } + V2Group::CompressionSize => cls.chart.to_lowercase().contains(':'), + _ => false, + }; + if derived { + return Outcome::Skip(Skip::DerivedRatio); + } + let bin = match &cls.group { + V2Group::RandomAccess => bin_random_access(&cls, record), + V2Group::Compression => bin_compression_time(&cls, record), + V2Group::CompressionSize => bin_compression_size(&cls, record), + V2Group::Query { .. } => bin_query(&cls, record), + }; + let Some(bin) = bin else { + return Outcome::Unknown; + }; + if !is_v3_dim(&bin) { + return Outcome::Skip(Skip::Deprecated); + } + if let V2Group::Query { suite_index, .. } = &group + && !V3_QUERY_SUITES.contains(&QUERY_SUITES[*suite_index].prefix) + { + return Outcome::Skip(Skip::Deprecated); + } + Outcome::Bin(bin) +} + fn bin_random_access(cls: &V2Classification, record: &V2Record) -> Option { // v2 chart name shape: "RANDOM ACCESS" or "DATASET/PATTERN" (uppercase). // We store it as the v3 dataset value verbatim, lowercased so @@ -482,8 +621,15 @@ fn bin_compression_size(cls: &V2Classification, _record: &V2Record) -> Option Option { // `vortex-file-compressed`) that match what the v3 live emitter // writes. `cls.series` has been through v2's `ENGINE_RENAMES` for // UI display and is not appropriate for v3 columns. + // + // Older v2 records emitted display-case engines (e.g. `DataFusion`, + // `DuckDB`); newer ones emit lowercase. Lowercase here so dedup + // collapses both spellings into a single canonical row. let raw_series = record.name.split('/').nth(1)?; let (engine, format) = split_engine_format(raw_series)?; + let engine = engine.to_lowercase(); + let format = format.to_lowercase(); let storage_v3 = match storage.as_deref() { Some("S3") => "s3".to_string(), diff --git a/benchmarks-website/migrate/src/migrate.rs b/benchmarks-website/migrate/src/migrate.rs index 06bc7cdeaf5..ff1abf835f0 100644 --- a/benchmarks-website/migrate/src/migrate.rs +++ b/benchmarks-website/migrate/src/migrate.rs @@ -8,19 +8,34 @@ //! the appropriate v3 fact table. Every row's `measurement_id` is //! 
computed via the server's `measurement_id_*` functions so the result //! is byte-compatible with what fresh `/api/ingest` would have produced. +//! +//! Bulk-load shape: rows are accumulated in memory as parallel column +//! vectors, deduplicated by `measurement_id`, then flushed to DuckDB +//! via `Appender::append_record_batch` as one Arrow `RecordBatch` per +//! fact table. use std::collections::BTreeMap; +use std::collections::HashMap; +use std::collections::HashSet; use std::io::BufRead; use std::path::Path; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; use anyhow::Context as _; use anyhow::Result; +use arrow_array::ArrayRef; +use arrow_array::Int32Array; +use arrow_array::Int64Array; +use arrow_array::ListArray; +use arrow_array::RecordBatch; +use arrow_array::StringArray; +use arrow_buffer::OffsetBuffer; +use arrow_schema::DataType; +use arrow_schema::Field; +use arrow_schema::Schema; use duckdb::Connection; -use duckdb::Statement; -use duckdb::Transaction; -use duckdb::params; use tracing::info; use tracing::warn; use vortex_bench_server::db::measurement_id_compression_size; @@ -33,8 +48,8 @@ use vortex_bench_server::records::QueryMeasurement; use vortex_bench_server::records::RandomAccessTime; use vortex_bench_server::schema::SCHEMA_DDL; +use crate::classifier; use crate::classifier::V3Bin; -use crate::classifier::classify; use crate::commits::upsert_commit; use crate::source::Source; use crate::v2::V2Commit; @@ -58,7 +73,9 @@ pub struct MigrationSummary { pub missing_commit: u64, pub commit_warnings: u64, pub skipped_no_value: u64, + pub skipped_intentional: u64, pub commits_inserted: u64, + pub deduped: u64, } impl MigrationSummary { @@ -80,8 +97,16 @@ impl MigrationSummary { } } -/// Open or create a DuckDB at `path` and apply the v3 schema. +/// Open or create a DuckDB at `path` and apply the v3 schema. The +/// migrator is a one-shot fresh load; the bulk-append flush is pure +/// insert (no `ON CONFLICT`), so any stale rows in `path` would clash +/// with the next run on the same primary keys. Delete both the +/// database file and its WAL companion up front so every run starts +/// from a known-empty state. pub fn open_target_db(path: &Path) -> Result { + remove_if_exists(path)?; + let wal = wal_path(path); + remove_if_exists(&wal)?; let conn = Connection::open(path).with_context(|| format!("opening DuckDB at {}", path.display()))?; conn.execute_batch(SCHEMA_DDL) @@ -89,6 +114,25 @@ pub fn open_target_db(path: &Path) -> Result { Ok(conn) } +fn remove_if_exists(path: &Path) -> Result<()> { + match std::fs::remove_file(path) { + Ok(()) => { + info!(path = %path.display(), "removed pre-existing target file"); + Ok(()) + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()), + Err(e) => Err(e).with_context(|| format!("removing {}", path.display())), + } +} + +/// DuckDB writes its write-ahead log next to the database file with a +/// `.wal` suffix appended (e.g. `v3.duckdb` -> `v3.duckdb.wal`). +fn wal_path(path: &Path) -> std::path::PathBuf { + let mut name = path.as_os_str().to_owned(); + name.push(".wal"); + std::path::PathBuf::from(name) +} + /// Run the whole migration: commits, data.json.gz, and every /// file-sizes-*.json.gz under the source. 
pub fn run(source: &Source, target: &Path) -> Result { @@ -100,21 +144,49 @@ pub fn run(source: &Source, target: &Path) -> Result { info!(commits = commits.len(), "Loaded commits"); summary.commits_inserted = upsert_all_commits(&mut conn, &commits, &mut summary)?; + let mut q = QueryAccum::default(); + let mut ct = CompressionTimeAccum::default(); + let mut cs = CompressionSizeAccum::default(); + let mut ra = RandomAccessAccum::default(); + info!("Migrating data.json.gz"); - migrate_data_jsonl(&mut conn, source, &commits, &mut summary)?; - info!( - records = summary.records_read, - inserted = summary.total_inserted(), - "data.json.gz done", - ); + migrate_data_jsonl( + source, + &commits, + &mut summary, + &mut q, + &mut ct, + &mut cs, + &mut ra, + )?; + info!(records = summary.records_read, "data.json.gz done"); for name in source.list_file_sizes()? { info!(name = %name, "Migrating file-sizes"); - if let Err(e) = migrate_file_sizes(&mut conn, source, &name, &commits, &mut summary) { + if let Err(e) = migrate_file_sizes(source, &name, &commits, &mut summary, &mut cs) { warn!("file-sizes file {name} failed: {e:#}"); } } + info!("Flushing accumulators to DuckDB"); + summary.query_inserted = q.measurement_id.len() as u64; + summary.compression_time_inserted = ct.measurement_id.len() as u64; + summary.random_access_inserted = ra.measurement_id.len() as u64; + summary.compression_size_inserted = cs.rows.len() as u64; + + flush(&conn, "query_measurements", build_query_batch(q)?)?; + flush( + &conn, + "compression_times", + build_compression_time_batch(ct)?, + )?; + flush(&conn, "random_access_times", build_random_access_batch(ra)?)?; + flush( + &conn, + "compression_sizes", + build_compression_size_batch(cs)?, + )?; + Ok(summary) } @@ -154,157 +226,84 @@ fn upsert_all_commits( Ok(count) } +/// Stream `data.json.gz` and push classified records into the +/// per-table accumulators. Dedup happens inside each accumulator's +/// `push` method by `measurement_id`. 
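+/// Records the classifier can't place count toward `uncategorized`;
+/// intentional skips (ratios, throughput, skipped suites) count toward
+/// `skipped_intentional` and do not trip the 5% uncategorized gate.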
fn migrate_data_jsonl( - conn: &mut Connection, source: &Source, commits: &BTreeMap, summary: &mut MigrationSummary, + q: &mut QueryAccum, + ct: &mut CompressionTimeAccum, + cs: &mut CompressionSizeAccum, + ra: &mut RandomAccessAccum, ) -> Result<()> { let reader = source.open_data_jsonl()?; - let mut lines = reader.lines().peekable(); let started = Instant::now(); let mut last_log = Instant::now(); - const BATCH: u64 = 10_000; - while lines.peek().is_some() { - let tx = conn.transaction().context("begin data tx")?; - let mut stmts = DataStatements::prepare(&tx)?; - let mut in_batch = 0u64; - while in_batch < BATCH { - let Some(line) = lines.next() else { break }; - let line = line?; - let trimmed = line.trim(); - if trimmed.is_empty() { + for line in reader.lines() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + summary.records_read += 1; + let record: V2Record = match serde_json::from_str(trimmed) { + Ok(r) => r, + Err(e) => { + warn!("skipping malformed data.json line: {e}"); continue; } - summary.records_read += 1; - let record: V2Record = match serde_json::from_str(trimmed) { - Ok(r) => r, - Err(e) => { - warn!("skipping malformed data.json line: {e}"); - continue; - } - }; - apply_v2_record(&mut stmts, &record, commits, summary)?; - in_batch += 1; - if last_log.elapsed() >= Duration::from_secs(5) { - let elapsed = started.elapsed().as_secs_f64(); - let rate = summary.records_read as f64 / elapsed.max(0.001); - info!( - records = summary.records_read, - rate = format!("{rate:.0}/s"), - query = summary.query_inserted, - compression_time = summary.compression_time_inserted, - compression_size = summary.compression_size_inserted, - random_access = summary.random_access_inserted, - "migration progress", - ); - last_log = Instant::now(); - } + }; + apply_v2_record(&record, commits, summary, q, ct, cs, ra); + if last_log.elapsed() >= Duration::from_secs(5) { + let elapsed = started.elapsed().as_secs_f64(); + let rate = summary.records_read as f64 / elapsed.max(0.001); + info!( + records = summary.records_read, + rate = format!("{rate:.0}/s"), + query = q.measurement_id.len(), + compression_time = ct.measurement_id.len(), + compression_size = cs.rows.len(), + random_access = ra.measurement_id.len(), + "migration progress", + ); + last_log = Instant::now(); } - drop(stmts); - tx.commit().context("commit data batch")?; } Ok(()) } -/// Prepared INSERT statements for the four v2-derived fact tables. Tied -/// to a single transaction's lifetime; re-prepare after each commit. -struct DataStatements<'tx> { - query: Statement<'tx>, - compression_time: Statement<'tx>, - compression_size: Statement<'tx>, - random_access: Statement<'tx>, -} - -impl<'tx> DataStatements<'tx> { - fn prepare(tx: &'tx Transaction<'_>) -> Result { - Ok(Self { - query: tx.prepare(SQL_INSERT_QUERY)?, - compression_time: tx.prepare(SQL_INSERT_COMPRESSION_TIME)?, - compression_size: tx.prepare(SQL_INSERT_COMPRESSION_SIZE)?, - random_access: tx.prepare(SQL_INSERT_RANDOM_ACCESS)?, - }) - } -} - -const SQL_INSERT_QUERY: &str = r#" -INSERT INTO query_measurements ( - measurement_id, commit_sha, dataset, dataset_variant, scale_factor, - query_idx, storage, engine, format, - value_ns, all_runtimes_ns, - peak_physical, peak_virtual, physical_delta, virtual_delta, - env_triple -) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?, ?, ?, ?, ?) 
-ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_ns = excluded.value_ns, - all_runtimes_ns = excluded.all_runtimes_ns, - env_triple = excluded.env_triple -"#; - -const SQL_INSERT_COMPRESSION_TIME: &str = r#" -INSERT INTO compression_times ( - measurement_id, commit_sha, dataset, dataset_variant, - format, op, value_ns, all_runtimes_ns, env_triple -) VALUES (?, ?, ?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) -ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_ns = excluded.value_ns, - all_runtimes_ns = excluded.all_runtimes_ns, - env_triple = excluded.env_triple -"#; - -const SQL_INSERT_COMPRESSION_SIZE: &str = r#" -INSERT INTO compression_sizes ( - measurement_id, commit_sha, dataset, dataset_variant, - format, value_bytes -) VALUES (?, ?, ?, ?, ?, ?) -ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_bytes = excluded.value_bytes -"#; - -const SQL_INSERT_RANDOM_ACCESS: &str = r#" -INSERT INTO random_access_times ( - measurement_id, commit_sha, dataset, format, - value_ns, all_runtimes_ns, env_triple -) VALUES (?, ?, ?, ?, ?, CAST(? AS BIGINT[]), ?) -ON CONFLICT (measurement_id) DO UPDATE SET - commit_sha = excluded.commit_sha, - value_ns = excluded.value_ns, - all_runtimes_ns = excluded.all_runtimes_ns, - env_triple = excluded.env_triple -"#; - -const SQL_UPSERT_FILE_SIZE: &str = r#" -INSERT INTO compression_sizes ( - measurement_id, commit_sha, dataset, dataset_variant, - format, value_bytes -) VALUES (?, ?, ?, ?, ?, ?) -ON CONFLICT (measurement_id) DO UPDATE SET - value_bytes = compression_sizes.value_bytes + excluded.value_bytes -"#; - fn apply_v2_record( - stmts: &mut DataStatements<'_>, record: &V2Record, commits: &BTreeMap, summary: &mut MigrationSummary, -) -> Result<()> { + q: &mut QueryAccum, + ct: &mut CompressionTimeAccum, + cs: &mut CompressionSizeAccum, + ra: &mut RandomAccessAccum, +) { let Some(sha) = record.commit_id.clone() else { summary.missing_commit += 1; - return Ok(()); + return; }; if !commits.contains_key(&sha) { summary.missing_commit += 1; - return Ok(()); + return; } - let Some(bin) = classify(record) else { - summary.uncategorized += 1; - let prefix = record.name.split('/').next().unwrap_or("").to_string(); - *summary.uncategorized_prefixes.entry(prefix).or_insert(0) += 1; - return Ok(()); + let bin = match classifier::classify_outcome(record) { + classifier::Outcome::Bin(b) => b, + classifier::Outcome::Skip(_) => { + summary.skipped_intentional += 1; + return; + } + classifier::Outcome::Unknown => { + summary.uncategorized += 1; + let prefix = record.name.split('/').next().unwrap_or("").to_string(); + *summary.uncategorized_prefixes.entry(prefix).or_insert(0) += 1; + return; + } }; let env_triple = record.env_triple.as_ref().and_then(|t| t.to_triple()); @@ -317,7 +316,7 @@ fn apply_v2_record( Some(v) => v, None => { summary.skipped_no_value += 1; - return Ok(()); + return; } }; @@ -349,25 +348,7 @@ fn apply_v2_record( env_triple, }; let mid = measurement_id_query(&qm); - stmts.query.execute(params![ - mid, - qm.commit_sha, - qm.dataset, - qm.dataset_variant, - qm.scale_factor, - qm.query_idx, - qm.storage, - qm.engine, - qm.format, - qm.value_ns, - runtimes_literal(&qm.all_runtimes_ns), - qm.peak_physical, - qm.peak_virtual, - qm.physical_delta, - qm.virtual_delta, - qm.env_triple, - ])?; - summary.query_inserted += 1; + q.push(mid, qm, summary); } V3Bin::CompressionTime { dataset, @@ -375,7 +356,7 @@ fn apply_v2_record( format, op, } => { - let ct = 
CompressionTime { + let ctr = CompressionTime { commit_sha: sha, dataset, dataset_variant, @@ -385,45 +366,26 @@ fn apply_v2_record( all_runtimes_ns: runtimes, env_triple, }; - let mid = measurement_id_compression_time(&ct); - stmts.compression_time.execute(params![ - mid, - ct.commit_sha, - ct.dataset, - ct.dataset_variant, - ct.format, - ct.op, - ct.value_ns, - runtimes_literal(&ct.all_runtimes_ns), - ct.env_triple, - ])?; - summary.compression_time_inserted += 1; + let mid = measurement_id_compression_time(&ctr); + ct.push(mid, ctr, summary); } V3Bin::CompressionSize { dataset, dataset_variant, format, } => { - let cs = CompressionSize { + let csr = CompressionSize { commit_sha: sha, dataset, dataset_variant, format, value_bytes: value_f64 as i64, }; - let mid = measurement_id_compression_size(&cs); - stmts.compression_size.execute(params![ - mid, - cs.commit_sha, - cs.dataset, - cs.dataset_variant, - cs.format, - cs.value_bytes, - ])?; - summary.compression_size_inserted += 1; + let mid = measurement_id_compression_size(&csr); + cs.push_replace(mid, csr); } V3Bin::RandomAccess { dataset, format } => { - let ra = RandomAccessTime { + let rar = RandomAccessTime { commit_sha: sha, dataset, format, @@ -431,134 +393,379 @@ fn apply_v2_record( all_runtimes_ns: runtimes, env_triple, }; - let mid = measurement_id_random_access(&ra); - stmts.random_access.execute(params![ - mid, - ra.commit_sha, - ra.dataset, - ra.format, - ra.value_ns, - runtimes_literal(&ra.all_runtimes_ns), - ra.env_triple, - ])?; - summary.random_access_inserted += 1; + let mid = measurement_id_random_access(&rar); + ra.push(mid, rar, summary); } } - Ok(()) -} - -fn runtimes_literal(values: &[i64]) -> String { - let mut s = String::with_capacity(values.len() * 8 + 2); - s.push('['); - for (i, v) in values.iter().enumerate() { - if i > 0 { - s.push(','); - } - s.push_str(&v.to_string()); - } - s.push(']'); - s } fn migrate_file_sizes( - conn: &mut Connection, source: &Source, name: &str, commits: &BTreeMap, summary: &mut MigrationSummary, + cs: &mut CompressionSizeAccum, ) -> Result<()> { let reader = source.open_file_sizes(name)?; - let dataset = name + let dataset_fallback = name .strip_prefix("file-sizes-") .and_then(|s| s.strip_suffix(".json.gz")) .unwrap_or(name) .to_string(); - let mut lines = reader.lines().peekable(); let started = Instant::now(); let mut last_log = Instant::now(); - const BATCH: u64 = 10_000; - while lines.peek().is_some() { - let tx = conn.transaction().context("begin file-sizes tx")?; - let mut stmt = tx.prepare(SQL_UPSERT_FILE_SIZE)?; - let mut in_batch = 0u64; - while in_batch < BATCH { - let Some(line) = lines.next() else { break }; - let line = line?; - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - let sz: V2FileSize = match serde_json::from_str(trimmed) { - Ok(r) => r, - Err(e) => { - warn!("skipping malformed {name} line: {e}"); - continue; - } - }; - if !commits.contains_key(&sz.commit_id) { - summary.missing_commit += 1; + for line in reader.lines() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + let sz: V2FileSize = match serde_json::from_str(trimmed) { + Ok(r) => r, + Err(e) => { + warn!("skipping malformed {name} line: {e}"); continue; } - // file-sizes-*.json.gz captures per-file sizes inside one - // benchmark/format/scale_factor combo. We aggregate to one - // (commit, dataset, dataset_variant, format) row by summing, - // since v3's compression_sizes is a single bytes value per - // (dim) tuple. 
Use ON CONFLICT to accumulate. - upsert_file_size_row(&mut stmt, &sz, &dataset)?; - summary.file_size_inserted += 1; - in_batch += 1; - if last_log.elapsed() >= Duration::from_secs(5) { - let elapsed = started.elapsed().as_secs_f64(); - let rate = summary.file_size_inserted as f64 / elapsed.max(0.001); - info!( - name = %name, - file_sizes = summary.file_size_inserted, - rate = format!("{rate:.0}/s"), - "file-sizes progress", - ); - last_log = Instant::now(); - } + }; + if !commits.contains_key(&sz.commit_id) { + summary.missing_commit += 1; + continue; + } + let dataset = if sz.benchmark.is_empty() { + dataset_fallback.clone() + } else { + sz.benchmark.clone() + }; + let dataset_variant = sz + .scale_factor + .as_ref() + .filter(|s| !s.is_empty() && s.as_str() != "1.0") + .cloned(); + let csr = CompressionSize { + commit_sha: sz.commit_id.clone(), + dataset, + dataset_variant, + format: sz.format.clone(), + value_bytes: sz.size_bytes, + }; + let mid = measurement_id_compression_size(&csr); + cs.push_sum(mid, csr); + summary.file_size_inserted += 1; + if last_log.elapsed() >= Duration::from_secs(5) { + let elapsed = started.elapsed().as_secs_f64(); + let rate = summary.file_size_inserted as f64 / elapsed.max(0.001); + info!( + name = %name, + file_sizes = summary.file_size_inserted, + rate = format!("{rate:.0}/s"), + "file-sizes progress", + ); + last_log = Instant::now(); } - drop(stmt); - tx.commit().context("commit file-sizes batch")?; } Ok(()) } -fn upsert_file_size_row( - stmt: &mut Statement<'_>, - sz: &V2FileSize, - dataset_fallback: &str, -) -> Result<()> { - let dataset = if sz.benchmark.is_empty() { - dataset_fallback.to_string() - } else { - sz.benchmark.clone() - }; - let dataset_variant = sz - .scale_factor - .as_ref() - .filter(|s| !s.is_empty() && s.as_str() != "1.0") - .cloned(); - let cs = CompressionSize { - commit_sha: sz.commit_id.clone(), - dataset, - dataset_variant, - format: sz.format.clone(), - value_bytes: sz.size_bytes, - }; - let mid = measurement_id_compression_size(&cs); - stmt.execute(params![ - mid, - cs.commit_sha, - cs.dataset, - cs.dataset_variant, - cs.format, - cs.value_bytes, - ])?; +/// Append an Arrow `RecordBatch` to a DuckDB table via `Appender`. 
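+/// Invoked once per fact table at the end of `run`, e.g.
+///
+/// ```ignore
+/// flush(&conn, "query_measurements", build_query_batch(q)?)?;
+/// ```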
+fn flush(conn: &Connection, table: &str, batch: RecordBatch) -> Result<()> { + let mut app = conn + .appender(table) + .with_context(|| format!("opening appender for {table}"))?; + app.append_record_batch(batch) + .with_context(|| format!("appending record batch to {table}"))?; + drop(app); Ok(()) } +#[derive(Default)] +struct QueryAccum { + measurement_id: Vec, + commit_sha: Vec, + dataset: Vec, + dataset_variant: Vec>, + scale_factor: Vec>, + query_idx: Vec, + storage: Vec, + engine: Vec, + format: Vec, + value_ns: Vec, + all_runtimes_ns: Vec>, + peak_physical: Vec>, + peak_virtual: Vec>, + physical_delta: Vec>, + virtual_delta: Vec>, + env_triple: Vec>, + seen: HashSet, +} + +impl QueryAccum { + fn push(&mut self, mid: i64, r: QueryMeasurement, summary: &mut MigrationSummary) { + if !self.seen.insert(mid) { + summary.deduped += 1; + return; + } + self.measurement_id.push(mid); + self.commit_sha.push(r.commit_sha); + self.dataset.push(r.dataset); + self.dataset_variant.push(r.dataset_variant); + self.scale_factor.push(r.scale_factor); + self.query_idx.push(r.query_idx); + self.storage.push(r.storage); + self.engine.push(r.engine); + self.format.push(r.format); + self.value_ns.push(r.value_ns); + self.all_runtimes_ns.push(r.all_runtimes_ns); + self.peak_physical.push(r.peak_physical); + self.peak_virtual.push(r.peak_virtual); + self.physical_delta.push(r.physical_delta); + self.virtual_delta.push(r.virtual_delta); + self.env_triple.push(r.env_triple); + } +} + +#[derive(Default)] +struct CompressionTimeAccum { + measurement_id: Vec, + commit_sha: Vec, + dataset: Vec, + dataset_variant: Vec>, + format: Vec, + op: Vec, + value_ns: Vec, + all_runtimes_ns: Vec>, + env_triple: Vec>, + seen: HashSet, +} + +impl CompressionTimeAccum { + fn push(&mut self, mid: i64, r: CompressionTime, summary: &mut MigrationSummary) { + if !self.seen.insert(mid) { + summary.deduped += 1; + return; + } + self.measurement_id.push(mid); + self.commit_sha.push(r.commit_sha); + self.dataset.push(r.dataset); + self.dataset_variant.push(r.dataset_variant); + self.format.push(r.format); + self.op.push(r.op); + self.value_ns.push(r.value_ns); + self.all_runtimes_ns.push(r.all_runtimes_ns); + self.env_triple.push(r.env_triple); + } +} + +#[derive(Default)] +struct RandomAccessAccum { + measurement_id: Vec, + commit_sha: Vec, + dataset: Vec, + format: Vec, + value_ns: Vec, + all_runtimes_ns: Vec>, + env_triple: Vec>, + seen: HashSet, +} + +impl RandomAccessAccum { + fn push(&mut self, mid: i64, r: RandomAccessTime, summary: &mut MigrationSummary) { + if !self.seen.insert(mid) { + summary.deduped += 1; + return; + } + self.measurement_id.push(mid); + self.commit_sha.push(r.commit_sha); + self.dataset.push(r.dataset); + self.format.push(r.format); + self.value_ns.push(r.value_ns); + self.all_runtimes_ns.push(r.all_runtimes_ns); + self.env_triple.push(r.env_triple); + } +} + +/// `compression_sizes` is fed by both data.json.gz (replace-on-collision) +/// and file-sizes-*.json.gz (sum-on-collision). Stored as a map; converted +/// to a `RecordBatch` at flush time. +#[derive(Default)] +struct CompressionSizeAccum { + rows: HashMap, +} + +impl CompressionSizeAccum { + /// data.json.gz path: latest write wins, mirroring the prior + /// `ON CONFLICT DO UPDATE SET value_bytes = excluded.value_bytes`. 
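+    ///
+    /// Contrast with `push_sum` below: two rows colliding on `mid` with
+    /// `value_bytes` 100 and 200 leave 200 here (last write wins) but
+    /// 300 under `push_sum` (accumulates), matching the two v2 upsert
+    /// statements these methods replace.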
+ fn push_replace(&mut self, mid: i64, r: CompressionSize) { + self.rows.insert(mid, r); + } + + /// file-sizes-*.json.gz path: per-file rows aggregate into one + /// `(commit, dataset, dataset_variant, format)` row by summing, + /// mirroring the prior `value_bytes = compression_sizes.value_bytes + /// + excluded.value_bytes`. + fn push_sum(&mut self, mid: i64, r: CompressionSize) { + let add = r.value_bytes; + self.rows + .entry(mid) + .and_modify(|x| x.value_bytes += add) + .or_insert(r); + } +} + +fn build_query_batch(a: QueryAccum) -> Result { + let schema = Arc::new(Schema::new(vec![ + Field::new("measurement_id", DataType::Int64, false), + Field::new("commit_sha", DataType::Utf8, false), + Field::new("dataset", DataType::Utf8, false), + Field::new("dataset_variant", DataType::Utf8, true), + Field::new("scale_factor", DataType::Utf8, true), + Field::new("query_idx", DataType::Int32, false), + Field::new("storage", DataType::Utf8, false), + Field::new("engine", DataType::Utf8, false), + Field::new("format", DataType::Utf8, false), + Field::new("value_ns", DataType::Int64, false), + Field::new( + "all_runtimes_ns", + DataType::List(Arc::new(Field::new("item", DataType::Int64, false))), + false, + ), + Field::new("peak_physical", DataType::Int64, true), + Field::new("peak_virtual", DataType::Int64, true), + Field::new("physical_delta", DataType::Int64, true), + Field::new("virtual_delta", DataType::Int64, true), + Field::new("env_triple", DataType::Utf8, true), + ])); + let cols: Vec = vec![ + Arc::new(Int64Array::from(a.measurement_id)), + Arc::new(StringArray::from(a.commit_sha)), + Arc::new(StringArray::from(a.dataset)), + Arc::new(StringArray::from(a.dataset_variant)), + Arc::new(StringArray::from(a.scale_factor)), + Arc::new(Int32Array::from(a.query_idx)), + Arc::new(StringArray::from(a.storage)), + Arc::new(StringArray::from(a.engine)), + Arc::new(StringArray::from(a.format)), + Arc::new(Int64Array::from(a.value_ns)), + Arc::new(build_list_int64(a.all_runtimes_ns)), + Arc::new(Int64Array::from(a.peak_physical)), + Arc::new(Int64Array::from(a.peak_virtual)), + Arc::new(Int64Array::from(a.physical_delta)), + Arc::new(Int64Array::from(a.virtual_delta)), + Arc::new(StringArray::from(a.env_triple)), + ]; + Ok(RecordBatch::try_new(schema, cols)?) +} + +fn build_compression_time_batch(a: CompressionTimeAccum) -> Result { + let schema = Arc::new(Schema::new(vec![ + Field::new("measurement_id", DataType::Int64, false), + Field::new("commit_sha", DataType::Utf8, false), + Field::new("dataset", DataType::Utf8, false), + Field::new("dataset_variant", DataType::Utf8, true), + Field::new("format", DataType::Utf8, false), + Field::new("op", DataType::Utf8, false), + Field::new("value_ns", DataType::Int64, false), + Field::new( + "all_runtimes_ns", + DataType::List(Arc::new(Field::new("item", DataType::Int64, false))), + false, + ), + Field::new("env_triple", DataType::Utf8, true), + ])); + let cols: Vec = vec![ + Arc::new(Int64Array::from(a.measurement_id)), + Arc::new(StringArray::from(a.commit_sha)), + Arc::new(StringArray::from(a.dataset)), + Arc::new(StringArray::from(a.dataset_variant)), + Arc::new(StringArray::from(a.format)), + Arc::new(StringArray::from(a.op)), + Arc::new(Int64Array::from(a.value_ns)), + Arc::new(build_list_int64(a.all_runtimes_ns)), + Arc::new(StringArray::from(a.env_triple)), + ]; + Ok(RecordBatch::try_new(schema, cols)?) 
+} + +fn build_random_access_batch(a: RandomAccessAccum) -> Result { + let schema = Arc::new(Schema::new(vec![ + Field::new("measurement_id", DataType::Int64, false), + Field::new("commit_sha", DataType::Utf8, false), + Field::new("dataset", DataType::Utf8, false), + Field::new("format", DataType::Utf8, false), + Field::new("value_ns", DataType::Int64, false), + Field::new( + "all_runtimes_ns", + DataType::List(Arc::new(Field::new("item", DataType::Int64, false))), + false, + ), + Field::new("env_triple", DataType::Utf8, true), + ])); + let cols: Vec = vec![ + Arc::new(Int64Array::from(a.measurement_id)), + Arc::new(StringArray::from(a.commit_sha)), + Arc::new(StringArray::from(a.dataset)), + Arc::new(StringArray::from(a.format)), + Arc::new(Int64Array::from(a.value_ns)), + Arc::new(build_list_int64(a.all_runtimes_ns)), + Arc::new(StringArray::from(a.env_triple)), + ]; + Ok(RecordBatch::try_new(schema, cols)?) +} + +fn build_compression_size_batch(a: CompressionSizeAccum) -> Result { + let n = a.rows.len(); + let mut measurement_id = Vec::with_capacity(n); + let mut commit_sha = Vec::with_capacity(n); + let mut dataset = Vec::with_capacity(n); + let mut dataset_variant = Vec::with_capacity(n); + let mut format = Vec::with_capacity(n); + let mut value_bytes = Vec::with_capacity(n); + for (mid, cs) in a.rows { + measurement_id.push(mid); + commit_sha.push(cs.commit_sha); + dataset.push(cs.dataset); + dataset_variant.push(cs.dataset_variant); + format.push(cs.format); + value_bytes.push(cs.value_bytes); + } + let schema = Arc::new(Schema::new(vec![ + Field::new("measurement_id", DataType::Int64, false), + Field::new("commit_sha", DataType::Utf8, false), + Field::new("dataset", DataType::Utf8, false), + Field::new("dataset_variant", DataType::Utf8, true), + Field::new("format", DataType::Utf8, false), + Field::new("value_bytes", DataType::Int64, false), + ])); + let cols: Vec = vec![ + Arc::new(Int64Array::from(measurement_id)), + Arc::new(StringArray::from(commit_sha)), + Arc::new(StringArray::from(dataset)), + Arc::new(StringArray::from(dataset_variant)), + Arc::new(StringArray::from(format)), + Arc::new(Int64Array::from(value_bytes)), + ]; + Ok(RecordBatch::try_new(schema, cols)?) +} + +/// Build a non-nullable `List` Arrow array from one inner Vec +/// per row. The outer list is non-null; inner i64 values are non-null. +fn build_list_int64(values: Vec>) -> ListArray { + let mut offsets: Vec = Vec::with_capacity(values.len() + 1); + offsets.push(0); + let mut flat: Vec = Vec::new(); + for inner in values { + flat.extend_from_slice(&inner); + offsets.push(flat.len() as i32); + } + let values_arr = Int64Array::from(flat); + let field = Arc::new(Field::new("item", DataType::Int64, false)); + ListArray::new( + field, + OffsetBuffer::new(offsets.into()), + Arc::new(values_arr), + None, + ) +} + /// Print the summary in a human-readable form. Returned by the CLI. 
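+///
+/// Abridged sketch of the rendered output (counts illustrative):
+///
+/// ```text
+/// Missing commit: 0
+/// Skipped (no value): 2
+/// Deduplicated: 1
+/// ```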
impl std::fmt::Display for MigrationSummary { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -580,6 +787,8 @@ impl std::fmt::Display for MigrationSummary { writeln!(f, "Inserted (file sizes): {}", self.file_size_inserted)?; writeln!(f, "Missing commit: {}", self.missing_commit)?; writeln!(f, "Skipped (no value): {}", self.skipped_no_value)?; + writeln!(f, "Skipped (intentional): {}", self.skipped_intentional)?; + writeln!(f, "Deduplicated: {}", self.deduped)?; writeln!( f, "Uncategorized: {} ({:.2}%)", diff --git a/benchmarks-website/migrate/tests/classifier.rs b/benchmarks-website/migrate/tests/classifier.rs index e8288751d62..a26658cea53 100644 --- a/benchmarks-website/migrate/tests/classifier.rs +++ b/benchmarks-website/migrate/tests/classifier.rs @@ -6,8 +6,11 @@ use rstest::rstest; use serde_json::json; +use vortex_bench_migrate::classifier::Outcome; +use vortex_bench_migrate::classifier::Skip; use vortex_bench_migrate::classifier::V3Bin; use vortex_bench_migrate::classifier::classify; +use vortex_bench_migrate::classifier::classify_outcome; use vortex_bench_migrate::classifier::format_query; use vortex_bench_migrate::classifier::rename_engine; use vortex_bench_migrate::v2::V2Record; @@ -263,7 +266,6 @@ fn compression_size_records(#[case] name: &str, #[case] expected: V3Bin) { #[case::ratio_size_vortex_parquet("vortex:parquet-zstd size/clickbench")] #[case::ratio_size_vortex_raw("vortex:raw size/clickbench")] #[case::throughput("compress throughput/clickbench")] -#[case::fineweb_skipped("fineweb_q01/datafusion:parquet")] #[case::nonsense_prefix("not-a-known-bench/series")] fn unmapped_records_yield_none(#[case] name: &str) { let r = record(name); @@ -274,6 +276,94 @@ fn unmapped_records_yield_none(#[case] name: &str) { ); } +#[test] +fn parquet_zstd_size_is_deprecated() { + // `parquet-zstd` is not on the v3 emitter's format allowlist, so + // historical `parquet-zstd size/...` records bucket under + // Skip::Deprecated and don't render as orphan charts in v3. + let r = record("parquet-zstd size/clickbench"); + assert!(matches!( + classify_outcome(&r), + Outcome::Skip(Skip::Deprecated) + )); +} + +#[test] +fn vortex_parquet_zstd_ratio_is_intentional_skip() { + let r = record("vortex:parquet-zstd ratio compress time/clickbench"); + assert!(matches!( + classify_outcome(&r), + Outcome::Skip(Skip::DerivedRatio) + )); +} + +#[test] +fn vortex_parquet_zst_typo_ratio_is_intentional_skip() { + // `parquet-zst` (no trailing `d`) was emitted by some v2 runs. + // Both spellings should classify as derived ratios. + for name in [ + "vortex:parquet-zst ratio compress time/clickbench", + "vortex:parquet-zst ratio decompress time/clickbench", + ] { + let r = record(name); + assert!( + matches!(classify_outcome(&r), Outcome::Skip(Skip::DerivedRatio)), + "{name:?} should be DerivedRatio", + ); + } +} + +#[test] +fn throughput_is_intentional_skip() { + let r = record("compress throughput/clickbench"); + assert!(matches!( + classify_outcome(&r), + Outcome::Skip(Skip::Throughput) + )); +} + +#[test] +fn unknown_prefix_is_unknown() { + let r = record("not-a-known-bench/series"); + assert!(matches!(classify_outcome(&r), Outcome::Unknown)); +} + +#[test] +fn gharchive_q00_is_deprecated() { + // gharchive isn't on the v3 query-suite allowlist, so historical + // gharchive query records bucket as Skip::Deprecated. 
+ let r = record("gharchive_q00/datafusion:parquet"); + assert!(matches!( + classify_outcome(&r), + Outcome::Skip(Skip::Deprecated) + )); +} + +#[test] +fn fineweb_q00_is_deprecated() { + // fineweb isn't on the v3 query-suite allowlist, so historical + // fineweb query records bucket as Skip::Deprecated. + let r = record("fineweb_q00/datafusion:parquet"); + assert!(matches!( + classify_outcome(&r), + Outcome::Skip(Skip::Deprecated) + )); +} + +#[test] +fn engine_casing_lowercased() { + // Older v2 records emitted display-case engines like `DataFusion` + // and `DuckDB`. The classifier lowercases at push time so dedup + // collapses display-case rows into the canonical lowercase ones. + let r = record("clickbench_q07/DataFusion:parquet"); + let outcome = classify_outcome(&r); + let Outcome::Bin(V3Bin::Query { engine, format, .. }) = outcome else { + panic!("expected Bin(Query), got {outcome:?}"); + }; + assert_eq!(engine, "datafusion"); + assert_eq!(format, "parquet"); +} + #[test] fn rename_engine_pins_canonical_outputs() { assert_eq!(rename_engine("vortex-tokio-local-disk"), "vortex-nvme"); diff --git a/benchmarks-website/migrate/tests/end_to_end.rs b/benchmarks-website/migrate/tests/end_to_end.rs index b389f77c421..a8328342c9a 100644 --- a/benchmarks-website/migrate/tests/end_to_end.rs +++ b/benchmarks-website/migrate/tests/end_to_end.rs @@ -1,11 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Inline JSONL fixture exercising 1 record per kind through the full -//! migration into a tempdir DuckDB. No live S3. +//! Inline JSONL fixtures driven through the full migration into a +//! tempdir DuckDB. No live S3. use std::fs::File; use std::io::Write; +use std::path::Path; use duckdb::Connection; use flate2::Compression; @@ -23,25 +24,34 @@ const DATA_JSONL: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_ {"name":"random-access/taxi/take/parquet-tokio-local-disk","commit_id":"deadbeef","unit":"ns","value":777,"all_runtimes":[700,777,800]} "#; -fn write_local_dir() -> TempDir { +/// Build a local-source fixture directory. Caller supplies the contents +/// of `commits.json`, `data.json.gz`, and any number of +/// `file-sizes-*.json.gz` files (name → contents). +fn build_fixture(commits: &str, data: &str, file_sizes: &[(&str, &str)]) -> TempDir { let dir = TempDir::new().expect("tempdir"); - { - let mut f = File::create(dir.path().join("commits.json")).unwrap(); - f.write_all(COMMITS_JSONL.as_bytes()).unwrap(); + write_text(&dir.path().join("commits.json"), commits); + write_gz(&dir.path().join("data.json.gz"), data); + for (name, body) in file_sizes { + write_gz(&dir.path().join(name), body); } - { - let f = File::create(dir.path().join("data.json.gz")).unwrap(); - let mut gz = GzEncoder::new(f, Compression::default()); - gz.write_all(DATA_JSONL.as_bytes()).unwrap(); - gz.finish().unwrap(); - } - // No file-sizes-*.json.gz to keep the fixture minimal. 
dir } +fn write_text(path: &Path, body: &str) { + let mut f = File::create(path).unwrap(); + f.write_all(body.as_bytes()).unwrap(); +} + +fn write_gz(path: &Path, body: &str) { + let f = File::create(path).unwrap(); + let mut gz = GzEncoder::new(f, Compression::default()); + gz.write_all(body.as_bytes()).unwrap(); + gz.finish().unwrap(); +} + #[test] fn migrate_inline_fixture_populates_each_table() { - let src_dir = write_local_dir(); + let src_dir = build_fixture(COMMITS_JSONL, DATA_JSONL, &[]); let target_dir = TempDir::new().unwrap(); let target = target_dir.path().join("v3.duckdb"); @@ -109,3 +119,66 @@ fn migrate_inline_fixture_populates_each_table() { assert_eq!(dataset, "taxi/take"); assert_eq!(format, "parquet"); } + +#[test] +fn dedup_collision_keeps_one_row() { + // Two data.json.gz lines whose query-measurement dim columns are + // identical (same commit / dataset / engine / format / query_idx, + // and `storage` collapses to "nvme" since `storage` is unset). + // Different `value`s. The accumulator's HashSet + // should drop the second one and bump `summary.deduped`. + const DATA: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":111} +{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":222} +"#; + + let src_dir = build_fixture(COMMITS_JSONL, DATA, &[]); + let target_dir = TempDir::new().unwrap(); + let target = target_dir.path().join("v3.duckdb"); + + let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); + + assert_eq!(summary.records_read, 2, "summary={summary}"); + assert_eq!(summary.query_inserted, 1, "summary={summary}"); + assert_eq!(summary.deduped, 1, "summary={summary}"); + + let conn = Connection::open(&target).unwrap(); + let n: i64 = conn + .query_row("SELECT COUNT(*) FROM query_measurements", [], |r| r.get(0)) + .unwrap(); + assert_eq!(n, 1); +} + +#[test] +fn file_sizes_sum_into_one_row() { + // Two file-sizes rows sharing (commit, benchmark, format, + // scale_factor) and value_bytes 100 + 200 must collapse to a + // single compression_sizes row with 300. 
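+    //
+    // They collide on `mid` because the per-file `file` name is not part
+    // of the CompressionSize row; the measurement id covers only the dim
+    // columns, otherwise these two rows could never merge into one.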
+ const FILE_SIZES: &str = r#"{"commit_id":"deadbeef","benchmark":"clickbench","scale_factor":"1.0","format":"vortex-file-compressed","file":"part-0.vortex","size_bytes":100} +{"commit_id":"deadbeef","benchmark":"clickbench","scale_factor":"1.0","format":"vortex-file-compressed","file":"part-1.vortex","size_bytes":200} +"#; + + let src_dir = build_fixture( + COMMITS_JSONL, + "", + &[("file-sizes-clickbench.json.gz", FILE_SIZES)], + ); + let target_dir = TempDir::new().unwrap(); + let target = target_dir.path().join("v3.duckdb"); + + let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); + + assert_eq!(summary.file_size_inserted, 2, "summary={summary}"); + assert_eq!(summary.compression_size_inserted, 1, "summary={summary}"); + + let conn = Connection::open(&target).unwrap(); + let n: i64 = conn + .query_row("SELECT COUNT(*) FROM compression_sizes", [], |r| r.get(0)) + .unwrap(); + assert_eq!(n, 1); + let value_bytes: i64 = conn + .query_row("SELECT value_bytes FROM compression_sizes", [], |r| { + r.get(0) + }) + .unwrap(); + assert_eq!(value_bytes, 300); +} From b0281483bdbb46ad3e82c4876d2751a53d39949e Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Sun, 26 Apr 2026 21:02:26 -0400 Subject: [PATCH 5/5] clean up and fix bugs Signed-off-by: Connor Tsui --- Cargo.lock | 1 + benchmarks-website/migrate/Cargo.toml | 1 + benchmarks-website/migrate/src/classifier.rs | 46 +++++++++-- benchmarks-website/migrate/src/migrate.rs | 56 +++++++++---- benchmarks-website/migrate/src/source.rs | 22 +++++- .../migrate/tests/classifier.rs | 59 +++++++++++++- .../migrate/tests/end_to_end.rs | 79 +++++++++++++++++++ 7 files changed, 237 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 20075443c36..5315ba5ef7f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10409,6 +10409,7 @@ dependencies = [ "tracing", "tracing-subscriber", "vortex-bench-server", + "vortex-utils", ] [[package]] diff --git a/benchmarks-website/migrate/Cargo.toml b/benchmarks-website/migrate/Cargo.toml index f9b83d5d543..45a752df397 100644 --- a/benchmarks-website/migrate/Cargo.toml +++ b/benchmarks-website/migrate/Cargo.toml @@ -34,6 +34,7 @@ tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } tracing = { workspace = true, features = ["std"] } tracing-subscriber = { workspace = true, features = ["env-filter", "fmt"] } vortex-bench-server = { path = "../server" } +vortex-utils = { workspace = true } [dev-dependencies] rstest = { workspace = true } diff --git a/benchmarks-website/migrate/src/classifier.rs b/benchmarks-website/migrate/src/classifier.rs index 6b3368c64b8..8a17b31fcd2 100644 --- a/benchmarks-website/migrate/src/classifier.rs +++ b/benchmarks-website/migrate/src/classifier.rs @@ -424,6 +424,12 @@ pub enum Skip { /// Dim outside the v3 emitter's allowlist (e.g. `parquet-zstd`, /// historical-only suites no longer in CI). Deprecated, + /// v2 memory measurements (`*_memory/*` records). Carry top-level + /// `peak_physical_memory` / `peak_virtual_memory` / + /// `physical_memory_delta` / `virtual_memory_delta` fields that + /// `V2Record` doesn't deserialize. Not migrated for alpha; merging + /// into the corresponding QueryMeasurement row is future work. + HistoricalMemory, } /// Engines the v3 emitter produces today. Anything else is historical @@ -451,9 +457,18 @@ const V3_FORMATS: &[&str] = &[ /// classify (so historical analyses stay coherent) but get bucketed /// as `Skip::Deprecated` so they don't render as orphan charts in v3. 
/// -/// ORCHESTRATOR NOTE: add `fineweb` and/or `gharchive` here if a CI -/// grep shows v3 still emits them. -const V3_QUERY_SUITES: &[&str] = &["clickbench", "tpch", "tpcds", "statpopgen", "polarsignals"]; +/// `fineweb` is included because `.github/workflows/sql-benchmarks.yml` +/// still has `fineweb` and `fineweb-s3` matrix entries. `gharchive` +/// stays excluded — it's defined in `vortex-bench` but no current +/// workflow runs it. +const V3_QUERY_SUITES: &[&str] = &[ + "clickbench", + "tpch", + "tpcds", + "statpopgen", + "polarsignals", + "fineweb", +]; /// Returns true if every dim that v3 stores as a column is on the /// emitter's current allowlist. Dim values outside the allowlist mean @@ -488,6 +503,16 @@ pub fn classify_outcome(record: &V2Record) -> Outcome { if record.name.contains(" throughput") { return Outcome::Skip(Skip::Throughput); } + // v2 memory records: e.g. "clickbench_q07_memory/datafusion:parquet". + // Match the `_memory/` infix BEFORE the engine/format split, so they + // route to a known Skip variant instead of slipping through to + // Outcome::Unknown and tripping the 5% gate. + let lower = record.name.to_lowercase(); + if let Some((head, _)) = lower.split_once('/') + && head.ends_with("_memory") + { + return Outcome::Skip(Skip::HistoricalMemory); + } let Some(group) = get_group(record) else { return Outcome::Unknown; }; @@ -613,7 +638,7 @@ fn bin_compression_time(cls: &V2Classification, _record: &V2Record) -> Option Option { +fn bin_compression_size(cls: &V2Classification, record: &V2Record) -> Option { let lc = cls.chart.to_lowercase(); // Ratios like "VORTEX:PARQUET ZSTD SIZE" / "VORTEX:LANCE SIZE" / // "VORTEX:RAW SIZE" are derived from compression_sizes at read @@ -641,9 +666,20 @@ fn bin_compression_size(cls: &V2Classification, _record: &V2Record) -> Option { let rar = RandomAccessTime { @@ -495,15 +497,22 @@ struct QueryAccum { physical_delta: Vec>, virtual_delta: Vec>, env_triple: Vec>, - seen: HashSet, + /// `mid` -> index in the parallel column vecs. Lets us look up the + /// kept row's `value_ns` on collision so we can flag conflicts. 
+ seen: HashMap, } impl QueryAccum { fn push(&mut self, mid: i64, r: QueryMeasurement, summary: &mut MigrationSummary) { - if !self.seen.insert(mid) { + if let Some(&idx) = self.seen.get(&mid) { summary.deduped += 1; + if self.value_ns[idx] != r.value_ns { + summary.deduped_with_conflict += 1; + } return; } + let idx = self.measurement_id.len(); + self.seen.insert(mid, idx); self.measurement_id.push(mid); self.commit_sha.push(r.commit_sha); self.dataset.push(r.dataset); @@ -534,15 +543,20 @@ struct CompressionTimeAccum { value_ns: Vec, all_runtimes_ns: Vec>, env_triple: Vec>, - seen: HashSet, + seen: HashMap, } impl CompressionTimeAccum { fn push(&mut self, mid: i64, r: CompressionTime, summary: &mut MigrationSummary) { - if !self.seen.insert(mid) { + if let Some(&idx) = self.seen.get(&mid) { summary.deduped += 1; + if self.value_ns[idx] != r.value_ns { + summary.deduped_with_conflict += 1; + } return; } + let idx = self.measurement_id.len(); + self.seen.insert(mid, idx); self.measurement_id.push(mid); self.commit_sha.push(r.commit_sha); self.dataset.push(r.dataset); @@ -564,15 +578,20 @@ struct RandomAccessAccum { value_ns: Vec, all_runtimes_ns: Vec>, env_triple: Vec>, - seen: HashSet, + seen: HashMap, } impl RandomAccessAccum { fn push(&mut self, mid: i64, r: RandomAccessTime, summary: &mut MigrationSummary) { - if !self.seen.insert(mid) { + if let Some(&idx) = self.seen.get(&mid) { summary.deduped += 1; + if self.value_ns[idx] != r.value_ns { + summary.deduped_with_conflict += 1; + } return; } + let idx = self.measurement_id.len(); + self.seen.insert(mid, idx); self.measurement_id.push(mid); self.commit_sha.push(r.commit_sha); self.dataset.push(r.dataset); @@ -594,7 +613,15 @@ struct CompressionSizeAccum { impl CompressionSizeAccum { /// data.json.gz path: latest write wins, mirroring the prior /// `ON CONFLICT DO UPDATE SET value_bytes = excluded.value_bytes`. - fn push_replace(&mut self, mid: i64, r: CompressionSize) { + /// Bumps `deduped_with_conflict` when an existing row's + /// `value_bytes` differs from the incoming row's, so silent + /// value-corruption is observable. + fn push_replace(&mut self, mid: i64, r: CompressionSize, summary: &mut MigrationSummary) { + if let Some(existing) = self.rows.get(&mid) + && existing.value_bytes != r.value_bytes + { + summary.deduped_with_conflict += 1; + } self.rows.insert(mid, r); } @@ -789,6 +816,7 @@ impl std::fmt::Display for MigrationSummary { writeln!(f, "Skipped (no value): {}", self.skipped_no_value)?; writeln!(f, "Skipped (intentional): {}", self.skipped_intentional)?; writeln!(f, "Deduplicated: {}", self.deduped)?; + writeln!(f, "Dedup w/ value diff: {}", self.deduped_with_conflict)?; writeln!( f, "Uncategorized: {} ({:.2}%)", diff --git a/benchmarks-website/migrate/src/source.rs b/benchmarks-website/migrate/src/source.rs index 340a9bdb60f..c18e86a63ca 100644 --- a/benchmarks-website/migrate/src/source.rs +++ b/benchmarks-website/migrate/src/source.rs @@ -120,7 +120,21 @@ fn open_s3(name: &str) -> Result> { } /// Suite IDs we know publish a `file-sizes-{id}.json.gz` to S3. -/// Matches the `matrix.id` values in `.github/workflows/sql-benchmarks.yml` -/// at the time of writing. New suites mean a new entry here. -const KNOWN_FILE_SIZES_SUITES: &[&str] = - &["clickbench", "tpch", "tpcds", "statpopgen", "polarsignals"]; +/// +/// Source of truth: the `matrix.id` values in +/// `.github/workflows/sql-benchmarks.yml`'s `benchmark_matrix` default. 
+/// The post-bench `file-sizes` step uploads `file-sizes-${{ matrix.id +/// }}.json.gz`, so this list must match those IDs verbatim. Adding a +/// new matrix entry to that workflow means adding the same ID here. +const KNOWN_FILE_SIZES_SUITES: &[&str] = &[ + "clickbench-nvme", + "tpch-nvme", + "tpch-s3", + "tpch-nvme-10", + "tpch-s3-10", + "tpcds-nvme", + "statpopgen", + "fineweb", + "fineweb-s3", + "polarsignals", +]; diff --git a/benchmarks-website/migrate/tests/classifier.rs b/benchmarks-website/migrate/tests/classifier.rs index a26658cea53..cddca0c517c 100644 --- a/benchmarks-website/migrate/tests/classifier.rs +++ b/benchmarks-website/migrate/tests/classifier.rs @@ -340,16 +340,67 @@ fn gharchive_q00_is_deprecated() { } #[test] -fn fineweb_q00_is_deprecated() { - // fineweb isn't on the v3 query-suite allowlist, so historical - // fineweb query records bucket as Skip::Deprecated. +fn fineweb_q00_classifies() { + // fineweb is on V3_QUERY_SUITES (still emitted by v3 CI per + // .github/workflows/sql-benchmarks.yml's `fineweb` matrix entry), + // so historical fineweb records ingest like any other suite. let r = record("fineweb_q00/datafusion:parquet"); assert!(matches!( classify_outcome(&r), - Outcome::Skip(Skip::Deprecated) + Outcome::Bin(V3Bin::Query { .. }) )); } +#[test] +fn memory_record_is_historical_memory_skip() { + // v2 emitted `_q_memory/:` records that + // carry top-level memory fields V2Record doesn't deserialize. + // Skip them with a known variant so they don't trip the 5% gate. + let r = record("clickbench_q07_memory/datafusion:parquet"); + assert!(matches!( + classify_outcome(&r), + Outcome::Skip(Skip::HistoricalMemory) + )); +} + +#[test] +fn tpch_compression_size_carries_scale_factor() { + // The data.json.gz "vortex size/tpch" path needs to derive + // dataset_variant from the v2 record's `dataset` object, the same + // way the file-sizes path does. Otherwise SF=10 rows from the two + // sources never collide on `mid` and produce duplicate rows. + let mut r = record("vortex size/tpch"); + r.dataset = Some(serde_json::json!({ "tpch": { "scale_factor": "10" } })); + let outcome = classify_outcome(&r); + let Outcome::Bin(V3Bin::CompressionSize { + dataset, + dataset_variant, + format, + }) = outcome + else { + panic!("expected Bin(CompressionSize), got {outcome:?}"); + }; + assert_eq!(dataset, "tpch"); + assert_eq!(dataset_variant, Some("10".into())); + assert_eq!(format, "vortex-file-compressed"); +} + +#[test] +fn tpch_compression_size_drops_default_scale_factor() { + // SF "1.0" matches the file-sizes path's filter and collapses to + // dataset_variant: None. + let mut r = record("vortex size/tpch"); + r.dataset = Some(serde_json::json!({ "tpch": { "scale_factor": "1.0" } })); + let outcome = classify_outcome(&r); + let Outcome::Bin(V3Bin::CompressionSize { + dataset_variant, .. + }) = outcome + else { + panic!("expected Bin(CompressionSize), got {outcome:?}"); + }; + assert_eq!(dataset_variant, None); +} + #[test] fn engine_casing_lowercased() { // Older v2 records emitted display-case engines like `DataFusion` diff --git a/benchmarks-website/migrate/tests/end_to_end.rs b/benchmarks-website/migrate/tests/end_to_end.rs index a8328342c9a..210092a4058 100644 --- a/benchmarks-website/migrate/tests/end_to_end.rs +++ b/benchmarks-website/migrate/tests/end_to_end.rs @@ -148,6 +148,85 @@ fn dedup_collision_keeps_one_row() { assert_eq!(n, 1); } +#[test] +fn dedup_with_conflicting_value_ns_is_counted() { + // Same dim columns, different `value`s. 
Dedup keeps the first + // and bumps `deduped_with_conflict` because the dropped row's + // value_ns differed from the kept row's. This is the signal we + // care about when watching for silent value-corruption across + // duplicated v2 emissions. + const DATA: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":111} +{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":222} +"#; + + let src_dir = build_fixture(COMMITS_JSONL, DATA, &[]); + let target_dir = TempDir::new().unwrap(); + let target = target_dir.path().join("v3.duckdb"); + + let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); + + assert_eq!(summary.deduped, 1, "summary={summary}"); + assert_eq!(summary.deduped_with_conflict, 1, "summary={summary}"); +} + +#[test] +fn dedup_with_matching_value_ns_does_not_count_conflict() { + // Same dim columns AND identical `value`s. Dedup still drops the + // duplicate, but `deduped_with_conflict` stays 0. + const DATA: &str = r#"{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":111} +{"name":"clickbench_q07/datafusion:parquet","commit_id":"deadbeef","unit":"ns","value":111} +"#; + + let src_dir = build_fixture(COMMITS_JSONL, DATA, &[]); + let target_dir = TempDir::new().unwrap(); + let target = target_dir.path().join("v3.duckdb"); + + let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); + + assert_eq!(summary.deduped, 1, "summary={summary}"); + assert_eq!(summary.deduped_with_conflict, 0, "summary={summary}"); +} + +#[test] +fn compression_size_data_and_file_sizes_merge() { + // A `vortex size/tpch` record from data.json.gz and a + // file-sizes-tpch-nvme.json.gz row covering the same (commit, + // dataset, format, SF) tuple should produce the *same* + // measurement_id so the in-memory accumulator merges them into + // one row instead of two. + // + // Both sources use scale_factor "1.0", which both code paths + // filter out → dataset_variant: None on both sides → matching mid. + const DATA: &str = r#"{"name":"vortex size/tpch","commit_id":"deadbeef","unit":"bytes","value":200,"dataset":{"tpch":{"scale_factor":"1.0"}}} +"#; + const FILE_SIZES: &str = r#"{"commit_id":"deadbeef","benchmark":"tpch","scale_factor":"1.0","format":"vortex-file-compressed","file":"part-0.vortex","size_bytes":100} +"#; + + let src_dir = build_fixture( + COMMITS_JSONL, + DATA, + &[("file-sizes-tpch-nvme.json.gz", FILE_SIZES)], + ); + let target_dir = TempDir::new().unwrap(); + let target = target_dir.path().join("v3.duckdb"); + + let summary = migrate::run(&Source::Local(src_dir.path().into()), &target).unwrap(); + + assert_eq!(summary.compression_size_inserted, 1, "summary={summary}"); + + let conn = Connection::open(&target).unwrap(); + let (n, value_bytes): (i64, i64) = conn + .query_row( + "SELECT COUNT(*), SUM(value_bytes) FROM compression_sizes", + [], + |r| Ok((r.get(0)?, r.get(1)?)), + ) + .unwrap(); + assert_eq!(n, 1); + // data.json.gz seeds value_bytes=200, file-sizes adds 100. + assert_eq!(value_bytes, 300); +} + #[test] fn file_sizes_sum_into_one_row() { // Two file-sizes rows sharing (commit, benchmark, format,