Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 2 additions & 26 deletions vortex-bench/src/clickbench/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ use std::path::Path;

use anyhow::Result;
use url::Url;
use vortex::error::VortexExpect;

use crate::Benchmark;
use crate::BenchmarkDataset;
use crate::IdempotentPath;
use crate::TableSpec;
use crate::clickbench::*;
use crate::utils::file::resolve_data_url;

/// ClickBench benchmark implementation
pub struct ClickBenchBenchmark {
Expand All @@ -37,31 +37,7 @@ impl ClickBenchBenchmark {
}

/// Builds the base [`Url`] under which the ClickBench data for `flavor` is located.
///
/// * `remote_data_dir` - optional remote location supplied by the caller; when it is
///   `None` a local `file:` URL is produced instead.
/// * `flavor` - interpolated into the `clickbench_{flavor}` directory name.
///
/// # Errors
///
/// Propagates the error from [`Url::parse`] when the resulting string is not a valid URL.
fn create_data_url(remote_data_dir: &Option<String>, flavor: Flavor) -> Result<Url> {
// NOTE(review): this listing looks like a diff view whose +/- markers were lost. The
// `match` below appears to be the removed implementation and the trailing
// `resolve_data_url(...)` call its replacement -- confirm against the real file before
// relying on both being present.
match remote_data_dir {
None => {
// No remote directory given: derive the location from the `clickbench_{flavor}`
// name via `to_data_path()` and render it as a `file:` URL with a trailing slash.
let basepath = format!("clickbench_{flavor}").to_data_path();
// `to_str()` yields `None` for non-UTF-8 paths; `vortex_expect` presumably panics
// with the given message in that case -- TODO confirm.
Ok(Url::parse(&format!(
"file:{}/",
basepath.to_str().vortex_expect("path should be utf8")
))?)
}
Some(remote_data_dir) => {
// A missing trailing slash only produces a warning; the value is still used as-is.
if !remote_data_dir.ends_with("/") {
tracing::warn!(
"Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev-eu/parquet/"
);
}
// Nothing here checks that the remote data exists; the log message only tells the
// user that its presence is assumed.
tracing::info!(
concat!(
"Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\\n",
"If it does not, you should kill this command, locally generate the files (by running without\\n",
"--use-remote-data-dir) and upload data/clickbench/ to some remote location.",
),
remote_data_dir,
);
// The remote directory string is parsed directly as the URL.
Ok(Url::parse(remote_data_dir)?)
}
}
// Hands the optional remote directory (borrowed as `Option<&str>`) and the same
// `clickbench_{flavor}` name to the shared `resolve_data_url` helper.
resolve_data_url(remote_data_dir.as_deref(), &format!("clickbench_{flavor}"))
}
}

Expand Down
20 changes: 8 additions & 12 deletions vortex-bench/src/clickbench/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@ use arrow_schema::TimeUnit;
use clap::ValueEnum;
use serde::Deserialize;
use serde::Serialize;
use tokio::task::JoinSet;
use tracing::info;
use vortex::error::VortexExpect;

use crate::Format;
// Re-export for use by clickbench_benchmark
pub use crate::conversions::convert_parquet_directory_to_vortex;
use crate::datasets::data_downloads::download_data;
use crate::datasets::data_downloads::download_many;

pub static HITS_SCHEMA: LazyLock<Schema> = LazyLock::new(|| {
use DataType::*;
Expand Down Expand Up @@ -193,18 +193,14 @@ impl Flavor {
Flavor::Partitioned => {
// The clickbench-provided file is missing some higher-level type info, so we reprocess it
// to add that info, see https://github.com/ClickHouse/ClickBench/issues/7.

let mut tasks = (0_u32..100).map(|idx| {
let output_path = basepath.join(Format::Parquet.name()).join(format!("hits_{idx}.parquet"));

info!("Downloading file {idx}");
info!("Downloading 100 ClickBench parquet shards");
let parquet_dir = basepath.join(Format::Parquet.name());
let downloads = (0_u32..100).map(|idx| {
let output_path = parquet_dir.join(format!("hits_{idx}.parquet"));
let url = format!("https://pub-3ba949c0f0354ac18db1f0f14f0a2c52.r2.dev/clickbench/parquet_many/hits_{idx}.parquet");
download_data(output_path, url)
}).collect::<JoinSet<_>>();

while let Some(task) = tasks.join_next().await {
task??;
}
(output_path, url)
});
download_many(downloads).await?;
}
}
Ok(())
Expand Down
Loading
Loading