From b46a86d48a552f64e48dd1d04b5ef89ba85d801b Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru
Date: Sun, 31 May 2026 10:41:52 +0200
Subject: [PATCH] bench: end-to-end search throughput via uumain

The existing match/throughput benches call Matcher::match_line on
pre-split lines, so they only measure matching in isolation and cannot
observe how the searcher feeds data to the matcher.

Add a 'search' group that drives the whole pipeline through uumain over
a multi-MB file: a literal pattern (which a buffer-at-a-time searcher
can speed up) and an extended-regex control (which it cannot). Uses -q
with a non-matching pattern for a silent full-file scan.
---
 benches/grep_bench.rs | 70 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/benches/grep_bench.rs b/benches/grep_bench.rs
index 3232f08..a87ab50 100644
--- a/benches/grep_bench.rs
+++ b/benches/grep_bench.rs
@@ -284,5 +284,73 @@ fn bench_throughput(c: &mut Criterion) {
     group.finish();
 }
 
-criterion_group!(benches, bench_compile, bench_match, bench_throughput);
+/// End-to-end search throughput, driven through the real `uumain` entry point
+/// so the whole pipeline (input buffering, searcher, output) is exercised.
+///
+/// `bench_match` / `bench_throughput` call `Matcher::match_line` on pre-split
+/// lines, which measures matching in isolation. They cannot see a change to how
+/// the *searcher* feeds data to the matcher (e.g. scanning whole buffers instead
+/// of testing one line at a time), because they never run the searcher. These
+/// cases do: a literal pattern (which a buffer-at-a-time engine can accelerate)
+/// and an extended-regex control (which cannot), over a multi-megabyte file.
+fn bench_search(c: &mut Criterion) {
+    use std::ffi::OsString;
+
+    // A log-like file large enough to cross many internal read buffers.
+    let mut content = String::new();
+    for i in 0..80_000u32 {
+        if i % 100 == 0 {
+            content.push_str(&format!(
+                "2024-01-15 10:30:{:02} ERROR worker-{i} connection reset\n",
+                i % 60
+            ));
+        } else {
+            content.push_str(&format!(
+                "2024-01-15 10:30:{:02} INFO worker-{i} request handled in {}ms\n",
+                i % 60,
+                i % 1000
+            ));
+        }
+    }
+    assert!(content.len() > 4 * 1024 * 1024);
+
+    let mut path = std::env::temp_dir();
+    path.push(format!("uu_grep_bench_{}.log", std::process::id()));
+    std::fs::write(&path, &content).unwrap();
+    let path_arg = path.clone().into_os_string();
+
+    // `-q` with a pattern that never matches forces a full scan of the file and
+    // produces no output, so the timing reflects pure scanning throughput.
+    let run = |extra_flag: Option<&str>, pattern: &str| {
+        let mut args: Vec<OsString> = vec![OsString::from("grep"), OsString::from("-q")];
+        if let Some(flag) = extra_flag {
+            args.push(OsString::from(flag));
+        }
+        args.push(OsString::from(pattern));
+        args.push(path_arg.clone());
+        // No match => Err(exit code 1); we only care about the work, not status.
+        let _ = uu_grep::uumain(args.into_iter());
+    };
+
+    let mut group = c.benchmark_group("search");
+
+    group.bench_function("scan_literal_no_match", |b| {
+        b.iter(|| run(None, black_box("NONEXISTENT_TOKEN_XYZ")))
+    });
+
+    group.bench_function("scan_regex_no_match", |b| {
+        b.iter(|| run(Some("-E"), black_box("NON[0-9]EXISTENT_TOKEN")))
+    });
+
+    group.finish();
+    let _ = std::fs::remove_file(&path);
+}
+
+criterion_group!(
+    benches,
+    bench_compile,
+    bench_match,
+    bench_throughput,
+    bench_search
+);
 criterion_main!(benches);