Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use std::io::{IsTerminal as _, Read};
use std::path::Path;
use uucore::error::{FromIo, UResult, USimpleError};

#[derive(Clone, Copy, PartialEq, Eq)]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[doc(hidden)]
pub enum RegexMode {
Fixed,
Expand Down
191 changes: 190 additions & 1 deletion src/line_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

use memchr::memchr;
use memchr::{memchr, memrchr};
use std::fs::File;
use std::io::{self, Read as _};

Expand Down Expand Up @@ -111,4 +111,193 @@ impl LineBuffer {
self.end += n;
}
}

/// Read the next run of *complete* lines as a single slice.
///
/// Returns `Ok(None)` at end of input. Otherwise returns `Ok(Some((chunk,
/// chunk_start)))`, where `chunk` spans one or more whole lines (each ending
/// in the terminator) and `chunk_start` is the absolute byte offset of the
/// first byte of the chunk. The only exception is a final line lacking a
/// terminator, which is returned on its own as the last chunk.
///
/// This hands back as much buffered data as ends on a line boundary, so a
/// caller can scan many lines with one pass instead of line by line.
pub fn read_chunk(&mut self, file: &mut File) -> io::Result<Option<(&[u8], u64)>> {
loop {
// Hand back everything up to and including the last terminator.
if self.end > self.beg
&& let Some(off) = memrchr(self.line_terminator, &self.buffer[self.beg..self.end])
{
let beg = self.beg;
let lim = self.beg + off + 1;
let chunk_start = self.next_line_start;
self.next_line_start += (lim - beg) as u64;
self.beg = lim;
self.scan = lim;
return Ok(Some((&self.buffer[beg..lim], chunk_start)));
}

// No whole line buffered. At EOF, flush any unterminated remainder.
if self.eof {
if self.beg == self.end {
return Ok(None);
}
let beg = self.beg;
let chunk_start = self.next_line_start;
self.next_line_start += (self.end - beg) as u64;
self.beg = self.end;
self.scan = self.end;
return Ok(Some((&self.buffer[beg..self.end], chunk_start)));
}

// Slide the partial tail to the front to maximize room for reading.
if self.beg > 0 {
self.buffer.copy_within(self.beg..self.end, 0);
self.end -= self.beg;
self.beg = 0;
self.scan = 0;
}
if self.end == self.buffer.len() {
// A single line is longer than the whole buffer; grow it.
self.buffer.resize(self.buffer.len() * 2, 0);
}

let n = loop {
match file.read(&mut self.buffer[self.end..]) {
Ok(n) => break n,
Err(e) if e.kind() == io::ErrorKind::Interrupted => {}
Err(e) => return Err(e),
}
};
if n == 0 {
self.eof = true;
} else {
self.end += n;
}
}
}
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{Seek as _, SeekFrom, Write as _};
    use std::sync::atomic::{AtomicU32, Ordering};

    /// Per-process serial number that keeps temp-file names distinct between
    /// tests running in parallel.
    static COUNTER: AtomicU32 = AtomicU32::new(0);

    /// Scratch input file holding the test content, positioned at offset 0.
    /// The file is deleted (best effort) when the value is dropped.
    struct TempInput {
        file: File,
        path: std::path::PathBuf,
    }

    impl Drop for TempInput {
        fn drop(&mut self) {
            // Cleanup failures are not interesting to the tests.
            let _ = std::fs::remove_file(&self.path);
        }
    }

    /// Create a uniquely named file in the system temp directory, fill it with
    /// `content`, and rewind it so reads start at the first byte.
    fn temp_input(content: &[u8]) -> TempInput {
        let n = COUNTER.fetch_add(1, Ordering::Relaxed);
        let file_name = format!("uu_grep_lb_{}_{n}.tmp", std::process::id());
        let path = std::env::temp_dir().join(file_name);

        let mut options = std::fs::OpenOptions::new();
        options.read(true).write(true).create(true).truncate(true);
        let mut file = options.open(&path).unwrap();

        file.write_all(content).unwrap();
        file.seek(SeekFrom::Start(0)).unwrap();
        TempInput { file, path }
    }

    /// Run `read_chunk` to exhaustion over `content` and collect every chunk as
    /// an owned `(bytes, start offset)` pair.
    fn chunks(term: u8, content: &[u8]) -> Vec<(Vec<u8>, u64)> {
        let mut lb = LineBuffer::new(term);
        let mut input = temp_input(content);
        let mut collected = Vec::new();
        loop {
            match lb.read_chunk(&mut input.file).unwrap() {
                Some((bytes, offset)) => collected.push((bytes.to_vec(), offset)),
                None => break collected,
            }
        }
    }

    #[test]
    fn empty_input_yields_nothing() {
        let got = chunks(b'\n', b"");
        assert!(got.is_empty());
    }

    #[test]
    fn whole_complete_lines_come_back_as_one_chunk() {
        // The input is small enough to arrive in one read, so all of it (it
        // ends on a terminator) forms a single chunk at offset 0.
        let input = b"a\nbb\nccc\n";
        assert_eq!(chunks(b'\n', input), vec![(input.to_vec(), 0)]);
    }

    #[test]
    fn unterminated_tail_is_a_final_chunk_with_its_own_offset() {
        // The complete line "a\n" comes first; the dangling "bb" is flushed at
        // EOF and starts at byte 2.
        let expected = vec![(b"a\n".to_vec(), 0), (b"bb".to_vec(), 2)];
        assert_eq!(chunks(b'\n', b"a\nbb"), expected);
    }

    #[test]
    fn input_without_any_terminator_is_one_chunk() {
        let expected = vec![(b"abc".to_vec(), 0)];
        assert_eq!(chunks(b'\n', b"abc"), expected);
    }

    #[test]
    fn honors_a_custom_terminator() {
        let expected = vec![(b"a\0bb\0".to_vec(), 0), (b"c".to_vec(), 5)];
        assert_eq!(chunks(b'\0', b"a\0bb\0c"), expected);
    }

    #[test]
    fn reassembles_input_larger_than_the_buffer() {
        // Enough data to need many reads and to split mid-file at least once.
        let content: Vec<u8> = (0..50_000u32)
            .flat_map(|i| format!("line number {i}\n").into_bytes())
            .collect();
        assert!(content.len() > 128 * 1024);

        let got = chunks(b'\n', &content);
        assert!(got.len() > 1, "expected multiple chunks, got {}", got.len());

        // The chunks have to cover the input back to back with no gaps, and
        // since the input ends with a terminator every chunk must end on one.
        let mut next_offset = 0u64;
        let mut rebuilt = Vec::with_capacity(content.len());
        for (bytes, start) in &got {
            assert_eq!(*start, next_offset);
            assert_eq!(bytes.last().copied(), Some(b'\n'));
            next_offset += bytes.len() as u64;
            rebuilt.extend_from_slice(bytes);
        }
        assert_eq!(rebuilt, content);
    }

    #[test]
    fn grows_to_hold_a_single_overlong_line() {
        // A first line well beyond 128 KiB, followed by a short second line.
        let mut content = vec![b'x'; 300 * 1024];
        content.extend_from_slice(b"\ntail\n");

        let got = chunks(b'\n', &content);
        let mut rebuilt = Vec::new();
        for (bytes, _) in &got {
            rebuilt.extend_from_slice(bytes);
        }
        assert_eq!(rebuilt, content);
        assert_eq!(got[0].1, 0);
    }
}
99 changes: 98 additions & 1 deletion src/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// file that was distributed with this source code.

use crate::{Config, RegexMode};
use memchr::memmem;
use onig::{
EncodedBytes, Regex, RegexOptions, Region, SearchOptions, Syntax, SyntaxBehavior,
SyntaxOperator,
Expand All @@ -14,6 +15,12 @@ use uucore::error::{UResult, USimpleError};
pub struct Matcher<'a> {
    /// Borrowed configuration the matcher was built from; it also supplies the
    /// raw pattern list.
    config: &'a Config<'a>,
    /// Compiled form of each raw pattern in `config.patterns`, pushed in
    /// iteration order by the constructor.
    patterns: Vec<CompiledPattern>,
    /// One substring searcher per pattern, present only when *every* pattern is
    /// a plain literal that a raw byte search resolves exactly (see
    /// [`plain_literal`]). When set, a caller can decide a line matches by
    /// looking for any of these needles, bypassing the regex engine entirely.
    /// `None` as soon as a single pattern needs real regex evaluation.
    literal_searchers: Option<Vec<memmem::Finder<'static>>>,
}

impl<'a> Matcher<'a> {
Expand All @@ -22,7 +29,32 @@ impl<'a> Matcher<'a> {
for raw in config.patterns {
patterns.push(CompiledPattern::compile(raw, config)?);
}
Ok(Self { config, patterns })

// If we can reduce the whole pattern set to literal needles, keep a
// searcher for each so the driver can take a bulk substring-scan path.
let needles: Option<Vec<Vec<u8>>> = config
.patterns
.iter()
.map(|p| plain_literal(p, config.ignore_case, config.regex_mode))
.collect();
let literal_searchers = needles.filter(|n| !n.is_empty()).map(|n| {
n.iter()
.map(|w| memmem::Finder::new(w).into_owned())
.collect()
});

Ok(Self {
config,
patterns,
literal_searchers,
})
}

/// Borrow the per-pattern substring searchers, if there are any.
///
/// The result is `Some` only when the whole pattern set consists of plain
/// literals (no regex evaluation needed); the searcher can then scan an entire
/// buffer in one go rather than testing it line by line. Otherwise `None`.
pub fn literal_searchers(&self) -> Option<&[memmem::Finder<'static>]> {
    self.literal_searchers.as_ref().map(Vec::as_slice)
}

/// Decide whether `line` matches and return the positions to highlight.
Expand Down Expand Up @@ -194,6 +226,25 @@ impl Cursor<'_> {
}
}

/// Extract `pattern` as a raw byte needle, or return `None` when a plain
/// byte-for-byte substring search would not be *exactly* equivalent to
/// matching the pattern.
///
/// Only non-empty, ASCII, case-sensitive patterns qualify. Restricting needles
/// to ASCII keeps the byte search in agreement with the regex engine on every
/// input, invalid UTF-8 included: an ASCII byte never occurs inside a
/// multi-byte sequence, so finding it is unambiguous. Outside of
/// [`RegexMode::Fixed`] the pattern must additionally be free of every byte
/// that could act as a metacharacter; under `-F` the text is literal as-is.
fn plain_literal(pattern: &str, ignore_case: bool, mode: RegexMode) -> Option<Vec<u8>> {
    // Every byte that carries special meaning in any of our regex syntaxes.
    // A needle without these reads the same as a literal in Basic/Extended/Perl.
    const SPECIAL: &[u8] = b".*[]^$\\+?{}()|";

    let needle = pattern.as_bytes();
    if ignore_case || needle.is_empty() || !needle.is_ascii() {
        return None;
    }
    // Fixed-string mode takes the bytes verbatim; every other mode has to rule
    // out metacharacters first.
    if mode != RegexMode::Fixed && needle.iter().any(|byte| SPECIAL.contains(byte)) {
        return None;
    }
    Some(needle.to_vec())
}

struct CompiledPattern {
/// Default semantics. It's decently fast and used for searching.
leftmost: Regex,
Expand Down Expand Up @@ -289,3 +340,49 @@ impl CompiledPattern {
.is_some()
}
}

#[cfg(test)]
mod tests {
    use super::plain_literal;
    use crate::RegexMode;

    /// The modes in which the pattern text goes through a regex syntax.
    const REGEX_MODES: [RegexMode; 3] = [RegexMode::Basic, RegexMode::Extended, RegexMode::Perl];

    /// Assert that a case-sensitive `pattern` is accepted byte for byte.
    fn assert_verbatim(pattern: &str, mode: RegexMode) {
        assert_eq!(
            plain_literal(pattern, false, mode),
            Some(pattern.as_bytes().to_vec())
        );
    }

    #[test]
    fn fixed_mode_takes_any_ascii_verbatim() {
        // With -F nothing is special, so metacharacters pass through untouched.
        for pattern in ["abc", "a.*b", "a+b"] {
            assert_verbatim(pattern, RegexMode::Fixed);
        }
    }

    #[test]
    fn regex_modes_accept_metacharacter_free_literals() {
        for mode in REGEX_MODES {
            for pattern in ["ing", "Hello123"] {
                assert_verbatim(pattern, mode);
            }
        }
    }

    #[test]
    fn regex_modes_reject_anything_with_a_metacharacter() {
        const WITH_METACHARACTER: [&str; 11] = [
            "a.b", "a*", "[ab]", "^a", "a$", "a\\b", "a+", "a?", "(a)", "a|b", "a{2}",
        ];
        for mode in REGEX_MODES {
            for p in WITH_METACHARACTER {
                assert_eq!(
                    plain_literal(p, false, mode),
                    None,
                    "pattern {p:?} in {mode:?}"
                );
            }
        }
    }

    #[test]
    fn rejects_empty_case_insensitive_and_non_ascii() {
        // Empty pattern.
        assert_eq!(plain_literal("", false, RegexMode::Fixed), None);
        // Case-insensitive matching (-i) in either kind of mode.
        assert_eq!(plain_literal("abc", true, RegexMode::Fixed), None);
        assert_eq!(plain_literal("abc", true, RegexMode::Basic), None);
        // Non-ASCII text.
        assert_eq!(plain_literal("café", false, RegexMode::Fixed), None);
        assert_eq!(plain_literal("naïve", false, RegexMode::Basic), None);
    }
}
Loading
Loading