uutils · sylvestre · May 31, 2026 · May 31, 2026
diff --git a/src/lib.rs b/src/lib.rs
@@ -22,7 +22,7 @@ use std::io::{IsTerminal as _, Read};
 use std::path::Path;
 use uucore::error::{FromIo, UResult, USimpleError};
 
-#[derive(Clone, Copy, PartialEq, Eq)]
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
 #[doc(hidden)]
 pub enum RegexMode {
     Fixed,

diff --git a/src/line_buffer.rs b/src/line_buffer.rs
@@ -3,7 +3,7 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
 
-use memchr::memchr;
+use memchr::{memchr, memrchr};
 use std::fs::File;
 use std::io::{self, Read as _};
 
@@ -111,4 +111,193 @@ impl LineBuffer {
             self.end += n;
         }
     }
+
+    /// Read the next run of *complete* lines from `file` as a single slice.
+    ///
+    /// Returns `Ok(None)` at end of input, else `Ok(Some((chunk, chunk_start)))`:
+    /// `chunk` spans one or more whole lines (each ending in the terminator) and
+    /// `chunk_start` is the absolute byte offset of its first byte. The only
+    /// exception is a final line lacking a terminator, returned on its own as the
+    /// last chunk. `Interrupted` reads are retried; other read errors are returned.
+    ///
+    /// Everything buffered up to the last terminator is handed back at once, so a
+    /// caller can scan many lines with one pass instead of line by line.
+    pub fn read_chunk(&mut self, file: &mut File) -> io::Result<Option<(&[u8], u64)>> {
+        loop {
+            // Search backwards so the chunk ends at the *last* buffered terminator.
+            if self.end > self.beg
+                && let Some(off) = memrchr(self.line_terminator, &self.buffer[self.beg..self.end])
+            {
+                let beg = self.beg;
+                let lim = self.beg + off + 1; // one past the last terminator
+                let chunk_start = self.next_line_start;
+                self.next_line_start += (lim - beg) as u64; // offset of the following chunk
+                self.beg = lim;
+                self.scan = lim; // not read here; presumably kept in step for other readers
+                return Ok(Some((&self.buffer[beg..lim], chunk_start)));
+            }
+
+            // No whole line buffered. At EOF, flush any unterminated remainder.
+            if self.eof {
+                if self.beg == self.end {
+                    return Ok(None);
+                }
+                let beg = self.beg;
+                let chunk_start = self.next_line_start;
+                self.next_line_start += (self.end - beg) as u64;
+                self.beg = self.end;
+                self.scan = self.end;
+                return Ok(Some((&self.buffer[beg..self.end], chunk_start)));
+            }
+
+            // Slide the partial tail to the front to maximize room for reading.
+            if self.beg > 0 {
+                self.buffer.copy_within(self.beg..self.end, 0);
+                self.end -= self.beg;
+                self.beg = 0;
+                self.scan = 0;
+            }
+            if self.end == self.buffer.len() {
+                // The partial line fills the whole buffer; double it so the read has room.
+                self.buffer.resize(self.buffer.len() * 2, 0);
+            }
+
+            let n = loop {
+                match file.read(&mut self.buffer[self.end..]) {
+                    Ok(n) => break n,
+                    Err(e) if e.kind() == io::ErrorKind::Interrupted => {} // retry the read
+                    Err(e) => return Err(e),
+                }
+            };
+            if n == 0 { // zero bytes read: treat as end of input
+                self.eof = true;
+            } else {
+                self.end += n;
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::io::{Seek as _, SeekFrom, Write as _};
+    use std::sync::atomic::{AtomicU32, Ordering};
+
+    static COUNTER: AtomicU32 = AtomicU32::new(0); // sequence number used in temp file names
+
+    /// A temp file pre-loaded with `content`, rewound to the start, and removed
+    /// from disk when dropped.
+    struct TempInput {
+        file: File,
+        path: std::path::PathBuf,
+    }
+
+    impl Drop for TempInput {
+        fn drop(&mut self) {
+            let _ = std::fs::remove_file(&self.path); // best-effort; a failed delete is ignored
+        }
+    }
+
+    fn temp_input(content: &[u8]) -> TempInput {
+        let mut path = std::env::temp_dir();
+        let n = COUNTER.fetch_add(1, Ordering::Relaxed);
+        path.push(format!("uu_grep_lb_{}_{n}.tmp", std::process::id())); // pid + sequence number
+        let mut file = std::fs::OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .truncate(true)
+            .open(&path)
+            .unwrap();
+        file.write_all(content).unwrap();
+        file.seek(SeekFrom::Start(0)).unwrap(); // rewind so reads start at byte 0
+        TempInput { file, path }
+    }
+
+    /// Drain `read_chunk` into a list of (owned bytes, start offset) pairs.
+    fn chunks(term: u8, content: &[u8]) -> Vec<(Vec<u8>, u64)> {
+        let mut lb = LineBuffer::new(term);
+        let mut input = temp_input(content);
+        let mut out = Vec::new();
+        while let Some((chunk, start)) = lb.read_chunk(&mut input.file).unwrap() {
+            out.push((chunk.to_vec(), start));
+        }
+        out
+    }
+
+    #[test]
+    fn empty_input_yields_nothing() {
+        assert!(chunks(b'\n', b"").is_empty());
+    }
+
+    #[test]
+    fn whole_complete_lines_come_back_as_one_chunk() {
+        // Small input arrives in a single read, so everything up to the final
+        // terminator is one chunk starting at offset 0.
+        assert_eq!(
+            chunks(b'\n', b"a\nbb\nccc\n"),
+            vec![(b"a\nbb\nccc\n".to_vec(), 0)]
+        );
+    }
+
+    #[test]
+    fn unterminated_tail_is_a_final_chunk_with_its_own_offset() {
+        // "a\n" is the complete-line chunk; "bb" is flushed at EOF at offset 2.
+        assert_eq!(
+            chunks(b'\n', b"a\nbb"),
+            vec![(b"a\n".to_vec(), 0), (b"bb".to_vec(), 2)]
+        );
+    }
+
+    #[test]
+    fn input_without_any_terminator_is_one_chunk() {
+        assert_eq!(chunks(b'\n', b"abc"), vec![(b"abc".to_vec(), 0)]);
+    }
+
+    #[test]
+    fn honors_a_custom_terminator() {
+        assert_eq!(
+            chunks(b'\0', b"a\0bb\0c"),
+            vec![(b"a\0bb\0".to_vec(), 0), (b"c".to_vec(), 5)]
+        );
+    }
+
+    #[test]
+    fn reassembles_input_larger_than_the_buffer() {
+        // Force many reads and at least one chunk boundary mid-file.
+        let mut content = Vec::new();
+        for i in 0..50_000u32 {
+            content.extend_from_slice(format!("line number {i}\n").as_bytes());
+        }
+        assert!(content.len() > 128 * 1024); // NOTE(review): assumes a 128 KiB buffer — confirm
+
+        let got = chunks(b'\n', &content);
+        assert!(got.len() > 1, "expected multiple chunks, got {}", got.len());
+
+        // Chunks must tile the input exactly, contiguously, each ending on a
+        // line boundary (the input ends with a terminator).
+        let mut expected_start = 0u64;
+        let mut joined = Vec::new();
+        for (bytes, start) in &got {
+            assert_eq!(*start, expected_start);
+            assert_eq!(*bytes.last().unwrap(), b'\n');
+            expected_start += bytes.len() as u64;
+            joined.extend_from_slice(bytes);
+        }
+        assert_eq!(joined, content);
+    }
+
+    #[test]
+    fn grows_to_hold_a_single_overlong_line() {
+        // One line far bigger than the initial 128 KiB buffer, then a short one.
+        let mut content = vec![b'x'; 300 * 1024];
+        content.push(b'\n');
+        content.extend_from_slice(b"tail\n");
+
+        let got = chunks(b'\n', &content);
+        let joined: Vec<u8> = got.iter().flat_map(|(b, _)| b.clone()).collect();
+        assert_eq!(joined, content);
+        assert_eq!(got[0].1, 0); // the first chunk starts at absolute offset 0
+    }
 }
diff --git a/src/matcher.rs b/src/matcher.rs
@@ -4,6 +4,7 @@
 // file that was distributed with this source code.
 
 use crate::{Config, RegexMode};
+use memchr::memmem;
 use onig::{
     EncodedBytes, Regex, RegexOptions, Region, SearchOptions, Syntax, SyntaxBehavior,
     SyntaxOperator,
@@ -14,6 +15,12 @@ use uucore::error::{UResult, USimpleError};
 pub struct Matcher<'a> {
     config: &'a Config<'a>,
     patterns: Vec<CompiledPattern>,
+    /// One substring searcher per pattern, present only when *every* pattern is
+    /// a plain literal that a raw byte search resolves exactly (see
+    /// [`plain_literal`]) and at least one pattern exists; otherwise `None`.
+    /// When set, a caller can look for any needle instead of running the regex
+    /// engine. NOTE(review): confirm callers still apply any -w/-x/-v options.
+    literal_searchers: Option<Vec<memmem::Finder<'static>>>,
 }
 
 impl<'a> Matcher<'a> {
@@ -22,7 +29,32 @@ impl<'a> Matcher<'a> {
         for raw in config.patterns {
             patterns.push(CompiledPattern::compile(raw, config)?);
         }
-        Ok(Self { config, patterns })
+
+        // If we can reduce the whole pattern set to literal needles, keep a
+        // searcher for each so the driver can take a bulk substring-scan path.
+        let needles: Option<Vec<Vec<u8>>> = config
+            .patterns
+            .iter()
+            .map(|p| plain_literal(p, config.ignore_case, config.regex_mode))
+            .collect();
+        let literal_searchers = needles.filter(|n| !n.is_empty()).map(|n| {
+            n.iter()
+                .map(|w| memmem::Finder::new(w).into_owned())
+                .collect()
+        });
+
+        Ok(Self {
+            config,
+            patterns,
+            literal_searchers,
+        })
+    }
+
+    /// Borrow the per-pattern substring searchers. Yields `None` unless the
+    /// whole pattern set consists of literals that need no regex, in which case
+    /// the searcher may scan an entire buffer at once rather than line by line.
+    pub fn literal_searchers(&self) -> Option<&[memmem::Finder<'static>]> {
+        Some(self.literal_searchers.as_ref()?.as_slice())
     }
 
     /// Decide whether `line` matches and return the positions to highlight.
@@ -194,6 +226,25 @@ impl Cursor<'_> {
     }
 }
 
+/// Extract `pattern` as a raw needle if plain substring search decides matches
+/// exactly as the regex engine would; yield `None` when that cannot be assured.
+///
+/// Only non-empty, pure-ASCII patterns without `ignore_case` qualify: an ASCII
+/// byte never occurs inside a multi-byte UTF-8 sequence, so byte search and
+/// regex search agree even on input that is not valid UTF-8. Outside
+/// [`RegexMode::Fixed`] the pattern must also be free of every byte that any
+/// supported regex syntax may treat as an operator.
+fn plain_literal(pattern: &str, ignore_case: bool, mode: RegexMode) -> Option<Vec<u8>> {
+    // Bytes that act as operators in at least one of the Basic, Extended or Perl
+    // syntaxes; a pattern containing none of them reads as a plain literal in all.
+    const SPECIAL: &[u8] = b".*[]^$\\+?{}()|";
+    let needle = pattern.as_bytes();
+    // Case folding, empty patterns and non-ASCII text never take the byte-search path.
+    let eligible = !ignore_case && !needle.is_empty() && needle.is_ascii();
+    let literal = mode == RegexMode::Fixed || needle.iter().all(|b| !SPECIAL.contains(b));
+    (eligible && literal).then(|| needle.to_vec())
+}
+
 struct CompiledPattern {
     /// Default semantics. It's decently fast and used for searching.
     leftmost: Regex,
@@ -289,3 +340,49 @@ impl CompiledPattern {
             .is_some()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::plain_literal;
+    use crate::RegexMode;
+
+    const REGEX_MODES: [RegexMode; 3] = [RegexMode::Basic, RegexMode::Extended, RegexMode::Perl];
+
+    #[test]
+    fn fixed_mode_takes_any_ascii_verbatim() {
+        // `-F` treats regex metacharacters as ordinary bytes.
+        for p in ["abc", "a.*b", "a+b"] {
+            assert_eq!(plain_literal(p, false, RegexMode::Fixed), Some(p.as_bytes().to_vec()));
+        }
+    }
+
+    #[test]
+    fn regex_modes_accept_metacharacter_free_literals() {
+        for mode in REGEX_MODES {
+            for p in ["ing", "Hello123"] {
+                assert_eq!(plain_literal(p, false, mode), Some(p.as_bytes().to_vec()));
+            }
+        }
+    }
+
+    #[test]
+    fn regex_modes_reject_anything_with_a_metacharacter() {
+        for p in [
+            "a.b", "a*", "[ab]", "^a", "a$", "a\\b", "a+", "a?", "(a)", "a|b", "a{2}",
+        ] {
+            for mode in REGEX_MODES {
+                assert_eq!(plain_literal(p, false, mode), None, "pattern {p:?} in {mode:?}");
+            }
+        }
+    }
+
+    #[test]
+    fn rejects_empty_case_insensitive_and_non_ascii() {
+        use RegexMode::{Basic, Fixed};
+        let cases = [("", false, Fixed), ("abc", true, Fixed), ("abc", true, Basic)];
+        let non_ascii = [("café", false, Fixed), ("naïve", false, Basic)];
+        for (p, ic, mode) in cases.into_iter().chain(non_ascii) {
+            assert_eq!(plain_literal(p, ic, mode), None, "pattern {p:?}");
+        }
+    }
+}