Skip to content

Commit e10c9d7

Browse files
committed
automata: add new 'WhichCaptures' config
This is the first step in fixing a regression in memory usage. The underlying problem is that regex-automata now natively supports multi-pattern regexes *with* capturing support. Unfortunately though, this overall doesn't work too well with the current design of the PikeVM, because the amount of memory used is `len(captures) * len(states)`. So basically, as the size of the regex and the number of captures increase, the amount of memory used gets quite high. This is new functionality that we hope to improve upon over time, so it's not too big of a deal on its own. But it turns out this impacts previous uses of RegexSet that have capture groups. The old implementation just ignored these capture groups because they weren't supported in a RegexSet, and thus there were no memory problems. But in the new implementation, nothing tells it that it's okay to ignore the capture groups. So it winds up allocating space for them even though the RegexSet APIs don't provide any of that functionality. So my plan to fix this is to introduce a new configuration knob for controlling more granularly which capture states are compiled into the NFA. Previously we only supported "all of them" or "none of them." This commit adds a new (backwards compatible) knob that also permits "just implicit groups." That is, one capture group per pattern. This hopefully leads to less memory usage overall. (Well, it will certainly be less, but hopefully it's a big reduction.) We don't actually change anything here. We just add a new `Config::which_captures` knob, implement the existing `Config::captures` in terms of `Config::which_captures` and deprecate `Config::captures`. If this winds up not being sufficient, then we may need to adapt the PikeVM to work without any capture groups at all and instead just report which patterns match. Which is... probably fine?
1 parent a191024 commit e10c9d7

File tree

7 files changed

+182
-39
lines changed

7 files changed

+182
-39
lines changed

regex-automata/src/dfa/dense.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1170,7 +1170,10 @@ impl Builder {
11701170
.clone()
11711171
// We can always forcefully disable captures because DFAs do not
11721172
// support them.
1173-
.configure(thompson::Config::new().captures(false))
1173+
.configure(
1174+
thompson::Config::new()
1175+
.which_captures(thompson::WhichCaptures::None),
1176+
)
11741177
.build_many(patterns)
11751178
.map_err(BuildError::nfa)?;
11761179
self.build_from_nfa(&nfa)

regex-automata/src/hybrid/dfa.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3973,7 +3973,10 @@ impl Builder {
39733973
.clone()
39743974
// We can always forcefully disable captures because DFAs do not
39753975
// support them.
3976-
.configure(thompson::Config::new().captures(false))
3976+
.configure(
3977+
thompson::Config::new()
3978+
.which_captures(thompson::WhichCaptures::None),
3979+
)
39773980
.build_many(patterns)
39783981
.map_err(BuildError::nfa)?;
39793982
self.build_from_nfa(nfa)

regex-automata/src/meta/strategy.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use crate::{
1313
regex::{Cache, RegexInfo},
1414
reverse_inner, wrappers,
1515
},
16-
nfa::thompson::{self, NFA},
16+
nfa::thompson::{self, WhichCaptures, NFA},
1717
util::{
1818
captures::{Captures, GroupInfo},
1919
look::LookMatcher,
@@ -452,7 +452,7 @@ impl Core {
452452
.utf8(info.config().get_utf8_empty())
453453
.nfa_size_limit(info.config().get_nfa_size_limit())
454454
.shrink(false)
455-
.captures(true)
455+
.which_captures(WhichCaptures::All)
456456
.look_matcher(lookm);
457457
let nfa = thompson::Compiler::new()
458458
.configure(thompson_config.clone())
@@ -499,7 +499,10 @@ impl Core {
499499
// useful with capturing groups in reverse. And of course,
500500
// the lazy DFA ignores capturing groups in all cases.
501501
.configure(
502-
thompson_config.clone().captures(false).reverse(true),
502+
thompson_config
503+
.clone()
504+
.which_captures(WhichCaptures::None)
505+
.reverse(true),
503506
)
504507
.build_many_from_hir(hirs)
505508
.map_err(BuildError::nfa)?;
@@ -1480,7 +1483,7 @@ impl ReverseInner {
14801483
.utf8(core.info.config().get_utf8_empty())
14811484
.nfa_size_limit(core.info.config().get_nfa_size_limit())
14821485
.shrink(false)
1483-
.captures(false)
1486+
.which_captures(WhichCaptures::None)
14841487
.look_matcher(lookm);
14851488
let result = thompson::Compiler::new()
14861489
.configure(thompson_config)

regex-automata/src/nfa/thompson/compiler.rs

Lines changed: 154 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ pub struct Config {
3030
reverse: Option<bool>,
3131
nfa_size_limit: Option<Option<usize>>,
3232
shrink: Option<bool>,
33-
captures: Option<bool>,
33+
which_captures: Option<WhichCaptures>,
3434
look_matcher: Option<LookMatcher>,
3535
#[cfg(test)]
3636
unanchored_prefix: Option<bool>,
@@ -178,12 +178,15 @@ impl Config {
178178
/// ```
179179
/// use regex_automata::{
180180
/// dfa::{self, Automaton},
181-
/// nfa::thompson::NFA,
181+
/// nfa::thompson::{NFA, WhichCaptures},
182182
/// HalfMatch, Input,
183183
/// };
184184
///
185185
/// let dfa = dfa::dense::Builder::new()
186-
/// .thompson(NFA::config().captures(false).reverse(true))
186+
/// .thompson(NFA::config()
187+
/// .which_captures(WhichCaptures::None)
188+
/// .reverse(true)
189+
/// )
187190
/// .build("baz[0-9]+")?;
188191
/// let expected = Some(HalfMatch::must(0, 3));
189192
/// assert_eq!(
@@ -277,10 +280,12 @@ impl Config {
277280
///
278281
/// ```
279282
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
280-
/// use regex_automata::nfa::thompson::NFA;
283+
/// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
281284
///
282285
/// // Currently we have to disable captures when enabling reverse NFA.
283-
/// let config = NFA::config().captures(false).reverse(true);
286+
/// let config = NFA::config()
287+
/// .which_captures(WhichCaptures::None)
288+
/// .reverse(true);
284289
/// let not_shrunk = NFA::compiler()
285290
/// .configure(config.clone().shrink(false))
286291
/// .build(r"\w")?;
@@ -314,18 +319,70 @@ impl Config {
314319
/// require capturing groups to be present in the NFA. Building a Pike VM
315320
/// with an NFA without capturing groups will result in an error.
316321
///
322+
/// (Note that since this method is deprecated, the example below uses
323+
/// [`Config::which_captures`] to disable capture states.)
324+
///
317325
/// ```
318-
/// use regex_automata::nfa::thompson::{pikevm::PikeVM, NFA};
326+
/// use regex_automata::nfa::thompson::{
327+
/// pikevm::PikeVM,
328+
/// NFA,
329+
/// WhichCaptures,
330+
/// };
319331
///
320332
/// let nfa = NFA::compiler()
321-
/// .configure(NFA::config().captures(false))
333+
/// .configure(NFA::config().which_captures(WhichCaptures::None))
322334
/// .build(r"[a-z]+")?;
323335
/// assert!(PikeVM::new_from_nfa(nfa).is_err());
324336
///
325337
/// # Ok::<(), Box<dyn std::error::Error>>(())
326338
/// ```
327-
pub fn captures(mut self, yes: bool) -> Config {
328-
self.captures = Some(yes);
339+
#[deprecated(since = "0.3.5", note = "use which_captures instead")]
340+
pub fn captures(self, yes: bool) -> Config {
341+
self.which_captures(if yes {
342+
WhichCaptures::All
343+
} else {
344+
WhichCaptures::None
345+
})
346+
}
347+
348+
/// Configures what kinds of capture groups are compiled into
349+
/// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
350+
/// Thompson NFA.
351+
///
352+
/// Currently, using any option except for [`WhichCaptures::None`] requires
353+
/// disabling the [`reverse`](Config::reverse) setting. If both are
354+
/// enabled, then the compiler will return an error. It is expected that
355+
/// this limitation will be lifted in the future.
356+
///
357+
/// This is set to [`WhichCaptures::All`] by default. Callers may wish to
358+
/// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the
359+
/// overhead of capture states for explicit groups. Usually this occurs
360+
/// when one wants to use the `PikeVM` only for determining the overall
361+
/// match. Otherwise, the `PikeVM` could use much more memory than is
362+
/// necessary.
363+
///
364+
/// # Example
365+
///
366+
/// This example demonstrates that some regex engines, like the Pike VM,
367+
/// require capturing groups to be present in the NFA. Building a Pike VM
368+
/// with an NFA without capturing groups will result in an error.
369+
///
370+
/// ```
371+
/// use regex_automata::nfa::thompson::{
372+
/// pikevm::PikeVM,
373+
/// NFA,
374+
/// WhichCaptures,
375+
/// };
376+
///
377+
/// let nfa = NFA::compiler()
378+
/// .configure(NFA::config().which_captures(WhichCaptures::None))
379+
/// .build(r"[a-z]+")?;
380+
/// assert!(PikeVM::new_from_nfa(nfa).is_err());
381+
///
382+
/// # Ok::<(), Box<dyn std::error::Error>>(())
383+
/// ```
384+
pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
385+
self.which_captures = Some(which_captures);
329386
self
330387
}
331388

@@ -405,8 +462,14 @@ impl Config {
405462
}
406463

407464
/// Return whether NFA compilation is configured to produce capture states.
465+
#[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
408466
pub fn get_captures(&self) -> bool {
409-
self.captures.unwrap_or(true)
467+
self.get_which_captures().is_any()
468+
}
469+
470+
/// Return what kinds of capture states will be compiled into an NFA.
471+
pub fn get_which_captures(&self) -> WhichCaptures {
472+
self.which_captures.unwrap_or(WhichCaptures::All)
410473
}
411474

412475
/// Return the look-around matcher for this NFA.
@@ -439,14 +502,65 @@ impl Config {
439502
reverse: o.reverse.or(self.reverse),
440503
nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
441504
shrink: o.shrink.or(self.shrink),
442-
captures: o.captures.or(self.captures),
505+
which_captures: o.which_captures.or(self.which_captures),
443506
look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()),
444507
#[cfg(test)]
445508
unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
446509
}
447510
}
448511
}
449512

513+
/// A configuration indicating which kinds of
514+
/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
515+
///
516+
/// This configuration can be used with [`Config::which_captures`] to control
517+
/// which capture states are compiled into a Thompson NFA.
518+
///
519+
/// The default configuration is [`WhichCaptures::All`].
520+
#[derive(Clone, Copy, Debug)]
521+
pub enum WhichCaptures {
522+
/// All capture states, including those corresponding to both implicit and
523+
/// explicit capture groups, are included in the Thompson NFA.
524+
All,
525+
/// Only capture states corresponding to implicit capture groups are
526+
/// included. Implicit capture groups appear in every pattern implicitly
527+
/// and correspond to the overall match of a pattern.
528+
///
529+
/// This is useful when one only cares about the overall match of a
530+
/// pattern. By excluding capture states from explicit capture groups,
531+
/// one might be able to reduce the memory usage of a multi-pattern regex
532+
/// substantially if it was otherwise written to have many explicit capture
533+
/// groups.
534+
Implicit,
535+
/// No capture states are compiled into the Thompson NFA.
536+
///
537+
/// This is useful when capture states are either not needed (for example,
538+
/// if one is only trying to build a DFA) or if they aren't supported (for
539+
/// example, a reverse NFA).
540+
None,
541+
}
542+
543+
impl Default for WhichCaptures {
544+
fn default() -> WhichCaptures {
545+
WhichCaptures::All
546+
}
547+
}
548+
549+
impl WhichCaptures {
550+
/// Returns true if this configuration indicates that no capture states
551+
/// should be produced in an NFA.
552+
pub fn is_none(&self) -> bool {
553+
matches!(*self, WhichCaptures::None)
554+
}
555+
556+
/// Returns true if this configuration indicates that some capture states
557+
/// should be added to an NFA. Note that this might only include capture
558+
/// states for implicit capture groups.
559+
pub fn is_any(&self) -> bool {
560+
!self.is_none()
561+
}
562+
}
563+
450564
/*
451565
This compiler below uses Thompson's construction algorithm. The compiler takes
452566
a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph
@@ -800,7 +914,9 @@ impl Compiler {
800914
if exprs.len() > PatternID::LIMIT {
801915
return Err(BuildError::too_many_patterns(exprs.len()));
802916
}
803-
if self.config.get_reverse() && self.config.get_captures() {
917+
if self.config.get_reverse()
918+
&& self.config.get_which_captures().is_any()
919+
{
804920
return Err(BuildError::unsupported_captures());
805921
}
806922

@@ -978,7 +1094,7 @@ impl Compiler {
9781094
name: Option<&str>,
9791095
expr: &Hir,
9801096
) -> Result<ThompsonRef, BuildError> {
981-
if !self.config.get_captures() {
1097+
if self.config.get_which_captures().is_none() {
9821098
return self.c(expr);
9831099
}
9841100

@@ -1728,9 +1844,15 @@ mod tests {
17281844
util::primitives::{PatternID, StateID},
17291845
};
17301846

1847+
use super::*;
1848+
17311849
fn build(pattern: &str) -> NFA {
17321850
NFA::compiler()
1733-
.configure(NFA::config().captures(false).unanchored_prefix(false))
1851+
.configure(
1852+
NFA::config()
1853+
.which_captures(WhichCaptures::None)
1854+
.unanchored_prefix(false),
1855+
)
17341856
.build(pattern)
17351857
.unwrap()
17361858
}
@@ -1794,7 +1916,7 @@ mod tests {
17941916
#[test]
17951917
fn compile_unanchored_prefix() {
17961918
let nfa = NFA::compiler()
1797-
.configure(NFA::config().captures(false))
1919+
.configure(NFA::config().which_captures(WhichCaptures::None))
17981920
.build(r"a")
17991921
.unwrap();
18001922
assert_eq!(
@@ -1827,7 +1949,11 @@ mod tests {
18271949

18281950
// Check that non-UTF-8 literals work.
18291951
let nfa = NFA::compiler()
1830-
.configure(NFA::config().captures(false).unanchored_prefix(false))
1952+
.configure(
1953+
NFA::config()
1954+
.which_captures(WhichCaptures::None)
1955+
.unanchored_prefix(false),
1956+
)
18311957
.syntax(crate::util::syntax::Config::new().utf8(false))
18321958
.build(r"(?-u)\xFF")
18331959
.unwrap();
@@ -1937,7 +2063,7 @@ mod tests {
19372063
let nfa = NFA::compiler()
19382064
.configure(
19392065
NFA::config()
1940-
.captures(false)
2066+
.which_captures(WhichCaptures::None)
19412067
.reverse(true)
19422068
.shrink(false)
19432069
.unanchored_prefix(false),
@@ -1965,7 +2091,11 @@ mod tests {
19652091
#[test]
19662092
fn compile_many_start_pattern() {
19672093
let nfa = NFA::compiler()
1968-
.configure(NFA::config().captures(false).unanchored_prefix(false))
2094+
.configure(
2095+
NFA::config()
2096+
.which_captures(WhichCaptures::None)
2097+
.unanchored_prefix(false),
2098+
)
19692099
.build_many(&["a", "b"])
19702100
.unwrap();
19712101
assert_eq!(
@@ -1993,7 +2123,9 @@ mod tests {
19932123
use regex_syntax::hir::{Class, ClassBytes, Hir};
19942124

19952125
let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
1996-
let config = NFA::config().captures(false).unanchored_prefix(false);
2126+
let config = NFA::config()
2127+
.which_captures(WhichCaptures::None)
2128+
.unanchored_prefix(false);
19972129
let nfa =
19982130
NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
19992131
assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
@@ -2005,7 +2137,9 @@ mod tests {
20052137
use regex_syntax::hir::{Class, ClassUnicode, Hir};
20062138

20072139
let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
2008-
let config = NFA::config().captures(false).unanchored_prefix(false);
2140+
let config = NFA::config()
2141+
.which_captures(WhichCaptures::None)
2142+
.unanchored_prefix(false);
20092143
let nfa =
20102144
NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
20112145
assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);

regex-automata/src/nfa/thompson/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,4 @@ pub use self::{
7878
},
7979
};
8080
#[cfg(feature = "syntax")]
81-
pub use compiler::{Compiler, Config};
81+
pub use compiler::{Compiler, Config, WhichCaptures};

0 commit comments

Comments
 (0)