Permalink
Browse files

add a match arena to regexp to avoid generating garbage.

simple regexps run 20x faster.
the regex-dna benchmark goes 3x faster.

R=rsc
CC=golang-dev
http://codereview.appspot.com/156108
  • Loading branch information...
robpike committed Nov 20, 2009
1 parent 0683f77 commit 5e0e8e35d5df2533fc7beff9feecbe56333c164e
Showing with 133 additions and 46 deletions.
  1. +27 −0 src/pkg/regexp/all_test.go
  2. +106 −46 src/pkg/regexp/regexp.go
View
@@ -60,6 +60,7 @@ type tester struct {
}
var matches = []tester{
tester{`^abcdefg`, "abcdefg", vec{0, 7}},
tester{`a+`, "baaab", vec{1, 4}},
tester{"abcd..", "abcdef", vec{0, 6}},
tester{``, "", vec{0, 0}},
@@ -450,3 +451,29 @@ func TestAllMatches(t *testing.T) {
}
}
}
func BenchmarkLiteral(b *testing.B) {
x := strings.Repeat("x", 50);
b.StopTimer();
re, _ := Compile(x);
b.StartTimer();
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
println("no match!");
break;
}
}
}
func BenchmarkNotLiteral(b *testing.B) {
x := strings.Repeat("x", 49);
b.StopTimer();
re, _ := Compile("^" + x);
b.StartTimer();
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
println("no match!");
break;
}
}
}
View
@@ -626,7 +626,8 @@ func (re *Regexp) doParse() os.Error {
return p.error;
}
// return regular text at the beginning of str
// Extract regular text from the beginning of the pattern.
// That text can be used by doExecute to speed up matching.
func (re *Regexp) setPrefix() {
var b []byte;
var utf = make([]byte, utf8.UTFMax);
@@ -673,44 +674,101 @@ func MustCompile(str string) *Regexp {
return regexp;
}
// The match arena allows us to reduce the garbage generated by tossing
// match vectors away as we execute. Matches are ref counted and returned
// to a free list when no longer active. Increases a simple benchmark by 22X.
type matchArena struct {
head *matchVec;
len int; // length of match vector
}
type matchVec struct {
m []int; // pairs of bracketing submatches. 0th is start,end
ref int;
next *matchVec;
}
func (a *matchArena) new() *matchVec {
if a.head == nil {
const N = 10;
block := make([]matchVec, N);
for i := 0; i < N; i++ {
b := &block[i];
b.next = a.head;
a.head = b;
}
}
m := a.head;
a.head = m.next;
m.ref = 0;
if m.m == nil {
m.m = make([]int, a.len)
}
return m;
}
func (a *matchArena) free(m *matchVec) {
m.ref--;
if m.ref == 0 {
m.next = a.head;
a.head = m;
}
}
func (a *matchArena) copy(m *matchVec) *matchVec {
m1 := a.new();
copy(m1.m, m.m);
return m1;
}
func (a *matchArena) noMatch() *matchVec {
m := a.new();
for i := range m.m {
m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
}
m.ref = 1;
return m;
}
type state struct {
inst instr; // next instruction to execute
match []int; // pairs of bracketing submatches. 0th is start,end
match *matchVec;
}
// Append new state to to-do list. Leftmost-longest wins so avoid
// adding a state that's already active.
func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []state {
// adding a state that's already active. The matchVec will be inc-ref'ed
// if it is assigned to a state.
func (a *matchArena) addState(s []state, inst instr, match *matchVec, pos, end int) []state {
switch inst.kind() {
case _BOT:
if pos == 0 {
s = re.addState(s, inst.next(), match, pos, end)
s = a.addState(s, inst.next(), match, pos, end)
}
return s;
case _EOT:
if pos == end {
s = re.addState(s, inst.next(), match, pos, end)
s = a.addState(s, inst.next(), match, pos, end)
}
return s;
case _BRA:
n := inst.(*_Bra).n;
match[2*n] = pos;
s = re.addState(s, inst.next(), match, pos, end);
match.m[2*n] = pos;
s = a.addState(s, inst.next(), match, pos, end);
return s;
case _EBRA:
n := inst.(*_Ebra).n;
match[2*n+1] = pos;
s = re.addState(s, inst.next(), match, pos, end);
match.m[2*n+1] = pos;
s = a.addState(s, inst.next(), match, pos, end);
return s;
}
index := inst.index();
l := len(s);
begin := match[0];
// TODO: Once the state is a vector and we can do insert, have inputs always
// go in order correctly and this "earlier" test is never necessary,
begin := match.m[0];
// TODO: If the state were a vector and we could do insert, have inputs always
// go in order correctly and this "earlier" test is not necessary,
for i := 0; i < l; i++ {
if s[i].inst.index() == index && // same instruction
s[i].match[0] <= begin { // earlier match already going; lefmost wins
s[i].match.m[0] <= begin { // earlier match already going; lefmost wins
return s
}
}
@@ -722,30 +780,19 @@ func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []s
s = s[0 : l+1];
s[l].inst = inst;
s[l].match = match;
match.ref++;
if inst.kind() == _ALT {
s1 := make([]int, 2*(re.nbra+1));
copy(s1, match);
s = re.addState(s, inst.(*_Alt).left, s1, pos, end);
s = a.addState(s, inst.(*_Alt).left, a.copy(match), pos, end);
// give other branch a copy of this match vector
s1 = make([]int, 2*(re.nbra+1));
copy(s1, match);
s = re.addState(s, inst.next(), s1, pos, end);
s = a.addState(s, inst.next(), a.copy(match), pos, end);
}
return s;
}
func noMatch(nbra int) []int {
match := make([]int, 2*(nbra+1));
for i := range match {
match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
}
return match;
}
// Accepts either string or bytes - the logic is identical either way.
// If bytes == nil, scan str.
func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
var s [2][]state; // TODO: use a vector when state values (not ptrs) can be vector elements
var s [2][]state;
s[0] = make([]state, 10)[0:0];
s[1] = make([]state, 10)[0:0];
in, out := 0, 1;
@@ -768,15 +815,22 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
}
pos += advance + len(re.prefix);
}
arena := &matchArena{nil, 2 * (re.nbra + 1)};
for pos <= end {
if !found {
// prime the pump if we haven't seen a match yet
match := noMatch(re.nbra);
match[0] = pos;
s[out] = re.addState(s[out], re.start.next(), match, pos, end);
match := arena.noMatch();
match.m[0] = pos;
s[out] = arena.addState(s[out], re.start.next(), match, pos, end);
arena.free(match); // if addState saved it, ref was incremented
}
in, out = out, in; // old out state is new in state
s[out] = s[out][0:0]; // clear out state
// clear out old state
old := s[out];
for _, state := range old {
arena.free(state.match)
}
s[out] = old[0:0]; // truncate state vector
if found && len(s[in]) == 0 {
// machine has completed
break
@@ -791,37 +845,40 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
}
}
pos += charwidth;
for i := 0; i < len(s[in]); i++ {
st := s[in][i];
switch s[in][i].inst.kind() {
for _, st := range s[in] {
switch st.inst.kind() {
case _BOT:
case _EOT:
case _CHAR:
if c == st.inst.(*_Char).char {
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _CHARCLASS:
if st.inst.(*_CharClass).matches(c) {
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _ANY:
if c != endOfFile {
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _NOTNL:
if c != endOfFile && c != '\n' {
s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _BRA:
case _EBRA:
case _ALT:
case _END:
// choose leftmost longest
if !found || // first
st.match[0] < final.match[0] || // leftmost
(st.match[0] == final.match[0] && pos-charwidth > final.match[1]) { // longest
st.match.m[0] < final.match.m[0] || // leftmost
(st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest
if final.match != nil {
arena.free(final.match)
}
final = st;
final.match[1] = pos - charwidth;
final.match.ref++;
final.match.m[1] = pos - charwidth;
}
found = true;
default:
@@ -830,11 +887,14 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
}
}
}
if final.match == nil {
return nil
}
// if match found, back up start of match by width of prefix.
if re.prefix != "" && len(final.match) > 0 {
final.match[0] -= len(re.prefix)
if re.prefix != "" && len(final.match.m) > 0 {
final.match.m[0] -= len(re.prefix)
}
return final.match;
return final.match.m;
}

0 comments on commit 5e0e8e3

Please sign in to comment.