kaizen: clean up state finite automata (#308)

* kaizen: clean up state finite automata addresses #197 Signed-off-by: Tim Bray <tbray@textuality.com> * patch codecov workflow Signed-off-by: Tim Bray <tbray@textuality.com> --------- Signed-off-by: Tim Bray <tbray@textuality.com>
timbray · May 31, 2024 · 49e31ba · 49e31ba
1 parent 04396c4
commit 49e31ba
Show file tree

Hide file tree

Showing 22 changed files with 761 additions and 742 deletions.
diff --git a/.github/workflows/go-unit-tests.yaml b/.github/workflows/go-unit-tests.yaml
@@ -68,6 +68,8 @@ jobs:
       - if: steps.codecov-enabled.outputs.files_exists == 'true'
         name: Upload Codecov Report
         uses: codecov/codecov-action@125fc84a9a348dbcf27191600683ec096ec9021c
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
 
       - name: Verify git clean
         shell: bash

diff --git a/anything_but.go b/anything_but.go
@@ -58,61 +58,78 @@ func readAnythingButSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typ
 	return
 }
 
-// makeMultiAnythingButAutomaton exists to handle constructs such as
+// makeMultiAnythingButDFA exists to handle constructs such as
 //
 // {"x": [ {"anything-but": [ "a", "b" ] } ] }
 //
-// A DFA that matches anything but one byte sequence is like this:
+// A finite automaton that matches anything but one byte sequence is like this:
 // For each byte in val with value Z, we produce a table that leads to a nextField match on all non-Z values,
 // and to another such table for Z. After all the bytes have matched, a match on valueTerminator leads to
 // an empty table with no field Transitions, all others to a nextField match
 //
 // Making a succession of anything-but automata for each of "a" and "b" and then merging them turns out not
 // to work because what the caller means is really an AND - everything that matches neither "a" nor "b". So
 // in principle we could intersect automata.
-func makeMultiAnythingButAutomaton(vals [][]byte, useThisTransition *fieldMatcher) (*smallTable[*dfaStep], *fieldMatcher) {
-	var nextField *fieldMatcher
-	if useThisTransition != nil {
-		nextField = useThisTransition
-	} else {
-		nextField = newFieldMatcher()
-	}
-	ret, _ := oneMultiAnythingButStep(vals, 0, nextField), nextField
+func makeMultiAnythingButFA(vals [][]byte) (*smallTable, *fieldMatcher) {
+	nextField := newFieldMatcher()
+	successStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}}
+	//DEBUG successStep.table.label = "(success)"
+	success := &faNext{steps: []*faState{successStep}}
+
+	ret, _ := oneMultiAnythingButStep(vals, 0, success), nextField
 	return ret, nextField
 }
 
-// oneMultiAnythingButStep - spookeh
-func oneMultiAnythingButStep(vals [][]byte, index int, nextField *fieldMatcher) *smallTable[*dfaStep] {
-	success := &dfaStep{table: newSmallTable[*dfaStep](), fieldTransitions: []*fieldMatcher{nextField}}
-	var u unpackedTable[*dfaStep]
+// oneMultiAnythingButStep - spookeh. The idea is that there will be N smallTables in this FA, where N is
+// the length of the longest among the vals. So for each index from 0 through N-1, we make a smallTable whose
+// default is success but which transfers to the next step on whatever the current byte is in each of the vals
+// that have not yet been exhausted. We notice when we get to the end of each val and put in a valueTerminator
+// transition to a step with no nextField entry, i.e. failure because we've exactly matched one of the
+// anything-but strings.
+func oneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTable {
+	// this will be the default transition in all the anything-but tables.
+	var u unpackedTable
 	for i := range u {
 		u[i] = success
 	}
-	// for the char at position 'index' in each val
-	nextSteps := make(map[byte][][]byte)
-	lastSteps := make(map[byte]bool)
+
+	// for the char at position 'index' in each val. valsWithBytesRemaining is keyed by that char (assuming that 'index' isn't
+	// off the edge of that val). valsEndingHere[val[index]] being true means that some val ends here.
+	valsWithBytesRemaining := make(map[byte][][]byte)
+	valsEndingHere := make(map[byte]bool)
 	for _, val := range vals {
 		lastIndex := len(val) - 1
 		switch {
 		case index < lastIndex:
+			// gather vals that still have characters past 'index'
 			utf8Byte := val[index]
-			step := nextSteps[utf8Byte]
-			nextSteps[utf8Byte] = append(step, val)
+			step := valsWithBytesRemaining[utf8Byte]
+			valsWithBytesRemaining[utf8Byte] = append(step, val)
 		case index == lastIndex:
-			lastSteps[val[index]] = true
+			// remember if this particular val ends here
+			valsEndingHere[val[index]] = true
 		case index > lastIndex:
 			// no-op
 		}
 	}
 
-	for utf8Byte, valList := range nextSteps {
-		u[utf8Byte] = &dfaStep{table: oneMultiAnythingButStep(valList, index+1, nextField)}
+	// for each val that still has bytes to process, recurse to process the next one
+	for utf8Byte, val := range valsWithBytesRemaining {
+		nextTable := oneMultiAnythingButStep(val, index+1, success)
+		nextStep := &faState{table: nextTable}
+		u[utf8Byte] = &faNext{steps: []*faState{nextStep}}
 	}
-	for utf8Byte := range lastSteps {
-		lastStep := &dfaStep{table: newSmallTable[*dfaStep]()} // note no transition
-		u[utf8Byte] = &dfaStep{table: makeSmallDfaTable(success, []byte{valueTerminator}, []*dfaStep{lastStep})}
+
+	// for each val that ends at 'index', put a failure-transition for this anything-but
+	// if you hit the valueTerminator, success for everything else
+	for utf8Byte := range valsEndingHere {
+		failState := &faState{table: newSmallTable()} // note no transitions
+		lastStep := &faNext{steps: []*faState{failState}}
+		lastTable := makeSmallTable(success, []byte{valueTerminator}, []*faNext{lastStep})
+		u[utf8Byte] = &faNext{steps: []*faState{{table: lastTable}}}
 	}
-	table := newSmallTable[*dfaStep]()
+
+	table := newSmallTable()
 	table.pack(&u)
 	return table
 }
diff --git a/anything_but_test.go b/anything_but_test.go
@@ -11,7 +11,7 @@ func TestAnythingButMerging(t *testing.T) {
 	q, _ := New()
 	var err error
 
-	// can merge with DFA?
+	// can merge with FA?
 	err = q.AddPattern("pFoo", pFoo)
 	if err != nil {
 		t.Error("add pFoo")
@@ -63,11 +63,95 @@ func TestAnythingButMerging(t *testing.T) {
 	}
 }
 
+func TestFootCornerCase(t *testing.T) {
+	q, _ := New()
+	pFoot := `{"z": ["foot"]}`
+	err := q.AddPattern("foot", pFoot)
+	if err != nil {
+		t.Error("addP: " + err.Error())
+	}
+	m, err := q.MatchesForEvent([]byte(`{"z": "foot"}`))
+	if err != nil {
+		t.Error(err.Error())
+	}
+	if len(m) != 1 || m[0] != "foot" {
+		t.Error("foot not 1")
+	}
+	q, _ = New()
+	pNotFoo := `{"z": [ { "anything-but": ["foo"]} ] }`
+	err = q.AddPattern("notFoo", pNotFoo)
+	if err != nil {
+		t.Error("addP: " + err.Error())
+	}
+	m, err = q.MatchesForEvent([]byte(`{"z": "foot"}`))
+	if err != nil {
+		t.Error(err.Error())
+	}
+	if len(m) != 1 || m[0] != "notFoo" {
+		t.Error("foot not 1")
+	}
+	q, _ = New()
+	pFooStar := `{"z": [ { "shellstyle": "foo*" } ] }`
+	err = q.AddPattern("foostar", pFooStar)
+	if err != nil {
+		t.Error("addP: " + err.Error())
+	}
+	m, err = q.MatchesForEvent([]byte(`{"z": "foot"}`))
+	if err != nil {
+		t.Error(err.Error())
+	}
+	if len(m) != 1 || m[0] != "foostar" {
+		t.Error("foot not 1")
+	}
+}
+
+func TestAnythingButAlgo(t *testing.T) {
+	notJoeTim := `{"x": [ { "anything-but": ["joe", "tim"] } ] }`
+	q, _ := New()
+	err := q.AddPattern("notJoeTim", notJoeTim)
+	if err != nil {
+		t.Error("NJT: " + err.Error())
+	}
+	event := `{"x": "toe"}`
+	matches, err := q.MatchesForEvent([]byte(event))
+	if err != nil {
+		t.Error("NJT: " + err.Error())
+	}
+	if len(matches) != 1 {
+		t.Error("NJT: Didn't match")
+	}
+	event = `{"x": "joe"}`
+	matches, err = q.MatchesForEvent([]byte(event))
+	if err != nil {
+		t.Error("NJT: " + err.Error())
+	}
+	if len(matches) != 0 {
+		t.Error("NJT: matched joe")
+	}
+
+	notTTT := `{"x": [ { "anything-but": ["tim", "time", "timed"] } ] }`
+	q, _ = New()
+	err = q.AddPattern("notTTT", notTTT)
+	if err != nil {
+		t.Error("NTTT: " + err.Error())
+	}
+	events := []string{`{"x": "tim"}`, `{"x": "time"}`, `{"x": "timed"}`}
+	for _, ev := range events {
+		matches, err := q.MatchesForEvent([]byte(ev))
+		if err != nil {
+			t.Error("NTTT: (" + ev + ") " + err.Error())
+		}
+		if len(matches) != 0 {
+			t.Error("NTTT: (" + ev + ") matched")
+		}
+	}
+}
+
 func TestAnythingButMatching(t *testing.T) {
 	q, _ := New()
 	// the idea is we're testing against all the 5-letter Wordle patterns, so we want a 4-letter prefix and
 	// suffix of an existing wordle, a 5-letter non-wordle, and a 6-letter where the wordle might match at the start
-	// and end. I tried to think of scenarios that would defeat the pretty-simple anything-but DFA but couldn't.
+	// and end. I tried to think of scenarios that would defeat the pretty-simple anything-but FA but couldn't.
 	problemWords := []string{
 		`"bloo"`,
 		`"aper"`,

diff --git a/benchmarks_test.go b/benchmarks_test.go
@@ -198,7 +198,7 @@ func TestBigShellStyle(t *testing.T) {
 }
 
 // TestPatternAddition adds a whole lot of string-only rules as fast as possible. The profiler says that the
-// performance is totally doinated by the garbage-collector thrashing, in particular it has to allocate
+// performance is totally dominated by the garbage-collector thrashing, in particular it has to allocate
 // ~220K smallTables.  Tried https://blog.twitch.tv/en/2019/04/10/go-memory-ballast-how-i-learnt-to-stop-worrying-and-love-the-heap/
 // but it doesn't seem to help.
 // TODO: Add shellstyle patterns
@@ -231,7 +231,7 @@ func TestPatternAddition(t *testing.T) {
 	runtime.ReadMemStats(&msAfter)
 	delta := 1.0 / 1000000.0 * float64(msAfter.Alloc-msBefore.Alloc)
 	fmt.Printf("before %d, after %d, delta %f\n", msBefore.Alloc, msAfter.Alloc, delta)
-	fmt.Println("stats:" + matcherStats(m))
+	fmt.Println("statsAccum:" + matcherStats(m))
 	elapsed := float64(time.Since(before).Milliseconds())
 	perSecond := float64(fieldCount) / (elapsed / 1000.0)
 	fmt.Printf("%.2f fields/second\n\n", perSecond)

diff --git a/core_matcher.go b/core_matcher.go
@@ -30,7 +30,7 @@ type coreMatcher struct {
 // state is the start of the automaton.
 // segmentsTree is a structure that encodes which fields appear in the Patterns that are added to the coreMatcher.
 // It is built during calls to addPattern. It implements SegmentsTreeTracker, which is used by the event flattener
-// to optimize the flattening process by skipping the processing of fields which are not used in any patern.
+// to optimize the flattening process by skipping the processing of fields which are not used in any pattern.
 type coreFields struct {
 	state        *fieldMatcher
 	segmentsTree *segmentsTree
@@ -64,7 +64,7 @@ func (m *coreMatcher) addPattern(x X, patternJSON string) error {
 	m.lock.Lock()
 	defer m.lock.Unlock()
 
-	// we build up the new coreMatcher state in freshStart so we can atomically switch it in once complete
+	// we build up the new coreMatcher state in freshStart so that we can atomically switch it in once complete
 	freshStart := &coreFields{}
 	currentFields := m.fields()
 	freshStart.segmentsTree = currentFields.segmentsTree.copy()
@@ -163,7 +163,7 @@ func (a fieldsList) Swap(i, j int) {
 
 // matchesForFields takes a list of Field structures, sorts them by pathname, and launches the field-matching
 // process. The fields in a pattern to match are similarly sorted; thus running an automaton over them works.
-// No error can be returned but the matcher interface requires one and it is used by the pruner implementation
+// No error can be returned but the matcher interface requires one, and it is used by the pruner implementation
 func (m *coreMatcher) matchesForFields(fields []Field) ([]X, error) {
 	if len(fields) == 0 {
 		fields = emptyFields()
@@ -227,7 +227,7 @@ func tryToMatch(fields []Field, index int, state *fieldMatcher, matches *matchSe
 
 func checkExistsFalse(stateFields *fmFields, fields []Field, index int, matches *matchSet) {
 	for existsFalsePath, existsFalseTrans := range stateFields.existsFalse {
-		// it seems like there ought to be a more state-machine-idiomatic way to do this but
+		// it seems like there ought to be a more state-machine-idiomatic way to do this, but
 		// I thought of a few and none of them worked.  Quite likely someone will figure it out eventually.
 		// Could get slow for big events with hundreds or more fields (not that I've ever seen that) - might
 		// be worthwhile switching to binary search at some field count or building a map[]boolean in addPattern

diff --git a/core_matcher_test.go b/core_matcher_test.go
@@ -139,6 +139,58 @@ func TestFieldNameOrdering(t *testing.T) {
 	}
 }
 
+func TestSuffixBug(t *testing.T) {
+	var err error
+	j := `{"Url":    "xy9"}`
+	patterns := []string{
+		`{ "Url": [ { "shellstyle": "*9" } ] }`,
+		`{ "Url": [ { "shellstyle": "x*9" } ] }`,
+	}
+
+	// make sure each works individually
+	m := newCoreMatcher()
+	_ = m.addPattern("p0", patterns[0])
+	matches, _ := m.matchesForJSONEvent([]byte(j))
+	if len(matches) != 1 || matches[0] != "p0" {
+		t.Error("p0 didn't match")
+	}
+
+	m = newCoreMatcher()
+	_ = m.addPattern("p1", patterns[1])
+	matches, _ = m.matchesForJSONEvent([]byte(j))
+	if len(matches) != 1 || matches[0] != "p1" {
+		t.Error("p1 didn't match")
+	}
+
+	// now let's see if they work merged
+	m = newCoreMatcher()
+	wanted := make(map[X]int)
+	for _, should := range patterns {
+		wanted[should] = 0
+		err = m.addPattern(should, should)
+		if err != nil {
+			t.Error("add one of many: " + err.Error())
+		}
+	}
+	matches, err = m.matchesForJSONEvent([]byte(j))
+	if err != nil {
+		t.Error("m4J on all: " + err.Error())
+	}
+	if len(matches) != len(patterns) {
+		for _, match := range matches {
+			wanted[match]++
+		}
+		for want := range wanted {
+			if wanted[want] == 0 {
+				t.Errorf("Missed: %s", want.(string))
+			} else {
+				fmt.Printf("Matched %s\n", want)
+			}
+		}
+		fmt.Println()
+	}
+}
+
 func TestExerciseMatching(t *testing.T) {
 	j := `{
         "Image": {
@@ -234,12 +286,11 @@ func TestExerciseMatching(t *testing.T) {
 		}
 		for want := range wanted {
 			if wanted[want] == 0 {
-				t.Errorf("Missed: %v" + want.(string))
+				t.Errorf("Missed: %s", want.(string))
 			}
 		}
 		fmt.Println()
 	}
-	// fmt.Println("Should not: " + matcherStats(m))
 }
 
 func TestTacos(t *testing.T) {

diff --git a/field_matcher.go b/field_matcher.go
@@ -94,7 +94,7 @@ func (m *fieldMatcher) addExists(exists bool, field *patternField) []*fieldMatch
 }
 
 func (m *fieldMatcher) addTransition(field *patternField) []*fieldMatcher {
-	// we build the new updateable state in freshStart so we can blsat it in atomically once computed
+	// we build the new updateable state in freshStart so that we can blast it in atomically once computed
 	current := m.fields()
 	freshStart := &fmFields{
 		matches:     current.matches,