Skip to content

Commit

Permalink
kaizen: clean up state finite automata (#308)
Browse files Browse the repository at this point in the history
* kaizen: clean up state finite automata

addresses #197

Signed-off-by: Tim Bray <tbray@textuality.com>

* patch codecov workflow

Signed-off-by: Tim Bray <tbray@textuality.com>

---------

Signed-off-by: Tim Bray <tbray@textuality.com>
  • Loading branch information
timbray committed May 31, 2024
1 parent 04396c4 commit 49e31ba
Show file tree
Hide file tree
Showing 22 changed files with 761 additions and 742 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/go-unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ jobs:
- if: steps.codecov-enabled.outputs.files_exists == 'true'
name: Upload Codecov Report
uses: codecov/codecov-action@125fc84a9a348dbcf27191600683ec096ec9021c
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

- name: Verify git clean
shell: bash
Expand Down
69 changes: 43 additions & 26 deletions anything_but.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,61 +58,78 @@ func readAnythingButSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typ
return
}

// makeMultiAnythingButAutomaton exists to handle constructs such as
// makeMultiAnythingButFA exists to handle constructs such as
//
// {"x": [ {"anything-but": [ "a", "b" ] } ] }
//
// A DFA that matches anything but one byte sequence is like this:
// A finite automaton that matches anything but one byte sequence is like this:
// For each byte in val with value Z, we produce a table that leads to a nextField match on all non-Z values,
// and to another such table for Z. After all the bytes have matched, a match on valueTerminator leads to
// an empty table with no field Transitions, all others to a nextField match
//
// Making a succession of anything-but automata for each of "a" and "b" and then merging them turns out not
// to work because what the caller means is really an AND - everything that matches neither "a" nor "b". So
// in principle we could intersect automata.
func makeMultiAnythingButAutomaton(vals [][]byte, useThisTransition *fieldMatcher) (*smallTable[*dfaStep], *fieldMatcher) {
var nextField *fieldMatcher
if useThisTransition != nil {
nextField = useThisTransition
} else {
nextField = newFieldMatcher()
}
ret, _ := oneMultiAnythingButStep(vals, 0, nextField), nextField
func makeMultiAnythingButFA(vals [][]byte) (*smallTable, *fieldMatcher) {
nextField := newFieldMatcher()
successStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}}
//DEBUG successStep.table.label = "(success)"
success := &faNext{steps: []*faState{successStep}}

ret, _ := oneMultiAnythingButStep(vals, 0, success), nextField
return ret, nextField
}

// oneMultiAnythingButStep - spookeh
func oneMultiAnythingButStep(vals [][]byte, index int, nextField *fieldMatcher) *smallTable[*dfaStep] {
success := &dfaStep{table: newSmallTable[*dfaStep](), fieldTransitions: []*fieldMatcher{nextField}}
var u unpackedTable[*dfaStep]
// oneMultiAnythingButStep - spookeh. The idea is that there will be N smallTables in this FA, where N is
// the longest among the vals. So for each value from 0 through N, we make a smallTable whose default is
// success but transfers to the next step on whatever the current byte in each of the vals that have not
// yet been exhausted. We notice when we get to the end of each val and put in a valueTerminator transition
// to a step with no nextField entry, i.e. failure because we've exactly matched one of the anything-but
// strings.
func oneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTable {
// this will be the default transition in all the anything-but tables.
var u unpackedTable
for i := range u {
u[i] = success
}
// for the char at position 'index' in each val
nextSteps := make(map[byte][][]byte)
lastSteps := make(map[byte]bool)

// for the char at position 'index' in each val. valsWithBytesRemaining is keyed by that char (assuming that 'index' isn't
// off the edge of that val). valsEndingHere[b] being true means that at least one val ends here, with b as its last byte.
valsWithBytesRemaining := make(map[byte][][]byte)
valsEndingHere := make(map[byte]bool)
for _, val := range vals {
lastIndex := len(val) - 1
switch {
case index < lastIndex:
// gather vals that still have characters past 'index'
utf8Byte := val[index]
step := nextSteps[utf8Byte]
nextSteps[utf8Byte] = append(step, val)
step := valsWithBytesRemaining[utf8Byte]
valsWithBytesRemaining[utf8Byte] = append(step, val)
case index == lastIndex:
lastSteps[val[index]] = true
// remember if this particular val ends here
valsEndingHere[val[index]] = true
case index > lastIndex:
// no-op
}
}

for utf8Byte, valList := range nextSteps {
u[utf8Byte] = &dfaStep{table: oneMultiAnythingButStep(valList, index+1, nextField)}
// for each val that still has bytes to process, recurse to process the next one
for utf8Byte, val := range valsWithBytesRemaining {
nextTable := oneMultiAnythingButStep(val, index+1, success)
nextStep := &faState{table: nextTable}
u[utf8Byte] = &faNext{steps: []*faState{nextStep}}
}
for utf8Byte := range lastSteps {
lastStep := &dfaStep{table: newSmallTable[*dfaStep]()} // note no transition
u[utf8Byte] = &dfaStep{table: makeSmallDfaTable(success, []byte{valueTerminator}, []*dfaStep{lastStep})}

// for each val that ends at 'index', put a failure-transition for this anything-but
// if you hit the valueTerminator, success for everything else
for utf8Byte := range valsEndingHere {
failState := &faState{table: newSmallTable()} // note no transitions
lastStep := &faNext{steps: []*faState{failState}}
lastTable := makeSmallTable(success, []byte{valueTerminator}, []*faNext{lastStep})
u[utf8Byte] = &faNext{steps: []*faState{{table: lastTable}}}
}
table := newSmallTable[*dfaStep]()

table := newSmallTable()
table.pack(&u)
return table
}
88 changes: 86 additions & 2 deletions anything_but_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ func TestAnythingButMerging(t *testing.T) {
q, _ := New()
var err error

// can merge with DFA?
// can merge with FA?
err = q.AddPattern("pFoo", pFoo)
if err != nil {
t.Error("add pFoo")
Expand Down Expand Up @@ -63,11 +63,95 @@ func TestAnythingButMerging(t *testing.T) {
}
}

// TestFootCornerCase checks that the event value "foot" is matched by three
// different kinds of pattern built around "foo"/"foot": an exact string match
// on "foot", an anything-but that excludes "foo", and the shellstyle pattern
// "foo*". Each pattern is added to its own fresh matcher so the three checks
// are independent of each other.
func TestFootCornerCase(t *testing.T) {
	const event = `{"z": "foot"}`
	cases := []struct {
		name    string // identifier the pattern is registered under
		pattern string // pattern JSON under test
	}{
		{"foot", `{"z": ["foot"]}`},
		{"notFoo", `{"z": [ { "anything-but": ["foo"]} ] }`},
		{"foostar", `{"z": [ { "shellstyle": "foo*" } ] }`},
	}

	for _, tc := range cases {
		// The second return value of New is ignored here, as in the other
		// tests in this file.
		q, _ := New()
		if err := q.AddPattern(tc.name, tc.pattern); err != nil {
			t.Errorf("%s: addP: %s", tc.name, err.Error())
			continue
		}
		m, err := q.MatchesForEvent([]byte(event))
		if err != nil {
			t.Errorf("%s: %s", tc.name, err.Error())
			continue
		}
		// Exactly one match, and it must be the pattern we just added. The
		// message names the case so a failure identifies which pattern type
		// broke.
		if len(m) != 1 || m[0] != tc.name {
			t.Errorf("%s: wanted exactly one match (%s) for %s, got %v", tc.name, tc.name, event, m)
		}
	}
}

// TestAnythingButAlgo exercises the multi-value anything-but automaton in two
// shapes: excluded values of equal length that share no prefix ("joe", "tim"),
// and excluded values that are prefixes of one another ("tim", "time",
// "timed").
func TestAnythingButAlgo(t *testing.T) {
	// "joe"/"tim": a value that is neither must match, "joe" itself must not.
	njt, _ := New()
	if err := njt.AddPattern("notJoeTim", `{"x": [ { "anything-but": ["joe", "tim"] } ] }`); err != nil {
		t.Errorf("NJT: %s", err.Error())
	}

	got, err := njt.MatchesForEvent([]byte(`{"x": "toe"}`))
	if err != nil {
		t.Errorf("NJT: %s", err.Error())
	}
	if len(got) != 1 {
		t.Error("NJT: Didn't match")
	}

	got, err = njt.MatchesForEvent([]byte(`{"x": "joe"}`))
	if err != nil {
		t.Errorf("NJT: %s", err.Error())
	}
	if len(got) != 0 {
		t.Error("NJT: matched joe")
	}

	// "tim"/"time"/"timed": every one of the excluded values must be rejected,
	// even though each shorter one is a prefix of the longer ones.
	nttt, _ := New()
	if err := nttt.AddPattern("notTTT", `{"x": [ { "anything-but": ["tim", "time", "timed"] } ] }`); err != nil {
		t.Errorf("NTTT: %s", err.Error())
	}
	for _, ev := range []string{`{"x": "tim"}`, `{"x": "time"}`, `{"x": "timed"}`} {
		hits, err := nttt.MatchesForEvent([]byte(ev))
		if err != nil {
			t.Errorf("NTTT: (%s) %s", ev, err.Error())
		}
		if len(hits) != 0 {
			t.Errorf("NTTT: (%s) matched", ev)
		}
	}
}

func TestAnythingButMatching(t *testing.T) {
q, _ := New()
// the idea is we're testing against all the 5-letter Wordle patterns, so we want a 4-letter prefix and
// suffix of an existing wordle, a 5-letter non-wordle, and a 6-letter where the wordle might match at the start
// and end. I tried to think of scenarios that would defeat the pretty-simple anything-but DFA but couldn't.
// and end. I tried to think of scenarios that would defeat the pretty-simple anything-but FA but couldn't.
problemWords := []string{
`"bloo"`,
`"aper"`,
Expand Down
4 changes: 2 additions & 2 deletions benchmarks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ func TestBigShellStyle(t *testing.T) {
}

// TestPatternAddition adds a whole lot of string-only rules as fast as possible The profiler says that the
// performance is totally doinated by the garbage-collector thrashing, in particular it has to allocate
// performance is totally dominated by the garbage-collector thrashing, in particular it has to allocate
// ~220K smallTables. Tried https://blog.twitch.tv/en/2019/04/10/go-memory-ballast-how-i-learnt-to-stop-worrying-and-love-the-heap/
// but it doesn't seem to help.
// TODO: Add shellstyle patterns
Expand Down Expand Up @@ -231,7 +231,7 @@ func TestPatternAddition(t *testing.T) {
runtime.ReadMemStats(&msAfter)
delta := 1.0 / 1000000.0 * float64(msAfter.Alloc-msBefore.Alloc)
fmt.Printf("before %d, after %d, delta %f\n", msBefore.Alloc, msAfter.Alloc, delta)
fmt.Println("stats:" + matcherStats(m))
fmt.Println("statsAccum:" + matcherStats(m))
elapsed := float64(time.Since(before).Milliseconds())
perSecond := float64(fieldCount) / (elapsed / 1000.0)
fmt.Printf("%.2f fields/second\n\n", perSecond)
Expand Down
8 changes: 4 additions & 4 deletions core_matcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ type coreMatcher struct {
// state is the start of the automaton.
// segmentsTree is a structure that encodes which fields appear in the Patterns that are added to the coreMatcher.
// It is built during calls to addPattern. It implements SegmentsTreeTracker, which is used by the event flattener
// to optimize the flattening process by skipping the processing of fields which are not used in any patern.
// to optimize the flattening process by skipping the processing of fields which are not used in any pattern.
type coreFields struct {
state *fieldMatcher
segmentsTree *segmentsTree
Expand Down Expand Up @@ -64,7 +64,7 @@ func (m *coreMatcher) addPattern(x X, patternJSON string) error {
m.lock.Lock()
defer m.lock.Unlock()

// we build up the new coreMatcher state in freshStart so we can atomically switch it in once complete
// we build up the new coreMatcher state in freshStart so that we can atomically switch it in once complete
freshStart := &coreFields{}
currentFields := m.fields()
freshStart.segmentsTree = currentFields.segmentsTree.copy()
Expand Down Expand Up @@ -163,7 +163,7 @@ func (a fieldsList) Swap(i, j int) {

// matchesForFields takes a list of Field structures, sorts them by pathname, and launches the field-matching
// process. The fields in a pattern to match are similarly sorted; thus running an automaton over them works.
// No error can be returned but the matcher interface requires one and it is used by the pruner implementation
// No error can be returned but the matcher interface requires one, and it is used by the pruner implementation
func (m *coreMatcher) matchesForFields(fields []Field) ([]X, error) {
if len(fields) == 0 {
fields = emptyFields()
Expand Down Expand Up @@ -227,7 +227,7 @@ func tryToMatch(fields []Field, index int, state *fieldMatcher, matches *matchSe

func checkExistsFalse(stateFields *fmFields, fields []Field, index int, matches *matchSet) {
for existsFalsePath, existsFalseTrans := range stateFields.existsFalse {
// it seems like there ought to be a more state-machine-idiomatic way to do this but
// it seems like there ought to be a more state-machine-idiomatic way to do this, but
// I thought of a few and none of them worked. Quite likely someone will figure it out eventually.
// Could get slow for big events with hundreds or more fields (not that I've ever seen that) - might
// be worthwhile switching to binary search at some field count or building a map[]boolean in addPattern
Expand Down
55 changes: 53 additions & 2 deletions core_matcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,58 @@ func TestFieldNameOrdering(t *testing.T) {
}
}

// TestSuffixBug checks that the shellstyle patterns "*9" and "x*9" both match
// the value "xy9": first with each pattern alone in its own matcher, then with
// both patterns merged into a single matcher.
func TestSuffixBug(t *testing.T) {
	const event = `{"Url": "xy9"}`
	patterns := []string{
		`{ "Url": [ { "shellstyle": "*9" } ] }`,
		`{ "Url": [ { "shellstyle": "x*9" } ] }`,
	}
	names := []string{"p0", "p1"}

	// make sure each works individually
	for i, pattern := range patterns {
		name := names[i]
		m := newCoreMatcher()
		if err := m.addPattern(name, pattern); err != nil {
			t.Errorf("%s: addPattern: %s", name, err.Error())
			continue
		}
		matches, err := m.matchesForJSONEvent([]byte(event))
		if err != nil {
			t.Errorf("%s: matchesForJSONEvent: %s", name, err.Error())
			continue
		}
		if len(matches) != 1 || matches[0] != name {
			t.Errorf("%s didn't match", name)
		}
	}

	// now let's see if they work merged; each pattern is registered under its
	// own JSON text so a miss can be reported by pattern
	m := newCoreMatcher()
	wanted := make(map[X]int, len(patterns))
	for _, should := range patterns {
		wanted[should] = 0
		if err := m.addPattern(should, should); err != nil {
			t.Error("add one of many: " + err.Error())
		}
	}
	matches, err := m.matchesForJSONEvent([]byte(event))
	if err != nil {
		t.Error("m4J on all: " + err.Error())
	}
	if len(matches) == len(patterns) {
		return
	}

	// A count mismatch is always a failure, even if every pattern shows up at
	// least once; the per-pattern breakdown below is there to help diagnosis.
	t.Errorf("merged: got %d matches, wanted %d", len(matches), len(patterns))
	for _, match := range matches {
		wanted[match]++
	}
	for want, hits := range wanted {
		if hits == 0 {
			t.Errorf("Missed: %v", want)
		} else {
			t.Logf("Matched %v", want)
		}
	}
}

func TestExerciseMatching(t *testing.T) {
j := `{
"Image": {
Expand Down Expand Up @@ -234,12 +286,11 @@ func TestExerciseMatching(t *testing.T) {
}
for want := range wanted {
if wanted[want] == 0 {
t.Errorf("Missed: %v" + want.(string))
t.Errorf("Missed: %s", want.(string))
}
}
fmt.Println()
}
// fmt.Println("Should not: " + matcherStats(m))
}

func TestTacos(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion field_matcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func (m *fieldMatcher) addExists(exists bool, field *patternField) []*fieldMatch
}

func (m *fieldMatcher) addTransition(field *patternField) []*fieldMatcher {
// we build the new updateable state in freshStart so we can blsat it in atomically once computed
// we build the new updateable state in freshStart so that we can blast it in atomically once computed
current := m.fields()
freshStart := &fmFields{
matches: current.matches,
Expand Down
Loading

0 comments on commit 49e31ba

Please sign in to comment.