diff --git a/.github/workflows/go-unit-tests.yaml b/.github/workflows/go-unit-tests.yaml index 1d555cd..4cd08b2 100644 --- a/.github/workflows/go-unit-tests.yaml +++ b/.github/workflows/go-unit-tests.yaml @@ -68,6 +68,8 @@ jobs: - if: steps.codecov-enabled.outputs.files_exists == 'true' name: Upload Codecov Report uses: codecov/codecov-action@125fc84a9a348dbcf27191600683ec096ec9021c + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - name: Verify git clean shell: bash diff --git a/anything_but.go b/anything_but.go index 1fc45eb..3e70440 100644 --- a/anything_but.go +++ b/anything_but.go @@ -58,11 +58,11 @@ func readAnythingButSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typ return } -// makeMultiAnythingButAutomaton exists to handle constructs such as +// makeMultiAnythingButDFA exists to handle constructs such as // // {"x": [ {"anything-but": [ "a", "b" ] } ] } // -// A DFA that matches anything but one byte sequence is like this: +// A finite automaton that matches anything but one byte sequence is like this: // For each byte in val with value Z, we produce a table that leads to a nextField match on all non-Z values, // and to another such table for Z. After all the bytes have matched, a match on valueTerminator leads to // an empty table with no field Transitions, all others to a nexField match @@ -70,49 +70,66 @@ func readAnythingButSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typ // Making a succession of anything-but automata for each of "a" and "b" and then merging them turns out not // to work because what the caller means is really an AND - everything that matches neither "a" nor "b". So // in principle we could intersect automata. 
-func makeMultiAnythingButAutomaton(vals [][]byte, useThisTransition *fieldMatcher) (*smallTable[*dfaStep], *fieldMatcher) { - var nextField *fieldMatcher - if useThisTransition != nil { - nextField = useThisTransition - } else { - nextField = newFieldMatcher() - } - ret, _ := oneMultiAnythingButStep(vals, 0, nextField), nextField +func makeMultiAnythingButFA(vals [][]byte) (*smallTable, *fieldMatcher) { + nextField := newFieldMatcher() + successStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}} + //DEBUG successStep.table.label = "(success)" + success := &faNext{steps: []*faState{successStep}} + + ret, _ := oneMultiAnythingButStep(vals, 0, success), nextField return ret, nextField } -// oneMultiAnythingButStep - spookeh -func oneMultiAnythingButStep(vals [][]byte, index int, nextField *fieldMatcher) *smallTable[*dfaStep] { - success := &dfaStep{table: newSmallTable[*dfaStep](), fieldTransitions: []*fieldMatcher{nextField}} - var u unpackedTable[*dfaStep] +// oneMultiAnythingButStep - spookeh. The idea is that there will be N smallTables in this FA, where N is +// the longest among the vals. So for each value from 0 through N, we make a smallTable whose default is +// success but transfers to the next step on whatever the current byte in each of the vals that have not +// yet been exhausted. We notice when we get to the end of each val and put in a valueTerminator transition +// to a step with no nextField entry, i.e. failure because we've exactly matched one of the anything-but +// strings. +func oneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTable { + // this will be the default transition in all the anything-but tables. + var u unpackedTable for i := range u { u[i] = success } - // for the char at position 'index' in each val - nextSteps := make(map[byte][][]byte) - lastSteps := make(map[byte]bool) + + // for the char at position 'index' in each val. 
valsWithBytesRemaining is keyed by that char (assuming that 'index' isn't + // off the edge of that val. valsEndingHere[index] being true for some val means that val ends here. + valsWithBytesRemaining := make(map[byte][][]byte) + valsEndingHere := make(map[byte]bool) for _, val := range vals { lastIndex := len(val) - 1 switch { case index < lastIndex: + // gather vals that still have characters past 'index' utf8Byte := val[index] - step := nextSteps[utf8Byte] - nextSteps[utf8Byte] = append(step, val) + step := valsWithBytesRemaining[utf8Byte] + valsWithBytesRemaining[utf8Byte] = append(step, val) case index == lastIndex: - lastSteps[val[index]] = true + // remember if this particular val ends here + valsEndingHere[val[index]] = true case index > lastIndex: // no-op } } - for utf8Byte, valList := range nextSteps { - u[utf8Byte] = &dfaStep{table: oneMultiAnythingButStep(valList, index+1, nextField)} + // for each val that still has bytes to process, recurse to process the next one + for utf8Byte, val := range valsWithBytesRemaining { + nextTable := oneMultiAnythingButStep(val, index+1, success) + nextStep := &faState{table: nextTable} + u[utf8Byte] = &faNext{steps: []*faState{nextStep}} } - for utf8Byte := range lastSteps { - lastStep := &dfaStep{table: newSmallTable[*dfaStep]()} // note no transition - u[utf8Byte] = &dfaStep{table: makeSmallDfaTable(success, []byte{valueTerminator}, []*dfaStep{lastStep})} + + // for each val that ends at 'index', put a failure-transition for this anything-but + // if you hit the valueTerminator, success for everything else + for utf8Byte := range valsEndingHere { + failState := &faState{table: newSmallTable()} // note no transitions + lastStep := &faNext{steps: []*faState{failState}} + lastTable := makeSmallTable(success, []byte{valueTerminator}, []*faNext{lastStep}) + u[utf8Byte] = &faNext{steps: []*faState{{table: lastTable}}} } - table := newSmallTable[*dfaStep]() + + table := newSmallTable() table.pack(&u) return table } diff 
--git a/anything_but_test.go b/anything_but_test.go index 94c811d..180228a 100644 --- a/anything_but_test.go +++ b/anything_but_test.go @@ -11,7 +11,7 @@ func TestAnythingButMerging(t *testing.T) { q, _ := New() var err error - // can merge with DFA? + // can merge with FA? err = q.AddPattern("pFoo", pFoo) if err != nil { t.Error("add pFoo") @@ -63,11 +63,95 @@ func TestAnythingButMerging(t *testing.T) { } } +func TestFootCornerCase(t *testing.T) { + q, _ := New() + pFoot := `{"z": ["foot"]}` + err := q.AddPattern("foot", pFoot) + if err != nil { + t.Error("addP: " + err.Error()) + } + m, err := q.MatchesForEvent([]byte(`{"z": "foot"}`)) + if err != nil { + t.Error(err.Error()) + } + if len(m) != 1 || m[0] != "foot" { + t.Error("foot not 1") + } + q, _ = New() + pNotFoo := `{"z": [ { "anything-but": ["foo"]} ] }` + err = q.AddPattern("notFoo", pNotFoo) + if err != nil { + t.Error("addP: " + err.Error()) + } + m, err = q.MatchesForEvent([]byte(`{"z": "foot"}`)) + if err != nil { + t.Error(err.Error()) + } + if len(m) != 1 || m[0] != "notFoo" { + t.Error("foot not 1") + } + q, _ = New() + pFooStar := `{"z": [ { "shellstyle": "foo*" } ] }` + err = q.AddPattern("foostar", pFooStar) + if err != nil { + t.Error("addP: " + err.Error()) + } + m, err = q.MatchesForEvent([]byte(`{"z": "foot"}`)) + if err != nil { + t.Error(err.Error()) + } + if len(m) != 1 || m[0] != "foostar" { + t.Error("foot not 1") + } +} + +func TestAnythingButAlgo(t *testing.T) { + notJoeTim := `{"x": [ { "anything-but": ["joe", "tim"] } ] }` + q, _ := New() + err := q.AddPattern("notJoeTim", notJoeTim) + if err != nil { + t.Error("NJT: " + err.Error()) + } + event := `{"x": "toe"}` + matches, err := q.MatchesForEvent([]byte(event)) + if err != nil { + t.Error("NJT: " + err.Error()) + } + if len(matches) != 1 { + t.Error("NJT: Didn't match") + } + event = `{"x": "joe"}` + matches, err = q.MatchesForEvent([]byte(event)) + if err != nil { + t.Error("NJT: " + err.Error()) + } + if len(matches) != 0 { + 
t.Error("NJT: matched joe") + } + + notTTT := `{"x": [ { "anything-but": ["tim", "time", "timed"] } ] }` + q, _ = New() + err = q.AddPattern("notTTT", notTTT) + if err != nil { + t.Error("NTTT: " + err.Error()) + } + events := []string{`{"x": "tim"}`, `{"x": "time"}`, `{"x": "timed"}`} + for _, ev := range events { + matches, err := q.MatchesForEvent([]byte(ev)) + if err != nil { + t.Error("NTTT: (" + ev + ") " + err.Error()) + } + if len(matches) != 0 { + t.Error("NTTT: (" + ev + ") matched") + } + } +} + func TestAnythingButMatching(t *testing.T) { q, _ := New() // the idea is we're testing against all the 5-letter Wordle patterns, so we want a 4-letter prefix and // suffix of an existing wordle, a 5-letter non-wordle, and a 6-letter where the wordle might match at the start - // and end. I tried to think of scenarios that would defeat the pretty-simple anything-but DFA but couldn't. + // and end. I tried to think of scenarios that would defeat the pretty-simple anything-but FA but couldn't. problemWords := []string{ `"bloo"`, `"aper"`, diff --git a/benchmarks_test.go b/benchmarks_test.go index fcaac9d..e5c2c2e 100644 --- a/benchmarks_test.go +++ b/benchmarks_test.go @@ -198,7 +198,7 @@ func TestBigShellStyle(t *testing.T) { } // TestPatternAddition adds a whole lot of string-only rules as fast as possible The profiler says that the -// performance is totally doinated by the garbage-collector thrashing, in particular it has to allocate +// performance is totally dominated by the garbage-collector thrashing, in particular it has to allocate // ~220K smallTables. Tried https://blog.twitch.tv/en/2019/04/10/go-memory-ballast-how-i-learnt-to-stop-worrying-and-love-the-heap/ // but it doesn't seem to help. 
// TODO: Add shellstyle patterns @@ -231,7 +231,7 @@ func TestPatternAddition(t *testing.T) { runtime.ReadMemStats(&msAfter) delta := 1.0 / 1000000.0 * float64(msAfter.Alloc-msBefore.Alloc) fmt.Printf("before %d, after %d, delta %f\n", msBefore.Alloc, msAfter.Alloc, delta) - fmt.Println("stats:" + matcherStats(m)) + fmt.Println("statsAccum:" + matcherStats(m)) elapsed := float64(time.Since(before).Milliseconds()) perSecond := float64(fieldCount) / (elapsed / 1000.0) fmt.Printf("%.2f fields/second\n\n", perSecond) diff --git a/core_matcher.go b/core_matcher.go index 284fc61..e51eb79 100644 --- a/core_matcher.go +++ b/core_matcher.go @@ -30,7 +30,7 @@ type coreMatcher struct { // state is the start of the automaton. // segmentsTree is a structure that encodes which fields appear in the Patterns that are added to the coreMatcher. // It is built during calls to addPattern. It implements SegmentsTreeTracker, which is used by the event flattener -// to optimize the flattening process by skipping the processing of fields which are not used in any patern. +// to optimize the flattening process by skipping the processing of fields which are not used in any pattern. type coreFields struct { state *fieldMatcher segmentsTree *segmentsTree @@ -64,7 +64,7 @@ func (m *coreMatcher) addPattern(x X, patternJSON string) error { m.lock.Lock() defer m.lock.Unlock() - // we build up the new coreMatcher state in freshStart so we can atomically switch it in once complete + // we build up the new coreMatcher state in freshStart so that we can atomically switch it in once complete freshStart := &coreFields{} currentFields := m.fields() freshStart.segmentsTree = currentFields.segmentsTree.copy() @@ -163,7 +163,7 @@ func (a fieldsList) Swap(i, j int) { // matchesForFields takes a list of Field structures, sorts them by pathname, and launches the field-matching // process. The fields in a pattern to match are similarly sorted; thus running an automaton over them works. 
-// No error can be returned but the matcher interface requires one and it is used by the pruner implementation +// No error can be returned but the matcher interface requires one, and it is used by the pruner implementation func (m *coreMatcher) matchesForFields(fields []Field) ([]X, error) { if len(fields) == 0 { fields = emptyFields() @@ -227,7 +227,7 @@ func tryToMatch(fields []Field, index int, state *fieldMatcher, matches *matchSe func checkExistsFalse(stateFields *fmFields, fields []Field, index int, matches *matchSet) { for existsFalsePath, existsFalseTrans := range stateFields.existsFalse { - // it seems like there ought to be a more state-machine-idiomatic way to do this but + // it seems like there ought to be a more state-machine-idiomatic way to do this, but // I thought of a few and none of them worked. Quite likely someone will figure it out eventually. // Could get slow for big events with hundreds or more fields (not that I've ever seen that) - might // be worthwhile switching to binary search at some field count or building a map[]boolean in addPattern diff --git a/core_matcher_test.go b/core_matcher_test.go index 4d098bd..6b2906d 100644 --- a/core_matcher_test.go +++ b/core_matcher_test.go @@ -139,6 +139,58 @@ func TestFieldNameOrdering(t *testing.T) { } } +func TestSuffixBug(t *testing.T) { + var err error + j := `{"Url": "xy9"}` + patterns := []string{ + `{ "Url": [ { "shellstyle": "*9" } ] }`, + `{ "Url": [ { "shellstyle": "x*9" } ] }`, + } + + // make sure each works individually + m := newCoreMatcher() + _ = m.addPattern("p0", patterns[0]) + matches, _ := m.matchesForJSONEvent([]byte(j)) + if len(matches) != 1 || matches[0] != "p0" { + t.Error("p0 didn't match") + } + + m = newCoreMatcher() + _ = m.addPattern("p1", patterns[1]) + matches, _ = m.matchesForJSONEvent([]byte(j)) + if len(matches) != 1 || matches[0] != "p1" { + t.Error("p1 didn't match") + } + + // now let's see if they work merged + m = newCoreMatcher() + wanted := 
make(map[X]int) + for _, should := range patterns { + wanted[should] = 0 + err = m.addPattern(should, should) + if err != nil { + t.Error("add one of many: " + err.Error()) + } + } + matches, err = m.matchesForJSONEvent([]byte(j)) + if err != nil { + t.Error("m4J on all: " + err.Error()) + } + if len(matches) != len(patterns) { + for _, match := range matches { + wanted[match]++ + } + for want := range wanted { + if wanted[want] == 0 { + t.Errorf("Missed: %s", want.(string)) + } else { + fmt.Printf("Matched %s\n", want) + } + } + fmt.Println() + } +} + func TestExerciseMatching(t *testing.T) { j := `{ "Image": { @@ -234,12 +286,11 @@ func TestExerciseMatching(t *testing.T) { } for want := range wanted { if wanted[want] == 0 { - t.Errorf("Missed: %v" + want.(string)) + t.Errorf("Missed: %s", want.(string)) } } fmt.Println() } - // fmt.Println("Should not: " + matcherStats(m)) } func TestTacos(t *testing.T) { diff --git a/field_matcher.go b/field_matcher.go index d2bcc5a..6b13cf8 100644 --- a/field_matcher.go +++ b/field_matcher.go @@ -94,7 +94,7 @@ func (m *fieldMatcher) addExists(exists bool, field *patternField) []*fieldMatch } func (m *fieldMatcher) addTransition(field *patternField) []*fieldMatcher { - // we build the new updateable state in freshStart so we can blsat it in atomically once computed + // we build the new updateable state in freshStart so that we can blast it in atomically once computed current := m.fields() freshStart := &fmFields{ matches: current.matches, diff --git a/flatten_json_test.go b/flatten_json_test.go index 4b38648..580400f 100644 --- a/flatten_json_test.go +++ b/flatten_json_test.go @@ -2,7 +2,6 @@ package quamina import ( "bytes" - "fmt" "os" "testing" ) @@ -126,7 +125,7 @@ func TestFJSkippingErrors(t *testing.T) { for _, event := range events { fields, err := f.Flatten([]byte(event), matcher.getSegmentsTreeTracker()) if err == nil { - t.Errorf("Expected to fail [%s], but got %d fields", string(event), len(fields)) + 
t.Errorf("Expected to fail [%s], but got %d fields", event, len(fields)) } } } @@ -248,7 +247,7 @@ func testTrackerSelection(t *testing.T, fj Flattener, tracker SegmentsTreeTracke } func TestFJErrorCases(t *testing.T) { - // adding "a\nx" so we will goes into "a" object, otherwise we will skip it + // adding "a\nx" so we will go into "a" object, otherwise we will skip it matcher := fakeMatcher("a", "b", "c", "d", "e", "f", "a\nx") fj := newJSONFlattener().(*flattenJSON) @@ -316,76 +315,6 @@ func TestFJErrorCases(t *testing.T) { } } -func TestSkipUnusedPaths(t *testing.T) { - // Each of theses cases has a nested object with additional values that should - // be skipped after the specified paths have been checked - // - // e.g., take the following object and paths: - // - // object: { "a": { "b": 1, "c": 2} } - // paths: ["a\nb", "d"] - // - // After the flattener evaluates a.b, it should skip a.c before looking - // for d in the outer object. - // - // These tests make sure that the flattener correctly skips remaining - // values including nested objects and arrays. - // - // The tests below contain an additional path to look for after the - // paths in the nested object to make sure the flattener correctly - // exits the nested object and begings parsing the rest of the event - // at the correct location. 
- cases := []struct { - event string - matcherPaths []string - }{ - { - event: `{"nested":{"thing":"whatever","extra":{}}}`, - matcherPaths: []string{"nested\nthing", "another"}, - }, - { - event: `{"nested":{"thing":"whatever","extra":{"empty": false}}}`, - matcherPaths: []string{"nested\nthing", "another"}, - }, - { - event: `{"nested":{"thing":"whatever","extra":[{}]}}`, - matcherPaths: []string{"nested\nthing", "another"}, - }, - { - event: `{"nested":{"thing":"whatever","extra":[{"empty": false}]}}`, - matcherPaths: []string{"nested\nthing", "another"}, - }, - { - event: `{"nested":{"thing":"whatever","extra":[]}}`, - matcherPaths: []string{"nested\nthing", "another"}, - }, - { - event: `{"nested":{"thing":"whatever","extra":[1,"two",true,null]}}`, - matcherPaths: []string{"nested\nthing", "another"}, - }, - { - event: `{"nested":{"thing":"whatever","extra":[],"andAnother":{}}}`, - matcherPaths: []string{"nested\nthing", "another"}, - }, - } - - for i, c := range cases { - t.Run(fmt.Sprintf("case_%d", i), func(t *testing.T) { - matcher := fakeMatcher(c.matcherPaths...) - - fj := newJSONFlattener().(*flattenJSON) - - // ignore the fields - this test isn't concerned with the flattening result - _, err := fj.Flatten([]byte(c.event), matcher.getSegmentsTreeTracker()) - - // make sure the flattener didn't return an error - if err != nil { - t.Fatalf("failed to flatten json: %v", err) - } - }) - } -} - func fakeMatcher(paths ...string) *coreMatcher { m := newCoreMatcher() for _, path := range paths { diff --git a/list_maker.go b/list_maker.go deleted file mode 100644 index a9211f7..0000000 --- a/list_maker.go +++ /dev/null @@ -1,117 +0,0 @@ -package quamina - -// this needs to exist so that all the lists containing a single step to X are the same list, and similarly all -// those containing the triple step to X,Y,Z are the same list, so that pack/unpack work properly. 
In a large majority -// of cases, there's only one step in the list, so those are handled straightforwardly with a map. Otherwise, we -// laboriously look through all the lists for a match. In Java I'd implement a hashCode() method and everything -// would be a hash, but I haven't learned yet what the Go equivalent is. -// TODO: This can be greatly reduced when we shake down the whole DFA/NFA mess. -type dfaMemory struct { - singletons map[*nfaStep]*dfaStep - plurals []perList -} -type perList struct { - list []*nfaStep - dfa *dfaStep -} - -func newDfaMemory() *dfaMemory { - return &dfaMemory{singletons: make(map[*nfaStep]*dfaStep)} -} - -func (m *dfaMemory) rememberDfaForList(dfa *dfaStep, steps ...*nfaStep) { - if len(steps) == 1 { - m.singletons[steps[0]] = dfa - } else { - m.plurals = append(m.plurals, perList{list: steps, dfa: dfa}) - } -} - -func (m *dfaMemory) dfaForNfas(steps ...*nfaStep) (*dfaStep, bool) { - if len(steps) == 1 { - d, ok := m.singletons[steps[0]] - return d, ok - } - for _, p := range m.plurals { - if nfaListsEqual(p.list, steps) { - return p.dfa, true - } - } - return nil, false -} - -func nfaListsEqual(l1, l2 []*nfaStep) bool { - if len(l1) != len(l2) { - return false - } - for _, e1 := range l1 { - if !nfaListContains(l2, e1) { - return false - } - } - return true -} - -func nfaListContains(list []*nfaStep, step *nfaStep) bool { - for _, e := range list { - if e == step { - return true - } - } - return false -} - -type listMaker struct { - singletons map[*nfaStep]*nfaStepList - plurals []*nfaStepList -} - -func newListMaker() *listMaker { - return &listMaker{singletons: make(map[*nfaStep]*nfaStepList)} -} - -func (l *listMaker) getSingleton(step *nfaStep) *nfaStepList { - already, ok := l.singletons[step] - if ok { - return already - } - list := &nfaStepList{steps: []*nfaStep{step}} - l.singletons[step] = list - return list -} - -func (l *listMaker) getList(steps ...*nfaStep) *nfaStepList { - if len(steps) == 1 { - return 
l.getSingleton(steps[0]) - } - - for _, already := range l.plurals { - if listsAreEqual(already.steps, steps) { - return already - } - } - list := &nfaStepList{steps: steps} - l.plurals = append(l.plurals, list) - return list -} - -func listsAreEqual(l1, l2 []*nfaStep) bool { - if len(l1) != len(l2) { - return false - } - for _, step := range l1 { - if !listMakerContains(l2, step) { - return false - } - } - return true -} - -func listMakerContains(list []*nfaStep, step *nfaStep) bool { - for _, fromList := range list { - if step == fromList { - return true - } - } - return false -} diff --git a/list_maker_test.go b/list_maker_test.go deleted file mode 100644 index 16e7347..0000000 --- a/list_maker_test.go +++ /dev/null @@ -1,122 +0,0 @@ -package quamina - -import ( - "testing" -) - -func TestDfaMemory(t *testing.T) { - d1 := &dfaStep{} - d3 := &dfaStep{} - d12 := &dfaStep{} - d13 := &dfaStep{} - d123 := &dfaStep{} - ns1 := &nfaStep{} - ns2 := &nfaStep{} - ns3 := &nfaStep{} - l1 := []*nfaStep{ns1} - l3 := []*nfaStep{ns3} - l12 := []*nfaStep{ns1, ns2} - l13 := []*nfaStep{ns1, ns3} - l123 := []*nfaStep{ns1, ns2, ns3} - - mem := newDfaMemory() - mem.rememberDfaForList(d1, l1...) - mem.rememberDfaForList(d3, l3...) - mem.rememberDfaForList(d12, l12...) - mem.rememberDfaForList(d13, l13...) - mem.rememberDfaForList(d123, l123...) - - var ok bool - var d *dfaStep - d, ok = mem.dfaForNfas(l1...) - if ok == false || d != d1 { - t.Error("failed d1") - } - d, ok = mem.dfaForNfas(l3...) - if ok == false || d != d3 { - t.Error("failed d1") - } - var shouldMatches [][]*nfaStep - shouldMatches = [][]*nfaStep{{ns1, ns2}, {ns2, ns1}} - for i, should := range shouldMatches { - d, ok := mem.dfaForNfas(should...) - if ok == false || d != d12 { - t.Errorf("no match on %d", i) - } - } - shouldMatches = [][]*nfaStep{{ns1, ns3}, {ns3, ns1}} - for i, should := range shouldMatches { - d, ok := mem.dfaForNfas(should...) 
- if ok == false || d != d13 { - t.Errorf("no match on %d", i) - } - } - shouldMatches = [][]*nfaStep{{ns1, ns2, ns3}, {ns1, ns3, ns2}, {ns3, ns1, ns2}, {ns3, ns2, ns1}} - for i, should := range shouldMatches { - d, ok := mem.dfaForNfas(should...) - if ok == false || d != d123 { - t.Errorf("no match on %d", i) - } - } - - noDfaFor := [][]*nfaStep{ - {&nfaStep{}}, - {ns2}, - {ns3, ns2}, - {ns1, ns2, &nfaStep{}}, - {ns1, ns2, ns3, &nfaStep{}}, - } - - for i, no := range noDfaFor { - _, ok = mem.dfaForNfas(no...) - if ok { - t.Errorf("bogus match %d", i) - } - } -} - -func TestListMaker(t *testing.T) { - steps := []*nfaStep{ - {}, - {}, - {}, - } - multi := [][]*nfaStep{ - {steps[0]}, - {steps[0], steps[1]}, - {steps[0], steps[1], steps[2]}, - {steps[0], steps[2]}, - {steps[1]}, - {steps[1], steps[2]}, - {steps[2]}, - } - lm := newListMaker() - lists := make(map[*nfaStepList]bool) - for _, step := range steps { - lists[lm.getSingleton(step)] = true - } - if len(lists) != 3 { - t.Error("length should be 3") - } - for _, step := range steps { - lists[lm.getSingleton(step)] = true - } - if len(lists) != 3 { - t.Error("length STILL should be 3") - } - lm = newListMaker() - lists = make(map[*nfaStepList]bool) - for _, plural := range multi { - lists[lm.getList(plural...)] = true - } - wanted := len(multi) - if len(lists) != wanted { - t.Errorf("Got %d wanted %d", len(lists), wanted) - } - for _, plural := range multi { - lists[lm.getList(plural...)] = true - } - if len(lists) != wanted { - t.Errorf("Got %d STILL wanted %d", len(lists), wanted) - } -} diff --git a/nfa.go b/nfa.go new file mode 100644 index 0000000..e8fb2fa --- /dev/null +++ b/nfa.go @@ -0,0 +1,166 @@ +package quamina + +// This groups the functions that traverse, merge, and debug Quamina's nondeterministic finite automata + +func traverseFA(table *smallTable, val []byte, transitions []*fieldMatcher) []*fieldMatcher { + return traverseOneFAStep(table, 0, val, transitions) +} + +func traverseOneFAStep(table 
*smallTable, index int, val []byte, transitions []*fieldMatcher) []*fieldMatcher { + var utf8Byte byte + switch { + case index < len(val): + utf8Byte = val[index] + case index == len(val): + utf8Byte = valueTerminator + default: + return transitions + } + nextSteps := table.step(utf8Byte) + if nextSteps == nil { + return transitions + } + index++ + for _, nextStep := range nextSteps.steps { + transitions = append(transitions, nextStep.fieldTransitions...) + transitions = traverseOneFAStep(nextStep.table, index, val, transitions) + } + return transitions +} + +// mergeFAs compute the union of two valueMatch automata. If you look up the textbook theory about this, +// they say to compute the set product for automata A and B and build A0B0, A0B1 … A1BN, A1B0 … but if you look +// at that you realize that many of the product states aren't reachable. So you compute A0B0 and then keep +// recursing on the transitions coming out, I'm pretty sure you get a correct result. I don't know if it's +// minimal or even avoids being wasteful. +// INVARIANT: neither argument is nil +// INVARIANT: To be thread-safe, no existing table can be updated except when we're building it + +type faStepKey struct { + step1 *faState + step2 *faState +} + +func mergeFAs(table1, table2 *smallTable) *smallTable { + state1 := &faState{table: table1} + state2 := &faState{table: table2} + return mergeFAStates(state1, state2, make(map[faStepKey]*faState)).table +} + +// TODO: maybe memoize these based on the string of characters you matched to get here? +// TODO: recursion seems way too deep +func mergeFAStates(state1, state2 *faState, keyMemo map[faStepKey]*faState) *faState { + var combined *faState + mKey := faStepKey{state1, state2} + combined, ok := keyMemo[mKey] + if ok { + return combined + } + + newTable := newSmallTable() + + fieldTransitions := append(state1.fieldTransitions, state2.fieldTransitions...) 
+ combined = &faState{table: newTable, fieldTransitions: fieldTransitions} + //DEBUG combined.table.label = fmt.Sprintf("(%s ∎ %s)", state1.table.label, state2.table.label) + keyMemo[mKey] = combined + u1 := unpackTable(state1.table) + u2 := unpackTable(state2.table) + var uComb unpackedTable + + for i, next1 := range u1 { + next2 := u2[i] + switch { + case next1 == nil && next2 == nil: + uComb[i] = nil + case next1 != nil && next2 == nil: + uComb[i] = u1[i] + case next1 == nil && next2 != nil: + uComb[i] = u2[i] + case next1 != nil && next2 != nil: + //fmt.Printf("MERGE %s & %s i=%d d=%d: ", next1, next2, i, depth) + if next1 == next2 { + // fmt.Println("n1 == n2") + uComb[i] = next1 + } else if i > 0 && next1 == u1[i-1] && next2 == u2[i-1] { + uComb[i] = uComb[i-1] + // fmt.Printf("SEQ %s\n", uComb[i].steps[0].table.shortDump()) + } else { + // fmt.Println("RECURSE!") + var comboNext []*faState + for _, nextStep1 := range next1.steps { + for _, nextStep2 := range next2.steps { + comboNext = append(comboNext, mergeFAStates(nextStep1, nextStep2, keyMemo)) + } + } + uComb[i] = &faNext{steps: comboNext} + //DEBUG uComb[i].serial = *serial + } + } + } + combined.table.pack(&uComb) + + return combined +} + +/**************************************/ +/* debugging apparatus from here down */ +/**************************************/ +/* +func (t *smallTable) dump() string { + return dump1(&faState{table: t}, 0, make(map[*smallTable]bool)) +} +func dump1(fas *faState, indent int, already map[*smallTable]bool) string { + t := fas.table + s := " " + st2(t) + "\n" + for _, step := range t.steps { + if step != nil { + for _, state := range step.steps { + _, ok := already[state.table] + if !ok { + already[state.table] = true + s += dump1(state, indent+1, already) + } + } + } + } + return s +} +func (t *smallTable) shortDump() string { + return fmt.Sprintf("%d-%s", t.serial, t.label) +} + +func (n *faNext) String() string { + var snames []string + for _, step := range n.steps { 
+ snames = append(snames, fmt.Sprintf("%d %s", step.table.serial, step.table.label)) + } + return "[" + strings.Join(snames, " · ") + "]" +} + +func stString(t *smallTable) string { + var rows []string + + for i := range t.ceilings { + c := t.ceilings[i] + if i == 0 { + c = 0 + } else { + if c != valueTerminator && c != byte(byteCeiling) { + c = t.ceilings[i-1] + } + } + var trailer string + if i == len(t.ceilings)-1 && c != valueTerminator && c != byte(byteCeiling) { + trailer = "…" + } else { + trailer = "" + } + if t.steps[i] != nil { + rows = append(rows, fmt.Sprintf("%s%s:%s ", branchChar(c), trailer, t.steps[i].String())) + } else { + rows = append(rows, fmt.Sprintf("%s%s:nil ", branchChar(c), trailer)) + } + } + return fmt.Sprintf("s%d [%s] ", t.serial, t.label) + strings.Join(rows, "/ ") +} +*/ diff --git a/nfa_test.go b/nfa_test.go new file mode 100644 index 0000000..058d81e --- /dev/null +++ b/nfa_test.go @@ -0,0 +1,107 @@ +package quamina + +import ( + "fmt" + "testing" + "unsafe" +) + +// TestArrayBehavior is here prove that (a) you can index a map with an array and +// the indexing actually relies on the values in the array. This has nothing to do with +// Quamina but I'm leaving it here because I had to write this stupid test after failing +// to find a straightforward question of whether this works as expected anywhere in the +// Golang docs. +func TestArrayBehavior(t *testing.T) { + type gpig [4]int + pigs := []gpig{ + {1, 2, 3, 4}, + {4, 3, 2, 1}, + } + nonPigs := []gpig{ + {3, 4, 3, 4}, + {99, 88, 77, 66}, + } + m := make(map[gpig]bool) + for _, pig := range pigs { + m[pig] = true + } + for _, pig := range pigs { + _, ok := m[pig] + if !ok { + t.Error("missed pig") + } + } + pigs[0][0] = 111 + pigs[1][3] = 777 + pigs = append(pigs, nonPigs...) 
+ for _, pig := range pigs { + _, ok := m[pig] + if ok { + t.Error("mutant pig") + } + } + newPig := gpig{1, 2, 3, 4} + _, ok := m[newPig] + if !ok { + t.Error("Newpig") + } +} + +func TestFocusedMerge(t *testing.T) { + shellStyles := []string{ + "a*b", + "ab*", + "*ab", + } + var automata []*smallTable + var matchers []*fieldMatcher + + for _, shellStyle := range shellStyles { + str := `"` + shellStyle + `"` + automaton, matcher := makeShellStyleAutomaton([]byte(str)) + automata = append(automata, automaton) + matchers = append(matchers, matcher) + } + + var cab uintptr + for _, mm := range matchers { + uu := uintptr(unsafe.Pointer(mm)) + cab = cab ^ uu + } + + merged := newSmallTable() + for _, automaton := range automata { + merged = mergeFAs(merged, automaton) + + s := statsAccum{ + fmVisited: make(map[*fieldMatcher]bool), + vmVisited: make(map[*valueMatcher]bool), + stVisited: make(map[any]bool), + } + faStats(merged, &s) + fmt.Println(s.stStats()) + } +} + +func TestNFABasics(t *testing.T) { + aFoo, fFoo := makeStringFA([]byte("foo"), nil) + var matches []*fieldMatcher + + matches = traverseOneFAStep(aFoo, 0, []byte("foo"), nil) + if len(matches) != 1 || matches[0] != fFoo { + t.Error("ouch no foo") + } + matches = traverseOneFAStep(aFoo, 0, []byte("foot"), nil) + if len(matches) != 0 { + t.Error("ouch yes foot") + } + + aNotFoot, fNotFoot := makeMultiAnythingButFA([][]byte{[]byte("foot")}) + notFeet := []string{"foo", "footy", "afoot", "xyz"} + for _, notFoot := range notFeet { + matches = traverseOneFAStep(aNotFoot, 0, []byte(notFoot), nil) + if len(matches) != 1 || matches[0] != fNotFoot { + t.Error("!foot miss: " + notFoot) + } + } +} diff --git a/pattern.go b/pattern.go index e9aa127..cf3d718 100644 --- a/pattern.go +++ b/pattern.go @@ -47,7 +47,7 @@ type patternBuild struct { // patternFromJSON compiles a JSON text provided in jsonBytes into a list of patternField structures. 
// I love naked returns and I cannot lie func patternFromJSON(jsonBytes []byte) (fields []*patternField, err error) { - // we can't use json.Unmarshal because it round-trips numbers through float64 and %f so they won't end up matching + // we can't use json.Unmarshal because it round-trips numbers through float64 and %f, so they won't end up matching // what the caller actually wrote in the patternField. json.Decoder is kind of slow due to excessive // memory allocation, but I haven't got around to prematurely optimizing the patternFromJSON code path var pb patternBuild diff --git a/pattern_test.go b/pattern_test.go index 9149798..4e218c6 100644 --- a/pattern_test.go +++ b/pattern_test.go @@ -41,61 +41,6 @@ func TestPatternErrorHandling(t *testing.T) { } } -// test that adding an empty pattern doesn't screw anything up -func TestEmptyValueArray(t *testing.T) { - var err error - empties := []string{ - `{"data": {"field": []}}`, - `{"data": {"field": [], "field2": [23]}}`, - `{"data": {"field2": [], "field": [23]}}`, - } - wanted := map[string][]X{ - `{"data": {"field": 23}}`: {empties[2]}, - `{"data": {"field2": 23}}`: {empties[1]}, - `{"data": {"field": []}}`: {}, - } - cm := newCoreMatcher() - defer func() { - if r := recover(); r != nil { - t.Errorf("addPattern panicked") - } - }() - for _, empty := range empties { - err = cm.addPattern(empty, empty) - if err != nil { - t.Error("addPattern: " + err.Error()) - } - } - - for want := range wanted { - matches, err := cm.matchesForJSONEvent([]byte(want)) - if err != nil { - t.Errorf("m4je err on %s", want) - } - checkXEqual(t, wanted[want], matches) - } -} -func checkXEqual(t *testing.T, x1s []X, x2s []X) { - t.Helper() - x2size := len(x2s) - for _, x1 := range x1s { - count := 0 - for _, x2 := range x2s { - if x1 == x2 { - count++ - } - } - if count != 1 { - t.Errorf("for %s in X1, %d", x1, count) - } else { - x2size-- - } - } - if x2size != 0 { - t.Error("Extra elements in X2") - } -} - func TestPatternFromJSON(t 
*testing.T) { bads := []string{ `x`, diff --git a/pruner.go b/pruner.go index b35f16b..fe2fcd3 100644 --- a/pruner.go +++ b/pruner.go @@ -61,7 +61,7 @@ type prunerMatcher struct { // Maybe prunerMatcher should maybe not be embedded or public. - // live is live set of patterns. + // live is the live set of patterns. live LivePatternsState stats prunerStats @@ -123,7 +123,7 @@ func (t *tooMuchFiltering) rebuild(added bool, s *prunerStats) bool { // We won't rebuildWhileLocked if nothing's been emitted yet. // // In isolation, this heuristic is arguable, but for this - // policy we need it. Otherwise we'll divide by zero, and + // policy we need it. Otherwise, we'll divide by zero, and // nobody wants that. if s.Emitted == 0 { return false @@ -148,9 +148,9 @@ func (m *prunerMatcher) disableRebuild() { // rebuildTrigger provides a way to control when rebuilds are // automatically triggered during standard operations. // -// Currently an addPattern, deletePatterns, or matchesForFields can +// Currently, an addPattern, deletePatterns, or matchesForFields can // trigger a rebuild. When a rebuild is triggered, it's executed -// synchronously: the the Add/Delete/Match method doesn't return until +// synchronously: the Add/Delete/Match method doesn't return until // the rebuild is complete. type rebuildTrigger interface { // rebuild should return true to trigger a rebuild. diff --git a/shell_style.go b/shell_style.go index 48792b2..d447222 100644 --- a/shell_style.go +++ b/shell_style.go @@ -51,20 +51,14 @@ func readShellStyleSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []type } // makeShellStyleAutomaton - recognize a "-delimited string containing one '*' glob. 
-// TODO: Make this recursive like makeStringAutomaton -func makeShellStyleAutomaton(val []byte, useThisTransition *fieldMatcher) (start *smallTable[*nfaStepList], nextField *fieldMatcher) { - table := newSmallTable[*nfaStepList]() +func makeShellStyleAutomaton(val []byte) (start *smallTable, nextField *fieldMatcher) { + table := newSmallTable() start = table - if useThisTransition != nil { - nextField = useThisTransition - } else { - nextField = newFieldMatcher() - } - lister := newListMaker() + nextField = newFieldMatcher() // for each byte in the pattern - var globStep *nfaStep = nil - var globExitStep *nfaStep = nil + var globStep *faState = nil + var globExitStep *faState = nil var globExitByte byte i := 0 for i < len(val) { @@ -72,54 +66,58 @@ func makeShellStyleAutomaton(val []byte, useThisTransition *fieldMatcher) (start if ch == '*' { // special-case handling for string ending in '*"' - transition to field match on any character. // we know the trailing '"' will be there because of JSON syntax. 
- // TODO: This doesn't even need to be an NFA if i == len(val)-2 { - step := &nfaStep{table: newSmallTable[*nfaStepList](), fieldTransitions: []*fieldMatcher{nextField}} - list := lister.getList(step) - table.addRangeSteps(0, byteCeiling, list) + step := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}} + table.setDefault(&faNext{steps: []*faState{step}}) + //DEBUG step.table.label = fmt.Sprintf("prefix escape at %d", i) return } // loop back on everything - globStep = &nfaStep{table: table} - table.addRangeSteps(0, byteCeiling, lister.getList(globStep)) + globStep = &faState{table: table} + //DEBUG table.label = fmt.Sprintf("gS at %d", i) + table.setDefault(&faNext{steps: []*faState{globStep}}) // escape the glob on the next char from the pattern - remember the byte and the state escaped to i++ globExitByte = val[i] - globExitStep = &nfaStep{table: newSmallTable[*nfaStepList]()} + globExitStep = &faState{table: newSmallTable()} + //DEBUG globExitStep.table.label = fmt.Sprintf("gX on %c at %d", val[i], i) // escape the glob - table.addByteStep(globExitByte, lister.getList(globExitStep)) + table.addByteStep(globExitByte, &faNext{steps: []*faState{globExitStep}}) table = globExitStep.table } else { - nextStep := &nfaStep{table: newSmallTable[*nfaStepList]()} + nextStep := &faState{table: newSmallTable()} + //DEBUG nextStep.table.label = fmt.Sprintf("on %c at %d", val[i], i) // we're going to move forward on 'ch'. On anything else, we leave it at nil or - if we've passed // a glob, loop back to the glob state.
if 'ch' is also the glob exit byte, also put in a transfer back to the glob exit state if globExitStep != nil { - table.addRangeSteps(0, byteCeiling, lister.getList(globStep)) + table.setDefault(&faNext{steps: []*faState{globStep}}) if ch == globExitByte { - table.addByteStep(ch, lister.getList(globExitStep, nextStep)) + table.addByteStep(ch, &faNext{steps: []*faState{globExitStep, nextStep}}) } else { - table.addByteStep(globExitByte, lister.getList(globExitStep)) - table.addByteStep(ch, lister.getList(nextStep)) + table.addByteStep(globExitByte, &faNext{steps: []*faState{globExitStep}}) + table.addByteStep(ch, &faNext{steps: []*faState{nextStep}}) } } else { - table.addByteStep(ch, lister.getList(nextStep)) + table.addByteStep(ch, &faNext{steps: []*faState{nextStep}}) } table = nextStep.table } i++ } - lastStep := &nfaStep{table: newSmallTable[*nfaStepList](), fieldTransitions: []*fieldMatcher{nextField}} + lastStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}} + //DEBUG lastStep.table.label = fmt.Sprintf("last step at %d", i) if globExitStep != nil { - table.addRangeSteps(0, byteCeiling, lister.getList(globStep)) - table.addByteStep(globExitByte, lister.getList(globExitStep)) - table.addByteStep(valueTerminator, lister.getList(lastStep)) + table.setDefault(&faNext{steps: []*faState{globStep}}) + table.addByteStep(globExitByte, &faNext{steps: []*faState{globExitStep}}) + table.addByteStep(valueTerminator, &faNext{steps: []*faState{lastStep}}) } else { - table.addByteStep(valueTerminator, lister.getList(lastStep)) + table.addByteStep(valueTerminator, &faNext{steps: []*faState{lastStep}}) } + // fmt.Printf("new for [%s]: %s\n", string(val), start.dump()) return } diff --git a/shell_style_test.go b/shell_style_test.go index 09a6a62..494a6a6 100644 --- a/shell_style_test.go +++ b/shell_style_test.go @@ -57,29 +57,23 @@ func TestMakeShellStyleAutomaton(t *testing.T) { {`"ayybyyzxx"`}, } - // NOTE also testing nfa2Dfa for i, 
pattern := range patterns { - myNext := newFieldMatcher() - a, wanted := makeShellStyleAutomaton([]byte(pattern), myNext) - if wanted != myNext { - t.Error("bad next on: " + pattern) - } - d := nfa2Dfa(a) + a, wanted := makeShellStyleAutomaton([]byte(pattern)) vm := newValueMatcher() - vmf := vmFields{startDfa: d} + vmf := vmFields{startTable: a} vm.update(&vmf) for _, should := range shouldsForPatterns[i] { var transitions []*fieldMatcher - gotTrans := transitionDfa(d, []byte(should), transitions) + gotTrans := traverseFA(a, []byte(should), transitions) if len(gotTrans) != 1 || gotTrans[0] != wanted { t.Errorf("Failure for %s on %s", pattern, should) } } for _, shouldNot := range shouldNotForPatterns[i] { var transitions []*fieldMatcher - gotTrans := transitionDfa(d, []byte(shouldNot), transitions) + gotTrans := traverseFA(a, []byte(shouldNot), transitions) if gotTrans != nil { - t.Errorf("bogus DFA match for %s on %s", pattern, shouldNot) + t.Errorf("bogus match for %s on %s", pattern, shouldNot) } } } @@ -89,18 +83,17 @@ func TestShellStyleBuildTime(t *testing.T) { words := readWWords(t) starWords := make([]string, 0, len(words)) patterns := make([]string, 0, len(words)) + source := rand.NewSource(293591) for _, word := range words { //nolint:gosec - starAt := rand.Int31n(6) + starAt := source.Int63() % 6 starWord := string(word[:starAt]) + "*" + string(word[starAt:]) starWords = append(starWords, starWord) pattern := fmt.Sprintf(`{"x": [ {"shellstyle": "%s" } ] }`, starWord) patterns = append(patterns, pattern) } q, _ := New() - for i := 0; i < 32; i++ { - // fmt.Printf("i=%d w=%s: %s\n", i, starWords[i], matcherStats(q.matcher.(*coreMatcher))) - // fmt.Println(patterns[i]) + for i := 0; i < 21; i++ { err := q.AddPattern(starWords[i], patterns[i]) if err != nil { t.Error("AddP: " + err.Error()) @@ -140,6 +133,8 @@ func TestMixedPatterns(t *testing.T) { t.Error("addPattern: " + name + ", prob=" + err.Error()) } } + fmt.Println("M: " + matcherStats(m)) + got := 
make(map[X]int) lines := getCityLotsLines(t) for _, line := range lines { diff --git a/small_table.go b/small_table.go index 9484cb5..e64bd8b 100644 --- a/small_table.go +++ b/small_table.go @@ -1,26 +1,19 @@ package quamina -// dfaStep and nfaStep are used by the valueMatcher automaton - every step through the +// faState is used by the valueMatcher automaton - every step through the // automaton requires a smallTable and for some of them, taking the step means you've matched a value and can // transition to a new fieldMatcher, in which case the fieldTransitions slice will be non-nil -type dfaStep struct { - table *smallTable[*dfaStep] - fieldTransitions []*fieldMatcher -} - -type nfaStep struct { - table *smallTable[*nfaStepList] +type faState struct { + table *smallTable fieldTransitions []*fieldMatcher } // struct wrapper to make this comparable to help with pack/unpack -type nfaStepList struct { - steps []*nfaStep +type faNext struct { + // serial int // very useful in debugging table construction + steps []*faState } -// TODO: declare dfaTable { smallTable[*dfaStep } and nfaTable { smallTable[*nfaStepList] } -// and make a bunch of code more concise and readable. - // byteCeiling - the automaton runs on UTF-8 bytes, which map nicely to Go's byte, which is uint8. The values // 0xF5-0xFF can't appear in UTF-8 strings. We use 0xF5 as a value terminator, so characters F6 and higher // can't appear. @@ -52,23 +45,25 @@ const valueTerminator byte = 0xf5 // or array construct. One could imagine making step() smarter and do a binary search in the case where there are // more than some number of entries. 
But I'm dubious, the ceilings field is []byte and running through a single-digit // number of those has a good chance of minimizing memory fetches -type smallTable[S comparable] struct { +type smallTable struct { + //DEBUG label string + //DEBUG serial uint64 ceilings []byte - steps []S + steps []*faNext } // newSmallTable mostly exists to enforce the constraint that every smallTable has a byteCeiling entry at // the end, which smallTable.step totally depends on. -func newSmallTable[S comparable]() *smallTable[S] { - var sNil S // declared but not assigned, thus serves as nil - return &smallTable[S]{ +func newSmallTable() *smallTable { + return &smallTable{ + //DEBUG serial: rand.Uint64() % 1000, ceilings: []byte{byte(byteCeiling)}, - steps: []S{sNil}, + steps: []*faNext{nil}, } } // step finds the member of steps in the smallTable that corresponds to the utf8Byte argument. It may return nil. -func (t *smallTable[S]) step(utf8Byte byte) S { +func (t *smallTable) step(utf8Byte byte) *faNext { for index, ceiling := range t.ceilings { if utf8Byte < ceiling { return t.steps[index] @@ -77,148 +72,16 @@ func (t *smallTable[S]) step(utf8Byte byte) S { panic("Malformed smallTable") } -// mergeDfas and mergeNfas compute the union of two valueMatch automata. If you look up the textbook theory about this, -// they say to compute the set product for automata A and B and build A0B0, A0B1 … A1BN, A1B0 … but if you look -// at that you realize that many of the product states aren't reachable. So you compute A0B0 and then keep -// recursing on the transitions coming out, I'm pretty sure you get a correct result. I don't know if it's -// minimal or even avoids being wasteful. 
-// INVARIANT: neither argument is nil -// INVARIANT: To be thread-safe, no existing table can be updated except when we're building it -func mergeDfas(existing, newStep *smallTable[*dfaStep]) *smallTable[*dfaStep] { - step1 := &dfaStep{table: existing} - step2 := &dfaStep{table: newStep} - return mergeOneDfaStep(step1, step2, make(map[dfaStepKey]*dfaStep)).table -} - -// dfaStepKey exists to serve as the key for the memoize map that's needed to control recursion in mergeAutomata -type dfaStepKey struct { - step1 *dfaStep - step2 *dfaStep -} - -func mergeOneDfaStep(step1, step2 *dfaStep, memoize map[dfaStepKey]*dfaStep) *dfaStep { - var combined *dfaStep - - // to support automata that loop back to themselves (typically on *) we have to stop recursing (and also - // trampolined recursion) - mKey := dfaStepKey{step1: step1, step2: step2} - combined, ok := memoize[mKey] - if ok { - return combined - } - - // TODO: this works, all the tests pass, but should to be able to have with just one *fieldMatcher - newTable := newSmallTable[*dfaStep]() - switch { - case step1.fieldTransitions == nil && step2.fieldTransitions == nil: - combined = &dfaStep{table: newTable} - case step1.fieldTransitions != nil && step2.fieldTransitions != nil: - transitions := append(step1.fieldTransitions, step2.fieldTransitions...) 
- combined = &dfaStep{table: newTable, fieldTransitions: transitions} - case step1.fieldTransitions != nil && step2.fieldTransitions == nil: - combined = &dfaStep{table: newTable, fieldTransitions: step1.fieldTransitions} - case step1.fieldTransitions == nil && step2.fieldTransitions != nil: - combined = &dfaStep{table: newTable, fieldTransitions: step2.fieldTransitions} - } - memoize[mKey] = combined - - uExisting := unpackTable(step1.table) - uNew := unpackTable(step2.table) - var uComb unpackedTable[*dfaStep] - for i, stepExisting := range uExisting { - stepNew := uNew[i] - switch { - case stepExisting == nil && stepNew == nil: - uComb[i] = nil - case stepExisting != nil && stepNew == nil: - uComb[i] = stepExisting - case stepExisting == nil && stepNew != nil: - uComb[i] = stepNew - case stepExisting != nil && stepNew != nil: - // there are considerable runs of the same value - if i > 0 && stepExisting == uExisting[i-1] && stepNew == uNew[i-1] { - uComb[i] = uComb[i-1] - } else { - uComb[i] = mergeOneDfaStep(stepExisting, stepNew, memoize) - } - } - } - combined.table.pack(&uComb) - return combined -} - -// nfa2Dfa does what the name says. As of now it does not consider epsilon -// transitions in the NFA because, as of the time of writing, none of the -// pattern-matching required those transitions. It is based on the algorithm -// taught in the TU München course “Automata and Formal Languages”, lecturer -// Prof. Dr. Ernst W. Mayr in 2014-15, in particular the examples appearing in -// http://wwwmayr.informatik.tu-muenchen.de/lehre/2014WS/afs/2014-10-14.pdf -// especially the slide in Example 11. -func nfa2Dfa(table *smallTable[*nfaStepList]) *smallTable[*dfaStep] { - firstStep := &nfaStepList{steps: []*nfaStep{{table: table}}} - return nfaStep2DfaStep(firstStep, newDfaMemory()).table -} - -func nfaStep2DfaStep(stepList *nfaStepList, memoize *dfaMemory) *dfaStep { - var dStep *dfaStep - dStep, ok := memoize.dfaForNfas(stepList.steps...) 
- if ok { - return dStep - } - dStep = &dfaStep{ - table: &smallTable[*dfaStep]{}, - } - memoize.rememberDfaForList(dStep, stepList.steps...) - if len(stepList.steps) == 1 { - // there's only stepList.steps[0] - nStep := stepList.steps[0] - dStep.fieldTransitions = nStep.fieldTransitions - dStep.table.ceilings = make([]byte, len(nStep.table.ceilings)) - dStep.table.steps = make([]*dfaStep, len(nStep.table.ceilings)) // defaults will be nil, which is OK - for i, nfaList := range nStep.table.steps { - dStep.table.ceilings[i] = nStep.table.ceilings[i] - if nfaList != nil { - dStep.table.steps[i] = nfaStep2DfaStep(nfaList, memoize) - } - } - } else { - // coalesce - first, unpack each of the steps - unpackedNfaSteps := make([]*unpackedTable[*nfaStepList], len(stepList.steps)) - var unpackedDfa unpackedTable[*dfaStep] - for i, list := range stepList.steps { - unpackedNfaSteps[i] = unpackTable(list.table) - dStep.fieldTransitions = append(dStep.fieldTransitions, list.fieldTransitions...) - } - for utf8Byte := 0; utf8Byte < byteCeiling; utf8Byte++ { - steps := make(map[*nfaStep]bool) - for _, table := range unpackedNfaSteps { - if table[utf8Byte] != nil { - for _, step := range table[utf8Byte].steps { - steps[step] = true - } - } - } - var synthStep nfaStepList - for step := range steps { - synthStep.steps = append(synthStep.steps, step) - } - unpackedDfa[utf8Byte] = nfaStep2DfaStep(&synthStep, memoize) - } - dStep.table.pack(&unpackedDfa) - } - - return dStep -} - -// makeSmallDfaTable creates a pre-loaded small table, with all bytes not otherwise specified having the defaultStep +// makeSmallTable creates a pre-loaded small table, with all bytes not otherwise specified having the defaultStep // value, and then a few other values with their indexes and values specified in the other two arguments. 
The // goal is to reduce memory churn // constraint: positions must be provided in order -func makeSmallDfaTable(defaultStep *dfaStep, indices []byte, steps []*dfaStep) *smallTable[*dfaStep] { - t := smallTable[*dfaStep]{ +func makeSmallTable(defaultStep *faNext, indices []byte, steps []*faNext) *smallTable { + t := smallTable{ ceilings: make([]byte, 0, len(indices)+2), - steps: make([]*dfaStep, 0, len(indices)+2), + steps: make([]*faNext, 0, len(indices)+2), } + var lastIndex byte = 0 for i, index := range indices { if index > lastIndex { @@ -237,13 +100,13 @@ func makeSmallDfaTable(defaultStep *dfaStep, indices []byte, steps []*dfaStep) * } // unpackedTable replicates the data in the smallTable ceilings and steps arrays. It's quite hard to -// update the list structure in a smallDfaTable, but trivial in an unpackedTable. The idea is that to update -// a smallDfaTable you unpack it, update, then re-pack it. Not gonna be the most efficient thing so at some future point… -// TODO: Figure out how to update a smallDfaTable in place -type unpackedTable[S comparable] [byteCeiling]S +// update the list structure in a smallTable, but trivial in an unpackedTable. The idea is that to update +// a smallTable you unpack it, update, then re-pack it. 
Not gonna be the most efficient thing so at some future point… +// TODO: Figure out how to update a smallTable in place +type unpackedTable [byteCeiling]*faNext -func unpackTable[S comparable](t *smallTable[S]) *unpackedTable[S] { - var u unpackedTable[S] +func unpackTable(t *smallTable) *unpackedTable { + var u unpackedTable unpackedIndex := 0 for packedIndex, c := range t.ceilings { ceiling := int(c) @@ -255,9 +118,9 @@ func unpackTable[S comparable](t *smallTable[S]) *unpackedTable[S] { return &u } -func (t *smallTable[S]) pack(u *unpackedTable[S]) { +func (t *smallTable) pack(u *unpackedTable) { ceilings := make([]byte, 0, 16) - steps := make([]S, 0, 16) + steps := make([]*faNext, 0, 16) lastStep := u[0] for unpackedIndex, ss := range u { if ss != lastStep { @@ -272,16 +135,84 @@ func (t *smallTable[S]) pack(u *unpackedTable[S]) { t.steps = steps } -func (t *smallTable[S]) addByteStep(utf8Byte byte, step S) { +func (t *smallTable) addByteStep(utf8Byte byte, step *faNext) { unpacked := unpackTable(t) unpacked[utf8Byte] = step t.pack(unpacked) } -func (t *smallTable[S]) addRangeSteps(floor int, ceiling int, s S) { +// setDefault sets all the values of the table to the provided faNext pointer +// TODO: Do we need this at all? Maybe just a variant of newSmallTable? +func (t *smallTable) setDefault(s *faNext) { + t.steps = []*faNext{s} + t.ceilings = []byte{byte(byteCeiling)} +} + +// Debugging from here down +/* +// addRangeSteps not currently used but think it will be useful in future regex-y work +func (t *smallTable) addRangeSteps(floor int, ceiling int, s *faNext) { unpacked := unpackTable(t) for i := floor; i < ceiling; i++ { unpacked[i] = s } t.pack(unpacked) } + +func st2(t *smallTable) string { + // going to build a string rep of a smallTable based on the unpacked form + // each line is going to be a range like + // 'c' .. 
'e' => %X + // lines where the *faNext is nil are omitted + var rows []string + unpacked := unpackTable(t) + + var rangeStart int + var b int + + defTrans := unpacked[0] + + for { + for b < len(unpacked) && unpacked[b] == nil { + b++ + } + if b == len(unpacked) { + break + } + rangeStart = b + lastN := unpacked[b] + for b < len(unpacked) && unpacked[b] == lastN { + b++ + } + if lastN != defTrans { + row := "" + if b == rangeStart+1 { + row += fmt.Sprintf("'%s'", branchChar((byte(rangeStart)))) + } else { + row += fmt.Sprintf("'%s'…'%s'", branchChar(byte(rangeStart)), branchChar(byte(b-1))) + } + row += " → " + lastN.String() + rows = append(rows, row) + } + } + if defTrans != nil { + dtString := "★ → " + defTrans.String() + return fmt.Sprintf("%d [%s] ", t.serial, t.label) + strings.Join(rows, " / ") + " / " + dtString + } else { + return fmt.Sprintf("%d [%s] ", t.serial%1000, t.label) + strings.Join(rows, " / ") + } +} + +func branchChar(b byte) string { + switch b { + case 0: + return "∅" + case valueTerminator: + return "ℵ" + case byte(byteCeiling): + return "♾️" + default: + return fmt.Sprintf("%c", b) + } +} +*/ diff --git a/small_table_test.go b/small_table_test.go index c89bd8a..b8b3134 100644 --- a/small_table_test.go +++ b/small_table_test.go @@ -2,11 +2,11 @@ package quamina import ( "fmt" - "math/rand" "testing" "time" ) +/* TODO: Restore func TestMakeSmallTable(t *testing.T) { tMST(t, []byte{1, 2, 33}) tMST(t, []byte{0, 1, 2, 33, byte(byteCeiling - 1)}) @@ -36,11 +36,9 @@ func tMST(t *testing.T, b []byte) { } } -func newDfaTransition(f *fieldMatcher) *dfaStep { - return &dfaStep{table: newSmallTable[*dfaStep](), fieldTransitions: []*fieldMatcher{f}} -} +*/ -func TestDFAMergePerf(t *testing.T) { +func TestFAMergePerf(t *testing.T) { words := readWWords(t) patterns := make([]string, 0, len(words)) for _, word := range words { @@ -71,6 +69,7 @@ func TestDFAMergePerf(t *testing.T) { fmt.Printf("%.2f addPatterns/second with letter patterns\n\n", perSecond) } 
+/* TODO: Restore func TestCombiner(t *testing.T) { // "jab" A0 := &dfaStep{table: newSmallTable[*dfaStep]()} @@ -100,7 +99,7 @@ func TestCombiner(t *testing.T) { combo := mergeOneDfaStep(A0, B0, make(map[dfaStepKey]*dfaStep)) - state := &vmFields{startDfa: combo.table} + state := &vmFields{startTable: combo.table} vm := newValueMatcher() vm.update(state) matches := vm.transitionOn([]byte("jab")) @@ -126,8 +125,8 @@ func TestCombiner(t *testing.T) { st = newDfaTransition(CFM) C2.table.addByteStep(valueTerminator, st) - combo = mergeOneDfaStep(&dfaStep{table: vm.getFields().startDfa}, C0, make(map[dfaStepKey]*dfaStep)) - vm.update(&vmFields{startDfa: combo.table}) + combo = mergeOneDfaStep(&dfaStep{table: vm.getFields().startTable}, C0, make(map[dfaStepKey]*dfaStep)) + vm.update(&vmFields{startTable: combo.table}) matches = vm.transitionOn([]byte("jab")) if len(matches) != 1 || matches[0].fields().transitions["AFM"] == nil { t.Error("wanted AFM") @@ -144,7 +143,9 @@ func TestCombiner(t *testing.T) { t.Error("should have BFM and CFM") } } +*/ +/* TODO: Restore func TestUnpack(t *testing.T) { st1 := &dfaStep{table: newSmallTable[*dfaStep]()} @@ -166,6 +167,7 @@ func TestUnpack(t *testing.T) { } } + func TestFuzzPack(t *testing.T) { seeds := []int64{9, 81, 1729, 8, 64, 512, 7, 49, 343, 6, 36, 216, 5, 25, 125} for _, seed := range seeds { @@ -176,7 +178,7 @@ func TestFuzzPack(t *testing.T) { func fuzzPack(t *testing.T, seed int64) { t.Helper() - rand.New(rand.NewSource(seed)) + rand.Seed(seed) var used [byteCeiling]bool var unpacked unpackedTable[*dfaStep] @@ -234,3 +236,53 @@ func fuzzPack(t *testing.T, seed int64) { } } } + + +*/ + +/* Debug testing +func TestSt2(t *testing.T) { + fas1 := faNext{ + serial: 1, + steps: []*faState{}, + } + fas2 := faNext{ + serial: 2, + steps: []*faState{}, + } + fas3 := faNext{ + serial: 3, + steps: []*faState{}, + } + fas4 := faNext{ + serial: 4, + steps: []*faState{}, + } + fas0 := faNext{ + serial: 0, + steps: []*faState{}, + } + + 
fasp1 := &fas1 + fasp2 := &fas2 + fasp3 := &fas3 + fasp4 := &fas4 + fasp0 := &fas0 + + table := newSmallTable() + table.addByteStep('b', fasp1) + table.addByteStep('c', fasp2) + table.addByteStep('z', fasp2) + table.addByteStep('$', fasp3) + table.addRangeSteps('p', 't', fasp4) + + DEBUG fmt.Println("ta-da! " + st2(table)) + + table = newSmallTable() + table.addByteStep('c', fasp2) + + table = makeSmallTable(fasp0, []byte{'b', 'c', 'z', '$'}, []*faNext{fasp1, fasp2, fasp2, fasp3}) + fmt.Println("to-do! " + st2(table)) +} + +*/ diff --git a/stats.go b/stats.go index 80d520a..cb1618d 100644 --- a/stats.go +++ b/stats.go @@ -3,7 +3,7 @@ package quamina import "fmt" // TODO: add stats for average and max smallTable fanout -type stats struct { +type statsAccum struct { fmCount int fmTblCount int fmEntries int @@ -19,10 +19,18 @@ type stats struct { siCount int } +func (s *statsAccum) stStats() string { + avgStSize := "n/a" + if s.stTblCount > 0 { + avgStSize = fmt.Sprintf("%.3f", float64(s.stEntries)/float64(s.stTblCount)) + } + return fmt.Sprintf("SmallTables %d (avg size %s, max %d), singletons %d", s.stCount, avgStSize, s.stMax, s.siCount) +} + // matcherStats gathers statistics about the size of a coreMatcher, including the average and max fanout sizes of // the transition tables, returning this information in string form func matcherStats(m *coreMatcher) string { - s := stats{ + s := statsAccum{ fmVisited: make(map[*fieldMatcher]bool), vmVisited: make(map[*valueMatcher]bool), stVisited: make(map[any]bool), @@ -39,7 +47,7 @@ func matcherStats(m *coreMatcher) string { return fmPart + vmPart + stPart } -func fmStats(m *fieldMatcher, s *stats) { +func fmStats(m *fieldMatcher, s *statsAccum) { if s.fmVisited[m] { return } @@ -59,7 +67,7 @@ func fmStats(m *fieldMatcher, s *stats) { } } -func vmStats(m *valueMatcher, s *stats) { +func vmStats(m *valueMatcher, s *statsAccum) { if s.vmVisited[m] { return } @@ -70,12 +78,12 @@ func vmStats(m *valueMatcher, s *stats) { 
s.siCount++ fmStats(state.singletonTransition, s) } - if state.startDfa != nil { - dfaStats(state.startDfa, s) + if state.startTable != nil { + faStats(state.startTable, s) } } -func dfaStats(t *smallTable[*dfaStep], s *stats) { +func faStats(t *smallTable, s *statsAccum) { if s.stVisited[t] { return } @@ -89,14 +97,16 @@ func dfaStats(t *smallTable[*dfaStep], s *stats) { s.stTblCount++ s.stEntries += len(t.ceilings) } - for _, step := range t.steps { - if step != nil { - if step.fieldTransitions != nil { - for _, m := range step.fieldTransitions { - fmStats(m, s) + for _, next := range t.steps { + if next != nil { + for _, step := range next.steps { + if step.fieldTransitions != nil { + for _, m := range step.fieldTransitions { + fmStats(m, s) + } } + faStats(step.table, s) } - dfaStats(step.table, s) } } } diff --git a/value_matcher.go b/value_matcher.go index 8219349..4a42a9e 100644 --- a/value_matcher.go +++ b/value_matcher.go @@ -5,16 +5,13 @@ import ( "sync/atomic" ) -// valueMatcher represents a byte-driven automaton. The table needs to be the -// equivalent of a map[byte]nextState and is represented by smallTable. Some -// patterns can be represented by a deterministic finite automaton (DFA) but -// others, particularly with a regex flavor, need to be represented by a -// nondeterministic finite automaton (NFA). NFAs are converted to DFAs for -// simplicity and efficiency. The basic algorithm is to compute the automaton -// for a pattern, convert it to a DFA if necessary, and merge with any -// existing DFA. +// valueMatcher represents a byte-driven finite automaton (FA). The table needs to be the +// equivalent of a map[byte]nextState and is represented by smallTable. +// In this implementation all the FAs are nondeterministic, which means each +// byte can cause transfers to multiple other states. The basic algorithm is to compute the FA +// for a pattern and merge with any existing FA. 
// In some (common) cases there is only one byte sequence forward from a state, -// i.e. a string-valued field with only one string match. In this case, the DFA +// i.e. a string-valued field with only one string match. In this case, the FA // will be null and the value being matched has to exactly equal the singletonMatch // field; if so, the singletonTransition is the return value. This is to avoid // having a long chain of smallTables each with only one entry. @@ -25,7 +22,7 @@ type valueMatcher struct { updateable atomic.Value // always contains *vmFields } type vmFields struct { - startDfa *smallTable[*dfaStep] + startTable *smallTable singletonMatch []byte singletonTransition *fieldMatcher } @@ -66,63 +63,37 @@ func (m *valueMatcher) transitionOn(val []byte) []*fieldMatcher { } return transitions - case fields.startDfa != nil: - return transitionDfa(fields.startDfa, val, transitions) + case fields.startTable != nil: + return traverseFA(fields.startTable, val, transitions) default: - // no dfa, no singleton, nothing to do, this probably can't happen because a flattener + // no FA, no singleton, nothing to do, this probably can't happen because a flattener // shouldn't preserve a field that hasn't appeared in a pattern return transitions } } -func transitionDfa(table *smallTable[*dfaStep], val []byte, transitions []*fieldMatcher) []*fieldMatcher { - // step through the smallTables, byte by byte - for _, utf8Byte := range val { - step := table.step(utf8Byte) - if step == nil { - return transitions - } - - transitions = append(transitions, step.fieldTransitions...) - table = step.table - } - - // look for terminator after exhausting bytes of val - lastStep := table.step(valueTerminator) - - // we only do a field-level transition if there's one in the table that the - // last character in val arrives at - if lastStep != nil { - transitions = append(transitions, lastStep.fieldTransitions...) 
- } - - return transitions -} - func (m *valueMatcher) addTransition(val typedVal) *fieldMatcher { valBytes := []byte(val.val) fields := m.getFieldsForUpdate() // there's already a table, thus an out-degree > 1 - if fields.startDfa != nil { - var newDfa *smallTable[*dfaStep] + if fields.startTable != nil { + var newFA *smallTable var nextField *fieldMatcher switch val.vType { case stringType, numberType, literalType: - newDfa, nextField = makeStringAutomaton(valBytes, nil) + newFA, nextField = makeStringFA(valBytes, nil) case anythingButType: - newDfa, nextField = makeMultiAnythingButAutomaton(val.list, nil) + newFA, nextField = makeMultiAnythingButFA(val.list) case shellStyleType: - var newNfa *smallTable[*nfaStepList] - newNfa, nextField = makeShellStyleAutomaton(valBytes, nil) - newDfa = nfa2Dfa(newNfa) + newFA, nextField = makeShellStyleAutomaton(valBytes) case prefixType: - newDfa, nextField = makePrefixAutomaton(valBytes, nil) + newFA, nextField = makePrefixAutomaton(valBytes) default: panic("unknown value type") } - fields.startDfa = mergeDfas(fields.startDfa, newDfa) + fields.startTable = mergeFAs(fields.startTable, newFA) m.update(fields) return nextField } @@ -139,18 +110,18 @@ func (m *valueMatcher) addTransition(val typedVal) *fieldMatcher { m.update(fields) return fields.singletonTransition case anythingButType: - newAutomaton, nextField := makeMultiAnythingButAutomaton(val.list, nil) - fields.startDfa = newAutomaton + newFA, nextField := makeMultiAnythingButFA(val.list) + fields.startTable = newFA m.update(fields) return nextField case shellStyleType: - newAutomaton, nextField := makeShellStyleAutomaton(valBytes, nil) - fields.startDfa = nfa2Dfa(newAutomaton) + newAutomaton, nextField := makeShellStyleAutomaton(valBytes) + fields.startTable = newAutomaton m.update(fields) return nextField case prefixType: - newAutomaton, nextField := makePrefixAutomaton(valBytes, nil) - fields.startDfa = newAutomaton + newFA, nextField := makePrefixAutomaton(valBytes) 
+ fields.startTable = newFA m.update(fields) return nextField default: @@ -167,77 +138,85 @@ func (m *valueMatcher) addTransition(val typedVal) *fieldMatcher { // singleton is here, we don't match, so our outdegree becomes 2, so we have // to build an automaton with two values in it - singletonAutomaton, _ := makeStringAutomaton(fields.singletonMatch, fields.singletonTransition) + singletonAutomaton, _ := makeStringFA(fields.singletonMatch, fields.singletonTransition) var nextField *fieldMatcher - var newDfa *smallTable[*dfaStep] + var newFA *smallTable switch val.vType { case stringType, numberType, literalType: - newDfa, nextField = makeStringAutomaton(valBytes, nil) + newFA, nextField = makeStringFA(valBytes, nil) case anythingButType: - newDfa, nextField = makeMultiAnythingButAutomaton(val.list, nil) + newFA, nextField = makeMultiAnythingButFA(val.list) case shellStyleType: - var newNfa *smallTable[*nfaStepList] - newNfa, nextField = makeShellStyleAutomaton(valBytes, nil) - newDfa = nfa2Dfa(newNfa) + newFA, nextField = makeShellStyleAutomaton(valBytes) case prefixType: - newDfa, nextField = makePrefixAutomaton(valBytes, nil) + newFA, nextField = makePrefixAutomaton(valBytes) default: panic("unknown value type") } // now table is ready for use, nuke singleton to signal threads to use it - fields.startDfa = mergeDfas(singletonAutomaton, newDfa) + fields.startTable = mergeFAs(singletonAutomaton, newFA) + // fmt.Println("Merged: " + fields.startTable.dump()) fields.singletonMatch = nil fields.singletonTransition = nil m.update(fields) return nextField } -func makePrefixAutomaton(val []byte, useThisTransition *fieldMatcher) (*smallTable[*dfaStep], *fieldMatcher) { - var nextField *fieldMatcher - - if useThisTransition != nil { - nextField = useThisTransition - } else { - nextField = newFieldMatcher() - } +func makePrefixAutomaton(val []byte) (*smallTable, *fieldMatcher) { + nextField := newFieldMatcher() return onePrefixStep(val, 0, nextField), nextField } -func 
onePrefixStep(val []byte, index int, nextField *fieldMatcher) *smallTable[*dfaStep] { - var nextStep *dfaStep +func onePrefixStep(val []byte, index int, nextField *fieldMatcher) *smallTable { + var nextStep *faNext // have to stop one short to skip the closing " + var nextState *faState + if index == len(val)-2 { - nextStep = &dfaStep{table: newSmallTable[*dfaStep](), fieldTransitions: []*fieldMatcher{nextField}} + nextState = &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}} } else { - nextStep = &dfaStep{table: onePrefixStep(val, index+1, nextField)} + nextState = &faState{table: onePrefixStep(val, index+1, nextField)} } - return makeSmallDfaTable(nil, []byte{val[index]}, []*dfaStep{nextStep}) + nextStep = &faNext{steps: []*faState{nextState}} + return makeSmallTable(nil, []byte{val[index]}, []*faNext{nextStep}) } -// makeStringAutomaton creates a utf8-based automaton from a literal string +// makeStringFA creates a utf8-based automaton from a literal string // using smallTables. Note the addition of a valueTerminator. The implementation -// is recursive because this allows the use of the makeSmallDfaTable call, which +// is recursive because this allows the use of the makeSmallTable call, which // reduces memory churn. 
Converting from a straightforward implementation to // this approximately doubled the fields/second rate in addPattern -func makeStringAutomaton(val []byte, useThisTransition *fieldMatcher) (*smallTable[*dfaStep], *fieldMatcher) { +func makeStringFA(val []byte, useThisTransition *fieldMatcher) (*smallTable, *fieldMatcher) { var nextField *fieldMatcher if useThisTransition != nil { nextField = useThisTransition } else { nextField = newFieldMatcher() } - return oneDfaStep(val, 0, nextField), nextField + + return makeOneFAStep(val, 0, nextField), nextField } -func oneDfaStep(val []byte, index int, nextField *fieldMatcher) *smallTable[*dfaStep] { - var nextStep *dfaStep +func makeOneFAStep(val []byte, index int, nextField *fieldMatcher) *smallTable { + var nextStepList *faNext if index == len(val)-1 { - lastStep := &dfaStep{table: newSmallTable[*dfaStep](), fieldTransitions: []*fieldMatcher{nextField}} - nextStep = &dfaStep{table: makeSmallDfaTable(nil, []byte{valueTerminator}, []*dfaStep{lastStep})} + lastStep := &faState{ + table: newSmallTable(), + fieldTransitions: []*fieldMatcher{nextField}, + } + lastStepList := &faNext{steps: []*faState{lastStep}} + nextStep := &faState{ + table: makeSmallTable(nil, []byte{valueTerminator}, []*faNext{lastStepList}), + } + nextStepList = &faNext{steps: []*faState{nextStep}} } else { - nextStep = &dfaStep{table: oneDfaStep(val, index+1, nextField)} + nextStep := &faState{table: makeOneFAStep(val, index+1, nextField)} + nextStepList = &faNext{steps: []*faState{nextStep}} } - return makeSmallDfaTable(nil, []byte{val[index]}, []*dfaStep{nextStep}) + var u unpackedTable + u[val[index]] = nextStepList + // return stepper.buildTable(&u) + return makeSmallTable(nil, []byte{val[index]}, []*faNext{nextStepList}) } diff --git a/value_matcher_test.go b/value_matcher_test.go index 337c38e..e7938b4 100644 --- a/value_matcher_test.go +++ b/value_matcher_test.go @@ -196,11 +196,12 @@ func TestOverlappingValues(t *testing.T) { } func 
TestFuzzValueMatcher(t *testing.T) { - rand.New(rand.NewSource(98543)) + source := rand.NewSource(98543) + m := newCoreMatcher() var pNames []X bytes := "abcdefghijklmnopqrstuvwxyz" - lb := len(bytes) + lb := int64(len(bytes)) strLen := 12 used := make(map[X]bool) @@ -209,7 +210,7 @@ func TestFuzzValueMatcher(t *testing.T) { str := "" for j := 0; j < strLen; j++ { //nolint:gosec - ch := bytes[rand.Int()%lb] + ch := bytes[source.Int63()%lb] str += string([]byte{ch}) } pNames = append(pNames, str) @@ -248,7 +249,7 @@ func TestFuzzValueMatcher(t *testing.T) { str := "" for j := 0; j < strLen; j++ { //nolint:gosec - ch := bytes[rand.Int()%lb] + ch := bytes[source.Int63()%lb] str += string([]byte{ch}) } _, ok := used[str] @@ -268,7 +269,7 @@ func TestFuzzValueMatcher(t *testing.T) { } func TestFuzzWithNumbers(t *testing.T) { - rand.New(rand.NewSource(98543)) + source := rand.NewSource(98543) m := newCoreMatcher() var pNames []X used := make(map[X]bool) @@ -276,7 +277,7 @@ func TestFuzzWithNumbers(t *testing.T) { // make ten thousand random numbers between 1 and 100K. There are probably dupes? for i := 0; i < 10000; i++ { //nolint:gosec - n := rand.Int63n(1000000) + n := source.Int63() ns := fmt.Sprintf("%d", n) pNames = append(pNames, ns) used[ns] = true @@ -331,12 +332,3 @@ func TestFuzzWithNumbers(t *testing.T) { } } } - -func contains(list []*fieldMatcher, s *fieldMatcher) bool { - for _, l := range list { - if l == s { - return true - } - } - return false -}