-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add example of fuzzy matching * Tweak fuzzy table to fail earlier * Break out min3 to util * Formalize distance command * Fix tests * Package levenshtein as its own fuzzy package * First pass at sift4 * Clean up code a bit, add ratio, add tests * Swap dist search to use sift4 algorithm that's easily 5-10x faster * Rename dist command to fuzzy * Clean up fuzzy table that has keys with a lot of misses to speed up searches * Remove legacy return for similarity matches. Tweak algorithm * Switch to a more straight-forward algorithm for size * Readme * Mark fuzzy command as experimental
- Loading branch information
Showing
11 changed files
with
425 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
// +build experimental | ||
|
||
package cmd | ||
|
||
import ( | ||
"fmt" | ||
"rare/cmd/helpers" | ||
"rare/cmd/readProgress" | ||
"rare/pkg/aggregation" | ||
"rare/pkg/color" | ||
"rare/pkg/multiterm" | ||
|
||
"github.com/urfave/cli" | ||
) | ||
|
||
func fuzzyFunction(c *cli.Context) error { | ||
var ( | ||
topItems = c.Int("n") | ||
reverseSort = c.Bool("reverse") | ||
atLeast = c.Int64("atleast") | ||
sortByKey = c.Bool("sk") | ||
extra = c.Bool("extra") | ||
similarity = float32(c.Float64("similarity")) | ||
simOffset = c.Int("similarity-offset") | ||
simSize = c.Int("similarity-size") | ||
) | ||
|
||
counter := aggregation.NewFuzzyAggregator(similarity, simOffset, simSize) | ||
writer := multiterm.NewHistogram(multiterm.New(), topItems) | ||
writer.ShowBar = c.Bool("bars") || extra | ||
writer.ShowPercentage = c.Bool("percentage") || extra | ||
|
||
ext := helpers.BuildExtractorFromArguments(c) | ||
|
||
helpers.RunAggregationLoop(ext, counter, func() { | ||
writeHistoOutput(writer, counter.Histo, topItems, reverseSort, sortByKey, atLeast) | ||
writer.InnerWriter().WriteForLine(topItems, helpers.FWriteExtractorSummary(ext, | ||
counter.ParseErrors(), | ||
fmt.Sprintf("(Groups: %s) (Fuzzy: %s)", color.Wrapi(color.BrightBlue, counter.Histo.GroupCount()), color.Wrapi(color.BrightMagenta, counter.FuzzyTableSize())))) | ||
writer.InnerWriter().WriteForLine(topItems+1, readProgress.GetReadFileString()) | ||
}) | ||
|
||
writer.InnerWriter().Close() | ||
|
||
return nil | ||
} | ||
|
||
func fuzzyCommand() *cli.Command { | ||
return helpers.AdaptCommandForExtractor(cli.Command{ | ||
Name: "fuzzy", | ||
ShortName: "z", | ||
Aliases: []string{"fuz"}, | ||
Usage: "(EXPERIMENTAL) Look for similar matches by using a fuzzy search algorithm", | ||
Description: `Generates a live-updating histogram of the input data, looking | ||
for a relative distance between various results. This is useful to find | ||
similar log messages that may have slight differences to them (eg ids) | ||
and aggregating and search for these messages`, | ||
Action: fuzzyFunction, | ||
Flags: []cli.Flag{ | ||
cli.BoolFlag{ | ||
Name: "bars,b", | ||
Usage: "Display bars as part of histogram", | ||
}, | ||
cli.BoolFlag{ | ||
Name: "percentage", | ||
Usage: "Display percentage of total next to the value", | ||
}, | ||
cli.BoolFlag{ | ||
Name: "extra,x", | ||
Usage: "Alias for -b --percentage", | ||
}, | ||
cli.IntFlag{ | ||
Name: "num,n", | ||
Usage: "Number of elements to display", | ||
Value: 5, | ||
}, | ||
cli.Int64Flag{ | ||
Name: "atleast", | ||
Usage: "Only show results if there are at least this many samples", | ||
Value: 0, | ||
}, | ||
cli.BoolFlag{ | ||
Name: "reverse", | ||
Usage: "Reverses the display sort-order", | ||
}, | ||
cli.BoolFlag{ | ||
Name: "sortkey,sk", | ||
Usage: "Sort by key, rather than value", | ||
}, | ||
cli.Float64Flag{ | ||
Name: "similarity,s", | ||
Usage: "The expression string has to be at least this percent similar to qualify as a fuzzy match", | ||
Value: 0.75, | ||
}, | ||
cli.Int64Flag{ | ||
Name: "similarity-offset,so", | ||
Usage: "The max offset to examine in the string to look for a similarity", | ||
Value: 10, | ||
}, | ||
cli.Int64Flag{ | ||
Name: "similarity-size,ss", | ||
Usage: "The maximum size a similarity table can grow to. Keeps the top most-likely keys at all times", | ||
Value: 100, | ||
}, | ||
}, | ||
}) | ||
} | ||
|
||
func init() { | ||
commands = append(commands, *fuzzyCommand()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
// +build experimental | ||
|
||
package aggregation | ||
|
||
import ( | ||
"rare/pkg/fuzzy" | ||
) | ||
|
||
type FuzzyAggregator struct { | ||
lookup *fuzzy.FuzzyTable | ||
Histo *MatchCounter | ||
} | ||
|
||
func NewFuzzyAggregator(matchDist float32, maxOffset, maxSize int) *FuzzyAggregator { | ||
return &FuzzyAggregator{ | ||
lookup: fuzzy.NewFuzzyTable(matchDist, maxOffset, maxSize), | ||
Histo: NewCounter(), | ||
} | ||
} | ||
|
||
func (s *FuzzyAggregator) Sample(ele string) { | ||
similarStr, _ := s.lookup.GetMatchId(ele) | ||
s.Histo.SampleValue(similarStr, 1) | ||
} | ||
|
||
func (s *FuzzyAggregator) ParseErrors() uint64 { | ||
return s.Histo.ParseErrors() | ||
} | ||
|
||
func (s *FuzzyAggregator) FuzzyTableSize() int { | ||
return s.lookup.Count() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
package fuzzy | ||
|
||
import ( | ||
"rare/pkg/fuzzy/sift4" | ||
"sort" | ||
) | ||
|
||
// fuzzyItem is one candidate key in a FuzzyTable.
type fuzzyItem struct {
	// original is the string that was first stored for this key.
	original string
	// score is raised when the key is matched and lowered when it is
	// compared against a value without matching.
	score int64
}

// FuzzyTable groups similar strings under a shared key.
type FuzzyTable struct {
	// keys are the candidate keys, compared in slice order.
	keys []fuzzyItem
	// matchDist is the similarity ratio a comparison must exceed to match.
	matchDist float32
	// maxOffset is passed to the distance function on every comparison.
	maxOffset int
	// maxSize is the length keys is truncated to by Cleanup.
	maxSize int
	// searches counts unmatched lookups since the last Cleanup.
	searches int
}

// NewFuzzyTable creates an empty table. It panics when maxSize or maxOffset
// is negative.
func NewFuzzyTable(matchDist float32, maxOffset, maxSize int) *FuzzyTable {
	switch {
	case maxSize < 0:
		panic("Invalid max size")
	case maxOffset < 0:
		panic("Invalid max offset")
	}

	tbl := &FuzzyTable{
		keys:      []fuzzyItem{},
		matchDist: matchDist,
		maxOffset: maxOffset,
		maxSize:   maxSize,
	}
	return tbl
}
|
||
func (s *FuzzyTable) GetMatchId(val string) (match string, isNew bool) { | ||
for i := range s.keys { | ||
ele := &s.keys[i] | ||
d := sift4.DistanceStringRatio(ele.original, val, s.maxOffset) | ||
if d > s.matchDist { | ||
if d < 0.99 { // Imperfect matches score more | ||
ele.score += int64(len(s.keys)) | ||
} else { | ||
ele.score++ | ||
} | ||
return ele.original, false | ||
} | ||
ele.score-- | ||
} | ||
|
||
s.searches++ | ||
if s.searches >= 10 { | ||
s.Cleanup() | ||
s.searches = 0 | ||
} | ||
|
||
if len(s.keys) < s.maxSize || s.keys[len(s.keys)-1].score < 1 { | ||
newItem := fuzzyItem{ | ||
original: val, | ||
score: 1, | ||
} | ||
s.keys = append(s.keys, newItem) | ||
} | ||
|
||
return val, true | ||
} | ||
|
||
func (s *FuzzyTable) Cleanup() { | ||
// Sorting puts the most likely match candidate at the top of the search | ||
sort.Slice(s.keys, func(i, j int) bool { | ||
return s.keys[i].score > s.keys[j].score | ||
}) | ||
|
||
if len(s.keys) > s.maxSize { | ||
s.keys = s.keys[:s.maxSize] | ||
} | ||
} | ||
|
||
func (s *FuzzyTable) Count() int { | ||
return len(s.keys) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
package fuzzy | ||
|
||
import ( | ||
"fmt" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestSimpleTable(t *testing.T) { | ||
tbl := NewFuzzyTable(0.5, 5, -100) | ||
_, new := tbl.GetMatchId("test") | ||
assert.True(t, new) | ||
|
||
_, new = tbl.GetMatchId("test") | ||
assert.False(t, new) | ||
|
||
_, new = tbl.GetMatchId("blah") | ||
assert.True(t, new) | ||
|
||
_, new = tbl.GetMatchId("tast") | ||
assert.False(t, new) | ||
} | ||
|
||
func BenchmarkSimpleTable(b *testing.B) { | ||
tbl := NewFuzzyTable(0.7, 5, -100) | ||
for n := 0; n < b.N; n++ { | ||
tbl.GetMatchId(fmt.Sprintf("abcd-%d", n%100)) | ||
} | ||
} |
13 changes: 0 additions & 13 deletions
13
pkg/levenshtein/levenshtein.go → pkg/fuzzy/levenshtein/levenshtein.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
package levenshtein | ||
|
||
// min3 returns the smallest of the three integers a, b and c.
func min3(a, b, c int) int {
	smallest := c
	if a < smallest {
		smallest = a
	}
	if b < smallest {
		smallest = b
	}
	return smallest
}
Oops, something went wrong.