Skip to content

Commit

Permalink
Fuzzy search (#23)
Browse files Browse the repository at this point in the history
* Add example of fuzzy matching

* Tweak fuzzy table to fail earlier

* Break out min3 to util

* Formalize distance command

* Fix tests

* Package levenshtein as its own fuzzy package

* First pass at sift4

* Clean up code a bit, add ratio, add tests

* Swap dist search to use sift4 algorithm that's easily 5-10x faster

* Rename dist command to fuzzy

* Clean up fuzzy table that has keys with a lot of misses to speed up searches

* Remove legacy return for similarity matches. Tweak algorithm

* Switch to a more straight-forward algorithm for size

* Readme

* Mark fuzzy command as experimental
  • Loading branch information
zix99 committed Oct 18, 2020
1 parent 45cb991 commit 0cf16dd
Show file tree
Hide file tree
Showing 11 changed files with 425 additions and 22 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ go run github.com/gobuffalo/packr/v2/packr2

# Build binary
go build .

# OR, with experimental features
go build -tags experimental .
```

# Docs
Expand Down
16 changes: 9 additions & 7 deletions cmd/commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ package cmd

import "github.com/urfave/cli"

// commands is the package-level registry of CLI sub-commands. It starts with
// the always-available commands and is handed out by GetSupportedCommands.
var commands = []cli.Command{
	*filterCommand(),
	*histogramCommand(),
	*analyzeCommand(),
	*tabulateCommand(),
	*docsCommand(),
}

// GetSupportedCommands returns the package-level command registry.
//
// It returns the shared commands slice rather than building a new literal so
// that any command appended to the registry from an init function is included
// in the result.
func GetSupportedCommands() []cli.Command {
	return commands
}
111 changes: 111 additions & 0 deletions cmd/fuzzy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// +build experimental

package cmd

import (
"fmt"
"rare/cmd/helpers"
"rare/cmd/readProgress"
"rare/pkg/aggregation"
"rare/pkg/color"
"rare/pkg/multiterm"

"github.com/urfave/cli"
)

// fuzzyFunction is the action behind the experimental "fuzzy" command. It
// reads the display and similarity options from the CLI context, builds a
// fuzzy aggregator and a histogram writer, and runs the aggregation loop,
// redrawing the histogram and summary lines on every update.
//
// It returns an exit error when similarity-offset or similarity-size is
// negative, and nil otherwise.
func fuzzyFunction(c *cli.Context) error {
	var (
		topItems    = c.Int("n")
		reverseSort = c.Bool("reverse")
		atLeast     = c.Int64("atleast")
		sortByKey   = c.Bool("sk")
		extra       = c.Bool("extra")
		similarity  = float32(c.Float64("similarity"))
		simOffset   = c.Int("similarity-offset")
		simSize     = c.Int("similarity-size")
	)

	// The fuzzy table constructor panics on negative limits; turn bad user
	// input into a regular CLI error instead of a stack trace.
	if simOffset < 0 {
		return cli.NewExitError(fmt.Sprintf("similarity-offset must not be negative (got %d)", simOffset), 1)
	}
	if simSize < 0 {
		return cli.NewExitError(fmt.Sprintf("similarity-size must not be negative (got %d)", simSize), 1)
	}

	counter := aggregation.NewFuzzyAggregator(similarity, simOffset, simSize)
	writer := multiterm.NewHistogram(multiterm.New(), topItems)
	// --extra is shorthand for enabling both bars and percentages.
	writer.ShowBar = c.Bool("bars") || extra
	writer.ShowPercentage = c.Bool("percentage") || extra

	ext := helpers.BuildExtractorFromArguments(c)

	helpers.RunAggregationLoop(ext, counter, func() {
		writeHistoOutput(writer, counter.Histo, topItems, reverseSort, sortByKey, atLeast)
		// The two lines after the histogram rows carry the summary and the
		// read-progress string.
		writer.InnerWriter().WriteForLine(topItems, helpers.FWriteExtractorSummary(ext,
			counter.ParseErrors(),
			fmt.Sprintf("(Groups: %s) (Fuzzy: %s)", color.Wrapi(color.BrightBlue, counter.Histo.GroupCount()), color.Wrapi(color.BrightMagenta, counter.FuzzyTableSize()))))
		writer.InnerWriter().WriteForLine(topItems+1, readProgress.GetReadFileString())
	})

	writer.InnerWriter().Close()

	return nil
}

// fuzzyCommand builds the experimental "fuzzy" CLI command: its names, help
// text, action (fuzzyFunction) and flags, wrapped by
// helpers.AdaptCommandForExtractor.
//
// similarity-offset and similarity-size are declared as IntFlag because the
// action reads them with Context.Int and passes them on as plain ints.
func fuzzyCommand() *cli.Command {
	return helpers.AdaptCommandForExtractor(cli.Command{
		Name:      "fuzzy",
		ShortName: "z",
		Aliases:   []string{"fuz"},
		Usage:     "(EXPERIMENTAL) Look for similar matches by using a fuzzy search algorithm",
		Description: `Generates a live-updating histogram of the input data, looking
for a relative distance between various results. This is useful to find
similar log messages that may have slight differences to them (eg ids)
and aggregating and search for these messages`,
		Action: fuzzyFunction,
		Flags: []cli.Flag{
			cli.BoolFlag{
				Name:  "bars,b",
				Usage: "Display bars as part of histogram",
			},
			cli.BoolFlag{
				Name:  "percentage",
				Usage: "Display percentage of total next to the value",
			},
			cli.BoolFlag{
				Name:  "extra,x",
				Usage: "Alias for -b --percentage",
			},
			cli.IntFlag{
				Name:  "num,n",
				Usage: "Number of elements to display",
				Value: 5,
			},
			cli.Int64Flag{
				Name:  "atleast",
				Usage: "Only show results if there are at least this many samples",
				Value: 0,
			},
			cli.BoolFlag{
				Name:  "reverse",
				Usage: "Reverses the display sort-order",
			},
			cli.BoolFlag{
				Name:  "sortkey,sk",
				Usage: "Sort by key, rather than value",
			},
			cli.Float64Flag{
				Name:  "similarity,s",
				Usage: "The expression string has to be at least this percent similar to qualify as a fuzzy match",
				Value: 0.75,
			},
			cli.IntFlag{
				Name:  "similarity-offset,so",
				Usage: "The max offset to examine in the string to look for a similarity",
				Value: 10,
			},
			cli.IntFlag{
				Name:  "similarity-size,ss",
				Usage: "The maximum size a similarity table can grow to. Keeps the top most-likely keys at all times",
				Value: 100,
			},
		},
	})
}

func init() {
commands = append(commands, *fuzzyCommand())
}
32 changes: 32 additions & 0 deletions pkg/aggregation/fuzzy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// +build experimental

package aggregation

import (
"rare/pkg/fuzzy"
)

// FuzzyAggregator counts samples after grouping them through a fuzzy lookup
// table: each sampled string is replaced by the key the table returns for it,
// and that key is what gets counted.
type FuzzyAggregator struct {
	// lookup maps an incoming string to the key it should be counted under.
	lookup *fuzzy.FuzzyTable
	// Histo holds the per-key counts; exported so callers can render it.
	Histo *MatchCounter
}

// NewFuzzyAggregator creates an aggregator backed by a new fuzzy table built
// from matchDist, maxOffset and maxSize, and a new, empty counter.
func NewFuzzyAggregator(matchDist float32, maxOffset, maxSize int) *FuzzyAggregator {
	agg := new(FuzzyAggregator)
	agg.lookup = fuzzy.NewFuzzyTable(matchDist, maxOffset, maxSize)
	agg.Histo = NewCounter()
	return agg
}

// Sample resolves ele to its group key via the fuzzy table and adds one to
// that key's count. Whether the key was newly created is not used here.
func (s *FuzzyAggregator) Sample(ele string) {
	groupKey, _ := s.lookup.GetMatchId(ele)
	s.Histo.SampleValue(groupKey, 1)
}

// ParseErrors reports the parse-error count tracked by the underlying counter.
func (s *FuzzyAggregator) ParseErrors() uint64 {
	errCount := s.Histo.ParseErrors()
	return errCount
}

// FuzzyTableSize reports how many keys the fuzzy lookup table currently holds.
func (s *FuzzyAggregator) FuzzyTableSize() int {
	size := s.lookup.Count()
	return size
}
81 changes: 81 additions & 0 deletions pkg/fuzzy/fuzzyTable.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package fuzzy

import (
"rare/pkg/fuzzy/sift4"
"sort"
)

// fuzzyItem is one candidate key stored in a FuzzyTable.
type fuzzyItem struct {
	// original is the string that first created this key; it is what
	// lookups matching this item return.
	original string
	// score ranks the key: it goes up when a lookup matches the item and
	// down when a lookup scans past it without matching.
	score int64
}

// FuzzyTable keeps a scored list of previously seen strings and maps new
// strings onto the first stored key that is similar enough.
type FuzzyTable struct {
	// keys are the stored candidates, scanned in slice order on lookup.
	keys []fuzzyItem
	// matchDist is the similarity ratio a comparison must exceed to match.
	matchDist float32
	// maxOffset is passed through to the sift4 distance function.
	maxOffset int
	// maxSize is the length Cleanup truncates keys to.
	maxSize int
	// searches counts lookups that found no match since the last Cleanup.
	searches int
}

// NewFuzzyTable returns an empty table using matchDist as the match threshold,
// maxOffset as the offset handed to the distance function, and maxSize as the
// size limit. It panics if maxSize or maxOffset is negative (maxSize is
// checked first).
func NewFuzzyTable(matchDist float32, maxOffset, maxSize int) *FuzzyTable {
	switch {
	case maxSize < 0:
		panic("Invalid max size")
	case maxOffset < 0:
		panic("Invalid max offset")
	}

	tbl := new(FuzzyTable)
	tbl.keys = []fuzzyItem{}
	tbl.matchDist = matchDist
	tbl.maxOffset = maxOffset
	tbl.maxSize = maxSize
	return tbl
}

// GetMatchId maps val onto a stored key.
//
// Stored keys are scanned in order. The first key whose sift4 similarity
// ratio against val exceeds matchDist is returned with isNew == false, and its
// score is raised. Every key scanned before that without matching loses one
// point.
//
// If no key matches, val itself is returned with isNew == true. It is stored
// as a new key only when the table is below maxSize, or when the last stored
// key has a score below 1. Every tenth miss triggers Cleanup before that
// decision is made. A table created with maxSize 0 never stores anything.
func (s *FuzzyTable) GetMatchId(val string) (match string, isNew bool) {
	for i := range s.keys {
		ele := &s.keys[i]
		d := sift4.DistanceStringRatio(ele.original, val, s.maxOffset)
		if d > s.matchDist {
			if d < 0.99 { // Imperfect matches score more
				ele.score += int64(len(s.keys))
			} else {
				ele.score++
			}
			return ele.original, false
		}
		// Scanned but not matched: make this key less attractive.
		ele.score--
	}

	s.searches++
	if s.searches >= 10 {
		s.Cleanup()
		s.searches = 0
	}

	// Guard the tail lookup: with maxSize == 0 the table is empty and
	// indexing the last element would panic.
	if n := len(s.keys); n < s.maxSize || (n > 0 && s.keys[n-1].score < 1) {
		s.keys = append(s.keys, fuzzyItem{
			original: val,
			score:    1,
		})
	}

	return val, true
}

// Cleanup reorders the stored keys by descending score, so the best-scoring
// candidates are scanned first, and then drops everything past maxSize.
func (s *FuzzyTable) Cleanup() {
	higherScoreFirst := func(a, b int) bool {
		return s.keys[b].score < s.keys[a].score
	}
	sort.Slice(s.keys, higherScoreFirst)

	if limit := s.maxSize; limit < len(s.keys) {
		s.keys = s.keys[:limit]
	}
}

// Count reports how many keys are currently stored in the table.
func (s *FuzzyTable) Count() int {
	stored := len(s.keys)
	return stored
}
30 changes: 30 additions & 0 deletions pkg/fuzzy/fuzzyTable_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package fuzzy

import (
"fmt"
"testing"

"github.com/stretchr/testify/assert"
)

// TestSimpleTable checks the new/existing flag returned by GetMatchId for a
// first insert, an exact repeat, an unrelated string and a near match.
func TestSimpleTable(t *testing.T) {
	// maxSize must be non-negative; NewFuzzyTable panics otherwise.
	tbl := NewFuzzyTable(0.5, 5, 100)
	_, isNew := tbl.GetMatchId("test")
	assert.True(t, isNew)

	_, isNew = tbl.GetMatchId("test")
	assert.False(t, isNew)

	_, isNew = tbl.GetMatchId("blah")
	assert.True(t, isNew)

	_, isNew = tbl.GetMatchId("tast")
	assert.False(t, isNew)
}

// BenchmarkSimpleTable measures GetMatchId over a rotating set of 100 similar
// keys.
func BenchmarkSimpleTable(b *testing.B) {
	// maxSize must be non-negative; NewFuzzyTable panics otherwise.
	tbl := NewFuzzyTable(0.7, 5, 100)
	for n := 0; n < b.N; n++ {
		tbl.GetMatchId(fmt.Sprintf("abcd-%d", n%100))
	}
}
Original file line number Diff line number Diff line change
@@ -1,18 +1,5 @@
package levenshtein

// min3 returns the smallest of a, b and c.
func min3(a, b, c int) int {
	switch {
	case a < b && a < c:
		return a
	case a >= b && b < c:
		return b
	default:
		return c
	}
}

func Distance(a, b []rune) int {
alen := len(a)
blen := len(b)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,14 @@ func TestFullSimilar(t *testing.T) {
assert.Equal(t, float32(1.0), DistanceStringRatio("abc", "abc"))
}

func BenchmarkSimilarity(b *testing.B) {
func BenchmarkSimilarityHigh(b *testing.B) {
for i := 0; i < b.N; i++ {
DistanceStringRatio("abcdef", "qqqdef")
DistanceString("this is a very long string to test with", "this is a very short string to test with")
}
}

// BenchmarkSimilarityLow measures DistanceString on two long strings that
// share only a few words.
func BenchmarkSimilarityLow(b *testing.B) {
	const (
		left  = "this is a very long string to test with"
		right = "a completely different string with a few similar words"
	)
	for iter := 0; iter < b.N; iter++ {
		DistanceString(left, right)
	}
}
14 changes: 14 additions & 0 deletions pkg/fuzzy/levenshtein/util.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package levenshtein

// min3 returns the smallest of its three integer arguments.
func min3(a, b, c int) int {
	lowest := c
	if a < lowest {
		lowest = a
	}
	if b < lowest {
		lowest = b
	}
	return lowest
}

0 comments on commit 0cf16dd

Please sign in to comment.