Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add minimal perfect hash domain matcher #743

Merged
merged 9 commits into from
Mar 15, 2021
6 changes: 3 additions & 3 deletions app/router/condition.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ type DomainMatcher struct {
matchers strmatcher.IndexMatcher
}

func NewACAutomatonDomainMatcher(domains []*Domain) (*DomainMatcher, error) {
g := strmatcher.NewACAutomatonMatcherGroup()
func NewMphMatcherGroup(domains []*Domain) (*DomainMatcher, error) {
g := strmatcher.NewMphMatcherGroup()
for _, d := range domains {
matcherType, f := matcherTypeMap[d.Type]
if !f {
Expand Down Expand Up @@ -102,7 +102,7 @@ func NewDomainMatcher(domains []*Domain) (*DomainMatcher, error) {
}

func (m *DomainMatcher) ApplyDomain(domain string) bool {
return len(m.matchers.Match(domain)) > 0
return len(m.matchers.Match(strings.ToLower(domain))) > 0
}

// Apply implements Condition.
Expand Down
6 changes: 3 additions & 3 deletions app/router/condition_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ func TestChinaSites(t *testing.T) {

matcher, err := NewDomainMatcher(domains)
common.Must(err)
acMatcher, err := NewACAutomatonDomainMatcher(domains)
acMatcher, err := NewMphMatcherGroup(domains)
common.Must(err)

type TestCase struct {
Expand Down Expand Up @@ -399,11 +399,11 @@ func TestChinaSites(t *testing.T) {
}
}

func BenchmarkHybridDomainMatcher(b *testing.B) {
func BenchmarkMphDomainMatcher(b *testing.B) {
domains, err := loadGeoSite("CN")
common.Must(err)

matcher, err := NewACAutomatonDomainMatcher(domains)
matcher, err := NewMphMatcherGroup(domains)
common.Must(err)

type TestCase struct {
Expand Down
8 changes: 4 additions & 4 deletions app/router/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,12 @@ func (rr *RoutingRule) BuildCondition() (Condition, error) {

if len(rr.Domain) > 0 {
switch rr.DomainMatcher {
case "hybrid":
matcher, err := NewACAutomatonDomainMatcher(rr.Domain)
case "mph":
matcher, err := NewMphMatcherGroup(rr.Domain)
if err != nil {
return nil, newError("failed to build domain condition with ACAutomatonDomainMatcher").Base(err)
return nil, newError("failed to build domain condition with MphDomainMatcher").Base(err)
}
newError("ACAutomatonDomainMatcher is enabled for ", len(rr.Domain), "domain rules(s)").AtDebug().WriteToLog()
newError("MphDomainMatcher is enabled for ", len(rr.Domain), "domain rules(s)").AtDebug().WriteToLog()
conds.Add(matcher)
default:
matcher, err := NewDomainMatcher(rr.Domain)
Expand Down
297 changes: 297 additions & 0 deletions common/strmatcher/mph_matcher.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
package strmatcher

import (
"math/bits"
"regexp"
"sort"
"strings"
"unsafe"
)

// PrimeRK is the prime base used in Rabin-Karp algorithm.
const PrimeRK = 16777619

// calculate the rolling murmurHash of given string
func RollingHash(s string) uint32 {
h := uint32(0)
for i := len(s) - 1; i >= 0; i-- {
h = h*PrimeRK + uint32(s[i])
}
return h
}

// A MphMatcherGroup is divided into three parts:
// 1. `full` and `domain` patterns are matched by Rabin-Karp algorithm and minimal perfect hash table;
// 2. `substr` patterns are matched by ac automaton;
// 3. `regex` patterns are matched with the regex library.
type MphMatcherGroup struct {
ac *ACAutomaton
otherMatchers []matcherEntry
rules []string
level0 []uint32
level0Mask int
level1 []uint32
level1Mask int
count uint32
ruleMap *map[string]uint32
}

func (g *MphMatcherGroup) AddFullOrDomainPattern(pattern string, t Type) {
h := RollingHash(pattern)
switch t {
case Domain:
(*g.ruleMap)["."+pattern] = h*PrimeRK + uint32('.')
fallthrough
case Full:
(*g.ruleMap)[pattern] = h
default:
}
}

func NewMphMatcherGroup() *MphMatcherGroup {
return &MphMatcherGroup{
ac: nil,
otherMatchers: nil,
rules: nil,
level0: nil,
level0Mask: 0,
level1: nil,
level1Mask: 0,
count: 1,
ruleMap: &map[string]uint32{},
}
}

// AddPattern adds a pattern to MphMatcherGroup
func (g *MphMatcherGroup) AddPattern(pattern string, t Type) (uint32, error) {
switch t {
case Substr:
if g.ac == nil {
g.ac = NewACAutomaton()
}
g.ac.Add(pattern, t)
case Full, Domain:
pattern = strings.ToLower(pattern)
g.AddFullOrDomainPattern(pattern, t)
case Regex:
r, err := regexp.Compile(pattern)
if err != nil {
return 0, err
}
g.otherMatchers = append(g.otherMatchers, matcherEntry{
m: &regexMatcher{pattern: r},
id: g.count,
})
default:
panic("Unknown type")
}
return g.count, nil
}

// Build builds a minimal perfect hash table and ac automaton from insert rules
func (g *MphMatcherGroup) Build() {
if g.ac != nil {
g.ac.Build()
}
keyLen := len(*g.ruleMap)
g.level0 = make([]uint32, nextPow2(keyLen/4))
g.level0Mask = len(g.level0) - 1
g.level1 = make([]uint32, nextPow2(keyLen))
g.level1Mask = len(g.level1) - 1
var sparseBuckets = make([][]int, len(g.level0))
var ruleIdx int
for rule, hash := range *g.ruleMap {
n := int(hash) & g.level0Mask
g.rules = append(g.rules, rule)
sparseBuckets[n] = append(sparseBuckets[n], ruleIdx)
ruleIdx++
}
g.ruleMap = nil
var buckets []indexBucket
for n, vals := range sparseBuckets {
if len(vals) > 0 {
buckets = append(buckets, indexBucket{n, vals})
}
}
sort.Sort(bySize(buckets))

occ := make([]bool, len(g.level1))
var tmpOcc []int
for _, bucket := range buckets {
var seed = uint32(0)
for {
findSeed := true
tmpOcc = tmpOcc[:0]
for _, i := range bucket.vals {
n := int(strhashFallback(unsafe.Pointer(&g.rules[i]), uintptr(seed))) & g.level1Mask
if occ[n] {
for _, n := range tmpOcc {
occ[n] = false
}
seed++
findSeed = false
break
}
occ[n] = true
tmpOcc = append(tmpOcc, n)
g.level1[n] = uint32(i)
}
if findSeed {
g.level0[bucket.n] = seed
break
}
}
}
}

func nextPow2(v int) int {
if v <= 1 {
return 1
}
const MaxUInt = ^uint(0)
n := (MaxUInt >> bits.LeadingZeros(uint(v))) + 1
return int(n)
}

// Lookup searches for s in t and returns its index and whether it was found.
func (g *MphMatcherGroup) Lookup(h uint32, s string) bool {
i0 := int(h) & g.level0Mask
seed := g.level0[i0]
i1 := int(strhashFallback(unsafe.Pointer(&s), uintptr(seed))) & g.level1Mask
n := g.level1[i1]
return s == g.rules[int(n)]
}

// Match implements IndexMatcher.Match.
func (g *MphMatcherGroup) Match(pattern string) []uint32 {
result := []uint32{}
hash := uint32(0)
for i := len(pattern) - 1; i >= 0; i-- {
hash = hash*PrimeRK + uint32(pattern[i])
if pattern[i] == '.' {
if g.Lookup(hash, pattern[i:]) {
result = append(result, 1)
return result
}
}
}
if g.Lookup(hash, pattern) {
result = append(result, 1)
return result
}
if g.ac != nil && g.ac.Match(pattern) {
result = append(result, 1)
return result
}
for _, e := range g.otherMatchers {
if e.m.Match(pattern) {
result = append(result, e.id)
return result
}
}
return nil
}

type indexBucket struct {
n int
vals []int
}

type bySize []indexBucket

func (s bySize) Len() int { return len(s) }
func (s bySize) Less(i, j int) bool { return len(s[i].vals) > len(s[j].vals) }
func (s bySize) Swap(i, j int) { s[i], s[j] = s[j], s[i] }

type stringStruct struct {
str unsafe.Pointer
len int
}

func strhashFallback(a unsafe.Pointer, h uintptr) uintptr {
x := (*stringStruct)(a)
return memhashFallback(x.str, h, uintptr(x.len))
}

const (
// Constants for multiplication: four random odd 64-bit numbers.
m1 = 16877499708836156737
m2 = 2820277070424839065
m3 = 9497967016996688599
m4 = 15839092249703872147
)

var hashkey = [4]uintptr{1, 1, 1, 1}

func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
h := uint64(seed + s*hashkey[0])
tail:
switch {
case s == 0:
case s < 4:
h ^= uint64(*(*byte)(p))
h ^= uint64(*(*byte)(add(p, s>>1))) << 8
h ^= uint64(*(*byte)(add(p, s-1))) << 16
h = rotl31(h*m1) * m2
case s <= 8:
h ^= uint64(readUnaligned32(p))
h ^= uint64(readUnaligned32(add(p, s-4))) << 32
h = rotl31(h*m1) * m2
case s <= 16:
h ^= readUnaligned64(p)
h = rotl31(h*m1) * m2
h ^= readUnaligned64(add(p, s-8))
h = rotl31(h*m1) * m2
case s <= 32:
h ^= readUnaligned64(p)
h = rotl31(h*m1) * m2
h ^= readUnaligned64(add(p, 8))
h = rotl31(h*m1) * m2
h ^= readUnaligned64(add(p, s-16))
h = rotl31(h*m1) * m2
h ^= readUnaligned64(add(p, s-8))
h = rotl31(h*m1) * m2
default:
v1 := h
v2 := uint64(seed * hashkey[1])
v3 := uint64(seed * hashkey[2])
v4 := uint64(seed * hashkey[3])
for s >= 32 {
v1 ^= readUnaligned64(p)
v1 = rotl31(v1*m1) * m2
p = add(p, 8)
v2 ^= readUnaligned64(p)
v2 = rotl31(v2*m2) * m3
p = add(p, 8)
v3 ^= readUnaligned64(p)
v3 = rotl31(v3*m3) * m4
p = add(p, 8)
v4 ^= readUnaligned64(p)
v4 = rotl31(v4*m4) * m1
p = add(p, 8)
s -= 32
}
h = v1 ^ v2 ^ v3 ^ v4
goto tail
}

h ^= h >> 29
h *= m3
h ^= h >> 32
return uintptr(h)
}
func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
return unsafe.Pointer(uintptr(p) + x)
}
func readUnaligned32(p unsafe.Pointer) uint32 {
q := (*[4]byte)(p)
return uint32(q[0]) | uint32(q[1])<<8 | uint32(q[2])<<16 | uint32(q[3])<<24
}

func rotl31(x uint64) uint64 {
return (x << 31) | (x >> (64 - 31))
}
func readUnaligned64(p unsafe.Pointer) uint64 {
q := (*[8]byte)(p)
return uint64(q[0]) | uint64(q[1])<<8 | uint64(q[2])<<16 | uint64(q[3])<<24 | uint64(q[4])<<32 | uint64(q[5])<<40 | uint64(q[6])<<48 | uint64(q[7])<<56
}
Loading