forked from umputun/remark42
/
restricted_words.go
176 lines (143 loc) · 4.25 KB
/
restricted_words.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
package service
import (
"strings"
"unicode"
"unicode/utf8"
log "github.com/go-pkgz/lgr"
)
// RestrictedWordsLister provides restricted words in comments per site.
// List returns the restricted words configured for siteID, or a non-nil error
// when the list cannot be obtained.
type RestrictedWordsLister interface {
List(siteID string) (restricted []string, err error)
}
// StaticRestrictedWordsLister is a lister backed by one fixed word list that is
// shared by every site.
type StaticRestrictedWordsLister struct {
	Words []string
}

// List returns the configured Words unchanged. The siteID argument is ignored
// and the returned error is always nil.
func (l StaticRestrictedWordsLister) List(siteID string) (restricted []string, err error) {
	restricted = l.Words
	return restricted, err
}
// RestrictedWordsMatcher checks comment text for restricted words. The words
// are obtained from the lister on every Match call.
type RestrictedWordsMatcher struct {
	lister RestrictedWordsLister
}

// NewRestrictedWordsMatcher returns a new matcher that reads its restricted
// words from the given lister.
func NewRestrictedWordsMatcher(lister RestrictedWordsLister) *RestrictedWordsMatcher {
	matcher := new(RestrictedWordsMatcher)
	matcher.lister = lister
	return matcher
}
// Match reports whether any token of text matches one of the restricted words
// listed for siteID. It returns false when the word list is empty, and also
// when the lister fails (the failure is logged as a warning).
func (m *RestrictedWordsMatcher) Match(siteID string, text string) bool {
	restrictedWords, err := m.lister.List(siteID)
	switch {
	case err != nil:
		log.Printf("[WARN] failed to get restricted patterns for site %s: %v", siteID, err)
		return false
	case len(restrictedWords) == 0:
		return false
	}

	words := m.tokenize(text)
	patterns := newWildcardTrie(restrictedWords...)
	for i := range words {
		if patterns.check(words[i]) {
			return true
		}
	}
	return false
}
// tokenize splits text into lowercased words. A word is a maximal run of runes
// that are letters or numbers; every other rune acts as a separator and is
// dropped. The result is empty when text contains no such runs.
func (m *RestrictedWordsMatcher) tokenize(text string) []string {
	isSeparator := func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	}

	// FieldsFunc yields exactly the maximal runs of non-separator runes
	words := strings.FieldsFunc(text, isSeparator)
	for i, w := range words {
		words[i] = strings.ToLower(w)
	}
	return words
}
type wildcardTrie struct {
terminal bool
children map[rune]*wildcardTrie
}
func newWildcardTrie(patterns ...string) *wildcardTrie {
trie := &wildcardTrie{terminal: false, children: make(map[rune]*wildcardTrie)}
for _, p := range patterns {
trie.addPattern(p)
}
return trie
}
// addPattern inserts pattern into the trie. The pattern is trimmed of
// surrounding whitespace and lowercased before insertion. Patterns whose
// trimmed length is outside 1..64 runes are rejected with a warning and leave
// the trie untouched. Runs of consecutive '*' are stored as a single '*',
// because one wildcard already stands for any number of runes.
func (trie *wildcardTrie) addPattern(pattern string) {
	// validate the text that is actually inserted: checking the untrimmed input
	// would let a whitespace-only pattern through (marking this node terminal)
	// and would wrongly reject a padded pattern of the maximum length
	trimmed := strings.TrimSpace(pattern)

	// since pattern matching algorithm is recursive we do not allow long patterns
	if n := utf8.RuneCountInString(trimmed); n < 1 || n > 64 {
		log.Printf("[WARN] invalid pattern length '%s': actual - %d, min allowed - 1, max allowed - 64", trimmed, n)
		return
	}

	node := trie
	prevWildcard := false
	for _, r := range strings.ToLower(trimmed) {
		isWildcard := r == '*'
		if isWildcard && prevWildcard {
			// "**" means the same as "*"; keeping one node lets a trailing
			// wildcard run match an empty remainder and avoids extra recursion
			continue
		}
		prevWildcard = isWildcard

		childNode, exists := node.children[r]
		if !exists {
			childNode = newWildcardTrie()
			node.children[r] = childNode
		}
		node = childNode
	}
	node.terminal = true
}
// check reports whether any pattern stored under this node matches token.
// A '*' child acts as a wildcard: when it ends a pattern it matches whatever
// is left of token (including nothing); otherwise it consumes a prefix of the
// remaining token and matching continues from the wildcard node. The method is
// recursive, with depth bounded by the longest stored pattern.
func (trie *wildcardTrie) check(token string) bool {
	star, hasStar := trie.children['*']

	if token == "" {
		// nothing left: a pattern must end here or with a trailing wildcard
		return trie.terminal || (hasStar && star.terminal)
	}

	// literal step: follow the first rune of token
	first, size := utf8.DecodeRuneInString(token)
	if next, ok := trie.children[first]; ok && next.check(token[size:]) {
		return true
	}

	// wildcard step
	if !hasStar {
		return false
	}
	return star.terminal || star.checkAllSuffixes(token)
}
// checkAllSuffixes reports whether check succeeds for token itself or for any
// shorter non-empty suffix of it, trying them from longest to shortest. The
// suffixes are obtained by dropping one leading rune at a time; the empty
// suffix is never tried.
func (trie *wildcardTrie) checkAllSuffixes(token string) bool {
	for rest := token; rest != ""; {
		if trie.check(rest) {
			return true
		}
		_, size := utf8.DecodeRuneInString(rest)
		rest = rest[size:]
	}
	return false
}