/
collector.go
82 lines (71 loc) · 1.67 KB
/
collector.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
package wordcloud
import (
"github.com/go-ego/gse"
)
var (
// seg is the package-level gse segmenter used by NewWordCollectorWithStr
// for part-of-speech tagging and trimming.
// NOTE(review): no dictionary loading for seg is visible in this file —
// presumably it is initialized elsewhere in the package; confirm.
seg gse.Segmenter
)
// WordCollector wraps a slice of WordJson entries. Filter narrows the
// slice and returns the collector for chaining; ToSlice exposes it.
type WordCollector struct {
// wordSet is the current set of word entries held by the collector.
wordSet []WordJson
}
// NewWordCollectorWithStr builds a WordCollector from raw text.
//
// The text is run through the package-level segmenter: seg.Pos tags it
// (the second argument is true), seg.TrimPos is applied to the tagged
// tokens, and seg.TrimWithPos is called with the tags "x", "m" and "eng"
// (presumably dropping tokens carrying those tags; confirm against gse).
// The remaining tokens are counted by newWordSet and wrapped in a collector.
func NewWordCollectorWithStr(str string) *WordCollector {
	tagged := seg.TrimPos(seg.Pos(str, true))
	kept := seg.TrimWithPos(tagged, "x", "m", "eng")
	return NewWordCollectorWithSet(newWordSet(kept))
}
// NewWordCollectorWithSet wraps an existing slice of word entries in a
// WordCollector. The slice is stored as-is (not copied), so the collector
// shares its backing array with the caller's slice.
func NewWordCollectorWithSet(wordSet []WordJson) *WordCollector {
	collector := new(WordCollector)
	collector.wordSet = wordSet
	return collector
}
// accumulateWords merges entries that share the same Content into a single
// entry whose Count is the sum of the merged counts.
//
// The WordClass of a merged entry is taken from the first occurrence of
// that Content. Entries are returned in order of first occurrence, so the
// result is deterministic for a given input. nil elements in words are
// skipped. The input entries are never modified; every returned entry is a
// newly allocated WordJson carrying only Content, WordClass and Count.
// The result is always non-nil, even for empty input.
func accumulateWords(words []*WordJson) []*WordJson {
	merged := make([]*WordJson, 0, len(words))
	// position maps a Content value to its index in merged.
	position := make(map[string]int, len(words))
	for _, w := range words {
		if w == nil {
			continue
		}
		if i, seen := position[w.Content]; seen {
			merged[i].Count += w.Count
			continue
		}
		position[w.Content] = len(merged)
		merged = append(merged, &WordJson{
			Content:   w.Content,
			WordClass: w.WordClass,
			Count:     w.Count,
		})
	}
	return merged
}
// newWordSet counts how many times each distinct gse.SegPos value occurs in
// words and returns one WordJson per distinct value, with Content taken
// from Text, WordClass from Pos, and Count set to the number of occurrences.
//
// Two tokens are merged only when the whole SegPos value is equal, so the
// same text with different tags yields separate entries. Entries are
// returned in order of first occurrence, which keeps the result
// deterministic. The result is nil when words is empty.
func newWordSet(words []gse.SegPos) (wordSet []WordJson) {
	// position maps a token to its index in wordSet.
	position := make(map[gse.SegPos]int, len(words))
	for _, word := range words {
		if i, seen := position[word]; seen {
			wordSet[i].Count++
			continue
		}
		position[word] = len(wordSet)
		wordSet = append(wordSet, WordJson{
			Content:   word.Text,
			WordClass: word.Pos,
			Count:     1,
		})
	}
	return wordSet
}
// Filter keeps only the entries for which filter.IsLegal reports true,
// preserving their order, and returns the receiver so calls can be chained.
//
// The surviving entries are collected into a freshly allocated slice that
// replaces the collector's previous one; the previous slice is not modified.
// After the call the collector's slice is non-nil even when nothing is kept.
func (wc *WordCollector) Filter(filter Filter) *WordCollector {
	kept := make([]WordJson, 0)
	current := wc.wordSet
	for i := range current {
		entry := current[i]
		if !filter.IsLegal(entry) {
			continue
		}
		kept = append(kept, entry)
	}
	wc.wordSet = kept
	return wc
}
// ToSlice returns the collector's current word entries.
// The returned slice is the collector's own slice, not a copy, so writes to
// its elements are visible through the collector.
func (wc *WordCollector) ToSlice() []WordJson {
return wc.wordSet
}