
Commit

use Tokenize instead of Cut and SetCutForSearchThreshold(3) in gojieba/bleve
yanyiwu committed Apr 30, 2016
1 parent e94bce2 commit 045ae5c
Showing 3 changed files with 20 additions and 13 deletions.
1 change: 1 addition & 0 deletions ChangeLog.md
@@ -5,6 +5,7 @@
 + upgrade cppjieba -> v4.7.0
 + add new api: Tokenize for location information of words
 + add new api: SetCutForSearchThreshold
++ use Tokenize instead of Cut and SetCutForSearchThreshold(3) in gojieba/bleve

 ## v0.13.0

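The two new APIs listed above are what the gojieba/bleve tokenizer change below builds on. As a rough illustration, a minimal sketch of calling them directly, using only the method names and the Word fields (Str, Start, End) that appear in this commit; the dictionary file names are placeholders:

    package main

    import (
        "fmt"

        "github.com/yanyiwu/gojieba"
    )

    func main() {
        // Placeholder dictionary paths; point these at your jieba data files.
        x := gojieba.NewJieba("jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8")
        defer x.Free()

        // New API: tune search-mode cutting; this commit uses 3 for bleve.
        x.SetCutForSearchThreshold(3)

        // New API: Tokenize reports each word together with its offsets in the
        // input, which plain Cut does not.
        for _, w := range x.Tokenize("你好世界", gojieba.SearchMode, false) {
            fmt.Printf("%s [%d, %d)\n", w.Str, w.Start, w.End)
        }
    }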
10 changes: 8 additions & 2 deletions bleve/bleve_test.go
@@ -31,6 +31,10 @@ func Example() {
             Id:   "4",
             Body: "交代",
         },
+        {
+            Id:   "5",
+            Body: "长江大桥",
+        },
     }

     indexMapping := bleve.NewIndexMapping()
@@ -73,6 +77,7 @@ func Example() {
     querys := []string{
         "你好世界",
         "亲口交代",
+        "长江",
     }

     for _, q := range querys {
@@ -86,8 +91,9 @@
     }

     // Output:
-    // [{"id":"2","score":0.4232867878957415},{"id":"1","score":0.4232867878957415}]
-    // [{"id":"4","score":0.4232867878957415},{"id":"3","score":0.4232867878957415}]
+    // [{"id":"2","score":0.47907267476955906},{"id":"1","score":0.47907267476955906}]
+    // [{"id":"4","score":0.47907267476955906},{"id":"3","score":0.47907267476955906}]
+    // [{"id":"5","score":0.9581453659370776}]
 }

 func prettify(res *bleve.SearchResult) string {
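The new test expectations follow from search-mode tokenization: the indexed body "长江大桥" is split into overlapping terms that include "长江", so the new query matches document 5, and the other expected scores change with the altered index statistics. A hedged way to check that overlap with the tokenizer from this commit (the import path and package name are assumptions, and the dictionary file names are placeholders):

    package main

    import (
        "fmt"

        jiebableve "github.com/yanyiwu/gojieba/bleve" // assumed import path for the package changed below
    )

    func main() {
        // Placeholder dictionary paths.
        t := jiebableve.NewJiebaTokenizer("jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8")
        defer t.Free()

        // Both the indexed body and the query should emit the shared term "长江".
        for _, in := range []string{"长江大桥", "长江"} {
            fmt.Println(in)
            for _, tok := range t.Tokenize([]byte(in)) {
                fmt.Printf("  %s [%d, %d) pos=%d\n", tok.Term, tok.Start, tok.End, tok.Position)
            }
        }
    }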
22 changes: 11 additions & 11 deletions bleve/tokenizer.go
@@ -8,14 +8,18 @@ import (
     "github.com/yanyiwu/gojieba"
 )

+const (
+    CUT_FOR_SEARCH_THRESHOLD = 3
+)
+
 type JiebaTokenizer struct {
     handle *gojieba.Jieba
 }

 func NewJiebaTokenizer(dictpath, hmmpath, userdictpath string) *JiebaTokenizer {
-    return &JiebaTokenizer{
-        gojieba.NewJieba(dictpath, hmmpath, userdictpath),
-    }
+    x := gojieba.NewJieba(dictpath, hmmpath, userdictpath)
+    x.SetCutForSearchThreshold(CUT_FOR_SEARCH_THRESHOLD)
+    return &JiebaTokenizer{x}
 }

 func (x *JiebaTokenizer) Free() {
@@ -24,22 +28,18 @@ func (x *JiebaTokenizer) Free() {

 func (x *JiebaTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
     result := make(analysis.TokenStream, 0)
-    start := 0
-    end := 0
     pos := 1
-    words := x.handle.Cut(string(sentence), false)
+    words := x.handle.Tokenize(string(sentence), gojieba.SearchMode, false)
     for _, word := range words {
-        end = start + len(word)
         token := analysis.Token{
-            Term:     []byte(word),
-            Start:    start,
-            End:      end,
+            Term:     []byte(word.Str),
+            Start:    word.Start,
+            End:      word.End,
             Position: pos,
             Type:     analysis.Ideographic,
         }
         result = append(result, &token)
         pos++
-        start = end
     }
     return result
 }
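The substance of the tokenizer change: Cut returns only word strings, so the old loop reconstructed Start and End by summing byte lengths and could only produce back-to-back, non-overlapping offsets; Tokenize reports each word's own offsets, and in SearchMode it can also emit the overlapping sub-words that make search-style indexing useful. A hedged comparison using only calls that appear in this commit (dictionary file names are placeholders; the exact split depends on the dictionaries in use):

    package main

    import (
        "fmt"
        "strings"

        "github.com/yanyiwu/gojieba"
    )

    func main() {
        // Placeholder dictionary paths.
        x := gojieba.NewJieba("jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8")
        defer x.Free()
        x.SetCutForSearchThreshold(3) // same threshold the commit sets for bleve

        s := "长江大桥"

        // Old approach: Cut yields plain strings; offsets had to be rebuilt by hand.
        fmt.Println(strings.Join(x.Cut(s, false), " / "))

        // New approach: Tokenize in SearchMode yields offsets and, dictionary
        // permitting, overlapping sub-words such as "长江" and "大桥".
        for _, w := range x.Tokenize(s, gojieba.SearchMode, false) {
            fmt.Printf("%s [%d, %d)\n", w.Str, w.Start, w.End)
        }
    }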
