zincsearch · prabhatsharma · Mar 8, 2022 · Mar 5, 2022 · Mar 6, 2022 · Mar 6, 2022
diff --git a/go.mod b/go.mod
@@ -11,6 +11,7 @@ require (
 	github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 // indirect
 	github.com/gin-contrib/cors v1.3.1
 	github.com/gin-gonic/gin v1.7.4
+	github.com/go-ego/gse v0.70.0
 	github.com/google/uuid v1.3.0
 	github.com/jeremywohl/flatten v1.0.1
 	github.com/joho/godotenv v1.4.0

diff --git a/go.sum b/go.sum
@@ -142,6 +142,8 @@ github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm
 github.com/gin-gonic/gin v1.5.0/go.mod h1:Nd6IXA8m5kNZdNEHMBd93KT+mdY3+bewLgRvmCsR2Do=
 github.com/gin-gonic/gin v1.7.4 h1:QmUZXrvJ9qZ3GfWvQ+2wnW/1ePrTEJqPKMYEU3lD/DM=
 github.com/gin-gonic/gin v1.7.4/go.mod h1:jD2toBW3GZUr5UMcdrwQA10I7RuaFOl/SGeDjXkfUtY=
+github.com/go-ego/gse v0.70.0 h1:K9M+clnPc1sjnSGvLgSHAo39oWr0v3deXB/31K0n18w=
+github.com/go-ego/gse v0.70.0/go.mod h1:M9Xv8cEW7Of27BbE4p0iI3arqQHCYcm5N16/2b3pPPk=
 github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
 github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
 github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
@@ -374,6 +376,10 @@ github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVM
 github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
 github.com/ugorji/go/codec v1.1.7 h1:2SvQaVZ1ouYrrKKwoSk2pzd4A9evlKJb9oTL+OaLUSs=
 github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY=
+github.com/vcaesar/cedar v0.20.0 h1:VtBy/twzVjXiTo1Ij3fQRyDQRzvzDa9sKacpbwSJyps=
+github.com/vcaesar/cedar v0.20.0/go.mod h1:iMDweyuW76RvSrCkQeZeQk4iCbshiPzcCvcGCtpM7iI=
+github.com/vcaesar/tt v0.20.0 h1:9t2Ycb9RNHcP0WgQgIaRKJBB+FrRdejuaL6uWIHuoBA=
+github.com/vcaesar/tt v0.20.0/go.mod h1:GHPxQYhn+7OgKakRusH7KJ0M5MhywoeLb8Fcffs/Gtg=
 github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
 github.com/xtgo/uuid v0.0.0-20140804021211-a0b114877d4c h1:3lbZUMbMiGUW/LMkfsEABsc5zNT9+b1CvsJx47JzJ8g=
 github.com/xtgo/uuid v0.0.0-20140804021211-a0b114877d4c/go.mod h1:UrdRz5enIKZ63MEE3IF9l2/ebyx59GyGgPi+tICQdmM=

diff --git a/pkg/bluge/analysis/lang/chs/README.md b/pkg/bluge/analysis/lang/chs/README.md
@@ -0,0 +1,143 @@
+# zinc-analysis-gse
+
+This is a zinc plugin that adds Chinese analyzer support.
+
+Analyzer: `gse_standard` , `gse_search`
+
+Tokenizer: `gse_standard` , `gse_search`
+
+TokenFilter: `gse_stop`
+
+> The build embeds the dictionaries `zh/s_1.txt` and `zh/stop_tokens.txt`.
+
+You can find them at: https://github.com/go-ego/gse/tree/master/data/dict
+
+> You can also add your own dictionary by following [custom user dictionary](#custom-user-dictionary).
+
+After customizing the dictionary, you need to restart zinc.
+
+## gse
+
+https://github.com/go-ego/gse
+
+Efficient multilingual NLP and text segmentation in Go; supports English, Chinese, Japanese and other languages.
+
+## Environment
+
+You need to set the following environment variables to enable gse support:
+
+`ZINC_PLUGIN_GSE_ENABLE` true or false, default is `false`
+
+`ZINC_PLUGIN_GSE_DICT_EMBED` small or big, default is `small`; selects which size of embedded dictionary is loaded when `gse` is enabled.
+
+`ZINC_PLUGIN_GSE_DICT_PATH` custom dictionary path, default is `./plugins/gse/dict`
+
+
+## API example
+
+POST http://localhost:4080/es/_analyze
+
+```
+{
+  "analyzer": "gse_standard",
+  "text": "《复仇者联盟3：无限战争》是全片使用IMAX摄影机拍摄制作的的科幻片."
+}
+```
+
+POST http://localhost:4080/es/_analyze
+
+```
+{
+  "analyzer": "gse_search",
+  "text": "《复仇者联盟3：无限战争》是全片使用IMAX摄影机拍摄制作的的科幻片."
+}
+```
+
+PUT http://localhost:4080/api/index
+
+```
+{
+	"name": "my-index-chs",
+		"mappings": {
+			"properties": {
+				"title": {
+					"type": "text",
+					"index": true,
+					"highlightable": true,
+					"analyzer": "gse_search",
+					"search_analyzer": "gse_standard"
+				},
+				"author": {
+					"type": "keyword",
+					"index": true,
+					"store": false
+				},
+				"create_time": {
+					"type":"time"
+				}
+			}
+		}
+}
+```
+
+POST http://localhost:4080/api/my-index-chs/document
+
+```
+{
+	"title": "《复仇者联盟3：无限战争》是全片使用IMAX摄影机拍摄制作的科幻片",
+	"author": "灭霸",
+	"create_time": "2022-03-05T18:18:18+08:00"
+}
+```
+
+POST http://localhost:4080/es/my-index-chs/_search
+
+```
+{
+	"query": {
+		"match": {
+			"title": "复仇者联盟"
+		}
+	}
+}
+```
+
+## custom user dictionary
+
+Append your words to the file `${ZINC_PLUGIN_GSE_DICT_PATH}/user.txt`
+
+format:
+
+```
+分词文本  频率        词性
+word    frequency   property
+```
+
+like:
+
+```
+复仇者联盟 100 n
+```
+
+## custom stop tokens
+
+Append your stop words to the file `${ZINC_PLUGIN_GSE_DICT_PATH}/stop.txt`
+
+format:
+
+```
+停止词
+word
+```
+
+like:
+
+```
+哈哈
+```
+
+## Credit
+
+* https://github.com/prabhatsharma/zinc
+* https://github.com/blugelabs/bluge
+* https://github.com/go-ego/gse
diff --git a/pkg/bluge/analysis/lang/chs/analyzer/search.go b/pkg/bluge/analysis/lang/chs/analyzer/search.go
@@ -0,0 +1,16 @@
+package analyzer
+
+import (
+	"github.com/blugelabs/bluge/analysis"
+	"github.com/go-ego/gse"
+
+	"github.com/prabhatsharma/zinc/pkg/bluge/analysis/lang/chs/token"
+	"github.com/prabhatsharma/zinc/pkg/bluge/analysis/lang/chs/tokenizer"
+)
+
+// NewSearchAnalyzer returns an analyzer that combines the search tokenizer
+// with the stop-token filter, both constructed from seg.
+func NewSearchAnalyzer(seg *gse.Segmenter) *analysis.Analyzer {
+	searchTokenizer := tokenizer.NewSearchTokenizer(seg)
+	stopFilter := token.NewStopTokenFilter(seg, nil)
+
+	a := new(analysis.Analyzer)
+	a.Tokenizer = searchTokenizer
+	a.TokenFilters = []analysis.TokenFilter{stopFilter}
+	return a
+}
diff --git a/pkg/bluge/analysis/lang/chs/analyzer/standard.go b/pkg/bluge/analysis/lang/chs/analyzer/standard.go
@@ -0,0 +1,16 @@
+package analyzer
+
+import (
+	"github.com/blugelabs/bluge/analysis"
+	"github.com/go-ego/gse"
+
+	"github.com/prabhatsharma/zinc/pkg/bluge/analysis/lang/chs/token"
+	"github.com/prabhatsharma/zinc/pkg/bluge/analysis/lang/chs/tokenizer"
+)
+
+// NewStandardAnalyzer returns an analyzer that combines the standard
+// tokenizer with the stop-token filter, both constructed from seg.
+func NewStandardAnalyzer(seg *gse.Segmenter) *analysis.Analyzer {
+	standardTokenizer := tokenizer.NewStandardTokenizer(seg)
+	stopFilter := token.NewStopTokenFilter(seg, nil)
+
+	a := new(analysis.Analyzer)
+	a.Tokenizer = standardTokenizer
+	a.TokenFilters = []analysis.TokenFilter{stopFilter}
+	return a
+}
diff --git a/pkg/bluge/analysis/lang/chs/gse.go b/pkg/bluge/analysis/lang/chs/gse.go
@@ -0,0 +1,65 @@
+package chs
+
+import (
+	"log"
+	"path/filepath"
+	"strings"
+
+	"github.com/blugelabs/bluge/analysis"
+	"github.com/go-ego/gse"
+
+	"github.com/prabhatsharma/zinc/pkg/bluge/analysis/lang/chs/analyzer"
+	"github.com/prabhatsharma/zinc/pkg/bluge/analysis/lang/chs/token"
+	"github.com/prabhatsharma/zinc/pkg/bluge/analysis/lang/chs/tokenizer"
+	"github.com/prabhatsharma/zinc/pkg/zutils"
+)
+
+// NewGseStandardAnalyzer returns the standard analyzer built on the
+// package-level segmenter seg.
+func NewGseStandardAnalyzer() *analysis.Analyzer { return analyzer.NewStandardAnalyzer(seg) }
+
+// NewGseSearchAnalyzer returns the search analyzer built on the
+// package-level segmenter seg.
+func NewGseSearchAnalyzer() *analysis.Analyzer { return analyzer.NewSearchAnalyzer(seg) }
+
+// NewGseStandardTokenizer returns the standard tokenizer built on the
+// package-level segmenter seg.
+func NewGseStandardTokenizer() analysis.Tokenizer { return tokenizer.NewStandardTokenizer(seg) }
+
+// NewGseSearchTokenizer returns the search tokenizer built on the
+// package-level segmenter seg.
+func NewGseSearchTokenizer() analysis.Tokenizer { return tokenizer.NewSearchTokenizer(seg) }
+
+// NewGseStopTokenFilter returns the stop-token filter built on the
+// package-level segmenter seg.
+func NewGseStopTokenFilter() analysis.TokenFilter { return token.NewStopTokenFilter(seg, nil) }
+
+var seg *gse.Segmenter
+
+// init builds the package-level segmenter seg from environment settings.
+//
+// ZINC_PLUGIN_GSE_ENABLE (default "FALSE", compared case-insensitively with
+// "TRUE") decides whether a Chinese dictionary is loaded, and
+// ZINC_PLUGIN_GSE_DICT_EMBED (default "SMALL") picks between the "zh_s"
+// dictionary embedded in gse ("BIG") and the _dictCHS dictionary (any other
+// value). When the plugin is disabled only the one-word dictionary "zinc" and
+// the _dictStop stop tokens are loaded. Afterwards user.txt and stop.txt from
+// ZINC_PLUGIN_GSE_DICT_PATH (default "./plugins/gse/dict") are loaded if they
+// exist.
+//
+// Loading is best-effort: a failed load is logged and init carries on, so a
+// bad dictionary never prevents the process from starting.
+func init() {
+	// warn reports a failed load without aborting initialization.
+	warn := func(what string, err error) {
+		if err != nil {
+			log.Printf("gse: loading %s: %v", what, err)
+		}
+	}
+
+	seg = new(gse.Segmenter)
+	// Accepted values: false / true.
+	enabled := strings.EqualFold(zutils.GetEnv("ZINC_PLUGIN_GSE_ENABLE", "FALSE"), "TRUE")
+	// Accepted values: small / big.
+	big := strings.EqualFold(zutils.GetEnv("ZINC_PLUGIN_GSE_DICT_EMBED", "SMALL"), "BIG")
+
+	switch {
+	case !enabled:
+		// Plugin disabled: load only a one-word dictionary and the stop tokens.
+		warn("placeholder dictionary", seg.LoadDictStr(`zinc`))
+		warn("built-in stop tokens", seg.LoadStopStr(_dictStop))
+	case big:
+		warn(`embedded dictionary "zh_s"`, seg.LoadDictEmbed("zh_s"))
+		warn("embedded stop tokens", seg.LoadStopEmbed())
+	default:
+		warn("built-in dictionary", seg.LoadDictStr(_dictCHS))
+		warn("built-in stop tokens", seg.LoadStopStr(_dictStop))
+	}
+	// NOTE(review): these flags are set before the user files are loaded;
+	// presumably Load marks the segmenter as initialized so the files below
+	// extend the dictionaries instead of resetting them — confirm against gse.
+	seg.Load = true
+	seg.SkipLog = true
+
+	// Load the optional user-supplied dictionary and stop tokens. The error
+	// from IsExist is discarded; only its boolean result decides whether a
+	// file is loaded.
+	dataPath := zutils.GetEnv("ZINC_PLUGIN_GSE_DICT_PATH", "./plugins/gse/dict")
+	userDict := filepath.Join(dataPath, "user.txt")
+	if ok, _ := zutils.IsExist(userDict); ok {
+		warn("user dictionary "+userDict, seg.LoadDict(userDict))
+	}
+	stopDict := filepath.Join(dataPath, "stop.txt")
+	if ok, _ := zutils.IsExist(stopDict); ok {
+		warn("user stop tokens "+stopDict, seg.LoadStop(stopDict))
+	}
+}