
add tokenizer that switches tokenizers per language

suzuken committed Feb 13, 2016
1 parent e26e0c1 commit b5a875e692e5aac4875fd0a9a55911f95e897ba4
Showing with 72 additions and 1 deletion.
  1. +0 −1 stopwords_test.go
  2. +47 −0 tokenizer.go
  3. +25 −0 tokenizer_test.go
stopwords_test.go
@@ -7,7 +7,6 @@ import (
 func TestNewStopwords(t *testing.T) {
     sw := NewStopwords()
     st := sw.stopWordsCount("ja", "日本語(にほんご、にっぽんご)は、主に日本国内や日本人同士の間で使われている言語である。")
-    t.Logf("%v", st)
     if st.wordCount == 0 {
         t.Fatalf("%v", st)
     }
tokenizer.go
@@ -0,0 +1,47 @@
+package goose
+
+import (
+    "golang.org/x/text/language"
+    "strings"
+
+    kagomeTokenizer "github.com/ikawaha/kagome/tokenizer"
+)
+
+// Tokenizer tokenizes a given string into tokens.
+type Tokenizer interface {
+    Tokenize(string) []string
+}
+
+// MultilangTokenizer switches tokenizers based on the given language
+// setting, so each document is tokenized according to its language.
+type MultilangTokenizer struct {
+    lang language.Tag
+}
+
+// NewMultilangTokenizer creates a MultilangTokenizer for the given language.
+func NewMultilangTokenizer(lang language.Tag) *MultilangTokenizer {
+    return &MultilangTokenizer{
+        lang: lang,
+    }
+}
+
+// Tokenize splits the given string and returns its tokens.
+func (m *MultilangTokenizer) Tokenize(s string) []string {
+    switch m.lang {
+    case language.Japanese:
+        t := kagomeTokenizer.New()
+        tokens := t.Tokenize(s)
+        // tokens contains BOS and EOS markers as tokens,
+        // so the capacity is calculated excluding them.
+        ret := make([]string, 0, len(tokens)-2)
+        for _, token := range tokens {
+            if token.Class != kagomeTokenizer.DUMMY {
+                ret = append(ret, token.Surface)
+            }
+        }
+        return ret
+    default:
+        // space-separated language
+        return strings.Split(s, " ")
+    }
+}
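
One note on the default branch: strings.Split(s, " ") yields empty tokens when words are separated by runs of spaces or by tabs. Since any type satisfying the Tokenizer interface can be plugged in, a whitespace-robust variant is easy to sketch (FieldsTokenizer is a hypothetical name, not part of this commit):

    package goose

    import "strings"

    // FieldsTokenizer is a hypothetical Tokenizer that splits on any run
    // of Unicode whitespace, so "a  b" yields ["a", "b"] rather than the
    // ["a", "", "b"] that strings.Split(s, " ") would produce.
    type FieldsTokenizer struct{}

    // Tokenize satisfies the Tokenizer interface.
    func (FieldsTokenizer) Tokenize(s string) []string {
        return strings.Fields(s)
    }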
tokenizer_test.go
@@ -0,0 +1,25 @@
+package goose
+
+import (
+    "golang.org/x/text/language"
+    "reflect"
+    "testing"
+)
+
+func TestTokenizer(t *testing.T) {
+    english := language.Make("en")
+    tokenizer := NewMultilangTokenizer(english)
+    tokens := tokenizer.Tokenize("Language and Locale Matching in Go")
+    if !reflect.DeepEqual(tokens, []string{"Language", "and", "Locale", "Matching", "in", "Go"}) {
+        t.Fatalf("cannot tokenize english string. tokens %v", tokens)
+    }
+}
+
+func TestJapaneseTokenizer(t *testing.T) {
+    japanese := language.Make("ja")
+    tokenizer := NewMultilangTokenizer(japanese)
+    tokens := tokenizer.Tokenize("すもももももももものうち")
+    if !reflect.DeepEqual(tokens, []string{"すもも", "も", "もも", "も", "もも", "も", "うち"}) {
+        t.Fatalf("cannot tokenize japanese string. tokens %v", tokens)
+    }
+}
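
These tests rely on Tokenize comparing language.Tag values directly: language.Make canonicalizes its input, so language.Make("ja") should compare equal to the predefined language.Japanese. A quick standalone check, assuming golang.org/x/text/language behaves as documented:

    package main

    import (
        "fmt"

        "golang.org/x/text/language"
    )

    func main() {
        // language.Make canonicalizes "ja" to the same Tag value as the
        // predefined language.Japanese, so the value switch in
        // MultilangTokenizer.Tokenize takes the Japanese branch.
        fmt.Println(language.Make("ja") == language.Japanese) // expected: true
    }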
