Skip to content

Commit

Permalink
add new api: Tokenize for location information of words
Browse files Browse the repository at this point in the history
  • Loading branch information
yanyiwu committed Apr 21, 2016
1 parent f4a4bdd commit 0e88ee8
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 0 deletions.
1 change: 1 addition & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## next version

+ upgrade cppjieba -> v4.7.0
+ add new API: Tokenize, which reports the byte location of each word in the sentence

## v0.13.0

Expand Down
23 changes: 23 additions & 0 deletions jieba.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,17 @@ static char** ConvertWords(const std::vector<std::string>& words) {
return res;
}

// ConvertWords copies token locations from cppjieba's Word vector into a
// C-style array terminated by a sentinel entry whose len is 0.
// The caller owns the returned buffer and must release it with free().
static Word* ConvertWords(const std::vector<cppjieba::Word>& words) {
  size_t n = words.size();
  Word* out = (Word*)malloc(sizeof(Word) * (n + 1));
  size_t i = 0;
  for (std::vector<cppjieba::Word>::const_iterator it = words.begin();
       it != words.end(); ++it, ++i) {
    out[i].offset = it->offset;
    out[i].len = it->word.size();  // byte length of the UTF-8 token
  }
  // Sentinel entry marks the end of the array for the Go side.
  out[n].offset = 0;
  out[n].len = 0;
  return out;
}

// NewJieba constructs a cppjieba::Jieba from the given dictionary paths and
// returns it as an opaque handle. Release the handle with FreeJieba.
Jieba NewJieba(const char* dict_path, const char* hmm_path, const char* user_dict) {
  cppjieba::Jieba* handle = new cppjieba::Jieba(dict_path, hmm_path, user_dict);
  return (Jieba)handle;
}
Expand Down Expand Up @@ -53,3 +64,15 @@ char** Tag(Jieba x, const char* sentence) {
}
return ConvertWords(words);
}

// Tokenize segments `sentence` and returns token locations as a
// sentinel-terminated Word array (see ConvertWords); the caller frees it.
// SearchMode uses the search-engine cut; any other mode falls back to the
// default cut. is_hmm_used is forwarded to cppjieba as a boolean flag.
Word* Tokenize(Jieba x, const char* sentence, TokenizeMode mode, int is_hmm_used) {
  cppjieba::Jieba* jieba = (cppjieba::Jieba*)x;
  std::vector<cppjieba::Word> words;
  if (mode == SearchMode) {
    jieba->CutForSearch(sentence, words, is_hmm_used);
  } else {
    jieba->Cut(sentence, words, is_hmm_used);
  }
  return ConvertWords(words);
}
25 changes: 25 additions & 0 deletions jieba.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,19 @@ package gojieba
import "C"
import "unsafe"

// TokenizeMode selects the segmentation strategy used by Jieba.Tokenize.
type TokenizeMode int

const (
// DefaultMode uses the standard cut.
DefaultMode TokenizeMode = iota
// SearchMode uses the search-engine style cut (CutForSearch on the C side).
SearchMode
)

// Word describes the location of one token inside the original sentence.
// Start and End are byte offsets into the input string (not rune indices);
// see the example "长春市" -> {0, 9}: three 3-byte UTF-8 runes.
type Word struct {
Str string
Start int
End int
}

// Jieba wraps the opaque C handle returned by the underlying C API.
type Jieba struct {
jieba C.Jieba
}
Expand Down Expand Up @@ -72,3 +85,15 @@ func (x *Jieba) Tag(s string) []string {
res := cstrings(words)
return res
}

// Tokenize splits s into tokens and reports each token's byte range within s.
// mode chooses the cut strategy and hmm toggles HMM usage in the underlying
// C library.
func (x *Jieba) Tokenize(s string, mode TokenizeMode, hmm bool) []Word {
	useHMM := C.int(0)
	if hmm {
		useHMM = C.int(1)
	}
	cs := C.CString(s)
	defer C.free(unsafe.Pointer(cs))
	// The C side returns a malloc'ed, sentinel-terminated array; free it
	// after it has been converted into Go values.
	res := C.Tokenize(x.jieba, cs, C.TokenizeMode(mode), useHMM)
	defer C.free(unsafe.Pointer(res))
	return convertWords(s, res)
}
13 changes: 13 additions & 0 deletions jieba.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@
#ifndef CJIEBA_JIEBA_H
#define CJIEBA_JIEBA_H

#include <stdlib.h>
#include "util.h"

typedef void* Jieba;

/* Word records the location of one token inside the input sentence:
 * offset is the byte offset where the token starts and len is its byte
 * length. An entry with len == 0 terminates a Word array. */
typedef struct {
size_t offset;
size_t len;
} Word;

/* TokenizeMode selects the segmentation strategy used by Tokenize. */
typedef enum {
DefaultMode = 0,
SearchMode = 1,
} TokenizeMode;

Jieba NewJieba(const char* dict_path, const char* hmm_path, const char* user_dict);
void FreeJieba(Jieba);

Expand All @@ -13,4 +24,6 @@ char** CutAll(Jieba handle, const char* sentence);
char** CutForSearch(Jieba handle, const char* sentence, int is_hmm_used);
char** Tag(Jieba handle, const char* sentence);

/* Tokenize segments `sentence` and returns a malloc'ed Word array terminated
 * by an entry with len == 0; the caller must free() it. `mode` selects the
 * cut strategy (see TokenizeMode); is_hmm_used is a boolean flag. */
Word* Tokenize(Jieba x, const char* sentence, TokenizeMode mode, int is_hmm_used);

#endif // CJIEBA_JIEBA_H
24 changes: 24 additions & 0 deletions jieba_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package gojieba

import (
"fmt"
"reflect"
"strings"
"testing"
)
Expand Down Expand Up @@ -38,6 +39,16 @@ func ExampleJieba() {
fmt.Println(s)
fmt.Println("词性标注:", strings.Join(words, ","))

s = "长春市长春药店"
wordinfos := x.Tokenize(s, SearchMode, false)
fmt.Println(s)
fmt.Println("Tokenize:", wordinfos)

//s = "长春市长春药店"
//wordinfos := x.Tokenize(s, SearchMode, !use_hmm)
//fmt.Println(s)
//fmt.Println(wordinfos)

// Output:
// 我来到北京清华大学
// 全模式: 我/来到/北京/清华/清华大学/华大/大学
Expand All @@ -49,6 +60,8 @@ func ExampleJieba() {
// 搜索引擎模式: 小明/硕士/毕业/于/中国/中国科学院/科学/科学院/学院/计算所/,/后/在/日本/日本京都大学/京都/京都大学/大学/深造
// 长春市长春药店
// 词性标注: 长春市/ns,长春/ns,药店/n
// 长春市长春药店
// Tokenize: [{长春市 0 9} {长春 9 15} {药店 15 21}]
}

func TestJieba(t *testing.T) {
Expand Down Expand Up @@ -101,6 +114,17 @@ func TestJieba(t *testing.T) {
if expected != actual {
t.Error(actual)
}

s = "长春市长春药店"
wordinfos := x.Tokenize(s, SearchMode, false)
expectedwords := []Word{
Word{Str: "长春市", Start: 0, End: 9},
Word{Str: "长春", Start: 9, End: 15},
Word{Str: "药店", Start: 15, End: 21},
}
if !reflect.DeepEqual(wordinfos, expectedwords) {
t.Error()
}
}

func BenchmarkJieba(b *testing.B) {
Expand Down
18 changes: 18 additions & 0 deletions util.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package gojieba

/*
#include "jieba.h"
*/
import "C"
import "unsafe"

Expand All @@ -12,3 +15,18 @@ func cstrings(x **C.char) []string {
}
return s
}

func convertWords(s string, words *C.Word) []Word {
result := make([]Word, 0)
x := words
eltSize := unsafe.Sizeof(*x)
start := 0
end := 0
for (*x).len != 0 {
start = int((*x).offset)
end = start + int((*x).len)
result = append(result, Word{s[start:end], start, end})
x = (*C.Word)(unsafe.Pointer(uintptr(unsafe.Pointer(x)) + eltSize))
}
return result
}

0 comments on commit 0e88ee8

Please sign in to comment.