Skip to content

Commit b638c52

Browse files
committed
moved from ksana-document
1 parent 3c8d15a commit b638c52

File tree

4 files changed

+269
-0
lines changed

4 files changed

+269
-0
lines changed

configs.js

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
var tokenizers=require('./tokenizers');
2+
// Module-wide normalization table shared by every config in this file.
// Keys are source characters, values are their replacements; null means
// "no table installed".
var normalizeTbl=null;

/**
 * Install the normalization table used by normalize1.
 *
 * @param {string[]} tbl  Entries of the form "from=to". Only consulted when
 *                        `obj` is not supplied. Entries that are not strings
 *                        or contain no "=" are ignored.
 * @param {Object} [obj]  A ready-made lookup object. When given it is
 *                        installed as-is and `tbl` is not read.
 * @returns {Object} The lookup object that is now installed.
 */
var setNormalizeTable=function(tbl,obj) {
  if (!obj) {
    obj={};
    var entries=tbl||[]; // tolerate a missing table instead of throwing
    for (var i=0;i<entries.length;i++) {
      var entry=entries[i];
      if (typeof entry!=="string") continue;
      var arr=entry.split("=");
      if (arr.length<2) continue; // no "=": nothing to map to
      obj[arr[0]]=arr[1];
    }
  }
  normalizeTbl=obj;
  return obj;
}
14+
/**
 * Normalize a token for the "simple1" config.
 *
 * Spaces, newlines, dots and commas are removed and the result is trimmed.
 * If a normalization table is installed (module-level `normalizeTbl`), every
 * character that has an entry in the table is replaced by its mapped value.
 *
 * Strings are immutable, so the result is rebuilt character by character
 * rather than assigned into by index (index assignment is a silent no-op in
 * sloppy mode and a TypeError in strict mode).
 *
 * @param {string} token  Raw token; falsy input yields "".
 * @returns {string} The normalized token.
 */
var normalize1=function(token) {
  if (!token) return "";
  token=token.replace(/[ \n\.,]/g,'').trim();
  if (!normalizeTbl) return token;
  var out="";
  var i=0;
  while (i<token.length) {
    var len=1;
    var code=token.charCodeAt(i);
    if (code>=0xD800 && code<=0xDBFF && i+1<token.length) {
      var next=token.charCodeAt(i+1);
      // keep a surrogate pair together so table keys outside the BMP match
      if (next>=0xDC00 && next<=0xDFFF) len=2;
    }
    var ch=token.substring(i,i+len);
    out+=normalizeTbl[ch] || ch;
    i+=len;
  }
  return out;
}
27+
/**
 * Tell whether a token should be skipped under the "simple1" config.
 *
 * A token is skipped when it is missing, when it is empty after trimming
 * (this covers space-only and newline-only tokens), or when it trims to the
 * reference mark "※".
 *
 * @param {string} token
 * @returns {boolean}
 */
var isSkip1=function(token) {
  if (!token) return true;
  var t=token.trim();
  return t==="" || t==="※";
}
31+
/**
 * Normalize a token for the "tibetan1" config: remove every character in the
 * class `[ ]` and trim the result. Falsy input yields "" (same convention as
 * normalize1).
 *
 * NOTE(review): only a plain space is visible in the character class here;
 * confirm no Tibetan punctuation was lost from it.
 *
 * @param {string} token
 * @returns {string}
 */
var normalize_tibetan=function(token) {
  if (!token) return "";
  return token.replace(/[ ]/g,'').trim();
}
34+
35+
/**
 * Tell whether a token should be skipped under the "tibetan1" config.
 *
 * A token is skipped when it is missing or empty after trimming, which
 * covers space-only and newline-only tokens.
 *
 * @param {string} token
 * @returns {boolean}
 */
var isSkip_tibetan=function(token) {
  if (!token) return true;
  return token.trim()==="";
}
39+
// Function table for the "simple1" config: the `simple` tokenizer from
// ./tokenizers plus the normalization and skip helpers defined in this file.
var simple1={
  func:{
    tokenize:tokenizers.simple,
    setNormalizeTable:setNormalizeTable,
    normalize:normalize1,
    isSkip:isSkip1
  }
};
48+
// Function table for the "tibetan1" config: the `tibetan` tokenizer from
// ./tokenizers plus the Tibetan normalization and skip helpers defined in
// this file. setNormalizeTable is the same function the "simple1" config
// exposes.
var tibetan1={
  func:{
    tokenize:tokenizers.tibetan,
    setNormalizeTable:setNormalizeTable,
    normalize:normalize_tibetan,
    isSkip:isSkip_tibetan
  }
};
56+
module.exports={"simple1":simple1,"tibetan1":tibetan1}

index.js

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/*
  custom functions for building and searching ydb

  keep all versions

  getAPI(version); // returns a hash of functions; if version is omitted, returns the latest

  postings2Tree      // if version is not supplied, get the latest
  tokenize(text,api) // convert a string into tokens (depends on other api)
  normalizeToken     // stemming, etc.
  isSpaceChar        // not a searchable token
  isSkipChar         // 0 vpos

  for client and server side
*/
17+
var configs=require("./configs");
18+
var config_simple="simple1";
19+
/**
 * Optimization hook attached to every API table returned by getAPI.
 * Currently a pass-through: the input is returned unchanged.
 *
 * @param {*} json      Data to optimize.
 * @param {string} [config]  Config name; accepted for interface
 *                           compatibility, not used yet.
 * @returns {*} `json`, unchanged.
 */
var optimize=function(json,config) {
  return json;
}
23+
24+
/**
 * Return the function table for a named config.
 *
 * The table object stored in `configs` is returned itself (not a copy) and
 * gets an `optimize` member attached.
 *
 * The config name is validated before it is dereferenced, so an unknown name
 * produces the intended "not supported" error instead of a TypeError from
 * reading `.func` of undefined.
 *
 * @param {string} [config]  Config name; defaults to `config_simple`.
 * @returns {Object} Function table of the config.
 * @throws {Error} When no such config exists.
 */
var getAPI=function(config) {
  config=config||config_simple;
  var known=Object.prototype.hasOwnProperty.call(configs,config);
  var entry=known?configs[config]:null;
  if (!entry || !entry.func) {
    throw new Error("config "+config+" not supported");
  }
  var func=entry.func;
  func.optimize=optimize;
  // add per-config custom functions here if they are ever needed
  return func;
}
36+
37+
module.exports={getAPI:getAPI};

package.json

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
{
2+
"name": "ksana-analyzer",
3+
"version": "1.0.0",
4+
"description": "convert text stream to tokens",
5+
"main": "index.js",
6+
"scripts": {
7+
"test": "mocha"
8+
},
9+
"repository": {
10+
"type": "git",
11+
"url": "https://yapcheahshen@github.com/ksanaforge/ksana-analyzer"
12+
},
13+
"keywords": [
14+
"tokenizing",
15+
"segmentation"
16+
],
17+
"author": "yapcheahshen@gmail.com",
18+
"license": "MIT",
19+
"bugs": {
20+
"url": "https://github.com/ksanaforge/ksana-analyzer/issues"
21+
},
22+
"homepage": "https://github.com/ksanaforge/ksana-analyzer"
23+
}

tokenizers.js

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
/**
 * Tokenizer for the "tibetan1" config.
 *
 * Line endings are first normalized to "\n". Each line is cut after every
 * run of characters matching `[ ]+`; the run stays attached to the end of
 * the text before it. Any text left after the last run becomes a token of
 * its own, and every line break becomes a "\n" token.
 *
 * Every offset is the start position of its token in the line-ending
 * normalized string (the line's base offset is included for all tokens,
 * and a "\n" token points at the newline itself).
 *
 * NOTE(review): the original comments mention grouping tsheg and shad, but
 * only a plain space is visible in the character class here; confirm no
 * characters were lost from it.
 *
 * @param {string} s  Text to tokenize.
 * @returns {{tokens: string[], offsets: number[]}} Parallel arrays.
 */
var tibetan =function(s) {
  var offset=0; // position of the current line's first character
  var tokens=[],offsets=[];
  s=s.replace(/\r\n/g,'\n').replace(/\r/g,'\n');
  var arr=s.split('\n');

  for (var i=0;i<arr.length;i++) {
    var last=0; // end of the previous token within the current line
    var str=arr[i];
    // replace() is used only to visit each match; its result is discarded
    str.replace(/[ ]+/g,function(m,m1){
      tokens.push(str.substring(last,m1)+m);
      offsets.push(offset+last);
      last=m1+m.length;
    });
    if (last<str.length) {
      tokens.push(str.substring(last));
      offsets.push(offset+last);
    }
    if (i===arr.length-1) break; // no newline after the last line
    tokens.push('\n');
    offsets.push(offset+str.length);
    offset+=str.length+1;
  }

  return {tokens:tokens,offsets:offsets};
};
29+
/**
 * Tell whether `c` is the plain space character. Comma and dot are
 * deliberately not treated as spaces.
 *
 * @param {string} c  A single character (or undefined past end of string).
 * @returns {boolean}
 */
var isSpace=function(c) {
  return c===" ";
}
32+
/**
 * Tell whether a UTF-16 code unit is tokenized as a single-character token.
 *
 * True for 0x3000-0x9FFF, for high surrogates 0xD800-0xDBFF, and for
 * anything at or above 0xFF00.
 *
 * @param {number} c  Code unit, as returned by String.prototype.charCodeAt.
 * @returns {boolean}
 */
var isCJK =function(c) {
  return ((c>=0x3000 && c<=0x9FFF)
    || (c>=0xD800 && c<0xDC00) || (c>=0xFF00));
}
34+
/**
 * Line-oriented tokenizer (not exported from this module).
 *
 * Line endings are normalized to "\n". Within each line, every run of
 * `[_0-9A-Za-z]+` closes a chunk made of the text since the previous chunk
 * plus the run itself. Spaces in front of a chunk are appended to the
 * previous token when there is one. A chunk whose first code unit is above
 * 255 is split into one token per character (a surrogate pair counts as one
 * character); any other chunk is a single token. Each line break becomes a
 * "\n" token.
 *
 * `tokens` and `offsets` are parallel: offsets[k] is the start of tokens[k]
 * in the line-ending normalized string.
 *
 * @param {string} s  Text to tokenize.
 * @returns {{tokens: string[], offsets: number[]}}
 */
var simple1=function(s) {
  var offset=0; // position of the current line's first character
  var tokens=[],offsets=[];
  s=s.replace(/\r\n/g,'\n').replace(/\r/g,'\n');
  var arr=s.split('\n');

  // Emit chunk `t` that starts at absolute position `off`.
  var pushtoken=function(t,off) {
    if (!t) return; // nothing left once leading spaces were consumed
    if (t.charCodeAt(0)<=255) {
      tokens.push(t);
      offsets.push(off);
      return;
    }
    var i=0;
    while (i<t.length) {
      var c=t.charCodeAt(i);
      // a high surrogate takes the following code unit with it
      var len=(c>=0xD800 && c<=0xDBFF && i+1<t.length)?2:1;
      tokens.push(t.substring(i,i+len));
      offsets.push(off+i);
      i+=len;
    }
  };

  // Append spaces starting at `from` to the previous token and return the
  // index of the first character not consumed. Without a previous token the
  // spaces are left in place and become part of the next chunk.
  var attachSpaces=function(str,from) {
    var p=from;
    while (tokens.length>0 && p<str.length && isSpace(str[p])) {
      tokens[tokens.length-1]+=str[p];
      p++;
    }
    return p;
  };

  for (var i=0;i<arr.length;i++) {
    var last=0;
    var str=arr[i];
    // replace() is used only to visit each match; its result is discarded
    str.replace(/[_0-9A-Za-z]+/g,function(m,m1){
      last=attachSpaces(str,last);
      pushtoken(str.substring(last,m1)+m , offset+last);
      last=m1+m.length;
    });

    if (last<str.length) {
      last=attachSpaces(str,last);
      pushtoken(str.substring(last), offset+last);
    }
    if (i===arr.length-1) break; // no newline after the last line
    tokens.push('\n');
    offsets.push(offset+str.length);
    offset+=str.length+1;
  }

  return {tokens:tokens,offsets:offsets};
};
87+
88+
/**
 * Character-driven tokenizer used by the "simple1" config.
 *
 * Rules, applied per UTF-16 code unit:
 *  - a code unit accepted by isCJK is a token by itself (a high surrogate
 *    takes the following code unit with it);
 *  - "&" starts a token that runs up to and including the next ";", and
 *    "<" one that runs up to and including the next ">"; when the
 *    terminator is missing the token simply runs to the end of the input;
 *  - any other delimiter character is a token by itself;
 *  - spaces are appended to the token being built, and the first non-space
 *    character after them starts a new token;
 *  - everything else accumulates into the current token.
 *
 * `tokens` and `offsets` are parallel: offsets[k] is the index in `s` at
 * which tokens[k] starts.
 *
 * @param {string} s  Text to tokenize.
 * @returns {{tokens: string[], offsets: number[]}}
 */
var simple=function(s) {
  var token='';
  var tokenStart=0; // index in s where the token being built began
  var tokens=[], offsets=[] ;
  var i=0;
  var lastspace=false;

  // Flush the token being built, if any.
  var addtoken=function() {
    if (!token) return;
    tokens.push(token);
    offsets.push(tokenStart);
    token='';
  };
  // Append text found at index `at`, remembering where a new token starts.
  var append=function(text,at) {
    if (!token) tokenStart=at;
    token+=text;
  };
  // Characters that always end the current token.
  var isDelimiter=function(c,code) {
    return "&<?,.|~`;>:=@-]})".indexOf(c)>=0
      || code==0xf0b || code==0xf0d // tibetan space
      || (code>=0x2000 && code<=0x206f);
  };

  while (i<s.length) {
    var c=s.charAt(i);
    var code=s.charCodeAt(i);
    if (isCJK(code)) {
      addtoken();
      append(c,i);
      if (code>=0xD800 && code<0xDC00) { // high surrogate
        token+=s.charAt(i+1);i++;
      }
      addtoken();
    } else if (isDelimiter(c,code)) {
      addtoken();
      if (c=='&' || c=='<') {
        var endchar=(c=='&')?';':'>';
        tokenStart=i;
        while (i<s.length && s.charAt(i)!=endchar) {
          token+=s.charAt(i);
          i++;
        }
        // only close the token with a terminator that is really there
        if (i<s.length) token+=endchar;
        addtoken();
      } else {
        append(c,i);
        addtoken();
      }
    } else if (c==" ") {
      append(c,i);
      lastspace=true;
    } else {
      if (lastspace) addtoken();
      lastspace=false;
      append(c,i);
    }
    i++;
  }
  addtoken();
  return {tokens:tokens,offsets:offsets};
}
153+
module.exports={simple:simple,tibetan:tibetan};

0 commit comments

Comments
 (0)