-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* cedict also dumps data into a sqlite db
- Loading branch information
Showing
7 changed files
with
126 additions
and
79 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Copy the CRF++ Python bindings (_crfpp.so, crfpp.py) and the segmenter
# helper modules (baseseg.py, cleanup.py, hanzi_util.py) into this directory.
# Declared phony so the rule still runs if a file named "install" exists.
.PHONY: install
install:
	cp -f ../tools/CRF++-0.53/python/.libs/_crfpp.so .
	cp -f ../tools/CRF++-0.53/python/crfpp.py .
	cp -f ../segment/tagging/baseseg.py .
	cp -f ../segment/tagging/cleanup.py .
	cp -f ../segment/tagging/hanzi_util.py .
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
#!/usr/bin/python | ||
# -*- encoding: utf-8 -*- | ||
|
||
from curses import ascii | ||
|
||
# Inclusive code point ranges treated as Chinese characters by is_zh().
_ZH_RANGES = (
    (0x2e80, 0x2fef),    # CJK Radicals Supplement and Kangxi Radicals
    (0x3400, 0x4dbf),    # CJK Unified Ideographs Extension A
    (0x4e00, 0x9fff),    # CJK Unified Ideographs (whole block)
    (0xf900, 0xfaff),    # CJK Compatibility Ideographs (whole block)
    (0x20000, 0x2a6df),  # CJK Unified Ideographs Extension B
    (0x2a700, 0x2ebef),  # CJK Unified Ideographs Extensions C-F
    (0x2f800, 0x2fa1f),  # CJK Compatibility Ideographs Supplement
)


def is_zh(ch):
    """Return True if ch is a Chinese character.

    Full-width punctuation and full-width Latin letters are not counted:
    they live in other Unicode blocks and yield False.

    ch must be a single character; ord() raises TypeError otherwise.
    The ranges follow Unicode block boundaries so that ideographs added
    to a block by later Unicode versions (e.g. U+9FBC-U+9FFF) are also
    recognized.
    """
    x = ord(ch)
    for low, high in _ZH_RANGES:
        if low <= x <= high:
            return True
    return False
|
||
def is_punct(ch):
    """Return True if ch is a punctuation (or symbol) character.

    Recognized characters:
      * ASCII punctuation, as defined by curses.ascii.ispunct;
      * the General Punctuation block (U+2000-U+206F);
      * the CJK Symbols and Punctuation block (U+3000-U+303F);
      * the CJK Compatibility Forms block (U+FE30-U+FE4F);
      * the punctuation/symbol part of Halfwidth and Fullwidth Forms.

    Full-width Latin letters and digits and half-width katakana/hangul
    are NOT punctuation, although they share a Unicode block with the
    full-width punctuation marks.

    NOTE: the ASCII space is not reported as punctuation, whereas the
    Unicode spaces inside the blocks above (e.g. U+3000) are.

    ch must be a single character; ord() raises TypeError otherwise.
    """
    x = ord(ch)
    if x < 127:
        return bool(ascii.ispunct(x))
    # General Punctuation
    if 0x2000 <= x <= 0x206f:
        return True
    # CJK Symbols and Punctuation
    if 0x3000 <= x <= 0x303f:
        return True
    # CJK Compatibility Forms
    if 0xfe30 <= x <= 0xfe4f:
        return True
    # Full-width ASCII variants: U+FF01-U+FF5E mirror U+0021-U+007E at a
    # fixed offset, so reuse the ASCII rule to keep letters/digits out.
    if 0xff01 <= x <= 0xff5e:
        return bool(ascii.ispunct(x - 0xfee0))
    # Full-width white parentheses and half-width CJK punctuation
    if 0xff5f <= x <= 0xff65:
        return True
    # Full-width and half-width symbol variants
    if 0xffe0 <= x <= 0xffee:
        return True
    return False
|
||
# Characters that end a sentence or clause.  The non-ASCII members are
# written as escapes so that width normalization of this source file
# cannot silently collapse them into their ASCII look-alikes.
_TERMINATORS = frozenset((
    u'!', u'?', u',', u';', u'.',
    u'\uff01',  # fullwidth exclamation mark
    u'\uff1f',  # fullwidth question mark
    u'\uff0c',  # fullwidth comma
    u'\u3002',  # ideographic full stop
    u'\u2026',  # horizontal ellipsis
))


def is_terminator(ch):
    """Return True if ch is a sentence/clause terminating character.

    Covers the ASCII marks ! ? , ; . together with the full-width
    exclamation mark, question mark and comma, the ideographic full stop
    and the horizontal ellipsis.  Any other string yields False.
    """
    return ch in _TERMINATORS
|
||
if __name__ == "__main__":
    # Self-demo: print one row per character of a sample string showing
    # the result of each predicate.
    s = u"hehe, 我爱北京天安门。"
    # An ordered list of (label, predicate) pairs keeps the header and the
    # value columns aligned and in a fixed order on every interpreter.
    predicates = [(u'zh', is_zh),
                  (u'punct', is_punct),
                  (u'term', is_terminator)]
    fmt_str = ' %2s %-7s %-7s %-7s'
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(fmt_str % tuple(['char'] + [label for label, _ in predicates]))
    for c in s:
        print(fmt_str % tuple([c] + [str(pred(c)) for _, pred in predicates]))