Commit

* move hanzi_util out of cleanup
* cedict also dumps data into a sqlite db
tchaikov committed Oct 25, 2009
1 parent 7decb5a commit 09f317e
Showing 7 changed files with 126 additions and 79 deletions.
5 changes: 0 additions & 5 deletions Makefile

This file was deleted.

6 changes: 6 additions & 0 deletions build/Makefile
@@ -0,0 +1,6 @@
install:
	cp -f ../tools/CRF++-0.53/python/.libs/_crfpp.so .
	cp -f ../tools/CRF++-0.53/python/crfpp.py .
	cp -f ../segment/tagging/baseseg.py .
	cp -f ../segment/tagging/cleanup.py .
	cp -f ../segment/tagging/hanzi_util.py .
37 changes: 31 additions & 6 deletions lexicon/cedict2sunpinyin.py → lexicon/cedict.py
@@ -7,6 +7,7 @@
import sys
import codecs
import re
+from optparse import OptionParser

# the format of cedict looks like
# <traditional chinese word> <simplified chinese word> [<pinyins with tones>] /translations|in|english/
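# for example, a typical entry (shown here for illustration):
# 中國 中国 [Zhong1 guo2] /China/Middle Kingdom/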
@@ -27,25 +28,49 @@ def normalize(py):
return "'".join(normalize(py) for py in pinyins.split())

cedict_pattern = re.compile('\S+ (\S+) \[([^\]]+)\] .*')
-def transform(line):
+
+def transform(line, dump_func):
    try:
        sc_word, pinyins = cedict_pattern.match(line).groups()
-        print sc_word, normalize_pinyins(pinyins)
+        dump_func(sc_word, normalize_pinyins(pinyins))
    except:
        pass # just ran into an unknown line or a hybrid word

-def main(cedict_fname):
+def dump(cedict_fname, dump_func):
    try:
        cedict_file = codecs.open(cedict_fname, "r", "utf-8")
    except:
        print >> sys.stderr, "failed to open %s" % cedict_fname
        sys.exit(1)
    for line in cedict_file:
        if line.startswith(u'#'): continue
-        transform(line)
+        transform(line, dump_func)
    cedict_file.close()

+def dump_sunpinyin(sc_word, pinyins):
+    print sc_word, pinyins
+
+class DumpToDb(object):
+    def __init__(self, filename):
+        # import here so wordb is resolvable inside the method
+        # (a class-level "import wordb" is not visible as a bare name here)
+        import wordb
+        self.db = wordb.open(filename)
+
+    def __call__(self, word, pinyins):
+        # transform() passes (word, pinyin); only the word is recorded
+        self.db[word] = 1

if __name__ == "__main__":
+    parser = OptionParser()
+    parser.add_option("-d", "--dict",
+                      help="uncompressed cedict file",
+                      metavar="DICT",
+                      default="../data/cedict_1_0_ts_utf-8_mdbg.txt")
+    parser.add_option("-m", "--db",
+                      help="dump DICT to the sqlite3 DB",
+                      metavar="DB")
+    parser.add_option("-o", "--output",
+                      help="dump simplified Chinese words and their pinyin from DICT to FILE",
+                      metavar="FILE")

-    #cedict_fname = sys.argv[1]
-    cedict_fname = "/media/stuff/dev/dev/nlp/cedict_1_0_ts_utf-8_mdbg.txt"
-    main(cedict_fname)
+    (options, args) = parser.parse_args()
+    cedict_fname = options.dict
+    # dump(cedict_fname, dump_sunpinyin)
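
A minimal sketch of how the parsed options might drive dump(); the branch logic below is illustrative only and not part of this commit:

    if options.db:
        # record each simplified-Chinese headword in the sqlite3 db
        dump(cedict_fname, DumpToDb(options.db))
    else:
        # emit sunpinyin-style "word pinyin" lines (redirect to FILE as needed)
        dump(cedict_fname, dump_sunpinyin)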
42 changes: 24 additions & 18 deletions lexicon/search_filter.py
@@ -29,18 +29,22 @@ def __str__(self):
class SearchEngine(object):
    def choose_ip(self):
        # TODO: should be round-robin, and mark those banned IPs with lower priority
-        ip = random.choice(filter(lambda ip: self.ips[ip] == 0, self.ips))
+        return random.choice(filter(lambda ip: self.ips[ip] == 0, self.ips))
+
+    def remove_ip(self, ip):
+        self.ips[ip] = 1

    def is_miss(self, result):
        return self.re_miss.search(result) is not None

    def get_freq(self, result):
-        match = self.re_hit.findall (result)
-        if not match:
+        match = self.re_hit.findall(result)
+        if match:
+            return int (match[0].replace(',', ''))
+        else:
            # could be search engine's suggestion or
            # it just banned me
            return 0
-        freq = int (match[0].replace(',', ''))

class Baidu(SearchEngine):
    url = "http://%s/s?%s"
@@ -60,7 +64,7 @@ def build_url(self, query):
        query = urllib2.quote(query.encode('utf-8'))
        param = urllib.urlencode({'wd':'"%s"' % query, 'ie':'utf-8', 'oe':'utf-8'})
        ip = self.choose_ip()
-        return self.url % (ip, query)
+        return self.url % (ip, param)

class Google(SearchEngine):
    url = "http://%s/search?%s"
@@ -93,19 +97,19 @@ class Google(SearchEngine):
           "72.14.235.147":0,
           "74.125.19.147":0,
           "74.125.19.103":0}
-    re_hit = re.compile (u"<b>([0-9\,]+)</b> 项符合 *<b>")
+    re_hit = re.compile (u"获得约 <b>([0-9\,]+)</b> 条结果")
    re_miss = re.compile(u"未找到符合.*的结果")

    def build_url(self, query):
-        query = '"%s"' % urllib2.quote(query.encode('utf-8'))
-        param = urllib.urlencode({'as_epq': query,
+        #query = urllib2.quote(query.encode('utf-8'))
+        param = urllib.urlencode({'as_epq': query.encode('utf-8'),
                                  'ie':'utf-8',
                                  'oe':'utf-8',
                                  'hl':'zh_CN',
                                  'c2coff':'1',
                                  'lr':''})
        ip = self.choose_ip()
-        return self.url % (ip, query)
+        return self.url % (ip, param), ip

class SearchEngineFilter(object):
    def __init__(self, search_engine, threshold = 100000):
@@ -116,14 +120,17 @@ def __init__(self, search_engine, threshold = 100000):
                                          'Gecko/20080314'
                                          'Firefox/3.0.3')}
    def get_freq(self, word):
-        #params0 = urllib.urlencode ({'as_epq': '"%s"' % word})
-        url = self.se.build_url(word)
-        req = urllib2.Request (url, headers=self.http_headers)
-        #f = codecs.open( "test1.txt", "r", "utf-8" )
-        f = urllib2.urlopen (req)
-        lines = unicode("".join(f.readlines()), "utf-8")
-        return self.se.get_freq(lines)
-
+        while True:
+            try:
+                url, ip = self.se.build_url(word)
+                req = urllib2.Request (url, headers=self.http_headers)
+                f = urllib2.urlopen (req)
+                lines = unicode("".join(f.readlines()), "utf-8")
+                return self.se.get_freq(lines)
+            except urllib2.URLError, e:
+                # this ip is not accessible
+                self.se.remove_ip(ip)

    def get_freq__(self, word):
        ip = random.choice(filter(lambda ip: self.se.ips[ip] == 0, self.se.ips))
        freq = self.__get_word_freq_from_ip(word, ip)
@@ -156,4 +163,3 @@ def get_freq_(self, word):
google_filter = SearchEngineFilter(Google())
for word in [u'人间', u'大炮']:
    print word, ':', google_filter.get_freq(word)

7 changes: 4 additions & 3 deletions segment/tagging/baseseg.py
@@ -17,7 +17,7 @@ def __init__(self, **args):
        arg_str = ' '.join([' '.join(['-'+k,str(v)]) for k,v in args.items()])
        self.tagger = crfpp.Tagger(arg_str)

-    def segment(self, tokens):
+    def segment(self, tokens, delimeter=u'/'):
        self.tagger.clear()
        for token in tokens:
            self.tagger.add(token.encode('utf-8'))
@@ -36,8 +36,9 @@ def segment(self, tokens):
            word.append(tokens[i])
        if word:
            words.append(''.join(word))
-        print u' '.join(words)
-        print ''.join(tokens)
+        if self.verbose:
+            print ''.join(tokens)
+        print delimeter.join(words)

    def __call__(self, tokens):
        self.segment(tokens)
48 changes: 1 addition & 47 deletions segment/tagging/cleanup.py
@@ -21,53 +21,7 @@
from __future__ import with_statement
import sys
import codecs
-from curses import ascii
-
-def is_zh(ch):
-    """return True if ch is a Chinese character.
-    full-width punctuation and Latin characters are not counted.
-    """
-    x = ord(ch)
-    # CJK Radicals Supplement and Kangxi radicals
-    if 0x2e80 <= x <= 0x2fef:
-        return True
-    # CJK Unified Ideographs Extension A
-    elif 0x3400 <= x <= 0x4dbf:
-        return True
-    # CJK Unified Ideographs
-    elif 0x4e00 <= x <= 0x9fbb:
-        return True
-    # CJK Compatibility Ideographs
-    elif 0xf900 <= x <= 0xfad9:
-        return True
-    # CJK Unified Ideographs Extension B
-    elif 0x20000 <= x <= 0x2a6df:
-        return True
-    else:
-        return False
-
-def is_punct(ch):
-    x = ord(ch)
-    # in informal text, space is sometimes used as punctuation.
-    if x < 127 and ascii.ispunct(x):
-        return True
-    # General Punctuation
-    elif 0x2000 <= x <= 0x206f:
-        return True
-    # CJK Symbols and Punctuation
-    elif 0x3000 <= x <= 0x303f:
-        return True
-    # Halfwidth and Fullwidth Forms
-    elif 0xff00 <= x <= 0xffef:
-        return True
-    # CJK Compatibility Forms
-    elif 0xfe30 <= x <= 0xfe4f:
-        return True
-    else:
-        return False
-
-def is_terminator(ch):
-    return ch in (u'!', u'?', u',', u';', u'.', u'！', u'？', u'，', u'。', u'…')
+from hanzi_util import is_terminator, is_punct, is_zh

def split_into_sentences(line):
    tokens = []
60 changes: 60 additions & 0 deletions segment/tagging/hanzi_util.py
@@ -0,0 +1,60 @@
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from curses import ascii

def is_zh(ch):
    """return True if ch is a Chinese character.
    full-width punctuation and Latin characters are not counted.
    """
    x = ord(ch)
    # CJK Radicals Supplement and Kangxi radicals
    if 0x2e80 <= x <= 0x2fef:
        return True
    # CJK Unified Ideographs Extension A
    elif 0x3400 <= x <= 0x4dbf:
        return True
    # CJK Unified Ideographs
    elif 0x4e00 <= x <= 0x9fbb:
        return True
    # CJK Compatibility Ideographs
    elif 0xf900 <= x <= 0xfad9:
        return True
    # CJK Unified Ideographs Extension B
    elif 0x20000 <= x <= 0x2a6df:
        return True
    else:
        return False

def is_punct(ch):
    x = ord(ch)
    # in informal text, space is sometimes used as punctuation.
    if x < 127 and ascii.ispunct(x):
        return True
    # General Punctuation
    elif 0x2000 <= x <= 0x206f:
        return True
    # CJK Symbols and Punctuation
    elif 0x3000 <= x <= 0x303f:
        return True
    # Halfwidth and Fullwidth Forms
    elif 0xff00 <= x <= 0xffef:
        return True
    # CJK Compatibility Forms
    elif 0xfe30 <= x <= 0xfe4f:
        return True
    else:
        return False

def is_terminator(ch):
    return ch in (u'!', u'?', u',', u';', u'.', u'！', u'？', u'，', u'。', u'…')

if __name__ == "__main__":
    s = u"hehe, 我爱北京天安门。"
    predicates = {u'zh': is_zh,
                  u'punct': is_punct,
                  u'term': is_terminator}
    fmt_str = ' %2s %-7s %-7s %-7s'
    print fmt_str % tuple(['char'] + predicates.keys())
    for c in s:
        print fmt_str % tuple([c] + [str(pred(c)) for pred in predicates.values()])
