Commit

* move hanzi_util out of cleanup
* cedict also dumps data into a sqlite db
tchaikov committed Oct 25, 2009
1 parent 7decb5a commit 09f317e
Showing 7 changed files with 126 additions and 79 deletions.
5 changes: 0 additions & 5 deletions Makefile

This file was deleted.

6 changes: 6 additions & 0 deletions build/Makefile
@@ -0,0 +1,6 @@
install:
	cp -f ../tools/CRF++-0.53/python/.libs/_crfpp.so .
	cp -f ../tools/CRF++-0.53/python/crfpp.py .
	cp -f ../segment/tagging/baseseg.py .
	cp -f ../segment/tagging/cleanup.py .
	cp -f ../segment/tagging/hanzi_util.py .
37 changes: 31 additions & 6 deletions lexicon/cedict2sunpinyin.py → lexicon/cedict.py
@@ -7,6 +7,7 @@
import sys
import codecs
import re
+from optparse import OptionParser

# the format of cedict looks like
# <traditional chinese word> <simplified chinese word> [<pinyins with tones>] /translations|in|english/
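# for example, a typical entry (shown here for illustration):
# 中國 中国 [Zhong1 guo2] /China/Middle Kingdom/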
@@ -27,25 +28,49 @@ def normalize(py):
return "'".join(normalize(py) for py in pinyins.split())

cedict_pattern = re.compile('\S+ (\S+) \[([^\]]+)\] .*')
-def transform(line):
+
+def transform(line, dump_func):
    try:
        sc_word, pinyins = cedict_pattern.match(line).groups()
-        print sc_word, normalize_pinyins(pinyins)
+        dump_func(sc_word, normalize_pinyins(pinyins))
    except:
        pass # just ran into an unknown line or a hybrid word

-def main(cedict_fname):
+def dump(cedict_fname, dump_func):
    try:
        cedict_file = codecs.open(cedict_fname, "r", "utf-8")
    except:
        print >> sys.stderr, "failed to open %s" % cedict_fname
        sys.exit(1)
    for line in cedict_file:
        if line.startswith(u'#'): continue
-        transform(line)
+        transform(line, dump_func)
    cedict_file.close()

+def dump_sunpinyin(sc_word, pinyins):
+    print sc_word, pinyins
+
+class DumpToDb(object):
+    def __init__(self, filename):
+        # import here so wordb is resolvable inside the method
+        # (a class-level "import wordb" is not visible as a bare name here)
+        import wordb
+        self.db = wordb.open(filename)
+
+    def __call__(self, word, pinyins):
+        # transform() passes (word, pinyin); only the word is recorded
+        self.db[word] = 1

if __name__ == "__main__":
+    parser = OptionParser()
+    parser.add_option("-d", "--dict",
+                      help="uncompressed cedict file",
+                      metavar="DICT",
+                      default="../data/cedict_1_0_ts_utf-8_mdbg.txt")
+    parser.add_option("-m", "--db",
+                      help="dump DICT to the sqlite3 DB",
+                      metavar="DB")
+    parser.add_option("-o", "--output",
+                      help="dump simplified Chinese words and their pinyin from DICT to FILE",
+                      metavar="FILE")

-    #cedict_fname = sys.argv[1]
-    cedict_fname = "/media/stuff/dev/dev/nlp/cedict_1_0_ts_utf-8_mdbg.txt"
-    main(cedict_fname)
+    (options, args) = parser.parse_args()
+    cedict_fname = options.dict
+    # dump(cedict_fname, dump_sunpinyin)
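
A minimal sketch of how the parsed options might drive dump(); the branch logic below is illustrative only and not part of this commit:

    if options.db:
        # record each simplified-Chinese headword in the sqlite3 db
        dump(cedict_fname, DumpToDb(options.db))
    else:
        # emit sunpinyin-style "word pinyin" lines (redirect to FILE as needed)
        dump(cedict_fname, dump_sunpinyin)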
42 changes: 24 additions & 18 deletions lexicon/search_filter.py
@@ -29,18 +29,22 @@ def __str__(self):
class SearchEngine(object):
    def choose_ip(self):
        # TODO: should be round-robin, and mark those banned IPs with lower priority
-        ip = random.choice(filter(lambda ip: self.ips[ip] == 0, self.ips))
+        return random.choice(filter(lambda ip: self.ips[ip] == 0, self.ips))
+
+    def remove_ip(self, ip):
+        self.ips[ip] = 1

    def is_miss(self, result):
        return self.re_miss.search(result) is not None

    def get_freq(self, result):
-        match = self.re_hit.findall (result)
-        if not match:
+        match = self.re_hit.findall(result)
+        if match:
+            return int (match[0].replace(',', ''))
+        else:
            # could be search engine's suggestion or
            # it just banned me
            return 0
-        freq = int (match[0].replace(',', ''))

class Baidu(SearchEngine):
    url = "http://%s/s?%s"
@@ -60,7 +64,7 @@ def build_url(self, query):
        query = urllib2.quote(query.encode('utf-8'))
        param = urllib.urlencode({'wd':'"%s"' % query, 'ie':'utf-8', 'oe':'utf-8'})
        ip = self.choose_ip()
-        return self.url % (ip, query)
+        return self.url % (ip, param)

class Google(SearchEngine):
    url = "http://%s/search?%s"
@@ -93,19 +97,19 @@ class Google(SearchEngine):
           "72.14.235.147":0,
           "74.125.19.147":0,
           "74.125.19.103":0}
-    re_hit = re.compile (u"<b>([0-9\,]+)</b> 项符合 *<b>")
+    re_hit = re.compile (u"获得约 <b>([0-9\,]+)</b> 条结果")
    re_miss = re.compile(u"未找到符合.*的结果")

    def build_url(self, query):
-        query = '"%s"' % urllib2.quote(query.encode('utf-8'))
-        param = urllib.urlencode({'as_epq': query,
+        #query = urllib2.quote(query.encode('utf-8'))
+        param = urllib.urlencode({'as_epq': query.encode('utf-8'),
                                  'ie':'utf-8',
                                  'oe':'utf-8',
                                  'hl':'zh_CN',
                                  'c2coff':'1',
                                  'lr':''})
        ip = self.choose_ip()
-        return self.url % (ip, query)
+        return self.url % (ip, param), ip

class SearchEngineFilter(object):
    def __init__(self, search_engine, threshold = 100000):
@@ -116,14 +120,17 @@ def __init__(self, search_engine, threshold = 100000):
                                          'Gecko/20080314'
                                          'Firefox/3.0.3')}
    def get_freq(self, word):
-        #params0 = urllib.urlencode ({'as_epq': '"%s"' % word})
-        url = self.se.build_url(word)
-        req = urllib2.Request (url, headers=self.http_headers)
-        #f = codecs.open( "test1.txt", "r", "utf-8" )
-        f = urllib2.urlopen (req)
-        lines = unicode("".join(f.readlines()), "utf-8")
-        return self.se.get_freq(lines)
-
+        while True:
+            try:
+                url, ip = self.se.build_url(word)
+                req = urllib2.Request (url, headers=self.http_headers)
+                f = urllib2.urlopen (req)
+                lines = unicode("".join(f.readlines()), "utf-8")
+                return self.se.get_freq(lines)
+            except urllib2.URLError, e:
+                # this ip is not accessible
+                self.se.remove_ip(ip)

    def get_freq__(self, word):
        ip = random.choice(filter(lambda ip: self.se.ips[ip] == 0, self.se.ips))
        freq = self.__get_word_freq_from_ip(word, ip)
@@ -156,4 +163,3 @@ def get_freq_(self, word):
google_filter = SearchEngineFilter(Google())
for word in [u'人间', u'大炮']:
    print word, ':', google_filter.get_freq(word)

7 changes: 4 additions & 3 deletions segment/tagging/baseseg.py
@@ -17,7 +17,7 @@ def __init__(self, **args):
        arg_str = ' '.join([' '.join(['-'+k,str(v)]) for k,v in args.items()])
        self.tagger = crfpp.Tagger(arg_str)

-    def segment(self, tokens):
+    def segment(self, tokens, delimeter=u'/'):
        self.tagger.clear()
        for token in tokens:
            self.tagger.add(token.encode('utf-8'))
@@ -36,8 +36,9 @@ def segment(self, tokens):
            word.append(tokens[i])
        if word:
            words.append(''.join(word))
-        print u' '.join(words)
-        print ''.join(tokens)
+        if self.verbose:
+            print ''.join(tokens)
+        print delimeter.join(words)

    def __call__(self, tokens):
        self.segment(tokens)
48 changes: 1 addition & 47 deletions segment/tagging/cleanup.py
@@ -21,53 +21,7 @@
from __future__ import with_statement
import sys
import codecs
-from curses import ascii
-
-def is_zh(ch):
-    """return True if ch is a Chinese character.
-    full-width punctuation and Latin characters are not counted.
-    """
-    x = ord(ch)
-    # CJK Radicals Supplement and Kangxi radicals
-    if 0x2e80 <= x <= 0x2fef:
-        return True
-    # CJK Unified Ideographs Extension A
-    elif 0x3400 <= x <= 0x4dbf:
-        return True
-    # CJK Unified Ideographs
-    elif 0x4e00 <= x <= 0x9fbb:
-        return True
-    # CJK Compatibility Ideographs
-    elif 0xf900 <= x <= 0xfad9:
-        return True
-    # CJK Unified Ideographs Extension B
-    elif 0x20000 <= x <= 0x2a6df:
-        return True
-    else:
-        return False
-
-def is_punct(ch):
-    x = ord(ch)
-    # in informal text, space is sometimes used as punctuation.
-    if x < 127 and ascii.ispunct(x):
-        return True
-    # General Punctuation
-    elif 0x2000 <= x <= 0x206f:
-        return True
-    # CJK Symbols and Punctuation
-    elif 0x3000 <= x <= 0x303f:
-        return True
-    # Halfwidth and Fullwidth Forms
-    elif 0xff00 <= x <= 0xffef:
-        return True
-    # CJK Compatibility Forms
-    elif 0xfe30 <= x <= 0xfe4f:
-        return True
-    else:
-        return False
-
-def is_terminator(ch):
-    return ch in (u'!', u'?', u',', u';', u'.', u'！', u'？', u'，', u'。', u'…')
+from hanzi_util import is_terminator, is_punct, is_zh

def split_into_sentences(line):
    tokens = []
60 changes: 60 additions & 0 deletions segment/tagging/hanzi_util.py
@@ -0,0 +1,60 @@
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from curses import ascii

def is_zh(ch):
    """return True if ch is a Chinese character.
    full-width punctuation and Latin characters are not counted.
    """
    x = ord(ch)
    # CJK Radicals Supplement and Kangxi radicals
    if 0x2e80 <= x <= 0x2fef:
        return True
    # CJK Unified Ideographs Extension A
    elif 0x3400 <= x <= 0x4dbf:
        return True
    # CJK Unified Ideographs
    elif 0x4e00 <= x <= 0x9fbb:
        return True
    # CJK Compatibility Ideographs
    elif 0xf900 <= x <= 0xfad9:
        return True
    # CJK Unified Ideographs Extension B
    elif 0x20000 <= x <= 0x2a6df:
        return True
    else:
        return False

def is_punct(ch):
    x = ord(ch)
    # in informal text, space is sometimes used as punctuation.
    if x < 127 and ascii.ispunct(x):
        return True
    # General Punctuation
    elif 0x2000 <= x <= 0x206f:
        return True
    # CJK Symbols and Punctuation
    elif 0x3000 <= x <= 0x303f:
        return True
    # Halfwidth and Fullwidth Forms
    elif 0xff00 <= x <= 0xffef:
        return True
    # CJK Compatibility Forms
    elif 0xfe30 <= x <= 0xfe4f:
        return True
    else:
        return False

def is_terminator(ch):
    return ch in (u'!', u'?', u',', u';', u'.', u'！', u'？', u'，', u'。', u'…')

if __name__ == "__main__":
    s = u"hehe, 我爱北京天安门。"
    predicates = {u'zh': is_zh,
                  u'punct': is_punct,
                  u'term': is_terminator}
    fmt_str = ' %2s %-7s %-7s %-7s'
    print fmt_str % tuple(['char'] + predicates.keys())
    for c in s:
        print fmt_str % tuple([c] + [str(pred(c)) for pred in predicates.values()])
