Skip to content
  • 3 commits
  • 2 files changed
  • 0 commit comments
  • 1 contributor
Commits on Mar 28, 2012
@hxw hxw [host-tools] fix another python unicode bug
Wrap UTF-8 byte strings in unicode(..., 'utf-8') before formatting, because
Python otherwise treats them as ASCII and throws conversion errors

Signed-off-by: Christopher Hall <hsw@openmoko.com>
a35e784
@hxw hxw [host-tools] some extra prints in verbose mode
to aid debugging a failing build

Signed-off-by: Christopher Hall <hsw@openmoko.com>
59796fd
@hxw hxw [host-tools] drastically truncate Japanese phonetic translations
Each combination step is limited to 100 (initial phrase) * 100 (next phrase),
so the number of translations is clipped to a maximum of 10,000

Example of a phrase that causes the failure (with translation counts):
平親清女・平親清女妹・平親清四女・平親清五女
  120  *   360   *   120   *   120
= 622,080,000 possible translations

Signed-off-by: Christopher Hall <hsw@openmoko.com>
a929a77
Showing with 50 additions and 6 deletions.
  1. +25 −2 host-tools/offline-renderer/ArticleIndex.py
  2. +25 −4 host-tools/offline-renderer/LanguageTranslation.py
View
27 host-tools/offline-renderer/ArticleIndex.py
@@ -156,7 +156,15 @@ def main():
language = language_convert)
for f in args:
+ if verbose:
+ PrintLog.message('process: {0:s}'.format(f))
+ else:
+ pass
limit = processor.process(f, limit)
+ if verbose:
+ PrintLog.message('process: {0:s} returned: {0:s}'.format(limit))
+ else:
+ pass
if limit != 'all' and limit <= 0:
break
@@ -439,7 +447,8 @@ def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
if verbose:
PrintLog.message(u'Non-article Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
- .format(category, key, title, rcategory, rkey, rtitle))
+ .format(unicode(category, 'utf-8'), key, title,
+ unicode(rcategory, 'utf-8'), rkey, rtitle))
return
if '' == rtitle:
@@ -482,7 +491,7 @@ def body(self, category, key, title, text, seek):
if restricted:
self.restricted_count += 1
- if not verbose and self.article_count % 10000 == 0:
+ if self.article_count % 10000 == 0:
start_time = time.time()
PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(start_time - self.time, self.article_count))
self.time = start_time
@@ -496,6 +505,7 @@ def body(self, category, key, title, text, seek):
PrintLog.message(u' --> {0:s}'.format(bad_words))
else:
PrintLog.message(u'Title: {0:s}'.format(title))
+ pass
character_count = len(text)
self.total_character_count += character_count
@@ -508,15 +518,28 @@ def body(self, category, key, title, text, seek):
def resolve_redirects(self):
"""add redirect to article_index"""
+ global verbose
count = 0
+ if verbose:
+ PrintLog.message(u'Resolving redirects')
+ else:
+ pass
for item in self.redirects:
try:
self.set_index(item, self.find(item)[:3] + (True,))
count += 1
+ if verbose and count % 1000 == 0:
+ PrintLog.message(u'Redirects resolved: {0:d}'.format(count))
+ else:
+ pass
except KeyError:
PrintLog.message(u'Unresolved redirect: {0:s} -> {1:s}'.format(item, self.redirects[item]))
except CycleError:
PrintLog.message(u'Cyclic redirect: {0:s} -> {1:s}'.format(item, self.redirects[item]))
+ if verbose:
+ PrintLog.message(u'Total redirects resolved: {0:d}'.format(count))
+ else:
+ pass
return count
View
29 host-tools/offline-renderer/LanguageTranslation.py
@@ -13,7 +13,8 @@
import PinyinTable
try:
import MeCab
-except:
+except Exception as e:
+ print('exception: ', e)
print('error: Missing python module: python-mecab')
print(' sudo apt-get install python-mecab mecab-ipadic-utf8')
exit(1)
@@ -278,20 +279,32 @@ def get_phonetics(self, text):
for c in key:
romaji = romaji + c
result.append(romaji)
+
return result
+ import string
+ punctuation = string.punctuation + u'、・\r\n \t'
+
+
def translate(self, text):
"""take Japanese string and convert to Roman letters"""
result = []
for text in super(type(self), self).translate(text):
- for tt in text.split():
+ split_text = ''.join([ c if not c in self.punctuation else ' ' for c in list(text)]).split()
+ for tt in split_text:
if type(tt) == unicode:
tt = tt.encode('utf-8')
phonetics = self.get_phonetics(tt)
- result = super(type(self), self).append_translations(result, phonetics, ' ')
+ #result = super(type(self), self).append_translations(result, phonetics, ' ')
+ # *** nasty hack to make sure the number of translations does not exceed 10000
+ # *** as some Japanese phrases can have hundreds of millions of possible pronunciations
+ # *** e.g. 平親清女・平親清女妹・平親清四女・平親清五女
+ # *** 120 * 360 * 120 * 120 -> 622,080,000
+ # *** just cut the arrays to the first 100 elements
+ result = super(type(self), self).append_translations(result[:100], phonetics[:100], ' ')
if result is None or [] == result or '' == result:
return ['']
@@ -327,6 +340,14 @@ def main():
('ja2', u'2004年新潟県中越地震 孫正義 孫悟空 孫子 バラク・オバマ スタぴか'),
('ja3', u'Ъ'),
('ja4', u'国際的な協力の下に規制薬物に係る不正行為を助長する行為等の防止を図るための麻薬及び向精神薬取締法等の特例等に関する法律'),
+ ('ja5', u'東京都クラブバスケットボール連盟'),
+ ('ja6a', u'平親清女'),
+ ('ja6b', u'平親清女妹'),
+ ('ja6c', u'平親清四女'),
+ ('ja6d', u'平親清五女'),
+ ('ja6e', u'平親清女・平親清女妹'),
+ ('ja6f', u'平親清女・平親清女妹・平親清四女'),
+ ('ja6z', u'平親清女・平親清女妹・平親清四女・平親清五女'),
('qq', u'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģ'),
('q1', u'ĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſƀƁƂƃƄƅƆƇƈ'),
('q2', u'ƉƊƋƌƍƎƏƐƑƒƓƔƕƖƗƘƙƚƛƜƝƞƟƠơƢƣƤƥƦƧƨƩƪƫƬƭƮƯưƱƲƳƴƵƶƷƸƹƺƻƼƽƾƿǀǁǂǃDŽDždžLJLjljNJNjnjǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯ'),
@@ -346,7 +367,7 @@ def main():
print(u'\nNormal translation\n==================')
test_items(texts, LanguageNormal().translate)
- print(u'\nJapnese translation\n====================')
+ print(u'\nJapanese translation\n====================')
test_items(texts, LanguageJapanese().translate)

No commit comments for this range

Something went wrong with that request. Please try again.