Permalink
Browse files

[host-tools] drastically truncate Japanese phonetic translations

each combination step is limited to the first 100 accumulated translations * the first 100 readings of the next phrase,
so the number of translations is clipped to a maximum of 10,000

example of a phrase that causes failure (with the translation count for each part):
平親清女・平親清女妹・平親清四女・平親清五女
  120  *   360   *   120   *   120
= 622,080,000 possible translations

Signed-off-by: Christopher Hall <hsw@openmoko.com>
  • Loading branch information...
1 parent 59796fd commit a929a77e3a947de9529ee1142b72b2a59e396039 @hxw hxw committed Mar 28, 2012
Showing with 25 additions and 4 deletions.
  1. +25 −4 host-tools/offline-renderer/LanguageTranslation.py
@@ -13,7 +13,8 @@
import PinyinTable
try:
import MeCab
-except:
+except Exception as e:
+ print('exception: ', e)
print('error: Missing python module: python-mecab')
print(' sudo apt-get install python-mecab mecab-ipadic-utf8')
exit(1)
@@ -278,20 +279,32 @@ def get_phonetics(self, text):
for c in key:
romaji = romaji + c
result.append(romaji)
+
return result
+ import string
+ punctuation = string.punctuation + u'、・\r\n \t'
+
+
def translate(self, text):
"""take Japanese string and convert to Roman letters"""
result = []
for text in super(type(self), self).translate(text):
- for tt in text.split():
+ split_text = ''.join([ c if not c in self.punctuation else ' ' for c in list(text)]).split()
+ for tt in split_text:
if type(tt) == unicode:
tt = tt.encode('utf-8')
phonetics = self.get_phonetics(tt)
- result = super(type(self), self).append_translations(result, phonetics, ' ')
+ #result = super(type(self), self).append_translations(result, phonetics, ' ')
+ # *** nasty hack to make sure the number of translations does not exceed 10000
+ # *** as some Japanese phrases can have hundreds of millions of possible pronunciations
+ # *** e.g. 平親清女・平親清女妹・平親清四女・平親清五女
+ # *** 120 * 360 * 120 * 120 -> 622,080,000
+ # *** just cut the arrays to the first 100 elements
+ result = super(type(self), self).append_translations(result[:100], phonetics[:100], ' ')
if result is None or [] == result or '' == result:
return ['']
@@ -327,6 +340,14 @@ def main():
('ja2', u'2004年新潟県中越地震 孫正義 孫悟空 孫子 バラク・オバマ スタぴか'),
('ja3', u'Ъ'),
('ja4', u'国際的な協力の下に規制薬物に係る不正行為を助長する行為等の防止を図るための麻薬及び向精神薬取締法等の特例等に関する法律'),
+ ('ja5', u'東京都クラブバスケットボール連盟'),
+ ('ja6a', u'平親清女'),
+ ('ja6b', u'平親清女妹'),
+ ('ja6c', u'平親清四女'),
+ ('ja6d', u'平親清五女'),
+ ('ja6e', u'平親清女・平親清女妹'),
+ ('ja6f', u'平親清女・平親清女妹・平親清四女'),
+ ('ja6z', u'平親清女・平親清女妹・平親清四女・平親清五女'),
('qq', u'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģ'),
('q1', u'ĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſƀƁƂƃƄƅƆƇƈ'),
('q2', u'ƉƊƋƌƍƎƏƐƑƒƓƔƕƖƗƘƙƚƛƜƝƞƟƠơƢƣƤƥƦƧƨƩƪƫƬƭƮƯưƱƲƳƴƵƶƷƸƹƺƻƼƽƾƿǀǁǂǃDŽDždžLJLjljNJNjnjǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯ'),
@@ -346,7 +367,7 @@ def main():
print(u'\nNormal translation\n==================')
test_items(texts, LanguageNormal().translate)
- print(u'\nJapnese translation\n====================')
+ print(u'\nJapanese translation\n====================')
test_items(texts, LanguageJapanese().translate)

0 comments on commit a929a77

Please sign in to comment.