Permalink
Browse files

[offline-renderer] Changes for language links

Language links that are simple are encoded as:
  en:Just Testing\0

Language links with accents:
  fr#Hiroshi Hara\1Hìrôšhî Hârå\0
  ja#harakouji\1原広司\0

Note: \0 and \1 are binary bytes

Signed-off-by: Christopher Hall <hsw@openmoko.com>
  • Loading branch information...
1 parent b3cded8 commit 3e1658eb146312bdb1b00c962fd70a2a25af96cd @hxw hxw committed Apr 7, 2010
@@ -20,22 +20,13 @@
import FileScanner
import TidyUp
import PrintLog
-
-
-# this _must_ be in ascending ASCII sequence
-KEYPAD_KEYS = """ !#$%&'()*+,-.0123456789=?@abcdefghijklmnopqrstuvwxyz"""
-
-# to check if in order: uncomment and look at result
-#for c in KEYPAD_KEYS:
-# print('{0:d}'.format(ord(c)))
-#sys.exit(0)
+import LanguageTranslation
+import SearchKey
# maximum string lengths for FND file
MAXIMUM_TITLE_LENGTH = 63 # c-code is 64 including '\0'
-# underscore and space
-whitespaces = re.compile(r'([\s_]+)', re.IGNORECASE)
# to catch loop in redirections
class CycleError(Exception):
@@ -63,6 +54,7 @@ def usage(message):
print(' --limit=number Limit the number of articles processed')
print(' --prefix=name Device file name portion for .fnd/.pfx [pedia]')
print(' --templates=file Database for templates [templates.db]')
+ print(' --truncate-title Set to when not using language links to save space')
exit(1)
@@ -72,7 +64,7 @@ def main():
try:
- opts, args = getopt.getopt(sys.argv[1:], 'hvi:o:c:t:l:p:L:',
+ opts, args = getopt.getopt(sys.argv[1:], 'hvi:o:c:t:l:p:L:T',
['help', 'verbose',
'article-index=',
'article-offsets=',
@@ -81,6 +73,7 @@ def main():
'limit=',
'prefix=',
'language=',
+ 'truncate-title',
])
except getopt.GetoptError, err:
usage(err)
@@ -94,6 +87,7 @@ def main():
template_name = 'templates.db'
limit = 'all'
language = 'en' # some languages may require special processing
+ truncate_title = False # set tru when not using language links
for opt, arg in opts:
if opt in ('-v', '--verbose'):
@@ -108,6 +102,8 @@ def main():
cnt_name = arg
elif opt in ('-t', '--templates'):
template_name = arg
+ elif opt in ('-T', '--truncate-title'):
+ truncate_title = True
elif opt in ('-l', '--limit'):
if arg[-1] == 'k':
arg = arg[:-1] + '000'
@@ -129,9 +125,9 @@ def main():
if [] == args:
usage('Missing argument(s)')
- language_convert = LanguageNull()
+ language_convert = LanguageTranslation.LanguageNull()
if 'ja' == language:
- language_convert = Furigana()
+ language_convert = LanguageTranslation.Furigana()
processor = FileProcessing(articles = art_name, offsets = off_name,
templates = template_name,
@@ -170,7 +166,7 @@ def main():
cf.close()
- output_fnd(fnd_name, processor, language_convert)
+ output_fnd(fnd_name, processor, language_convert, truncate_title)
output_pfx(pfx_name)
del processor
@@ -189,14 +185,14 @@ def generate_bigram(text):
if len(text) > 2:
try:
- if text[0].lower() in KEYPAD_KEYS and text[1].lower() in KEYPAD_KEYS:
+ if SearchKey.is_valid_character(text[0]) and SearchKey.is_valid_character(text[1]):
bigram[text[0:2]] += 1
except KeyError:
bigram[text[0:2]] = 1
if len(text) > 4:
try:
- if text[2].lower() in KEYPAD_KEYS and text[3].lower() in KEYPAD_KEYS:
+ if SearchKey.is_valid_character(text[2]) and SearchKey.is_valid_character(text[3]):
bigram[text[2:4]] += 1
except KeyError:
bigram[text[2:4]] = 1
@@ -381,7 +377,6 @@ def title(self, category, key, title, seek):
def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
- global whitespaces
global verbose
title = self.translate(title).strip(u'\u200e\u200f')
@@ -396,7 +391,7 @@ def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
except UnicodeDecodeError:
pass
- rtitle = whitespaces.sub(' ', rtitle).strip().lstrip(':')
+ rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()
if self.KEY_TEMPLATE == key:
if title != rtitle:
@@ -532,22 +527,15 @@ def find(self, title, level = 0):
return result
-import unicodedata
-def strip_accents(s):
- if type(s) == str:
- s = unicode(s, 'utf-8')
- return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
-
-
def bigram_encode(title):
+ """encode a title in bigram form"""
global bigram
- global whitespaces
result = ''
- title = strip_accents(title)
+ title = SearchKey.strip_accents(title)
while len(title) >= 2:
- if title[0].lower() in KEYPAD_KEYS:
+ if SearchKey.is_valid_character(title[0]):
b = title[0:2]
if b in bigram:
@@ -560,18 +548,15 @@ def bigram_encode(title):
#result += '?'
title = title[1:]
if len(title) == 1:
- if title[0].lower() in KEYPAD_KEYS:
+ if SearchKey.is_valid_character(title[0]):
result += chr(ord(title[0]))
#else:
# result += '?'
- # compact all spaces
- result = whitespaces.sub(' ', result).strip()
-
- return result
+ return SearchKey.compact_spaces(result)
-def output_fnd(filename, article_index, language_processor):
+def output_fnd(filename, article_index, language_processor, truncate_title):
"""create bigram table"""
global bigram
global index_matrix
@@ -606,19 +591,11 @@ def output_fnd(filename, article_index, language_processor):
#article_list = [strip_accents(k) for k in article_index.keys()]
#article_list.sort(key = lambda x: strip_accents(x).lower())
- def sort_key(key):
- global KEYPAD_KEYS
- global whitepaces
-
- result = ''.join(c for c in strip_accents(language_processor.translate(key).lower()) if c in KEYPAD_KEYS)
- # compact all spaces
- result = whitespaces.sub(' ', result).strip()
- return result
-
PrintLog.message(u'Sorting titles')
start_time = time.time()
- article_list = [ (sort_key(title), title) for title in article_index.all_indices() ]
+ article_list = [ (SearchKey.make_key(language_processor.translate(title)), title)
+ for title in article_index.all_indices() ]
article_list.sort()
PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
@@ -641,7 +618,9 @@ def sort_key(key):
if '' == bigram_title and is_redirect:
continue
- utf8_title = title.encode('utf-8')[:MAXIMUM_TITLE_LENGTH]
+ utf8_title = title.encode('utf-8')
+ if truncate_title:
+ utf8_title = utf8_title[:MAXIMUM_TITLE_LENGTH]
offset = out_f.tell()
article_index.set_index(title, (article_number, offset, restricted, is_redirect))
@@ -697,7 +676,7 @@ def output_pfx(filename):
PrintLog.message(u'Writing: {0:s}'.format(filename))
start_time = time.time()
out_f = open(filename, 'w')
- list = '\0' + KEYPAD_KEYS
+ list = '\0' + SearchKey.all_characters()
for k1 in list:
for k2 in list:
for k3 in list:
@@ -712,146 +691,6 @@ def output_pfx(filename):
PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
-class LanguageProcessor(object):
-
- def translate(self, text):
- PrintLog.message('Virual function called')
- sys.exit(1)
-
-
-
-class LanguageNull(LanguageProcessor):
- """no-op class"""
- def translate(self, text):
- """null translation => only strip spaces"""
- return text.strip()
-
-
-class Furigana(LanguageProcessor):
- """Convert Japanese to Romaji"""
-
- KANA_TO_ROMAN = {
-
- u'': 'a', u'': 'i', u'': 'u', u'': 'e', u'': 'o',
- u'': 'ka', u'': 'ki', u'': 'ku', u'': 'ke', u'': 'ko',
- u'': 'ga', u'': 'gi', u'': 'gu', u'': 'ge', u'': 'go',
-
- u'': 'sa', u'': 'shi', u'': 'su', u'': 'se', u'': 'so',
- u'': 'za', u'': 'ji', u'': 'zu', u'': 'ze', u'': 'zo',
- u'': 'ta', u'': 'chi', u'': 'tsu', u'': 'te', u'': 'to',
-
- u'': 'da', u'': 'di', u'': 'du', u'': 'de', u'': 'do',
- u'': 'na', u'': 'ni', u'': 'nu', u'': 'ne', u'': 'no',
- u'': 'ha', u'': 'hi', u'': 'fu', u'': 'he', u'': 'ho',
-
- u'': 'ba', u'': 'bi', u'': 'bu', u'': 'be', u'': 'bo',
- u'': 'pa', u'': 'pi', u'': 'pu', u'': 'pe', u'': 'po',
- u'': 'ma', u'': 'mi', u'': 'mu', u'': 'me', u'': 'mo',
-
- u'': 'ya', u'': 'yu', u'': 'yo',
- u'': 'ra', u'': 'ri', u'': 'ru', u'': 're', u'': 'ro',
- u'': 'wa', u'': 'wo',
-
- u'': 'nn',
-
- u'': '-',
-
- u'ウァ': 'wha', u'ウィ': 'whi', u'ウェ': 'whe', u'ウォ': 'who',
- u'ヴァ': 'va', u'ヴィ': 'vi', u'': 'vu', u'ヴェ': 've', u'ヴォ': 'vo',
- u'チャ': 'cya', u'チィ': 'cyi', u'チュ': 'cyu', u'チェ': 'cye', u'チョ': 'cyo',
-
- u'ニャ': 'nya', u'ニィ': 'nyi', u'ニュ': 'nyu', u'ニェ': 'nye', u'ニョ': 'nyo',
- u'シャ': 'sya', u'シィ': 'syi', u'シュ': 'syu', u'シェ': 'sye', u'ショ': 'syo',
- u'キァ': 'kya', u'キィ': 'kyi', u'キュ': 'kyu', u'キェ': 'kye', u'キョ': 'kyo',
-
- u'テャ': 'tha', u'ティ': 'thi', u'テュ': 'thu', u'テェ': 'the', u'テョ': 'tho',
- u'ヒャ': 'hya', u'ヒィ': 'hyi', u'ヒュ': 'hyu', u'ヒェ': 'hye', u'ヒョ': 'hyo',
- u'ミャ': 'mya', u'ミィ': 'myi', u'ミュ': 'myu', u'ミェ': 'mye', u'ミョ': 'myo',
-
- u'リャ': 'rya', u'リィ': 'ryi', u'リュ': 'ryu', u'リェ': 'rye', u'リョ': 'ryo',
- u'ジャ': 'ja', u'ジィ': 'jyi', u'ジュ': 'ju', u'ジェ': 'je' , u'ジョ': 'jo',
- u'ギャ': 'gya', u'ギィ': 'gyi', u'ギュ': 'gyu', u'ギェ': 'gye', u'ギョ': 'gyo',
-
- u'ビャ': 'bya', u'ビィ': 'byi', u'ビュ': 'byu', u'ビェ': 'bye', u'ビョ': 'byo',
- u'ピャ': 'pya', u'ピィ': 'pyi', u'ピュ': 'pyu', u'ピェ': 'pye', u'ピョ': 'pyo',
- u'クァ': 'kha', u'クィ': 'khi', u'クゥ': 'khu', u'クェ': 'khe', u'クォ': 'kho',
-
- u'グァ': 'gha', u'グィ': 'ghi', u'グゥ': 'ghu', u'グェ': 'ghe', u'グォ': 'gho',
- u'ファ': 'fa', u'フィ': 'fi', u'フェ': 'fe', u'フォ': 'fo',
- u'フャ': 'fya', u'フュ': 'fyu', u'フョ': 'fyo',
-
- u'デァ': 'dha', u'ディ': 'dhi', u'デュ': 'dhu', u'デェ': 'dhe', u'デョ': 'dho',
- u'ツァ': 'tsa', u'ツィ': 'tsi', u'ツェ': 'tse', u'ツォ': 'tso',
- }
-
-
- def __init__(self, *args, **kw):
- super(Furigana, self).__init__(*args, **kw)
-
- import MeCab # load Japanese dictionary interface
-
- self.mecab = MeCab.Tagger('-Ochasen')
-
-
- def romanise(self, text):
- """private method for converting Japanese phonetics to Romaji"""
-
- if type(text) != unicode:
- text = unicode(text, "utf-8")
-
- result = ''
- i = 0
- duplicate = False
- last = len(text) - 1
- while i <= last:
- key = text[i:i + 2] # extract a pair of phonetics
- if not (i < last and key in self.KANA_TO_ROMAN):
- key = text[i]
-
- if key in self.KANA_TO_ROMAN:
- s = self.KANA_TO_ROMAN[key]
- i += len(key) - 1
- if duplicate:
- s = s[0] + s
- duplicate = False
- result += s
- elif u'' == key:
- duplicate = True
- else:
- result += key
- duplicate = False
- i += 1
-
- return result
-
-
- def translate(self, text):
- """take Japanese string and convert to Roman letters"""
-
- if type(text) == unicode:
- text = text.encode('utf-8')
- n = self.mecab.parseToNode(text)
-
- result = ''
- while n:
-
- if n.surface == '':
- n = n.next
- continue
-
- feature = unicode(n.feature,'utf-8').split(',')
-
- if len(feature) < 8 or feature[7] == '*':
- r = self.romanise(n.surface)
- else:
- r = self.romanise(feature[7])
-
- result += r + " "
- n = n.next
-
- return result.strip()
-
-
# run the program
if __name__ == "__main__":
main()
@@ -21,6 +21,9 @@
import bucket
import PrintLog
import gd
+import littleparser
+import LanguageTranslation
+import SearchKey
verbose = False
warnings = False
@@ -1145,9 +1148,21 @@ def write_article(language_links):
# create language links
links_stream = io.BytesIO('')
+ japanese_convert = LanguageTranslation.Furigana().translate
+ translate = littleparser.LittleParser().translate
for l in language_links:
- links_stream.write(l.encode('utf-8') + '\0')
+ language, link = l.split(':', 1)
+
+ if 'ja' == language:
+ stripped = SearchKey.strip_accents(japanese_convert(link))
+ else:
+ stripped = SearchKey.strip_accents(link)
+
+ if link == stripped:
+ links_stream.write(l.encode('utf-8') + '\0')
+ else:
+ links_stream.write((language + '#' + stripped).encode('utf-8') + '\1' + link.encode('utf-8') + '\0')
links_stream.flush()
langs = links_stream.getvalue()
Oops, something went wrong.

0 comments on commit 3e1658e

Please sign in to comment.