Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Extracted all functions to the module level; no more classes
- Loading branch information
Sumin Byeon
committed
Sep 22, 2014
1 parent
1ea7fb9
commit 18edce3
Showing
7 changed files
with
188 additions
and
127 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# -*- coding: utf8 -*- | ||
|
||
from __init__ import deprecated | ||
|
||
|
||
def separate(ch):
    """Decompose a precomposed Hangul syllable into its jamo offsets.

    :param ch: a single character. The arithmetic assumes it lies in the
        precomposed Hangul syllable block that starts at U+AC00; this is
        not validated here.
    :return: a ``(choseong, joongseong, jongseong)`` tuple of integer
        offsets for the initial consonant, the medial vowel and the final
        consonant.
    """
    # A syllable is encoded as
    #     ((choseong * 21) + joongseong) * 28 + jongseong + 0xac00
    # so two divmod steps undo it. divmod floors, which keeps every result
    # an integer on Python 3 as well (plain ``/`` would produce floats).
    uindex = ord(ch) - 0xac00
    rest, jongseong = divmod(uindex, 28)
    choseong, joongseong = divmod(rest, 21)

    return (choseong, joongseong, jongseong)
|
||
|
||
@deprecated
def synthesize(choseong, joongseong, jongseong):
    """Deprecated alias of ``build``; forwards its arguments unchanged."""
    syllable = build(choseong, joongseong, jongseong)
    return syllable
|
||
|
||
def build(choseong, joongseong, jongseong):
    """Compose a precomposed Hangul syllable from jamo offsets.

    ``choseong``, ``joongseong`` and ``jongseong`` are zero-based offsets of
    the initial consonant, the medial vowel and the final consonant. For
    example, as an initial consonant 'ㄱ' is 0, 'ㄲ' is 1, 'ㄴ' is 2, and so
    on and so forth.

    :return: the one-character unicode string for the combined syllable.
    """
    try:
        to_char = unichr  # Python 2: chr() only covers 0..255 there
    except NameError:
        to_char = chr  # Python 3: unichr() was merged into chr()

    return to_char((choseong * 21 + joongseong) * 28 + jongseong + 0xac00)
|
||
|
||
def dooeum(previous, current):
    """Apply the Korean initial sound rule (dooeum beopchik) to a syllable.

    :param previous: the character that precedes ``current``. Only its
        final-consonant offset is consulted, and only for the '렬'/'률'
        rule.
    :param current: the Hangul syllable to adjust.
    :return: ``current`` rebuilt with its initial consonant shifted when one
        of the rules below matches, or rebuilt unchanged otherwise.
    """
    # ``separate`` is a module-level function; the ``Hangul`` class that
    # used to hold it no longer exists.
    p, c = separate(previous), separate(current)
    offset = 0

    # The Sino-Korean readings '녀, 뇨, 뉴, 니' and '랴, 려, 례, 료, 류, 리'
    # are pronounced '여, 요, 유, 이' and '야, 여, 예, 요, 유, 이' when they
    # come at the beginning of a word.
    if current in (u'녀', u'뇨', u'뉴', u'니'):
        offset = 9  # initial ㄴ (2) -> ㅇ (11)
    elif current in (u'랴', u'려', u'례', u'료', u'류', u'리'):
        offset = 6  # initial ㄹ (5) -> ㅇ (11)
    # The Sino-Korean readings '라, 래, 로, 뢰, 루, 르' are pronounced
    # '나, 내, 노, 뇌, 누, 느' when they come at the beginning of a word.
    elif current in (u'라', u'래', u'로', u'뢰', u'루', u'르'):
        offset = -3  # initial ㄹ (5) -> ㄴ (2)
    # '렬, 률' following a vowel or a final ㄴ are pronounced '열, 율'.
    # Final-consonant offsets are numbered differently from initial ones:
    # 0 means "no final consonant" (the syllable ends in a vowel) and a
    # final ㄴ is 4 (offset 2 would be a final ㄲ).
    elif current in (u'렬', u'률') and p[2] in (0, 4):
        offset = 6  # initial ㄹ (5) -> ㅇ (11)

    return build(c[0] + offset, c[1], c[2])
|
||
|
||
def is_hangul(ch):
    """Tell whether ``ch`` is a precomposed Hangul syllable.

    :param ch: a single character, ``None``, or an empty string.
    :return: ``True`` when the code point of ``ch`` lies in the range
        U+AC00..U+D7A3, ``False`` otherwise (including for ``None`` and for
        the empty string).
    """
    # ord() rejects empty strings; treat them like None rather than letting
    # a TypeError escape from a simple predicate.
    if not ch:
        return False

    return 0xac00 <= ord(ch) <= 0xd7a3
|
||
|
||
def contains_hangul(text):
    """Tell whether ``text`` contains at least one Hangul syllable.

    :param text: an iterable of single characters, typically a unicode
        string.
    :return: ``True`` if ``is_hangul`` accepts any character of ``text``;
        ``False`` otherwise, including when ``text`` is empty.
    """
    # any() stops at the first match and, unlike reduce() without an
    # initial value, is well defined for empty input. It is also a builtin
    # on both Python 2 and Python 3.
    return any(is_hangul(c) for c in text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# -*- coding: utf8 -*- | ||
"""두음법칙에 관련된 내용은 | ||
http://ko.wikipedia.org/wiki/%EB%91%90%EC%9D%8C_%EB%B2%95%EC%B9%99 를 참고.""" | ||
|
||
from pairs import table as hanja_table | ||
from hangul import dooeum | ||
|
||
|
||
def translate_syllable(previous, current):
    """Translate one character, given the character that precedes it.

    A character that is not a key of ``hanja_table`` is returned untouched.
    Otherwise it is replaced by its table entry; when ``previous`` is not
    itself a key of the table, that entry is first passed through
    ``dooeum`` together with ``previous``.
    """
    if current not in hanja_table:
        return current

    reading = hanja_table[current]
    if previous in hanja_table:
        return reading

    return dooeum(previous, reading)
|
||
|
||
def split_hanja(text):
    """Split ``text`` into alternating runs of Hanja and non-Hanja.

    :param text: the string to split.
    :return: a generator yielding consecutive substrings of ``text``; all
        characters within one substring give the same ``is_hanja`` result.
        An empty ``text`` is yielded once, as is.
    """
    # Imported locally so this function does not rely on a module-level
    # import being present.
    from itertools import groupby

    if len(text) == 0:
        yield text
        return

    # ``is_hanja`` is a module-level function; the ``Hanja`` class the
    # previous code referred to no longer exists. groupby() starts a new
    # group each time the key changes, which is exactly the run boundary.
    for _, run in groupby(text, key=is_hanja):
        yield ''.join(run)
|
||
|
||
def translate(text, mode):
    """Translate ``text`` chunk by chunk and join the results.

    ``text`` is cut into pieces by ``split_hanja``; each piece is handed to
    ``translate_word`` together with ``mode`` and the outputs are
    concatenated in order.
    """
    translated = [translate_word(chunk, mode) for chunk in split_hanja(text)]
    return ''.join(translated)
|
||
|
||
def translate_word(word, mode,
                   format='<span class="hanja">%s</span><span class="hangul">(%s)</span>'):
    """Translate one chunk of text, character by character.

    :param word: the chunk to translate. An empty chunk is returned as is.
    :param mode: combination | substitution. Only ``'combination'`` is
        checked for explicitly; any other value returns the translation
        alone.
    :param format: %-style template with two ``%s`` slots, filled with the
        original ``word`` and its translation, in that order. Used only in
        ``'combination'`` mode when the chunk starts with a Hanja
        character.
    :return: the translated chunk, wrapped in ``format`` where applicable.
    """
    # Nothing to translate, and ``word[0]`` below would raise IndexError.
    if not word:
        return word

    # Pair every character with its predecessor; a space stands in for the
    # missing predecessor of the first character.
    predecessors = u' ' + word[:-1]
    tw = ''.join(translate_syllable(prev, cur)
                 for prev, cur in zip(predecessors, word))

    if mode == 'combination' and is_hanja(word[0]):
        return format % (word, tw)
    return tw
|
||
|
||
def is_hanja(ch):
    """Report whether ``ch`` counts as a Chinese character.

    The check is plain membership: ``ch`` qualifies when it is a key of
    ``hanja_table``.
    """
    known = ch in hanja_table
    return known
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
pytest | ||
pytest-cov | ||
coveralls |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# -*- coding: utf8 -*- | ||
import pytest | ||
from hanja import hangul, hanja | ||
|
||
|
||
def test_separation():
    """``hangul.separate`` returns the expected jamo offsets."""
    cases = (
        (u'가', (0, 0, 0)),
        (u'까', (1, 0, 0)),
        (u'갸', (0, 2, 0)),
        (u'각', (0, 0, 1)),
    )
    for syllable, expected in cases:
        assert hangul.separate(syllable) == expected
|
||
|
||
def test_build():
    """Offsets (0, 0, 0) compose into the syllable '가'."""
    built = hangul.build(0, 0, 0)
    assert built == u'가'
|
||
|
||
def test_is_hangul():
    """``hangul.is_hangul`` accepts a Hangul syllable and rejects the rest."""
    cases = (
        (u'한', True),
        ('A', False),
        ('1', False),
        (None, False),
    )
    for ch, expected in cases:
        assert hangul.is_hangul(ch) == expected
|
||
|
||
def test_contains_hangul():
    """``hangul.contains_hangul`` detects Hangul anywhere in a string."""
    cases = (
        (u'한국어', True),
        (u'한ABC국어', True),
        (u"Yo, what's up bro?", False),
        (u'1234567890', False),
    )
    for text, expected in cases:
        assert hangul.contains_hangul(text) == expected