From 18edce33654ecf90305bfeb3f802478c67c606ef Mon Sep 17 00:00:00 2001 From: Sumin Byeon Date: Mon, 22 Sep 2014 09:18:14 +0000 Subject: [PATCH] Extracted all functions to the module level; no more classes --- README.rst | 28 +++++------ hanja/__init__.py | 120 +++++++------------------------------------- hanja/hangul.py | 59 ++++++++++++++++++++++ hanja/hanja.py | 64 +++++++++++++++++++++++ requirements.txt | 3 ++ setup.py | 13 ++--- tests/test_basic.py | 28 +++++++++++ 7 files changed, 188 insertions(+), 127 deletions(-) create mode 100644 hanja/hangul.py create mode 100644 hanja/hanja.py create mode 100644 requirements.txt create mode 100644 tests/test_basic.py diff --git a/README.rst b/README.rst index e781714..8253aad 100644 --- a/README.rst +++ b/README.rst @@ -11,7 +11,7 @@ Installation .. code-block:: console - sudo pip install hanja + pip install hanja Usage @@ -20,14 +20,14 @@ Usage 한글 초성, 중성, 종성 분리 `````````````````````````` ->>> Hangul.separate(u'가') +>>> hangul.separate(u'가') (0, 0, 0) ->>> Hangul.separate(u'까') +>>> hangul.separate(u'까') (1, 0, 0) 튜플(tuple)의 마지막 원소가 0이면 종성이 없는 글자라고 판단할 수 있다. ->>> Hangul.separate(u'한') +>>> hangul.separate(u'한') (18, 0, 4) 'ㅎ'은 19번째 자음, 'ㅏ'는 첫번째 모음, 'ㄴ'은 다섯번째 자음이라는 것을 알 수 있다. @@ -36,18 +36,18 @@ Usage 초성, 중성, 종성을 조합하여 한 글자를 만듦 `````````````````````````````````````````` ->>> Hangul.synthesize(0, 0, 0) +>>> hangul.build(0, 0, 0) u'\uac00' ->>> print Hangul.synthesize(0, 0, 0) +>>> print hangul.build(0, 0, 0) 가 주어진 글자가 한글인지의 여부를 판별 ```````````````````````````````````` ->>> Hangul.is_hangul(u'가') +>>> hangul.is_hangul(u'가') True ->>> Hangul.is_hangul(u'a') +>>> hangul.is_hangul(u'a') False @@ -56,19 +56,19 @@ False 리스트가 아닌 제네레이터(generator)를 반환한다. ->>> '|'.join(Hanja.split_hanja(u'大韓民國은 民主共和國이다.')) +>>> '|'.join(hanja.split_hanja(u'大韓民國은 民主共和國이다.')) 大韓民國|은 |民主共和國|이다. 
->>> [x for x in Hanja.split_hanja(u'大韓民國은 民主共和國이다.')] +>>> [x for x in hanja.split_hanja(u'大韓民國은 民主共和國이다.')] [u'\u5927\u97d3\u6c11\u570b', u'\uc740 ', u'\u6c11\u4e3b\u5171\u548c\u570b', u'\uc774\ub2e4.'] 주어진 글자가 한자인지의 여부를 판별 ```````````````````````````````````` ->>> Hanja.is_hanja(u'韓') +>>> hanja.is_hanja(u'韓') True ->>> Hanja.is_hanja(u'한') +>>> hanja.is_hanja(u'한') False 문장 변환 @@ -76,10 +76,10 @@ False 치환 모드 변환: ->>> Hanja.translate(u'大韓民國은 民主共和國이다.', 'substitution') +>>> hanja.translate(u'大韓民國은 民主共和國이다.', 'substitution') 대한민국은 민주공화국이다. 혼용 모드 변환: ->>> Hanja.translate(u'大韓民國은 民主共和國이다.', 'combination') +>>> hanja.translate(u'大韓民國은 民主共和國이다.', 'combination') 大韓民國(대한민국)民主共和國(민주공화국)이다. diff --git a/hanja/__init__.py b/hanja/__init__.py index e680a2b..87307f9 100644 --- a/hanja/__init__.py +++ b/hanja/__init__.py @@ -2,106 +2,20 @@ __author__ = 'Sumin Byeon' __email__ = 'suminb@gmail.com' -__version__ = '0.9.0' - -from pairs import table as hanja_table - -class Hangul: - @staticmethod - def separate(ch): - """한글 자모 분리. 주어진 한글 한 글자의 초성, 중성 초성을 반환함.""" - uindex = ord(ch) - 0xac00 - jongseong = uindex % 28 - joongseong = ((uindex - jongseong) / 28) % 21 - choseong = ((uindex - jongseong) / 28) / 21 - - return (choseong, joongseong, jongseong) - - @staticmethod - def synthesize(choseong, joongseong, jongseong): - """초성, 중성, 종성을 조합하여 완성형 한 글자를 만듦. 'choseong', 'joongseong', 'jongseong' are offsets. For example, 'ㄱ' is 0, 'ㄲ' is 1, 'ㄴ' is 2, and so on and so fourth.""" - return unichr(((((choseong) * 21) + joongseong) * 28) + jongseong + 0xac00) - - @staticmethod - def dooeum(previous, current): - """두음법칙을 적용하기 위한 함수.""" - p, c = Hangul.separate(previous), Hangul.separate(current) - offset = 0 - - # 한자음 '녀, 뇨, 뉴, 니', '랴, 려, 례, 료, 류, 리'가 단어 첫머리에 올 때 '여, 요, 유, 이', '야, 여, 예, 요, 유, 이'로 발음한다. - if current in (u'녀', u'뇨', u'뉴', u'니'): - offset = 9 - elif current in (u'랴', u'려', u'례', u'료', u'류', u'리'): - offset = 6 - # 한자음 '라, 래, 로, 뢰, 루, 르'가 단어 첫머리에 올 때 '나, 내, 노, 뇌, 누, 느'로 발음한다. 
- elif current in (u'라', u'래', u'로', u'뢰', u'루', u'르'): - offset = -3 - # 모음이나 ㄴ 받침 뒤에 이어지는 '렬, 률'은 '열, 율'로 발음한다. - elif current in (u'렬', u'률') and p[2] in (0, 2): - offset = 6 - - return Hangul.synthesize(c[0]+offset, c[1], c[2]) - - @staticmethod - def is_hangul(ch): - return ord(ch) >= 0xac00 and ord(ch) <= 0xd7a3 - -class Hanja: - """두음법칙에 관련된 내용은 http://ko.wikipedia.org/wiki/%EB%91%90%EC%9D%8C_%EB%B2%95%EC%B9%99 를 참고.""" - - @staticmethod - def translate_syllable(previous, current): - if current in hanja_table: - if previous in hanja_table: - return hanja_table[current] - else: - return Hangul.dooeum(previous, hanja_table[current]) - - return current - - @staticmethod - def split_hanja(text): - """주어진 문장을 한자로 된 구역과 그 이외의 문자로 된 구역으로 분리""" - - # TODO: Can we make this a bit prettier? - if len(text) == 0: - yield text - else: - ch = text[0] - bucket = [ch] - prev_state = Hanja.is_hanja(ch) - - for ch in text[1:]: - state = Hanja.is_hanja(ch) - - if prev_state != state: - yield ''.join(bucket) - bucket = [ch] - else: - bucket.append(ch) - - prev_state = state - - yield ''.join(bucket) - - - @staticmethod - def translate(text, mode): - return ''.join(map(lambda w: Hanja.translate_word(w, mode), Hanja.split_hanja(text))) - - @staticmethod - def translate_word(word, mode, format='%s(%s)'): - """ - ``mode``: combination | substitution - """ - tw = ''.join(map(Hanja.translate_syllable, u' '+word[:-1], word)) - - if mode == 'combination' and Hanja.is_hanja(word[0]) == 1: - return format % (word, tw) - else: - return tw - - @staticmethod - def is_hanja(ch): - """Determines if a given character ``ch`` is a Chinese character.""" - return ch in hanja_table +__version__ = '0.10.0' + +import warnings + +# Copied from https://wiki.python.org/moin/PythonDecoratorLibrary +def deprecated(func): + '''This is a decorator which can be used to mark functions + as deprecated. 
It will result in a warning being emitted + when the function is used.''' + def new_func(*args, **kwargs): + warnings.warn("Call to deprecated function {}.".format(func.__name__), + category=DeprecationWarning) + return func(*args, **kwargs) + new_func.__name__ = func.__name__ + new_func.__doc__ = func.__doc__ + new_func.__dict__.update(func.__dict__) + return new_func diff --git a/hanja/hangul.py b/hanja/hangul.py new file mode 100644 index 0000000..d49e503 --- /dev/null +++ b/hanja/hangul.py @@ -0,0 +1,59 @@ +# -*- coding: utf8 -*- + +from __init__ import deprecated + + +def separate(ch): + """한글 자모 분리. 주어진 한글 한 글자의 초성, 중성 초성을 반환함.""" + uindex = ord(ch) - 0xac00 + jongseong = uindex % 28 + joongseong = ((uindex - jongseong) / 28) % 21 + choseong = ((uindex - jongseong) / 28) / 21 + + return (choseong, joongseong, jongseong) + + +@deprecated +def synthesize(choseong, joongseong, jongseong): + return build(choseong, joongseong, jongseong) + + +def build(choseong, joongseong, jongseong): + """초성, 중성, 종성을 조합하여 완성형 한 글자를 만듦. 'choseong', + 'joongseong', 'jongseong' are offsets. For example, 'ㄱ' is 0, 'ㄲ' is 1, + 'ㄴ' is 2, and so on and so fourth.""" + return unichr(((((choseong) * 21) + joongseong) * 28) + jongseong + 0xac00) + + +def dooeum(previous, current): + """두음법칙을 적용하기 위한 함수.""" + p, c = Hangul.separate(previous), Hangul.separate(current) + offset = 0 + + # 한자음 '녀, 뇨, 뉴, 니', '랴, 려, 례, 료, 류, 리'가 단어 첫머리에 올 때 + # '여, 요, 유, 이', '야, 여, 예, 요, 유, 이'로 발음한다. + if current in (u'녀', u'뇨', u'뉴', u'니'): + offset = 9 + elif current in (u'랴', u'려', u'례', u'료', u'류', u'리'): + offset = 6 + # 한자음 '라, 래, 로, 뢰, 루, 르'가 단어 첫머리에 올 때 '나, 내, 노, 뇌, + # 누, 느'로 발음한다. + elif current in (u'라', u'래', u'로', u'뢰', u'루', u'르'): + offset = -3 + # 모음이나 ㄴ 받침 뒤에 이어지는 '렬, 률'은 '열, 율'로 발음한다. 
+ elif current in (u'렬', u'률') and p[2] in (0, 2): + offset = 6 + + return build(c[0]+offset, c[1], c[2]) + + +def is_hangul(ch): + if ch is None: + return False + else: + return ord(ch) >= 0xac00 and ord(ch) <= 0xd7a3 + + +def contains_hangul(text): + # NOTE: Probably not an ideal solution in terms of performance + return reduce(lambda x, y: x or y, map(lambda c: is_hangul(c), text)) diff --git a/hanja/hanja.py b/hanja/hanja.py new file mode 100644 index 0000000..b34191a --- /dev/null +++ b/hanja/hanja.py @@ -0,0 +1,64 @@ +# -*- coding: utf8 -*- +"""두음법칙에 관련된 내용은 +http://ko.wikipedia.org/wiki/%EB%91%90%EC%9D%8C_%EB%B2%95%EC%B9%99 를 참고.""" + +from pairs import table as hanja_table +from hangul import dooeum + + +def translate_syllable(previous, current): + if current in hanja_table: + if previous in hanja_table: + return hanja_table[current] + else: + return dooeum(previous, hanja_table[current]) + + return current + + +def split_hanja(text): + """주어진 문장을 한자로 된 구역과 그 이외의 문자로 된 구역으로 분리""" + + # TODO: Can we make this a bit prettier? 
+ if len(text) == 0: + yield text + else: + ch = text[0] + bucket = [ch] + prev_state = Hanja.is_hanja(ch) + + for ch in text[1:]: + state = Hanja.is_hanja(ch) + + if prev_state != state: + yield ''.join(bucket) + bucket = [ch] + else: + bucket.append(ch) + + prev_state = state + + yield ''.join(bucket) + + +def translate(text, mode): + return ''.join(map(lambda w: translate_word(w, mode), + split_hanja(text))) + + +def translate_word(word, mode, + format='%s(%s)'): + """ + :param mode: combination | substitution + """ + tw = ''.join(map(translate_syllable, u' '+word[:-1], word)) + + if mode == 'combination' and is_hanja(word[0]) == 1: + return format % (word, tw) + else: + return tw + + +def is_hanja(ch): + """Determines if a given character ``ch`` is a Chinese character.""" + return ch in hanja_table diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c7c53a1 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +pytest +pytest-cov +coveralls diff --git a/setup.py b/setup.py index 6471084..41e8ee2 100644 --- a/setup.py +++ b/setup.py @@ -5,15 +5,8 @@ def readme(): - try: - f = open('README.rst') - content = f.read() - f.close() - return content - except IOError: - pass - except OSError: - pass + with open('README.rst') as f: + return f.read() setup(name='hanja', @@ -25,4 +18,4 @@ def readme(): author_email=hanja.__email__, url='http://github.com/suminb/hanja', packages=[], - ) +) diff --git a/tests/test_basic.py b/tests/test_basic.py new file mode 100644 index 0000000..6909f37 --- /dev/null +++ b/tests/test_basic.py @@ -0,0 +1,28 @@ +# -*- coding: utf8 -*- +import pytest +from hanja import hangul, hanja + + +def test_separation(): + assert hangul.separate(u'가') == (0, 0, 0) + assert hangul.separate(u'까') == (1, 0, 0) + assert hangul.separate(u'갸') == (0, 2, 0) + assert hangul.separate(u'각') == (0, 0, 1) + + +def test_build(): + assert hangul.build(0, 0, 0) == u'가' + + +def test_is_hangul(): + assert hangul.is_hangul(u'한') == True + 
assert hangul.is_hangul('A') == False + assert hangul.is_hangul('1') == False + assert hangul.is_hangul(None) == False + + +def test_contains_hangul(): + assert hangul.contains_hangul(u'한국어') == True + assert hangul.contains_hangul(u'한ABC국어') == True + assert hangul.contains_hangul(u"Yo, what's up bro?") == False + assert hangul.contains_hangul(u'1234567890') == False