Skip to content

Commit

Permalink
Extracted all functions to the module level; no more classes
Browse files Browse the repository at this point in the history
  • Loading branch information
Sumin Byeon committed Sep 22, 2014
1 parent 1ea7fb9 commit 18edce3
Show file tree
Hide file tree
Showing 7 changed files with 188 additions and 127 deletions.
28 changes: 14 additions & 14 deletions README.rst
Expand Up @@ -11,7 +11,7 @@ Installation

.. code-block:: console
sudo pip install hanja
pip install hanja
Usage
Expand All @@ -20,14 +20,14 @@ Usage
한글 초성, 중성, 종성 분리
``````````````````````````

>>> Hangul.separate(u'가')
>>> hangul.separate(u'가')
(0, 0, 0)
>>> Hangul.separate(u'까')
>>> hangul.separate(u'까')
(1, 0, 0)

튜플(tuple)의 마지막 원소가 0이면 종성이 없는 글자라고 판단할 수 있다.

>>> Hangul.separate(u'한')
>>> hangul.separate(u'한')
(18, 0, 4)

'ㅎ'은 19번째 자음, 'ㅏ'는 첫번째 모음, 'ㄴ'은 다섯번째 자음이라는 것을 알 수 있다.
Expand All @@ -36,18 +36,18 @@ Usage
초성, 중성, 종성을 조합하여 한 글자를 만듦
``````````````````````````````````````````

>>> Hangul.synthesize(0, 0, 0)
>>> hangul.build(0, 0, 0)
u'\uac00'
>>> print Hangul.synthesize(0, 0, 0)
>>> print hangul.build(0, 0, 0)
가

주어진 글자가 한글인지의 여부를 판별
````````````````````````````````````

>>> Hangul.is_hangul(u'한')
>>> hangul.is_hangul(u'한')
True
>>> Hangul.is_hangul(u'a')
>>> hangul.is_hangul(u'a')
False


Expand All @@ -56,30 +56,30 @@ False

리스트가 아닌 제네레이터(generator)를 반환한다.

>>> '|'.join(Hanja.split_hanja(u'大韓民國은 民主共和國이다.'))
>>> '|'.join(hanja.split_hanja(u'大韓民國은 民主共和國이다.'))
大韓民國|은 |民主共和國|이다.

>>> [x for x in Hanja.split_hanja(u'大韓民國은 民主共和國이다.')]
>>> [x for x in hanja.split_hanja(u'大韓民國은 民主共和國이다.')]
[u'\u5927\u97d3\u6c11\u570b', u'\uc740 ', u'\u6c11\u4e3b\u5171\u548c\u570b', u'\uc774\ub2e4.']

주어진 글자가 한자인지의 여부를 판별
````````````````````````````````````

>>> Hanja.is_hanja(u'韓')
>>> hanja.is_hanja(u'韓')
True

>>> Hanja.is_hanja(u'한')
>>> hanja.is_hanja(u'한')
False

문장 변환
`````````

치환 모드 변환:

>>> Hanja.translate(u'大韓民國은 民主共和國이다.', 'substitution')
>>> hanja.translate(u'大韓民國은 民主共和國이다.', 'substitution')
대한민국은 민주공화국이다.

혼용 모드 변환:

>>> Hanja.translate(u'大韓民國은 民主共和國이다.', 'combination')
>>> hanja.translate(u'大韓民國은 民主共和國이다.', 'combination')
<span class="hanja">大韓民國</span><span class="hangul">(대한민국)</span>은 <span class="hanja">民主共和國</span><span class="hangul">(민주공화국)</span>이다.
120 changes: 17 additions & 103 deletions hanja/__init__.py
Expand Up @@ -2,106 +2,20 @@

__author__ = 'Sumin Byeon'
__email__ = 'suminb@gmail.com'
__version__ = '0.9.0'

from pairs import table as hanja_table

class Hangul:
    """Helpers for splitting and composing precomposed Hangul syllables.

    All offsets are zero-based positions in the standard jamo orderings,
    e.g. u'한' separates into ``(18, 0, 4)``.
    """

    @staticmethod
    def separate(ch):
        """Split one Hangul syllable into its jamo offsets.

        :param ch: a single character. The result is only meaningful for
            characters in U+AC00..U+D7A3; the range is not validated here.
        :returns: ``(choseong, joongseong, jongseong)`` as integers. A
            ``jongseong`` of 0 means the syllable has no final consonant.
        """
        uindex = ord(ch) - 0xac00
        # divmod floors, so the offsets stay integers even where ``/``
        # means true division (Python 3, ``from __future__ import division``).
        rest, jongseong = divmod(uindex, 28)
        choseong, joongseong = divmod(rest, 21)

        return (choseong, joongseong, jongseong)

    @staticmethod
    def synthesize(choseong, joongseong, jongseong):
        """Compose one precomposed syllable from three jamo offsets.

        'choseong', 'joongseong', 'jongseong' are offsets. For example,
        'ㄱ' is 0, 'ㄲ' is 1, 'ㄴ' is 2, and so on and so forth.

        :returns: a one-character unicode string.
        """
        try:
            to_char = unichr  # Python 2
        except NameError:
            to_char = chr  # Python 3: chr already returns unicode text
        return to_char((choseong * 21 + joongseong) * 28 + jongseong + 0xac00)

    @staticmethod
    def dooeum(previous, current):
        """Apply the initial-sound rule (두음법칙) to a single syllable.

        :param previous: the character before ``current``; only its final
            consonant offset is consulted, and only for the 렬/률 rule.
        :param current: the Hangul syllable to adjust.
        :returns: ``current`` rebuilt with a shifted initial consonant when
            one of the rules matches, otherwise an equal syllable.
        """
        p, c = Hangul.separate(previous), Hangul.separate(current)
        offset = 0

        # Word-initial 녀/뇨/뉴/니 and 랴/려/례/료/류/리 are pronounced
        # 여/요/유/이 and 야/여/예/요/유/이 (ㄴ + 9 = ㅇ, ㄹ + 6 = ㅇ).
        if current in (u'녀', u'뇨', u'뉴', u'니'):
            offset = 9
        elif current in (u'랴', u'려', u'례', u'료', u'류', u'리'):
            offset = 6
        # Word-initial 라/래/로/뢰/루/르 are pronounced 나/내/노/뇌/누/느
        # (ㄹ - 3 = ㄴ).
        elif current in (u'라', u'래', u'로', u'뢰', u'루', u'르'):
            offset = -3
        # 렬/률 following a vowel or a final ㄴ are pronounced 열/율.
        # As a final consonant ㄴ has offset 4 (offset 2 is ㄲ).
        elif current in (u'렬', u'률') and p[2] in (0, 4):
            offset = 6

        return Hangul.synthesize(c[0] + offset, c[1], c[2])

    @staticmethod
    def is_hangul(ch):
        """Return True if ``ch`` lies in the Hangul syllables block
        (U+AC00..U+D7A3)."""
        return 0xac00 <= ord(ch) <= 0xd7a3

class Hanja:
    """Hanja-to-Hangul translation helpers.

    For the initial-sound rule (두음법칙) see
    http://ko.wikipedia.org/wiki/%EB%91%90%EC%9D%8C_%EB%B2%95%EC%B9%99
    """

    @staticmethod
    def translate_syllable(previous, current):
        """Translate one character given the character before it.

        :param previous: the preceding character (``u' '`` at a word start).
        :param current: the character to translate.
        :returns: ``current`` unchanged when it is not in ``hanja_table``;
            its table reading when ``previous`` is also in the table;
            otherwise the reading passed through ``Hangul.dooeum``.
        """
        if current not in hanja_table:
            return current

        reading = hanja_table[current]
        if previous in hanja_table:
            return reading
        return Hangul.dooeum(previous, reading)

    @staticmethod
    def split_hanja(text):
        """Split ``text`` into alternating Hanja and non-Hanja runs.

        This is a generator. An empty ``text`` yields itself once.
        """
        if len(text) == 0:
            yield text
            return

        bucket = [text[0]]
        prev_state = Hanja.is_hanja(text[0])

        for ch in text[1:]:
            state = Hanja.is_hanja(ch)

            # A change of state closes the current run.
            if prev_state != state:
                yield ''.join(bucket)
                bucket = [ch]
            else:
                bucket.append(ch)

            prev_state = state

        yield ''.join(bucket)

    @staticmethod
    def translate(text, mode):
        """Translate every run of ``text`` and join the results.

        :param mode: ``'combination'`` or ``'substitution'``; passed on to
            :meth:`translate_word`.
        """
        return ''.join(Hanja.translate_word(w, mode)
                       for w in Hanja.split_hanja(text))

    @staticmethod
    def translate_word(word, mode, format='<span class="hanja">%s</span><span class="hangul">(%s)</span>'):
        """Translate one run produced by :meth:`split_hanja`.

        :param word: a run of characters; may be empty.
        :param mode: combination | substitution. In ``'combination'`` mode
            a run starting with a Hanja character is rendered through
            ``format`` as (original, translation); otherwise only the
            translation is returned.
        :param format: ``%``-template taking ``(word, translation)``.
        """
        # split_hanja yields an empty run for empty text; there is nothing
        # to translate and ``word[0]`` below would raise IndexError.
        if not word:
            return word

        # Pair each character with its predecessor. zip stops at the shorter
        # input on every Python version, unlike Python 2's multi-argument
        # map, which pads with None.
        tw = ''.join(Hanja.translate_syllable(p, c)
                     for p, c in zip(u' ' + word[:-1], word))

        if mode == 'combination' and Hanja.is_hanja(word[0]):
            return format % (word, tw)
        return tw

    @staticmethod
    def is_hanja(ch):
        """Return True if ``ch`` is a key of ``hanja_table``."""
        return ch in hanja_table
__version__ = '0.10.0'

import warnings

# Copied from https://wiki.python.org/moin/PythonDecoratorLibrary
def deprecated(func):
    """Decorator that marks a function as deprecated.

    Each call to the decorated function emits a ``DeprecationWarning``
    naming the function, then forwards all arguments to ``func`` and
    returns its result.

    :param func: the function to wrap.
    :returns: a wrapper carrying ``func``'s name, docstring and attributes.
    """
    import functools

    @functools.wraps(func)
    def new_func(*args, **kwargs):
        # stacklevel=2 attributes the warning to the caller of the
        # deprecated function rather than to this wrapper.
        warnings.warn("Call to deprecated function {}.".format(func.__name__),
                      category=DeprecationWarning, stacklevel=2)
        return func(*args, **kwargs)
    return new_func
59 changes: 59 additions & 0 deletions hanja/hangul.py
@@ -0,0 +1,59 @@
# -*- coding: utf8 -*-

from __init__ import deprecated


def separate(ch):
    """Split one Hangul syllable into its jamo offsets.

    :param ch: a single character. The result is only meaningful for
        characters in U+AC00..U+D7A3; the range is not validated here.
    :returns: ``(choseong, joongseong, jongseong)`` as integers, i.e. the
        initial consonant, vowel and final consonant offsets. A
        ``jongseong`` of 0 means the syllable has no final consonant.
    """
    uindex = ord(ch) - 0xac00
    # divmod floors, so the offsets stay integers even where ``/`` means
    # true division (Python 3, ``from __future__ import division``).
    rest, jongseong = divmod(uindex, 28)
    choseong, joongseong = divmod(rest, 21)

    return (choseong, joongseong, jongseong)


@deprecated
def synthesize(choseong, joongseong, jongseong):
    """Deprecated alias for :func:`build`.

    Forwards the three offsets to ``build`` unchanged and returns its
    result.
    """
    return build(choseong, joongseong, jongseong)


def build(choseong, joongseong, jongseong):
    """Compose one precomposed Hangul syllable from three jamo offsets.

    'choseong', 'joongseong', 'jongseong' are offsets. For example, 'ㄱ' is
    0, 'ㄲ' is 1, 'ㄴ' is 2, and so on and so forth.

    :returns: a one-character unicode string.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3: chr already returns unicode text
    return to_char((choseong * 21 + joongseong) * 28 + jongseong + 0xac00)


def dooeum(previous, current):
    """Apply the initial-sound rule (두음법칙) to a single syllable.

    :param previous: the character before ``current``; only its final
        consonant offset is consulted, and only for the 렬/률 rule.
    :param current: the Hangul syllable to adjust.
    :returns: ``current`` rebuilt with a shifted initial consonant when one
        of the rules matches, otherwise an equal syllable.
    """
    # The Hangul class no longer exists; call the module-level function.
    p, c = separate(previous), separate(current)
    offset = 0

    # Word-initial 녀/뇨/뉴/니 and 랴/려/례/료/류/리 are pronounced
    # 여/요/유/이 and 야/여/예/요/유/이 (ㄴ + 9 = ㅇ, ㄹ + 6 = ㅇ).
    if current in (u'녀', u'뇨', u'뉴', u'니'):
        offset = 9
    elif current in (u'랴', u'려', u'례', u'료', u'류', u'리'):
        offset = 6
    # Word-initial 라/래/로/뢰/루/르 are pronounced 나/내/노/뇌/누/느
    # (ㄹ - 3 = ㄴ).
    elif current in (u'라', u'래', u'로', u'뢰', u'루', u'르'):
        offset = -3
    # 렬/률 following a vowel or a final ㄴ are pronounced 열/율.
    # As a final consonant ㄴ has offset 4 (offset 2 is ㄲ).
    elif current in (u'렬', u'률') and p[2] in (0, 4):
        offset = 6

    return build(c[0] + offset, c[1], c[2])


def is_hangul(ch):
    """Return True if ``ch`` is a precomposed Hangul syllable.

    ``None`` is accepted and reported as not Hangul; any other value is
    checked against the code point range U+AC00..U+D7A3.
    """
    return ch is not None and 0xac00 <= ord(ch) <= 0xd7a3


def contains_hangul(text):
    """Return True if at least one character of ``text`` is Hangul.

    :param text: an iterable of single characters (normally a string).
    :returns: False for empty input. ``any`` stops at the first Hangul
        character instead of scanning the whole text.
    """
    return any(is_hangul(c) for c in text)
64 changes: 64 additions & 0 deletions hanja/hanja.py
@@ -0,0 +1,64 @@
# -*- coding: utf8 -*-
"""두음법칙에 관련된 내용은
http://ko.wikipedia.org/wiki/%EB%91%90%EC%9D%8C_%EB%B2%95%EC%B9%99 를 참고."""

from pairs import table as hanja_table
from hangul import dooeum


def translate_syllable(previous, current):
    """Translate one character given the character before it.

    :param previous: the preceding character.
    :param current: the character to translate.
    :returns: ``current`` unchanged when it is not in ``hanja_table``; its
        table reading when ``previous`` is also in the table; otherwise
        the reading passed through ``dooeum``.
    """
    if current not in hanja_table:
        return current

    reading = hanja_table[current]
    if previous in hanja_table:
        return reading
    return dooeum(previous, reading)


def split_hanja(text):
    """Split ``text`` into alternating Hanja and non-Hanja runs.

    This is a generator. An empty ``text`` yields itself once.

    :param text: the string to split.
    """
    # TODO: Can we make this a bit prettier?
    if len(text) == 0:
        yield text
        return

    bucket = [text[0]]
    # The Hanja class no longer exists; call the module-level is_hanja.
    prev_state = is_hanja(text[0])

    for ch in text[1:]:
        state = is_hanja(ch)

        # A change of state closes the current run.
        if prev_state != state:
            yield ''.join(bucket)
            bucket = [ch]
        else:
            bucket.append(ch)

        prev_state = state

    yield ''.join(bucket)


def translate(text, mode):
    """Translate every run of ``text`` and join the results.

    :param text: the string to translate.
    :param mode: ``'combination'`` or ``'substitution'``; passed on to
        ``translate_word`` for each run produced by ``split_hanja``.
    """
    pieces = [translate_word(segment, mode) for segment in split_hanja(text)]
    return ''.join(pieces)


def translate_word(word, mode,
                   format='<span class="hanja">%s</span><span class="hangul">(%s)</span>'):
    """Translate one run produced by ``split_hanja``.

    :param word: a run of characters; may be empty.
    :param mode: combination | substitution. In ``'combination'`` mode a
        run starting with a Hanja character is rendered through ``format``
        as (original, translation); otherwise only the translation is
        returned.
    :param format: ``%``-template taking ``(word, translation)``.
    """
    # split_hanja yields an empty run for empty text; there is nothing to
    # translate and ``word[0]`` below would raise IndexError.
    if not word:
        return word

    # Pair each character with its predecessor. zip stops at the shorter
    # input on every Python version, unlike Python 2's multi-argument map,
    # which pads with None.
    tw = ''.join(translate_syllable(p, c)
                 for p, c in zip(u' ' + word[:-1], word))

    if mode == 'combination' and is_hanja(word[0]):
        return format % (word, tw)
    return tw


def is_hanja(ch):
    """Determine whether ``ch`` is a Chinese character (Hanja).

    The check is membership in ``hanja_table``, so only characters present
    in that table are reported as Hanja.
    """
    return ch in hanja_table
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
pytest
pytest-cov
coveralls
13 changes: 3 additions & 10 deletions setup.py
Expand Up @@ -5,15 +5,8 @@


def readme():
    """Return the contents of ``README.rst`` from the current directory.

    The context manager closes the file even if ``read()`` raises. Errors
    from opening or reading the file propagate to the caller.
    """
    with open('README.rst') as f:
        return f.read()


setup(name='hanja',
Expand All @@ -25,4 +18,4 @@ def readme():
author_email=hanja.__email__,
url='http://github.com/suminb/hanja',
packages=[],
)
)
28 changes: 28 additions & 0 deletions tests/test_basic.py
@@ -0,0 +1,28 @@
# -*- coding: utf8 -*-
import pytest
from hanja import hangul, hanja


def test_separation():
    """separate() returns (choseong, joongseong, jongseong) offsets."""
    cases = [
        (u'가', (0, 0, 0)),
        (u'까', (1, 0, 0)),
        (u'갸', (0, 2, 0)),
        (u'각', (0, 0, 1)),
    ]
    for syllable, offsets in cases:
        assert hangul.separate(syllable) == offsets


def test_build():
    """build() with all-zero offsets yields the syllable u'가'."""
    first_syllable = hangul.build(0, 0, 0)
    assert first_syllable == u'가'


def test_is_hangul():
    """is_hangul() returns real booleans, including for None."""
    # Identity checks replace ``== True`` / ``== False`` (PEP 8 E712) and
    # also pin the return type to bool.
    assert hangul.is_hangul(u'한') is True
    assert hangul.is_hangul('A') is False
    assert hangul.is_hangul('1') is False
    assert hangul.is_hangul(None) is False


def test_contains_hangul():
    """contains_hangul() is True iff some character is a Hangul syllable."""
    # Identity checks replace ``== True`` / ``== False`` (PEP 8 E712).
    assert hangul.contains_hangul(u'한국어') is True
    assert hangul.contains_hangul(u'한ABC국어') is True
    assert hangul.contains_hangul(u"Yo, what's up bro?") is False
    assert hangul.contains_hangul(u'1234567890') is False

0 comments on commit 18edce3

Please sign in to comment.