# 언어데이터과학 25강 (2023-12-03) 옛한글 처리

## 할 일

### 오늘의 목표

현대 한국어 표기에 쓰이지 않는 옛한글을 포함한 한국어 음절 유형을 분류할 수 있다.
  > ᄣᆞᆯ -> CCCVC, ᄎᆔ -> CGVV


### 심화 과제

임의의 한글 자모의 연쇄를 한글 음절로 합성할 수 있다.
  > ㅂㅏㅂㅅㅜㄷㄱㅏㄹㅏㄱ -> 밥숟가락

In [1]:
from unicodedata import name, lookup, normalize
import re

## 옛한글 처리



### 과거

+ PUA(Private Use Area, 사용자 정의 영역): 유니코드 표준이 문자를 배정하지 않고, 사용자나 업체가 임의로 문자를 정의해 쓸 수 있도록 비워 둔 코드 영역

In [2]:
# U+E13D lies in the Private Use Area, so Unicode assigns it no character name.
spy = '\ue13d'
print(spy)




### 현재

+ "첫가끝"(첫소리-가운뎃소리-끝소리): 유니코드에 할당된 자음 문자와 모음 문자의 조합

![](https://upload.wikimedia.org/wikipedia/commons/b/bd/Hangul_jamo_characters_in_Unicode.svg)

## `unicodedata` 모듈을 사용한 Python 옛한글 처리

### `lookup()` 함수로 옛한글 입력하기

+ `unicodedata.lookup()`: 유니코드에서 정의된 문자의 이름을 인자로 받아서 해당하는 문자를 돌려주는 함수

#### 문자 이름 알아보기

+ https://www.unicode.org/charts/PDF/U1100.pdf HANGUL JAMO
+ https://www.unicode.org/charts/PDF/UA960.pdf HANGUL JAMO EXTENDED-A
+ https://www.unicode.org/charts/PDF/UD7B0.pdf HANGUL JAMO EXTENDED-B

In [3]:
# The jamo must be concatenated in CHOSEONG, JUNGSEONG, JONGSEONG order.
onset1 = lookup('HANGUL CHOSEONG PIEUP-SIOS-TIKEUT')
nucleus1 = lookup('HANGUL JUNGSEONG ARAEA')
coda1 = lookup('HANGUL JONGSEONG RIEUL')
print(onset1+nucleus1+coda1)

ᄣᆞᆯ


In [4]:
# Onset CIEUC + vowel O + coda YESIEUNG, each looked up by its Unicode name.
onset2 = lookup('HANGUL CHOSEONG CIEUC')
nucleus2 = lookup('HANGUL JUNGSEONG O')
coda2 = lookup('HANGUL JONGSEONG YESIEUNG')
print(onset2+nucleus2+coda2)

조ᇰ


In [5]:
# No coda here: only a choseong and a jungseong are concatenated.
onset3 = lookup('HANGUL CHOSEONG CHIEUCH')
nucleus3 = lookup('HANGUL JUNGSEONG YU-I')
print(onset3+nucleus3)

ᄎᆔ


In [6]:
# The coda MIEUM-KIYEOK is a cluster that lookup() returns as one character.
onset4 = lookup('HANGUL CHOSEONG KIYEOK')
nucleus4 = lookup('HANGUL JUNGSEONG U')
coda4 = lookup('HANGUL JONGSEONG MIEUM-KIYEOK')
print(onset4+nucleus4+coda4)

구ᇚ


### `normalize()` 함수로 한글 음절 분해하기

+ NFD: Normalization Form (Canonical) D(ecomposition)
+ NFC: Normalization Form (Canonical) C(omposition)
+ NFKD: Normalization Form Compatibility D(ecomposition)
+ NFKC: Normalization Form Compatibility C(omposition)

In [63]:
def decompose(str):
  '''Return the text in Unicode NFKD (compatibility decomposition) form.

  A precomposed Hangul syllable comes back as its sequence of conjoining
  jamo. The parameter name shadows the builtin ``str``; it is kept so that
  existing keyword callers continue to work.
  '''
  decomposed = normalize('NFKD', str)
  return decomposed

### `name()` 함수로 문자 이름 찾기

In [66]:
def print_jamo_name(syl:str):
  '''Print a character with its Unicode name, then each jamo it decomposes into.

  The first line shows ``syl`` itself, followed by a dashed separator and
  one line per character of ``decompose(syl)``.
  '''
  syllable_name = name(syl)
  print(syl, syllable_name)
  print('----------')
  parts = decompose(syl)
  for part in parts:
    part_name = name(part)
    print(part, part_name)

In [67]:
print_jamo_name('뷁') # CGVCC

뷁 HANGUL SYLLABLE BWELG
----------
ᄇ HANGUL CHOSEONG PIEUP
ᅰ HANGUL JUNGSEONG WE
ᆰ HANGUL JONGSEONG RIEUL-KIYEOK


In [10]:
print_jamo_name('아') # V

아 HANGUL SYLLABLE A
----------
ᄋ HANGUL CHOSEONG IEUNG
ᅡ HANGUL JUNGSEONG A


In [11]:
def romanize(hangul):
  '''Return the last word of a Hangul character's Unicode name.

  For example the name 'HANGUL LETTER KIYEOK' yields 'KIYEOK'. An
  AssertionError is raised when the name does not start with 'HANGUL'.
  '''
  full_name = name(hangul)
  assert full_name.startswith('HANGUL')
  # Only the final whitespace-separated word is the romanization.
  words = full_name.split()
  last_word = words[len(words) - 1]
  return last_word

In [12]:
print(romanize('ㄱ'))

KIYEOK


In [13]:
print(romanize('ㅢ'))

YI


In [14]:
print(romanize('ㅚ'))

OE


In [15]:
print(romanize('ㅟ'))

WI


In [17]:
# Matches a romanized name whose first letter is W or Y (e.g. 'WI', 'YI').
# NOTE(review): presumably used to detect a leading glide in a vowel name - confirm.
GLIDES = re.compile(r'^[WY]')

In [18]:
def is_letter(hangul):
    '''Tell whether the character's Unicode name begins with 'HANGUL LETTER'.'''
    prefix = 'HANGUL LETTER'
    char_name = name(hangul)
    return char_name[:len(prefix)] == prefix

def is_choseong(jamo):
    '''Tell whether the character's Unicode name begins with 'HANGUL CHOSEONG'.'''
    prefix = 'HANGUL CHOSEONG'
    char_name = name(jamo)
    return char_name[:len(prefix)] == prefix

def is_jungseong(jamo):
    '''Tell whether the character's Unicode name begins with 'HANGUL JUNGSEONG'.'''
    prefix = 'HANGUL JUNGSEONG'
    char_name = name(jamo)
    return char_name[:len(prefix)] == prefix

def is_jongseong(jamo):
    '''Tell whether the character's Unicode name begins with 'HANGUL JONGSEONG'.'''
    prefix = 'HANGUL JONGSEONG'
    char_name = name(jamo)
    return char_name[:len(prefix)] == prefix

In [19]:
def get_onset_type(jamo):
  '''Return the onset part of a syllable skeleton for one consonant jamo.

  The result is one 'C' per hyphen-separated component of the romanized
  name, so a plain or SSANG- consonant gives 'C' and a cluster such as
  PIEUP-SIOS-TIKEUT gives 'CCC'. IEUNG gives '' because the onset is empty.

  Raises AssertionError when ``jamo`` is neither a choseong nor a
  'HANGUL LETTER' character.
  '''
  assert is_choseong(jamo) or is_letter(jamo)

  roman = romanize(jamo)
  if roman == 'IEUNG':
    return ''

  # Emit one C per consonant; cluster members are joined with '-' in the name.
  return (roman.count('-') + 1) * 'C'

In [20]:
def get_nucleus_type(jamo):
  '''Return the nucleus part of a syllable skeleton for one vowel jamo.

  The romanized name is split on '-'. Each component contributes 'GV' when
  it starts with W or Y (see GLIDES) and 'V' otherwise, for example
  A -> 'V', YA -> 'GV', YU-I -> 'GVV'.

  Raises AssertionError when ``jamo`` is neither a jungseong nor a
  'HANGUL LETTER' character. The assertion does not tell vowel letters
  from consonant letters; callers are expected to pass a vowel.
  '''
  assert is_jungseong(jamo) or is_letter(jamo)

  roman = romanize(jamo)
  output = ''

  # Emit V or GV once per vowel; compound vowels are joined with '-' in the name.
  for vowel in roman.split('-'):
    output += 'GV' if GLIDES.match(vowel) else 'V'

  return output

In [21]:
def get_coda_type(jamo):
  '''Return the coda part of a syllable skeleton for one consonant jamo.

  The result is one 'C' per hyphen-separated component of the romanized
  name. Raises AssertionError when ``jamo`` is neither a jongseong nor a
  'HANGUL LETTER' character.
  '''
  assert is_jongseong(jamo) or is_letter(jamo)

  consonants = romanize(jamo).split('-')
  return 'C' * len(consonants)

In [22]:
def is_syllable(char):
  '''Tell whether ``char`` is a single character named 'HANGUL SYLLABLE ...'.

  Anything that name() rejects (an unnamed code point, a string that is not
  exactly one character, a non-string) gives False.
  '''
  try:
    char_name = name(char)
  except (ValueError, TypeError):
    return False
  return char_name.startswith('HANGUL SYLLABLE')

In [23]:
print(is_syllable('앍'))
print(is_syllable('ㅇㅇ'))

True
False


In [24]:
print(is_syllable(spy))

False


In [25]:
def to_choseong(hangul):
    '''Convert a consonant letter or jongseong to the choseong of the same name.

    The word LETTER or JONGSEONG in the character's Unicode name is replaced
    by CHOSEONG and the result is looked up. ``hangul`` is returned unchanged
    when that lookup fails or when name() rejects the argument.
    '''
    try:
        return lookup(re.sub('LETTER|JONGSEONG', 'CHOSEONG', name(hangul)))
    except (KeyError, TypeError, ValueError):
        # KeyError: no character carries the rewritten name.
        # TypeError / ValueError: name() got a non-character or an unnamed one.
        return hangul

def to_jungseong(hangul):
    '''Convert a vowel letter to the jungseong of the same name.

    The word LETTER in the character's Unicode name is replaced by JUNGSEONG
    and the result is looked up. ``hangul`` is returned unchanged when that
    lookup fails or when name() rejects the argument.
    '''
    try:
        jname = name(hangul)
        # The letter U+318E is named ARAEAE, but its jungseong counterpart
        # U+11A1 is named ARAEA-I, so a plain rename would never find it.
        if jname == 'HANGUL LETTER ARAEAE':
            jname = 'HANGUL LETTER ARAEA-I'
        return lookup(jname.replace('LETTER', 'JUNGSEONG'))
    except (KeyError, TypeError, ValueError):
        # KeyError: no character carries the rewritten name.
        # TypeError / ValueError: name() got a non-character or an unnamed one.
        return hangul

def to_jongseong(hangul):
    '''Convert a consonant letter or choseong to the jongseong of the same name.

    The word LETTER or CHOSEONG in the character's Unicode name is replaced
    by JONGSEONG and the result is looked up. ``hangul`` is returned unchanged
    when that lookup fails or when name() rejects the argument.
    '''
    try:
        return lookup(re.sub('LETTER|CHOSEONG', 'JONGSEONG', name(hangul)))
    except (KeyError, TypeError, ValueError):
        # KeyError: no character carries the rewritten name.
        # TypeError / ValueError: name() got a non-character or an unnamed one.
        return hangul

In [26]:
def to_syllable(jamos):
    '''Compose two or three jamo into one syllable string.

    The first element is converted with to_choseong, the second with
    to_jungseong and, when there are exactly three, the third with
    to_jongseong. The pieces are joined and NFKC-normalized. Any other
    length produces the normalization of an empty string.
    '''
    count = len(jamos)
    pieces = []
    if count in (2, 3):
        pieces.append(to_choseong(jamos[0]))
        pieces.append(to_jungseong(jamos[1]))
    if count == 3:
        pieces.append(to_jongseong(jamos[2]))

    composed = ''.join(pieces)
    return normalize('NFKC', composed)

In [27]:
def is_jamo_syllable(string):
    '''Tell whether ``string`` is a jamo sequence forming exactly one syllable.

    The first two characters must be convertible to a choseong and a
    jungseong. Whatever follows them, if anything, must be one character
    convertible to a jongseong. Input of the wrong shape gives False.
    '''
    try:
        # Unpacking raises ValueError unless the slice holds exactly two characters.
        onset, nucleus = string[0:2]
        result = is_choseong(to_choseong(onset)) and is_jungseong(to_jungseong(nucleus))
        coda = string[2:]
        if coda:
            # A coda longer than one character makes name() raise TypeError.
            result = result and is_jongseong(to_jongseong(coda))
    except (TypeError, ValueError):
        return False

    return result


In [28]:
print(is_jamo_syllable('ㄱㅏㅁ'))
print(is_jamo_syllable('ㄱㅏㅁ4'))
print(is_jamo_syllable('ㄱㅏㅁㅁ'))
print(is_jamo_syllable('ᄣᆞᆯ'))

True
False
False
True


In [29]:
def is_any_syllable(string):
    '''Tell whether ``string`` is one syllable, as a jamo sequence or precomposed.'''
    if is_jamo_syllable(string):
        return True
    return is_syllable(string)

In [30]:
print(is_any_syllable('ㅇ'))
print(is_any_syllable('앆'))
print(is_any_syllable(decompose('앆')))
print(is_any_syllable('ㅇㅏ'))
print(is_any_syllable('ㅇㅏㄲ'))
print(is_any_syllable('ㅇㅏㅇㅇ'))

False
True
True
True
True
False


In [31]:
def to_letter(jamo):
    '''Convert a conjoining jamo to the compatibility letter of the same name.

    The word CHOSEONG, JUNGSEONG or JONGSEONG in the character's Unicode name
    is replaced by LETTER and the result is looked up. ``jamo`` is returned
    unchanged when that lookup fails or when name() rejects the argument.
    '''
    try:
        jname = name(jamo)
        # The jungseong U+11A1 is named ARAEA-I, but its compatibility letter
        # U+318E is named ARAEAE, so a plain rename would never find it.
        if jname == 'HANGUL JUNGSEONG ARAEA-I':
            jname = 'HANGUL JUNGSEONG ARAEAE'
        return lookup(re.sub('(?:CHO|JUNG|JONG)SEONG', 'LETTER', jname))
    except (KeyError, TypeError, ValueError):
        # KeyError: no character carries the rewritten name.
        # TypeError / ValueError: name() got a non-character or an unnamed one.
        return jamo

In [32]:
def to_letters(jamos):
    '''Apply to_letter to every element of ``jamos`` and join the results.'''
    converted = map(to_letter, jamos)
    return ''.join(converted)

+ https://unicode.org/charts/PDF/U3130.pdf HANGUL COMPATIBILITY JAMO

In [33]:
# Advanced homework
def split_syllables(jamos):
    '''Split a string into syllable-sized jamo groups and single characters.

    The input is first converted with to_letters. Scanning left to right, an
    onset + vowel pair opens a group. A following consonant joins it as coda
    unless that consonant itself opens the next onset + vowel pair. Every
    other character becomes its own one-character item.

    >>> split_syllables('ㅂㅏㅂㅏㅁㅂㅏ')
    ['ㅂㅏ', 'ㅂㅏㅁ', 'ㅂㅏ']
    >>> split_syllables('ㄱㅏㅁ23')
    ['ㄱㅏㅁ', '2', '3']

    Returns a list of strings; an empty input gives an empty list.
    '''
    jamos = to_letters(jamos)

    output = []
    total = len(jamos)
    start = 0
    while start < total:
        size = 1
        if is_jamo_syllable(jamos[start:start + 2]):
            size = 2
            has_coda = (start + 3 <= total
                        and is_jamo_syllable(jamos[start:start + 3]))
            # The third jamo is a coda only if it does not start the next syllable.
            if has_coda and not is_jamo_syllable(jamos[start + 2:start + 4]):
                size = 3
        output.append(jamos[start:start + size])
        start += size

    return output

In [34]:
jamos = 'ㅂㅏㅂㅏㅁㅂㅏ'
print(split_syllables(jamos))

['ㅂㅏ', 'ㅂㅏㅁ', 'ㅂㅏ']


In [35]:
complex_jamos = '제2023年ㄷㅗ Dec 4ㅇㅣㄹ.'
print(split_syllables(complex_jamos))

['제', '2', '0', '2', '3', '年', 'ㄷㅗ', ' ', 'D', 'e', 'c', ' ', '4', 'ㅇㅣㄹ', '.']


In [36]:
jamos = ':ㄱㅣㅁ4ㅂㅏㅂ'
print(split_syllables(jamos))


[':', 'ㄱㅣㅁ', '4', 'ㅂㅏㅂ']


In [37]:
print(split_syllables('ㄱㅏㅁ23'))
print(split_syllables('ㄷㅗㄹㄱㅣㅁ'))
print(split_syllables('ㅂㅣㅂㅣㅁㅂㅏㅂ'))
print(split_syllables('ㅋㅔㅇㅣㄹㄹㅣ-ㅎㅐㅁㅣㄹㅌㅓㄴ'))

['ㄱㅏㅁ', '2', '3']
['ㄷㅗㄹ', 'ㄱㅣㅁ']
['ㅂㅣ', 'ㅂㅣㅁ', 'ㅂㅏㅂ']
['ㅋㅔ', 'ㅇㅣㄹ', 'ㄹㅣ', '-', 'ㅎㅐ', 'ㅁㅣㄹ', 'ㅌㅓㄴ']


In [56]:
def _single_jamo_role(char):
  '''Classify one jamo as 'onset', 'coda' or 'nucleus'; None for anything else.

  A conjoining jamo is classified by its own Unicode name. A compatibility
  letter carries no position in its name, so it is classified by the
  positional form it converts to, trying choseong before jongseong.
  '''
  try:
    if is_choseong(char):
      return 'onset'
    if is_jongseong(char):
      return 'coda'
    if is_jungseong(char):
      return 'nucleus'
    if is_letter(char):
      if is_choseong(to_choseong(char)):
        return 'onset'
      if is_jongseong(to_jongseong(char)):
        return 'coda'
      if is_jungseong(to_jungseong(char)):
        return 'nucleus'
  except (TypeError, ValueError):
    # name() rejects multi-character strings and unnamed code points.
    pass
  return None


def get_syllable_type(syl):
  '''
  Return the C/G/V skeleton of a single jamo, a precomposed syllable or a
  jamo sequence that forms one syllable; return None for any other input.

  >>> get_syllable_type('ㅂ')
  'C'
  >>> get_syllable_type('ㅏ')
  'V'
  >>> get_syllable_type('밤')
  'CVC'
  >>> get_syllable_type('ㅂㅏ')
  'CV'
  '''
  role = _single_jamo_role(syl)

  # Consonant jamo only
  if role == 'onset':
    return get_onset_type(syl)

  elif role == 'coda':
    return get_coda_type(syl)

  # Vowel jamo only
  elif role == 'nucleus':
    return get_nucleus_type(syl)

  # HANGUL SYLLABLE: needs to be decomposed into jamo first
  elif is_syllable(syl):
    dec = decompose(syl)

  # Hangul jamo sequence: already decomposed
  elif is_jamo_syllable(syl):
    dec = syl

  else:
    return None

  onset, nucleus = dec[0:2]
  output = get_onset_type(onset) + get_nucleus_type(nucleus)

  # Anything after the first two jamo is the coda.
  coda = dec[2:]
  if coda:
    output += get_coda_type(coda)

  return output

In [59]:
consonant = 'ㄸ'
print(get_syllable_type(consonant))

C


In [52]:
vowel = 'ㅑ'
print(get_syllable_type(vowel))

GV


In [39]:
syl = 'ㄲㅑㄳ'
print(get_syllable_type(syl))

CGVCC


In [40]:
syl1 = 'ᄣᆞᆯ'
print(get_syllable_type(syl1))

CCCVC


In [48]:
syl2 = 'ᄎᆔ'
print(get_syllable_type(syl2))

CGVV


### 최종: 임의의 한글 문자열에서 음절 유형 추출하기

In [41]:
def get_syllable_types(string):
    '''Return a tuple with the syllable type of every Hangul item in ``string``.

    The string is cut up with split_syllables and each item is classified
    with get_syllable_type. Items classified as None (digits, punctuation,
    other scripts) are left out.
    '''
    # Tolerate a splitter that yields nothing, so the result is then just ().
    segments = split_syllables(string) or []
    types = (get_syllable_type(segment) for segment in segments)
    return tuple(kind for kind in types if kind is not None)

In [47]:
get_syllable_types('얇은?')

('GVCC', 'VC')

In [43]:
get_syllable_types('ㅇㅗㄱㅅㅜㅅㅜ')

('VC', 'CV', 'CV')

In [44]:
get_syllable_types('ㅂㅏ밤ㅂㅏ')

('CV', 'CVC', 'CV')

In [45]:
get_syllable_types('ㅂㅏ밤ㅁㅁㅁㅁㅁㅂㅏ')

('CV', 'CVC', 'C', 'C', 'C', 'C', 'C', 'CV')

In [46]:
get_syllable_types('ᄇᆡᆨ쳥')

('CVC', 'CGVC')