In [19]:
from pathlib import Path
import nltk
import unicodedata 


src_dir = 'korTextData'
filename = '7 영웅문 3부-의천도룡기(1961)1편.txt'
filepath = Path(src_dir) / filename

lines = open(filepath, encoding='utf8').readlines()

lines = [line.strip() for line in lines]

In [12]:
line = lines[102]

In [13]:
line

'배꽃의 계절이어라.'

In [17]:
for c in line:
    if ord(c) > 127:
        print('{}  utf8: {} ord: U+{:04x} name: {}'.format(c, c.encode('utf8'), ord(c), unicodedata.name(c)))

배  utf8: b'\xeb\xb0\xb0' ord: U+bc30 name: HANGUL SYLLABLE BAE
꽃  utf8: b'\xea\xbd\x83' ord: U+af43 name: HANGUL SYLLABLE GGOC
의  utf8: b'\xec\x9d\x98' ord: U+c758 name: HANGUL SYLLABLE YI
계  utf8: b'\xea\xb3\x84' ord: U+acc4 name: HANGUL SYLLABLE GYE
절  utf8: b'\xec\xa0\x88' ord: U+c808 name: HANGUL SYLLABLE JEOL
이  utf8: b'\xec\x9d\xb4' ord: U+c774 name: HANGUL SYLLABLE I
어  utf8: b'\xec\x96\xb4' ord: U+c5b4 name: HANGUL SYLLABLE EO
라  utf8: b'\xeb\x9d\xbc' ord: U+b77c name: HANGUL SYLLABLE RA


In [18]:
print(line.encode('unicode_escape'))

b'\\ubc30\\uaf43\\uc758 \\uacc4\\uc808\\uc774\\uc5b4\\ub77c.'


In [20]:
nltk.word_tokenize(line)

['배꽃의', '계절이어라', '.']

In [30]:
mytokens = []
for s in lines:
    mytokens += nltk.word_tokenize(s)
len(mytokens), mytokens[1000:1010]

(71712, ['왕세충', '(', '王世忠', ')', '을', '토벌할', '때', '소림', '승려들이', '종군하여'])

## Regular Expressions

In [21]:
import re

wordlist = [w for w in nltk.corpus.words.words('en') if w.islower() ]

In [23]:
# find words with 'ed' ending

[w for w in wordlist if re.search('ed$', w)][:4]

['abaissed', 'abandoned', 'abased', 'abashed']

In [33]:
[w for w in mytokens if re.search('가$', w)][:10]

['가', '찾기가', '무기가', '아닌가', '가', '그가', '그가', '아내가', '소녀가', '않은가']

In [39]:
# find words with a string starts with 'lo' using '^' in the pattern
[w for w in wordlist if re.search('^lo', w)][:5]

['lo', 'loa', 'loach', 'load', 'loadage']

In [40]:
# woldcard (.) matches any single character among words of length 8.
[w for w in wordlist if re.search('^..j..t..$', w)][:5]

['abjectly', 'adjuster', 'dejected', 'dejectly', 'injector']

In [55]:
[w for w in wordlist if re.search('..j..t..', w)][:5]

['abjectedness', 'abjection', 'abjective', 'abjectly', 'abjectness']

In [59]:
[w for w in wordlist if re.search('^mai?', w)][:5] # i may or may not, but starting with ma

['ma', 'maam', 'maamselle', 'mabi', 'mabolo']

In [60]:
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]

['gold', 'golf', 'hold', 'hole']

In [61]:
[w for w in wordlist if re.search('^[ghi][mno][jlk][efd]$', w)]

['gold', 'golf', 'hold', 'hole']

In [67]:
ghi = [w for w in wordlist if re.search('^[ghijklmno]+$', w)]
ghi[:5]

['g', 'ghoom', 'gig', 'giggling', 'gigolo']

In [75]:
ghi2 = [w for w in wordlist if re.search('^[g-o]+$', w)]
len(ghi2), ghi2[:5]

(257, ['g', 'ghoom', 'gig', 'giggling', 'gigolo'])

In [72]:
abc = [w for w in wordlist if re.search('^[a-fj-o]+$', w)]
len(abc), abc[:5]

(1963, ['a', 'aa', 'aal', 'aam', 'aba'])

In [73]:
abc[::200]

['a',
 'anemonal',
 'belee',
 'cadalene',
 'coll',
 'dod',
 'feel',
 'kenno',
 'mack',
 'namable']

In [77]:
# what happens if $ is not used.
ghi3 = [w for w in wordlist if re.search('^[ghijklmno]+', w)]
len(ghi3), ghi3[:5]

(54165, ['g', 'ga', 'gab', 'gabardine', 'gabbard'])

In [78]:
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
print('chat_words: ', chat_words[:4])

result = [w for w in chat_words if re.search('^m+i+n+e+$', w)]
len(result), result[:5]

chat_words:  ['', '!', '!!', '!!!']


(4,
 ['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee',
  'miiiiiinnnnnnnnnneeeeeeee',
  'mine',
  'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee'])

In [79]:
wsj = sorted(set(nltk.corpus.treebank.words()))

dec = [w for w in wsj if re.search('^[0-9]+\.[0-9]+$', w)]
dec[:10]

['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5']

In [82]:
print(re.search('^[0-9]+\.[0-9]+$', "0.1 0.2 0.3 999.123"))

None


In [85]:
print(re.search('^[0-9]+\.[0-9]+$', "23490.11111"))

<re.Match object; span=(0, 11), match='23490.11111'>


In [86]:
[w for w in wsj if re.search('^[A-Z]+\$$', w)]

['C$', 'US$']

In [88]:
[w for w in wsj if re.search('^[0-9]{4}$', w)][:5]

['1614', '1637', '1787', '1901', '1903']

In [89]:
[w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)]

['10-day',
 '10-lap',
 '10-year',
 '100-share',
 '12-point',
 '12-year',
 '14-hour',
 '15-day',
 '150-point',
 '190-point',
 '20-point',
 '20-stock',
 '21-month',
 '237-seat',
 '240-page',
 '27-year',
 '30-day',
 '30-point',
 '30-share',
 '30-year',
 '300-day',
 '36-day',
 '36-store',
 '42-year',
 '50-state',
 '500-stock',
 '52-week',
 '69-point',
 '84-month',
 '87-store',
 '90-day']

In [90]:
[w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]

['black-and-white',
 'bread-and-butter',
 'father-in-law',
 'machine-gun-toting',
 'savings-and-loan']

In [95]:
eding = [w for w in wsj if re.search('(ed|ing)$', w)]
len(eding), eding[:10]

(1841,
 ['62%-owned',
  'Absorbed',
  'According',
  'Adopting',
  'Advanced',
  'Advancing',
  'Alfred',
  'Allied',
  'Annualized',
  'Anything'])

In [114]:
eding2 = [w for w in wsj if re.search('ed|ing$', w)]
len(eding2), eding[:10]

(1969,
 ['62%-owned',
  'Absorbed',
  'According',
  'Adopting',
  'Advanced',
  'Advancing',
  'Alfred',
  'Allied',
  'Annualized',
  'Anything'])

In [105]:
d = [w for w in eding if w not in eding2]
d

[]

In [101]:
edingdiff = [w for w in eding2 if w not in eding]
print(len(edingdiff))
edingdiff2 = [w for w in eding2 if not (w in eding)]
print(len(edingdiff2))
edingdiff3 = [w for w in eding2 if not w in eding]
print(len(edingdiff3))
print(edingdiff == edingdiff2, edingdiff2 == edingdiff3)

128
128
128
True True


In [103]:
edingdiff[:50]

['Biedermann',
 'Breeden',
 'Cathedral',
 'Cedric',
 'Confederation',
 'Credit',
 'Federal',
 'Federalist',
 'Federation',
 'Freddie',
 'Frederick',
 'Friedrichs',
 'Impediments',
 'Intermediate',
 'Kennedy',
 'Media',
 'Medical',
 'Medicine',
 'Mercedes',
 'Montedison',
 'Nederlanden',
 'Needham',
 'Proceeds',
 'Reddington',
 'Redevelopment',
 'Roederer',
 'Speedway',
 'Sweden',
 'Teddy',
 'Toledo',
 'Wednesday',
 'Wedtech',
 'acknowledge',
 'acknowledges',
 'agreed-upon',
 'allegedly',
 'beds',
 'buttoned-down',
 'closed-end',
 'comedies',
 'concede',
 'concedes',
 'credentials',
 'credibility',
 'credit',
 'creditor',
 'creditors',
 'credits',
 'creditworthiness',
 'deeds']

In [122]:
ed_inside = [w for w in wsj if re.search('ed', w)]
print('ed_inside: ', len(ed_inside))

ing_end = [w for w in wsj if re.search('ing$', w)]
print('end with ing: ', len(ing_end))

ed_inside:  1178
end with ing:  811


In [123]:
ed_ing_sum = list(set(ed_inside).union(set(ing_end)))
len(ed_ing_sum)

1969

In [124]:
ed_ing_sum == eding2

False

In [125]:
[w for w in ed_ing_sum if w not in eding2]

[]

In [126]:
[w for w in eding2 if w not in ed_ing_sum]

[]

In [127]:
ed_ing_sum.sort()
eding2.sort()
ed_ing_sum == eding2 

True

In [129]:
c = 'ㄱ'
ord(c), c, hex(ord(c))

(12593, 'ㄱ', '0x3131')

In [135]:
u = ord(c)

In [136]:
u % 16

1

In [139]:
x = '\u1100'
x

'ᄀ'

In [145]:
hex(ord('한'))

'0xd55c'

In [23]:
cv = ord('각')

basevalue = ord('가')
print(basevalue, hex(basevalue), cv)
jong = (cv - basevalue) % 28
jung = ((cv - basevalue)/28) % 21 if jong >= 28 else 0
cho = (((cv - basevalue)/28)/21)%19 if jung >= 21 else 0

cho, jung, jong 

44032 0xac00 44033


(0, 0, 1)

In [27]:
cv = ord('깎')

basevalue = ord('가')
print(basevalue, hex(basevalue), cv)
jong = (cv - basevalue) % 28
jung = int((cv - basevalue)/28) % 21 # if jong >= 28 else 0
cho = int(((cv - basevalue)/28)/21)%19 # if jung >= 21 else 0

cho, jung, jong 

44032 0xac00 44622


(1, 0, 2)

In [28]:
a, b, c = chr(cho+0x1100), chr(jung+0x1161), chr(jong+0x11A8 - 1)
a, b, c

('ᄁ', 'ᅡ', 'ᆩ')

In [29]:
a,b,c = ord(a) - 0x1100, ord(b) - 0x1161, ord(c) - 0x11A8 + 1
syllable = 0xAC00 + a*588 + b*28 + c 
print(syllable, f'0x{syllable:X}', chr(syllable))

44622 0xAE4E 깎


In [32]:
def syllable2jamo(kchar):
    """ conversion from 완성형 unicode to 조합형 unicode """
    cv = ord(kchar)

    basevalue = ord('가')
    print(basevalue, hex(basevalue), cv)
    jong = int(cv - basevalue) % 28
    jung = int((cv - basevalue)/28) % 21
    cho = int(((cv - basevalue)/28)/21)%19

    return chr(cho + 0x1100), chr(jung + 0x1161), chr(jong + 0x11A8 - 1)

In [39]:
def jamo2syllable(jamo):
    """ conversion from 조합형 unicode to 완성형 unicode """
    a,b,c = jamo
    a,b,c = ord(a) - 0x1100, ord(b) - 0x1161, ord(c) - 0x11A8 + 1
    syllable = 0xAC00 + a*588 + b*28 + c 
    print(syllable, f'0x{syllable:X}', chr(syllable))
    return chr(syllable)

In [40]:
syllable2jamo("깎")

44032 0xac00 44622


('ᄁ', 'ᅡ', 'ᆩ')

In [41]:
jamo2syllable(syllable2jamo("깎"))

44032 0xac00 44622
44622 0xAE4E 깎


'깎'

In [42]:
syllable2jamo('듕')

44032 0xac00 46293


('ᄃ', 'ᅲ', 'ᆼ')

In [43]:
jamo2syllable(syllable2jamo('듕'))

44032 0xac00 46293
46293 0xB4D5 듕


'듕'