# 3.2 지도 학습 기반 형태소 분석

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

FOLDERNAME = 'ML/KRembedding/'

LIB_PATH = 'lib'

%cd drive/My\ Drive/$FOLDERNAME/$LIB_PATH

Mounted at /content/drive
/content/drive/My Drive/ML/KRembedding/lib


In [2]:
# Install Mecab by running the mecab.sh install script from the KoNLPy GitHub repository.
# NOTE(review): this pipes a script from the `master` branch straight into bash;
# consider pinning the URL to a specific commit for reproducibility.

!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Installing automake (A dependency for mecab-ko)
Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:11 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Package

## 3.2.1 KoNLPy 사용법

In [3]:
from konlpy.tag import Mecab
# Create a Mecab tagger and split the unspaced sample sentence into morphemes.
tokenizer = Mecab()
tokenizer.morphs('아버지가방에들어가신다')

['아버지', '가', '방', '에', '들어가', '신다']

In [4]:
# Same sentence, but returning (morpheme, POS tag) pairs instead of bare morphemes.
tokenizer.pos('아버지가방에들어가신다')

[('아버지', 'NNG'),
 ('가', 'JKS'),
 ('방', 'NNG'),
 ('에', 'JKB'),
 ('들어가', 'VV'),
 ('신다', 'EP+EC')]

In [54]:
from konlpy.tag import Okt, Komoran, Mecab, Hannanum, Kkma

def get_tokenizer(t_name):
  """Return a new KoNLPy tagger instance selected by name.

  Recognized names are 'komoran', 'okt', 'mecab', 'hannanum' and 'kkma'
  (exact match). Any other value falls back to a Mecab instance.
  """
  known_taggers = (
    ('komoran', Komoran),
    ('okt', Okt),
    ('mecab', Mecab),
    ('hannanum', Hannanum),
    ('kkma', Kkma),
  )
  # Compare in the same order as the names are listed; only the matching
  # class is instantiated.
  for name, tagger_cls in known_taggers:
    if t_name == name:
      return tagger_cls()
  # Unknown name: default to Mecab.
  return Mecab()

# Try Komoran on the same sentence: morphemes first, then (morpheme, POS tag) pairs.
# Note: this rebinds the shared `tokenizer` name.
tokenizer = get_tokenizer('komoran')
print(tokenizer.morphs('아버지가방에들어가신다'))
print(tokenizer.pos('아버지가방에들어가신다'))

['아버지', '가방', '에', '들어가', '시', 'ㄴ다']
[('아버지', 'NNG'), ('가방', 'NNP'), ('에', 'JKB'), ('들어가', 'VV'), ('시', 'EP'), ('ㄴ다', 'EC')]


## 3.2.3 Khaiii 사용법

In [31]:
# Khaiii 설치

%cd /content
!git clone https://github.com/kakao/khaiii.git
!pip install cmake
!mkdir build
!cd build && cmake /content/khaiii
!cd /content/build/ && make all
!cd /content/build/ && make resource
!cd /content/build && make install
!cd /content/build && make package_python
!pip install /content/build/package_python
%cd drive/My\ Drive/$FOLDERNAME/$LIB_PATH

/content
Cloning into 'khaiii'...
remote: Enumerating objects: 1009, done.[K
remote: Counting objects: 100% (132/132), done.[K
remote: Compressing objects: 100% (102/102), done.[K
remote: Total 1009 (delta 41), reused 69 (delta 24), pack-reused 877[K
Receiving objects: 100% (1009/1009), 33.06 MiB | 25.96 MiB/s, done.
Resolving deltas: 100% (399/399), done.
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Performing Test fma_compiles
-- Performi

In [32]:
from khaiii import KhaiiiApi
# Create the Khaiii analyzer; this rebinds the shared `tokenizer` name.
tokenizer = KhaiiiApi()

In [41]:
# Analyze the sample sentence; `data` is iterated as words, each exposing `.morphs`.
data = tokenizer.analyze('아버지가방에들어가신다')
# Each morpheme renders as 'lexeme/TAG' via str().
print([str(m) for word in data for m in word.morphs])
# Take the surface form from `m.lex` rather than splitting the 'lexeme/TAG'
# string on '/', which yields '' when the morpheme itself is a slash.
print([m.lex for word in data for m in word.morphs])

['아버지/NNG', '가/JKS', '방/NNG', '에/JKB', '들어가/VV', '시/EP', 'ㄴ다/EC']
['아버지', '가', '방', '에', '들어가', '시', 'ㄴ다']


## 3.2.4 은전한닢에 사용자 사전 추가하기

In [43]:
# Before adding a user dictionary: '가우스전자' is split into two tokens.

tokenizer = Mecab()
tokenizer.morphs('가우스전자 텔레비전 정말 좋네요')

['가우스', '전자', '텔레비전', '정말', '좋', '네요']

In [45]:
# Create the user dictionary: move into the user-dic folder of the mecab-ko-dic tree under /tmp.

%cd /tmp/mecab-ko-dic-2.1.1-20180720/user-dic

/tmp/mecab-ko-dic-2.1.1-20180720/user-dic


In [46]:
# Write the user-dictionary entries to nnp.csv in the current directory.
#
# Done non-interactively so the cell works under "Run All"; the previous
# `!cat > nnp.csv` waited for typed stdin and had to be interrupted by hand.
# NOTE(review): the F/T field presumably marks whether the word ends in a
# final consonant — confirm against the mecab-ko-dic user dictionary docs.
USER_NNP_ENTRIES = [
  '가우스전자,,,,NNP,*,F,가우스전자,*,*,*,*,*',
  '서울대입구역,,,,NNP,*,T,서울대입구역,*,*,*,*,*',
]

with open('nnp.csv', 'w', encoding='utf-8') as nnp_file:
  # One entry per line, with a trailing newline after the last entry.
  nnp_file.write('\n'.join(USER_NNP_ENTRIES) + '\n')


In [47]:
# Compile the user dictionary CSVs with the bundled add-userdic.sh script,
# then run `make install` from the parent directory to install the rebuilt dictionary.
# The relative paths assume the cwd is still the user-dic folder.
!bash ../tools/add-userdic.sh
!cd ../ && make install
# Go back to the project's library folder on Drive.
%cd /content/drive/My\ Drive/$FOLDERNAME/$LIB_PATH

generating userdic...
nnp.csv
/tmp/mecab-ko-dic-2.1.1-20180720/tools/../model.def is not a binary model. reopen it as text mode...
reading /tmp/mecab-ko-dic-2.1.1-20180720/tools/../user-dic/nnp.csv ... 
done!
person.csv
/tmp/mecab-ko-dic-2.1.1-20180720/tools/../model.def is not a binary model. reopen it as text mode...
reading /tmp/mecab-ko-dic-2.1.1-20180720/tools/../user-dic/person.csv ... 
done!
place.csv
/tmp/mecab-ko-dic-2.1.1-20180720/tools/../model.def is not a binary model. reopen it as text mode...
reading /tmp/mecab-ko-dic-2.1.1-20180720/tools/../user-dic/place.csv ... 
done!
test -z "model.bin matrix.bin char.bin sys.dic unk.dic" || rm -f model.bin matrix.bin char.bin sys.dic unk.dic
/usr/local/libexec/mecab/mecab-dict-index -d . -o . -f UTF-8 -t UTF-8
reading ./unk.def ... 13
emitting double-array: 100% |###########################################| 
reading ./CoinedWord.csv ... 148
reading ./user-nnp.csv ... 2
reading ./NNBC.csv ... 677
reading ./Person.csv ... 196459
readi

In [51]:
# After adding the user dictionary, the sentence is analyzed as intended
# ('가우스전자' stays a single token). A fresh Mecab() instance is created first.

tokenizer = Mecab()
tokenizer.morphs('가우스전자 텔레비전 정말 좋네요')

['가우스전자', '텔레비전', '정말', '좋', '네요']