# 한국어 형태소 분석기 - SOYNLP
    - 튜토리얼 참고 : https://github.com/lovit/soynlp
    - https://datascienceschool.net/03%20machine%20learning/03.01.04%20soynlp.html
    - https://velog.io/@seunghoking/NLP-%ED%95%9C%EA%B5%AD%EC%96%B4-%ED%98%95%ED%83%9C%EC%86%8C-%EB%B6%84%EC%84%9D%EA%B8%B0-%EC%98%A4%ED%94%88%EC%86%8C%EC%8A%A4-%EC%A0%95%EB%A6%AC%ED%95%B4%EB%93%9C%EB%A6%B4%EA%B2%8C%EC%9A%94

In [1]:
# Import soynlp and display the installed version so the notebook's results
# can be tied to a specific release.
import soynlp
soynlp.__version__

'0.0.493'

In [2]:
# Download the sample corpus.
# Note: soynlp works less well when the corpus to analyze is small.
import os
import urllib.request

# urlretrieve does not create missing parent directories; make sure the
# target folder exists so the download works on a fresh checkout.
os.makedirs("data", exist_ok=True)
urllib.request.urlretrieve("https://raw.githubusercontent.com/lovit/soynlp/master/tutorials/2016-10-20.txt", filename="data/2016-10-20.txt")

('data/2016-10-20.txt', <http.client.HTTPMessage at 0x2245504e760>)

- SOYNLP에서 사용될 단어 만들기
    - joblib을 활용해 저장해 두고, 이후 불러와서 사용하기

In [4]:
# Split the training data into multiple documents by wrapping the downloaded
# file in a corpus object, then show how many entries it contains.
from soynlp import DoublespaceLineCorpus

corpus_path = "data/2016-10-20.txt"
corpus = DoublespaceLineCorpus(corpus_path)
len(corpus)

30091

In [5]:
# Compute the word score table over the whole corpus (this is the training step).
from soynlp.word import WordExtractor

word_extractor = WordExtractor()
# Train on the corpus first; extract() is then called to obtain the per-word
# score table that later cells read via word_score_table.items().
word_extractor.train(corpus)
word_score_table = word_extractor.extract()

training was done. used memory 0.752 Gb
all cohesion probabilities was computed. # words = 223348
all branching entropies was computed # words = 361598
all accessor variety was computed # words = 361598


In [6]:
from soynlp.tokenizer import LTokenizer

# soynlp builds its word dictionary by learning from the corpus, whereas
# English has ready-made resources such as WordNet.
# Each word is scored by its forward cohesion taken from the trained table.
scores = {
    word: stats.cohesion_forward
    for word, stats in word_score_table.items()
}
l_tokenizer = LTokenizer(scores=scores)

# flatten=False keeps the two parts of each token paired together in a tuple.
sample_sentence = "국제사회와 우리의 노력들로 범죄를 척결하자"
l_tokenizer.tokenize(sample_sentence, flatten=False)

[('국제사회', '와'), ('우리', '의'), ('노력', '들로'), ('범죄', '를'), ('척결', '하자')]

In [7]:
import joblib

# Persist the word -> cohesion score dictionary so it can be loaded again
# later with joblib.
scores_path = 'data/scores.pkl'
joblib.dump(scores, scores_path)

['data/scores.pkl']

- 명사 추출

In [8]:
from soynlp.noun import LRNounExtractor_v2

# Train the noun extractor on the corpus and extract nouns in one call.
# verbose=True makes the extractor print its progress log.
noun_extractor = LRNounExtractor_v2(verbose=True)
# The result is used below as a mapping (nouns.items()) from noun to an
# indexable per-noun result.
nouns = noun_extractor.train_extract(corpus)

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=3929, neg=2321, common=107
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 403896 from 30091 sents. mem=0.822 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=4434442, mem=1.454 Gb
[Noun Extractor] batch prediction was completed for 119705 words
[Noun Extractor] checked compounds. discovered 70639 compounds
[Noun Extractor] postprocessing detaching_features : 109312 -> 92205
[Noun Extractor] postprocessing ignore_features : 92205 -> 91999
[Noun Extractor] postprocessing ignore_NJ : 91999 -> 90643
[Noun Extractor] 90643 nouns (70639 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.573 Gb                    
[Noun Extractor] 76.63 % eojeols are covered


In [9]:
# Keep only nouns longer than one character, mapping each noun to the first
# field of its extractor result, then persist the mapping with joblib.
# NOTE(review): index 0 may be the frequency field rather than the noun
# score in soynlp's per-noun result — confirm that this is the intended value
# given the name `noun_scores`.
noun_scores = {
    noun: stats[0]
    for noun, stats in nouns.items()
    if len(noun) > 1
}
joblib.dump(noun_scores, 'data/noun_scores.pkl')

['data/noun_scores.pkl']