# Reference
- https://www.lucypark.kr/courses/2015-ba/text-mining.html#topic-modeling

# Load Data

In [1]:
from konlpy.corpus import kobill
file_ids = kobill.fileids()
file_ids

['1809890.txt',
 '1809891.txt',
 '1809892.txt',
 '1809893.txt',
 '1809894.txt',
 '1809895.txt',
 '1809896.txt',
 '1809897.txt',
 '1809898.txt',
 '1809899.txt']

In [2]:
doc = " ".join([kobill.open(i).read() for i in file_ids])

In [3]:
doc

'지방공무원법 일부개정법률안\n\n(정의화의원 대표발의 )\n\n 의 안\n 번 호\n\n9890\n\n발의연월일 : 2010.  11.  12.  \n\n발  의  자 : 정의화․이명수․김을동 \n\n이사철․여상규․안규백\n\n황영철․박영아․김정훈\n\n김학송 의원(10인)\n\n제안이유 및 주요내용\n\n  초등학교 저학년의 경우에도 부모의 따뜻한 사랑과 보살핌이 필요\n\n한 나이이나, 현재 공무원이 자녀를 양육하기 위하여 육아휴직을 할 \n\n수 있는 자녀의 나이는 만 6세 이하로 되어 있어 초등학교 저학년인 \n\n자녀를 돌보기 위해서는 해당 부모님은 일자리를 그만 두어야 하고 \n\n이는 곧 출산의욕을 저하시키는 문제로 이어질 수 있을 것임.\n\n  따라서 육아휴직이 가능한 자녀의 연령을 만 8세 이하로 개정하려\n\n는 것임(안 제63조제2항제4호).\n\n- 1 -\n\n\x0c법률  제        호\n\n지방공무원법 일부개정법률안\n\n지방공무원법 일부를 다음과 같이 개정한다.\n\n제63조제2항제4호 중 “만 6세 이하의 초등학교 취학 전 자녀를”을 “만 \n\n8세 이하(취학 중인 경우에는 초등학교 2학년 이하를 말한다)의 자녀를”\n\n로 한다.\n\n부      칙\n\n이 법은 공포한 날부터 시행한다.\n\n- 3 -\n\n\x0c신 ·구조문대비표\n\n현      행\n\n개   정   안\n\n제63조(휴직) ① (생  략)\n\n제63조(휴직) ① (현행과 같음)\n\n  ② 공무원이 다음 각 호의 어\n\n  ② -------------------------\n\n느 하나에 해당하는 사유로 휴\n\n----------------------------\n\n직을 원하면 임용권자는 휴직\n\n----------------------------\n\n을 명할 수 있다. 다만, 제4호\n\n-------------.---------------\n\n의 경우에는 대통령령으로 정\n\n---------------------------

In [4]:
type(doc)

str

In [5]:
len(doc)

46324

# Tokenization

In [6]:
from konlpy.tag import Twitter
twitter = Twitter()

In [7]:
def pos_tag(doc):
    word_pos_list = []
    for word, pos in twitter.pos(doc):
        word_pos = word+'/'+pos
        word_pos_list.append(word_pos)
    return word_pos_list

In [8]:
pos_tag('정부')

['정부/Noun']

In [9]:
tokens = pos_tag(doc)

In [10]:
tokens

['지방공무원법/Noun',
 '일부/Noun',
 '개정/Noun',
 '법률/Noun',
 '안/Noun',
 '(/Punctuation',
 '정의화/Noun',
 '의원/Noun',
 '대표/Noun',
 '발의/Noun',
 ')/Punctuation',
 '의/Noun',
 '안/Noun',
 '번/Noun',
 '호/Noun',
 '9890/Number',
 '발의/Noun',
 '연월일/Noun',
 ':/Punctuation',
 '2010/Number',
 './Punctuation',
 '11/Number',
 './Punctuation',
 '12/Number',
 './Punctuation',
 '발/Noun',
 '의/Noun',
 '자/Noun',
 ':/Punctuation',
 '정의화/Noun',
 '․/Foreign',
 '이명수/Noun',
 '․/Foreign',
 '김을동/Noun',
 '이사철/Noun',
 '․/Foreign',
 '여상규/Noun',
 '․/Foreign',
 '안규백/Noun',
 '황영철/Noun',
 '․/Foreign',
 '박영아/Noun',
 '․/Foreign',
 '김정훈/Noun',
 '김학송/Noun',
 '의원/Noun',
 '(/Punctuation',
 '10/Number',
 '인/Noun',
 ')/Punctuation',
 '제안/Noun',
 '이유/Noun',
 '및/Noun',
 '주요/Noun',
 '내용/Noun',
 '초등학교/Noun',
 '저학년/Noun',
 '의/Josa',
 '경우/Noun',
 '에도/Josa',
 '부모/Noun',
 '의/Josa',
 '따뜻한/Adjective',
 '사랑/Noun',
 '과/Josa',
 '보살핌/Verb',
 '이/Eomi',
 '필요/Noun',
 '한/Verb',
 '나이/Noun',
 '이나/Josa',
 ',/Punctuation',
 '현재/Noun',
 '공무원/Noun',
 '이/Josa',
 '자

In [11]:
len(tokens)

18270

In [12]:
len(set(tokens))

1720

# Word Vectorization

In [13]:
from gensim.models import word2vec

## Train Vectorizer

In [14]:
wv_model = word2vec.Word2Vec([tokens])

In [15]:
wv_model.vector_size

100

In [16]:
wv_model.wv.vocab

{'"/Punctuation': <gensim.models.keyedvectors.Vocab at 0x1334b0e10>,
 '%/Punctuation': <gensim.models.keyedvectors.Vocab at 0x1334a6ba8>,
 '(/Punctuation': <gensim.models.keyedvectors.Vocab at 0x133494358>,
 ')./Punctuation': <gensim.models.keyedvectors.Vocab at 0x1334a0588>,
 ')/Punctuation': <gensim.models.keyedvectors.Vocab at 0x133494518>,
 '+/Punctuation': <gensim.models.keyedvectors.Vocab at 0x1334a93c8>,
 ',/Punctuation': <gensim.models.keyedvectors.Vocab at 0x133494c18>,
 '----------------------------/Punctuation': <gensim.models.keyedvectors.Vocab at 0x1334a30b8>,
 '---------------------------/Punctuation': <gensim.models.keyedvectors.Vocab at 0x1334a3518>,
 '------------------------/Punctuation': <gensim.models.keyedvectors.Vocab at 0x1334b7d68>,
 '---/Punctuation': <gensim.models.keyedvectors.Vocab at 0x1334b7e10>,
 '-/Punctuation': <gensim.models.keyedvectors.Vocab at 0x1334a05c0>,
 './Punctuation': <gensim.models.keyedvectors.Vocab at 0x1334946a0>,
 '//Punctuation': <gensi

## Vectorize given word
- The word should be inside vocabulary

In [17]:
wv_model[pos_tag('정부')]

array([[ 0.27367461,  0.03903432, -0.15697896, -0.05995852, -0.20556282,
         0.29493096, -0.12461106, -0.03138593,  0.15012996,  0.21005824,
         0.08833446, -0.0731115 , -0.22539723, -0.10171235,  0.1229942 ,
         0.18205416, -0.35497862, -0.2029646 ,  0.22256052,  0.15792799,
        -0.07422631,  0.1780657 ,  0.17028348, -0.49205971,  0.03582783,
         0.13593818, -0.29048625, -0.09045641,  0.34968069, -0.15945823,
        -0.21288867, -0.27096662,  0.20804042,  0.08516154,  0.23865591,
        -0.22121084, -0.07074531,  0.29457942,  0.060423  ,  0.14972797,
         0.25405997,  0.02219546,  0.24004619,  0.07117997,  0.03839031,
         0.0894598 , -0.20990281, -0.0103013 ,  0.00508739,  0.12543575,
         0.15545569, -0.04804167,  0.04111022,  0.08668339,  0.01405963,
         0.12664959, -0.04975626, -0.09596296, -0.21169417,  0.26844141,
         0.08644313,  0.16995083, -0.16674483,  0.08614352,  0.06905571,
        -0.28268105,  0.11360523, -0.06289653,  0.1

## (Optional) Normalize word vector
### `init_sims(replace=False)`
####  - If replace is set, forget the original vectors and only keep the normalized ones = saves lots of memory!

In [18]:
wv_model.init_sims(replace=True)

## Save model

In [19]:
wv_model.save('./word2vec.model')

## Load Model

In [20]:
wv_model = word2vec.Word2Vec.load('./word2vec.model')

In [21]:
wv_model.most_similar(pos_tag('정부'))

[('를/Josa', 0.999872088432312),
 ('하는/Verb', 0.999869704246521),
 ('의/Josa', 0.9998652935028076),
 ('◦/Foreign', 0.9998617172241211),
 ('을/Josa', 0.9998599290847778),
 ('에/Josa', 0.9998594522476196),
 ('은/Josa', 0.9998562932014465),
 ('부대/Noun', 0.9998561143875122),
 ('으로/Josa', 0.9998536109924316),
 ('이/Josa', 0.9998522996902466)]

In [22]:
wv_model.most_similar(pos_tag('초등학교'))

[('에서/Josa', 0.9995263814926147),
 ('의/Josa', 0.9995156526565552),
 ('로/Josa', 0.999476432800293),
 ('행정/Noun', 0.9994755983352661),
 ('·/Foreign', 0.9994714260101318),
 ('을/Josa', 0.9994683265686035),
 ('는/Josa', 0.9994608163833618),
 ('내용/Noun', 0.9994513988494873),
 ('./Punctuation', 0.9994490146636963),
 ('사항/Noun', 0.9994447827339172)]

# Hmm.. We need to filter stop words

In [23]:
def pos_tag(doc):
    word_pos_list = []
    for word, pos in twitter.pos(doc):
        word_pos = word+'/'+pos
        
        if len(word) > 1 and pos == 'Noun':
            word_pos_list.append(word_pos)
            
    return word_pos_list

In [24]:
tokens = pos_tag(doc)

In [25]:
tokens

['지방공무원법/Noun',
 '일부/Noun',
 '개정/Noun',
 '법률/Noun',
 '정의화/Noun',
 '의원/Noun',
 '대표/Noun',
 '발의/Noun',
 '발의/Noun',
 '연월일/Noun',
 '정의화/Noun',
 '이명수/Noun',
 '김을동/Noun',
 '이사철/Noun',
 '여상규/Noun',
 '안규백/Noun',
 '황영철/Noun',
 '박영아/Noun',
 '김정훈/Noun',
 '김학송/Noun',
 '의원/Noun',
 '제안/Noun',
 '이유/Noun',
 '주요/Noun',
 '내용/Noun',
 '초등학교/Noun',
 '저학년/Noun',
 '경우/Noun',
 '부모/Noun',
 '사랑/Noun',
 '필요/Noun',
 '나이/Noun',
 '현재/Noun',
 '공무원/Noun',
 '자녀/Noun',
 '양육/Noun',
 '육아휴직/Noun',
 '자녀/Noun',
 '나이/Noun',
 '이하/Noun',
 '초등학교/Noun',
 '저학년/Noun',
 '자녀/Noun',
 '해당/Noun',
 '부모님/Noun',
 '일자리/Noun',
 '출산/Noun',
 '의욕/Noun',
 '저하/Noun',
 '문제/Noun',
 '것임/Noun',
 '따라서/Noun',
 '육아휴직/Noun',
 '자녀/Noun',
 '연령/Noun',
 '이하/Noun',
 '개정/Noun',
 '것임/Noun',
 '조제/Noun',
 '항제/Noun',
 '법률/Noun',
 '지방공무원법/Noun',
 '일부/Noun',
 '개정/Noun',
 '법률/Noun',
 '지방공무원법/Noun',
 '일부/Noun',
 '다음/Noun',
 '개정/Noun',
 '조제/Noun',
 '항제/Noun',
 '이하/Noun',
 '초등학교/Noun',
 '취학/Noun',
 '자녀/Noun',
 '이하/Noun',
 '취학/Noun',
 '중인/Noun',
 '경우/Noun',
 '초등학교/Noun'

In [26]:
len(tokens)

5650

In [27]:
wv_model = word2vec.Word2Vec([tokens])

In [28]:
wv_model.init_sims(replace=True)

In [29]:
wv_model.save('./word2vec.model')

In [30]:
wv_model.most_similar(pos_tag('정부'))

[('결혼/Noun', 0.995997428894043),
 ('중개/Noun', 0.9958048462867737),
 ('중개업/Noun', 0.9955652952194214),
 ('국제/Noun', 0.9955242872238159),
 ('확인/Noun', 0.9954255819320679),
 ('현행/Noun', 0.9953224658966064),
 ('조제/Noun', 0.9951662421226501),
 ('사항/Noun', 0.9951329231262207),
 ('신상/Noun', 0.9950308799743652),
 ('경우/Noun', 0.9949830770492554)]

In [31]:
wv_model.most_similar(pos_tag('초등학교'))

[('행정/Noun', 0.9700460433959961),
 ('추계/Noun', 0.9684064388275146),
 ('장관/Noun', 0.968134343624115),
 ('상대방/Noun', 0.9677034616470337),
 ('결혼/Noun', 0.9674206376075745),
 ('중개/Noun', 0.9665025472640991),
 ('조제/Noun', 0.9658188223838806),
 ('해역/Noun', 0.9657609462738037),
 ('사무소/Noun', 0.9656451940536499),
 ('사항/Noun', 0.9655430316925049)]

# If you have time,
- Try with bigger data
    - [Sejong Corpus](https://ithub.korean.go.kr/user/corpus/corpusSearchManager.do)
    - [KAIST Corpus](http://semanticweb.kaist.ac.kr/home/index.php/KAIST_Corpus)
    - [Ulsan Univ. Corpus](http://nlplab.ulsan.ac.kr/doku.php?id=ucorpus)]
    - Wikipedia Dump [[link](https://dumps.wikimedia.org/kowiki/)] [[Extractor](https://github.com/j-min/WikiExtractor_To_the_one_text)]
    - NamuWiki Dump [[link](https://namu.wiki/w/%EB%82%98%EB%AC%B4%EC%9C%84%ED%82%A4:%EB%8D%B0%EC%9D%B4%ED%84%B0%EB%B2%A0%EC%9D%B4%EC%8A%A4%20%EB%8D%A4%ED%94%84)] [[Extractor](https://github.com/j-min/Easy-Namuwiki-Extractor)]


- Build your own stop words
    - https://github.com/6/stopwords-json/blob/master/dist/ko.json
    

- Embedding visualization with tensorboard
    - https://gist.github.com/lampts/026a4d6400b1efac9a13a3296f16e655
    - http://visionigniter.blogspot.kr/2017/03/word2vec-visualization-wtensorflow.html

# FastText
- https://github.com/facebookresearch/fastText
- Python Wrapper: https://github.com/salestock/fastText.py
- Highly optimized Word-vectorizer (Word and subword level) + Text Classifier

## FastText provides a wide range of pretrained word vectors (trained in WikiPedia)
- https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

## However, I prefer Kyubyong Park's pretrained word vectors (better tokenized)
- https://github.com/Kyubyong/wordvectors

### Take home (Relatively big text)
- I used [text8](http://mattmahoney.net/dc/textdata): English wikipedia

In [32]:
import fasttext

In [33]:
# Skipgram model
model = fasttext.skipgram(
    './text8.txt', # corpus path
    'model') # output model name
print(model.words) # list of words in dictionary

ValueError: fastText: cannot load ./text8.txt

In [None]:
print(model['king']) # get the vector of the word 'king'

## Load pretrained model

In [None]:
model = fasttext.load_model('model.bin')
print(model.words) # list of words in dictionary
print(model['king']) # get the vector of the word 'king'

## Classifier

In [None]:
classifier = fasttext.supervised('data.train.txt', 'model')

## Recently,  fasttext has been ported to gensim

- https://radimrehurek.com/gensim/models/wrappers/fasttext.html