## gensim 설치 - 문서 토픽 분석 패키지

In [1]:
!conda install -y gensim

Fetching package metadata ...........
Solving package specifications: .

Package plan for installation in environment C:\Python\Anaconda3:

The following NEW packages will be INSTALLED:

    bz2file:    0.98-py35_0      
    gensim:     1.0.1-np112py35_0
    smart_open: 1.5.2-py35_0     

bz2file-0.98-p   0% |                              | ETA:  --:--:--   0.00  B/s
bz2file-0.98-p 100% |###############################| ETA:  0:00:00   2.96 MB/s
bz2file-0.98-p 100% |###############################| Time: 0:00:00   2.96 MB/s

smart_open-1.5   0% |                              | ETA:  --:--:--   0.00  B/s
smart_open-1.5  33% |##########                     | Time: 0:00:00   1.49 MB/s
smart_open-1.5  66% |####################           | Time: 0:00:00 861.81 kB/s
smart_open-1.5  99% |############################## | Time: 0:00:00   1.14 MB/s
smart_open-1.5 100% |###############################| Time: 0:00:00   1.14 MB/s
smart_open-1.5 100% |###############################| Time: 0:00:00  

## 데이터 불러오기

In [2]:
import numpy

In [4]:
# The .npy file stores a pickled Python object (the next cell unwraps it into a
# sparse matrix), not a plain numeric array. NumPy >= 1.16.3 refuses to load
# such files unless allow_pickle=True is passed explicitly.
# SECURITY: unpickling can execute arbitrary code -- only load files you trust.
tdm = numpy.load('data/tdm_small.npy', allow_pickle=True)

In [5]:
# numpy.load returned a 0-d object array wrapping the sparse term-document
# matrix; .item() extracts the wrapped object.
# The isinstance guard keeps this cell idempotent: after the first run tdm is
# no longer an ndarray, so re-running the cell leaves it untouched instead of
# failing on the already-unwrapped matrix.
if isinstance(tdm, numpy.ndarray):
    tdm = tdm.item()
tdm

<5347x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 652672 stored elements in Compressed Sparse Column format>

In [7]:
# Vocabulary list: one term per line of the UTF-8 text file.
with open('data/words_small.txt', encoding='utf8') as f:
    raw_text = f.read()
words = raw_text.splitlines()

## gensim 포맷으로 바꾸기

In [8]:
from gensim.matutils import Sparse2Corpus



In [9]:
# tdm is indexed as tdm[document, word]; it is transposed before being wrapped
# so that each document ends up as a column of the matrix handed to gensim.
corpus = Sparse2Corpus(tdm.transpose())
corpus

<gensim.matutils.Sparse2Corpus at 0x1ca69c47588>

## Latent Dirichlet Allocation (LDA)

https://radimrehurek.com/gensim/models/ldamodel.html

In [10]:
from gensim.models.ldamodel import LdaModel

In [14]:
# Train a 100-topic LDA model on the corpus.
# - id2word maps each integer word id to its term so topics print as words.
# - random_state fixes the RNG seed; LDA training is stochastic, and without a
#   seed every run yields different topics (the outputs recorded in this
#   notebook came from an unseeded run, so a re-run will not match them).
# - passes is the number of full sweeps over the corpus.
lda = LdaModel(
    corpus=corpus,
    num_topics=100,
    id2word=dict(enumerate(words)),
    random_state=1234,
    passes=1,
)

## Topic 보기

In [15]:
# Ten highest-weighted (word, probability) pairs of topic 0.
lda.show_topic(topicid=0, topn=10)

[('에어컨', 0.020778029877119242),
 ('출시', 0.01955982822353812),
 ('사용', 0.014873255408248551),
 ('제품', 0.013929660595574586),
 ('시장', 0.013732166224622522),
 ('적용', 0.011306252269174073),
 ('듀얼', 0.0090425626860784168),
 ('LG전자', 0.0085730874981091772),
 ('기능', 0.0077263962462083835),
 ('최대', 0.0070436204325354202)]

In [16]:
# Ten highest-weighted (word, probability) pairs of topic 1.
lda.show_topic(topicid=1, topn=10)

[('카카오', 0.070929489316327532),
 ('게임', 0.037165353742280589),
 ('순위', 0.027688732851808964),
 ('서비스', 0.020154954986528265),
 ('모바일', 0.014552805155083601),
 ('다음', 0.012980214853364062),
 ('시장', 0.011336564265992707),
 ('출시', 0.010921361824026159),
 ('매출', 0.0084419785283912242),
 ('카카오톡', 0.0079374833297866636)]

## 문서의 topic 확인

### 문서 변환

In [13]:
# Bag-of-words for the first document as (word_id, count) pairs, one pair per
# vocabulary entry (zero counts included). The vocabulary size is read from
# the matrix instead of being hard-coded, so the cell keeps working if the
# term-document matrix is rebuilt with a different number of words.
doc = [(word_id, tdm[0, word_id]) for word_id in range(tdm.shape[1])]

### 문서에서 가장 많이 쓰인 단어들 보기

In [17]:
from operator import itemgetter

In [18]:
# Swap each word id in the document for its term, keeping the count.
doc_words = [(words[word_id], count) for word_id, count in doc]

In [19]:
# Ten most frequent terms in the document, highest count first.
sorted(doc_words, key=lambda pair: pair[1], reverse=True)[:10]

[('게임', 37),
 ('이벤트', 31),
 ('아이템', 26),
 ('진행', 16),
 ('제공', 13),
 ('접속', 9),
 ('최대', 8),
 ('동안', 8),
 ('지급', 8),
 ('이상', 7)]

### 문서의 topic

In [20]:
# Topic mixture of the document as (topic_id, weight) pairs.
lda.get_document_topics(bow=doc)

[(5, 0.14077814256210786),
 (20, 0.20857717468486581),
 (59, 0.46772685193729407),
 (77, 0.031050094667528505),
 (94, 0.15005129255355878)]

In [23]:
# Ten highest-weighted (word, probability) pairs of topic 59, the document's
# dominant topic in the mixture shown above.
lda.show_topic(topicid=59, topn=10)

[('게임', 0.04946021803885596),
 ('무료', 0.029199819315402799),
 ('부문', 0.021350715380775268),
 ('제공', 0.018586764889769014),
 ('모바일', 0.016061494815535825),
 ('이벤트', 0.01491255477251785),
 ('순위', 0.013730158928696678),
 ('출시', 0.012448429599020974),
 ('차지', 0.011912635547120576),
 ('사진', 0.011869286676601844)]

In [24]:
# Ten highest-weighted (word, probability) pairs of topic 20, the document's
# second-strongest topic in the mixture shown above.
lda.show_topic(topicid=20, topn=10)

[('게임', 0.052396336582468443),
 ('캐릭터', 0.021450543739872277),
 ('스킬', 0.019017876649161315),
 ('선수', 0.013486312555765604),
 ('출시', 0.011512043083127275),
 ('사용', 0.0112614023155017),
 ('아이템', 0.011223015402422386),
 ('진행', 0.010149880411513835),
 ('개선', 0.010140488129899025),
 ('플레이', 0.0092458781679301914)]

In [33]:
# Matrix dimensions: rows are documents, columns are vocabulary terms.
num = tdm.shape[0]
width = tdm.shape[1]
num

5347

In [34]:
# Second dimension of tdm.shape: the number of vocabulary terms (columns).
width

1000

In [35]:
from collections import Counter

In [36]:
total = Counter()  # dict-like accumulator: topic_id -> summed topic weight over documents

In [39]:
# Sum each topic's weight over every document in the corpus.
#
# The earlier version rebuilt a dense (word_id, count) list for every document
# with one tdm[n, i] lookup per vocabulary entry, i.e. num * width individual
# sparse-matrix element accesses, and was interrupted before finishing.
# Iterating `corpus` instead yields each document's non-zero (word_id, count)
# pairs directly from the sparse storage.
#
# total is reset here so that re-running the cell (for example after an
# interrupt) does not double-count, and the loop variable is named `bow` so
# the single-document `doc` inspected earlier is not overwritten.
total = Counter()
for bow in corpus:
    for topic, ratio in lda.get_document_topics(bow):
        total[topic] += ratio

KeyboardInterrupt: 

In [None]:
# Every (topic_id, summed_weight) pair held in total, ordered from the largest
# summed weight to the smallest.
total.most_common()

## LDA 모델 저장하기

In [26]:
# Persist the trained model to disk under the given file name.
lda.save(fname='20170429.lda')

## LDA 모델 불러오기

In [27]:
# Restore the model from the file written by the save cell.
lda = LdaModel.load(fname='20170429.lda')

In [29]:
# Ten highest-weighted (word, probability) pairs of topic 10, queried on the
# reloaded model.
lda.show_topic(topicid=10, topn=10)

[('가격', 0.032306178192544187),
 ('부품', 0.031942432416610947),
 ('시장', 0.027343496275689546),
 ('제품', 0.025369855835817547),
 ('생산', 0.020496569548535407),
 ('자동차', 0.018622724922754536),
 ('미국', 0.01517359984021621),
 ('출시', 0.013266175084148233),
 ('산업', 0.01314647310591927),
 ('판매', 0.012035980740608198)]