In [34]:
#출처: https://wikidocs.net/30708
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [35]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
print('샘플의 수 :', len(documents))

샘플의 수 : 11314


In [36]:
documents[1]
#뉴스그룹 데이터에는 특수문자가 포함된 다수의 영어 문장으로 구성되어 있고 이런 형식의 샘플이 총 11,314개 존재함

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [37]:
print(dataset.target_names)
#본래의 카테고리

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [38]:
#기본적인 아이디어로 텍스트 전처리(구두점, 숫자, 특수 문자 제거)
news_df = pd.DataFrame({'document':documents})
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

In [39]:
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [40]:
#NLTK로부터 불용어 받아오기
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split()) #토큰화
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
#불용어 제거

In [41]:
print(tokenized_doc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


In [42]:
tokenized_doc[:5]

0    [well, sure, story, seem, biased, disagree, st...
1    [yeah, expect, people, read, actually, accept,...
2    [although, realize, principle, strongest, poin...
3    [notwithstanding, legitimate, fuss, proposal, ...
4    [well, change, scoring, playoff, pool, unfortu...
Name: clean_doc, dtype: object

In [43]:
#정수 인코딩을 동시에 단어의 빈도수를 기록
#word_id 정수인코딩된 값, word_frequency 단어의 빈도수

from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[1]) #수행된 결과에서 두번째 뉴스 출력(첫번째 인덱스 0)
#(66,2)는 정수 인코딩이 66으로 할당된 단어가 두번째 뉴스에서 두번 등장하였음을 의미

[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]


In [44]:
#정수 인코딩 전 어떤 단어였는지 파악해보기
print(dictionary[66])

faith


In [45]:
#총 학습된 단어의 개수를 알아뵈 위해 dictionary의 길이 파악

len(dictionary)

64281

In [46]:
#LDA 모델 훈련시키기

import gensim
NUM_TOPICS = 20 #k=20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)
#총 20개의 토픽이 0-19번으로 할당
#각 단어 앞에 붙은 수치는 단어의 해당 토픽에 대한 기여도
#passes는 알고리즘 동작 횟수: 알고리즘이 결정하는 토픽 값이 적절히 수렴 가능한 충분히 적당한 횟수를 주면 됨
#num_words=4로 총 4개의 단어만 출력

(0, '0.023*"myers" + 0.009*"allocation" + 0.008*"mormon" + 0.007*"obfuscate"')
(1, '0.016*"picture" + 0.012*"pope" + 0.011*"music" + 0.010*"sleeve"')
(2, '0.014*"chip" + 0.013*"encryption" + 0.011*"keys" + 0.010*"system"')
(3, '0.012*"government" + 0.012*"would" + 0.011*"israel" + 0.010*"people"')
(4, '0.014*"price" + 0.010*"sale" + 0.009*"card" + 0.009*"monitor"')
(5, '0.021*"insurance" + 0.012*"canada" + 0.011*"nist" + 0.009*"japanese"')
(6, '0.013*"said" + 0.012*"people" + 0.009*"armenian" + 0.008*"armenians"')
(7, '0.019*"would" + 0.015*"like" + 0.013*"know" + 0.011*"think"')
(8, '0.049*"output" + 0.021*"printf" + 0.020*"char" + 0.019*"null"')
(9, '0.015*"mail" + 0.013*"please" + 0.012*"list" + 0.012*"send"')
(10, '0.026*"food" + 0.019*"tobacco" + 0.019*"drugs" + 0.013*"drug"')
(11, '0.035*"period" + 0.017*"power" + 0.016*"play" + 0.013*"scorer"')
(12, '0.010*"people" + 0.009*"would" + 0.008*"jesus" + 0.007*"believe"')
(13, '0.013*"hockey" + 0.012*"team" + 0.009*"league" + 0.009*"c

In [47]:
print(ldamodel.print_topics())

[(0, '0.023*"myers" + 0.009*"allocation" + 0.008*"mormon" + 0.007*"obfuscate" + 0.007*"mormons" + 0.006*"pseudo" + 0.006*"latin" + 0.006*"wwii" + 0.005*"finland" + 0.005*"john"'), (1, '0.016*"picture" + 0.012*"pope" + 0.011*"music" + 0.010*"sleeve" + 0.006*"chronic" + 0.006*"deaf" + 0.005*"ctrl" + 0.005*"males" + 0.005*"creed" + 0.005*"corn"'), (2, '0.014*"chip" + 0.013*"encryption" + 0.011*"keys" + 0.010*"system" + 0.010*"clipper" + 0.009*"security" + 0.009*"public" + 0.008*"privacy" + 0.008*"government" + 0.007*"data"'), (3, '0.012*"government" + 0.012*"would" + 0.011*"israel" + 0.010*"people" + 0.009*"state" + 0.008*"right" + 0.007*"rights" + 0.007*"israeli" + 0.007*"guns" + 0.006*"jews"'), (4, '0.014*"price" + 0.010*"sale" + 0.009*"card" + 0.009*"monitor" + 0.009*"video" + 0.009*"shipping" + 0.008*"offer" + 0.008*"thanks" + 0.008*"please" + 0.007*"condition"'), (5, '0.021*"insurance" + 0.012*"canada" + 0.011*"nist" + 0.009*"japanese" + 0.008*"ncsl" + 0.008*"outlets" + 0.008*"gaza" 

In [48]:
#LDA시각화
!pip install pyLDAvis

Looking in indexes: http://ftp.daumkakao.com/pypi/simple


In [49]:
import  pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis =  pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

In [50]:
#문서 별 토픽 분포 확인
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5: #i번째 문서의 토픽 비율까지만 출력
        break
    print(i,'번째 문서의 topic 비율은', topic_list)

0 번째 문서의 topic 비율은 [(0, 0.025740173), (3, 0.25443542), (6, 0.13834125), (9, 0.08175367), (12, 0.26760033), (15, 0.22080858)]
1 번째 문서의 topic 비율은 [(1, 0.10708604), (4, 0.05974015), (7, 0.40644315), (12, 0.38262224), (17, 0.02435584)]
2 번째 문서의 topic 비율은 [(3, 0.53313965), (7, 0.45232904)]
3 번째 문서의 topic 비율은 [(2, 0.4164065), (4, 0.07314018), (7, 0.28894526), (12, 0.099490434), (13, 0.055747263), (14, 0.026447125), (18, 0.029633317)]
4 번째 문서의 topic 비율은 [(4, 0.23982476), (7, 0.31152332), (13, 0.1238706), (17, 0.21505065), (18, 0.0819406)]


In [53]:
#데이터 프레임 형식으로 출력
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()
    
    #몇 번째 문서인지 의미하는 무서 번호와 해당 토픽 비중을 한 줄씩 추출
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        #각 문서에 대해 비중이 높은 토픽 순으로 토픽 정렬
        #ex. 2번토픽:50%, 5번토픽:10%, 7번토픽:40% =>2번토픽:50%, 7번토픽:40%, 5번토픽:10% 
        
        #모든 문서에 대해 각각 하단코드 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #몇 번 토픽인지와 비중을 나눠서 저장
            if j==0:#정렬 후 이므로 0일 때 비중이 제일 높음
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                #가장 비중이 높은 토픽과, 그 토픽의 비중과, 전체 토픽의 비중을 저장
            else:
                break
    return(topic_table)

In [55]:
topictable = make_topictable_per_doc(ldamodel, corpus)
topictable = topictable.reset_index() #문서 번호를 의미하는 열로 사용하기 위해 인덱스 열을 하나 더 만듦
topictable.columns = ['문서번호', '가장 비중이 높은 토픽','토픽의 비중', '각 토픽의 비중']
topictable[:10]

Unnamed: 0,문서번호,가장 비중이 높은 토픽,토픽의 비중,각 토픽의 비중
0,0,12.0,0.2676,"[(0, 0.025740195), (3, 0.25443354), (6, 0.1383..."
1,1,7.0,0.4065,"[(1, 0.107086), (4, 0.059738647), (7, 0.406511..."
2,2,3.0,0.5331,"[(3, 0.53312415), (7, 0.45234454)]"
3,3,2.0,0.4164,"[(2, 0.41640952), (4, 0.07313735), (7, 0.28894..."
4,4,7.0,0.3115,"[(4, 0.23982401), (7, 0.31151885), (13, 0.1238..."
5,5,12.0,0.4426,"[(0, 0.045617953), (7, 0.22913527), (12, 0.442..."
6,6,16.0,0.4255,"[(2, 0.018590847), (4, 0.0725155), (7, 0.20925..."
7,7,7.0,0.3912,"[(3, 0.31976917), (7, 0.39124587), (12, 0.1979..."
8,8,15.0,0.4974,"[(5, 0.07141), (7, 0.37683117), (10, 0.0307580..."
9,9,7.0,0.5908,"[(2, 0.06154754), (4, 0.11738888), (6, 0.01766..."
