# 20 newsgroup을 이용한 LDA 실습

In [1]:
from sklearn.datasets import fetch_20newsgroups
# LDA는 빈도수에만 기반하는 CountVectorizer사용함!
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Only a subset of the 20 Newsgroups categories is used, so list them up front.
cats = [
    'rec.motorcycles',
    'rec.sport.baseball',
    'comp.graphics',
    'comp.windows.x',
    'talk.politics.mideast',
    'soc.religion.christian',
    'sci.electronics',
    'sci.med',
]

# Load only the documents that belong to the categories above, with
# headers, footers and quoted replies removed from each post.
news_df = fetch_20newsgroups(
    subset='all',
    categories=cats,
    remove=('headers', 'footers', 'quotes'),
    random_state=12,
)

# Turn the raw text into a term-count matrix (fit and transform in one step).
# The vocabulary is capped at 1000 features built from unigrams and bigrams,
# with English stop words removed and max_df / min_df limits on document frequency.
count_vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=1000,
    max_df=0.95,
    min_df=2,
)
ftr_vect = count_vect.fit_transform(news_df.data)
print('Count 기반 벡터화 시킨 후 shape:',ftr_vect.shape)

Count 기반 벡터화 시킨 후 shape: (7862, 1000)


- **lda.fit 까지만 하면 토픽별로 단어들의 분포를 수치로 알려줌!**

In [2]:
# Run topic modelling with LDA on the count-vectorized features.
# Eight newsgroup categories were selected, so ask for eight topics.
lda = LatentDirichletAllocation(
    n_components=8,
    random_state=42,
)
lda.fit(ftr_vect)

LatentDirichletAllocation(n_components=8, random_state=42)

In [4]:
# components_ has one row per topic (8) and one column per vectorized
# feature/word (1000); each entry is that word's weight within the topic.
print(lda.components_.shape)
lda.components_

(8, 1000)


array([[1.63519060e+01, 1.86054803e+01, 5.84604286e+00, ...,
        1.92702247e+02, 1.25083617e-01, 6.13626785e+00],
       [2.04214200e+02, 3.50284932e+02, 1.31948001e+02, ...,
        1.09520222e+01, 2.24437656e+02, 1.96644739e+01],
       [4.54593156e+01, 9.30356531e+01, 2.25020590e+01, ...,
        5.03797203e-01, 5.49831722e+01, 4.63608380e+01],
       ...,
       [1.36793837e+02, 1.87036321e+01, 1.26271967e-01, ...,
        1.33888270e+01, 1.59536601e+01, 3.43179992e+00],
       [7.16218683e-01, 4.49780560e+00, 9.14720569e+00, ...,
        2.24127048e+01, 1.25116055e-01, 1.50499552e+01],
       [1.25075332e-01, 1.25062790e-01, 1.25002714e-01, ...,
        1.29818061e+02, 1.25117101e-01, 4.65081589e+01]])

## 토픽별 단어 분포 확인하기

- fit까지 하면 -> 토픽별 단어들의 분포를 알려줌
- 각 토픽별로 가장 중심이 되는 단어들이 무엇인지 살펴보기

In [16]:
# lda_model is a model that has only been fitted (fit) on the vectorized text data.
def display_topic_words(lda_model, feature_names, num_top_words):
    """Print the highest-weighted words of every topic in *lda_model*.

    Args:
        lda_model: Fitted model exposing ``components_``, iterated row by
            row (one row of word weights per topic).
        feature_names: Indexable collection of words; index ``i`` must name
            column ``i`` of ``components_`` (i.e. the vectorizer's own
            feature order).
        num_top_words: How many of the top-ranked words to show per topic.

    Returns:
        None. For each topic a ``Topic # <n>`` header (numbered from 1) and a
        single ``word*weight+word*weight+...`` line are written to stdout.
    """
    for position, weights in enumerate(lda_model.components_):
        print('\nTopic #', position+1)

        # argsort() ranks ascending by default, so reverse it to walk the
        # word indices from the largest weight to the smallest.
        descending = weights.argsort()[::-1]

        # Render each selected word as "<word>*<weight rounded to 1 decimal>".
        terms = []
        for word_idx in descending[:num_top_words]:
            terms.append(f"{feature_names[word_idx]!s}*{round(weights[word_idx], 1)!s}")

        # str.join glues the terms together with '+' between them.
        print('+'.join(terms))

In [17]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old
# method only on releases that predate it.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

# Show the 15 highest-weighted words of every topic.
display_topic_words(lda, feature_names, 15)


Topic # 1
like*1423.2+just*1260.0+don*1243.5+know*1010.5+good*904.8+ve*852.7+think*822.8+use*748.4+time*705.2+does*656.3+make*579.1+want*542.5+really*541.4+bike*540.1+used*534.4

Topic # 2
armenian*937.0+turkish*686.1+armenians*677.0+jews*607.1+people*581.4+government*451.3+turkey*394.0+jewish*386.3+war*376.0+000*350.3+armenia*340.6+muslim*322.8+genocide*321.1+turks*320.1+new*319.4

Topic # 3
10*568.9+medical*444.8+1993*399.9+12*387.8+health*379.0+research*356.8+20*335.1+disease*327.3+cancer*321.1+patients*301.7+11*293.1+92*286.4+information*275.5+april*258.9+number*253.3

Topic # 4
said*756.4+don*694.0+year*692.7+just*634.3+know*605.1+didn*574.9+time*562.3+people*547.1+like*487.9+game*477.6+think*460.8+went*450.0+did*449.8+say*435.5+going*387.8

Topic # 5
file*1123.1+jpeg*782.6+program*755.3+use*694.6+window*568.8+does*538.1+image*535.4+output*527.1+color*517.6+display*511.5+files*450.8+gif*434.8+thanks*420.2+entry*389.6+bit*364.9

Topic # 6
edu*1605.4+graphics*1014.4+software*772.7+

- 이 결과값들에서 단어들의 분포도(높은 순으로 정렬한 상태)를 보고 어떤 주제일지 결정하는 것은 **``사람의 몫``** 이다!!

## 문서별 토픽 분포 확인하기

- 이미 fit되어 있는 LDA 모델에 transform까지 수행하면, 문서별 토픽 분포를 확인할 수 있다.

In [18]:
# Calling transform() as well gives, for each document (row), its
# distribution over the topics (columns).
doc_topics = lda.transform(ftr_vect)
print(doc_topics.shape)
print(doc_topics[:2])

(7862, 8)
[[0.70540011 0.00543799 0.00543866 0.00544077 0.26195794 0.00544178
  0.00544319 0.00543957]
 [0.01564333 0.01563743 0.64749634 0.01563402 0.01563859 0.01567394
  0.01565302 0.25862333]]


In [21]:
# The file paths of this built-in dataset carry the category label.
# Since the true category is therefore known, we can check which topics are
# strong in which documents, and from that guess what each topic is about.
# The category values are recovered from the dataset's `filenames` attribute.
def get_filename_list(newsdata):
    """Return a '<parent dir>.<file name>' label for every path in *newsdata*.

    Args:
        newsdata: Object exposing a ``filenames`` iterable of file paths
            (here, the result of ``fetch_20newsgroups``).

    Returns:
        list: One string per path, made of the last two path components
        joined with '.', e.g. ``'comp.graphics.38765'``.
    """
    import os

    # Paths use the platform separator (backslash on Windows), so normalise
    # it to '/' before splitting; on POSIX this is a no-op.
    return [
        '.'.join(str(path).replace(os.sep, '/').split('/')[-2:])
        for path in newsdata.filenames
    ]

In [23]:
filename_lst = get_filename_list(news_df)
print(len(filename_lst))
# The dataset contains 7862 documents.

7862


In [24]:
# Put the document-topic matrix into a DataFrame for easier inspection.
import pandas as pd

# One column per topic. The count is read from doc_topics itself instead of a
# hard-coded 8, so the labels stay correct if n_components is changed.
# NOTE: these labels are 0-based, whereas display_topic_words prints 1-based
# headers, so column 'Topic #0' corresponds to the printed 'Topic # 1'.
topic_names = ['Topic #' + str(i) for i in range(doc_topics.shape[1])]

# Rows are indexed by the '<category>.<document id>' labels of the documents.
topic_df = pd.DataFrame(data=doc_topics, columns=topic_names,
                        index=filename_lst)
topic_df.head(20)

Unnamed: 0,Topic #0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7
comp.graphics.38765,0.7054,0.005438,0.005439,0.005441,0.261958,0.005442,0.005443,0.00544
sci.med.59107,0.015643,0.015637,0.647496,0.015634,0.015639,0.015674,0.015653,0.258623
sci.electronics.54182,0.529201,0.005439,0.005444,0.362516,0.005441,0.081073,0.005442,0.005444
rec.motorcycles.103182,0.672086,0.007357,0.007353,0.199593,0.007355,0.007355,0.007355,0.091545
soc.religion.christian.21740,0.00174,0.001738,0.001741,0.001739,0.001739,0.001738,0.001739,0.987828
rec.sport.baseball.105077,0.000568,0.99602,0.000569,0.000569,0.000568,0.000569,0.000568,0.000568
talk.politics.mideast.75974,0.319845,0.453337,0.002274,0.002277,0.002274,0.002275,0.21544,0.002278
talk.politics.mideast.76050,0.300003,0.173406,0.012516,0.294612,0.012512,0.181921,0.012508,0.012522
soc.religion.christian.20900,0.89052,0.015638,0.015628,0.015662,0.015637,0.015629,0.015629,0.015658
sci.electronics.54334,0.960181,0.005685,0.005689,0.005686,0.005692,0.005694,0.005686,0.005689


- 첫 번째 행의 문서는 원래 ``컴퓨터 그래픽``에 관한 문서이다. 이 문서는 Topic #0과 Topic #4가 주를 이룬다. (DataFrame의 열 이름은 0부터 시작하지만 앞의 토픽별 단어 출력은 1부터 번호를 매겼으므로, 이 둘은 각각 출력의 Topic # 1, Topic # 5에 해당한다.)


- 하지만 실제로 LDA를 활용할 때는 위처럼 문서의 카테고리(정답)가 미리 주어지지 않는 경우가 대부분이다. 그래서 ``Topic별 단어들의 분포를 보고 -> 각 Topic이 'A', 'B', 'C'... 라고 사람이 추론하고 -> 각 문서별 Topic들의 분포 수치를 본 후 -> 이 문서는 A, B 토픽이 주를 이루니까 이런 내용의 문서겠구나!`` 하고 추론하는 과정을 거친다.