# 1. 토픽모델링: sklearn

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Restrict the corpus to eight newsgroups: motorcycles, baseball, graphics,
# X Window, Middle East politics, Christianity, electronics and medicine.
cats = [
    'rec.motorcycles',
    'rec.sport.baseball',
    'comp.graphics',
    'comp.windows.x',
    'talk.politics.mideast',
    'soc.religion.christian',
    'sci.electronics',
    'sci.med',
]

# Load only the categories listed in cats by passing it as the `categories`
# argument of fetch_20newsgroups(); headers, footers and quotes are removed.
news_df = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    categories=cats,
    random_state=0,
)

# LDA is applied only to count-based vectors, so CountVectorizer is used here.
count_vect = CountVectorizer(
    max_df=0.95,
    max_features=1000,
    min_df=2,
    stop_words='english',
    ngram_range=(1, 2),
)
feat_vect = count_vect.fit_transform(news_df.data)
print('CountVectorizer Shape:', feat_vect.shape)

CountVectorizer Shape: (7862, 1000)


In [None]:
# Display the feature (term) names of the fitted CountVectorizer.
count_vect.get_feature_names_out()

- LDA 객체 생성 후 Count 피처 벡터화 객체로 LDA 수행

In [None]:
# Create an LDA model with 8 topics (n_components=8), the same as the number of
# entries in cats; random_state=0 is passed as the seed.
lda = LatentDirichletAllocation(n_components=8, random_state=0)
# Fit the model on the count feature matrix.
lda.fit(feat_vect)

- 각 토픽 모델링 주제별 단어들의 연관도 확인
 - lda 객체의 components_ 속성은 주제별로 개별 단어들의 연관도를 나타내는 숫자
 - shape는 주제 개수 X 피처 단어 개수
 - components_ 에 들어 있는 숫자 값은 각 주제에 단어가 할당된 횟수에 해당하는 의사 카운트(pseudocount)이며, 행 단위로 정규화하면 주제별 단어 분포가 됨.
 - 숫자가 클 수록 토픽에서 단어가 차지하는 비중이 높음

In [None]:
# components_ has one row per topic and one column per vocabulary feature.
print(lda.components_.shape)
# Display the topic-by-word weight matrix itself.
lda.components_

(8, 1000)


array([[ 36.099, 135.627,  21.575, ...,  30.291,  86.683,  67.929],
       [  0.125,  14.44 ,   0.125, ..., 181.507,   0.125,  93.959],
       [334.763,   0.125, 146.743, ...,   0.125,  36.369,   0.125],
       ...,
       [ 36.02 ,  20.864,   4.296, ...,  14.506,   8.339,  15.569],
       [  0.125,   0.125,   0.125, ...,  91.728,   0.125,  37.458],
       [ 54.926,   4.47 ,   9.885, ...,  48.705,   0.125,   0.125]])

- 각 토픽별 중심 단어 확인

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object whose ``components_`` attribute yields one array of
            word weights per topic.
        feature_names: Indexable collection mapping a column position in
            ``components_`` to its word.
        no_top_words: How many of the top-ranked words to print per topic.

    Returns:
        None. For each topic a 'Topic # <index>' line is printed, followed
        by a line with the selected words separated by single spaces.
    """
    for topic_no, weights in enumerate(model.components_):
        print('Topic #', topic_no)

        # argsort() ranks positions by ascending weight; reversing it puts the
        # largest weights first, and the slice keeps the leading positions.
        ranked_positions = weights.argsort()[::-1][:no_top_words]

        # Look up the word for each selected position and join with spaces.
        top_words = (feature_names[pos] for pos in ranked_positions)
        print(' '.join(top_words))

# Get the names of all word features held by the CountVectorizer via get_feature_names_out()
feature_names = count_vect.get_feature_names_out()

# Show only the 15 most relevant words for each topic
display_topics(lda, feature_names, 15)

Topic # 0
year 10 game medical health team 12 20 disease cancer 1993 games years patients good
Topic # 1
don just like know people said think time ve didn right going say ll way
Topic # 2
image file jpeg program gif images output format files color entry 00 use bit 03
Topic # 3
like know don think use does just good time book read information people used post
Topic # 4
armenian israel armenians jews turkish people israeli jewish government war dos dos turkey arab armenia 000
Topic # 5
edu com available graphics ftp data pub motif mail widget software mit information version sun
Topic # 6
god people jesus church believe christ does christian say think christians bible faith sin life
Topic # 7
use dos thanks windows using window does display help like problem server need know run


- 개별 문서별 토픽 분포 확인
 - lda 객체의 transform()을 수행하면 개별 문서별 토픽 분포를 반환

In [None]:
# transform() yields one row per document and one column per topic.
doc_topics = lda.transform(feat_vect)
print(doc_topics.shape)
# Preview the topic values of the first three documents.
print(doc_topics[:3])

(7862, 8)
[[0.014 0.014 0.014 0.482 0.014 0.014 0.014 0.434]
 [0.278 0.182 0.002 0.53  0.002 0.002 0.002 0.002]
 [0.005 0.222 0.005 0.005 0.005 0.005 0.005 0.746]]


- 개별 문서별 토픽 분포도를 출력
 - 20Newsgroup으로 만들어진 문서명을 출력
 - fetch_20newsgroups()으로 만들어진 데이터의 filenames 속성은 모든 문서의 문서명
 - filenames 속성은 절대 디렉토리를 가지는 문서명을 가지고 있으므로 경로 구분자(Windows의 경우 '\')로 분할하여 마지막 두 요소(뉴스그룹 디렉토리명과 파일 번호)로 파일명 확인

In [None]:
def get_filename_list(newsdata):
    """Return short '<directory>.<file>' names for every path in newsdata.

    Args:
        newsdata: Object exposing a ``filenames`` iterable of path strings
            (in this notebook, the result of ``fetch_20newsgroups``).

    Returns:
        list: One string per path, built from the last two path components
        joined with '.'. A path with a single component is returned as is.
    """
    import re  # local import keeps the function self-contained in its cell

    # Splitting only on '\\' leaves POSIX paths (which use '/') unsplit, so the
    # whole absolute path came back unchanged. Accept either separator.
    separator = re.compile(r'[\\/]')
    return [
        '.'.join(separator.split(str(path))[-2:])
        for path in newsdata.filenames
    ]
# Build the name list for every document in news_df.
filename_list = get_filename_list(news_df)
# Print how many names were produced and preview the first ten.
print('filename 개수',len(filename_list), 'filename list 10개만',filename_list[:10])

filename 개수 7862 filename list 10개만 ['/root/scikit_learn_data/20news_home/20news-bydate-train/soc.religion.christian/20630', '/root/scikit_learn_data/20news_home/20news-bydate-test/sci.med/59422', '/root/scikit_learn_data/20news_home/20news-bydate-test/comp.graphics/38765', '/root/scikit_learn_data/20news_home/20news-bydate-test/comp.graphics/38810', '/root/scikit_learn_data/20news_home/20news-bydate-test/sci.med/59449', '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38461', '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.windows.x/66959', '/root/scikit_learn_data/20news_home/20news-bydate-train/rec.motorcycles/104487', '/root/scikit_learn_data/20news_home/20news-bydate-train/sci.electronics/53875', '/root/scikit_learn_data/20news_home/20news-bydate-train/sci.electronics/53617']


- DataFrame으로 생성하여 문서별 토픽 분포도 확인

In [None]:
import pandas as pd

# One column label per topic. The backslash is escaped explicitly because a
# bare '\ ' in a normal string literal is an invalid escape sequence that
# Python warns about; the resulting labels are unchanged ('Topic \ 0', ...).
# The number of labels follows the width of doc_topics instead of a fixed 8.
topic_names = ['Topic \\ ' + str(i) for i in range(doc_topics.shape[1])]

# Rows are documents (indexed by filename_list), columns are topics.
doc_topic_df = pd.DataFrame(data=doc_topics, columns=topic_names, index=filename_list)
doc_topic_df.head(10)

Unnamed: 0,Topic \ 0,Topic \ 1,Topic \ 2,Topic \ 3,Topic \ 4,Topic \ 5,Topic \ 6,Topic \ 7
/root/scikit_learn_data/20news_home/20news-bydate-train/soc.religion.christian/20630,0.013897,0.013944,0.013891,0.482218,0.013979,0.013892,0.013935,0.434244
/root/scikit_learn_data/20news_home/20news-bydate-test/sci.med/59422,0.277504,0.181518,0.002121,0.530372,0.002121,0.002121,0.002121,0.002121
/root/scikit_learn_data/20news_home/20news-bydate-test/comp.graphics/38765,0.005445,0.221666,0.005445,0.005445,0.00544,0.005442,0.005442,0.745675
/root/scikit_learn_data/20news_home/20news-bydate-test/comp.graphics/38810,0.005439,0.005441,0.005449,0.578959,0.00544,0.388387,0.005442,0.005442
/root/scikit_learn_data/20news_home/20news-bydate-test/sci.med/59449,0.006584,0.552,0.006587,0.408485,0.006585,0.006585,0.006588,0.006585
/root/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38461,0.008342,0.008352,0.182622,0.767314,0.008335,0.008341,0.008343,0.008351
/root/scikit_learn_data/20news_home/20news-bydate-train/comp.windows.x/66959,0.372861,0.041667,0.37702,0.041668,0.041703,0.041703,0.041667,0.041711
/root/scikit_learn_data/20news_home/20news-bydate-train/rec.motorcycles/104487,0.225351,0.674669,0.004814,0.07592,0.004812,0.004812,0.004812,0.00481
/root/scikit_learn_data/20news_home/20news-bydate-train/sci.electronics/53875,0.008944,0.836686,0.008932,0.008941,0.008935,0.109691,0.008932,0.008938
/root/scikit_learn_data/20news_home/20news-bydate-train/sci.electronics/53617,0.041733,0.04172,0.708081,0.041742,0.041671,0.041669,0.041699,0.041686
