In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# 모터사이클, 야구, 그래픽스, 윈도우즈, 중동, 기독교, 의학, 우주 주제를 추출. 
cats = ['rec.motorcycles', 'rec.sport.baseball', 'comp.graphics', 'comp.windows.x',
        'talk.politics.mideast', 'soc.religion.christian', 'sci.electronics', 'sci.med'  ]

# 위에서 cats 변수로 기재된 category만 추출. featch_20newsgroups( )의 categories에 cats 입력
news_df= fetch_20newsgroups(subset='all',remove=('headers', 'footers', 'quotes'), 
                            categories=cats, random_state=0)

#LDA는 Count 기반의 Vectorizer만 적용합니다.  
count_vect = CountVectorizer(max_df=0.95, max_features=1000, min_df=2, stop_words='english', ngram_range=(1,2))
feat_vect = count_vect.fit_transform(news_df.data)
print('CountVectorizer Shape:', feat_vect.shape)

CountVectorizer Shape: (7862, 1000)


In [4]:
lda = LatentDirichletAllocation(n_components=8, random_state=0) # n_components => 주제 개수
lda.fit(feat_vect)

In [5]:
print(lda.components_.shape)
lda.components_

(8, 1000)


array([[3.60992018e+01, 1.35626798e+02, 2.15751867e+01, ...,
        3.02911688e+01, 8.66830093e+01, 6.79285199e+01],
       [1.25199920e-01, 1.44401815e+01, 1.25045596e-01, ...,
        1.81506995e+02, 1.25097844e-01, 9.39593286e+01],
       [3.34762663e+02, 1.25176265e-01, 1.46743299e+02, ...,
        1.25105772e-01, 3.63689741e+01, 1.25025218e-01],
       ...,
       [3.60204965e+01, 2.08640688e+01, 4.29606813e+00, ...,
        1.45056650e+01, 8.33854413e+00, 1.55690009e+01],
       [1.25128711e-01, 1.25247756e-01, 1.25005143e-01, ...,
        9.17278769e+01, 1.25177668e-01, 3.74575887e+01],
       [5.49258690e+01, 4.47009532e+00, 9.88524814e+00, ...,
        4.87048440e+01, 1.25034678e-01, 1.25074632e-01]])

In [10]:
def display_topics(model, feature_names, no_top_words):
	for topic_index, topic in enumerate(model.components_):
		print('Topic #',topic_index)
		print(topic)
		# components_ array에서 가장 값이 큰 순으로 정렬했을 때, 그 값의 array index를 반환. 
		topic_word_indexes = topic.argsort()[::-1]
		print(topic_word_indexes)
		top_indexes=topic_word_indexes[:no_top_words]
		
		# top_indexes대상인 index별로 feature_names에 해당하는 word feature 추출 후 join으로 concat
		feature_concat = ' '.join([feature_names[i] for i in top_indexes])                
		print(feature_concat)

# CountVectorizer객체내의 전체 word들의 명칭을 get_features_names( )를 통해 추출
feature_names = count_vect.get_feature_names_out()

# Topic별 가장 연관도가 높은 word를 15개만 추출
display_topics(lda, feature_names, 15)


Topic # 0
[3.60992018e+01 1.35626798e+02 2.15751867e+01 1.66797374e+01
 2.79116453e+01 5.10887470e+01 2.71578118e+01 5.63629854e+02
 1.51255012e+02 2.50002658e+02 3.43889076e+02 9.41519034e+00
 1.24391406e+02 1.48332625e+02 2.41047913e+02 1.15178467e+02
 1.67518832e+02 1.24543573e+02 9.39264574e+01 8.98408568e+01
 1.15026060e+02 1.66681056e+02 3.18336067e+02 3.40871322e+02
 6.57367239e+01 9.55808129e+01 9.42926924e+01 8.20757241e+01
 9.75227950e+01 1.25015126e-01 2.07928452e+02 8.49170785e+00
 6.82592012e+01 8.92273844e+01 1.06748151e+02 7.28637466e+01
 2.00716963e+02 9.39478917e+01 9.61297746e+01 5.65332001e+01
 1.08823005e+02 1.25033129e-01 7.85519321e+01 3.97062801e+01
 1.58121306e+02 9.82950167e+01 6.04053607e+01 7.02396275e+01
 4.26303416e+01 9.26626954e+01 5.07120905e+01 2.66538978e+02
 8.84867374e+01 5.50914847e+01 2.98159374e+01 1.35476116e-01
 3.47709761e+00 2.68655460e-01 1.27766191e+01 3.72404379e+01
 6.68419408e+00 1.26835900e+01 7.94931485e+00 1.25022634e-01
 5.57997688e+0