In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
# Fetch the 20 Newsgroups posts with headers, footers and quoted replies
# removed; the fixed random_state keeps the shuffled order stable across runs.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    random_state=1,
    shuffle=True,
)
documents = dataset.data
# Number of posts that were loaded.
len(documents)

11314

In [2]:
# Preprocessing: keep letters only, drop words of three characters or fewer,
# then lowercase what remains.
news_df = pd.DataFrame({'document': documents})
news_df['clean_doc'] = (
    news_df['document']
    .str.replace("[^a-zA-Z]", " ", regex=True)
    .apply(lambda text: ' '.join(word for word in text.split() if len(word) > 3))
    .str.lower()
)

In [3]:
from nltk.corpus import stopwords
# English stop-word list from nltk (kept as a list under its original name).
stop_words = stopwords.words('english')
# Set copy used for the membership test below: looking a token up in a list
# scans the whole list for every token, a set lookup does not.
stop_word_set = set(stop_words)
# Split each cleaned document on whitespace and drop the stop words.
tokenized_doc = news_df['clean_doc'].apply(
    lambda text: [token for token in text.split() if token not in stop_word_set]
)


In [4]:
# Preview the token lists of the first five documents.
print(tokenized_doc[:5])

0    [well, sure, story, seem, biased, disagree, st...
1    [yeah, expect, people, read, actually, accept,...
2    [although, realize, principle, strongest, poin...
3    [notwithstanding, legitimate, fuss, proposal, ...
4    [well, change, scoring, playoff, pool, unfortu...
Name: clean_doc, dtype: object


In [5]:
from gensim import corpora
# Map every distinct token in the tokenized documents to an integer id.
dictionary = corpora.Dictionary(tokenized_doc)
# Encode each document as a bag of words: a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, tokenized_doc))
# Inspect the encoding of the document at position 1.
print(corpus[1])



[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]


In [6]:
# Look up the token that the dictionary assigned to id 66.
print(dictionary[66])

faith


In [7]:
# Vocabulary size: number of distinct tokens registered in the dictionary.
len(dictionary)

64281

In [8]:
import gensim
NUM_TOPICS = 20
# LDA training is stochastic; a fixed seed makes the learned topics
# reproducible when the notebook is re-run from a fresh kernel.
RANDOM_SEED = 1
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_SEED,
)
# Show the four highest-weighted words of each topic, one topic per line.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.046*"jesus" + 0.019*"christ" + 0.016*"john" + 0.012*"father"')
(1, '0.021*"windows" + 0.019*"problem" + 0.011*"color" + 0.011*"using"')
(2, '0.023*"file" + 0.014*"program" + 0.010*"entry" + 0.010*"output"')
(3, '0.010*"people" + 0.009*"would" + 0.006*"believe" + 0.006*"think"')
(4, '0.057*"space" + 0.022*"nasa" + 0.013*"earth" + 0.010*"launch"')
(5, '0.011*"remark" + 0.009*"colorado" + 0.008*"john" + 0.008*"name"')
(6, '0.012*"available" + 0.009*"software" + 0.009*"version" + 0.008*"also"')
(7, '0.006*"water" + 0.006*"pain" + 0.005*"power" + 0.005*"light"')
(8, '0.017*"would" + 0.011*"like" + 0.009*"think" + 0.008*"people"')
(9, '0.021*"period" + 0.011*"power" + 0.011*"toronto" + 0.011*"detroit"')
(10, '0.016*"helmet" + 0.013*"rockefeller" + 0.008*"lane" + 0.007*"shaft"')
(11, '0.016*"said" + 0.010*"people" + 0.009*"know" + 0.008*"went"')
(12, '0.040*"thanks" + 0.034*"please" + 0.032*"anyone" + 0.029*"know"')
(13, '0.014*"wire" + 0.012*"ground" + 0.011*"current" + 0.008*"wiring"

In [9]:
# Print the topics again with print_topics' default arguments
# (more words per topic than the four-word summary printed earlier).
print(ldamodel.print_topics())

[(0, '0.046*"jesus" + 0.019*"christ" + 0.016*"john" + 0.012*"father" + 0.011*"spirit" + 0.011*"lord" + 0.009*"matthew" + 0.009*"shall" + 0.007*"paul" + 0.007*"holy"'), (1, '0.021*"windows" + 0.019*"problem" + 0.011*"color" + 0.011*"using" + 0.010*"display" + 0.010*"window" + 0.010*"screen" + 0.010*"memory" + 0.008*"error" + 0.007*"running"'), (2, '0.023*"file" + 0.014*"program" + 0.010*"entry" + 0.010*"output" + 0.007*"information" + 0.007*"files" + 0.006*"info" + 0.006*"section" + 0.006*"number" + 0.006*"image"'), (3, '0.010*"people" + 0.009*"would" + 0.006*"believe" + 0.006*"think" + 0.005*"many" + 0.005*"israel" + 0.005*"true" + 0.005*"even" + 0.005*"christian" + 0.004*"also"'), (4, '0.057*"space" + 0.022*"nasa" + 0.013*"earth" + 0.010*"launch" + 0.010*"shuttle" + 0.010*"orbit" + 0.010*"satellite" + 0.009*"lunar" + 0.009*"mission" + 0.008*"militia"'), (5, '0.011*"remark" + 0.009*"colorado" + 0.008*"john" + 0.008*"name" + 0.007*"null" + 0.007*"guidelines" + 0.006*"prog" + 0.005*"coun

In [10]:
import pyLDAvis.gensim_models
# Turn on pyLDAvis' notebook mode before rendering.
pyLDAvis.enable_notebook()
# Build the visualization data from the trained model, the bag-of-words
# corpus and the dictionary.
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
# Render the prepared visualization as this cell's output.
pyLDAvis.display(vis)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [11]:
# Print the inferred topic distribution of the first five documents only.
for doc_idx, doc_topics in enumerate(ldamodel[corpus]):
    if doc_idx >= 5:
        break
    print(doc_idx,'번째 문서의 topic 비율은', doc_topics)

0 번째 문서의 topic 비율은 [(3, 0.5209929), (4, 0.018051809), (6, 0.102190204), (7, 0.024261411), (10, 0.017863132), (14, 0.28887996), (16, 0.017065324)]
1 번째 문서의 topic 비율은 [(3, 0.4453554), (8, 0.30650422), (11, 0.18452726), (19, 0.04254005)]
2 번째 문서의 topic 비율은 [(3, 0.6324879), (4, 0.036056668), (8, 0.28861153), (12, 0.029916173)]
3 번째 문서의 topic 비율은 [(3, 0.033965558), (8, 0.5112036), (14, 0.24104023), (15, 0.1287582), (17, 0.07309558)]
4 번째 문서의 topic 비율은 [(0, 0.23488718), (8, 0.26814282), (19, 0.46547532)]


In [12]:
def make_topictable_per_doc(ldamodel, corpus):
    """Build a table with the dominant topic of every document.

    Parameters
    ----------
    ldamodel :
        Trained topic model. ``ldamodel[corpus]`` must yield, per document,
        a list of ``(topic_id, proportion)`` pairs, or a tuple whose first
        element is that list when ``ldamodel.per_word_topics`` is true.
    corpus :
        Bag-of-words corpus passed to ``ldamodel[...]``.

    Returns
    -------
    pandas.DataFrame
        One row per document that has at least one topic, with columns
        ``0`` (dominant topic id, int), ``1`` (its proportion rounded to
        four decimals) and ``2`` (the document's full list of
        ``(topic_id, proportion)`` pairs, in the order the model returned
        them). The index holds the document's position in ``corpus``, so a
        later ``reset_index()`` yields the true document number even when
        some documents were skipped.
    """
    records = []
    doc_ids = []

    for i, topic_list in enumerate(ldamodel[corpus]):
        # With per_word_topics enabled the document-level distribution is
        # the first element of the returned tuple.
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list
        if not doc:
            # No topic reported for this document: there is no dominant
            # topic to record. The document number is preserved through the
            # index, so the remaining rows stay aligned.
            continue

        # max() returns the first pair with the highest proportion, the same
        # pair a stable descending sort would place first.
        topic_num, prop_topic = max(doc, key=lambda pair: pair[1])
        records.append([int(topic_num), round(float(prop_topic), 4), doc])
        doc_ids.append(i)

    # Build the frame once; growing a DataFrame row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, index=doc_ids, columns=[0, 1, 2])

In [13]:
# Build the per-document table and turn its index into a regular column so
# it can serve as the document number.
topictable = make_topictable_per_doc(ldamodel, corpus).reset_index()
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
# Show the first ten documents.
topictable.head(10)

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,3.0,0.521,"[(3, 0.52100897), (4, 0.018051922), (6, 0.1021..."
1,1,3.0,0.4454,"[(3, 0.44538364), (8, 0.3064658), (11, 0.18453..."
2,2,3.0,0.6325,"[(3, 0.63249165), (4, 0.03605671), (8, 0.28860..."
3,3,8.0,0.5111,"[(3, 0.03397305), (8, 0.51113534), (14, 0.2409..."
4,4,19.0,0.4655,"[(0, 0.2348852), (8, 0.26809394), (19, 0.46552..."
5,5,3.0,0.3011,"[(0, 0.0963079), (3, 0.30109861), (5, 0.219651..."
6,6,8.0,0.5371,"[(1, 0.09091253), (2, 0.038567513), (8, 0.5370..."
7,7,8.0,0.4396,"[(3, 0.3264007), (8, 0.43960652), (11, 0.08408..."
8,8,8.0,0.358,"[(2, 0.22441128), (3, 0.06723054), (7, 0.19441..."
9,9,8.0,0.5755,"[(0, 0.028196054), (2, 0.013378121), (7, 0.173..."
