In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [None]:
# Number of latent topics the LDA model is fitted with.
topic_count = 6
# Corpus table; later cells read its 'Abstract' and 'Date' columns.
data = pd.read_csv('lda_data.csv')

In [None]:
# Bag-of-words step: learn the vocabulary from the abstracts and build the
# document-term count matrix in one pass.
vector = CountVectorizer()
X = vector.fit_transform(data['Abstract'])

# LDA configuration, one argument per line so each setting is easy to tune.
# random_state is fixed so repeated runs give the same topics.
lda_model = LatentDirichletAllocation(
    n_components=topic_count,
    learning_method='batch',
    random_state=0,
    max_iter=150,
    evaluate_every=10,
    n_jobs=-1,
)

# Fit on the count matrix and keep the transformed output (one row per
# document, one column per topic) for the plotting cell.
lda_top = lda_model.fit_transform(X)

In [None]:
# Vocabulary learned by the vectorizer (one term per feature column).
# NOTE(review): the original note said 1,000 terms are stored, but the
# CountVectorizer in this notebook sets no max_features — confirm the size.
terms = vector.get_feature_names_out()

def get_topics(components, feature_names, n=20):
    """Print the highest-weighted terms of every topic.

    Parameters
    ----------
    components : iterable of 1-D arrays
        One weight vector per topic; each must provide ``argsort()``.
    feature_names : sequence
        Maps a column index of a weight vector to its term.
    n : int, default 20
        How many top terms to list per topic.

    Returns
    -------
    None
        One ``Topic <k>: [...]`` line is printed per topic, numbered from 1.
    """
    for topic_no, weights in enumerate(components, start=1):
        # argsort is ascending; flip it so the heaviest terms come first,
        # then keep the leading n positions.
        ranked = weights.argsort()[::-1][:n]
        top_terms = []
        for col in ranked:
            top_terms.append(feature_names[col])
        print("Topic %d:" % topic_no, top_terms)

# Show the top terms (default: 20 per topic) of the fitted LDA model.
get_topics(components=lda_model.components_, feature_names=terms)

In [None]:
def cold_hot_plot(data, topic_count, lda_result, fig_size):
    """Draw one line panel per topic showing document counts per date.

    Every document is assigned to its dominant topic (the column holding the
    largest value in its row of ``lda_result``). Documents are then counted
    per ``(Date, Topic)`` pair and each topic's counts are drawn in its own
    subplot of a single-row grid with shared x and y axes.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Date'`` column with one row per document, in the
        same order as the rows of ``lda_result``.
    topic_count : int
        Number of panels to reserve in the grid; also scales the figure
        width. If more distinct dominant topics occur than this, the grid is
        widened to fit them instead of raising a layout error.
    lda_result : array supporting ``argmax(axis=1)``
        Per-document topic weights (documents x topics).
    fig_size : sequence of two numbers
        ``(width_per_panel, height)`` passed on to matplotlib's ``figsize``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    dominant_topic = lda_result.argmax(axis=1)
    df = pd.DataFrame({'Date': data['Date'], 'Topic': dominant_topic})
    # Rows: dates, columns: topics, values: number of documents.
    df = df.groupby(['Date', 'Topic']).size().unstack(fill_value=0)

    # pandas raises if the layout has fewer cells than columns to draw, so
    # never reserve fewer panels than there are topics present.
    n_panels = max(topic_count, df.shape[1])

    # DataFrame.plot creates its own figure when no `ax` is given, so no
    # separate plt.figure() call is made here (it would only leave a blank
    # extra figure in the output).
    plot = df.plot(subplots=True, layout=(1, n_panels),
                   figsize=(n_panels * fig_size[0], fig_size[1]),
                   sharex=True, sharey=True, legend=True)

    # NOTE(review): the x axis is labelled 'Month' while the values come from
    # the 'Date' column — presumably Date holds month-level values; confirm.
    for sub_p in plot.flatten():
        sub_p.set_xlabel('Month')
        sub_p.set_ylabel('Count')
        sub_p.grid(True)
        sub_p.tick_params(axis='x', rotation=90)

    plt.tight_layout()
    plt.show()
    
# Size the grid from topic_count (the number of topics the model was fitted
# with) rather than a separate literal, so no empty panels are reserved and
# the figure width matches the panels actually drawn.
cold_hot_plot(data, topic_count, lda_top, (10, 10))