In [26]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim import matutils, models, corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

In [27]:
# Load the count-vectorized data; the first CSV column is used as the row index.
count_df_raw = pd.read_csv("arm_labelled.csv", index_col=0)

# Drop the trailing column (presumably the label column, given the file name
# -- TODO confirm). Deriving `count_df_1` from the untouched raw frame keeps
# this cell idempotent: re-running it can never strip a second column, which
# the previous self-assignment `count_df_1 = count_df_1.iloc[:, :-1]` did.
count_df_1 = count_df_raw.iloc[:, :-1]

In [16]:
# Train a gensim LDA topic model on the count matrix and display it with
# pyLDAvis. The vocabulary is taken from the column labels of `count_df_1`,
# so every column is treated as one term.
dtm_matrix = count_df_1.values

# Dense2Corpus reads documents from the *columns* of its input by default,
# hence the transpose of the matrix whose columns are terms.
corpus = matutils.Dense2Corpus(dtm_matrix.T)

# Map each integer term id (column position) to its term string.
terms = count_df_1.columns.tolist()
dictionary = corpora.Dictionary.from_corpus(corpus, id2word=dict(enumerate(terms)))

# LDA training is stochastic; a fixed seed makes the topics (and therefore
# the visualisation below) reproducible across kernel restarts.
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=4,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Prepare the interactive visualisation and render it as the cell output.
lda_visualization = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_visualization)

In [30]:
# Write the prepared pyLDAvis visualisation to 'lda_viz.html' (relative path,
# so it lands in the current working directory).
pyLDAvis.save_html(lda_visualization, 'lda_viz.html')
# Uncomment to render the visualisation inline in this cell instead:
# pyLDAvis.display(lda_visualization)

In [25]:
# Fit a scikit-learn LDA model on the same count data and set up the
# variables consumed by the top-words plot further down.
num_topics = 5

# Online variational Bayes is stochastic; pinning `random_state` makes the
# fitted topics reproducible from one run to the next.
lda_model_DH = LatentDirichletAllocation(
    n_components=num_topics,
    max_iter=100,
    learning_method='online',
    random_state=42,
)

# Document-topic matrix returned by the fit (one row per input row).
LDA_DH_Model = lda_model_DH.fit_transform(count_df_1)

# `components_` is (topics x terms); transpose so that column t holds the
# term weights of topic t, which is how the plotting cell indexes it.
word_topic = np.array(lda_model_DH.components_).T
#print(word_topic)

num_top_words = 15                       # words listed per topic in the plot
vocab_array = count_df_1.columns.values  # term strings, aligned with word_topic rows
fontsize_base = 15                       # font size used for every plotted word

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Draw one text-only panel per topic, listing that topic's highest-weighted
# words from top (heaviest) to bottom.
for topic_idx in range(num_topics):
    ax = plt.subplot(1, num_topics, topic_idx + 1)  # subplot positions are 1-based
    ax.set_ylim(0, num_top_words + 0.5)  # vertical room for every listed word
    ax.set_xlim(0, num_topics + 5)
    ax.set_xticks([])  # no tick marks: the panel is only a word list
    ax.set_yticks([])
    ax.set_title('Topic #{}'.format(topic_idx))

    # Indices of the `num_top_words` largest weights in this topic's column,
    # in descending order of weight.
    ranked_idx = np.argsort(word_topic[:, topic_idx])[::-1][:num_top_words]

    # Every word is drawn at the same fixed size (not scaled by its weight).
    for row, word in enumerate(vocab_array[ranked_idx]):
        ax.text(0.3, num_top_words - row - 0.5, word, fontsize=fontsize_base)

plt.subplots_adjust(left=0.1, bottom=0.1, right=2, top=0.9, wspace=0.4, hspace=1)
plt.show()