In [None]:
import gensim
import pyLDAvis
import pandas as pd
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from numpy import array
from gensim import corpora, models
from gensim.models import Phrases
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

In [None]:
# Load the stemmed survey messages from survey_clean_stemmed.csv.
survey_data = pd.read_csv("./data/survey_clean_stemmed.csv")

# An empty field in the CSV is read back as NaN (a float), which has no
# .split(). Replace missing messages with "" so they turn into empty token
# lists and the documents stay aligned with the rows of survey_data.
messages = survey_data["stemmed_message"].fillna("").astype(str)
messages_list = [message.split() for message in messages]

# Report the corpus size and a small sample instead of dumping every document.
print(len(messages_list))
print(messages_list[:5])

In [None]:
# Train the phrase models. `bigram` joins frequent adjacent tokens with "_";
# `trigram` is trained on the bigram-transformed corpus so it can join further.
bigram = Phrases(messages_list, min_count=10)
trigram = Phrases(bigram[messages_list])

# Print the phrase-transformed first document as an example.
print(trigram[bigram[messages_list[0]]])

# Append the detected phrases to each document while keeping the unigrams.
# Both transforms are computed from a snapshot of the original tokens, and the
# second pass only adds phrases the first pass did not already produce.
# Previously the trigram pass ran on the already-extended document and
# re-appended every "_" token, so each bigram was counted several times.
# NOTE: messages_list is extended in place; re-run the loading cell before
# re-running this one, otherwise the phrases are appended a second time.
for idx in range(len(messages_list)):
    original_tokens = list(messages_list[idx])
    bigram_tokens = bigram[original_tokens]
    bigram_phrases = [token for token in bigram_tokens if "_" in token]
    already_added = set(bigram_phrases)
    longer_phrases = [
        token
        for token in trigram[bigram_tokens]
        if "_" in token and token not in already_added
    ]
    messages_list[idx].extend(bigram_phrases + longer_phrases)

In [None]:
# Map every token in the documents to an integer id, then prune the
# vocabulary: filter_extremes drops tokens found in fewer than 5 documents
# or in more than 20% of all documents.
dictionary = Dictionary(messages_list)
dictionary.filter_extremes(no_above=0.2, no_below=5)

print(dictionary)

In [None]:
# Reference: https://radimrehurek.com/gensim/tut1.html
#
# Build the corpus: convert each tokenised document into bag-of-words format
# using the dictionary prepared above.
#
# doc2bow() counts the occurrences of each distinct word, converts the word to
# its integer id and returns the result as a sparse vector. For example, the
# sparse vector [(0, 1), (1, 1)] for the document "Human computer interaction"
# reads: the words with id 0 and id 1 appear once each, and every other
# dictionary word appears (implicitly) zero times.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in messages_list]

print(len(doc_term_matrix))
# Show one example vector. Document 100 is used when it exists; the index is
# clamped so that a corpus of 100 or fewer documents does not raise IndexError.
if doc_term_matrix:
    sample_index = min(100, len(doc_term_matrix) - 1)
    print(doc_term_matrix[sample_index])

# Re-weight the raw counts with TF-IDF; corpus_tfidf is what the LDA cells consume.
tfidf = models.TfidfModel(doc_term_matrix)  # build TF-IDF model
corpus_tfidf = tfidf[doc_term_matrix]

In [None]:
# function to compute coherence values
def compute_coherence_values(
    dictionary,
    corpus,
    texts,
    limit,
    start,
    step,
    iterations=100,
    coherence="c_v",
    random_state=None,
):
    """Train one LDA model per topic count and score each with a coherence measure.

    A model is trained for every ``num_topics`` in ``range(start, limit, step)``,
    so ``limit`` is exclusive.

    Parameters
    ----------
    dictionary : gensim Dictionary passed to LdaModel as ``id2word`` and to
        CoherenceModel as ``dictionary``.
    corpus : corpus passed to LdaModel for training.
    texts : tokenised documents passed to CoherenceModel.
    limit, start, step : bounds of the topic-count sweep (``range`` semantics).
    iterations : value forwarded to ``LdaModel(iterations=...)``; default 100.
    coherence : value forwarded to ``CoherenceModel(coherence=...)``;
        default "c_v".
    random_state : value forwarded to ``LdaModel(random_state=...)``. Pass an
        int to make the sweep reproducible; the default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    (model_list, coherence_values) : two parallel lists holding the trained
        models and their coherence scores, in sweep order. Both are empty when
        the range is empty.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            iterations=iterations,
            alpha="auto",
            per_word_topics=True,
            random_state=random_state,
        )
        model_list.append(model)

        coherencemodel = CoherenceModel(
            model=model, texts=texts, dictionary=dictionary, coherence=coherence
        )
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Sweep the number of topics over range(start, limit, step), i.e. 1..20,
# training one LDA model per value and recording its coherence score.
start = 1
limit = 21
step = 1
model_list, coherence_values = compute_coherence_values(
    dictionary,
    corpus=corpus_tfidf,
    texts=messages_list,
    start=start,
    limit=limit,
    step=step,
)

# Plot coherence against the number of topics.
x = range(start, limit, step)
# Label the line on the plot call. The previous
# plt.legend(("coherence_values"), ...) passed a plain string (parentheses
# without a comma do not make a tuple), so matplotlib used its first
# character and the legend read just "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc="best")
plt.show()

In [None]:
# Report the coherence score obtained for each topic count in the sweep,
# rounded to six decimal places.
for topic_count, score in zip(x, coherence_values):
    rounded_score = round(score, 6)
    print("Num Topics =", topic_count, " has Coherence Value of", rounded_score)

In [None]:
# Train the final LDA model on the TF-IDF corpus with 5 topics
# (presumably chosen from the coherence sweep above; confirm).
# LDA training is stochastic, so a fixed random_state is set; without it the
# topics, the exported CSV and the saved visualisation change on every run.
model = LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=5,
    iterations=100,
    alpha="auto",
    per_word_topics=True,
    random_state=42,
)

# Print every topic (-1 means all topics) with its highest-weighted words.
for idx, topic in model.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))

In [None]:
# Collect the 10 highest-weighted words of every topic as
# (topic id, word, probability) rows.
top_words_per_topic = []
for t in range(model.num_topics):
    top_words_per_topic.extend(
        [(t,) + word_prob for word_prob in model.show_topic(t, topn=10)]
    )

# Keep the DataFrame itself in `df`. DataFrame.to_csv returns None when given
# a path, so chaining it onto the constructor left df as None and print(df)
# printed "None".
df = pd.DataFrame(top_words_per_topic, columns=["Topic", "Word", "P"])
df.to_csv("./data/top_words2.csv")
print(df)

In [None]:
# Enable inline rendering of pyLDAvis visualisations in the notebook.
pyLDAvis.enable_notebook()

# Prepare the interactive topic visualisation from the trained model, the
# corpus it was trained on, and the dictionary.
data = pyLDAvis.gensim_models.prepare(model, corpus_tfidf, dictionary)

# Save a standalone HTML copy of the visualisation.
pyLDAvis.save_html(data, "./data/topic_modelling.html")

# End the cell with the prepared data so the notebook renders the interactive
# view; print(data) only dumped its raw text representation.
data