In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
import gensim
import pyLDAvis.gensim_models
import pyLDAvis
from tqdm import tqdm
from gensim.models.phrases import Phrases, Phraser
from gensim.models.coherencemodel import CoherenceModel


tqdm.pandas()

In [2]:

df = pd.read_csv("./sentiment_classified_2.csv")

def clean_more(text):
    text = text.lower()
    return text.split()

df = df[df['Stars'] !=5].reset_index(drop=True)
df['Cleaned Reviews'] = df['Cleaned Reviews'].fillna('')
df['Cleaned Reviews'] = df['Cleaned Reviews']
df['tokens'] = df["Cleaned Reviews"].progress_apply(clean_more)

bigram_phrases = Phrases(df['tokens'],min_count=5,threshold=5)

bigram_mod = Phraser(bigram_phrases)

df['bigram_tokens'] = df['tokens'].progress_apply(lambda doc: bigram_mod[doc])

print(df['bigram_tokens'].sample(5).tolist())

100%|██████████| 6859/6859 [00:00<00:00, 111312.99it/s]
100%|██████████| 6859/6859 [00:00<00:00, 169254.71it/s]

[['like', 'app'], ['nice_idea', 'terrible', 'execution', 'people', 'bot', 'bot', 'say', 'person', 'answer', 'hour', 'one', 'ever', 'automate_response', 'rent', 'due', 'day', 'access', 'money', 'stick', 'help', 'face', 'possible', 'eviction', 'thank', 'cleo'], ['awful', 'user_interface'], ['similar', 'functionality', 'design', 'mint', 'connection_issue', 'data', 'problem', 'discover', 'monarch', 'look', 'replacement_mint', 'mint', 'certainly', 'perfect', 'free', 'perform', 'well', 'monarch', 'much', 'functionality', 'similar', 'design', 'issue', 'monach', 'connect', 'account', 'disappointing', 'consider_cost', 'monarch', 'mint', 'free'], ['year', 'still', 'android_widget']]





In [3]:

dictionary = corpora.Dictionary(df['bigram_tokens'])

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=5000)

corpus = [dictionary.doc2bow(doc) for doc in df['bigram_tokens']]


In [4]:

def compute_coherence_values(dictionary, corpus, texts, start=5, limit=20, step=1):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit + 1, step):
        model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=15,
            workers=6,
            random_state=42
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_score = coherencemodel.get_coherence()
        coherence_values.append(coherence_score)
        print(f'Number of topics: {num_topics}, Coherence Score: {coherence_score:.4f}')
    return model_list, coherence_values



In [5]:

model_list, coherence_values = compute_coherence_values(dictionary, corpus, df['bigram_tokens'], start=5, limit=15, step=1)


Number of topics: 5, Coherence Score: 0.5231
Number of topics: 6, Coherence Score: 0.5259
Number of topics: 7, Coherence Score: 0.5261
Number of topics: 8, Coherence Score: 0.5507
Number of topics: 9, Coherence Score: 0.5472
Number of topics: 10, Coherence Score: 0.5292
Number of topics: 11, Coherence Score: 0.5420
Number of topics: 12, Coherence Score: 0.5270
Number of topics: 13, Coherence Score: 0.5228
Number of topics: 14, Coherence Score: 0.5233
Number of topics: 15, Coherence Score: 0.5139


In [6]:
best_model = model_list[8]

In [7]:
topics_data = []
for num_topics, model in zip(range(5, 16), model_list):
    for idx, topic in model.print_topics(num_topics=num_topics, num_words=8):
        topics_data.append({
            "Model_Num_Topics": num_topics,
            "Topic_Index": idx,
            "Topic_Terms": topic
        })

topics_df = pd.DataFrame(topics_data)
topics_df.to_csv("LDA_topics_by_model.csv", index=False)

In [8]:
import pandas as pd

# Create a document-topic distribution matrix
doc_topic_dists = []

for doc in corpus:
    topic_probs = best_model.get_document_topics(doc, minimum_probability=0)
    doc_topic_dists.append([prob for _, prob in topic_probs])

# Convert to DataFrame
df_topic_dist = pd.DataFrame(doc_topic_dists)
df_topic_dist.columns = [f'Topic {i}' for i in range(df_topic_dist.shape[1])]

In [9]:
# Find dominant topic in each document
dominant_topics = df_topic_dist.idxmax(axis=1)
dominant_topic_counts = dominant_topics.value_counts().sort_index()

# Print count of documents dominated by each topic
print("Document Count per Dominant Topic:")
print(dominant_topic_counts)

Document Count per Dominant Topic:
Topic 0     510
Topic 1     465
Topic 10    449
Topic 11    689
Topic 12    770
Topic 2     381
Topic 3     402
Topic 4     451
Topic 5     395
Topic 6     680
Topic 7     607
Topic 8     516
Topic 9     544
Name: count, dtype: int64


In [10]:
topic_shares = df_topic_dist.mean().sort_values(ascending=False)

print("\nAverage Share of Each Topic:")
print(topic_shares)


Average Share of Each Topic:
Topic 12    0.109934
Topic 6     0.095110
Topic 11    0.095068
Topic 7     0.083230
Topic 9     0.080527
Topic 8     0.075395
Topic 1     0.070197
Topic 4     0.069955
Topic 10    0.069043
Topic 0     0.064961
Topic 5     0.064046
Topic 3     0.061679
Topic 2     0.060854
dtype: float32


In [11]:
summary_df = pd.DataFrame({
    "Dominant Doc Count": dominant_topic_counts,
    "Average Share": df_topic_dist.mean()
}).fillna(0)

summary_df = summary_df.sort_values("Dominant Doc Count", ascending=False)
print("\nTopic Summary:")
print(summary_df)


Topic Summary:
          Dominant Doc Count  Average Share
Topic 12                 770       0.109934
Topic 11                 689       0.095068
Topic 6                  680       0.095110
Topic 7                  607       0.083230
Topic 9                  544       0.080527
Topic 8                  516       0.075395
Topic 0                  510       0.064961
Topic 1                  465       0.070197
Topic 4                  451       0.069955
Topic 10                 449       0.069043
Topic 3                  402       0.061679
Topic 5                  395       0.064046
Topic 2                  381       0.060854


In [12]:
summary_df.to_csv("LDA_quantitative.csv", index=True)

In [13]:
print("\nCoherence Values:")
for num_topics, coherence in zip(range(5, 16), coherence_values):
    print(f"Number of Topics: {num_topics}, Coherence Score: {coherence:.4f}")


Coherence Values:
Number of Topics: 5, Coherence Score: 0.5231
Number of Topics: 6, Coherence Score: 0.5259
Number of Topics: 7, Coherence Score: 0.5261
Number of Topics: 8, Coherence Score: 0.5507
Number of Topics: 9, Coherence Score: 0.5472
Number of Topics: 10, Coherence Score: 0.5292
Number of Topics: 11, Coherence Score: 0.5420
Number of Topics: 12, Coherence Score: 0.5270
Number of Topics: 13, Coherence Score: 0.5228
Number of Topics: 14, Coherence Score: 0.5233
Number of Topics: 15, Coherence Score: 0.5139
