In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
import gensim
import pyLDAvis.gensim_models
import pyLDAvis
from tqdm import tqdm
from gensim.models.phrases import Phrases, Phraser
from gensim.models.coherencemodel import CoherenceModel


tqdm.pandas()

In [4]:

df = pd.read_csv("./sentiment_classified.csv")

def clean_more(text):
    text = text.lower()
    return text.split()

df = df[df['Stars'] !=5].reset_index(drop=True)
df['Cleaned Reviews'] = df['Cleaned Reviews'].fillna('')
df['Cleaned Reviews'] = df['Cleaned Reviews']
df['tokens'] = df["Cleaned Reviews"].progress_apply(clean_more)

bigram_phrases = Phrases(df['tokens'],min_count=5,threshold=5)

bigram_mod = Phraser(bigram_phrases)

df['bigram_tokens'] = df['tokens'].progress_apply(lambda doc: bigram_mod[doc])

print(df['bigram_tokens'].sample(5).tolist())

100%|██████████| 47837/47837 [00:00<00:00, 704576.38it/s]
100%|██████████| 47837/47837 [00:00<00:00, 116005.59it/s]

[['trouble', 'log', 'give', 'something_go', 'wrong', 'restart_phone', 'log', 'also', 'cache_clear', 'old', 'review', 'app', 'great', 'try', 'use', 'tablet', 'deposit', 'could', 'take_picture', 'download', 'app', 'phone', 'work', 'great', 'take', 'minute', 'find', 'way', 'around', 'app', 'good'], ['option', 'temporarily', 'lock', 'debit_card'], ['app', 'great', 'crash', 'time', 'every', 'log', 'issue', 'since', 'update', 'every_time', 'attempt', 'log', 'app', 'say', 'log', 'due', 'activity', 'minute', 'please', 'log', 'something', 'similar', 'often', 'get', 'message', 'additionally', 'also', 'get', 'messase', 'say', 'app', 'constantly_crash', 'need', 'close', 'please_fix', 'asap', 'know', 'reason', 'change', 'bank'], ['terrible', 'app', 'terrible', 'customer_service', 'stay', 'far', 'away', 'company', 'possible'], ['suck']]





In [5]:

dictionary = corpora.Dictionary(df['bigram_tokens'])

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=5000)

corpus = [dictionary.doc2bow(doc) for doc in df['bigram_tokens']]


In [7]:

def compute_coherence_values(dictionary, corpus, texts, start=5, limit=20, step=1):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit + 1, step):
        model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=15,
            workers=6,
            random_state=42
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_score = coherencemodel.get_coherence()
        coherence_values.append(coherence_score)
        print(f'Number of topics: {num_topics}, Coherence Score: {coherence_score:.4f}')
    return model_list, coherence_values



In [8]:

model_list, coherence_values = compute_coherence_values(dictionary, corpus, df['bigram_tokens'], start=5, limit=15, step=1)


Number of topics: 5, Coherence Score: 0.5663
Number of topics: 6, Coherence Score: 0.5703
Number of topics: 7, Coherence Score: 0.5674
Number of topics: 8, Coherence Score: 0.5615
Number of topics: 9, Coherence Score: 0.5760
Number of topics: 10, Coherence Score: 0.5649
Number of topics: 11, Coherence Score: 0.5628
Number of topics: 12, Coherence Score: 0.5771
Number of topics: 13, Coherence Score: 0.5617
Number of topics: 14, Coherence Score: 0.5604
Number of topics: 15, Coherence Score: 0.5562


In [9]:
best_model = model_list[8]

In [21]:
topics_data = []
for num_topics, model in zip(range(5, 16), model_list):
    for idx, topic in model.print_topics(num_topics=num_topics, num_words=8):
        topics_data.append({
            "Model_Num_Topics": num_topics,
            "Topic_Index": idx,
            "Topic_Terms": topic
        })

topics_df = pd.DataFrame(topics_data)
topics_df.to_csv("LDA_topics_by_model.csv", index=False)

In [12]:
import pandas as pd

# Create a document-topic distribution matrix
doc_topic_dists = []

for doc in corpus:
    topic_probs = best_model.get_document_topics(doc, minimum_probability=0)
    doc_topic_dists.append([prob for _, prob in topic_probs])

# Convert to DataFrame
df_topic_dist = pd.DataFrame(doc_topic_dists)
df_topic_dist.columns = [f'Topic {i}' for i in range(df_topic_dist.shape[1])]

In [13]:
# Find dominant topic in each document
dominant_topics = df_topic_dist.idxmax(axis=1)
dominant_topic_counts = dominant_topics.value_counts().sort_index()

# Print count of documents dominated by each topic
print("Document Count per Dominant Topic:")
print(dominant_topic_counts)

Document Count per Dominant Topic:
Topic 0     4468
Topic 1     3803
Topic 10    2290
Topic 11    3407
Topic 12    2908
Topic 2     5389
Topic 3     3833
Topic 4     6302
Topic 5     2771
Topic 6     3252
Topic 7     2234
Topic 8     3446
Topic 9     3734
Name: count, dtype: int64


In [14]:
topic_shares = df_topic_dist.mean().sort_values(ascending=False)

print("\nAverage Share of Each Topic:")
print(topic_shares)


Average Share of Each Topic:
Topic 4     0.111135
Topic 2     0.102087
Topic 3     0.081357
Topic 9     0.080859
Topic 0     0.080770
Topic 1     0.080437
Topic 11    0.075653
Topic 8     0.073461
Topic 6     0.069728
Topic 12    0.064968
Topic 5     0.063921
Topic 7     0.058168
Topic 10    0.057457
dtype: float32


In [15]:
summary_df = pd.DataFrame({
    "Dominant Doc Count": dominant_topic_counts,
    "Average Share": df_topic_dist.mean()
}).fillna(0)

summary_df = summary_df.sort_values("Dominant Doc Count", ascending=False)
print("\nTopic Summary:")
print(summary_df)


Topic Summary:
          Dominant Doc Count  Average Share
Topic 4                 6302       0.111135
Topic 2                 5389       0.102087
Topic 0                 4468       0.080770
Topic 3                 3833       0.081357
Topic 1                 3803       0.080437
Topic 9                 3734       0.080859
Topic 8                 3446       0.073461
Topic 11                3407       0.075653
Topic 6                 3252       0.069728
Topic 12                2908       0.064968
Topic 5                 2771       0.063921
Topic 10                2290       0.057457
Topic 7                 2234       0.058168


In [18]:
summary_df.to_csv("LDA_quantitative.csv", index=True)

In [19]:
print("\nCoherence Values:")
for num_topics, coherence in zip(range(5, 16), coherence_values):
    print(f"Number of Topics: {num_topics}, Coherence Score: {coherence:.4f}")


Coherence Values:
Number of Topics: 5, Coherence Score: 0.5663
Number of Topics: 6, Coherence Score: 0.5703
Number of Topics: 7, Coherence Score: 0.5674
Number of Topics: 8, Coherence Score: 0.5615
Number of Topics: 9, Coherence Score: 0.5760
Number of Topics: 10, Coherence Score: 0.5649
Number of Topics: 11, Coherence Score: 0.5628
Number of Topics: 12, Coherence Score: 0.5771
Number of Topics: 13, Coherence Score: 0.5617
Number of Topics: 14, Coherence Score: 0.5604
Number of Topics: 15, Coherence Score: 0.5562
