Topic modeling on the full dataset

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

# Load the preprocessed posts (path is relative to the notebook's working directory).
df = pd.read_csv("redditADHD2012_preprocessed.csv")

# read_csv parses empty cells as NaN by default, and the vectorizer raises on NaN
# documents. Replace missing texts with an empty string (rather than dropping them)
# so that row i of the document-term matrix still corresponds to row i of `df`.
documents = df['processed_text'].fillna("").astype(str)

# Vectorize text data: ignore terms appearing in more than 95% of the documents
# or in fewer than 2 documents, and drop English stop words.
# NOTE(review): LDA is usually fitted on raw term counts (CountVectorizer, which is
# already imported) rather than TF-IDF weights -- confirm TF-IDF is intentional.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm_tfidf = tfidf_vectorizer.fit_transform(documents)

# Apply LDA
n_components = 5  # number of topics to extract
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)  # fixed seed for reproducible topics
lda.fit(dtm_tfidf)

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic <index>:`` header line is
    printed, followed by a single line holding the ``no_top_words`` terms with
    the largest weights in that row, ordered from largest to smallest weight
    and separated by single spaces.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterated row by row; each row must support ``argsort()``.
    feature_names : indexable of str
        Maps a column index of ``model.components_`` to its term.
    no_top_words : int
        Number of terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        # argsort is ascending: reverse it, then keep the leading entries.
        strongest = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in strongest]
        print(" ".join(top_terms))

# Print the ten strongest terms of each topic fitted on the full dataset.
no_top_words = 10
# Terms of the fitted vectorizer's vocabulary, in column order.
tf_feature_names = tfidf_vectorizer.get_feature_names_out()
display_topics(
    model=lda,
    feature_names=tf_feature_names,
    no_top_words=no_top_words,
)

Topic modeling for each label

In [None]:
def perform_lda_on_label(dataframe, label, n_topics=5, n_words=10):
    """Fit an LDA topic model on the texts of one label and print its topics.

    The rows of ``dataframe`` whose ``'label'`` column equals ``label`` are
    selected, their ``'processed_text'`` values are TF-IDF vectorized, an LDA
    model is fitted, and one line per topic is printed in the form
    ``Topic #<index>: <term> <term> ...``.

    A label with no usable documents, or whose documents cannot produce a
    vocabulary (for example too few documents for ``min_df=2``), is reported
    and skipped instead of raising, so a loop over several labels keeps going.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``'label'`` and ``'processed_text'``.
    label : scalar
        Value compared for equality against ``dataframe['label']``.
    n_topics : int, default 5
        Number of LDA topics to fit.
    n_words : int, default 10
        Number of top terms printed per topic.

    Returns
    -------
    None
    """
    print(f"\nTopics for Label: {label}")

    # Missing texts (NaN/None) are rejected by the vectorizer, so drop them up front.
    is_label = dataframe['label'] == label
    documents = dataframe.loc[is_label, 'processed_text'].dropna()
    if documents.empty:
        print("No documents available for this label; skipping.")
        return
    documents = documents.astype(str)

    # Vectorize text data: ignore terms appearing in more than 95% of the documents
    # or in fewer than 2 documents, and drop English stop words.
    # NOTE(review): LDA is usually fitted on raw term counts (CountVectorizer)
    # rather than TF-IDF weights -- confirm TF-IDF is intentional.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    try:
        dtm_tfidf = tfidf_vectorizer.fit_transform(documents)
    except ValueError as exc:
        # Raised when no term survives pruning/stop-word removal for this subset.
        print(f"Could not build a vocabulary for this label ({exc}); skipping.")
        return

    # Apply LDA (fixed random_state so repeated runs give the same topics)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(dtm_tfidf)

    # Print topics: the n_words highest-weighted terms, strongest first
    tf_feature_names = tfidf_vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda.components_):
        top_terms = [tf_feature_names[i] for i in topic.argsort()[:-n_words - 1:-1]]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))

# Label values that each get their own topic model.
labels = [0, 1, 2]

# Five topics per label, five top terms printed per topic.
for label in labels:
    perform_lda_on_label(dataframe=df, label=label, n_topics=5, n_words=5)