Comparing Topic Models for Each Period

In [1]:
import pandas
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

In [2]:
%store -r reddit_sent_df

In [3]:
nltk.data.find("corpora/stopwords")

FileSystemPathPointer('/home/beherya/nltk_data/corpora/stopwords')

In [4]:
base_stopwords = set(stopwords.words("english"))

In [10]:
custom_stopwords = set([
    'like', 'get', 'dont', 'im', 'would', 'really', 'one', 'people',
    'time', 'know', 'feel', 'even', 'go', 'want', 'think', 'much',
    'life', 'day', 'days', 'years', 'year', 'something', 'nothing',
    'got', 'make', 'feeling', 'going', 'things', 'way', 'work',
    'help', 'cant', 'need', 'see', 'friends', 'family', 'ive', 'anyone',
    'anything', 'always', 'else', 'getting', 'started'
])

full_stop_words = base_stopwords.union(custom_stopwords)

In [11]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text


In [12]:
def display_topics(df, text_col, title, n_topics=5, n_top_words=10):
    """
    Runs and prints topic models for a given DataFrame.
    """
    print("\n" + "="*50)
    print(f" {title} (n={len(df)} posts) ")
    print("="*50)

    if len(df) < n_topics:
        print(f"Not enough documents to model {n_topics} topics. Skipping.")
        return

    # 1. Vectorize: Convert text to a word-count matrix
    # We apply our preprocessing and stopword removal here
    vectorizer = CountVectorizer(
        preprocessor=preprocess_text,
        stop_words=list(full_stop_words),
        max_df=0.9,  # Ignore words in > 90% of docs
        min_df=10,   # Ignore words in < 10 docs
        ngram_range=(1, 1) # Only use single words
    )

    try:
        dtm = vectorizer.fit_transform(df[text_col])
    except ValueError as e:
        print(f"Error vectorizing text (maybe all words were stopwords?): {e}")
        return

    # 2. Model: Run Latent Dirichlet Allocation
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=42, # For reproducible results
        n_jobs=-1
    )
    lda.fit(dtm)

    # 3. Display: Print the top words for each topic
    feature_names = vectorizer.get_feature_names_out()

    for topic_idx, topic in enumerate(lda.components_):
        # Get the indices of the top words
        top_words_indices = topic.argsort()[:-n_top_words - 1:-1]
        # Get the words themselves
        top_words = [feature_names[i] for i in top_words_indices]
        print(f"Topic {topic_idx + 1}: {' '.join(top_words)}")

In [13]:
pre_covid_df = reddit_sent_df[reddit_sent_df["covid_period"] == "Pre-COVID"]
during_covid_df = reddit_sent_df[reddit_sent_df["covid_period"] == "During COVID"]
post_covid_df = reddit_sent_df[reddit_sent_df["covid_period"] == "Post-COVID"]

In [14]:
display_topics(pre_covid_df, 'full_text', title="Pre-COVID Topics")
display_topics(during_covid_df, 'full_text', title="During-COVID Topics")
display_topics(post_covid_df, 'full_text', title="Post-COVID Topics")



 Pre-COVID Topics (n=576 posts) 




Topic 1: fucking everyone alone someone love nye never hate fuck many
Topic 2: mental anxiety health didnt first school felt back night since
Topic 3: anxiety bad good person happy anxious someone everything tired talk
Topic 4: new happy alone everyone better everything anxiety anymore hope shit
Topic 5: anxiety panic attack sleep never someone told depression job pain

 During-COVID Topics (n=10517 posts) 
Topic 1: anxiety anxious panic also sleep attack bad attacks take heart
Topic 2: job mental health back home since house didnt last told
Topic 3: someone depression thoughts mental talk person love self good thing
Topic 4: anymore everything hate never tired every happy fucking better good
Topic 5: school didnt never deleted user friend parents mom said told

 Post-COVID Topics (n=6218 posts) 
Topic 1: end talk didnt someone say said friend never thing told
Topic 2: job never mental live love mom parents health school hate
Topic 3: depression better anymore everything thoughts good 