Comparing Subreddits

In [1]:
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

In [2]:
# Restore `reddit_sent_df` from IPython's %store persistence into this kernel.
# NOTE(review): presumably it was saved with `%store reddit_sent_df` in an
# upstream notebook; this cell fails on a machine where that never ran — confirm.
%store -r reddit_sent_df

In [3]:
# Mean `compound` score for every (covid_period, subreddit) pair, flattened
# back into ordinary columns so it can be handed straight to a plotting call.
sentiment_by_sub = (
    reddit_sent_df
    .groupby(['covid_period', 'subreddit'])['compound']
    .mean()
    .reset_index()
)

In [4]:
# Turn the period column into an ordered categorical so that sorting follows
# the listed chronological order; values outside the list become NaN.
sentiment_by_sub['covid_period'] = sentiment_by_sub['covid_period'].astype(
    pd.CategoricalDtype(
        categories=['Pre-COVID', 'During COVID', 'Post-COVID'],
        ordered=True,
    )
)

In [5]:
# Order the rows by the `covid_period` column.
sentiment_by_sub = sentiment_by_sub.sort_values(by=['covid_period'])

In [6]:
# Grouped bar chart: one cluster per COVID period, one bar per subreddit.
# `px` (plotly.express) comes from the notebook's import cell.
# `category_orders` pins the chronological x-axis order explicitly instead of
# relying on the row order of `sentiment_by_sub`, and `labels` gives the axes
# and legend readable names, matching the keyword-frequency figure below.
fig = px.bar(
    sentiment_by_sub,
    x='covid_period',
    y='compound',
    color='subreddit',
    barmode='group',
    category_orders={
        'covid_period': ['Pre-COVID', 'During COVID', 'Post-COVID']
    },
    labels={
        'covid_period': 'COVID Period',
        'compound': 'Mean Compound Score',
        'subreddit': 'Subreddit'
    },
    title='Average Sentiment Score by Subreddit and COVID Period'
)
fig.show()

Keywords by Subreddit Analysis

In [7]:
# Keyword lists for each stressor category. Matching is done later with a
# case-insensitive, word-boundary regex, so multi-word phrases are allowed.
stressor_terms = {
    "health_anxiety": [
        "heart", "symptoms", "panic attack", "panic attacks", "scared",
        "pain", "health", "anxious", "attack",
    ],
    "work_stress": ["job", "home", "house", "wfh", "remote", "work"],
    "school_stress": [
        "school", "parents", "mom", "dad", "remote school", "class",
        "online class",
    ],
    "burnout": [
        "tired", "anymore", "hate", "exhausted", "fucking tired", "end",
    ],
    "therapy": [
        "therapist", "therapy", "counseling", "telehealth", "find help",
    ],
}

# Period labels in chronological order.
periods = ["Pre-COVID", "During COVID", "Post-COVID"]

# Distinct subreddits in the data, and the category names in dict order.
subreddits = reddit_sent_df["subreddit"].unique()
categories = list(stressor_terms)

# Accumulator for the per-(period, subreddit, category) frequency records.
results = []


In [8]:
def count_total_words(text_series):
    """Return the total number of whitespace-separated words in a text Series.

    Parameters
    ----------
    text_series : pandas.Series
        Text entries. Missing values (None/NaN) are ignored; any other
        non-string value is stringified before splitting.

    Returns
    -------
    int
        Sum of the word counts of all non-missing entries; 0 when the Series
        is empty or contains only missing values.
    """
    # Drop missing entries first: `astype(str)` would otherwise turn them into
    # the literal strings "nan"/"None" and count each one as a word.
    present = text_series.dropna()
    if present.empty:
        return 0
    words_per_entry = present.astype(str).str.split().str.len()
    # Plain int keeps the return type consistent with count_keyword_mentions.
    return int(words_per_entry.sum())


def count_keyword_mentions(text_series, keywords):
    """Count case-insensitive, whole-word keyword matches across a text Series.

    Parameters
    ----------
    text_series : pandas.Series
        Text entries. Missing values (None/NaN) are ignored; any other
        non-string value is stringified before matching.
    keywords : iterable of str
        Words or phrases to look for. Each one is regex-escaped and must match
        on word boundaries. Empty strings are discarded.

    Returns
    -------
    int
        Total number of non-overlapping matches over all entries; 0 when
        there is no text or no usable keyword.
    """
    # An empty alternative would match at every word boundary and produce a
    # meaningless count, so discard empty keywords up front.
    usable_keywords = [k for k in keywords if k]
    # Drop missing entries so they are not matched as the strings "nan"/"None".
    present = text_series.dropna()
    if present.empty or not usable_keywords:
        return 0

    # Alternation of escaped keywords: \b(?:word1|word2|word3)\b
    pattern = r"\b(?:" + "|".join(re.escape(k) for k in usable_keywords) + r")\b"
    mentions = present.astype(str).str.count(pattern, flags=re.IGNORECASE).sum()
    return int(mentions)

In [9]:
# Start from an empty accumulator every time this cell runs; otherwise
# re-executing the cell appends a second copy of every record to `results`.
results = []

for period in periods:
    # The period filter does not depend on the subreddit, so build it once.
    in_period = reddit_sent_df["covid_period"] == period

    for sub in subreddits:
        # Rows belonging to this (period, subreddit) pair
        subset_df = reddit_sent_df[in_period & (reddit_sent_df["subreddit"] == sub)]
        if subset_df.empty:
            continue

        # Text for this subset and its total word count (the denominator)
        text_data = subset_df["full_text"]
        total_words = count_total_words(text_data)
        if total_words == 0:
            # No words to normalise by; skip instead of dividing by zero.
            continue

        # One record per stressor category
        for category, keywords in stressor_terms.items():
            mentions = count_keyword_mentions(text_data, keywords)
            # Mentions per 1000 words; total_words is non-zero here.
            freq_per_1000 = (mentions / total_words) * 1000

            results.append(
                {
                    "covid_period": period,
                    "subreddit": sub,
                    "category": category,
                    "frequency_per_1000": freq_per_1000,
                    "total_mentions": mentions,
                    "total_words": total_words,
                }
            )

In [10]:
# Build the frequency table from the accumulated records. The columns are
# named explicitly so the frame keeps its schema even when `results` is empty;
# without them an empty run yields a column-less frame and later lookups such
# as keyword_freq_df['subreddit'] raise KeyError.
keyword_freq_df = pd.DataFrame(
    results,
    columns=[
        "covid_period",
        "subreddit",
        "category",
        "frequency_per_1000",
        "total_mentions",
        "total_words",
    ],
)

In [14]:
# Fixed display orders: subreddits alphabetically, periods chronologically.
subreddit_order = sorted(keyword_freq_df["subreddit"].unique())
covid_period_order = ["Pre-COVID", "During COVID", "Post-COVID"]

# One facet per stressor category (wrapped at three per row); inside each
# facet the bars are grouped by subreddit and coloured by COVID period.
fig = px.bar(
    keyword_freq_df,
    x="subreddit",
    y="frequency_per_1000",
    color="covid_period",
    barmode="group",
    facet_col="category",
    facet_col_wrap=3,
    facet_row_spacing=0.15,
    facet_col_spacing=0.05,
    category_orders={
        "subreddit": subreddit_order,
        "covid_period": covid_period_order,
    },
    labels={
        "subreddit": "Subreddit",
        "frequency_per_1000": "Frequency Per 1000 Words",
        "covid_period": "COVID Period",
    },
    color_discrete_sequence=px.colors.sequential.Darkmint_r,
    title="Keyword Frequency by Subreddit, Period, and Category",
    height=800,
)

# Extra bottom margin leaves room for the rotated tick labels.
fig.update_layout(margin={"b": 100})
# Unlink the axes so every facet gets its own scale and its own tick labels.
fig.update_yaxes(matches=None, showticklabels=True)
fig.update_xaxes(tickangle=45, matches=None, showticklabels=True)
# Facet titles arrive as "category=<name>"; keep only the part after "=".
fig.for_each_annotation(
    lambda facet_title: facet_title.update(text=facet_title.text.split("=")[-1])
)

fig.show()