## Import library

In [None]:
# IMPORT LIBRARY
# For decompressing and processing data
import zstandard
import os
import json
import sys
import csv
from datetime import datetime
import logging.handlers

# For scraping Reddit submissions
!pip install praw
import praw
import pandas as pd
import time

# For data cleaning and visualization
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re

# Import NLTK for text processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

# Import the sentiment analysis tool
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Install and load the SpaCy package
!pip install spacy
import spacy

# Download and install the SpaCy English language model
!python -m spacy download en_core_web_sm

# For topic modeling
import gensim
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import STOPWORDS

# For temporal analysis visualization
import statsmodels.api as sm
from matplotlib.ticker import FuncFormatter

# For date formatting in charts
import matplotlib.dates as mdates

# For interactive visualization
import plotly.express as px

import ast

# For statistical tests
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.proportion import proportions_ztest

## 4.Sentiment Analysis

This section of the code includes:

1. VADER analysis: analyze VADER sentiment polarity for each post, compute VADER descriptive statistics, and visualize sentiment changes over time.

2. NRC-EIL analysis: analyze eight emotions with NRC-EIL for each post, compute NRC-EIL descriptive statistics, and visualize changes in the eight emotions over time.

3. Topic sentiment/emotion analysis: VADER sentiment at the topic level by period, and NRC-EIL emotion at the topic level by period.

### 4.1.Vader analysis

- a. Analyze Vader sentiment polarity on each post.
- b. VADER descriptive statistics.
- c. Visualize sentiment changes over time.

#### a. Analyze Vader sentiment polarity on each post.

In [None]:
# Open the submissions for VADER analysis
vader_submissions = pd.read_csv('/Users/Desktop/Reddit data/preprocessed_vader.csv')

In [None]:
vader_submissions

In [None]:
# Instantiate the VADER analyzer
sid = SentimentIntensityAnalyzer()

# Analyze sentiment on each post
def analyze_vader_sentiment(text, threshold=0.15):
    """Score one post with VADER and label it positive, negative or neutral.

    Parameters
    ----------
    text : object
        Post body; it is converted with ``str()`` before scoring.
    threshold : float, default 0.15
        Cut-off applied to the compound score: above ``threshold`` is
        'positive', below ``-threshold`` is 'negative', anything else is
        'neutral'.

    Returns
    -------
    tuple
        ``(sentiment, pos, neg, neu, compound)`` where the last four values
        are read from the dict returned by ``sid.polarity_scores``.
    """
    text = str(text)  # convert text to string type
    scores = sid.polarity_scores(text)  # get sentiment scores
    compound = scores['compound']
    # determine the sentiment category based on the compound score
    if compound > threshold:
        sentiment = 'positive'  # compound > 0.15 by default
    elif compound < -threshold:
        sentiment = 'negative'  # compound < -0.15 by default
    else:
        # A plain else keeps `sentiment` bound for every input, including a
        # compound score that fails both comparisons (e.g. NaN).
        sentiment = 'neutral'  # -0.15 <= compound <= 0.15 by default
    # return the sentiment category and scores
    return sentiment, scores['pos'], scores['neg'], scores['neu'], compound

In [None]:
# Apply sentiment analysis to each post
vader_submissions['Sentiment'], vader_submissions['Positive'], vader_submissions['Negative'], vader_submissions['Neutral'], vader_submissions['Compound_Score'] = zip(*vader_submissions['Selftext'].apply(analyze_vader_sentiment))

In [None]:
vader_submissions

In [None]:
vader_submissions.to_csv('/Users/Desktop/Reddit data/vader_submissions.csv', index=False)

In [None]:
# Apply Vader analysis to the example post
post_text = "I'm proud because in the past this was always the start of a pattern: If I missed gym once I missed it a second time because the perfect week was ruined anyways. Then I often would not go back to the gym at all. But not this time. I just called my gym buddy and told him to meet two days later at the gym so we could work out together. I found it really easy to stay consistent when there is someone who expects me to show up. But there is more success! My gym buddy who I relied on in the first weeks worked day shifts for 2 weeks straight now, so he could not work out when I did. But I still got my workout done, alone, with my own willpower! I also sticked to my meal plan and lost some pounds because the success in the gym motivated me."

sentiment, pos_score, neg_score, neu_score, compound_score = analyze_vader_sentiment(post_text)

# print the output
print(f"{post_text}")
print()
print(f"Sentiment: {sentiment}, Positive Score: {pos_score}, Negative Score: {neg_score}, Neutral Score: {neu_score}, Compound Score: {compound_score}")

#### b. VADER descriptive statistics.

In [None]:
# Count the number of negative, positive, and neutral posts in each time period
sentiment_counts = vader_submissions.groupby(['Period', 'Sentiment']).size()

# Boolean masks selecting the pre-pandemic and during-pandemic posts
is_pre_pandemic = vader_submissions['Period'] == 'pre_pandemic'
is_dur_pandemic = vader_submissions['Period'] == 'dur_pandemic'
pre_pandemic_posts = vader_submissions[is_pre_pandemic]
dur_pandemic_posts = vader_submissions[is_dur_pandemic]

# Tally the sentiment labels inside each period
sentiment_counts_pre = pre_pandemic_posts['Sentiment'].value_counts()
sentiment_counts_dur = dur_pandemic_posts['Sentiment'].value_counts()

# Express each tally as a percentage of that period's labelled posts
sentiment_percentages_pre = sentiment_counts_pre.div(sentiment_counts_pre.sum()).mul(100)
sentiment_percentages_dur = sentiment_counts_dur.div(sentiment_counts_dur.sum()).mul(100)

In [None]:
sentiment_counts

In [None]:
(sentiment_percentages_pre, sentiment_percentages_dur)

In [None]:
# chi-square test
# Observed 2x2 contingency table, hard-coded.
# NOTE(review): presumably rows are the two periods and columns are two
# sentiment categories copied from the counts printed above -- confirm.
observed_rows = [
    [13294, 4572],
    [11455, 5619],
]
data = np.array(observed_rows)

# Chi-square test of independence on the table
chi2, p, dof, ex = chi2_contingency(data)

for label, value in (("Chi2 Statistic", chi2), ("P-value", p)):
    print(f"{label}: {value}")

### 4.2.NRC-EIL analysis

- a. Analyze 8 emotions by NRC-EIL on each post.
- b. NRC-EIL descriptive statistics.
- c. Visualize emotion changes over time.

#### a. Analyze 8 emotions by NRC-EIL on each post.

In [None]:
# Load emotion lexicon and submissions data
lex = pd.read_csv('/Users/Desktop/Reddit data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', sep='\t', names=['word', 'emotion', 'present'])
lex = lex[lex.present == 1]

nrc_submissions = pd.read_csv('/Users/Desktop/Reddit data/preprocessed_nrc.csv')

In [None]:
nrc_submissions

In [None]:
# NRC-EIL sentiment analysis
# Cache for the word -> emotions lookup. It is rebuilt whenever the global
# `lex` is rebound to a different DataFrame; in-place edits of `lex` are not
# detected.
_nrc_lookup_cache = {'lexicon': None, 'by_word': None}


def _nrc_emotions_by_word():
    """Return a dict mapping each lexicon word to the list of its emotions."""
    if _nrc_lookup_cache['lexicon'] is not lex:
        # Exclude the 'positive' and 'negative' polarity rows
        emotions_only = lex[(lex.emotion != 'positive') & (lex.emotion != 'negative')]
        by_word = {}
        for word, emotion in zip(emotions_only['word'], emotions_only['emotion']):
            by_word.setdefault(word, []).append(emotion)
        _nrc_lookup_cache['by_word'] = by_word
        _nrc_lookup_cache['lexicon'] = lex
    return _nrc_lookup_cache['by_word']


def analyze_nrc_sentiment(text):
    """Count, per emotion, how many lexicon matches a post contains.

    The text is converted with ``str()`` and split on whitespace. Every
    token that appears in ``lex`` adds one to each emotion listed for that
    word, ignoring the 'positive' and 'negative' rows.

    Returns a dict with one key per value of ``lex.emotion`` ('positive' and
    'negative' are present but always 0).
    """
    sentiment_scores = {emotion: 0 for emotion in lex.emotion.unique()}
    # One dict lookup per token instead of filtering the whole lexicon
    # DataFrame for every word of every post.
    emotions_by_word = _nrc_emotions_by_word()
    for word in str(text).split():
        for emotion in emotions_by_word.get(word, ()):
            sentiment_scores[emotion] += 1
    return sentiment_scores


# The original line referenced `analyze_nre_sentiment`, which is not defined
nrc_submissions['Emotion'] = nrc_submissions['Selftext'].apply(analyze_nrc_sentiment)

In [None]:
# Apply nrc-eil analysis to the example post
post_text = "I'm proud because in the past this was always the start of a pattern: If I missed gym once I missed it a second time because the perfect week was ruined anyways. Then I often would not go back to the gym at all. But not this time. I just called my gym buddy and told him to meet two days later at the gym so we could work out together. I found it really easy to stay consistent when there is someone who expects me to show up. But there is more success! My gym buddy who I relied on in the first weeks worked day shifts for 2 weeks straight now, so he could not work out when I did. But I still got my workout done, alone, with my own willpower! I also sticked to my meal plan and lost some pounds because the success in the gym motivated me."
sentiment_scores = analyze_nrc_sentiment(post_text)

# Print the result
print(f"{post_text}")
print()
scores_output = ", ".join([f"{emotion.capitalize()} : {score}" for emotion, score in sentiment_scores.items() if emotion not in ['positive', 'negative']])
print(scores_output)

In [None]:
nrc_submissions.to_csv('/Users/Desktop/Reddit data/nrc_submissions.csv', index=False)

In [None]:
nrc_submissions = pd.read_csv('/Users/Desktop/Reddit data/nrc_submissions.csv')

In [None]:
# Function to safely evaluate the string representation of the dictionary
def safe_literal_eval(val):
    """Parse the string form of a Python literal, or return None on failure.

    Parameters
    ----------
    val : object
        Usually the string representation of a dict read back from CSV.

    Returns
    -------
    object or None
        The value produced by ``ast.literal_eval``, or ``None`` when the
        input cannot be evaluated as a literal.
    """
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers input that parses but cannot be built, such as a
        # dict literal whose key is a list.
        return None

# Apply the safe_literal_eval function to the 'Emotion' column
nrc_submissions['Emotion'] = nrc_submissions['Emotion'].apply(safe_literal_eval)

In [None]:
# Transform dictionary into columns
nrc_emotion = nrc_submissions['Emotion'].apply(pd.Series)

In [None]:
# Drop the original 'Emotion' column and add the emotions as new columns
nrc_submissions = pd.concat([nrc_submissions.drop(['Emotion'], axis=1), nrc_emotion], axis=1)

In [None]:
# Parse 'Creation Time' as datetime and remove the positive and negative columns
nrc_submissions['Creation Time'] = pd.to_datetime(nrc_submissions['Creation Time'])
nrc_submissions = nrc_submissions.drop(columns=['positive', 'negative'] )

In [None]:
nrc_submissions

In [None]:
nrc_submissions.to_csv('/Users/Desktop/Reddit data/nrc_submissions_v1.csv', index=False)

#### b. NRC-EIL descriptive statistics.

In [None]:
# Extract a subset of data before and during the pandemic
pre_pandemic_data = nrc_submissions[nrc_submissions['Period'] == 'pre_pandemic']
dur_pandemic_data = nrc_submissions[nrc_submissions['Period'] == 'dur_pandemic']

In [None]:
# Calculate the sum of each emotion before the pandemic
pre_pandemic_emotions_sum = pre_pandemic_data[['trust', 'fear', 'sadness', 'anger', 'surprise', 'disgust', 'joy', 'anticipation']].sum()
pre_pandemic_emotions_sum

In [None]:
# Calculate the proportion of each emotion before the pandemic
pre_pandemic_emotions_percentage = pre_pandemic_emotions_sum / pre_pandemic_emotions_sum.sum() * 100
pre_pandemic_emotions_percentage

In [None]:
# Calculate the sum of each emotion during the pandemic
dur_pandemic_emotions_sum = dur_pandemic_data[['trust', 'fear', 'sadness', 'anger', 'surprise', 'disgust', 'joy', 'anticipation']].sum()
dur_pandemic_emotions_sum

In [None]:
# Calculate the proportion of each emotion during the pandemic
dur_pandemic_emotions_percentage = dur_pandemic_emotions_sum / dur_pandemic_emotions_sum.sum() * 100
dur_pandemic_emotions_percentage

In [None]:
# Chi-square test
# Observed 2x8 contingency table of emotion counts, hard-coded.
# NOTE(review): presumably one row per period and one column per emotion,
# copied from the sums printed above -- confirm the row and column order.
emotion_counts_row_1 = [165270, 141028, 138341, 91326, 106091, 110334, 140532, 191040]
emotion_counts_row_2 = [191707, 169208, 163727, 105152, 124785, 130163, 163295, 223038]
data = np.array([emotion_counts_row_1, emotion_counts_row_2])

# Chi-square test of independence on the table
chi2, p, dof, ex = chi2_contingency(data)

for label, value in (("Chi2 Statistic", chi2), ("P-value", p)):
    print(f"{label}: {value}")

### 4.3.Topic Sentiment and emotion analysis

- a. Sentiment in topic level by period.
- b. Emotion in topic level by period.

In [None]:
# Open the submissions with their topic assignments (the 'Topic' column is used below)
lda_submissions = pd.read_csv('/Users/Desktop/Reddit data/lda_submissions.csv')

In [None]:
# Open the submissions for VADER analysis
vader_submissions = pd.read_csv('/Users/Desktop/Reddit data/vader_submissions.csv')

In [None]:
# Open the submissions for NRC-EIL analysis
nrc_submissions = pd.read_csv('/Users/Desktop/Reddit data/nrc_submissions_v1.csv')

In [None]:
# Copy the topic label of every post onto the VADER table. `.values` hands over
# a plain array, so labels are matched by row position rather than by index.
# NOTE(review): assumes both CSV files hold the same posts in the same order -- confirm.
vader_submissions['Topic'] = lda_submissions['Topic'].values

In [None]:
# Copy the topic label of every post onto the NRC table. `.values` hands over
# a plain array, so labels are matched by row position rather than by index.
# NOTE(review): assumes both CSV files hold the same posts in the same order -- confirm.
nrc_submissions['Topic'] = lda_submissions['Topic'].values

#### a. Sentiment in topic level by period.

In [None]:
# Filter data by period
pre_pandemic_data = vader_submissions[vader_submissions['Period'] == 'pre_pandemic']
dur_pandemic_data = vader_submissions[vader_submissions['Period'] == 'dur_pandemic']

# Function to calculate sentiment proportions
def calculate_sentiment_proportions(df):
    """Return the percentage of each sentiment label within every topic.

    `df` needs 'Topic' and 'Sentiment' columns. The result has one row per
    topic and one column per sentiment label; each row adds up to 100, and a
    label that never occurs in a topic is reported as 0.
    """
    share_within_topic = df.groupby('Topic')['Sentiment'].value_counts(normalize=True)
    percent_table = share_within_topic.mul(100).unstack(fill_value=0)
    return percent_table

# Calculate proportions for each period
pre_pandemic_sentiments = calculate_sentiment_proportions(pre_pandemic_data)
dur_pandemic_sentiments = calculate_sentiment_proportions(dur_pandemic_data)

In [None]:
# Print the results
pre_pandemic_sentiments

In [None]:
dur_pandemic_sentiments

In [None]:
# Visualize the sentiment polarity by topic
# Customize the topic order
topics = [2, 11, 9, 1, 10, 12, 3, 14, 4, 15, 5, 6, 13, 8, 7, 0]
pre_pandemic_sentiments_ordered = pre_pandemic_sentiments.loc[topics]
dur_pandemic_sentiments_ordered = dur_pandemic_sentiments.loc[topics]

# Pull every sentiment column out as a plain list, following the topic order
pre_pandemic_positive, pre_pandemic_negative, pre_pandemic_neutral = (
    pre_pandemic_sentiments_ordered[column].tolist()
    for column in ('positive', 'negative', 'neutral')
)
dur_pandemic_positive, dur_pandemic_negative, dur_pandemic_neutral = (
    dur_pandemic_sentiments_ordered[column].tolist()
    for column in ('positive', 'negative', 'neutral')
)

# Bar geometry: group centres sit 3 * bar_width + gap_width apart, and each
# group holds six bars of width bar_width centred on that position
bar_width = 0.2  # Width of each bar
gap_width = 0.8  # Gap term used in the spacing of the group centres
r = np.arange(len(topics)) * (3 * bar_width + gap_width)

# Create the plot
fig, ax = plt.subplots(figsize=(16, 8))

# One entry per bar series: (offset in bar widths, heights, colour, legend label)
bar_series = [
    (-2.5, pre_pandemic_positive, '#1f77b4', 'Pre-pandemic Positive'),
    (-1.5, dur_pandemic_positive, '#aec7e8', 'Dur-pandemic Positive'),
    (-0.5, pre_pandemic_negative, '#ff7f0e', 'Pre-pandemic Negative'),
    (0.5, dur_pandemic_negative, '#ffbb78', 'Dur-pandemic Negative'),
    (1.5, pre_pandemic_neutral, '#2ca02c', 'Pre-pandemic Neutral'),
    (2.5, dur_pandemic_neutral, '#98df8a', 'Dur-pandemic Neutral'),
]
for offset, heights, colour, series_label in bar_series:
    ax.bar(r + offset * bar_width, heights, color=colour, width=bar_width, label=series_label)

# Tick labels are the topic ids prefixed with 'T', in the same custom order
x_labels = [f'T{topic}' for topic in topics]
ax.set_xticks(r)
ax.set_xticklabels(x_labels, fontsize=13)
ax.set_ylabel('Percentage (%)', fontsize=15)

# Legend placed outside the axes; the dict keeps one handle per label
handles, labels = ax.get_legend_handles_labels()
by_label = dict(zip(labels, handles))
ax.legend(by_label.values(), by_label.keys(), loc='upper left', bbox_to_anchor=(1, 1))

# Display the plot
plt.tight_layout()
plt.show()

In [None]:
# Number of posts before and during the pandemic, derived from the data
# instead of being hard-coded
pre_period_posts = vader_submissions[vader_submissions['Period'] == 'pre_pandemic']
dur_period_posts = vader_submissions[vader_submissions['Period'] == 'dur_pandemic']
n_pre_pandemic = len(pre_period_posts)  # pre-pandemic posts
n_dur_pandemic = len(dur_period_posts)  # dur-pandemic posts

# Number of posts per (topic, sentiment) in each period
pre_topic_counts = pd.crosstab(pre_period_posts['Topic'], pre_period_posts['Sentiment'])
dur_topic_counts = pd.crosstab(dur_period_posts['Topic'], dur_period_posts['Sentiment'])

# Use the real topic labels and give both tables the same rows, so a topic
# that is absent from one period shows up with zero posts instead of failing
all_topics = sorted(set(pre_topic_counts.index) | set(dur_topic_counts.index))
pre_topic_counts = pre_topic_counts.reindex(index=all_topics, fill_value=0)
dur_topic_counts = dur_topic_counts.reindex(index=all_topics, fill_value=0)

# Sample size of each topic within a period. The sentiment percentages are
# shares *within a topic*, so the matching number of observations is the
# number of posts in that topic, not the total number of posts in the period.
pre_topic_sizes = pre_topic_counts.sum(axis=1)
dur_topic_sizes = dur_topic_counts.sum(axis=1)

# Make sure both tested columns exist even if a label never occurs
tested_sentiments = ['positive', 'negative']
pre_topic_counts = pre_topic_counts.reindex(columns=tested_sentiments, fill_value=0)
dur_topic_counts = dur_topic_counts.reindex(columns=tested_sentiments, fill_value=0)

results = []  # create an empty list to store the results

# Loop over the topics and compare pre- vs during-pandemic proportions
for topic in all_topics:
    # number of posts in this topic in each period
    nobs = [pre_topic_sizes.loc[topic], dur_topic_sizes.loc[topic]]

    # number of positive / negative posts in this topic in each period
    count_positive = [pre_topic_counts.loc[topic, 'positive'], dur_topic_counts.loc[topic, 'positive']]
    count_negative = [pre_topic_counts.loc[topic, 'negative'], dur_topic_counts.loc[topic, 'negative']]

    if min(nobs) > 0:
        # perform the two-sample Z-test for proportions
        z_stat_positive, p_value_positive = proportions_ztest(count_positive, nobs)
        z_stat_negative, p_value_negative = proportions_ztest(count_negative, nobs)
    else:
        # no posts for this topic in one of the periods: nothing to compare
        z_stat_positive = p_value_positive = np.nan
        z_stat_negative = p_value_negative = np.nan

    # save the result
    results.append({
        'Topic': topic,
        'Positive Z-statistic': z_stat_positive,
        'Positive P-value': p_value_positive,
        'Negative Z-statistic': z_stat_negative,
        'Negative P-value': p_value_negative
    })

In [None]:
# Print the z-test result
for result in results:
    print(f"Topic {result['Topic']}:")
    print(f"  Positive: Z-Statistic = {result['Positive Z-statistic']:.4f}, P-Value = {result['Positive P-value']:.4f}")
    print(f"  Negative: Z-Statistic = {result['Negative Z-statistic']:.4f}, P-Value = {result['Negative P-value']:.4f}")

#### b. Emotion in topic level by period.

In [None]:
# The eight emotion columns and the names of their percentage columns
emotion_columns = ['joy', 'trust', 'anticipation', 'surprise', 'fear', 'sadness', 'disgust', 'anger']
proportion_columns = [emotion + '_prop' for emotion in emotion_columns]

# Step 1: Sum every emotion column within each (Period, Topic) pair
agg_nrc = (
    nrc_submissions
    .groupby(['Period', 'Topic'])
    .agg(dict.fromkeys(emotion_columns, 'sum'))
    .reset_index()
)

# Step 2: Total emotion count per row, used as the denominator below
agg_nrc['total_emotions'] = agg_nrc[emotion_columns].sum(axis=1)

# Step 3: Percentage of each emotion, so that every row adds up to 100%
for emotion in emotion_columns:
    agg_nrc[emotion + '_prop'] = agg_nrc[emotion].div(agg_nrc['total_emotions']).mul(100)

# Steps 4 and 5: One table per period, keeping the topic, the raw counts and
# the percentages
display_columns = ['Topic'] + emotion_columns + proportion_columns
pre_pandemic_nrc = agg_nrc.loc[agg_nrc['Period'] == 'pre_pandemic', display_columns]
dur_pandemic_nrc = agg_nrc.loc[agg_nrc['Period'] == 'dur_pandemic', display_columns]

In [None]:
# Display the pre-pandemic tables
pre_pandemic_nrc

In [None]:
# Display the dur-pandemic tables
dur_pandemic_nrc

In [None]:
# Visualize the emotion distribution for each period
# Custom topic order, arranged according to the themes
topics = [2, 11, 9, 1, 10, 12, 3, 14, 4, 15, 5, 6, 13, 8, 7, 0]
x_labels = [f'T{topic}' for topic in topics]

# Reorder both tables so that their rows follow the custom topic order
pre_pandemic_sorted = pre_pandemic_nrc.set_index('Topic').loc[topics].reset_index()
dur_pandemic_sorted = dur_pandemic_nrc.set_index('Topic').loc[topics].reset_index()

# Key order of the dictionaries below; it is the order in which the plotting
# function receives the emotions
emotion_order = ['joy', 'trust', 'anticipation', 'surprise', 'fear', 'sadness', 'disgust', 'anger']

# Percentages of the eight emotions before the pandemic
pre_pandemic = {
    emotion: pre_pandemic_sorted[emotion + '_prop'].tolist()
    for emotion in emotion_order
}

# Percentages of the eight emotions during the pandemic
dur_pandemic = {
    emotion: dur_pandemic_sorted[emotion + '_prop'].tolist()
    for emotion in emotion_order
}

In [None]:
def plot_emotion_distribution(data, topics, x_labels):
    """Draw a stacked bar chart with one bar per topic and one layer per emotion.

    Parameters
    ----------
    data : dict
        Maps an emotion name to a list of percentages, one per topic. Layers
        are stacked in the dict's order; the first four entries are drawn in
        orange shades and the next four in blue shades.
    topics : sequence
        Only its length is used, to position the bars.
    x_labels : sequence of str
        Tick labels shown under the bars.
    """
    bar_width = 0.5  # width of every bar
    r = np.arange(len(topics))

    # Orange shades (positive emotions) followed by blue shades (negative
    # emotions); the layer at position i uses palette[i]
    palette = [
        '#ffcc99', '#ff9933', '#ff7f0e', '#ff6f00',
        '#aec7e8', '#6699cc', '#1f77b4', '#0f5292',
    ]

    fig, ax = plt.subplots(figsize=(16, 8))
    stack_base = np.zeros(len(r))  # running height of each bar

    # Add the layers one emotion at a time
    for layer_index, (emotion, values) in enumerate(data.items()):
        layer = ax.bar(
            r,
            values,
            bottom=stack_base,
            width=bar_width,
            label=emotion.capitalize(),
            color=palette[layer_index],
        )
        # Write the percentage in the middle of each segment
        for segment, value in zip(layer, values):
            x_centre = segment.get_x() + segment.get_width() / 2
            y_centre = segment.get_y() + segment.get_height() / 2
            ax.text(x_centre, y_centre, f'{value:.1f}%', ha='center', va='center', fontsize=8)
        stack_base += np.asarray(values)

    # Axis labels and a legend placed outside the axes
    ax.set_xticks(r)
    ax.set_xticklabels(x_labels, fontsize=13)
    ax.set_ylabel('Proportion (%)', fontsize=15)
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))

    plt.tight_layout()
    plt.show()

In [None]:
# plot the emotions distribution before the pandemic
plot_emotion_distribution(pre_pandemic, topics, x_labels)

In [None]:
# plot the emotions distribution during the pandemic
plot_emotion_distribution(dur_pandemic, topics, x_labels)