## Import libraries

In [None]:
# IMPORT LIBRARY
# For decompressing and processing data
import zstandard
import os
import json
import sys
import csv
from datetime import datetime
import logging.handlers

# For scraping Reddit submissions
!pip install praw
import praw
import pandas as pd
import time

# For data cleaning and visualization
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re

# Import NLTK for text processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

# Import the sentiment analysis tool
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Install and load the SpaCy package
!pip install spacy
import spacy

# Download and install the SpaCy English language model
!python -m spacy download en_core_web_sm

# For topic modeling
import gensim
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import STOPWORDS

# For temporal analysis visualization
import statsmodels.api as sm
from matplotlib.ticker import FuncFormatter

# For date formatting in charts
import matplotlib.dates as mdates

# For interactive visualization
import plotly.express as px

import ast

# For statistical tests
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.proportion import proportions_ztest

## 3. Topic Modeling

This section of the code includes:

- a. Build the dictionary and bag-of-words (BOW) corpus for the LDA model.
- b. Test candidate numbers of topics from 5 to 30 to find the optimal number.
- c. Run the LDA model with the optimal number of topics.
- d. Temporal analysis of the topics, grouped into five themes.

### a. Build dictionary and BOW corpus

In [None]:
# Load the preprocessed submissions that serve as input for the LDA analysis
preprocessed_lda = pd.read_csv(
    '/Users/Desktop/Reddit data/preprocessed_lda.csv',
)

In [None]:
# Convert the post text (the 'Selftext' column) to strings
selftext_column = preprocessed_lda['Selftext']
lda_texts = selftext_column.astype(str)

In [None]:
# Tokenize each post and drop tokens whose lowercase form is in stop_words
# NOTE(review): stop_words is not defined in this section — presumably created in an earlier cell; confirm.
tokenized_texts = [
    [token for token in word_tokenize(post) if token.lower() not in stop_words]
    for post in lda_texts
]

In [None]:
# Build the token <-> id mapping from the tokenized posts
dictionary = corpora.Dictionary(documents=tokenized_texts)

# Keep only tokens that occur in at least 5 documents and in no more than 80% of all documents
dictionary.filter_extremes(no_below=5, no_above=0.8)

# Show a short summary of the filtered dictionary
print(dictionary)

In [None]:
# Convert every tokenized post into its bag-of-words representation
corpus = list(map(dictionary.doc2bow, tokenized_texts))

### b. Test the optimal topic numbers

In [None]:
# Candidate numbers of topics to evaluate: every integer from 5 to 30 inclusive
topic_numbers = range(5, 31)

In [None]:
# Create an empty list that will collect one c_v coherence score per candidate topic number
c_v_scores = []

In [None]:
# Hyperparameters shared by every LdaMulticore run in this notebook

# Priors
alpha = 1    # document-topic prior
beta = 0.1   # topic-word prior (passed to gensim as eta)

# Training schedule
passes = 20         # number of passes through the corpus
iterations = 1000   # iterate up to 1000 times

# Execution
workers = 3         # number of worker processes
random_state = 42   # fixed seed for reproducibility

In [None]:
# Test candidate topic numbers: train one LDA model per value and record its c_v coherence.
# Start from an empty list so that re-running this cell does not append a second
# set of scores, which would leave c_v_scores misaligned with topic_numbers.
c_v_scores.clear()
for num_topics in topic_numbers:
    # Train the model with the shared hyperparameters and the current number of topics
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        alpha=alpha,
        eta=beta,
        workers=workers,
        passes=passes,
        random_state=random_state,
        iterations=iterations,
    )
    # Score the model with the c_v coherence measure on the tokenized posts
    c_v_model = CoherenceModel(
        model=lda_model,
        texts=tokenized_texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    c_v_lda = c_v_model.get_coherence()
    c_v_scores.append(c_v_lda)

In [None]:
# Plot the coherence score against the number of topics

# Candidate topic numbers for the x-axis (5 through 30)
topic_numbers = range(5, 31)

# Coherence curve, plus a dashed red marker at the selected number of topics (16)
plt.plot(topic_numbers, c_v_scores, label='Coherence Score')
plt.axvline(x=16, color='red', linestyle='--', label='Selected Topic Number')

# Axis labels, ticks every 5 topics, and a thin gray grid
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.xticks(list(range(5, 31, 5)))
plt.grid(True, color='gray', linestyle='-', linewidth=0.5)

# Small legend in the lower-right corner
plt.legend(loc='lower right', prop={'size': 8})

# Save the figure as a 300-dpi PNG before displaying it
plt.savefig('consistency_scores_plot.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Report the coherence value obtained for each candidate number of topics
for num_topics, coherence_score in zip(topic_numbers, c_v_scores):
    message = f'The model with num_topics = {num_topics} has a coherence value of {coherence_score:.3f}'
    print(message)

### c. Run the LDA model with 16 topics

In [None]:
# Rebuild the dictionary and BOW corpus for the 16-topic model

# Custom stop words to exclude
# NOTE(review): only these custom words are filtered in this pass; stop_words from the
# first tokenization is not applied here — confirm this is intended.
custom_stop_words = {
    'get', 'make', 'take', 'really', 'still', 'even', 'also',
    'try', 'see', 'thing', 'new', 'end', 'much',
}

# Tokenize each post, dropping tokens whose lowercase form is a custom stop word
tokenized_texts_16 = [
    [token for token in word_tokenize(post) if token.lower() not in custom_stop_words]
    for post in lda_texts
]

# Create the new dictionary and apply the same frequency filter as before
dictionary_16 = corpora.Dictionary(documents=tokenized_texts_16)
dictionary_16.filter_extremes(no_below=5, no_above=0.8)

# Create the new corpus (bag-of-words representation of every post)
corpus_16 = [dictionary_16.doc2bow(tokens) for tokens in tokenized_texts_16]

In [None]:
# Train the final LDA model with 16 topics on the rebuilt corpus,
# reusing the hyperparameters configured earlier
lda_model_16 = LdaMulticore(
    corpus=corpus_16,
    id2word=dictionary_16,
    num_topics=16,
    alpha=alpha,
    eta=beta,
    passes=passes,
    iterations=iterations,
    workers=workers,
    random_state=random_state,
)

In [None]:
# Show every topic ID together with its top weighted words
for idx, topic in lda_model_16.print_topics(num_topics=-1):
    print(f'Topic: {idx} \nWords: {topic}')

In [None]:
# Compute the c_v coherence score of the 16-topic model
coherence_lda_model_16 = CoherenceModel(
    model=lda_model_16,
    texts=tokenized_texts_16,
    dictionary=dictionary_16,
    coherence='c_v',
)
coherence_lda_16 = coherence_lda_model_16.get_coherence()
print('\nCoherence Score: ', coherence_lda_16)

In [None]:
# For every post (bag-of-words vector) fetch its list of (topic ID, probability) pairs
doc_topics = [
    lda_model_16.get_document_topics(bow)
    for bow in corpus_16
]

In [None]:
# Pick, for each post, the ID of the topic with the highest probability
relevant_topics = [
    max(topic_probs, key=lambda pair: pair[1])[0]
    for topic_probs in doc_topics
]

In [None]:
# Add a new column 'Topic' holding each post's most relevant topic ID
# (relevant_topics has one entry per row, in the same order as preprocessed_lda)
preprocessed_lda['Topic'] = relevant_topics

In [None]:
# Display the DataFrame, which now includes the 'Topic' column
preprocessed_lda

In [None]:
# Work on an independent (deep) copy so preprocessed_lda stays untouched
lda_submissions = preprocessed_lda.copy(deep=True)

In [None]:
# Display the copied DataFrame
lda_submissions

In [None]:
# Save the topic-labelled submissions as CSV without writing the index column
lda_submissions.to_csv(
    '/Users/Desktop/Reddit data/lda_submissions.csv',
    index=False,
)

In [None]:
# Share of posts assigned to each topic over the whole dataset, in percent

# Number of posts per topic
topic_counts_total = lda_submissions['Topic'].value_counts()

# Divide by the overall number of posts and scale to percentages
posts_overall = topic_counts_total.sum()
topic_proportions_total = topic_counts_total.div(posts_overall).mul(100)
topic_proportions_total

### d. Temporal analysis of topics by theme

In [None]:
# Parse the 'Creation Time' column into pandas datetime values
creation_time = lda_submissions['Creation Time']
lda_submissions['Creation Time'] = pd.to_datetime(creation_time)

In [None]:
# Count the posts of each topic within each period; unstacking with fill_value=0
# gives (period, topic) combinations without posts an explicit count of 0
posts_per_period_topic = lda_submissions.groupby(['Period', 'Topic']).size()
topic_counts = posts_per_period_topic.unstack(fill_value=0)

# Back to long format: one row per (Period, Topic) with its 'Count'
topic_counts = topic_counts.stack().reset_index(name='Count')

# Express each count as a percentage of all posts in the dataset
total_posts = len(lda_submissions)
topic_proportions = topic_counts.assign(
    Proportion=topic_counts['Count'] / total_posts * 100
)

In [None]:
# Use 'Creation Time' as the index (in place) so the data can be resampled by time
lda_submissions.set_index(keys='Creation Time', inplace=True)

In [None]:
# Total number of posts per calendar month
monthly_posts = lda_submissions.resample(rule='M').size()

In [None]:
# Empty DataFrame indexed by month; it will hold one proportion column per topic
month_index = monthly_posts.index
topic_proportion = pd.DataFrame(index=month_index)

In [None]:
# Calculate the monthly proportion of posts for each topic
for topic in lda_submissions['Topic'].unique():
    # Posts assigned to the current topic
    topic_posts = lda_submissions[lda_submissions['Topic'] == topic]
    # Monthly post count for the topic. Resampling the subset only spans the months
    # between the topic's first and last post, so align it to the full month index
    # and count the missing months as 0 instead of leaving NaN gaps.
    monthly_topic_posts = (
        topic_posts.resample('M').size()
        .reindex(monthly_posts.index, fill_value=0)
    )
    # Proportion = posts of this topic in a month / all posts in that month
    topic_proportion[topic] = monthly_topic_posts / monthly_posts

In [None]:
# Rescale every month (row) so that its topic proportions sum to 1
row_totals = topic_proportion.sum(axis=1)
topic_proportion = topic_proportion.div(row_totals, axis=0)

In [None]:
# Display the monthly topic proportions
topic_proportion

In [None]:
# Visualize the topic trends over time
def plot_topic_trends(topic_proportion, topics_to_plot, names=None,
                      pandemic_start_index=12, frac=0.4):
    """Plot monthly topic proportions with a LOWESS-smoothed trend per topic.

    Parameters
    ----------
    topic_proportion : pandas.DataFrame
        One column per topic ID containing proportions as fractions (they are
        multiplied by 100 for plotting). Its datetime index is shifted to the
        first day of each month and used as the x-axis.
    topics_to_plot : sequence
        Column labels of ``topic_proportion`` to draw.
    names : sequence of str, optional
        Legend labels, one per entry of ``topics_to_plot``. When omitted, the
        labels default to ``'Topic <id>'``.
    pandemic_start_index : int or None, optional
        Position (in the index) of the month marked with the red dashed
        "Pandemic start point" line. The line is skipped when the value is
        None or outside the available range.
    frac : float, optional
        Fraction of the data used by LOWESS for each local fit; controls the
        smoothness of the curve.

    Raises
    ------
    ValueError
        If ``names`` has fewer entries than ``topics_to_plot``.
    """
    if names is None:
        names = ['Topic {}'.format(topic) for topic in topics_to_plot]
    elif len(names) < len(topics_to_plot):
        raise ValueError(
            'names has {} entries but {} topics were requested'.format(
                len(names), len(topics_to_plot)))

    # Convert the time index (date) to consecutive integers for the regression
    dates_numeric = np.arange(len(topic_proportion.index))

    # Move every index date to the first day of its month
    adjusted = topic_proportion.index - pd.offsets.MonthEnd(1) + pd.Timedelta(days=1)

    # Set the figure size
    plt.figure(figsize=(20, 5))

    # Loop through each topic to plot
    for topic, label in zip(topics_to_plot, names):
        # Proportion series of the current topic, in percent
        y = topic_proportion[topic].values * 100
        # Apply LOESS regression
        lowess_results = sm.nonparametric.lowess(y, dates_numeric, frac=frac)
        # Draw the original data points
        plt.scatter(adjusted, y, alpha=0.4, s=8)
        # lowess returns (x, fitted) rows sorted by x with NaN observations removed,
        # so map the returned x positions back to dates instead of assuming that
        # one fitted value exists for every month
        kept = lowess_results[:, 0].astype(int)
        # Draw the smooth curve
        plt.plot(adjusted[kept], lowess_results[:, 1], label=label, lw=1.5)

    # Mark the pandemic start point with a vertical line (only if that month exists)
    if pandemic_start_index is not None and 0 <= pandemic_start_index < len(adjusted):
        plt.axvline(x=adjusted[pandemic_start_index], color='red', linestyle='--',
                    label='Pandemic start point')
    plt.ylabel('Topic Proportion', fontsize=18)
    # Format y-axis as percentages
    plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0f}%'.format(y)))

    # Ensure x-axis shows every month
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.tick_params(axis='both', which='both', length=0)  # remove the small tick lines on x and y axes
    plt.legend(loc='upper right', fontsize=10)  # add legend
    plt.xticks(rotation=45, fontsize=13)  # rotate x-axis labels
    plt.tight_layout()
    plt.grid(True, color='lightgray')  # add grid
    plt.show()

In [None]:
# Weight management theme: topics 2, 11, 9 and 1
topics_to_plot_1 = [2, 11, 9, 1]
names_1 = [
    'Weight change (T2)',
    'Weight loss goal (T11)',
    'Medication (T9)',
    'Daily updates(T1)',
]
plot_topic_trends(topic_proportion, topics_to_plot=topics_to_plot_1, names=names_1)

In [None]:
# Diet theme: topics 10, 12, 3 and 14
topics_to_plot_2 = [10, 12, 3, 14]
names_2 = [
    'Emotional eating (T10)',
    'Food choice (T12)',
    'Calorie tracker (T3)',
    'Diet control(T14)',
]
plot_topic_trends(topic_proportion, topics_to_plot=topics_to_plot_2, names=names_2)

In [None]:
# Physical Exercise theme: topics 4, 15, 5 and 6
topics_to_plot_3 = [4, 15, 5, 6]
names_3 = [
    'Motivation (T4)',
    'Workout plan (T15)',
    'Fitness App (T5)',
    'Exercise routine (T6)',
]
plot_topic_trends(topic_proportion, topics_to_plot=topics_to_plot_3, names=names_3)

In [None]:
# Plot the Emotions and support theme
topics_to_plot_4 = [13, 8]
names_4 = ['Negative feelings (T13)', 'Seeking advice (T8)']
# Pass names_4 so the legend shows the topic labels (the argument was previously omitted)
plot_topic_trends(topic_proportion, topics_to_plot_4, names_4)

In [None]:
# Plot the Appearance theme
topics_to_plot_5 = [7, 0]
names_5 = ['Body image (T7)', 'Clothe fit (T0)']
# Pass names_5 so the legend shows the topic labels (the argument was previously omitted)
plot_topic_trends(topic_proportion, topics_to_plot_5, names_5)