## Import library

In [None]:
# IMPORT LIBRARY
# For decompressing and processing data
import zstandard
import os
import json
import sys
import csv
from datetime import datetime
import logging.handlers

# For scraping Reddit submissions
!pip install praw
import praw
import pandas as pd
import time

# For data cleaning and visualization
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re

# Import NLTK for text processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

# Import the sentiment analysis tool
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Install and load the SpaCy package
!pip install spacy
import spacy

# Download and install the SpaCy English language model
!python -m spacy download en_core_web_sm

# For topic modeling
import gensim
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import STOPWORDS

# For temporal analysis visualization
import statsmodels.api as sm
from matplotlib.ticker import FuncFormatter

# For date formatting in charts
import matplotlib.dates as mdates

# For interactive visualization
import plotly.express as px

import ast

# For statistical tests
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.proportion import proportions_ztest

## 2. Data preprocessing

This section of the code includes:

- a. Clean the submission’s dataset.
- b. Preprocess the submission’s dataset.
- c. Save the preprocessed submissions for analysis.

### a. Clean the submission’s dataset.

In [None]:
# Load the cleaned submissions CSV into a DataFrame; the cells below read its
# 'Selftext' column and add the cleaned/preprocessed text columns to it.
cleaned_submissions = pd.read_csv('/Users/Desktop/Reddit data/cleaned_submissions.csv')

In [None]:
# Text cleaner used for the VADER sentiment input
def clean_vader(text):
    """Return *text* with URLs and zero-width-space residue removed.

    The substitutions run in order:

    1. ``http(s)://...`` and ``www....`` links are deleted.
    2. The HTML entity ``&#x200B;`` and the standalone token ``x b`` / ``xb``
       (presumably what is left of that entity once punctuation has been
       stripped upstream) are deleted.
    3. Every newline becomes a single space.

    Case and punctuation are left untouched.
    """
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),
        (r'&#x200B;|\bx\s*b\b', ''),
        (r'\n', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
# Define LDA and NRC-EIL function
def clean_lda_nrceil(text):
    """Normalize a submission body for the LDA and NRC-EIL pipelines.

    Steps, in order: delete URLs, delete zero-width-space residue
    (``&#x200B;`` and the standalone token ``x b`` / ``xb``), expand the
    apostrophe-less contractions ``don t`` / ``didn t``, delete every
    character that is not an ASCII letter or whitespace, lowercase the
    result, and turn newlines into spaces.

    Args:
        text: Submission text as a ``str``.

    Returns:
        The cleaned, lowercased string.
    """
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'&#x200B;|\bx\s*b\b', '', text)
    # Expand the space-separated contractions. The pattern and the
    # replacement are separate arguments; matching ignores case because the
    # text is only lowercased further down.
    text = re.sub(r'\bdon t\b', 'do not', text, flags=re.IGNORECASE)
    text = re.sub(r'\bdidn t\b', 'did not', text, flags=re.IGNORECASE)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    return text

In [None]:
# Run each cleaner element-wise over 'Selftext' and store the result in its own column
cleaned_submissions['clean_vader'] = cleaned_submissions['Selftext'].map(clean_vader)
cleaned_submissions['clean_lda_nrceil'] = cleaned_submissions['Selftext'].map(clean_lda_nrceil)

### b. Preprocess the submission’s dataset.

In [None]:
## Topic modeling (LDA)

# Step 1: Tokenize the cleaned text into word tokens
cleaned_submissions['tokenized_lda'] = [
    word_tokenize(text) for text in cleaned_submissions['clean_lda_nrceil']
]

# Step 2: Lemmatization
# Only token.pos_ and token.lemma_ are read below, so the parser and NER
# pipeline components are switched off when loading the model.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


def lemmatize_text(tokens, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN')):
    """Lemmatize *tokens*, keeping only the allowed parts of speech.

    Args:
        tokens: Iterable of word strings; they are joined with single spaces
            and re-parsed by the spaCy pipeline ``nlp``.
        allowed_postags: Coarse POS tags (``token.pos_`` values) to keep.
            The default is an immutable tuple so the shared default object
            cannot be modified between calls; any container supporting
            ``in`` (including a list) may still be passed.

    Returns:
        A list holding the lemma of every kept token.
    """
    doc = nlp(' '.join(tokens))
    return [token.lemma_ for token in doc if token.pos_ in allowed_postags]


cleaned_submissions['lemmatized_lda_texts'] = [
    lemmatize_text(words) for words in cleaned_submissions['tokenized_lda']
]

# Step 3: Remove stop words (NLTK's English list; no custom words are added here)
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests
cleaned_submissions['lemmatized_lda_texts'] = [
    [word for word in words if word.lower() not in stop_words]
    for words in cleaned_submissions['lemmatized_lda_texts']
]

# Step 4: Drop words shorter than 3 characters and join the rest into one string
cleaned_submissions['preprocessed_lda'] = [
    ' '.join(word for word in words if len(word) >= 3)
    for words in cleaned_submissions['lemmatized_lda_texts']
]

# Print the LDA result
print(cleaned_submissions[['preprocessed_lda']].head())

In [None]:
## Sentiment analysis (VADER)

# Step 1: Split each VADER-cleaned text into word tokens
cleaned_submissions['tokenized_vader'] = cleaned_submissions['clean_vader'].apply(word_tokenize)

# Step 2: Filter out NLTK English stop words.
# The comparison is case-sensitive: tokens are not lowercased before the lookup.
stop_words = set(stopwords.words('english'))
cleaned_submissions['vader_texts'] = cleaned_submissions['tokenized_vader'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stop_words]
)

# Step 3: Join the remaining tokens back into a single space-separated string
cleaned_submissions['preprocessed_vader'] = [
    ' '.join(kept) for kept in cleaned_submissions['vader_texts']
]

# Print the VADER result
print(cleaned_submissions[['preprocessed_vader']].head())

In [None]:
## Sentiment analysis (NRC-EIL)

# Step 1: Split each cleaned text into word tokens
cleaned_submissions['tokenized_nrc'] = cleaned_submissions['clean_lda_nrceil'].apply(word_tokenize)

# Step 2: Lemmatization
# NOTE: this rebinds the module-level name `lemmatize_text`; this version
# takes only `tokens` and applies no part-of-speech filter.
def lemmatize_text(tokens):
    """Return the lemma of every token produced by running ``nlp`` on the joined *tokens*."""
    return [tok.lemma_ for tok in nlp(' '.join(tokens))]
cleaned_submissions['lemmatized_nrc_texts'] = cleaned_submissions['tokenized_nrc'].apply(lemmatize_text)

# Step 3: Filter out NLTK English stop words (lookup on the lowercased lemma)
stop_words = set(stopwords.words('english'))
cleaned_submissions['lemmatized_nrc_texts'] = cleaned_submissions['lemmatized_nrc_texts'].apply(
    lambda lemmas: [lemma for lemma in lemmas if lemma.lower() not in stop_words]
)

# Step 4: Join the remaining lemmas back into a single space-separated string
cleaned_submissions['preprocessed_nrc'] = [
    ' '.join(lemmas) for lemmas in cleaned_submissions['lemmatized_nrc_texts']
]

# Print the NRC-EIL result
print(cleaned_submissions[['preprocessed_nrc']].head())

### c. Save the preprocessed submissions for analysis.

In [None]:
# Convert the preprocessed submissions for LDA to data frame
# NOTE: the 'Selftext' column of cleaned_submissions is overwritten in place
# with the LDA-preprocessed text; its previous contents are replaced.
cleaned_submissions['Selftext'] = cleaned_submissions['preprocessed_lda']
# .copy() makes the selected columns an independent frame rather than a view
# of cleaned_submissions.
preprocessed_lda = cleaned_submissions[['Creation Time', 'Selftext', 'Score', 'Number of Comments', 'Period']].copy()

In [None]:
# Convert the preprocessed submissions for VADER to data frame
# NOTE: the 'Selftext' column of cleaned_submissions is overwritten in place
# with the VADER-preprocessed text; its previous contents are replaced.
cleaned_submissions['Selftext'] = cleaned_submissions['preprocessed_vader']
# The frame must be bound to `preprocessed_vader`: that is the name the NaN
# check, row drop, index reset and CSV export cells read.
preprocessed_vader = cleaned_submissions[['Creation Time', 'Selftext', 'Score', 'Number of Comments', 'Period']].copy()

In [None]:
# Convert the preprocessed submissions for NRC-EIL to data frame
# NOTE: the 'Selftext' column of cleaned_submissions is overwritten in place
# with the NRC-preprocessed text; its previous contents are replaced.
cleaned_submissions['Selftext'] = cleaned_submissions['preprocessed_nrc']
# .copy() makes the selected columns an independent frame rather than a view
# of cleaned_submissions.
preprocessed_nrc = cleaned_submissions[['Creation Time', 'Selftext', 'Score', 'Number of Comments', 'Period']].copy()

In [None]:
# Show, for each preprocessed frame, the rows that contain at least one NaN
na_lda = preprocessed_lda.loc[preprocessed_lda.isnull().any(axis='columns')]
print(na_lda)

na_vader = preprocessed_vader.loc[preprocessed_vader.isnull().any(axis='columns')]
print(na_vader)

na_nrc = preprocessed_nrc.loc[preprocessed_nrc.isnull().any(axis='columns')]
print(na_nrc)

# Observed output: the posts at index 5696, 13334 and 31352 have NaN values
# in the LDA and NRC preprocessed data

In [None]:
# Remove these three posts with NaN in the 'Selftext' column.
# The same row labels are dropped from all three frames.
indices_to_drop = [5696, 13334, 31352]  # 35381 submissions left in the datasets
preprocessed_lda = preprocessed_lda.drop(index=indices_to_drop)
preprocessed_vader = preprocessed_vader.drop(index=indices_to_drop)
preprocessed_nrc = preprocessed_nrc.drop(index=indices_to_drop)

In [None]:
# Reset the index
# drop=True discards the old index labels instead of inserting them as a
# column, so each frame is renumbered from 0.
preprocessed_lda = preprocessed_lda.reset_index(drop=True)
preprocessed_vader = preprocessed_vader.reset_index(drop=True)
preprocessed_nrc = preprocessed_nrc.reset_index(drop=True)

In [None]:
# Save the preprocessed dataset
# index=False leaves the row index out of the written files.
# NOTE(review): a 'Selftext' value that is an empty string is presumably read
# back as NaN by pandas' default CSV parsing — confirm the downstream loaders
# handle that.
preprocessed_lda.to_csv('/Users/Desktop/Reddit data/preprocessed_lda.csv', index=False)
preprocessed_vader.to_csv('/Users/Desktop/Reddit data/preprocessed_vader.csv', index=False)
preprocessed_nrc.to_csv('/Users/Desktop/Reddit data/preprocessed_nrc.csv', index=False)