## Import library

In [None]:
# IMPORT LIBRARY
# For decompressing and processing data
import zstandard
import os
import json
import sys
import csv
from datetime import datetime
import logging.handlers

# For scraping Reddit submissions
!pip install praw
import praw
import pandas as pd
import time

# For data cleaning and visualization
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re

# Import NLTK for text processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

# Import the sentiment analysis tool
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Install and load the SpaCy package
!pip install spacy
import spacy

# Download and install the SpaCy English language model
!python -m spacy download en_core_web_sm

# For topic modeling
import gensim
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import STOPWORDS

# For temporal analysis visualization
import statsmodels.api as sm
from matplotlib.ticker import FuncFormatter

# For date formatting in charts
import matplotlib.dates as mdates

# For interactive visualization
import plotly.express as px

import ast

# For statistical tests
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.proportion import proportions_ztest

## 1. Data Collection

This section of the code includes:

- a. Extract 'r/loseit' submission IDs from Reddit historical data.
- b. Scrape posts and metadata from Reddit.
- c. Filter out any removed or deleted posts.
- d. Basic statistics overview and visualization.

### a. Extract 'r/loseit' submission IDs.

In [None]:
# This section of the code is adapted from an external script available at: 
# https://github.com/Watchful1/PushshiftDumps/blob/master/scripts/filter_file.py

# 1. Extraction settings: file locations, date window and filter options

# Compressed dump to read, and the output path stem (the format extension is appended to it)
input_file = r"/Users/Desktop/Reddit data/loseit_submissions.zst"
output_file = r"/Users/Desktop/Reddit data/loseit_submissions_2019_2021_ids"

# Records created before from_date or after to_date are skipped
from_date = datetime(2019, 3, 1)  # start date
to_date = datetime(2021, 4, 1)  # end date

# Output file format
output_format = 'txt'

# Record field to filter on, and the values it is compared against
field = "subreddit"
values = ["loseit"]

# Write only this single field of every matched record
single_field = 'id'

# Log the offending line whenever a record cannot be processed
write_bad_lines = True

# Optional file with one filter value per line (None = use `values` above)
values_file = None

# False -> a value only has to occur inside the field; True -> it must equal the field
exact_match = False

# 2. Logging setup to capture errors during processing

log = logging.getLogger("bot")
log.setLevel(logging.INFO)

# logging.getLogger("bot") hands back the same logger object on every call, so
# running this cell a second time would attach a second pair of handlers and
# every message would be printed and written twice. Only attach them once.
if not log.handlers:
	log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')

	# echo messages to the console / notebook output
	log_str_handler = logging.StreamHandler()
	log_str_handler.setFormatter(log_formatter)
	log.addHandler(log_str_handler)

	# keep a rotating copy on disk: 16 MiB per file, 5 backups
	os.makedirs("logs", exist_ok=True)
	log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5)
	log_file_handler.setFormatter(log_formatter)
	log.addHandler(log_file_handler)

# 3.Function definitions to handle data extraction and processing

def write_line_zst(handle, line):
	"""Write ``line`` followed by a newline to a binary handle, each encoded as UTF-8."""
	for text in (line, "\n"):
		handle.write(text.encode('utf-8'))


def write_line_json(handle, obj):
	"""Serialise ``obj`` with ``json.dumps`` and write it to a text handle as one line."""
	serialized = json.dumps(obj)
	handle.write(serialized)
	handle.write("\n")


def write_line_single(handle, obj, field):
	"""Write the value of ``obj[field]`` on its own line.

	When ``field`` is missing from ``obj`` the omission is logged and only the
	newline is written, so the output still gets one line per record.
	"""
	if field not in obj:
		log.info(f"{field} not in object {obj['id']}")
	else:
		handle.write(obj[field])
	handle.write("\n")


def write_line_csv(writer, obj, is_submission):
	"""Write one record as a CSV row through ``writer``.

	Row layout: score, creation date (``%Y-%m-%d``), title (submissions only),
	``u/<author>``, full permalink, and finally the content. The content is the
	self text (empty string when absent) for self posts, the linked URL for
	other submissions, and the body for non-submissions.
	"""
	row = [
		str(obj['score']),
		datetime.fromtimestamp(int(obj['created_utc'])).strftime("%Y-%m-%d"),
	]
	if is_submission:
		row.append(obj['title'])
	row.append(f"u/{obj['author']}")
	row.append(f"https://www.reddit.com{obj['permalink']}")

	# pick the content column
	if not is_submission:
		content = obj['body']
	elif not obj['is_self']:
		content = obj['url']
	else:
		content = obj.get('selftext', "")
	row.append(content)

	writer.writerow(row)

def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	"""Read from ``reader`` until the accumulated bytes decode as text.

	A chunk boundary can fall in the middle of a multi-byte character. When
	decoding fails, another ``chunk_size`` bytes are read and appended, and the
	decode is retried on the combined bytes.

	``previous_chunk`` (bytes carried over from an earlier attempt) is prepended
	to what is read; ``bytes_read`` is the running byte count it starts from.

	Returns the decoded string. Raises ``UnicodeError`` once ``bytes_read``
	exceeds ``max_window_size`` without a successful decode.
	"""
	pending = previous_chunk
	while True:
		chunk = reader.read(chunk_size)
		bytes_read += chunk_size
		if pending is not None:
			chunk = pending + chunk
		try:
			return chunk.decode()
		except UnicodeDecodeError:
			if bytes_read > max_window_size:
				raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
			log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
			pending = chunk


def read_lines_zst(file_name):
	"""Yield ``(line, position)`` pairs from a zstandard-compressed text file.

	The file is decompressed chunk by chunk. Each chunk is split on newlines,
	and the unfinished tail of a chunk is carried over and prepended to the
	next one. ``line`` is stripped of surrounding whitespace; ``position`` is
	``file_handle.tell()`` on the compressed file at the time of the yield.
	"""
	with open(file_name, 'rb') as file_handle:
		decompressor = zstandard.ZstdDecompressor(max_window_size=2**31)
		reader = decompressor.stream_reader(file_handle)
		leftover = ''
		while True:
			chunk = read_and_decode(reader, 2**27, (2**29) * 2)
			if not chunk:
				break

			# everything before the last newline is complete; the rest waits for the next chunk
			*complete_lines, leftover = (leftover + chunk).split("\n")
			for line in complete_lines:
				yield line.strip(), file_handle.tell()

		reader.close()


def process_file(input_file, output_file, output_format, field, values, from_date, to_date, single_field, exact_match):
	"""Filter one compressed dump file and write the matching records.

	Parameters:
		input_file: path of the .zst file to read; treated as a submissions
			file when the path contains "submission".
		output_file: output path without extension; ``.{output_format}`` is appended.
		output_format: "zst" (raw matching lines, recompressed), "txt" (a single
			field, or the whole record as JSON) or "csv".
		field: record field to filter on, or None to skip value filtering.
		values: values compared against the lower-cased field.
		from_date, to_date: records created before ``from_date`` or after
			``to_date`` are skipped.
		single_field: for "txt" output, the only field to write; None writes JSON.
		exact_match: True requires equality with a value, False a substring match.

	Lines that are not valid JSON or lack a required key are counted as bad
	lines and, when the module-level ``write_bad_lines`` flag is set, logged.
	Calls ``sys.exit()`` when ``output_format`` is not supported.
	"""
	output_path = f"{output_file}.{output_format}"
	is_submission = "submission" in input_file
	log.info(f"Input: {input_file} : Output: {output_path} : Is submission {is_submission}")
	writer = None
	if output_format == "zst":
		handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb'))
	elif output_format == "txt":
		handle = open(output_path, 'w', encoding='UTF-8')
	elif output_format == "csv":
		handle = open(output_path, 'w', encoding='UTF-8', newline='')
		writer = csv.writer(handle)
	else:
		log.error(f"Unsupported output format {output_format}")
		sys.exit()

	created = None
	matched_lines = 0
	bad_lines = 0
	total_lines = 0
	# try/finally so the output handle is closed (and its buffer flushed) even
	# when reading or writing fails part-way through
	try:
		file_size = os.stat(input_file).st_size
		for line, file_bytes_processed in read_lines_zst(input_file):
			total_lines += 1
			if total_lines % 100000 == 0:
				# `created` is still None if no line so far could be parsed
				created_str = created.strftime('%Y-%m-%d %H:%M:%S') if created is not None else "n/a"
				log.info(f"{created_str} : {total_lines:,} : {matched_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%")

			try:
				obj = json.loads(line)
				created = datetime.utcfromtimestamp(int(obj['created_utc']))

				if created < from_date:
					continue
				if created > to_date:
					continue

				if field is not None:
					field_value = obj[field].lower()
					matched = False
					for value in values:
						if exact_match:
							if value == field_value:
								matched = True
								break
						else:
							if value in field_value:
								matched = True
								break
					if not matched:
						continue

				matched_lines += 1
				if output_format == "zst":
					write_line_zst(handle, line)
				elif output_format == "csv":
					write_line_csv(writer, obj, is_submission)
				elif output_format == "txt":
					if single_field is not None:
						write_line_single(handle, obj, single_field)
					else:
						write_line_json(handle, obj)
				else:
					log.info(f"Something went wrong, invalid output format {output_format}")
			except (KeyError, json.JSONDecodeError) as err:
				bad_lines += 1
				if write_bad_lines:
					if isinstance(err, KeyError):
						log.warning(f"Key {field} is not in the object: {err}")
					elif isinstance(err, json.JSONDecodeError):
						log.warning(f"Line decoding failed: {err}")
					log.warning(line)
	finally:
		handle.close()

	log.info(f"Complete : {total_lines:,} : {matched_lines:,} : {bad_lines:,}")

# Main function to orchestrate data processing based on user-defined settings

if __name__ == "__main__":
	# Single-field output is only implemented by the txt writer, so force that format.
	if single_field is not None:
		log.info("Single field output mode, changing output file format to txt")
		output_format = "txt"

	# Filter values are lower-cased because process_file compares them against
	# the lower-cased field value.
	if values_file is not None:
		with open(values_file, 'r') as values_handle:
			values = [value.strip().lower() for value in values_handle]
		log.info(f"Loaded {len(values)} from values file {values_file}")
	else:
		values = [value.lower() for value in values]  # convert to lowercase

	log.info(f"Filtering field: {field}")
	if len(values) <= 20:
		log.info(f"On values: {','.join(values)}")
	else:
		log.info("On values:")
		for value in values:
			log.info(value)
	log.info(f"Exact match {('on' if exact_match else 'off')}. Single field {single_field}.")
	log.info(f"From date {from_date.strftime('%Y-%m-%d')} to date {to_date.strftime('%Y-%m-%d')}")
	log.info(f"Output format set to {output_format}")

	# Build the (input path, output path stem) pairs: one per .zst file when
	# input_file is a directory, otherwise just the single file.
	input_files = []
	if os.path.isdir(input_file):
		os.makedirs(output_file, exist_ok=True)
		for file in os.listdir(input_file):
			# os.listdir returns bare names, so the directory test must be made
			# on the joined path; the bare name would be resolved against the
			# current working directory instead of input_file.
			file_path = os.path.join(input_file, file)
			if not os.path.isdir(file_path) and file.endswith(".zst"):
				input_name = os.path.splitext(os.path.splitext(os.path.basename(file))[0])[0]
				input_files.append((file_path, os.path.join(output_file, input_name)))
	else:
		input_files.append((input_file, output_file))
	log.info(f"Processing {len(input_files)} files")
	for file_in, file_out in input_files:
		process_file(file_in, file_out, output_format, field, values, from_date, to_date, single_field, exact_match)

### b. Scrape posts and metadata from Reddit.

In [None]:
# Import the 'submission ids' file (one ID per line, no header).
# dtype=str keeps every ID as text: without it pandas infers the column type,
# and an ID made only of digits could be parsed as a number.
submission_ids = pd.read_csv(
    '/Users/Desktop/Reddit data/loseit_submissions_2019_2021_ids.txt',
    header=None,
    names=['sub_id'],
    dtype=str,
)

In [None]:
len(submission_ids) # 115123 ids

In [None]:
# Set the Reddit API access with PRAW.
# The credentials are read from the environment (REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET) so the client secret is never stored in the notebook.
# A missing variable raises KeyError here, before any scraping starts.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# exposed and should be revoked in the Reddit app settings.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='web:*****:v1.0 (by /u/*****)')

subreddit = reddit.subreddit('loseit') # set the subreddit
batch_size = 10000 # set 10000 submissions per batch
submissions = [] # create an empty list to store submissions dataset

In [None]:
# Scrape the submissions through the Reddit API, batch_size IDs at a time.
# Every batch is saved to its own CSV as a checkpoint; the complete result is
# saved once more at the end.

# Create the checkpoint directory up front so the first save cannot fail after
# a whole batch has already been scraped.
batch_dir = '/Users/Desktop/Reddit data/submissions_2019_2021'
os.makedirs(batch_dir, exist_ok=True)

for i in range(0, len(submission_ids), batch_size):
    batch_ids = submission_ids[i:i+batch_size]  # retrieve current batch of submissions IDs
    batch_rows = []  # rows scraped in this batch only
    for sub_id in batch_ids['sub_id']:
        try:
            # fetch the submissions data from Reddit
            submission = reddit.submission(id=sub_id)
            submission.comments.replace_more(limit=0) # remove comments beyond the view limit
            created_time = datetime.utcfromtimestamp(submission.created_utc).strftime('%Y-%m-%d %H:%M:%S')

            # append the submission details to the current batch
            batch_rows.append({
                'Creation Time': created_time,
                'Title': submission.title,
                'User': str(submission.author),
                'Selftext': submission.selftext,
                'Score': submission.score,
                'Number of Comments': submission.num_comments,
                'URL': submission.url
            })
        except Exception as e:
            # best effort: report the failing ID and carry on with the next one
            print(f"Error processing post {sub_id}: {e}")

        time.sleep(1) # avoid exceeding the rate limits

    submissions.extend(batch_rows)

    # save only the current batch; saving the cumulative list here would make
    # every batch file repeat all earlier batches
    batch_df = pd.DataFrame(batch_rows)
    batch_df.to_csv(f'{batch_dir}/submissions_batch_{i}.csv', index=False)

# after the process finished, merge and save into a single dataframe
submissions_df = pd.DataFrame(submissions)
submissions_df.to_csv('/Users/Desktop/Reddit data/submissions.csv', index=False)

print("All the submissions are finished.")

### c. Filter out any removed or deleted posts.

In [None]:
# Open the 'r/loseit' submissions dataset saved by the scraping step.
# Note: this rebinds `submissions` from the scraper's list of dicts to a DataFrame.
submissions = pd.read_csv('/Users/Desktop/Reddit data/submissions.csv')

In [None]:
# Count the deleted submissions.
# Raw string: '\[' inside a normal string literal is an invalid escape sequence.
submissions['Selftext'].str.contains(r'\[deleted\]', na=False).sum() # 21515 submissions are deleted

In [None]:
# Count the removed submissions.
# Raw string: '\[' inside a normal string literal is an invalid escape sequence.
submissions['Selftext'].str.contains(r'\[removed\]', na=False).sum() # 52755 submissions are removed

In [None]:
# Replace the deleted and removed submissions with NA and remove them.
# Only a Selftext that is exactly '[deleted]' or '[removed]' is replaced.
# The result is assigned back to the column: an in-place replace on
# submissions['Selftext'] acts on an intermediate object and is not guaranteed
# to update the DataFrame (it does not under pandas copy-on-write).
submissions['Selftext'] = submissions['Selftext'].replace(['[deleted]', '[removed]'], np.nan)
submissions.dropna(subset=['Selftext'], inplace=True)

In [None]:
# Remove posts from AutoModerator.
# The scraping step stores the author under the 'User' column (there is no
# 'Author' column in submissions.csv), so the filter has to use 'User'.
submissions = submissions[submissions['User'] != 'AutoModerator']

In [None]:
# Drop official challenge posts: any title containing the literal tag "[Challenge]"
is_challenge = submissions['Title'].str.contains(r'\[Challenge\]', na=False)
submissions = submissions[~is_challenge]

In [None]:
# Count the total number of submissions
submissions['Selftext'].count() # 35384 submissions

In [None]:
# Reset the index after the filtering above; drop=True discards the old index
# instead of keeping it as a column
submissions = submissions.reset_index(drop=True)

In [None]:
# Work on an independent copy so later column additions do not touch `submissions`
cleaned_submissions = submissions.copy()

In [None]:
# Convert 'Creation Time' column to datetime type
# (the scraper wrote it as '%Y-%m-%d %H:%M:%S' strings derived from the UTC timestamp)
cleaned_submissions['Creation Time'] = pd.to_datetime(cleaned_submissions['Creation Time'])

In [None]:
# Add a 'Period' column: posts created before 2020-03-11 are 'pre_pandemic',
# everything from that date on is 'dur_pandemic'
pandemic_start = pd.Timestamp('2020-03-11')
is_pre_pandemic = cleaned_submissions['Creation Time'] < pandemic_start
cleaned_submissions['Period'] = np.where(is_pre_pandemic, 'pre_pandemic', 'dur_pandemic')

In [None]:
cleaned_submissions

In [None]:
# Save the cleaned submissions to CSV file (index=False: the row index is not written)
cleaned_submissions.to_csv('/Users/Desktop/Reddit data/cleaned_submissions.csv', index=False)

### d. Basic statistics overview and visualization

In [None]:
# Posts from before the pandemic. .copy() makes this an independent DataFrame:
# a 'Words per post' column is added to it later, and assigning to a plain
# slice of cleaned_submissions triggers pandas' SettingWithCopyWarning.
pre_pandemic = cleaned_submissions[cleaned_submissions['Period'] == 'pre_pandemic'].copy()
len(pre_pandemic) # 16244 posts before the pandemic

In [None]:
# Posts from during the pandemic. .copy() makes this an independent DataFrame:
# a 'Words per post' column is added to it later, and assigning to a plain
# slice of cleaned_submissions triggers pandas' SettingWithCopyWarning.
dur_pandemic = cleaned_submissions[cleaned_submissions['Period'] == 'dur_pandemic'].copy()
len(dur_pandemic) # 19140 posts during the pandemic

In [None]:
# Descriptive statistics for the pre-pandemic posts:
# posts per day, score per post and words per post (mean / median / SD)

# the three series every statistic below is computed from
pre_daily_posts = pre_pandemic['Creation Time'].dt.date.value_counts()  # posts per calendar day
pre_scores = pre_pandemic['Score']
pre_pandemic.loc[:, 'Words per post'] = pre_pandemic['Selftext'].str.split().apply(len)  # whitespace-separated tokens
pre_words = pre_pandemic['Words per post']

# Mean
pre_pandemic_mean_posts = pre_daily_posts.mean()
pre_pandemic_mean_scores = pre_scores.mean()
pre_pandemic_mean_words = pre_words.mean()

# Median
pre_pandemic_median_posts = pre_daily_posts.median()
pre_pandemic_median_scores = pre_scores.median()
pre_pandemic_median_words = pre_words.median()

# SD
pre_pandemic_sd_posts = pre_daily_posts.std()
pre_pandemic_sd_scores = pre_scores.std()
pre_pandemic_sd_words = pre_words.std()

# Report
print("Pre-pandemic era:")
print(f"Posts per day - Mean: {pre_pandemic_mean_posts}, Median: {pre_pandemic_median_posts}, SD: {pre_pandemic_sd_posts}")
print(f"Scores per post - Mean: {pre_pandemic_mean_scores}, Median: {pre_pandemic_median_scores}, SD: {pre_pandemic_sd_scores}")
print(f"Words per post - Mean: {pre_pandemic_mean_words}, Median: {pre_pandemic_median_words}, SD: {pre_pandemic_sd_words}")

In [None]:
# Descriptive statistics for the during-pandemic posts:
# posts per day, score per post and words per post (mean / median / SD)

# the three series every statistic below is computed from
dur_daily_posts = dur_pandemic['Creation Time'].dt.date.value_counts()  # posts per calendar day
dur_scores = dur_pandemic['Score']
dur_pandemic.loc[:, 'Words per post'] = dur_pandemic['Selftext'].str.split().apply(len)  # whitespace-separated tokens
dur_words = dur_pandemic['Words per post']

# Mean
dur_pandemic_mean_posts = dur_daily_posts.mean()
dur_pandemic_mean_scores = dur_scores.mean()
dur_pandemic_mean_words = dur_words.mean()

# Median
dur_pandemic_median_posts = dur_daily_posts.median()
dur_pandemic_median_scores = dur_scores.median()
dur_pandemic_median_words = dur_words.median()

# SD
dur_pandemic_sd_posts = dur_daily_posts.std()
dur_pandemic_sd_scores = dur_scores.std()
dur_pandemic_sd_words = dur_words.std()

# Report
print("Dur-pandemic era:")
print(f"Posts per day - Mean: {dur_pandemic_mean_posts}, Median: {dur_pandemic_median_posts}, SD: {dur_pandemic_sd_posts}")
print(f"Scores per post - Mean: {dur_pandemic_mean_scores}, Median: {dur_pandemic_median_scores}, SD: {dur_pandemic_sd_scores}")
print(f"Words per post - Mean: {dur_pandemic_mean_words}, Median: {dur_pandemic_median_words}, SD: {dur_pandemic_sd_words}")

In [None]:
# Bar chart of the number of submissions per calendar month

# make sure 'Creation Time' is a datetime column (explicit format)
cleaned_submissions['Creation Time'] = pd.to_datetime(cleaned_submissions['Creation Time'], format='%Y-%m-%d %H:%M:%S')

# split the timestamp into 'Year', 'Month' and 'Day' columns
creation_time = cleaned_submissions['Creation Time'].dt
cleaned_submissions['Year'] = creation_time.year
cleaned_submissions['Month'] = creation_time.month
cleaned_submissions['Day'] = creation_time.day

# one row per (Year, Month) with the number of posts in that month
subs_per_month = cleaned_submissions.groupby(['Year', 'Month']).size().reset_index(name='Post Count')

# x-axis labels of the form "<year>-<month>"
month_labels = subs_per_month['Year'].astype(str) + '-' + subs_per_month['Month'].astype(str)

plt.figure(figsize=(15, 6))
plt.bar(month_labels, subs_per_month['Post Count'], color='green')
plt.ylabel('Number of Post', fontsize=18)
plt.xticks(rotation=45, fontsize=11)
plt.yticks(rotation=90, fontsize=11)
plt.tight_layout()
plt.show()

In [None]:
# Open the submissions for VADER analysis
# NOTE(review): preprocessed_vader.csv is not written by any cell shown above —
# presumably it comes from a separate preprocessing step; confirm its origin.
vader_submissions = pd.read_csv('/Users/Desktop/Reddit data/preprocessed_vader.csv')

In [None]:
vader_submissions

In [None]:
# Instantiate the VADER analyzer
sid = SentimentIntensityAnalyzer()

# Analyze sentiment on each post
def analyze_vader_sentiment(text):
    """Score ``text`` with VADER and label it by its compound score.

    ``text`` is converted with ``str()`` first, so non-string input is scored
    by its string form.

    Returns a tuple ``(sentiment, pos, neg, neu, compound)`` where ``sentiment``
    is 'positive' for compound > 0.15, 'negative' for compound < -0.15 and
    'neutral' otherwise.
    """
    text = str(text) # convert text to string type
    scores = sid.polarity_scores(text) # get sentiment score
    compound = scores['compound']
    # determine the emotion category based on the compound score
    if compound > 0.15:
        sentiment = 'positive'
    elif compound < -0.15:
        sentiment = 'negative'
    else:
        # -0.15 <= compound <= 0.15; a plain `else` also guarantees that
        # `sentiment` is always assigned before the return
        sentiment = 'neutral'
    # return the sentiment category and scores
    return sentiment, scores['pos'], scores['neg'], scores['neu'], compound

In [None]:
# Score every post, then spread the (label, pos, neg, neu, compound) tuples
# over five new columns
vader_results = vader_submissions['Selftext'].apply(analyze_vader_sentiment)
(vader_submissions['Sentiment'],
 vader_submissions['Positive'],
 vader_submissions['Negative'],
 vader_submissions['Neutral'],
 vader_submissions['Compound_Score']) = zip(*vader_results)

In [None]:
vader_submissions

In [None]:
vader_submissions.to_csv('/Users/Desktop/Reddit data/vader_submissions.csv', index=False)

In [None]:
# Worked example: run the VADER labelling on a single post and show the result
post_text = "I'm proud because in the past this was always the start of a pattern: If I missed gym once I missed it a second time because the perfect week was ruined anyways. Then I often would not go back to the gym at all. But not this time. I just called my gym buddy and told him to meet two days later at the gym so we could work out together. I found it really easy to stay consistent when there is someone who expects me to show up. But there is more success! My gym buddy who I relied on in the first weeks worked day shifts for 2 weeks straight now, so he could not work out when I did. But I still got my workout done, alone, with my own willpower! I also sticked to my meal plan and lost some pounds because the success in the gym motivated me."

example_result = analyze_vader_sentiment(post_text)
sentiment, pos_score, neg_score, neu_score, compound_score = example_result

# show the post, a blank line, then its scores
print(post_text)
print()
print(f"Sentiment: {sentiment}, Positive Score: {pos_score}, Negative Score: {neg_score}, Neutral Score: {neu_score}, Compound Score: {compound_score}")

In [None]:
# Number of posts for every (Period, Sentiment) combination
sentiment_counts = vader_submissions.groupby(['Period', 'Sentiment']).size()

# Split the posts into the two periods
is_pre_period = vader_submissions['Period'] == 'pre_pandemic'
is_dur_period = vader_submissions['Period'] == 'dur_pandemic'
pre_pandemic_posts = vader_submissions[is_pre_period]
dur_pandemic_posts = vader_submissions[is_dur_period]

# Posts per sentiment label within each period
sentiment_counts_pre = pre_pandemic_posts['Sentiment'].value_counts()
sentiment_counts_dur = dur_pandemic_posts['Sentiment'].value_counts()

# Share of each sentiment label within its period, in percent
pre_total = sentiment_counts_pre.sum()
dur_total = sentiment_counts_dur.sum()
sentiment_percentages_pre = sentiment_counts_pre / pre_total * 100
sentiment_percentages_dur = sentiment_counts_dur / dur_total * 100

In [None]:
sentiment_counts

In [None]:
(sentiment_percentages_pre, sentiment_percentages_dur)

In [None]:
# Chi-square test of independence on a 2x2 table of counts
# NOTE(review): the counts are hard-coded; what the rows and columns stand for
# is not stated here — confirm against the sentiment counts computed above.
data = np.array([
    [13294, 4572],
    [11455, 5619],
])
chi2, p, dof, ex = chi2_contingency(data)
for label, value in (("Chi2 Statistic", chi2), ("P-value", p)):
    print(f"{label}: {value}")