In [1]:
import numpy as np
import pandas as pd

# Data Preprocessing

To handle the large dataset, read the data in chunks and concatenate the chunks into a single DataFrame.

In [2]:
# Stream each CSV in fixed-size pieces, then stitch the pieces back together.
chunk_size = 10000
chunks = pd.read_csv('../data/Reddit-Threads_2020-2021.csv', chunksize=chunk_size)
chunks2 = pd.read_csv('../data/Reddit-Threads_2022-2023.csv', chunksize=chunk_size)
data_2021 = pd.concat(list(chunks))
data_2223 = pd.concat(list(chunks2))
# Stack both periods and renumber the rows 0..n-1.
combined_data = pd.concat([data_2021, data_2223], ignore_index=True)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
combined_data.info()

In [7]:
# Flag every row holding at least one NaN, then pull those rows out for inspection.
rows_with_gaps = combined_data.isna().any(axis=1)
missing_values = combined_data.loc[rows_with_gaps]
# view missing_values
# missing_values.head()

In [3]:
# Discard incomplete rows first, then strip comments whose text is only a
# deleted/removed placeholder.
placeholder_texts = ["[deleted]", "[removed]"]
combined_data = (
    combined_data
    .dropna()
    .loc[lambda frame: ~frame['text'].isin(placeholder_texts)]
)

Extract `yearmonth` from `timestamp` for temporal analysis

In [4]:
# Parse timestamps, derive the monthly period from them, and put the rows in
# chronological order with a fresh 0..n-1 index.
combined_data = (
    combined_data
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .assign(yearmonth=lambda frame: frame['timestamp'].dt.to_period('M'))
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)

Extract the thread's `title` for each comment from its `link`

In [5]:
def _thread_title_slug(link):
    """Return the sixth '/'-separated segment of `link`, or None if unavailable.

    Non-string values and links with fewer than six segments both yield None,
    so one malformed link cannot abort the whole column with an IndexError.
    """
    if not isinstance(link, str):
        return None
    segments = link.split('/')
    return segments[5] if len(segments) > 5 else None


# NOTE(review): position 5 presumably holds the thread-title slug for the link
# format used in this dataset - confirm against a few sample links.
combined_data['title'] = combined_data['link'].apply(_thread_title_slug)
# regex=False keeps '_' a literal match regardless of the pandas version's default.
combined_data['title'] = combined_data['title'].str.replace('_', ' ', regex=False)

Add `index` as a primary key so that model scores can be joined back to the comments

In [None]:
# Persist the current row position as an explicit `index` column.
combined_data = combined_data.assign(index=combined_data.index)
# view the structure of combined_data
# combined_data.head()

In [7]:
# Write the cleaned comments to disk; the row index is omitted because the
# `index` column is written as a regular column.
combined_data.to_csv(path_or_buf='../data/combined_data.csv', index=False)

# Combine text data with toxicity scores

In [None]:
# Reload the cleaned comments written by the preprocessing section.
combined_data = pd.read_csv('../data/combined_data.csv')

# Load the per-model score tables in one pass.
hatebert_scores, hatexplain_scores, toxicbert_scores = map(
    pd.read_csv,
    [
        '../data/hatebert_scores.csv',
        '../data/hateXplain_scores.csv',
        '../data/toxicbert_scores.csv',
    ],
)

Calculate the average of the three scores

In [None]:
# merge the three scores with the combined_data on `index` column
# pd.merge joins exactly two frames (its third positional argument is `how`,
# and `suffixes` must have length 2), so tag each model's columns up front and
# chain two-way merges instead.
# NOTE(review): assumes each score file has an `index` key plus a
# `toxicity_score` column, as implied by the column names used below - confirm.
model_scores = {
    'hatebert': hatebert_scores,
    'hatexplain': hatexplain_scores,
    'toxicbert': toxicbert_scores,
}
scores = None
for model_name, model_frame in model_scores.items():
    # Every non-key column gets a `_<model>` suffix so the frames never collide.
    suffixed = model_frame.rename(
        columns=lambda col, tag=model_name: col if col == 'index' else f'{col}_{tag}'
    )
    scores = suffixed if scores is None else scores.merge(suffixed, on='index')

# Left join keeps every comment, even those without scores.
text_scores = combined_data.merge(scores, on='index', how='left')

# calculate the average of the three scores (row-wise mean; NaN scores are skipped)
score_columns = ['toxicity_score_hatebert', 'toxicity_score_hatexplain', 'toxicity_score_toxicbert']
text_scores['average_toxicity_score'] = text_scores[score_columns].mean(axis=1)

# rename columns
text_scores = text_scores.rename(columns={
    'toxicity_score_hatebert': 'hatebert_toxicity_score',
    'toxicity_score_hatexplain': 'hateXplain_toxicity_score',
    'toxicity_score_toxicbert': 'toxicbert_toxicity_score',
})

In [None]:
# Write the comments together with their per-model and average toxicity scores.
text_scores.to_csv(path_or_buf='../data/combined_data_scores.csv', index=False)