# Data Preprocessing

In [13]:
# loading scraped CSVs

import pandas as pd

df_dep = pd.read_csv("../data/depression_posts.csv")
df_bip = pd.read_csv("../data/bipolar_posts.csv")
df_anx = pd.read_csv("../data/anxiety_posts.csv")


In [14]:
# adding a label column for each

df_dep["label"] = "depression"
df_bip["label"] = "bipolar"
df_anx["label"] = "anxiety"


In [15]:
# concatenating into one dataframe

df = pd.concat([df_dep, df_bip, df_anx], ignore_index=True)
print(f"Combined shape: {df.shape}")

Combined shape: (1500, 5)


In [16]:
# cleaning the text fields

# dropping rows where selftext is empty or removed
df = df[df['selftext'].notnull()]
df = df[~df["selftext"].isin(["[deleted]", "[removed]", ""])]

# cleaning newlines and trimming whitespaces
df["selftext"] = df["selftext"].str.replace("\n", " ").str.strip()
df["title"] = df["title"].str.replace("\n", " ").str.strip()

In [17]:
# dropping duplicates and resetting index
df = df.drop_duplicates(subset=["selftext", "title"])
df = df.reset_index(drop=True)

In [18]:
# exporting the cleaned dataframe to a new csv file
df.to_csv("../data/mental_health_data_cleaned.csv", index=False)
print("Saved cleaned data to mental_health_data_cleaned.csv")

Saved cleaned data to mental_health_data_cleaned.csv


In [19]:
# checks
print(df["label"].value_counts())
df["length"] = df["selftext"].str.len()
df.describe()


label
depression    500
bipolar       494
anxiety       483
Name: count, dtype: int64


Unnamed: 0,timestamp,length
count,1477.0,1477.0
mean,1750490000.0,932.400135
std,4651698.0,967.142421
min,1572361000.0,0.0
25%,1750596000.0,398.0
50%,1750683000.0,667.0
75%,1750740000.0,1182.0
max,1750805000.0,15317.0
