In [1]:
import os
import kagglehub
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import wordcloud

# 1. Dataset Preparation


In [2]:
# Kaggle dataset handle passed to kagglehub; the call's result is printed
# below as the path to the dataset files.
dataset_handle = "suchintikasarkar/sentiment-analysis-for-mental-health"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: /home/cici/.cache/kagglehub/datasets/suchintikasarkar/sentiment-analysis-for-mental-health/versions/1


In [3]:
# Build the CSV location with os.path.join rather than string concatenation,
# so the path separator is not hard-coded and a trailing separator on `path`
# cannot produce a doubled one.
csv_path = os.path.join(path, "Combined Data.csv")
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [4]:
# Summarize column names, non-null counts and dtypes to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53043 entries, 0 to 53042
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  53043 non-null  int64 
 1   statement   52681 non-null  object
 2   status      53043 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [5]:
# Drop every row that has a missing value in any column. In the df.info()
# summary above, `statement` is the only column with fewer non-null entries
# than rows (52681 of 53043). Note that this rebinds `df`, so the unfiltered
# frame is no longer available after this cell.
df = df.dropna()

In [6]:
# Rows that repeat an earlier statement; the first occurrence of each
# statement is not flagged.
repeat_mask = df.duplicated(subset=["statement"])
dups = repeat_mask.sum()
print(f"Total duplicate statements in entire dataset: {dups}")

# keep=False flags every row whose statement occurs more than once, first
# occurrences included, so these per-status counts add up to more than `dups`.
any_copy_mask = df.duplicated(subset=["statement"], keep=False)
dups_per_status = df.loc[any_copy_mask].groupby("status").size()
print("\nDuplicates found per status:")
print(dups_per_status)

# Keep only the first occurrence of each statement.
df = df.loc[~repeat_mask].copy()

Total duplicate statements in entire dataset: 1608

Duplicates found per status:
status
Anxiety                 433
Bipolar                 552
Depression              601
Normal                  463
Personality disorder    362
Stress                  574
Suicidal                 26
dtype: int64


# 2. Dataset Analysis


In [7]:
# Status labels excluded from the rest of the analysis.
drop = ["Anxiety", "Stress", "Bipolar", "Personality disorder"]

# Keep the rows whose status is not in the excluded list and renumber them
# from 0 so later positional lookups line up.
keep_mask = ~df["status"].isin(drop)
df = df.loc[keep_mask].copy().reset_index(drop=True)

## 2.2. Manual Analysis for Some Samples


In [8]:
# Fixed random seed used as the default in `subsample` and for shuffling the
# subsample, so both steps give the same rows on every run.
SEED = 42

In [9]:
def subsample(df: pd.DataFrame, n: int = 20, seed: int = SEED) -> pd.DataFrame:
    """Draw the same number of random rows from every status group.

    Args:
        df: Frame containing a "status" column to group by.
        n: Number of rows to sample from each status group.
        seed: Random state passed to the sampler for reproducibility.

    Returns:
        A new frame holding the sampled rows of all groups, with a fresh
        0-based index.

    Raises:
        ValueError: Raised by pandas when a group has fewer than ``n`` rows
            (sampling is done without replacement).
    """
    by_status = df.groupby("status")
    drawn = by_status.sample(n=n, random_state=seed)
    return drawn.reset_index(drop=True)

In [10]:
# Draw the per-status subsample with the default size and seed.
subsampled_df = subsample(df)
# Discard the "Unnamed: 0" integer column that came with the loaded CSV
# (presumably a saved row index -- it is not used in the analysis here).
subsampled_df = subsampled_df.drop(columns=["Unnamed: 0"])
subsampled_df.head()

Unnamed: 0,statement,status
0,i wonder what misery awaits me in the msq toda...,Depression
1,I don't feel great I'm 14 and I just wanna die...,Depression
2,My depression comes and goes these past days h...,Depression
3,It seems like the more I get in touch with peo...,Depression
4,ok so i have been anxious for about year now i...,Depression


In [11]:
# Sanity check: count the sampled rows per status label.
subsampled_df["status"].value_counts()

status
Depression    20
Normal        20
Suicidal      20
Name: count, dtype: int64

In [12]:
# Create the output folder first: to_csv does not create missing directories,
# so on a fresh checkout without data/ the export would fail.
os.makedirs("data", exist_ok=True)
subsampled_df.to_csv("data/subsampled.csv", index=False)

In [13]:
# frac=1 samples every row, i.e. a seeded reordering of the subsample.
shuffled_df = subsampled_df.sample(frac=1, random_state=SEED)
# Renumber from 0 so rows can be addressed by their new position.
shuffled_df = shuffled_df.reset_index(drop=True)
shuffled_df.head()

Unnamed: 0,statement,status
0,i wonder what misery awaits me in the msq toda...,Depression
1,Can you share what your indicators are that an...,Depression
2,Direct 2 dong,Normal
3,"Fucking hell what is wrong with me, I am so fu...",Suicidal
4,I have my exams going onn. One over 4 left for...,Depression


In [14]:
# Load the hand-coded annotations. The comparison cell further down reads the
# "statement" and "manual_label (code)" columns and matches each row to the
# row of `shuffled_df` with the same index.
manual_df = pd.read_csv("data/manual_analysis.csv")
manual_df

Unnamed: 0,statement,theme/topic (subcode),manual_label (code),evidence_phrases,notes
0,i wonder what misery awaits me in the msq toda...,1,Depression,"""misery"",""depression"",""despair""",
1,Can you share what your indicators are that an...,1,Depression,"""episode"",""depressive episode""",
2,Direct 2 dong,Information?,Normal,,"It doesn't actually have a clear meaning, but ..."
3,"Fucking hell what is wrong with me, I am so fu...",Self-directed rage,Suicidal,"""Fucking hell what is wrong with me"",""I am so ...",
4,I have my exams going onn. One over 4 left for...,1,Depression,"""have not studied anything"","" iv wasted a lot ...",
5,I do not know how people work 30+ hours per we...,Work phobia,Depression,"""my phobia of working has only gotten worse"",""...",
6,But it's good that the A&W Ipoh Parade is open.,1,Normal,,
7,i cant live in this world ok its too much its ...,1,Suicidal,"""i cant live in this world"",""i cant handle it ...",
8,I have had a crush on this girl for a very lon...,1,Depression,"""confused me"",""she literally started ignoring ...",
9,NSFWI have been depressed since a child. I wat...,Past trauma & Suicidal Ideation,Suicidal,"""i have cried because i am tired of living and...",


In [15]:
# Compare our manual codes with the dataset's original labels, row by row.
# The two frames are paired by position; zip stops at the shorter frame, so
# only rows present in both are checked and a longer manual file cannot raise.
paired_rows = zip(
    manual_df["statement"],
    manual_df["manual_label (code)"],
    shuffled_df["statement"],
    shuffled_df["status"],
)
misaligned_rows = []

for i, (manual_text, manual_label, original_text, original_status) in enumerate(
    paired_rows
):
    # Row number as printed in the report. NOTE(review): the +2 offset
    # presumably maps the 0-based index to a 1-based spreadsheet row below a
    # header line -- confirm.
    row_number = i + 2

    # Labels are only comparable when both rows hold the same statement.
    # str() keeps a missing (NaN) statement from raising on .strip().
    if str(manual_text).strip().lower() != str(original_text).strip().lower():
        misaligned_rows.append(row_number)
        continue

    if manual_label != original_status:
        print(
            f"Mismatch at Row {row_number}: Our code = {manual_label}, Original status = {original_status}"
        )

# Surface rows that could not be compared instead of skipping them silently.
if misaligned_rows:
    print(
        f"\nSkipped {len(misaligned_rows)} row(s) whose statement text differs between the two frames: {misaligned_rows}"
    )

Mismatch at Row 7: Our code = Depression, Original status = Suicidal
Mismatch at Row 17: Our code = Suicidal, Original status = Depression
Mismatch at Row 23: Our code = Suicidal, Original status = Depression
Mismatch at Row 27: Our code = Depression, Original status = Suicidal
Mismatch at Row 28: Our code = Suicidal, Original status = Depression
Mismatch at Row 33: Our code = Suicidal, Original status = Depression
Mismatch at Row 39: Our code = Depression, Original status = Suicidal
Mismatch at Row 41: Our code = Depression, Original status = Normal
Mismatch at Row 43: Our code = Suicidal, Original status = Depression
Mismatch at Row 46: Our code = Depression, Original status = Suicidal
Mismatch at Row 50: Our code = Normal, Original status = Suicidal
Mismatch at Row 54: Our code = Depression, Original status = Suicidal
Mismatch at Row 58: Our code = Suicidal, Original status = Depression
