# MindInsight Classifier: Unveiling Mental Health Patterns in Pandemic Discourse through Data-Driven Analysis

Let us first import the pertinent libraries.

In [70]:
#!pip install wordcloud

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('mental_disorders_reddit.csv')

In [4]:
df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit
0,Life is so pointless without others,Does anyone else think the most important part...,1650356960,False,BPD
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,1650356660,False,BPD
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,1650355379,False,BPD
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",1650353430,False,BPD
4,help,[removed],1650350907,False,BPD


### Data Preprocessing and Simple EDA (Part 1)

In [5]:
print(df.shape)

(701787, 5)


In [6]:
df.isnull().sum()

title             46
selftext       33691
created_utc        0
over_18            0
subreddit          0
dtype: int64

In [7]:
df = df.dropna(subset=['selftext'], how='any')

In [8]:
df.isnull().sum()

title          42
selftext        0
created_utc     0
over_18         0
subreddit       0
dtype: int64

In [9]:
df['subreddit'].value_counts()

BPD              233125
Anxiety          167059
depression       156717
bipolar           46666
mentalillness     44249
schizophrenia     20280
Name: subreddit, dtype: int64

In [10]:
df['title'] = df['title'].fillna('')

# Calculate the total number of words in 'title'
df['title_total'] = df['title'].apply(lambda x: len(x.split()))

# Define a function to count total characters in a text (excluding spaces)
def count_total_words(text):
    char = 0
    for word in text.split():
        char += len(word)
    return char

# Calculate the total number of characters in 'title'
df['title_chars'] = df['title'].apply(count_total_words)

In [11]:
df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars
0,Life is so pointless without others,Does anyone else think the most important part...,1650356960,False,BPD,6,30
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,1650356660,False,BPD,2,9
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,1650355379,False,BPD,6,16
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",1650353430,False,BPD,3,21
4,help,[removed],1650350907,False,BPD,1,4


In [12]:
df['text_total'] = df['selftext'].apply(lambda x: len(x.split()))

def count_total_words(text):
    char = 0
    for word in text.split():
        char += len(word)
    return char

df['text_chars'] = df["selftext"].apply(count_total_words)

In [13]:
df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
0,Life is so pointless without others,Does anyone else think the most important part...,1650356960,False,BPD,6,30,74,310
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,1650356660,False,BPD,2,9,517,2259
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,1650355379,False,BPD,6,16,145,545
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",1650353430,False,BPD,3,21,821,3282
4,help,[removed],1650350907,False,BPD,1,4,1,9


### Data Downsizing

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668096 entries, 0 to 701786
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        668096 non-null  object
 1   selftext     668096 non-null  object
 2   created_utc  668096 non-null  int64 
 3   over_18      668096 non-null  bool  
 4   subreddit    668096 non-null  object
 5   title_total  668096 non-null  int64 
 6   title_chars  668096 non-null  int64 
 7   text_total   668096 non-null  int64 
 8   text_chars   668096 non-null  int64 
dtypes: bool(1), int64(5), object(3)
memory usage: 46.5+ MB


The number of data is 666,8096. It is very large and takes a lot of time to process. As we wish to spotlight the posts published during the duration of the COVID-19 pandemic, we will be limiting our data to only include posts from March 2020 onwards. A random sample of 10,000 posts will be taken from the dataset for efficiency.

In [15]:
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

In [16]:
df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
0,Life is so pointless without others,Does anyone else think the most important part...,2022-04-19 08:29:20,False,BPD,6,30,74,310
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,2022-04-19 08:24:20,False,BPD,2,9,517,2259
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,2022-04-19 08:02:59,False,BPD,6,16,145,545
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",2022-04-19 07:30:30,False,BPD,3,21,821,3282
4,help,[removed],2022-04-19 06:48:27,False,BPD,1,4,1,9


In [17]:
# Filter posts from March 2020 onwards
filtered_df = df[df['created_utc'] >= '2020-03-01']

# Take a random sample of 10,000 posts
sampled_df = filtered_df.sample(n=1000, random_state=42)

In [18]:
sampled_df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
131450,Looking for hope (feeling fed up),My diagnosis is fairly new and I havent starte...,2020-05-30 22:47:57,False,BPD,6,28,344,1414
691395,Get motivated with determination you can do an...,Like I just managed to cut with a safety razor,2020-05-17 15:31:50,False,mentalillness,8,45,10,37
275676,memory flashes,"so, I used to have a really good memory\n\n&am...",2022-10-13 18:02:41,False,bipolar,2,13,91,424
392360,I'll never get to live in the fantasy land for...,I won't ever get to turn my fantasies into rea...,2022-03-01 07:58:19,False,depression,10,41,72,288
313915,It's my 27 birthday and I don't know wtf with ...,[removed],2022-02-26 21:42:56,False,depression,13,50,1,9


In [19]:
sampled_df['subreddit'].value_counts()

depression       297
Anxiety          275
BPD              247
mentalillness     77
bipolar           72
schizophrenia     32
Name: subreddit, dtype: int64

### Recategorizing 'subreddit'

In [20]:
# def mental_disorders(ex):
#     if ex == 'BPD':
#         return 'BPD'
#     elif ex == 'bipolar':
#         return 'bipolar'
#     elif ex == 'Anxiety':
#         return 'anxiety'
#     elif ex == 'schizophrenia':
#         return 'schizophrenia'
#     elif ex == 'depression':
#         return 'depression'
#     else:
#         return 'others'


def mental_disorders(ex):
    if ex=='BPD':
        return 'BPD'
    elif ex=='bipolar':
        return 'bipolar'
    else:
        return 'others'

In [21]:
sampled_df['subreddit'] = sampled_df['subreddit'].apply(mental_disorders)

In [22]:
sampled_df.head(20)

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
131450,Looking for hope (feeling fed up),My diagnosis is fairly new and I havent starte...,2020-05-30 22:47:57,False,BPD,6,28,344,1414
691395,Get motivated with determination you can do an...,Like I just managed to cut with a safety razor,2020-05-17 15:31:50,False,others,8,45,10,37
275676,memory flashes,"so, I used to have a really good memory\n\n&am...",2022-10-13 18:02:41,False,bipolar,2,13,91,424
392360,I'll never get to live in the fantasy land for...,I won't ever get to turn my fantasies into rea...,2022-03-01 07:58:19,False,others,10,41,72,288
313915,It's my 27 birthday and I don't know wtf with ...,[removed],2022-02-26 21:42:56,False,others,13,50,1,9
538129,tips for managing the AAAA ?,I've been medically diagnosed with a general a...,2021-08-06 01:54:21,False,others,6,23,113,506
293850,Breakup depression and self isolated without r...,[removed],2022-08-06 09:23:21,False,others,7,49,1,9
402501,I really can’t get out of this,The last month my depression reach its lowest ...,2022-07-05 22:33:20,False,others,7,24,168,698
566855,anxiety over such insignificant things….,just had to reschedule a doctor’s appointment ...,2021-08-18 22:51:22,False,others,5,36,192,860
357103,My dog died and I have nothing left.,My marriage isn't doing great. Dog was healthy...,2022-08-25 20:40:17,False,others,8,29,196,757


In [23]:
# We will remove the rows under selftext with have '[removed]'

sampled_df = sampled_df[sampled_df['selftext'] != '[removed]']

In [24]:
sampled_df.head(20)

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
131450,Looking for hope (feeling fed up),My diagnosis is fairly new and I havent starte...,2020-05-30 22:47:57,False,BPD,6,28,344,1414
691395,Get motivated with determination you can do an...,Like I just managed to cut with a safety razor,2020-05-17 15:31:50,False,others,8,45,10,37
275676,memory flashes,"so, I used to have a really good memory\n\n&am...",2022-10-13 18:02:41,False,bipolar,2,13,91,424
392360,I'll never get to live in the fantasy land for...,I won't ever get to turn my fantasies into rea...,2022-03-01 07:58:19,False,others,10,41,72,288
538129,tips for managing the AAAA ?,I've been medically diagnosed with a general a...,2021-08-06 01:54:21,False,others,6,23,113,506
402501,I really can’t get out of this,The last month my depression reach its lowest ...,2022-07-05 22:33:20,False,others,7,24,168,698
566855,anxiety over such insignificant things….,just had to reschedule a doctor’s appointment ...,2021-08-18 22:51:22,False,others,5,36,192,860
357103,My dog died and I have nothing left.,My marriage isn't doing great. Dog was healthy...,2022-08-25 20:40:17,False,others,8,29,196,757
443776,Spiraling out of control,Do you ever get to where you feel fine one min...,2022-05-09 02:03:26,True,others,4,21,146,650
87944,Pms exacerbating neediness for fp,I've been working hard with my therapist on co...,2021-01-18 00:09:14,False,BPD,5,29,90,381


### 'title' and 'selftext' Preprocessing

In [29]:
import re
import string
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [35]:
def convert_lowercase(text):
    text = text.lower()
    return text

sampled_df['title'] = sampled_df['title'].apply(convert_lowercase)

In [36]:
def remove_url(text):
    re_url = re.compile('https?://\S+|www\.\S+')
    return re_url.sub('', text)

sampled_df['title'] = sampled_df['title'].apply(remove_url)

In [37]:
exclude = string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('', '', exclude))

sampled_df['title'] = sampled_df['title'].apply(remove_punc)

In [38]:
def remove_stopwords(text):
    new_list = []
    words = word_tokenize(text)
    stopwrds = stopwords.words('english')
    for word in words:
        if word not in stopwrds:
            new_list.append(word)
    return ' '.join(new_list)

sampled_df['title'] = sampled_df['title'].apply(remove_stopwords)

In [39]:
def perform_stemming(text):
    stemmer = PorterStemmer()
    new_list = []
    words = word_tokenize(text)
    for word in words:
        new_list.append(stemmer.stem(word))

    return " ".join(new_list)

sampled_df['title'] = sampled_df['title'].apply(perform_stemming)

In [40]:
sampled_df['selftext'] = sampled_df['selftext'].apply(convert_lowercase)

In [41]:
sampled_df['selftext'] = sampled_df['selftext'].apply(remove_url)

In [42]:
sampled_df['selftext'] = sampled_df['selftext'].apply(remove_punc)

In [43]:
sampled_df['selftext'] = sampled_df['selftext'].apply(remove_stopwords)

In [44]:
sampled_df['selftext'] = sampled_df['selftext'].apply(perform_stemming)

In [45]:
sampled_df['Total Text After Transformation'] = sampled_df['selftext'].apply(lambda x: np.log(len(x.split())))

In [46]:
sampled_df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars,Total Text After Transformation
131450,look hope feel fed,diagnosi fairli new havent start dbt yet anyon...,2020-05-30 22:47:57,False,BPD,6,28,344,1414,5.147494
691395,get motiv determin anyth,like manag cut safeti razor,2020-05-17 15:31:50,False,others,8,45,10,37,1.609438
275676,memori flash,use realli good memori ampx200b soon issu star...,2022-10-13 18:02:41,False,bipolar,2,13,91,424,3.871201
392360,ill never get live fantasi land forev,wont ever get turn fantasi realiti slave socie...,2022-03-01 07:58:19,False,others,10,41,72,288,3.496508
538129,tip manag aaaa,ive medic diagnos gener anxieti disord six mon...,2021-08-06 01:54:21,False,others,6,23,113,506,4.043051


### Classification Models

In [47]:
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, PrecisionRecallDisplay, precision_score, recall_score,f1_score
from sklearn.metrics import confusion_matrix, classification_report

In [74]:
X = sampled_df["title"]
y = sampled_df['subreddit'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state= 42, stratify = y)

In [75]:
tfidf = TfidfVectorizer(max_features= 2500, min_df= 2)
X_train = tfidf.fit_transform(X_train).toarray()
X_test = tfidf.transform(X_test).toarray()

In [76]:
def check_scores(y_pred):
    print('Precision: %.3f' % precision_score(y_test, y_pred,average="micro"))
    print('Recall: %.3f' % recall_score(y_test, y_pred, average="micro"))
    print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
    print('F1 Score: %.3f' % f1_score(y_test, y_pred, average="micro"))
    return None

In [77]:
forest_model_test = RandomForestClassifier(max_depth=40,random_state=42)
forest_model_test.fit(X_train,y_train)
Y_random_model_test =forest_model_test.predict(X_test)
forest_param = {'n_estimators':[500,700,1000],'max_depth':[10,20,40,70]}
forest_Gridsearch = GridSearchCV(estimator=forest_model_test,param_grid=forest_param,cv=10,scoring='f1',n_jobs=-1)
forest_Gridsearch.fit(X_train,y_train)

GridSearchCV(cv=10,
             estimator=RandomForestClassifier(max_depth=40, random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [10, 20, 40, 70],
                         'n_estimators': [500, 700, 1000]},
             scoring='f1')

In [78]:
forest_Gridsearch.best_params_

{'max_depth': 10, 'n_estimators': 500}

In [79]:
forest_model_best= RandomForestClassifier(n_estimators=500,max_depth=10)

In [80]:
forest_model_best.fit(X_train,y_train)

RandomForestClassifier(max_depth=10, n_estimators=500)

In [81]:
forest_y_pred =forest_model_best.predict(X_test)

In [82]:
forest_confusion = confusion_matrix(forest_y_pred,y_test)
forest_confusion

array([[  5,   0,   1],
       [  0,   0,   0],
       [ 39,  11, 116]], dtype=int64)

In [83]:
check_scores(forest_y_pred)

Precision: 0.703
Recall: 0.703
Accuracy: 0.703
F1 Score: 0.703


In [84]:
Tree_model = DecisionTreeClassifier()
Tree_model.fit(X_train,y_train)
Y_pred = Tree_model.predict(X_test)

In [85]:
decision_tree_confusion = confusion_matrix(Y_pred,y_test)
decision_tree_confusion

array([[ 19,   1,  13],
       [  1,   3,   4],
       [ 24,   7, 100]], dtype=int64)

In [86]:
check_scores(Y_pred)

Precision: 0.709
Recall: 0.709
Accuracy: 0.709
F1 Score: 0.709


In [89]:
best_k = None
best_accuracy = 0

for k in range(2, 15):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    knn_prediction = knn.predict(X_test)

    accuracy = accuracy_score(y_test, knn_prediction)
    recall = recall_score(y_test, knn_prediction, average = "micro")
    precision = precision_score(y_test, knn_prediction, average = "micro")
    f1 = f1_score(y_test, knn_prediction, average = "micro")
    cm = confusion_matrix(y_test, knn_prediction)

    print("K =", k)
    print("Accuracy Score:", accuracy)
    print("Recall Score:", recall)
    print("Precision Score:", precision)
    print("F1 Score:", f1)
    print("Confusion Matrix:")
    print(cm)
    print("\n")

    # Update best_k if the current k gives a higher accuracy
    if accuracy > best_accuracy:
        best_k = k
        best_accuracy = accuracy

print("Best k based on accuracy:", best_k)

K = 2
Accuracy Score: 0.5988372093023255
Recall Score: 0.5988372093023255
Precision Score: 0.5988372093023255
F1 Score: 0.5988372093023255
Confusion Matrix:
[[14  2 28]
 [ 2  0  9]
 [23  5 89]]


K = 3
Accuracy Score: 0.6569767441860465
Recall Score: 0.6569767441860465
Precision Score: 0.6569767441860465
F1 Score: 0.6569767441860465
Confusion Matrix:
[[  6   0  38]
 [  0   0  11]
 [ 10   0 107]]


K = 4
Accuracy Score: 0.6627906976744186
Recall Score: 0.6627906976744186
Precision Score: 0.6627906976744186
F1 Score: 0.6627906976744186
Confusion Matrix:
[[ 11   0  33]
 [  0   0  11]
 [ 12   2 103]]


K = 5
Accuracy Score: 0.6686046511627907
Recall Score: 0.6686046511627907
Precision Score: 0.6686046511627907
F1 Score: 0.6686046511627907
Confusion Matrix:
[[  9   0  35]
 [  0   0  11]
 [  7   4 106]]


K = 6
Accuracy Score: 0.6627906976744186
Recall Score: 0.6627906976744186
Precision Score: 0.6627906976744186
F1 Score: 0.6627906976744186
Confusion Matrix:
[[  7   0  37]
 [  0   0  11]
 [

In [91]:
# k with the best accuracy

knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
knn_prediction = knn.predict(X_test)

accuracy = accuracy_score(y_test, knn_prediction)
recall = recall_score(y_test, knn_prediction, average = "micro")
precision = precision_score(y_test, knn_prediction, average = "micro")
f1 = f1_score(y_test, knn_prediction, average = "micro")
cm = confusion_matrix(y_test, knn_prediction)

print("Accuracy Score:", accuracy)
print("Recall Score:", recall)
print("Precision Score:", precision)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(cm)

Accuracy Score: 0.6976744186046512
Recall Score: 0.6976744186046512
Precision Score: 0.6976744186046512
F1 Score: 0.6976744186046512
Confusion Matrix:
[[  4   0  40]
 [  0   0  11]
 [  1   0 116]]
