# MindInsight Classifier: Unveiling Mental Health Patterns in Pandemic Discourse through Data-Driven Analysis

Let us first import the pertinent libraries.

In [70]:
#!pip install wordcloud

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('mental_disorders_reddit.csv')

In [3]:
df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit
0,Life is so pointless without others,Does anyone else think the most important part...,1650356960,False,BPD
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,1650356660,False,BPD
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,1650355379,False,BPD
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",1650353430,False,BPD
4,help,[removed],1650350907,False,BPD


### Data Preprocessing and Simple EDA (Part 1)

In [4]:
print(df.shape)

(701787, 5)


In [5]:
df.isnull().sum()

title             46
selftext       33691
created_utc        0
over_18            0
subreddit          0
dtype: int64

In [6]:
df = df.dropna(subset=['selftext'], how='any')

In [7]:
df.isnull().sum()

title          42
selftext        0
created_utc     0
over_18         0
subreddit       0
dtype: int64

In [8]:
df['subreddit'].value_counts()

BPD              233125
Anxiety          167059
depression       156717
bipolar           46666
mentalillness     44249
schizophrenia     20280
Name: subreddit, dtype: int64

In [9]:
df['title'] = df['title'].fillna('')

# Calculate the total number of words in 'title'
df['title_total'] = df['title'].apply(lambda x: len(x.split()))

# Define a function to count total characters in a text (excluding spaces)
def count_total_words(text):
    char = 0
    for word in text.split():
        char += len(word)
    return char

# Calculate the total number of characters in 'title'
df['title_chars'] = df['title'].apply(count_total_words)

In [10]:
df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars
0,Life is so pointless without others,Does anyone else think the most important part...,1650356960,False,BPD,6,30
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,1650356660,False,BPD,2,9
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,1650355379,False,BPD,6,16
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",1650353430,False,BPD,3,21
4,help,[removed],1650350907,False,BPD,1,4


In [11]:
df['text_total'] = df['selftext'].apply(lambda x: len(x.split()))

def count_total_words(text):
    char = 0
    for word in text.split():
        char += len(word)
    return char

df['text_chars'] = df["selftext"].apply(count_total_words)

In [12]:
df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
0,Life is so pointless without others,Does anyone else think the most important part...,1650356960,False,BPD,6,30,74,310
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,1650356660,False,BPD,2,9,517,2259
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,1650355379,False,BPD,6,16,145,545
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",1650353430,False,BPD,3,21,821,3282
4,help,[removed],1650350907,False,BPD,1,4,1,9


### Data Downsizing

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668096 entries, 0 to 701786
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        668096 non-null  object
 1   selftext     668096 non-null  object
 2   created_utc  668096 non-null  int64 
 3   over_18      668096 non-null  bool  
 4   subreddit    668096 non-null  object
 5   title_total  668096 non-null  int64 
 6   title_chars  668096 non-null  int64 
 7   text_total   668096 non-null  int64 
 8   text_chars   668096 non-null  int64 
dtypes: bool(1), int64(5), object(3)
memory usage: 46.5+ MB


The number of data is 666,8096. It is very large and takes a lot of time to process. As we wish to spotlight the posts published during the duration of the COVID-19 pandemic, we will be limiting our data to only include posts from March 2020 onwards. A random sample of 10,000 posts will be taken from the dataset for efficiency.

In [14]:
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

In [15]:
df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
0,Life is so pointless without others,Does anyone else think the most important part...,2022-04-19 08:29:20,False,BPD,6,30,74,310
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,2022-04-19 08:24:20,False,BPD,2,9,517,2259
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,2022-04-19 08:02:59,False,BPD,6,16,145,545
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",2022-04-19 07:30:30,False,BPD,3,21,821,3282
4,help,[removed],2022-04-19 06:48:27,False,BPD,1,4,1,9


In [16]:
# Filter posts from March 2020 onwards
filtered_df = df[df['created_utc'] >= '2020-03-01']

# Take a random sample of 10,000 posts
sampled_df = filtered_df.sample(n=1000, random_state=42)

In [17]:
sampled_df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
131450,Looking for hope (feeling fed up),My diagnosis is fairly new and I havent starte...,2020-05-30 22:47:57,False,BPD,6,28,344,1414
691395,Get motivated with determination you can do an...,Like I just managed to cut with a safety razor,2020-05-17 15:31:50,False,mentalillness,8,45,10,37
275676,memory flashes,"so, I used to have a really good memory\n\n&am...",2022-10-13 18:02:41,False,bipolar,2,13,91,424
392360,I'll never get to live in the fantasy land for...,I won't ever get to turn my fantasies into rea...,2022-03-01 07:58:19,False,depression,10,41,72,288
313915,It's my 27 birthday and I don't know wtf with ...,[removed],2022-02-26 21:42:56,False,depression,13,50,1,9


In [18]:
sampled_df['subreddit'].value_counts()

depression       297
Anxiety          275
BPD              247
mentalillness     77
bipolar           72
schizophrenia     32
Name: subreddit, dtype: int64

### Recategorizing 'subreddit'

In [19]:
# def mental_disorders(ex):
#     if ex == 'BPD':
#         return 'BPD'
#     elif ex == 'bipolar':
#         return 'bipolar'
#     elif ex == 'Anxiety':
#         return 'anxiety'
#     elif ex == 'schizophrenia':
#         return 'schizophrenia'
#     elif ex == 'depression':
#         return 'depression'
#     else:
#         return 'others'

def mental_disorders(ex):
    if ex== 'schizophrenia':
        return 'schizophrenia'
    elif ex == 'Anxiety':
        return 'Anxiety'
    else:
        return 'others'

In [20]:
sampled_df['subreddit'] = sampled_df['subreddit'].apply(mental_disorders)

In [21]:
sampled_df.head(20)

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
131450,Looking for hope (feeling fed up),My diagnosis is fairly new and I havent starte...,2020-05-30 22:47:57,False,others,6,28,344,1414
691395,Get motivated with determination you can do an...,Like I just managed to cut with a safety razor,2020-05-17 15:31:50,False,others,8,45,10,37
275676,memory flashes,"so, I used to have a really good memory\n\n&am...",2022-10-13 18:02:41,False,others,2,13,91,424
392360,I'll never get to live in the fantasy land for...,I won't ever get to turn my fantasies into rea...,2022-03-01 07:58:19,False,others,10,41,72,288
313915,It's my 27 birthday and I don't know wtf with ...,[removed],2022-02-26 21:42:56,False,others,13,50,1,9
538129,tips for managing the AAAA ?,I've been medically diagnosed with a general a...,2021-08-06 01:54:21,False,Anxiety,6,23,113,506
293850,Breakup depression and self isolated without r...,[removed],2022-08-06 09:23:21,False,others,7,49,1,9
402501,I really can’t get out of this,The last month my depression reach its lowest ...,2022-07-05 22:33:20,False,others,7,24,168,698
566855,anxiety over such insignificant things….,just had to reschedule a doctor’s appointment ...,2021-08-18 22:51:22,False,Anxiety,5,36,192,860
357103,My dog died and I have nothing left.,My marriage isn't doing great. Dog was healthy...,2022-08-25 20:40:17,False,others,8,29,196,757


In [22]:
# We will remove the rows under selftext with have '[removed]'

sampled_df = sampled_df[sampled_df['selftext'] != '[removed]']

In [23]:
sampled_df.head(20)

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
131450,Looking for hope (feeling fed up),My diagnosis is fairly new and I havent starte...,2020-05-30 22:47:57,False,others,6,28,344,1414
691395,Get motivated with determination you can do an...,Like I just managed to cut with a safety razor,2020-05-17 15:31:50,False,others,8,45,10,37
275676,memory flashes,"so, I used to have a really good memory\n\n&am...",2022-10-13 18:02:41,False,others,2,13,91,424
392360,I'll never get to live in the fantasy land for...,I won't ever get to turn my fantasies into rea...,2022-03-01 07:58:19,False,others,10,41,72,288
538129,tips for managing the AAAA ?,I've been medically diagnosed with a general a...,2021-08-06 01:54:21,False,Anxiety,6,23,113,506
402501,I really can’t get out of this,The last month my depression reach its lowest ...,2022-07-05 22:33:20,False,others,7,24,168,698
566855,anxiety over such insignificant things….,just had to reschedule a doctor’s appointment ...,2021-08-18 22:51:22,False,Anxiety,5,36,192,860
357103,My dog died and I have nothing left.,My marriage isn't doing great. Dog was healthy...,2022-08-25 20:40:17,False,others,8,29,196,757
443776,Spiraling out of control,Do you ever get to where you feel fine one min...,2022-05-09 02:03:26,True,others,4,21,146,650
87944,Pms exacerbating neediness for fp,I've been working hard with my therapist on co...,2021-01-18 00:09:14,False,others,5,29,90,381


### Text Pre-Processing

In [24]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

string.punctuation
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to C:\Users\Don
[nltk_data]     Bosco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Don
[nltk_data]     Bosco\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Don
[nltk_data]     Bosco\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Don
[nltk_data]     Bosco\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [25]:
sampled_df.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit,title_total,title_chars,text_total,text_chars
131450,Looking for hope (feeling fed up),My diagnosis is fairly new and I havent starte...,2020-05-30 22:47:57,False,others,6,28,344,1414
691395,Get motivated with determination you can do an...,Like I just managed to cut with a safety razor,2020-05-17 15:31:50,False,others,8,45,10,37
275676,memory flashes,"so, I used to have a really good memory\n\n&am...",2022-10-13 18:02:41,False,others,2,13,91,424
392360,I'll never get to live in the fantasy land for...,I won't ever get to turn my fantasies into rea...,2022-03-01 07:58:19,False,others,10,41,72,288
538129,tips for managing the AAAA ?,I've been medically diagnosed with a general a...,2021-08-06 01:54:21,False,Anxiety,6,23,113,506


In [26]:
sampled_df['all_text'] = sampled_df['title'] + " " + sampled_df['selftext']

df = sampled_df[['all_text', 'subreddit']]
df = df[df['subreddit'] != 'others']

df.head()

Unnamed: 0,all_text,subreddit
538129,tips for managing the AAAA ? I've been medical...,Anxiety
566855,anxiety over such insignificant things…. just ...,Anxiety
496598,EMDR? Has anyone here used emdr therapy before...,Anxiety
613187,Getting anxious about coming to gym I used to ...,Anxiety
633253,Nicotine Helps me think I found without nicoti...,schizophrenia


In [27]:
# Define the abbreviations dictionary
abbr_dict = {
    "'cause": "because",
    "ain't": "am not",
    "can't": "can not",
    "cannot": "can not",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesnt": "does not",
    "don't": "do not",
    "dont": "do not",
    "gimme": "give me",
    "gotta": "got to",
    "hadn't": "had not",
    "hadnt": "had not",
    "hasn't": "has not",
    "hasnt": "has not",
    "haven't": "have not",
    "havent": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "here's": "here is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'm": "i am",
    "i'll": "i will",
    "i've": "i have",
    "i ve": "i have",
    "imma": "i am going to",
    "isn't": "is not",
    "it'll": "it will",
    "it's": "it is",
    "lemme": "let me",
    "let's": "let us",
    "not've": "not have",
    "shouldn't": "should not",
    "she'll": "she will",
    "she's": "she is",
    "that's": "that is",
    "there's": "there is",
    "there're": "there are",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "wasnt": "was not",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "werent": "were not",
    "what's": "what is",
    "what're": "what are",
    "when's": "when is",
    "when're": "when are",
    "where's": "where is",
    "where're": "where are",
    "who's": "who is",
    "who're": "who are",
    "who've": "who have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

# Define the function to replace the abbreviations
def replace_abbreviations(text):
    # Replace '’' with '\'
    text = re.sub('’', '\'', text)

    # Remove any word that starts with 'm' or 'f' followed by digits
    text = re.sub(r'\b[mf](\d+)\b', '', text, flags=re.IGNORECASE)

    # Remove any digit that is followed by 'm' or 'f'
    text = re.sub(r'\b(\d+)[mf]\b', '', text, flags=re.IGNORECASE)

    # Replace abbreviations with their full form
    for word in text.split():
        if word.lower() in abbr_dict:
            text = re.sub(r'\b{}\b'.format(word), abbr_dict[word.lower()], text, flags=re.IGNORECASE)
    return text

# Define the function to remove emojis
def remove_emojis(text):
    emoji = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002500-\U00002BEF"
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoji, '', text)

def remove_html(data):
    html_tag=re.compile(r'<.*?>')
    data=html_tag.sub(r'',data)
    return data

def remove_whitespaces(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

def remove_digits(text):
    return re.sub(r'\d+', '', text)

In [28]:
# Cleaning and tokenization
def tokenization(text):
    set_stop_words = set(stopwords.words('english'))

    text = replace_abbreviations(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = remove_emojis(text)
    text = remove_html(text)
    text = remove_whitespaces(text)
    text = remove_digits(text)
    tokens = word_tokenize(text)

    return [w for w in tokens if w not in set_stop_words]

df['tokens']= df['all_text'].apply(lambda x: tokenization(x))
df.head()

Unnamed: 0,all_text,subreddit,tokens
538129,tips for managing the AAAA ? I've been medical...,Anxiety,"[tips, managing, aaaa, medically, diagnosed, g..."
566855,anxiety over such insignificant things…. just ...,Anxiety,"[anxiety, insignificant, things, reschedule, d..."
496598,EMDR? Has anyone here used emdr therapy before...,Anxiety,"[emdr, anyone, used, emdr, therapy, effective]"
613187,Getting anxious about coming to gym I used to ...,Anxiety,"[getting, anxious, coming, gym, used, train, c..."
633253,Nicotine Helps me think I found without nicoti...,schizophrenia,"[nicotine, helps, think, found, without, nicot..."


In [29]:
# Lemmatization
word_lemmatizer = WordNetLemmatizer()

def lemmatization(text):
    lemm_text = [word_lemmatizer.lemmatize(word, pos="v") for word in text]
    lemm_text = [word_lemmatizer.lemmatize(word, pos="n") for word in lemm_text]
    lemm_text = [word_lemmatizer.lemmatize(word, pos="a") for word in lemm_text]
    lemm_text = [word_lemmatizer.lemmatize(word, pos="r") for word in lemm_text]
    lemm_text = [word_lemmatizer.lemmatize(word, pos="s") for word in lemm_text]
    return lemm_text

df['lemmatized_tokens'] = df['tokens'].apply(lambda x:lemmatization(x))
df.head(20)

Unnamed: 0,all_text,subreddit,tokens,lemmatized_tokens
538129,tips for managing the AAAA ? I've been medical...,Anxiety,"[tips, managing, aaaa, medically, diagnosed, g...","[tip, manage, aaaa, medically, diagnose, gener..."
566855,anxiety over such insignificant things…. just ...,Anxiety,"[anxiety, insignificant, things, reschedule, d...","[anxiety, insignificant, thing, reschedule, do..."
496598,EMDR? Has anyone here used emdr therapy before...,Anxiety,"[emdr, anyone, used, emdr, therapy, effective]","[emdr, anyone, use, emdr, therapy, effective]"
613187,Getting anxious about coming to gym I used to ...,Anxiety,"[getting, anxious, coming, gym, used, train, c...","[get, anxious, come, gym, use, train, come, ne..."
633253,Nicotine Helps me think I found without nicoti...,schizophrenia,"[nicotine, helps, think, found, without, nicot...","[nicotine, help, think, find, without, nicotin..."
525248,"Weird feeling the past couple weeks Hello, I’m...",Anxiety,"[weird, feeling, past, couple, weeks, hello, f...","[weird, feel, past, couple, week, hello, f, ho..."
612722,Does anyone else spiral when listening to anxi...,Anxiety,"[anyone, else, spiral, listening, anxiety, sto...","[anyone, else, spiral, listen, anxiety, story,..."
526416,😔😔😔 Hi anxious now… about possible broken glas...,Anxiety,"[hi, anxious, possible, broken, glass, eye, th...","[hi, anxious, possible, break, glass, eye, tho..."
519916,MRI tomorrow and I'm really nervous My neurolo...,Anxiety,"[mri, tomorrow, really, nervous, neurologist, ...","[mri, tomorrow, really, nervous, neurologist, ..."
491615,How can I deal with anxiety during exams? I ha...,Anxiety,"[deal, anxiety, exams, diagnosed, generalized,...","[deal, anxiety, exam, diagnose, generalize, an..."


In [30]:
from collections import defaultdict
import copy

# Counting the unique number of tokens for num_words in text_encoding

lemmatized_words = [word for word_list in df['lemmatized_tokens'] for word in word_list]
unique_words = len(set(lemmatized_words))

# Encoding and padding

def text_encoding(lemmatized_texts, num_words):
    vocabulary = defaultdict(int)
    fdist = nltk.FreqDist()

    all_lemmatized_words = [word for word_list in lemmatized_texts for word in word_list]
    
    for word in all_lemmatized_words:
        fdist[word] += 1

    common_words = fdist.most_common(num_words)

    for idx, word in enumerate(common_words):
        vocabulary[word[0]] = (idx + 1)

    encoded_texts = []
    texts4encoding = []

    for tokens in lemmatized_texts:
        temp_codes = []
        temp_words = []

        for word in tokens:
            if word in vocabulary.keys():
                temp_codes.append(vocabulary[word])
                temp_words.append(word)

        encoded_texts.append(temp_codes)
        texts4encoding.append(temp_words)

    vector_size = max(len(x) for x in encoded_texts)

    return encoded_texts, texts4encoding, vector_size

def codes_padding(X_encoded_texts):
    pad_value = 0
    padded_codes = []

    codes_from_texts = copy.deepcopy(X_encoded_texts)
    
    # vector_size in text_encoding
    max_length = max(len(encoded_text) for encoded_text in codes_from_texts)

    for encoded_text in codes_from_texts:
        while len(encoded_text) < max_length:
            encoded_text.append(pad_value)
        padded_codes.append(encoded_text)

    return padded_codes

df['padded_encoding'] = codes_padding(text_encoding(df['lemmatized_tokens'], unique_words)[0])

df.head(20)

Unnamed: 0,all_text,subreddit,tokens,lemmatized_tokens,padded_encoding
538129,tips for managing the AAAA ? I've been medical...,Anxiety,"[tips, managing, aaaa, medically, diagnosed, g...","[tip, manage, aaaa, medically, diagnose, gener...","[173, 233, 1749, 1750, 220, 234, 2, 167, 1751,..."
566855,anxiety over such insignificant things…. just ...,Anxiety,"[anxiety, insignificant, things, reschedule, d...","[anxiety, insignificant, thing, reschedule, do...","[2, 1758, 13, 602, 128, 209, 222, 1759, 696, 4..."
496598,EMDR? Has anyone here used emdr therapy before...,Anxiety,"[emdr, anyone, used, emdr, therapy, effective]","[emdr, anyone, use, emdr, therapy, effective]","[1277, 22, 73, 1277, 84, 826, 0, 0, 0, 0, 0, 0..."
613187,Getting anxious about coming to gym I used to ...,Anxiety,"[getting, anxious, coming, gym, used, train, c...","[get, anxious, come, gym, use, train, come, ne...","[3, 34, 40, 1002, 73, 603, 40, 116, 1002, 14, ..."
633253,Nicotine Helps me think I found without nicoti...,schizophrenia,"[nicotine, helps, think, found, without, nicot...","[nicotine, help, think, find, without, nicotin...","[827, 15, 6, 54, 117, 827, 109, 168, 1004, 18,..."
525248,"Weird feeling the past couple weeks Hello, I’m...",Anxiety,"[weird, feeling, past, couple, weeks, hello, f...","[weird, feel, past, couple, week, hello, f, ho...","[111, 1, 105, 235, 59, 293, 1279, 418, 73, 89,..."
612722,Does anyone else spiral when listening to anxi...,Anxiety,"[anyone, else, spiral, listening, anxiety, sto...","[anyone, else, spiral, listen, anxiety, story,...","[22, 52, 385, 311, 2, 194, 16, 1771, 421, 89, ..."
526416,😔😔😔 Hi anxious now… about possible broken glas...,Anxiety,"[hi, anxious, possible, broken, glass, eye, th...","[hi, anxious, possible, break, glass, eye, tho...","[239, 34, 357, 188, 1015, 276, 839, 358, 14, 1..."
519916,MRI tomorrow and I'm really nervous My neurolo...,Anxiety,"[mri, tomorrow, really, nervous, neurologist, ...","[mri, tomorrow, really, nervous, neurologist, ...","[840, 315, 12, 261, 841, 9, 840, 222, 116, 13,..."
491615,How can I deal with anxiety during exams? I ha...,Anxiety,"[deal, anxiety, exams, diagnosed, generalized,...","[deal, anxiety, exam, diagnose, generalize, an...","[92, 2, 472, 220, 1291, 2, 167, 27, 177, 1292,..."


### Model 2: LSTM

In [31]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [32]:
from torch.utils.data import Dataset, DataLoader

class DatasetMapping(Dataset):

    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
      
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
from sklearn.model_selection import train_test_split

class   DatasetLoading:
    
    def __init__(self, padded_codes, targets):
        
        self.X = padded_codes
        self.y = targets
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        
    def data_split(self):
        
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.20, random_state=20231116)    

    def data_mapping(self):
        
        self.train = DatasetMapping(self.X_train, self.y_train)
        self.test = DatasetMapping(self.X_test, self.y_test)

    def data_loading(self):
        self.loader_train = DataLoader(self.train, batch_size=params.batch_size)
        self.loader_test = DataLoader(self.test, batch_size=params.batch_size)  

In [42]:
from dataclasses import dataclass

@dataclass
class Parameters:
    # Preprocessing parameters
    vector_size: int = len(df['padded_encoding'].iloc[0])
    num_words: int = unique_words
    test_size = 0.20         
    random_state = 42
   
    # Model parameters
    embedding_dim: int = 256
    num_layers: int = 2 # number of lstm layers
    num_classes: int = df['subreddit'].nunique()
    #out_size: int = 32
    #tride: int = 2
    #dilation: int = 2
       
    # Training parameters
    epochs: int = 10
    batch_size: int = 128
    learning_rate: float = 0.001
    dropout: float = 0.5
    
params=Parameters()

In [43]:
X_data = torch.tensor(df['padded_encoding'].tolist())
y_data = torch.tensor(df['subreddit'].astype('category').cat.codes.tolist(), dtype=torch.long)

# Create a DataLoader for batching
dataset = TensorDataset(X_data, y_data)

In [44]:
dsl = DatasetLoading(X_data, y_data)
dsl.data_split()
dsl.data_mapping()
dsl.data_loading()

In [45]:
class LSTMNet(nn.ModuleList):

    def __init__(self, params):
        super(LSTMNet, self).__init__()
        
        self.batch_size = params.batch_size
        self.hidden_dim = params.embedding_dim
        self.num_layers = params.num_layers
        self.input_size = params.num_words+1
        
        self.dropout = nn.Dropout(0.5)
        self.embedding = nn.Embedding(self.input_size, self.hidden_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers, batch_first=True)
        
        self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=256)
        self.fc2 = nn.Linear(256, params.num_classes)
        
    def forward(self, x):
        
        h = torch.zeros((self.num_layers, x.size(0), self.hidden_dim))
        c = torch.zeros((self.num_layers, x.size(0), self.hidden_dim))
        
        torch.nn.init.xavier_normal_(h)
        torch.nn.init.xavier_normal_(c)
        
        out = self.embedding(x)
        out, (hidden, cell) = self.lstm(out, (h,c))
        out = self.dropout(out[:, -1, :])
        out = torch.relu_(self.fc1(out))
        out = self.dropout(out)
        out = self.fc2(out)  
        return out

### Training and Testing @ 10 Epochs

In [46]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

num_classes = df['subreddit'].nunique()

lstm_model = LSTMNet(params)
optimizer = optim.RMSprop(lstm_model.parameters(), lr=params.learning_rate)

loader_train = dsl.loader_train
loader_test = dsl.loader_test
y_train = dsl.y_train
y_test = dsl.y_test

def calculate_accuracy(predictions, targets):
    _, predicted = torch.max(predictions, 1)
    correct = (predicted == targets).sum().item()
    total = targets.size(0)
    accuracy = correct / total
    return accuracy

# Training phase
for epoch in range(params.epochs):
    
    # Set model in training mode
    lstm_model.train()
    train_predictions = []
    
    for x_batch, y_batch in loader_train:
        y_batch = y_batch.view(-1).type(torch.LongTensor)  # Convert to 1D LongTensor        
        # Print target labels for debugging

        # Feed the model
        y_pred = lstm_model(x_batch)

        # Reshape y_pred to have the shape (batch_size, num_classes)
        y_pred = y_pred.view(-1, num_classes)

        # Loss calculation
        loss = nn.CrossEntropyLoss()(y_pred, y_batch)
        
        # Clean gradients
        optimizer.zero_grad()
         
        # Gradients calculation
        loss.backward()
         
        # Gradients update
        optimizer.step()

        train_predictions.append(y_pred)

    # Calculate training accuracy
    train_predictions = torch.cat(train_predictions, dim=0)
    train_accuracy = calculate_accuracy(train_predictions, y_train)

    print("Epoch: %d, loss: %.5f, Training Accuracy: %.4f" % (epoch+1, loss.item(), train_accuracy))
    
    # Validation phase
    lstm_model.eval()
    test_predictions = []

    with torch.no_grad():
        for x_batch, y_batch in loader_test:
            y_batch = y_batch.view(-1).type(torch.LongTensor)

            # Feed the model
            y_pred = lstm_model(x_batch)

            # Reshape y_pred to have the shape (batch_size, num_classes)
            y_pred = y_pred.view(-1, num_classes)

            test_predictions.append(y_pred)

        # Calculate testing accuracy
        test_predictions = torch.cat(test_predictions, dim=0)
        test_accuracy = calculate_accuracy(test_predictions, y_test)

        print("Validation Accuracy: %.4f" % test_accuracy)

print('Training complete!')

Epoch: 1, loss: 0.39493, Training Accuracy: 0.5067
Validation Accuracy: 0.8772
Epoch: 2, loss: 0.22821, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 3, loss: 0.21454, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 4, loss: 0.16193, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 5, loss: 0.15040, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 6, loss: 0.16640, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 7, loss: 0.15050, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 8, loss: 0.15260, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 9, loss: 0.13233, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 10, loss: 0.16451, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Training complete!


In [47]:
@dataclass
class Parameters:
    # Preprocessing parameters
    vector_size: int = len(df['padded_encoding'].iloc[0])
    num_words: int = unique_words
    test_size = 0.20         
    random_state = 42
   
    # Model parameters
    embedding_dim: int = 256
    num_layers: int = 2 # number of lstm layers
    num_classes: int = df['subreddit'].nunique()
    #out_size: int = 32
    #tride: int = 2
    #dilation: int = 2
       
    # Training parameters
    epochs: int = 50
    batch_size: int = 128
    learning_rate: float = 0.001
    dropout: float = 0.5
    
params=Parameters()

In [48]:
num_classes = df['subreddit'].nunique()

lstm_model = LSTMNet(params)
optimizer = optim.RMSprop(lstm_model.parameters(), lr=params.learning_rate)

loader_train = dsl.loader_train
loader_test = dsl.loader_test
y_train = dsl.y_train
y_test = dsl.y_test

def calculate_accuracy(predictions, targets):
    _, predicted = torch.max(predictions, 1)
    correct = (predicted == targets).sum().item()
    total = targets.size(0)
    accuracy = correct / total
    return accuracy

# Training phase
for epoch in range(params.epochs):
    
    # Set model in training mode
    lstm_model.train()
    train_predictions = []
    
    for x_batch, y_batch in loader_train:
        y_batch = y_batch.view(-1).type(torch.LongTensor)  # Convert to 1D LongTensor        
        # Print target labels for debugging

        # Feed the model
        y_pred = lstm_model(x_batch)

        # Reshape y_pred to have the shape (batch_size, num_classes)
        y_pred = y_pred.view(-1, num_classes)

        # Loss calculation
        loss = nn.CrossEntropyLoss()(y_pred, y_batch)
        
        # Clean gradients
        optimizer.zero_grad()
         
        # Gradients calculation
        loss.backward()
         
        # Gradients update
        optimizer.step()

        train_predictions.append(y_pred)

    # Calculate training accuracy
    train_predictions = torch.cat(train_predictions, dim=0)
    train_accuracy = calculate_accuracy(train_predictions, y_train)

    print("Epoch: %d, loss: %.5f, Training Accuracy: %.4f" % (epoch+1, loss.item(), train_accuracy))
    
    # Validation phase
    lstm_model.eval()
    test_predictions = []

    with torch.no_grad():
        for x_batch, y_batch in loader_test:
            y_batch = y_batch.view(-1).type(torch.LongTensor)

            # Feed the model
            y_pred = lstm_model(x_batch)

            # Reshape y_pred to have the shape (batch_size, num_classes)
            y_pred = y_pred.view(-1, num_classes)

            test_predictions.append(y_pred)

        # Calculate testing accuracy
        test_predictions = torch.cat(test_predictions, dim=0)
        test_accuracy = calculate_accuracy(test_predictions, y_test)

        print("Validation Accuracy: %.4f" % test_accuracy)

print('Training complete!')

Epoch: 1, loss: 0.34524, Training Accuracy: 0.4667
Validation Accuracy: 0.8772
Epoch: 2, loss: 0.22010, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 3, loss: 0.19532, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 4, loss: 0.15538, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 5, loss: 0.15747, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 6, loss: 0.14592, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 7, loss: 0.14985, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 8, loss: 0.14941, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 9, loss: 0.14599, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 10, loss: 0.15858, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 11, loss: 0.14493, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 12, loss: 0.13485, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 13, loss: 0.14507, Training Accuracy: 0.95

### Training and Testing @ 100 Epochs

In [49]:
@dataclass
class Parameters:
    # Preprocessing parameters
    vector_size: int = len(df['padded_encoding'].iloc[0])
    num_words: int = unique_words
    test_size = 0.20         
    random_state = 42
   
    # Model parameters
    embedding_dim: int = 256
    num_layers: int = 2 # number of lstm layers
    num_classes: int = df['subreddit'].nunique()
    #out_size: int = 32
    #tride: int = 2
    #dilation: int = 2
       
    # Training parameters
    epochs: int = 100
    batch_size: int = 128
    learning_rate: float = 0.001
    dropout: float = 0.5
    
params=Parameters()

In [50]:
num_classes = df['subreddit'].nunique()

lstm_model = LSTMNet(params)
optimizer = optim.RMSprop(lstm_model.parameters(), lr=params.learning_rate)

loader_train = dsl.loader_train
loader_test = dsl.loader_test
y_train = dsl.y_train
y_test = dsl.y_test

def calculate_accuracy(predictions, targets):
    _, predicted = torch.max(predictions, 1)
    correct = (predicted == targets).sum().item()
    total = targets.size(0)
    accuracy = correct / total
    return accuracy

# Training phase
for epoch in range(params.epochs):
    
    # Set model in training mode
    lstm_model.train()
    train_predictions = []
    
    for x_batch, y_batch in loader_train:
        y_batch = y_batch.view(-1).type(torch.LongTensor)  # Convert to 1D LongTensor        
        # Print target labels for debugging

        # Feed the model
        y_pred = lstm_model(x_batch)

        # Reshape y_pred to have the shape (batch_size, num_classes)
        y_pred = y_pred.view(-1, num_classes)

        # Loss calculation
        loss = nn.CrossEntropyLoss()(y_pred, y_batch)
        
        # Clean gradients
        optimizer.zero_grad()
         
        # Gradients calculation
        loss.backward()
         
        # Gradients update
        optimizer.step()

        train_predictions.append(y_pred)

    # Calculate training accuracy
    train_predictions = torch.cat(train_predictions, dim=0)
    train_accuracy = calculate_accuracy(train_predictions, y_train)

    print("Epoch: %d, loss: %.5f, Training Accuracy: %.4f" % (epoch+1, loss.item(), train_accuracy))
    
    # Validation phase
    lstm_model.eval()
    test_predictions = []

    with torch.no_grad():
        for x_batch, y_batch in loader_test:
            y_batch = y_batch.view(-1).type(torch.LongTensor)

            # Feed the model
            y_pred = lstm_model(x_batch)

            # Reshape y_pred to have the shape (batch_size, num_classes)
            y_pred = y_pred.view(-1, num_classes)

            test_predictions.append(y_pred)

        # Calculate testing accuracy
        test_predictions = torch.cat(test_predictions, dim=0)
        test_accuracy = calculate_accuracy(test_predictions, y_test)

        print("Validation Accuracy: %.4f" % test_accuracy)

print('Training complete!')

Epoch: 1, loss: 0.24102, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 2, loss: 0.28415, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 3, loss: 0.20961, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 4, loss: 0.14672, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 5, loss: 0.16217, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 6, loss: 0.14990, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 7, loss: 0.15753, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 8, loss: 0.14421, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 9, loss: 0.14663, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 10, loss: 0.15575, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 11, loss: 0.14184, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 12, loss: 0.13687, Training Accuracy: 0.9511
Validation Accuracy: 0.8772
Epoch: 13, loss: 0.15517, Training Accuracy: 0.95

### LSTM Model Evaluations

In [51]:
import torchmetrics
from torchmetrics import Accuracy
from torchmetrics import Precision
from torchmetrics import Recall
from torchmetrics import F1Score

In [52]:
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Set the model in evaluation mode
lstm_model.eval()
test_predictions = []

# Start evaluation phase
with torch.no_grad():
    for x_batch, y_batch in loader_test:
        y_pred = lstm_model(x_batch)
        test_predictions += torch.argmax(y_pred, dim=-1).cpu().numpy().tolist()

# Flatten the true labels and predicted labels
y_test_flat = y_test.view(-1).cpu().numpy().tolist()
test_predictions_flat = test_predictions

# Calculate accuracy
accuracy = accuracy_score(y_test_flat, test_predictions_flat)

# Calculate precision, recall, and F1 score
precision, recall, f1, _ = precision_recall_fscore_support(y_test_flat, test_predictions_flat, average='weighted')

# Print the metrics
print("Test Accuracy: {:.5f}".format(accuracy))
print("Precision: {:.5f}".format(precision))
print("Recall: {:.5f}".format(recall))
print("F1 Score: {:.5f}".format(f1))

Test Accuracy: 0.87719
Precision: 0.76947
Recall: 0.87719
F1 Score: 0.81981
