## 1. Importing data from Kaggle

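We used the Reddit mental-illness dataset from Kaggle (`kamaruladha/mental-disorders-identification-reddit-nlp`). The dataset contains posts from mental-health subreddits; after dropping missing and removed posts, the copy used in this notebook has about 580,000 posts from 6 subreddits (BPD, Anxiety, depression, mentalillness, bipolar and schizophrenia). The dataset was collected using the pushshift.io API. The dataset contains the following columns: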
* **title**: The title of the post
* **selftext**: The text of the post
* **subreddit**: The subreddit the post came from
* **created_utc**: The time the post was created in UTC
* **over_18**: Whether the post is marked as 18+ (NSFW)


In [None]:
! pip install -q kaggle
! chmod 600 ~/.kaggle/kaggle.json
! mv -v /content/train /content/drive/MyDrive/RAZI
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! kaggle datasets download kamaruladha/mental-disorders-identification-reddit-nlp
! unzip mental-disorders-identification-reddit-nlp.zip -d data

## 2. Importing the libraries

In [1]:
# --- Standard library ---------------------------------------------------------
import re
import string
import warnings
from collections import Counter

# Disable warnings before the heavy third-party imports so their import-time
# warnings are silenced as well.
warnings.filterwarnings('ignore')

# --- Third-party --------------------------------------------------------------
import gensim
import nltk
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from wordcloud import WordCloud
#from spellchecker import SpellChecker

# --- Notebook-wide settings ---------------------------------------------------
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

# Download required nltk data
nltk.download('punkt')
nltk.download('stopwords')


ModuleNotFoundError: No module named 'spacy'

## 3. Loading the data

In [2]:
# Read the raw posts from disk and preview the first five rows.
RAW_CSV_PATH = "../data/mental_disorders_reddit.csv"
df = pd.read_csv(RAW_CSV_PATH)
df.head(5)

Unnamed: 0,title,selftext,created_utc,over_18,subreddit
0,Life is so pointless without others,Does anyone else think the most important part...,1650356960,False,BPD
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,1650356660,False,BPD
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,1650355379,False,BPD
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",1650353430,False,BPD
4,help,[removed],1650350907,False,BPD


## 4. Prepare data for training

In [None]:
# Number of missing values in each column.
df.isna().sum()

title             46
selftext       33691
created_utc        0
over_18            0
subreddit          0
dtype: int64

In [3]:
# Discard every row that has at least one missing value, then confirm that
# no nulls remain.
df = df.dropna(axis=0, how='any')
df.isna().sum()

title          0
selftext       0
created_utc    0
over_18        0
subreddit      0
dtype: int64

In [4]:
# Drop rows whose text or title is just a "removed" placeholder. Both the plain
# and the backslash-escaped spelling of the marker are matched.
removed_markers = ['\\[removed\\]', '[removed]']
is_removed = df['selftext'].isin(removed_markers) | df['title'].isin(removed_markers)
df.drop(index=df.loc[is_removed].index, inplace=True)

In [5]:
# Build a single text field per post: "<title> <selftext>".
df["Sentence"] = df["title"].add(' ').add(df["selftext"])

In [None]:
# Keep only the model input (Sentence) and the label (subreddit).
df = df.loc[:, ['Sentence', 'subreddit']]

In [None]:
# Number of posts per subreddit (class distribution before balancing).
df.loc[:, "subreddit"].value_counts()

BPD              212825
Anxiety          161629
depression       121202
mentalillness     38161
bipolar           35672
schizophrenia     11727
Name: subreddit, dtype: int64

In [7]:
# Balance the classes: down-sample every subreddit to the size of the rarest one.

# Fixed seed so that re-running the notebook selects the same rows.
RANDOM_SEED = 42
NUM_ROWS_TOKEEP = int(df['subreddit'].value_counts().min())

# Draw one sample per class and concatenate them in a single call, instead of
# growing a frame inside the loop starting from an empty DataFrame.
df = pd.concat(
    [
        df[df['subreddit'] == label].sample(n=NUM_ROWS_TOKEEP, random_state=RANDOM_SEED)
        for label in df['subreddit'].unique()
    ]
)
df["subreddit"].value_counts()


BPD              11727
bipolar          11727
depression       11727
Anxiety          11727
schizophrenia    11727
mentalillness    11727
Name: subreddit, dtype: int64

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [9]:


# Resources shared by preprocess_text: the English stop-word list as a set
# (fast membership tests) and a Porter stemmer instance.
nltk.download('stopwords')
StopWords = set(stopwords.words('english'))
ps = PorterStemmer()
#spell = SpellChecker()

# Patterns are compiled once at definition time rather than on every call.
# They are applied to lower-cased text, hence the lower-case-only classes.
_HTML_TAG_RE = re.compile(r'</?[a-z][^<>]*>')      # "<br>", "</p>", "<a href=...>"; not "<3"
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')         # anything but ASCII letters, digits, whitespace
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise one post into a space-separated string of stemmed tokens.

    Steps, in order:

    1. Lower-case the text.
    2. Replace HTML tags and URLs with a space. This has to happen while the
       punctuation is still present, otherwise neither pattern can match.
    3. Drop stop words. A token is compared with its surrounding punctuation
       trimmed so that contractions from the stop-word list ("don't",
       "you're") are recognised.
    4. Delete ASCII punctuation inside the remaining tokens
       ("well-being" -> "wellbeing").
    5. Replace every character that is not an ASCII letter, digit or
       whitespace with a space. This also removes emojis and other symbols,
       so no separate emoji pattern is needed.
    6. Drop stop words again (steps 4-5 can expose new ones, e.g. "the,"),
       stem each token with the Porter stemmer and join with single spaces.

    Relies on the module-level ``StopWords`` set and ``ps`` stemmer.

    Args:
        text: The raw post text; must be a ``str``.

    Returns:
        The cleaned text. May be an empty string if nothing survives.
    """
    text = text.lower()

    # Tags first: a URL inside an attribute would otherwise swallow the
    # closing part of its tag, because the URL pattern runs to the next space.
    text = _HTML_TAG_RE.sub(' ', text)
    text = _URL_RE.sub(' ', text)

    kept = []
    for token in text.split():
        if token.strip(string.punctuation) in StopWords:
            continue
        kept.append(token.translate(_PUNCT_TABLE))

    text = _NON_ALNUM_RE.sub(' ', ' '.join(kept))

    # split() with no argument also collapses runs of whitespace (including
    # newlines) and discards tokens that became empty.
    return ' '.join(ps.stem(word) for word in text.split() if word not in StopWords)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Clean every post in place of its raw text.
df['Sentence'] = df['Sentence'].map(preprocess_text)

In [None]:
# Write the processed frame to disk, leaving out the index column.
df.to_csv(path_or_buf='../data/processed_df.csv', index=False)