In [1]:
import numpy as np
import pandas as pd

In [2]:
data=pd.read_csv("depression.csv")

In [3]:
data.head()

Unnamed: 0,clean_text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [4]:
data.shape

(7731, 2)

In [5]:
df=data.dropna()
df.shape

(7731, 2)

In [6]:
df["is_depression"] = df["is_depression"].map({0: "Not in Depression", 1: "Depression"})
print(df.head())

                                          clean_text is_depression
0  we understand that most people who reply immed...    Depression
1  welcome to r depression s check in post a plac...    Depression
2  anyone else instead of sleeping more when depr...    Depression
3  i ve kind of stuffed around a lot in my life d...    Depression
4  sleep is my greatest and most comforting escap...    Depression


In [7]:
df.is_depression.unique()

array(['Depression', 'Not in Depression'], dtype=object)

In [8]:
df.is_depression.value_counts()

Not in Depression    3900
Depression           3831
Name: is_depression, dtype: int64

In [9]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [10]:
# remove whitespaces
df['clean_text']=df['clean_text'].str.strip()
# lowercase the text
df['clean_text'] = df['clean_text'].str.lower()
#remove punctuation
punc = string.punctuation
table = str.maketrans('','',punc)
df['clean_text']=df['clean_text'].apply(lambda x: x.translate(table))
# tokenizing each message
df['clean_text']=df.apply(lambda x: x['clean_text'].split(' '),axis=1)
# removing stopwords
df['cleaned_text'] = df.apply(lambda x: [word for word in x['clean_text'] if word not in stopwords.words('english')],axis=1)
# stemming
ps = PorterStemmer()
df['clean_text']= df.apply(lambda x: [ps.stem(word) for word in x['clean_text']],axis=1)
# remove single letter words
df['clean_text'] = df.apply(lambda x: ' '.join([word for word in x['clean_text'] if len(word)>1]),axis=1)


In [11]:
X = np.array(df["clean_text"])
y = np.array(df["is_depression"])

In [12]:
X[0]

'we understand that most peopl who repli immedi to an op with an invit to talk privat mean onli to help but thi type of respons usual lead to either disappoint or disast it usual work out quit differ here than when you say pm me anytim in casual social context we have huge admir and appreci for the goodwil and good citizenship of so mani of you who support other here and flag inappropri content even more so becaus we know that so mani of you are struggl yourselv we re hard at work behind the scene on more inform and resourc to make it easier to give and get qualiti help here thi is just small start our new wiki page explain in detail whi it much better to respond in public comment at least until you ve gotten to know someon it will be maintain at depress wiki privat contact and the full text of the current version is below summari anyon who while act helper invit or accept privat contact pm chat or ani kind of offsit commun earli in the convers is show either bad intent or bad judgemen

In [13]:
y[0:10]

array(['Depression', 'Depression', 'Depression', 'Depression',
       'Depression', 'Depression', 'Depression', 'Depression',
       'Depression', 'Depression'], dtype=object)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv_X = cv.fit_transform(X) # Fit the Data

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cv_X, y, test_size=0.2, random_state=42)

In [16]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)

DecisionTreeClassifier()

In [17]:
predict = dt.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score
print('Accuracy of DecisionTreeClassifier',accuracy_score(y_test,predict)*100)

Accuracy of DecisionTreeClassifier 92.63089851325145


In [19]:
from sklearn.metrics import confusion_matrix
print('Confusuion matrix of DecisionTreeClassifier\n',confusion_matrix(y_test,predict))

Confusuion matrix of DecisionTreeClassifier
 [[695  69]
 [ 45 738]]


In [20]:
from sklearn.metrics import classification_report
print('Classification report of DecisionTreeClassifier\n\n',classification_report(y_test,predict))

Classification report of DecisionTreeClassifier

                    precision    recall  f1-score   support

       Depression       0.94      0.91      0.92       764
Not in Depression       0.91      0.94      0.93       783

         accuracy                           0.93      1547
        macro avg       0.93      0.93      0.93      1547
     weighted avg       0.93      0.93      0.93      1547

