## Rough Draft: Text-Based Depression Detection Using a Subset of Partially Cleaned Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import spacy
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from nltk.tokenize import RegexpTokenizer, word_tokenize
import re

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, GRU, Input, Flatten, LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Input, Embedding, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

In [2]:
# Read the three raw corpora (depression, anxiety, baseline text) from ../data.
raw_paths = (
    '../data/depression_30k.csv',
    '../data/anxiety_20k.csv',
    '../data/basetext.csv',
)
depression, anxiety, basetext = [pd.read_csv(csv_path) for csv_path in raw_paths]

In [3]:
# Drop the leftover 'Unnamed: 0' column from each frame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once the column is already gone is a no-op
# instead of a KeyError.
depression = depression.drop(columns='Unnamed: 0', errors='ignore')
anxiety = anxiety.drop(columns='Unnamed: 0', errors='ignore')
basetext = basetext.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Draw fixed-size subsets for this rough draft.
# A fixed random_state makes the subsets, and therefore every downstream
# metric, reproducible across kernel restarts.
dep = depression.sample(2000, random_state=42)
bas = basetext.sample(1000, random_state=42)
anx = anxiety.sample(1000, random_state=42)

In [5]:
# Stack the depression and baseline samples into one frame; the anxiety sample
# (anx) is not included here. Row indices from the source frames are kept.
sample_df = pd.concat([dep,bas])

In [6]:
analyzer = SentimentIntensityAnalyzer()

In [7]:
def sentiment(row, scorer=None):
    """Score one text with VADER and return its four polarity values.

    Parameters
    ----------
    row : str
        The text to score (one element of the column this is applied to).
    scorer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a dict with
        the keys ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'``. Defaults to
        the notebook-level ``analyzer`` created in the preceding cell.

    Returns
    -------
    pandas.Series
        ``[neg, neu, pos, compound]`` in that order, so that
        ``Series.apply(sentiment)`` expands into a four-column DataFrame.
    """
    # Reuse a single analyzer instead of constructing a new
    # SentimentIntensityAnalyzer for every row; the construction work is the
    # same on every call, so repeating it per text only slows the apply down.
    if scorer is None:
        scorer = analyzer
    res = scorer.polarity_scores(row)
    return pd.Series([res['neg'], res['neu'], res['pos'], res['compound']])

In [8]:
# Blank out cells whose whole value is the '[removed]' or '[deleted]' marker.
for placeholder in ('[removed]', '[deleted]'):
    sample_df.replace(placeholder, ' ', inplace=True)

In [9]:
# Replace every remaining missing value with a single space so that the
# title + selftext concatenation below never yields NaN.
sample_df.fillna(' ',inplace=True)

In [10]:
# Build one lower-cased text field per post: "<title> <selftext>".
title_and_body = sample_df['title'] + " " + sample_df['selftext']
sample_df['full_text'] = title_and_body.astype(str).str.lower()

In [11]:
# Encode the target: 'depression' posts become 1, 'happy' and
# 'CasualConversation' posts become 0.
# NOTE(review): Series.map yields NaN for any value missing from the dict, and
# the column is overwritten in place — re-running this cell on the already
# encoded 0/1 values would turn the whole column into NaN.
sample_df['subreddit'] = sample_df['subreddit'].map({'happy':0,'CasualConversation':0, 'depression':1})

In [12]:
# Score every post with sentiment() and store the four returned values as new
# columns ('comp' holds the 'compound' score).
sample_df[['neg','neu','pos','comp']] = sample_df['full_text'].apply(sentiment)

In [13]:
sample_df.head()

Unnamed: 0,title,author,selftext,created_utc,subreddit,full_text,neg,neu,pos,comp
7977,I just wanna cry,William20022,"Whys it so hard, why cant I just cry why I can...",1601632736,1,"i just wanna cry whys it so hard, why cant i j...",0.166,0.764,0.07,-0.7748
26325,1pm slump,FlippantRaccoon,I’m finding lately that I’ll be fine until abo...,1598999736,1,1pm slump i’m finding lately that i’ll be fine...,0.044,0.863,0.093,0.7657
14168,I was doing so good.....,SubArticFawn,I was doing good today. I cleaned a lot. Did l...,1600716852,1,i was doing so good..... i was doing good toda...,0.033,0.83,0.137,0.933
15658,I can’t fucking take it anymore,Roof-Witty,Im here mostly to vent so sorry if whatever I ...,1600512406,1,i can’t fucking take it anymore im here mostly...,0.145,0.702,0.152,0.0061
9133,I just want to kill myself after unsuccessful ...,vvredditor78,"Hi All, I want to kill myself. I did rhinoplas...",1601466226,1,i just want to kill myself after unsuccessful ...,0.298,0.665,0.038,-0.9638


### Basic LSTM with Embedding Layer


In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
# Keep only the model input text and the binary label.
text_labels = sample_df[['full_text','subreddit']]

In [16]:
text_labels.head()

Unnamed: 0,full_text,subreddit
7977,"i just wanna cry whys it so hard, why cant i j...",1
26325,1pm slump i’m finding lately that i’ll be fine...,1
14168,i was doing so good..... i was doing good toda...,1
15658,i can’t fucking take it anymore im here mostly...,1
9133,i just want to kill myself after unsuccessful ...,1


In [17]:
# Shuffle the rows: sample_df was built by stacking the depression rows on top
# of the baseline rows, and the split further down is purely positional.
# A fixed random_state keeps that split (and the reported accuracy) reproducible.
text_labels = text_labels.sample(frac=1, random_state=42)

In [18]:
# Features are the text (missing values replaced by ''); targets are the 0/1 labels.
X = text_labels['full_text'].fillna('')
y = text_labels['subreddit']

In [19]:
# Parameters for tokenization and for the positional train/validation split
max_len = 100                # passed as maxlen to pad_sequences
training_samples = 2500      # the first 2500 shuffled rows are used for training
validation_samples = 500     # the next 500 rows are used as validation data
max_words = 10_000           # passed as num_words to Tokenizer

In [20]:
#instantiate the tokenizer
tokenizer = Tokenizer(num_words=max_words)

In [21]:
# Fit the tokenizer
# NOTE(review): the vocabulary is learned from all of X, i.e. before the
# train/validation split, so the validation text influences the word index.
tokenizer.fit_on_texts(X)

In [22]:
#create the sequences
sequences = tokenizer.texts_to_sequences(X)

In [23]:
#pad sequences
data = pad_sequences(sequences, maxlen=max_len)

In [24]:
# Positional split: the first `training_samples` rows train the model and the
# following `validation_samples` rows are held out.
split_end = training_samples + validation_samples

X_train, X_test = data[:training_samples], data[training_samples:split_end]
y_train, y_test = y.iloc[:training_samples], y.iloc[training_samples:split_end]

In [25]:
X_train.shape

(2500, 100)

In [26]:
X_test.shape

(500, 100)

In [27]:
y_train.shape

(2500,)

In [28]:
# Insert a middle axis: (n_samples, max_len) -> (n_samples, 1, max_len).
# Sizes are read from the arrays themselves rather than hard-coded as
# 2500 / 500 / 100, so the cell keeps working if the split or padding
# parameters are changed.
X_train_reshape = X_train.reshape(X_train.shape[0], -1, X_train.shape[1])
X_test_reshape = X_test.reshape(X_test.shape[0], -1, X_test.shape[1])

In [29]:
# Cast the reshaped token-id arrays to 32-bit integers.
X_train_reshape = X_train_reshape.astype(np.int32)
X_test_reshape = X_test_reshape.astype(np.int32)

In [30]:
# Cast the labels to 32-bit integers.
# NOTE(review): an integer cast fails on NaN, so this presumably relies on every
# row's subreddit having been present in the label map — confirm.
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

In [31]:
X_train_reshape.shape

(2500, 1, 100)

In [32]:
X_test_reshape.shape

(500, 1, 100)

In [35]:
# Embedding -> LSTM encoder followed by a regularised dense classifier head.
model_k = Sequential()
# The embedding's vocabulary size is tied to max_words (the Tokenizer's
# num_words) instead of repeating the literal 10000, so the two cannot drift
# apart when the tokenization parameters are changed.
model_k.add(Embedding(max_words, 32))
model_k.add(LSTM(32))
# Three identical 64-unit blocks with light L2 regularisation and 50% dropout.
for _ in range(3):
    model_k.add(Dense(64,activation='relu',kernel_regularizer=l2(0.001)))
    model_k.add(Dropout(0.5))
# Wider final hidden block with stronger L2 and slightly less dropout.
model_k.add(Dense(128,activation='relu',kernel_regularizer=l2(0.01)))
model_k.add(Dropout(0.4))

# Single sigmoid unit: output is the probability of label 1.
model_k.add(Dense(1,activation='sigmoid'))

In [36]:
# Binary classification setup: Adam optimiser, cross-entropy loss, accuracy metric.
model_k.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [37]:
# Train for 20 epochs, evaluating on the held-out slice after every epoch.
# Note: the 2-D (n_samples, max_len) arrays are used here, not the
# X_train_reshape / X_test_reshape arrays built above.
history = model_k.fit(X_train, y_train, validation_data=(X_test,y_test), epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [38]:
# Validation accuracy averaged over all 20 epochs.
# NOTE(review): this mean includes the earliest epochs; the final or best
# epoch's value may be a more representative score for the trained model.
np.mean(history.history['val_accuracy'])

0.8011000007390976

### BERT to BiLSTM Small Sample

In [39]:
binary_sample = text_labels.sample(200,random_state=42)

In [40]:
# Rebind X and y to the 200-row sample; the earlier X / y (full shuffled set)
# are no longer reachable under these names after this cell.
X = binary_sample['full_text']
y = binary_sample['subreddit']

In [41]:
# Hold out part of the 200-row sample using train_test_split's default test size.
# random_state makes the split reproducible; stratify=y keeps the class ratio
# the same in both parts, which matters with so few rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [42]:
import numpy as np
import spacy
from sklearn.base import BaseEstimator, TransformerMixin

class WordVectorTransformer(TransformerMixin,BaseEstimator):
    """scikit-learn style transformer that maps each text to one spaCy document vector.

    Parameters
    ----------
    model : str
        Name of the spaCy pipeline handed to ``spacy.load``.
    """
    def __init__(self, model="en_trf_distilbertbaseuncased_lg"):    #put bert embeddings here
        self.model = model

    def _get_nlp(self):
        """Return the spaCy pipeline for ``self.model``, loading it at most once.

        The original code called ``spacy.load`` inside every ``transform`` call,
        so the pipeline was loaded again for the train set and again for the
        test set. The loaded pipeline is now cached on the instance.
        """
        # getattr keeps this safe on instances that have no cache yet, and the
        # name comparison forces a reload if `model` was changed afterwards
        # (e.g. through set_params).
        if getattr(self, "_nlp_name", None) != self.model:
            self._nlp = spacy.load(self.model)
            self._nlp_name = self.model
        return self._nlp

    def fit(self,X,y=None):
        """No-op: nothing is learned from ``X``. Returns ``self``."""
        return self

    def transform(self,X):
        """Embed every document in ``X``.

        Returns a 2-D array with one row per document, where each row is
        ``nlp(doc).vector`` for that document.
        """
        nlp = self._get_nlp()
        return np.concatenate([nlp(doc).vector.reshape(1,-1) for doc in X])

In [43]:
bertvect = WordVectorTransformer()

In [44]:
# Embed the train and test texts; fit() on this transformer just returns self,
# so fit_transform and transform produce vectors the same way.
X_train_bvect = bertvect.fit_transform(X_train)
X_test_bvect = bertvect.transform(X_test)

In [45]:
# One-hot encode the labels.
# NOTE(review): y_train_vect / y_test_vect are not referenced by any later cell
# shown in this notebook — model_l is fit on y_train / y_test directly.
y_train_vect = tf.keras.utils.to_categorical(y_train)
y_test_vect = tf.keras.utils.to_categorical(y_test)

In [46]:
X_train_bvect.shape

(150, 768)

In [47]:
X_test_bvect.shape

(50, 768)

In [48]:
# Append a trailing axis of size 1: (n_samples, dim) -> (n_samples, dim, 1),
# the 3-D layout the Conv1D-first model below is fed with.
# The vector width is read from the array instead of hard-coding 768, so a
# spaCy model with a different vector size works without editing this cell.
X_train_reshape = X_train_bvect.reshape(-1, X_train_bvect.shape[1], 1)
X_test_reshape = X_test_bvect.reshape(-1, X_test_bvect.shape[1], 1)

In [56]:
# save numpy array as npy file
from numpy import asarray
from numpy import save
from numpy import load

In [49]:
X_train_reshape.shape

(150, 768, 1)

In [50]:
X_test_reshape.shape

(50, 768, 1)

In [51]:
# Conv1D feature extractor -> max pooling -> bidirectional LSTM, followed by
# two regularised dense blocks and a sigmoid output unit.
model_l = Sequential([
    Conv1D(32, 7, activation = 'relu'),
    MaxPooling1D(),
    Bidirectional(LSTM(24)),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [52]:
# Binary classification setup: Nadam optimiser, cross-entropy loss, accuracy metric.
model_l.compile(
    loss='binary_crossentropy',
    optimizer='nadam',
    metrics=['accuracy'],
)

In [53]:
# Train for 20 epochs on the reshaped embedding arrays with the integer 0/1
# labels, evaluating on the held-out part after every epoch.
history_l = model_l.fit(X_train_reshape, y_train, validation_data=(X_test_reshape,y_test), epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


#### Testing Saving/Loading Array and Model

In [55]:
# Write the test array to disk through the numpy namespace (same function as
# the bare `save` imported from numpy).
np.save('X_test_reshape', X_test_reshape)

In [67]:
# Read the saved array back through the numpy namespace (same function as the
# bare `load` imported from numpy).
reloaded_X_test = np.load('X_test_reshape.npy')

In [58]:
# Persist the trained model.
# NOTE(review): the path ends in '.hd' rather than '.h5'; the log output below
# reports assets written under ./model_l.hd, i.e. a directory was created —
# confirm that this format is the intended one.
model_l.save('./model_l.hd')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ./model_l.hd\assets


In [60]:
reloaded_model = load_model('./model_l.hd')

In [61]:
preds = model_l.predict(X_test_reshape)
# The model ends in a single sigmoid unit, so predict() returns probabilities
# rather than class labels: every value in preds is the predicted probability
# that the text is depressed text (label 1).

In [62]:
# Predict with the reloaded model on the reloaded array. The original cell fed
# X_test_reshape here, which left reloaded_X_test unused and the array
# round-trip untested; comparing this result with preds now checks both the
# array save/load and the model save/load.
reloaded_preds = reloaded_model.predict(reloaded_X_test)

In [64]:
preds[:10]

array([[0.9351281 ],
       [0.90173125],
       [0.9184442 ],
       [0.9563515 ],
       [0.9560249 ],
       [0.8836709 ],
       [0.8895943 ],
       [0.82933843],
       [0.94032633],
       [0.93107325]], dtype=float32)

In [65]:
reloaded_preds[:10]

array([[0.9351281 ],
       [0.90173125],
       [0.9184442 ],
       [0.9563515 ],
       [0.9560249 ],
       [0.8836709 ],
       [0.8895943 ],
       [0.82933843],
       [0.94032633],
       [0.93107325]], dtype=float32)

### Using only Vader Sentiments to Predict Depression

In [665]:
# Draw a seeded 200-row sample and keep only the four sentiment scores plus the label.
binary_sent_sample = sample_df.sample(200,random_state=7)[['neg','neu','pos','comp','subreddit']]
# Class balance of the sample.
binary_sent_sample['subreddit'].value_counts()

1    115
0     85
Name: subreddit, dtype: int64

In [666]:
# Features are the four sentiment score columns; the target is the 0/1 label.
X = binary_sent_sample.drop(columns='subreddit')
y = binary_sent_sample['subreddit']

In [668]:
# Hold out part of the 200-row sample using train_test_split's default test size.
# random_state makes the split reproducible; stratify=y keeps the class ratio
# the same in both parts, which matters with so few rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [689]:
# Minimal baseline: one 12-unit hidden layer over the feature columns of
# X_train, then a sigmoid output for the binary label.
model_s = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(12, activation='relu'),
    Dense(1, activation='sigmoid'),
])

#Architecture might be too simple here; make it (a bit) more robust and see if accuracy improves.
#Also go back and set things up so the sample for this sentiment-based test is the same one used for the BERT test, then compare the two.

In [685]:
# Binary classification setup: Adam optimiser, cross-entropy loss, accuracy metric.
model_s.compile(
    metrics=['accuracy'],
    loss='binary_crossentropy',
    optimizer='adam',
)

In [686]:
# Train silently (verbose=0) for 300 epochs with batches of 16, recording
# validation metrics on the held-out part after every epoch.
history_s = model_s.fit(X_train,y_train, validation_data=(X_test,y_test), epochs=300, verbose=0, batch_size=16)

In [687]:
# Validation accuracy averaged over all 300 epochs.
# NOTE(review): this mean includes the earliest epochs; the final or best
# epoch's value may be a more representative score for the trained model.
np.mean(history_s.history['val_accuracy'])

0.6966000071167946

In [1]:
# Learning curves for the sentiment-only model.
# Requires the imports cell to have run first: the NameError recorded below
# came from executing this cell in a fresh kernel (execution count 1).
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(history_s.history['accuracy'], label='Train')
# X_test / y_test were passed as validation_data, so this curve is the
# per-epoch validation accuracy.
ax.plot(history_s.history['val_accuracy'], label='Test')
# Title and axis labels so the figure can be read on its own.
ax.set(title='Sentiment-only model accuracy', xlabel='Epoch', ylabel='Accuracy')
ax.legend();

NameError: name 'plt' is not defined

In [3]:
nlp = spacy.load("en_trf_distilbertbaseuncased_lg")

In [None]:
#doc1 = nlp(depression)
#doc2 = nlp(anxiety)
#doc3 = nlp(neutral)