In [3]:
import pandas as pd
import numpy as np
import os
import time

In [None]:
from sklearn.model_selection import train_test_split

# Load data from csv train_data.csv
train_data = pd.read_csv("/content/drive/My Drive/depression_data/train_data.csv")
# Prepare data
X = train_data.iloc[:, 0]  # input text data
y = train_data.iloc[:, 1]  # target depression labels (0 or 1)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train

38666     Can't do it anymoreAfter a recent breakup and ...
191242    how to overcome this???been 1 week  since i fi...
31471     I think my BF triggers suicidal ideation.I wou...
57486     Bruh im angy Him playing black Ops one zombies...
134764    Yes! I fucking did it!!! Woohoo I killed that ...
                                ...                        
119879    I need help with getting a pc So I want to get...
103694    An actual message to my 17 year old daughter o...
131932    Hey you, yeah you, read this plz So like a mon...
146867    Tales of my old fuckwad of a teacher. Stories ...
121958    I want to die.Er, I'm actually not fully confi...
Name: text, Length: 183159, dtype: object

In [None]:
X_val

191987    My dad is freaking out over me having high B a...
215623    i never chase girls but if she has black hair ...
143851    HeartbrokenHey everyone, I've been thinking lo...
13563     anyone wana talk im editing video gets really ...
139063    I have collected the holy trinity 1- Headache ...
                                ...                        
160578    Only anime boys look hot with abs Like Camp Bu...
85865     Im pursuing entrepreneurship and I made a new ...
148386    Anyone wanna play destiny 2 on ps4 w me? I wan...
54440     What are you doing step bro? What’s that assau...
16925     rantvent post want advice every single time mo...
Name: text, Length: 45790, dtype: object

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))

def remove_stopwords(text):
  text = [word.lower() for word in text.split() if word.lower() not in stop]

  return " ".join(text)

In [None]:
X_train = X_train.map(remove_stopwords)
X_val = X_val.map(remove_stopwords)

In [None]:
from collections import Counter

def word_count(text):
  count = Counter()
  for i in text.values:
    for word in i.split():
      count[word] += 1
  return count

In [None]:
counter_train = word_count(X_train)
counter_val = word_count(X_val)
len(counter_train)

382479

In [None]:
counter_train

Counter({"can't": 44406,
         'anymoreafter': 4,
         'recent': 1037,
         'breakup': 448,
         'really': 63863,
         'bad': 20362,
         'increase': 262,
         'depression,': 2296,
         'fight': 3594,
         'anymore.': 18318,
         'want': 115136,
         'pain': 13427,
         'stop': 14750,
         "i'm": 148924,
         'probably': 13895,
         'going': 51357,
         'end': 26181,
         'tonight': 2970,
         'everyone': 22122,
         'house': 7446,
         'asleep.': 427,
         'overcome': 655,
         'this???been': 1,
         '1': 3544,
         'week': 8216,
         'since': 24015,
         'finished': 1395,
         '1st': 415,
         'semster': 5,
         'college': 7750,
         'like': 138599,
         'month': 5680,
         'break': 4740,
         '2nd': 537,
         'semester': 1462,
         'staying': 1774,
         'home': 11243,
         'day....': 10,
         'suicidal': 14183,
         'thoughts/anxi

In [None]:
num_words = len(counter_train + counter_val)

In [1]:
# Max number of words in a sequence
max_length = 100

In [None]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index
word_index

{"i'm": 1,
 'like': 2,
 'want': 3,
 'know': 4,
 'feel': 5,
 'life': 6,
 'get': 7,
 'me': 8,
 'people': 9,
 'even': 10,
 'one': 11,
 'time': 12,
 'it': 13,
 'would': 14,
 'really': 15,
 'i’m': 16,
 'think': 17,
 "i've": 18,
 'going': 19,
 'go': 20,
 'never': 21,
 'im': 22,
 'much': 23,
 'friends': 24,
 'day': 25,
 "can't": 26,
 'help': 27,
 'filler': 28,
 'years': 29,
 'things': 30,
 'make': 31,
 'got': 32,
 'good': 33,
 'don’t': 34,
 'fucking': 35,
 'anything': 36,
 'could': 37,
 'way': 38,
 'school': 39,
 'someone': 40,
 'anymore': 41,
 'back': 42,
 'still': 43,
 'see': 44,
 'something': 45,
 'everything': 46,
 'always': 47,
 'need': 48,
 'family': 49,
 'die': 50,
 'better': 51,
 'nothing': 52,
 '\u200d': 53,
 'every': 54,
 'anyone': 55,
 'end': 56,
 'love': 57,
 'live': 58,
 'talk': 59,
 'year': 60,
 'kill': 61,
 'right': 62,
 'say': 63,
 'fuck': 64,
 'ever': 65,
 'work': 66,
 'hate': 67,
 'last': 68,
 'suicide': 69,
 'shit': 70,
 'myself': 71,
 'take': 72,
 'everyone': 73,
 'since':

In [None]:
X_train = tokenizer.texts_to_sequences(X_train)

In [None]:
X_train[0]

[26,
 37867,
 1563,
 2045,
 15,
 76,
 3771,
 114,
 26,
 523,
 41,
 3,
 110,
 125,
 1,
 165,
 19,
 56,
 426,
 73,
 218,
 873]

In [None]:
from keras.utils import pad_sequences

train_pad = pad_sequences(
    X_train, maxlen=max_length, padding='post', truncating='post'
)

In [None]:
train_pad[19]

array([ 3296,    32, 28772,    40,  4552,   515,    66,  4552, 37869,
         233,   200,   123,  4552,  2148,   313,    18,    66,   377,
       17832,   377,   140,    33,   420,  2944,   829,  4552,  2148,
         148,   281,   146,   162,  1228,   853,   925,   853,   498,
         387,  3911, 77289,    26,   176,   963,   207,  4552,  2148,
         233,   660,   122,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0], dtype=int32)

In [None]:
X_val = tokenizer.texts_to_sequences(X_val)
val_pad = pad_sequences(
    X_val, maxlen=max_length, padding='post', truncating='post'
)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import L2

model = Sequential()
model.add(
    Embedding(
        num_words, 32, input_length=max_length, mask_zero=True
    )
)
model.add(LSTM(units=500, dropout=0.5))
model.add(Flatten())
model.add(Dense(128, activation='relu', kernel_regularizer=L2(0.001)))
# model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))

# Create optimizer object with adjusted learning rate
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

In [None]:
# Compile model with optimizer
  model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])

  # Train model
  model.fit(
      train_pad, y_train, epochs=10, batch_size=32, validation_data=(val_pad, y_val)
  )

model.save("/content/drive/My Drive/depression_data_one/depression_rnn_model")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
)

test_data = pd.read_csv("/content/drive/My Drive/depression_data/test_data.csv")
X_test = test_data.iloc[:, 0]
y_test = test_data.iloc[:, 1]
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = tf.keras.preprocessing.sequence.pad_sequences(
    X_test_seq,
    maxlen=max_length,
    padding="post",
)
y_pred = np.round(model.predict(X_test_pad)).astype(int)
print(y_pred)
print(y_test)
# y_pred = y_pred.reshape(y_test.shape)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred)

print("Accuracy: {:.3f}".format(accuracy))
print("Precision: {:.3f}".format(precision))
print("Recall: {:.3f}".format(recall))
print("F1 Score: {:.3f}".format(f1))
print("AUC-ROC Score: {:.3f}".format(auc_roc))

[[1]
 [1]
 [0]
 ...
 [1]
 [1]
 [1]]
0        0
1        1
2        0
3        1
4        1
        ..
26000    1
26001    0
26002    1
26003    1
26004    0
Name: label, Length: 26005, dtype: int64
Accuracy: 0.725
Precision: 0.648
Recall: 0.985
F1 Score: 0.782
AUC-ROC Score: 0.725


In [None]:
# print confusion matrix all 4 parts
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("True Negatives: {}".format(confusion_matrix(y_test, y_pred)[0][0]))
print("False Negatives: {}".format(confusion_matrix(y_test, y_pred)[1][0]))
print("True Positives: {}".format(confusion_matrix(y_test, y_pred)[1][1]))
print("False Positives: {}".format(confusion_matrix(y_test, y_pred)[0][1]))

Confusion Matrix:
[[ 6030  6959]
 [  193 12823]]
True Negatives: 6030
False Negatives: 193
True Positives: 12823
False Positives: 6959


In [None]:
import pickle

# saving
with open('/content/drive/My Drive/depression_data_one/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# # loading
# with open('tokenizer.pickle', 'rb') as handle:
#     tokenizer = pickle.load(handle)

In [8]:
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the saved tokenizer
tokenizer_path = '/content/drive/My Drive/depression_data_one/tokenizer.pickle'
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load the saved model
model_path = '/content/drive/My Drive/depression_data_one/depression_rnn_model'
model = load_model(model_path)

# Define the sentence to predict
sentence = "I'm unhappy"

# Tokenize and preprocess the input sentence
sequence = tokenizer.texts_to_sequences([sentence])
padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')  # Replace MAX_SEQUENCE_LENGTH with your desired value

# Make the prediction
prediction = model.predict(padded_sequence)

# Print the predicted value
print(np.round(prediction).astype(int))


[[1]]


In [9]:
sentence = "I'm killing it."
sequence = tokenizer.texts_to_sequences([sentence])
padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')  # Replace MAX_SEQUENCE_LENGTH with your desired value

# Make the prediction
prediction = model.predict(padded_sequence)

# Print the predicted value
print(np.round(prediction).astype(int))

[[1]]
