In [1]:
# Required installations
# !pip cache purge
# !python3 -m pip install -U scikit-learn scipy
# !pip install nltk
# !pip install keras
# !pip install gensim
# !pip install matplotlib
# !pip install pandas
# !pip install scikit-learn
# !pip install nltk
# !pip install tensorflow

In [2]:
# Required imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from collections import Counter

stop_words = set(stopwords.words('english'))

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling2D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
from keras.models import Model
from keras import regularizers

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

In [3]:
# cleaning data
def clean_post(post):
    post = post.lower()
    post = re.sub(r"\n", " ", post)
    post = re.sub("[\<\[].*?[\>\]]", " ", post)
    post = re.sub(r"[^a-z ]", " ", post)
    post = re.sub(r"\b\w{1,3}\b", " ", post)
    return " ".join([x for x in post.split() if x not in stop_words])

In [4]:
# Different techniques for tackling class imbalance
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE

def balance_data(x, y, _type):
    if _type == 0:
        ros = RandomOverSampler(random_state=42)
        return ros.fit_resample(x, y)
    elif _type == 1:
        rus = RandomUnderSampler(random_state=42, replacement=True)
        return rus.fit_resample(x, y)
    elif _type == 2:
        smote = SMOTE()
        return smote.fit_resample(x, y)
    elif _type == 3:
        nm = NearMiss()
        return nm.fit_resample(x, y)
    elif _type == 5:
        cc = ClusterCentroids()
        return cc.fit_resample(x, y)
    elif _type == 6:
        tl = TomekLinks()
        return tl.fit_resample(x, y)
    return x, y
    # Another technique is penalizing the algo with class_weight=balanced, using stratified cross validation

In [5]:
# Load data
data = pd.read_csv('../split_data/train_and_valid.csv')
data = shuffle(data)

# Class split stats
print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())
X = data['post'].apply(lambda post: clean_post(post))
label_encoder = LabelEncoder()
y1 = label_encoder.fit_transform(np.array(data['mental_disorder']))
y = to_categorical(y1)

# 70-15-15 split (test data is unseen)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.176, random_state=321)

                mental_disorder                             
                          count unique            top   freq
mental_disorder                                             
EDAnonymous               12339      1    EDAnonymous  12339
addiction                  6515      1      addiction   6515
adhd                      38786      1           adhd  38786
alcoholism                 5026      1     alcoholism   5026
anxiety                   48971      1        anxiety  48971
autism                     7583      1         autism   7583
bipolarreddit              4929      1  bipolarreddit   4929
bpd                       20606      1            bpd  20606
depression                99809      1     depression  99809
healthanxiety              7373      1  healthanxiety   7373
lonely                    20103      1         lonely  20103
ptsd                       7336      1           ptsd   7336
schizophrenia              7351      1  schizophrenia   7351
socialanxiety           

In [6]:
# Using keras tokenizer on text for pre-processing
MAX_WORDS_LIMIT = 30000
tokenizer = Tokenizer(num_words=MAX_WORDS_LIMIT, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
tokenizer.fit_on_texts(X_train)
with open('../models/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)
word_index = tokenizer.word_index
print(f'Unique tokens found: {len(word_index)}')

Unique tokens found: 110301


In [7]:
# Convert  texts to sequence of integers
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_valid = tokenizer.texts_to_sequences(X_valid)

# Limit size of train/validation/test sequences to 200 and pad the sequence
X_train = pad_sequences(sequences_train, maxlen=200)
X_valid = pad_sequences(sequences_valid, maxlen=X_train.shape[1])
print(f'Shape of X train tensor: {X_train.shape}, X validation tensor: {X_valid.shape}')

# Convert target to array
y_train, y_valid = np.asarray(y_train), np.asarray(y_valid)
print(f'Shape of y train tensor: {y_train.shape}, y validation tensor: {y_valid.shape}')

Shape of X train tensor: (298700, 200), X validation tensor: (63800, 200)
Shape of y train tensor: (298700, 15), y validation tensor: (63800, 15)


In [8]:
# Creating word embeddings via pre-trained word2vec model
WORD_EMBEDDING_DIM = 300
word_vectors = KeyedVectors.load_word2vec_format('../reddit_mental_health_dataset/GoogleNews-vectors-negative300.bin', binary=True)
embeddings_matrix = np.zeros((MAX_WORDS_LIMIT, WORD_EMBEDDING_DIM))

# Computing embeddings matrix and embedding layer
for word, i in word_index.items():
    if i >= MAX_WORDS_LIMIT:
        break
    try:
        embeddings_matrix[i] = word_vectors[word]
    except:
        embeddings_matrix[i] = np.zeros(WORD_EMBEDDING_DIM)
embedding_layer = Embedding(MAX_WORDS_LIMIT, WORD_EMBEDDING_DIM, weights=[embeddings_matrix], trainable=True)

In [9]:
# # Approach - 1
# # Parameters
# sequence_length = X_train.shape[1]
# filter_sizes = [3, 4]
# num_filters = 128
# drop = 0.4

# # Obtaining embeddings based on input sequence
# inputs = Input(shape=(sequence_length,))
# embedding = embedding_layer(inputs)
# reshape = Reshape((sequence_length, WORD_EMBEDDING_DIM, 1))(embedding)

# # Creating convolutional and maxpool layers
# conv_layers, maxpool_layers = [], []
# for i in range(2):
#     conv_layers.append(Conv2D(num_filters, (filter_sizes[i], WORD_EMBEDDING_DIM), activation='relu', 
#                               kernel_regularizer=regularizers.l2(0.01))(reshape))
#     maxpool_layers.append(MaxPooling2D((sequence_length - filter_sizes[i] + 1, 1), strides=(1, 1))(conv_layers[i]))

# # Constructing the complete network and creating model
# merged_tensor = concatenate(maxpool_layers, axis=1)
# flatten = Flatten()(merged_tensor)
# reshape = Reshape((2*num_filters,))(flatten)
# dropout = Dropout(drop)(flatten)
# conc = Dense(60)(flatten)
# output = Dense(units=15, activation='softmax')(conc)
# model = Model(inputs, output)

In [10]:
# # Approach - 2
# # Parameters
# sequence_length = X_train.shape[1]
# num_filters = 128
# drop = 0.25

# # Obtaining embeddings based on input sequence
# inputs = Input(shape=(sequence_length,))
# embedding = embedding_layer(inputs)
# reshape = Reshape((sequence_length, WORD_EMBEDDING_DIM, 1))(embedding)

# # Constructing the complete network and creating model
# conv = Conv2D(num_filters, (5, WORD_EMBEDDING_DIM), activation='relu', kernel_regularizer=regularizers.l2(0.01))(reshape)
# dropout = Dropout(drop)(conv)
# pool = MaxPooling2D((sequence_length - 4, 1), strides=(1, 1))(dropout)
# flatten = GlobalAveragePooling2D()(pool)
# dropout = Dropout(drop)(flatten)
# conc = Dense(60)(flatten)
# output = Dense(units=15, activation='softmax')(conc)
# model = Model(inputs, output)

In [11]:
# Approach 3
# Parameters
sequence_length = X_train.shape[1]
num_filters = 250
drop = 0.25

# Obtaining embeddings based on input sequence
inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)

# Constructing the complete network and creating model
conv = Conv1D(num_filters, 5, activation='relu', kernel_regularizer=regularizers.l2(0.01))(embedding)
dropout = Dropout(drop)(conv)
pool = GlobalMaxPooling1D()(dropout)
conc = Dense(100)(pool)
dropout2 = Dropout(drop)(conc)
output = Dense(units=15, activation='softmax')(dropout2)

2021-11-05 12:37:16.175112: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory
2021-11-05 12:37:16.523128: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2021-11-05 12:37:16.523189: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1835] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2021-11-05 12:37:16.524340: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Libra

In [12]:
# Fitting Model to the data
X_tr, y_tr = X_train, y_train
for _type in [1, 3, 5, 6, -1, 0, 2]:
    model = Model(inputs, output)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=1e-3), metrics=["accuracy"])
    print('#'*110)
    print()
    if _type == -1:
        print('Without any oversampling/undersampling')
    else:
        print(f'With sampling type: {_type}')
    print()
    print()
    X_train, y_train = balance_data(X_tr, y_tr, _type)
    hist_adam = model.fit(X_train, y_train, batch_size=600, epochs=7, verbose=2, 
                          validation_data=(X_valid, y_valid))
    # Saving model
    model.save(f'../models/CNN_model_{_type}')

##############################################################################################################

With sampling type: 1




2021-11-05 12:37:17.264080: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/7


101/101 - 87s - loss: 2.5126 - accuracy: 0.5333 - val_loss: 1.6889 - val_accuracy: 0.5629


Epoch 2/7


101/101 - 86s - loss: 1.2255 - accuracy: 0.6978 - val_loss: 1.5652 - val_accuracy: 0.5717


Epoch 3/7


101/101 - 86s - loss: 1.0652 - accuracy: 0.7378 - val_loss: 1.4424 - val_accuracy: 0.6039


Epoch 4/7


101/101 - 86s - loss: 0.9705 - accuracy: 0.7677 - val_loss: 1.3893 - val_accuracy: 0.6124


Epoch 5/7


101/101 - 86s - loss: 0.8861 - accuracy: 0.7978 - val_loss: 1.4662 - val_accuracy: 0.5961


Epoch 6/7


101/101 - 86s - loss: 0.8058 - accuracy: 0.8244 - val_loss: 1.4226 - val_accuracy: 0.6083


Epoch 7/7


101/101 - 86s - loss: 0.7378 - accuracy: 0.8490 - val_loss: 1.4037 - val_accuracy: 0.6171


2021-11-05 12:47:17.971961: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../models/CNN_model_1/assets


##############################################################################################################

With sampling type: 3




Epoch 1/7


101/101 - 86s - loss: 1.0307 - accuracy: 0.7467 - val_loss: 2.4724 - val_accuracy: 0.3976


Epoch 2/7


101/101 - 85s - loss: 0.8858 - accuracy: 0.7828 - val_loss: 2.8630 - val_accuracy: 0.3678


Epoch 3/7


101/101 - 85s - loss: 0.8259 - accuracy: 0.8014 - val_loss: 3.1131 - val_accuracy: 0.3674


Epoch 4/7


101/101 - 85s - loss: 0.7758 - accuracy: 0.8201 - val_loss: 3.8488 - val_accuracy: 0.3340


Epoch 5/7


101/101 - 85s - loss: 0.7350 - accuracy: 0.8330 - val_loss: 3.6737 - val_accuracy: 0.3430


Epoch 6/7


101/101 - 85s - loss: 0.7048 - accuracy: 0.8450 - val_loss: 4.1230 - val_accuracy: 0.3318


Epoch 7/7


101/101 - 85s - loss: 0.6653 - accuracy: 0.8587 - val_loss: 4.1751 - val_accuracy: 0.3318


INFO:tensorflow:Assets written to: ../models/CNN_model_3/assets


##############################################################################################################

With sampling type: 5




  self.estimator_.fit(_safe_indexing(X, target_class_indices))


Epoch 1/7


101/101 - 87s - loss: 1.4023 - accuracy: 0.6755 - val_loss: 1.9200 - val_accuracy: 0.4867


Epoch 2/7


101/101 - 86s - loss: 1.0498 - accuracy: 0.7508 - val_loss: 2.1297 - val_accuracy: 0.4524


Epoch 3/7


101/101 - 86s - loss: 0.9556 - accuracy: 0.7840 - val_loss: 2.1576 - val_accuracy: 0.4721


Epoch 4/7


101/101 - 86s - loss: 0.8669 - accuracy: 0.8175 - val_loss: 2.3790 - val_accuracy: 0.4398


Epoch 5/7


101/101 - 86s - loss: 0.8119 - accuracy: 0.8410 - val_loss: 2.4036 - val_accuracy: 0.4524


Epoch 6/7


101/101 - 86s - loss: 0.7522 - accuracy: 0.8639 - val_loss: 2.7008 - val_accuracy: 0.4295


Epoch 7/7


101/101 - 86s - loss: 0.6875 - accuracy: 0.8885 - val_loss: 2.9034 - val_accuracy: 0.4206


INFO:tensorflow:Assets written to: ../models/CNN_model_5/assets


##############################################################################################################

With sampling type: 6




Epoch 1/7


455/455 - 323s - loss: 1.1894 - accuracy: 0.6864 - val_loss: 1.2041 - val_accuracy: 0.6718


Epoch 2/7


455/455 - 323s - loss: 1.0661 - accuracy: 0.7058 - val_loss: 1.1672 - val_accuracy: 0.6798


Epoch 3/7


455/455 - 323s - loss: 1.0195 - accuracy: 0.7190 - val_loss: 1.1612 - val_accuracy: 0.6811


Epoch 4/7


455/455 - 324s - loss: 0.9897 - accuracy: 0.7299 - val_loss: 1.1728 - val_accuracy: 0.6788


Epoch 5/7


455/455 - 323s - loss: 0.9664 - accuracy: 0.7396 - val_loss: 1.1884 - val_accuracy: 0.6726


Epoch 6/7


455/455 - 324s - loss: 0.9464 - accuracy: 0.7485 - val_loss: 1.1885 - val_accuracy: 0.6776


Epoch 7/7


455/455 - 323s - loss: 0.9291 - accuracy: 0.7581 - val_loss: 1.2151 - val_accuracy: 0.6724


INFO:tensorflow:Assets written to: ../models/CNN_model_6/assets


##############################################################################################################

Without any oversampling/undersampling




Epoch 1/7


498/498 - 352s - loss: 0.9680 - accuracy: 0.7504 - val_loss: 1.1957 - val_accuracy: 0.6777


Epoch 2/7


498/498 - 351s - loss: 0.9311 - accuracy: 0.7600 - val_loss: 1.2137 - val_accuracy: 0.6712


Epoch 3/7


498/498 - 351s - loss: 0.9157 - accuracy: 0.7668 - val_loss: 1.2307 - val_accuracy: 0.6711


Epoch 4/7


498/498 - 351s - loss: 0.8965 - accuracy: 0.7745 - val_loss: 1.2476 - val_accuracy: 0.6649


Epoch 5/7


498/498 - 351s - loss: 0.8778 - accuracy: 0.7819 - val_loss: 1.2789 - val_accuracy: 0.6591


Epoch 6/7


498/498 - 351s - loss: 0.8574 - accuracy: 0.7894 - val_loss: 1.2717 - val_accuracy: 0.6658


Epoch 7/7


498/498 - 352s - loss: 0.8481 - accuracy: 0.7944 - val_loss: 1.2982 - val_accuracy: 0.6574


INFO:tensorflow:Assets written to: ../models/CNN_model_-1/assets


##############################################################################################################

With sampling type: 0




Epoch 1/7


2055/2055 - 1393s - loss: 0.6272 - accuracy: 0.8849 - val_loss: 1.4386 - val_accuracy: 0.6383


Epoch 2/7


2055/2055 - 1394s - loss: 0.5800 - accuracy: 0.8977 - val_loss: 1.4784 - val_accuracy: 0.6347


Epoch 3/7


2055/2055 - 1395s - loss: 0.5499 - accuracy: 0.9052 - val_loss: 1.5218 - val_accuracy: 0.6286


Epoch 4/7


2055/2055 - 1395s - loss: 0.5252 - accuracy: 0.9115 - val_loss: 1.5337 - val_accuracy: 0.6237


Epoch 5/7


2055/2055 - 1396s - loss: 0.5083 - accuracy: 0.9156 - val_loss: 1.5876 - val_accuracy: 0.6205


Epoch 6/7


2055/2055 - 1396s - loss: 0.4914 - accuracy: 0.9195 - val_loss: 1.5837 - val_accuracy: 0.6295


Epoch 7/7


2055/2055 - 1396s - loss: 0.4788 - accuracy: 0.9229 - val_loss: 1.5808 - val_accuracy: 0.6258


INFO:tensorflow:Assets written to: ../models/CNN_model_0/assets


##############################################################################################################

With sampling type: 2




Epoch 1/7


2055/2055 - 1393s - loss: 2.5923 - accuracy: 0.2463 - val_loss: 1.5917 - val_accuracy: 0.6188


Epoch 2/7


2055/2055 - 1394s - loss: 2.4424 - accuracy: 0.2660 - val_loss: 1.5990 - val_accuracy: 0.6199


Epoch 3/7


2055/2055 - 1395s - loss: 2.4240 - accuracy: 0.2721 - val_loss: 1.5488 - val_accuracy: 0.6217


Epoch 4/7


2055/2055 - 1395s - loss: 2.4083 - accuracy: 0.2768 - val_loss: 1.5439 - val_accuracy: 0.6211


Epoch 5/7


2055/2055 - 1396s - loss: 2.3937 - accuracy: 0.2809 - val_loss: 1.5211 - val_accuracy: 0.6143


Epoch 6/7


2055/2055 - 1396s - loss: 2.3831 - accuracy: 0.2848 - val_loss: 1.5411 - val_accuracy: 0.6123


Epoch 7/7


2055/2055 - 1396s - loss: 2.3733 - accuracy: 0.2882 - val_loss: 1.5623 - val_accuracy: 0.6096


INFO:tensorflow:Assets written to: ../models/CNN_model_2/assets
