In [None]:
# Required installations
!pip cache purge
!python3 -m pip install -U scikit-learn scipy
!pip install nltk
!pip install keras
!pip install gensim
!pip install imblearn

In [1]:
# Required imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from collections import Counter

stop_words = set(stopwords.words('english'))

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
from keras.models import Model
from keras import regularizers

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

2021-10-18 16:35:04.155225: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-18 16:35:04.155268: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# cleaning data
def clean_post(post):
    post = post.lower()
    post = re.sub(r"\n", " ", post)
    post = re.sub("[\<\[].*?[\>\]]", " ", post)
    post = re.sub(r"[^a-z ]", " ", post)
    post = re.sub(r"\b\w{1,3}\b", " ", post)
    return " ".join([x for x in post.split() if x not in stop_words])

In [3]:
# Different techniques for tackling class imbalance
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek

def balance_data(x, y, _type):
    if _type == 0:
        ros = RandomOverSampler(random_state=42)
        return ros.fit_resample(x, y)
    elif _type == 1:
        rus = RandomUnderSampler(random_state=42, replacement=True)
        return rus.fit_resample(x, y)
    elif _type == 2:
        smote = SMOTE()
        return smote.fit_resample(x, y)
    elif _type == 3:
        nm = NearMiss()
        return nm.fit_resample(x, y)
    elif _type == 4:
        smt = SMOTETomek(ratio='auto')
        return smt.fit_resample(x, y)
    elif _type == 5:
        cc = ClusterCentroids()
        return cc.fit_resample(x, y)
    elif _type == 6:
        tl = TomekLinks()
        return tl.fit_resample(x, y)
    # default
    smote = SMOTE()
    return smote.fit_resample(x, y)
    # Another technique is penalizing the algo with class_weight=balanced, using stratified cross validation

In [5]:
# Load data
data = pd.read_csv('../reddit_mental_health_dataset/reddit_dataset.csv')
data = shuffle(data)

# Class split stats
print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())
X = data['post'].apply(lambda post: clean_post(post))
label_encoder = LabelEncoder()
y1 = label_encoder.fit_transform(np.array(data['mental_disorder']))
y = to_categorical(y1)

# 60-20-20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=321)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=321)

                mental_disorder                              
                          count unique            top    freq
mental_disorder                                              
EDAnonymous               14577      1    EDAnonymous   14577
addiction                  7641      1      addiction    7641
adhd                      45631      1           adhd   45631
alcoholism                 5911      1     alcoholism    5911
anxiety                   57671      1        anxiety   57671
autism                     8869      1         autism    8869
bipolarreddit              5780      1  bipolarreddit    5780
bpd                       24294      1            bpd   24294
depression               117331      1     depression  117331
healthanxiety              8648      1  healthanxiety    8648
lonely                    23635      1         lonely   23635
ptsd                       8643      1           ptsd    8643
schizophrenia              8712      1  schizophrenia    8712
socialan

In [6]:
# Using keras tokenizer on text for pre-processing
MAX_WORDS_LIMIT = 30000
tokenizer = Tokenizer(num_words=MAX_WORDS_LIMIT, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print(f'Unique tokens found: {len(word_index)}')

Unique tokens found: 102109


In [7]:
# Convert train and validation texts to sequence of integers
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_valid = tokenizer.texts_to_sequences(X_valid)

# Limit size of train/validation sequences to 200 and pad the sequence
X_train = pad_sequences(sequences_train, maxlen=200)
X_valid = pad_sequences(sequences_valid, maxlen=X_train.shape[1])
print('Shape of X train and X validation tensor: ', X_train.shape, X_valid.shape)

# Convert target to array
y_train, y_valid = np.asarray(y_train), np.asarray(y_valid)
print('Shape of y train and y validation tensor:', y_train.shape, y_valid.shape)

Shape of X train and X validation tensor:  (255900, 200) (85300, 200)
Shape of y train and y validation tensor: (255900, 15) (85300, 15)


In [8]:
# Creating word embeddings via pre-trained word2vec model
WORD_EMBEDDING_DIM = 300
word_vectors = KeyedVectors.load_word2vec_format('../reddit_mental_health_dataset/GoogleNews-vectors-negative300.bin', binary=True)
embeddings_matrix = np.zeros((MAX_WORDS_LIMIT, WORD_EMBEDDING_DIM))

# Computing embeddings matrix and embedding layer
for word, i in word_index.items():
    if i >= MAX_WORDS_LIMIT:
        break
    try:
        embeddings_matrix[i] = word_vectors[word]
    except:
        embeddings_matrix[i] = np.zeros(WORD_EMBEDDING_DIM)
embedding_layer = Embedding(MAX_WORDS_LIMIT, WORD_EMBEDDING_DIM, weights=[embeddings_matrix], trainable=True)

In [9]:
# # Approach - 1
# # Parameters
# sequence_length = X_train.shape[1]
# filter_sizes = [3, 4]
# num_filters = 128
# drop = 0.4

# # Obtaining embeddings based on input sequence
# inputs = Input(shape=(sequence_length,))
# embedding = embedding_layer(inputs)
# reshape = Reshape((sequence_length, WORD_EMBEDDING_DIM, 1))(embedding)

# # Creating convolutional and maxpool layers
# conv_layers, maxpool_layers = [], []
# for i in range(2):
#     conv_layers.append(Conv2D(num_filters, (filter_sizes[i], WORD_EMBEDDING_DIM), activation='relu', 
#                               kernel_regularizer=regularizers.l2(0.01))(reshape))
#     maxpool_layers.append(MaxPooling2D((sequence_length - filter_sizes[i] + 1, 1), strides=(1, 1))(conv_layers[i]))

# # Constructing the complete network and creating model
# merged_tensor = concatenate(maxpool_layers, axis=1)
# flatten = Flatten()(merged_tensor)
# reshape = Reshape((2*num_filters,))(flatten)
# dropout = Dropout(drop)(flatten)
# conc = Dense(60)(flatten)
# output = Dense(units=15, activation='softmax')(conc)
# model = Model(inputs, output)

In [12]:
# Approach - 2
# Parameters
sequence_length = X_train.shape[1]
num_filters = 128
drop = 0.25

# Obtaining embeddings based on input sequence
inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
reshape = Reshape((sequence_length, WORD_EMBEDDING_DIM, 1))(embedding)

# Constructing the complete network and creating model
conv = Conv2D(num_filters, (5, 5), activation='relu')
dropout = Dropout(drop)(conv)
pool = MaxPooling2D((sequence_length - 4, 1), strides=(1, 1))(dropout)
flatten = Flatten()(pool)
conc = Dense(100)(flatten)
output = Dense(units=15, activation='softmax')(conc)
model = Model(inputs, output)

ValueError: Attempt to convert a value (<keras.layers.convolutional.Conv2D object at 0x7f21b55f7430>) with an unsupported type (<class 'keras.layers.convolutional.Conv2D'>) to a Tensor.

In [None]:
# Compiling Model using Adam optimizer
opt = Adam(learning_rate=1e-3)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=["accuracy"])

# Fitting Model to the data
callback = [EarlyStopping(monitor='val_loss')]
X_train, y_train = balance_data(X_train, y_train, 0)
hist_adam = model.fit(X_train, y_train, batch_size=100, epochs=50, verbose=2, 
                      validation_data=(X_valid, y_valid))

#plotting Loss
plt.suptitle('Optimizer : Adam', fontsize=10)
plt.ylabel('Loss', fontsize=16)
plt.xlabel('Epoch', fontsize=14)
plt.plot(hist_adam.history['loss'], color='b', label='Training Loss')
plt.plot(hist_adam.history['val_loss'], color='r', label='Validation Loss')
plt.legend(loc='upper right')