In [None]:
# Required installations
# !pip cache purge
# !python3 -m pip install -U scikit-learn scipy
# !pip install nltk
# !pip install keras
# !pip install gensim
# !pip install matplotlib
# !pip install pandas
# !pip install scikit-learn
# !pip install nltk
# !pip install tensorflow
# !pip install imbalanced-learn

In [None]:
# Required imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from collections import Counter

# English stop-word set consulted by clean_post when filtering tokens.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be available
# locally (e.g. via nltk.download('stopwords')) — confirm for fresh environments.
stop_words = set(stopwords.words('english'))

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling2D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
from keras.models import Model
from keras import regularizers

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

In [None]:
# cleaning data
def clean_post(post, stopword_set=None):
    """Normalise a raw post into a space-separated string of informative words.

    Steps, in order: lower-case the text, turn newlines into spaces, drop
    anything enclosed in ``<...>`` or ``[...]``, replace every character that
    is not ``a-z`` or a space, drop words of one to three letters, and finally
    drop stop words.

    Args:
        post: Raw post text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell, or None) are treated as an
            empty post instead of raising AttributeError on ``.lower()``.
        stopword_set: Optional collection of words to discard. When omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        The cleaned post as a single string; empty if nothing survives.
    """
    if not isinstance(post, str):
        return ""
    # Resolve lazily so the module-level set is only touched when needed.
    active_stop_words = stop_words if stopword_set is None else stopword_set

    post = post.lower()
    post = re.sub(r"\n", " ", post)
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences ("\<", "\[") that newer Python versions warn about.
    post = re.sub(r"[<\[].*?[>\]]", " ", post)
    post = re.sub(r"[^a-z ]", " ", post)
    post = re.sub(r"\b\w{1,3}\b", " ", post)
    return " ".join(word for word in post.split() if word not in active_stop_words)

In [None]:
# Different techniques for tackling class imbalance
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE

def balance_data(x, y, _type, random_state=42):
    """Resample ``(x, y)`` with the class-balancing technique selected by ``_type``.

    Supported codes:
        0 - RandomOverSampler
        1 - RandomUnderSampler (with replacement)
        2 - SMOTE
        3 - NearMiss
        5 - ClusterCentroids
        6 - TomekLinks
    Any other value returns ``x`` and ``y`` untouched.

    Args:
        x: Feature matrix handed to the sampler's ``fit_resample``.
        y: Targets handed to the sampler's ``fit_resample``.
        _type: Integer code choosing the technique (see above).
        random_state: Seed forwarded to every sampler that accepts one, so
            all stochastic techniques are reproducible (previously only the
            random over/under samplers were seeded).

    Returns:
        Tuple ``(x_resampled, y_resampled)``; the original ``(x, y)`` when
        ``_type`` matches no technique.

    Another option, not implemented here, is penalising the algorithm with
    class_weight=balanced together with stratified cross validation.
    """
    # Lambdas keep construction lazy: a sampler class is only looked up and
    # instantiated when its code is actually requested.
    sampler_factories = {
        0: lambda: RandomOverSampler(random_state=random_state),
        1: lambda: RandomUnderSampler(random_state=random_state, replacement=True),
        2: lambda: SMOTE(random_state=random_state),
        3: lambda: NearMiss(),
        5: lambda: ClusterCentroids(random_state=random_state),
        6: lambda: TomekLinks(),
    }
    make_sampler = sampler_factories.get(_type)
    if make_sampler is None:
        return x, y
    return make_sampler().fit_resample(x, y)

In [None]:
# Load data
data = pd.read_csv('../reddit_mental_health_dataset/reddit_dataset.csv')
# Seed the shuffle: without it the row order changes on every run, so the
# seeded train/validation/test splits below would still differ between runs.
data = shuffle(data, random_state=321)
# data = data[:500]

# Class split stats
print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())

# Clean every post, then integer-encode and one-hot encode the labels
X = data['post'].apply(clean_post)
label_encoder = LabelEncoder()
y1 = label_encoder.fit_transform(np.array(data['mental_disorder']))
y = to_categorical(y1)

# 60-20-20 split: hold out 40%, then halve the hold-out into test and validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=321)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=321)

In [None]:
# Fit a Keras tokenizer on the training posts; MAX_WORDS_LIMIT is passed as num_words
MAX_WORDS_LIMIT = 30000
TOKEN_FILTERS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\''
tokenizer = Tokenizer(
    num_words=MAX_WORDS_LIMIT,
    filters=TOKEN_FILTERS,
    lower=True,
)
tokenizer.fit_on_texts(X_train)

# Word -> integer index mapping learned from the training posts
word_index = tokenizer.word_index
print(f'Unique tokens found: {len(word_index)}')

In [None]:
# Encode each split as sequences of integer token ids using the fitted tokenizer
sequences_train, sequences_valid, sequences_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_valid, X_test)
)

# Pad the training sequences with maxlen=200, then give validation/test the same width
X_train = pad_sequences(sequences_train, maxlen=200)
padded_width = X_train.shape[1]
X_valid = pad_sequences(sequences_valid, maxlen=padded_width)
X_test = pad_sequences(sequences_test, maxlen=padded_width)
print(f'Shape of X train tensor: {X_train.shape}, X validation tensor: {X_valid.shape}, X test tensor: {X_test.shape}')

# Make sure every target split is a numpy array
y_train = np.asarray(y_train)
y_valid = np.asarray(y_valid)
y_test = np.asarray(y_test)
print(f'Shape of y train tensor: {y_train.shape}, y validation tensor: {y_valid.shape}, y test tensor: {y_test.shape}')

In [None]:
# Creating word embeddings via pre-trained word2vec model
WORD_EMBEDDING_DIM = 300
word_vectors = KeyedVectors.load_word2vec_format('../reddit_mental_health_dataset/GoogleNews-vectors-negative300.bin', binary=True)

# One row per token index below MAX_WORDS_LIMIT; rows stay all-zero for words
# that have no pre-trained vector.
embeddings_matrix = np.zeros((MAX_WORDS_LIMIT, WORD_EMBEDDING_DIM))

# Computing embeddings matrix and embedding layer
for word, i in word_index.items():
    if i >= MAX_WORDS_LIMIT:
        # Skip rather than stop, so the result does not depend on the order
        # in which word_index happens to iterate.
        continue
    try:
        embeddings_matrix[i] = word_vectors[word]
    except KeyError:
        # Only a missing vocabulary entry is expected here; any other error
        # (including KeyboardInterrupt) must propagate instead of being hidden.
        continue

# Embedding layer initialised from the matrix and left trainable
embedding_layer = Embedding(MAX_WORDS_LIMIT, WORD_EMBEDDING_DIM, weights=[embeddings_matrix], trainable=True)

In [None]:
# # Approach - 1
# # Parameters
# sequence_length = X_train.shape[1]
# filter_sizes = [3, 4]
# num_filters = 128
# drop = 0.4

# # Obtaining embeddings based on input sequence
# inputs = Input(shape=(sequence_length,))
# embedding = embedding_layer(inputs)
# reshape = Reshape((sequence_length, WORD_EMBEDDING_DIM, 1))(embedding)

# # Creating convolutional and maxpool layers
# conv_layers, maxpool_layers = [], []
# for i in range(2):
#     conv_layers.append(Conv2D(num_filters, (filter_sizes[i], WORD_EMBEDDING_DIM), activation='relu', 
#                               kernel_regularizer=regularizers.l2(0.01))(reshape))
#     maxpool_layers.append(MaxPooling2D((sequence_length - filter_sizes[i] + 1, 1), strides=(1, 1))(conv_layers[i]))

# # Constructing the complete network and creating model
# merged_tensor = concatenate(maxpool_layers, axis=1)
# flatten = Flatten()(merged_tensor)
# reshape = Reshape((2*num_filters,))(flatten)
# dropout = Dropout(drop)(flatten)
# conc = Dense(60)(flatten)
# output = Dense(units=15, activation='softmax')(conc)
# model = Model(inputs, output)

In [None]:
# # Approach - 2
# # Parameters
# sequence_length = X_train.shape[1]
# num_filters = 128
# drop = 0.25

# # Obtaining embeddings based on input sequence
# inputs = Input(shape=(sequence_length,))
# embedding = embedding_layer(inputs)
# reshape = Reshape((sequence_length, WORD_EMBEDDING_DIM, 1))(embedding)

# # Constructing the complete network and creating model
# conv = Conv2D(num_filters, (5, WORD_EMBEDDING_DIM), activation='relu', kernel_regularizer=regularizers.l2(0.01))(reshape)
# dropout = Dropout(drop)(conv)
# pool = MaxPooling2D((sequence_length - 4, 1), strides=(1, 1))(dropout)
# flatten = GlobalAveragePooling2D()(pool)
# dropout = Dropout(drop)(flatten)
# conc = Dense(60)(flatten)
# output = Dense(units=15, activation='softmax')(conc)
# model = Model(inputs, output)

In [None]:
# Approach 3: a single Conv1D over the word embeddings, global max pooling,
# one dense layer and a softmax classifier.
# Parameters
sequence_length = X_train.shape[1]
num_filters = 250
drop = 0.25
# Derive the output width from the one-hot targets instead of hard-coding 15,
# so the classifier always matches the number of label columns in the data.
num_classes = y_train.shape[1]

# Obtaining embeddings based on input sequence
inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)

# Constructing the complete network
conv = Conv1D(num_filters, 5, activation='relu', kernel_regularizer=regularizers.l2(0.01))(embedding)
dropout = Dropout(drop)(conv)
pool = GlobalMaxPooling1D()(dropout)
# NOTE(review): no activation is passed to this Dense layer — confirm that a
# linear projection before the softmax is intended.
conc = Dense(100)(pool)
dropout2 = Dropout(drop)(conc)
output = Dense(units=num_classes, activation='softmax')(dropout2)


In [None]:
def get_metrics(ytrue, ypred):
    """Print a classification report and the accuracy for one set of predictions.

    Both arguments are reduced to class indices with ``argmax`` along axis 1
    before scoring.

    Args:
        ytrue: 2-D array of reference targets, one row per sample.
        ypred: 2-D array of predicted scores, one row per sample.

    Returns:
        None. The report and the accuracy are written to stdout.
    """
    true_labels, predicted_labels = (
        np.argmax(scores, axis=1) for scores in (ytrue, ypred)
    )

    report = classification_report(true_labels, predicted_labels)
    print('Classification Report: ', report)

    accuracy = accuracy_score(true_labels, predicted_labels)
    print('Accuracy: ', accuracy, "\n\n")

In [None]:
# Fitting Model to the data
# Keep the unbalanced training split under its own names; the loop never
# rebinds X_train / y_train, so this cell gives the same data on every run.
X_tr, y_tr = X_train, y_train

# Build the model once and snapshot its starting weights. Wrapping the same
# `inputs`/`output` tensors in a new Model does not create new layers, so
# without a reset every sampling strategy would continue training from the
# weights left behind by the previous one and the comparison would be unfair.
# NOTE: if this cell is re-run on its own, re-run the embedding and
# model-definition cells first so the snapshot is taken from untrained weights.
model = Model(inputs, output)
initial_weights = model.get_weights()

for _type in [1, 3, 5, 6, -1, 0, 2]:
    # Restore the starting weights; compiling again gives a fresh optimizer.
    model.set_weights(initial_weights)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=1e-3), metrics=["accuracy"])
    print('#'*110)
    print()
    if _type == -1:
        print('Without any oversampling/undersampling')
    else:
        print(f'With sampling type: {_type}')
    print()
    print()

    # Only the training split is resampled; validation and test stay as-is.
    X_bal, y_bal = balance_data(X_tr, y_tr, _type)
    hist_adam = model.fit(X_bal, y_bal, batch_size=600, epochs=7, verbose=2,
                          validation_data=(X_valid, y_valid))

    # Predict on train, val and test datasets
    pred_train = model.predict(X_bal)
    print()
    print("For training set")
    print()
    get_metrics(y_bal, pred_train)

    pred_valid = model.predict(X_valid)
    print()
    print("For validation set")
    print()
    get_metrics(y_valid, pred_valid)

    pred_test = model.predict(X_test)
    print()
    print("For test set")
    print()
    get_metrics(y_test, pred_test)
    print()
    print()
    print('#'*110)