In [1]:
# Required imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import pickle
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from collections import Counter

stop_words = set(stopwords.words('english'))

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling2D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
from keras.models import Model, load_model
from keras import regularizers

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

In [2]:
# cleaning data
def clean_post(post):
    """Normalize a raw post into a space-separated string of content words.

    Steps, in order: lowercase the text; turn newlines into spaces; drop
    anything enclosed in ``<...>`` or ``[...]``; replace every character that
    is not a lowercase ASCII letter or a space with a space; drop words of one
    to three characters; drop words found in the module-level ``stop_words``.

    Args:
        post: The raw post text. Non-string values (e.g. ``None`` or a float
            ``NaN`` standing in for a missing value) are treated as empty.

    Returns:
        str: The surviving words joined by single spaces, or ``""`` when
        nothing survives the filtering.
    """
    # Missing values would otherwise crash on ``.lower()``.
    if not isinstance(post, str):
        return ""

    post = post.lower()
    post = re.sub(r"\n", " ", post)
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences ("\<", "\["), which newer Python versions warn about.
    # The compiled pattern itself is unchanged.
    post = re.sub(r"[\<\[].*?[\>\]]", " ", post)
    post = re.sub(r"[^a-z ]", " ", post)
    # Words of three characters or fewer are discarded.
    post = re.sub(r"\b\w{1,3}\b", " ", post)
    return " ".join(word for word in post.split() if word not in stop_words)

In [3]:
def get_metrics(ytrue, ypred):
    """Print a classification report and the accuracy for a set of predictions.

    Both inputs are reduced to class indices with ``argmax`` over axis 1
    before being passed to ``classification_report`` and ``accuracy_score``.

    Args:
        ytrue: 2-D array-like of true labels; the argmax of each row is the class.
        ypred: 2-D array-like of predicted scores in the same layout.

    Returns:
        None. The results are only printed.
    """
    true_idx, pred_idx = (np.argmax(scores, axis=1) for scores in (ytrue, ypred))

    report = classification_report(true_idx, pred_idx)
    print('Classification Report: ', report)

    accuracy = accuracy_score(true_idx, pred_idx)
    print('Accuracy: ', accuracy, "\n\n")

In [4]:
def evaluate_on_test_data(test_csv='../split_data/test.csv',
                          tokenizer_path='../models/tokenizer.pkl',
                          model_dir='../models',
                          sampling_types=(1, 3, 5, 6, -1, 0, 2),
                          max_len=200):
    """Evaluate each saved CNN model on the test split and print its metrics.

    The test CSV is read, the ``post`` column is cleaned with ``clean_post``,
    tokenized with the pickled tokenizer and padded to ``max_len``. The
    ``mental_disorder`` column is label-encoded and one-hot encoded. Each
    model ``{model_dir}/CNN_model_{type}`` is then loaded, run on the test
    tensor, and reported through ``get_metrics``.

    Args:
        test_csv: Path to the CSV with ``post`` and ``mental_disorder`` columns.
        tokenizer_path: Path to the pickled tokenizer.
        model_dir: Directory that contains the ``CNN_model_<type>`` models.
        sampling_types: Model suffixes to evaluate, in order. ``-1`` is
            reported as "Without any oversampling/undersampling".
        max_len: Length every test sequence is padded/truncated to.

    Returns:
        None. All results are printed.
    """
    # Load data. Shuffling only changes row order; inputs and targets are
    # derived from the same shuffled frame, so they stay aligned.
    data = pd.read_csv(test_csv)
    data = shuffle(data)

    # Class split stats
    print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())

    cleaned_posts = data['post'].apply(clean_post)

    # NOTE(review): the encoder is fit on the test labels themselves. This
    # presumably matches the training-time class order only if both splits
    # contain the same set of classes -- confirm against the training code.
    label_encoder = LabelEncoder()
    label_ids = label_encoder.fit_transform(np.array(data['mental_disorder']))
    y_test = to_categorical(label_ids)

    # SECURITY: pickle.load can execute arbitrary code; only use a tokenizer
    # file that comes from a trusted source.
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pickle.load(f)

    # Convert texts to sequences of integers, then pad/limit them to max_len
    sequences_test = tokenizer.texts_to_sequences(cleaned_posts)
    X_test = pad_sequences(sequences_test, maxlen=max_len)
    print(f'Shape of X test tensor: {X_test.shape}')
    print(f'Shape of y test tensor: {y_test.shape}')

    # Evaluating
    banner = '#' * 110
    for _type in sampling_types:
        model = load_model(f'{model_dir}/CNN_model_{_type}')
        print(banner)
        print()
        if _type == -1:
            print('Without any oversampling/undersampling')
        else:
            print(f'With sampling type: {_type}')
        print()
        print()
        # Predict on test dataset
        pred_test = model.predict(X_test)
        get_metrics(y_test, pred_test)
        print()
        print()
        print(banner)

In [5]:
def get_text_label(text, tokenizer=None, model=None):
    """Predict the class index for a single piece of text.

    The text is cleaned with ``clean_post``, converted to an integer sequence
    by the tokenizer, padded to length 200 and passed to the model.

    Args:
        text: The raw text to classify.
        tokenizer: Optional pre-loaded tokenizer exposing
            ``texts_to_sequences``. When omitted it is unpickled from
            ``../models/tokenizer.pkl``.
        model: Optional pre-loaded model exposing ``predict``. When omitted,
            ``../models/CNN_model_6`` is loaded. Passing a model avoids
            reloading it from disk on every call.

    Returns:
        The index of the highest-scoring class for ``text`` (an integer
        index, not a class name).
    """
    cleaned = clean_post(text)

    if tokenizer is None:
        # SECURITY: pickle.load can execute arbitrary code; only use a
        # tokenizer file that comes from a trusted source.
        with open('../models/tokenizer.pkl', 'rb') as f:
            tokenizer = pickle.load(f)

    if model is None:
        model = load_model('../models/CNN_model_6')

    # Convert the single text to a sequence of integers
    sequences = tokenizer.texts_to_sequences([cleaned])

    # Limit size of the sequence to 200 and pad it
    padded = pad_sequences(sequences, maxlen=200)

    scores = model.predict(padded)
    return np.argmax(scores, axis=1)[0]

In [6]:
# Run the evaluation of the saved CNN models on the test split with the default paths.
evaluate_on_test_data()

                mental_disorder                             
                          count unique            top   freq
mental_disorder                                             
EDAnonymous                2238      1    EDAnonymous   2238
addiction                  1126      1      addiction   1126
adhd                       6845      1           adhd   6845
alcoholism                  885      1     alcoholism    885
anxiety                    8700      1        anxiety   8700
autism                     1286      1         autism   1286
bipolarreddit               851      1  bipolarreddit    851
bpd                        3688      1            bpd   3688
depression                17522      1     depression  17522
healthanxiety              1275      1  healthanxiety   1275
lonely                     3532      1         lonely   3532
ptsd                       1307      1           ptsd   1307
schizophrenia              1361      1  schizophrenia   1361
socialanxiety           

Shape of X test tensor: (64000, 200)
Shape of y test tensor: (64000, 15)


2021-11-06 07:00:56.425039: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory
2021-11-06 07:00:56.772561: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2021-11-06 07:00:56.772604: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1835] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2021-11-06 07:00:56.774004: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Libra

##############################################################################################################

With sampling type: 1




2021-11-06 07:00:58.549489: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Classification Report:                precision    recall  f1-score   support

           0       0.66      0.87      0.75      2238
           1       0.53      0.83      0.65      1126
           2       0.86      0.80      0.83      6845
           3       0.60      0.83      0.69       885
           4       0.70      0.65      0.68      8700
           5       0.55      0.81      0.66      1286
           6       0.29      0.75      0.41       851
           7       0.44      0.50      0.47      3688
           8       0.69      0.49      0.57     17522
           9       0.52      0.77      0.62      1275
          10       0.52      0.59      0.55      3532
          11       0.55      0.76      0.64      1307
          12       0.64      0.71      0.67      1361
          13       0.51      0.66      0.58      3580
          14       0.60      0.55      0.57      9804

    accuracy                           0.62     64000
   macro avg       0.58      0.70      0.62     64000
we

##############################################################################################################

With sampling type: 3




Classification Report:                precision    recall  f1-score   support

           0       0.56      0.84      0.67      2238
           1       0.27      0.86      0.41      1126
           2       0.84      0.57      0.68      6845
           3       0.39      0.85      0.53       885
           4       0.63      0.21      0.32      8700
           5       0.19      0.87      0.31      1286
           6       0.10      0.76      0.17       851
           7       0.20      0.37      0.26      3688
           8       0.71      0.05      0.09     17522
           9       0.22      0.88      0.35      1275
          10       0.42      0.36      0.39      3532
          11       0.14      0.86      0.24      1307
          12       0.30      0.78      0.43      1361
          13       0.40      0.57      0.47      3580
          14       0.61      0.14      0.23      9804

    accuracy                           0.33     64000
   macro avg       0.40      0.60      0.37     64000
we

##############################################################################################################

With sampling type: 5




Classification Report:                precision    recall  f1-score   support

           0       0.58      0.85      0.69      2238
           1       0.35      0.82      0.49      1126
           2       0.89      0.66      0.76      6845
           3       0.33      0.90      0.48       885
           4       0.73      0.25      0.38      8700
           5       0.32      0.83      0.46      1286
           6       0.10      0.74      0.17       851
           7       0.49      0.42      0.45      3688
           8       0.73      0.09      0.16     17522
           9       0.28      0.85      0.42      1275
          10       0.25      0.75      0.38      3532
          11       0.38      0.81      0.52      1307
          12       0.31      0.77      0.44      1361
          13       0.46      0.62      0.53      3580
          14       0.54      0.40      0.46      9804

    accuracy                           0.42     64000
   macro avg       0.45      0.65      0.45     64000
we

##############################################################################################################

With sampling type: 6




Classification Report:                precision    recall  f1-score   support

           0       0.81      0.86      0.83      2238
           1       0.77      0.72      0.75      1126
           2       0.91      0.80      0.85      6845
           3       0.82      0.70      0.76       885
           4       0.71      0.73      0.72      8700
           5       0.75      0.78      0.77      1286
           6       0.57      0.58      0.58       851
           7       0.64      0.46      0.54      3688
           8       0.60      0.72      0.66     17522
           9       0.70      0.64      0.67      1275
          10       0.60      0.49      0.54      3532
          11       0.70      0.71      0.70      1307
          12       0.68      0.73      0.70      1361
          13       0.64      0.55      0.59      3580
          14       0.63      0.58      0.61      9804

    accuracy                           0.68     64000
   macro avg       0.70      0.67      0.68     64000
we

##############################################################################################################

Without any oversampling/undersampling




Classification Report:                precision    recall  f1-score   support

           0       0.86      0.79      0.82      2238
           1       0.80      0.68      0.74      1126
           2       0.84      0.86      0.85      6845
           3       0.74      0.79      0.76       885
           4       0.62      0.78      0.69      8700
           5       0.84      0.69      0.76      1286
           6       0.65      0.51      0.57       851
           7       0.54      0.53      0.54      3688
           8       0.60      0.70      0.65     17522
           9       0.67      0.60      0.63      1275
          10       0.59      0.51      0.55      3532
          11       0.77      0.62      0.69      1307
          12       0.74      0.70      0.72      1361
          13       0.66      0.41      0.51      3580
          14       0.65      0.50      0.56      9804

    accuracy                           0.66     64000
   macro avg       0.71      0.65      0.67     64000
we

##############################################################################################################

With sampling type: 0




Classification Report:                precision    recall  f1-score   support

           0       0.81      0.78      0.79      2238
           1       0.62      0.74      0.68      1126
           2       0.85      0.81      0.83      6845
           3       0.69      0.77      0.73       885
           4       0.67      0.66      0.67      8700
           5       0.68      0.76      0.72      1286
           6       0.43      0.56      0.49       851
           7       0.48      0.51      0.49      3688
           8       0.65      0.53      0.59     17522
           9       0.57      0.67      0.62      1275
          10       0.51      0.53      0.52      3532
          11       0.69      0.62      0.65      1307
          12       0.62      0.71      0.66      1361
          13       0.50      0.62      0.55      3580
          14       0.56      0.63      0.60      9804

    accuracy                           0.63     64000
   macro avg       0.62      0.66      0.64     64000
we

##############################################################################################################

With sampling type: 2




Classification Report:                precision    recall  f1-score   support

           0       0.61      0.81      0.70      2238
           1       0.66      0.66      0.66      1126
           2       0.84      0.78      0.81      6845
           3       0.68      0.64      0.66       885
           4       0.73      0.60      0.66      8700
           5       0.65      0.62      0.64      1286
           6       0.60      0.42      0.49       851
           7       0.62      0.34      0.44      3688
           8       0.59      0.64      0.61     17522
           9       0.55      0.59      0.57      1275
          10       0.41      0.50      0.45      3532
          11       0.67      0.56      0.61      1307
          12       0.64      0.63      0.64      1361
          13       0.54      0.59      0.56      3580
          14       0.53      0.59      0.56      9804

    accuracy                           0.61     64000
   macro avg       0.62      0.60      0.60     64000
we