In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from collections import Counter
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense
from keras.models import Sequential
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

In [2]:
def get_metrics(y_true, y_pred):
    result1 = classification_report(y_true, y_pred)
    print('Classification Report: ', result1)
    result2 = accuracy_score(y_true, y_pred)
    print('Accuracy: ', result2, "\n\n")

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home2/tgv2002/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
 stop_words = set(stopwords.words('english'))

In [5]:
def clean_post(post):
    post = post.lower()
    post = re.sub(r"\n", " ", post)
    post = re.sub("[\<\[].*?[\>\]]", " ", post)
    post = re.sub(r"[^a-z ]", " ", post)
    post = re.sub(r"\b\w{1,3}\b", " ", post)
    return " ".join([x for x in post.split() if x not in stop_words])

In [6]:
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek

def balance_data(x, y, _type):
    if _type == 0:
        ros = RandomOverSampler(random_state=42)
        return ros.fit_resample(x, y)
    elif _type == 1:
        rus = RandomUnderSampler(random_state=42, replacement=True)
        return rus.fit_resample(x, y)
    elif _type == 2:
        smote = SMOTE()
        return smote.fit_resample(x, y)
    elif _type == 3:
        nm = NearMiss()
        return nm.fit_resample(x, y)
    elif _type == 4:
        smt = SMOTETomek(ratio='auto')
        return smt.fit_resample(x, y)
    elif _type == 5:
        cc = ClusterCentroids()
        return cc.fit_resample(x, y)
    elif _type == 6:
        tl = TomekLinks()
        return tl.fit_resample(x, y)

    smote = SMOTE()
    return smote.fit_resample(x, y)

In [7]:
data = pd.read_csv('/home2/tgv2002/reddit_mental_health_dataset/reddit_dataset.csv')
data = shuffle(data)

# Class split stats
print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())
x = data['post'].apply(lambda post: clean_post(post))

# Vectorizing text data
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(x)
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X_counts)

                mental_disorder                              
                          count unique            top    freq
mental_disorder                                              
EDAnonymous               14577      1    EDAnonymous   14577
addiction                  7641      1      addiction    7641
adhd                      45631      1           adhd   45631
alcoholism                 5911      1     alcoholism    5911
anxiety                   57671      1        anxiety   57671
autism                     8869      1         autism    8869
bipolarreddit              5780      1  bipolarreddit    5780
bpd                       24294      1            bpd   24294
depression               117331      1     depression  117331
healthanxiety              8648      1  healthanxiety    8648
lonely                    23635      1         lonely   23635
ptsd                       8643      1           ptsd    8643
schizophrenia              8712      1  schizophrenia    8712
socialan

In [8]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(np.array(data['mental_disorder']))
# y = to_categorical(y1)

# 60-20-20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=321)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=321)

In [9]:
import gc

X_tr, y_tr = X_train, y_train

for _type in [1, 3, 6, -1, 0, 2]:
    print('#'*110)
    print()
    if _type == -1:
        print('Without any undersampling/oversampling')
    else:
        print(f'With sampling type: {_type}')
    model = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=15))
    X_train, y_train = balance_data(X_tr, y_tr, _type)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    print()
    print("For training set")
    print()
    get_metrics(y_train, y_train_pred)

    y_valid_pred = model.predict(X_valid)
    print()
    print("For validation set")
    print()
    get_metrics(y_valid, y_valid_pred)

    y_test_pred = model.predict(X_test)
    print()
    print("For test set")
    print()
    get_metrics(y_test, y_test_pred)

##############################################################################################################

With sampling type: 1



For training set

Classification Report:                precision    recall  f1-score   support

           0       0.70      0.46      0.56      3466
           1       0.90      0.46      0.61      3466
           2       0.92      0.52      0.67      3466
           3       0.74      0.76      0.75      3466
           4       0.50      0.56      0.53      3466
           5       0.96      0.62      0.76      3466
           6       0.87      0.48      0.62      3466
           7       0.12      0.75      0.20      3466
           8       0.47      0.30      0.37      3466
           9       0.82      0.37      0.51      3466
          10       0.73      0.35      0.47      3466
          11       0.90      0.56      0.69      3466
          12       0.96      0.37      0.54      3466
          13       0.74      0.35      0.47      3466
          14       0.60      0.21      0.31      3466

    accuracy                           0.48     51990
   macro avg       0.73      0.48    


For validation set

Classification Report:                precision    recall  f1-score   support

           0       0.48      0.45      0.46      2957
           1       0.56      0.42      0.48      1539
           2       0.91      0.51      0.66      9189
           3       0.35      0.73      0.47      1166
           4       0.68      0.51      0.58     11583
           5       0.73      0.60      0.66      1792
           6       0.38      0.45      0.42      1185
           7       0.09      0.73      0.16      4869
           8       0.73      0.26      0.38     23495
           9       0.34      0.32      0.33      1681
          10       0.52      0.34      0.41      4628
          11       0.59      0.54      0.57      1747
          12       0.80      0.36      0.49      1745
          13       0.54      0.33      0.41      4646
          14       0.58      0.17      0.26     13078

    accuracy                           0.38     85300
   macro avg       0.55      0.45  


For test set

Classification Report:                precision    recall  f1-score   support

           0       0.46      0.45      0.46      2879
           1       0.56      0.42      0.48      1515
           2       0.91      0.51      0.65      9085
           3       0.36      0.74      0.48      1190
           4       0.69      0.52      0.59     11428
           5       0.75      0.61      0.67      1784
           6       0.38      0.44      0.41      1129
           7       0.09      0.73      0.16      4915
           8       0.74      0.26      0.38     23580
           9       0.37      0.31      0.34      1788
          10       0.48      0.32      0.39      4778
          11       0.59      0.57      0.58      1722
          12       0.78      0.36      0.50      1705
          13       0.52      0.32      0.40      4595
          14       0.58      0.17      0.26     13207

    accuracy                           0.37     85300
   macro avg       0.55      0.45      0.


For training set

Classification Report:                precision    recall  f1-score   support

           0       0.75      0.56      0.64      3466
           1       0.88      0.55      0.68      3466
           2       0.91      0.63      0.74      3466
           3       0.78      0.72      0.75      3466
           4       0.49      0.59      0.54      3466
           5       0.97      0.63      0.76      3466
           6       0.76      0.47      0.58      3466
           7       0.12      0.78      0.20      3466
           8       0.78      0.05      0.09      3466
           9       0.85      0.41      0.56      3466
          10       0.76      0.36      0.49      3466
          11       0.92      0.60      0.73      3466
          12       0.96      0.38      0.55      3466
          13       0.79      0.38      0.51      3466
          14       0.50      0.18      0.26      3466

    accuracy                           0.49     51990
   macro avg       0.75      0.49    


For validation set

Classification Report:                precision    recall  f1-score   support

           0       0.45      0.46      0.46      2957
           1       0.52      0.53      0.53      1539
           2       0.92      0.54      0.68      9189
           3       0.39      0.69      0.50      1166
           4       0.61      0.49      0.55     11583
           5       0.73      0.60      0.65      1792
           6       0.40      0.43      0.42      1185
           7       0.08      0.79      0.14      4869
           8       0.66      0.01      0.01     23495
           9       0.29      0.33      0.31      1681
          10       0.49      0.34      0.40      4628
          11       0.59      0.55      0.57      1747
          12       0.79      0.36      0.49      1745
          13       0.50      0.35      0.41      4646
          14       0.35      0.01      0.03     13078

    accuracy                           0.29     85300
   macro avg       0.52      0.43  


For test set

Classification Report:                precision    recall  f1-score   support

           0       0.44      0.46      0.45      2879
           1       0.51      0.51      0.51      1515
           2       0.91      0.54      0.68      9085
           3       0.38      0.70      0.50      1190
           4       0.62      0.50      0.56     11428
           5       0.74      0.61      0.67      1784
           6       0.40      0.40      0.40      1129
           7       0.08      0.79      0.14      4915
           8       0.69      0.01      0.01     23580
           9       0.32      0.33      0.33      1788
          10       0.47      0.33      0.38      4778
          11       0.59      0.57      0.58      1722
          12       0.79      0.37      0.50      1705
          13       0.49      0.34      0.40      4595
          14       0.42      0.02      0.03     13207

    accuracy                           0.29     85300
   macro avg       0.52      0.43      0.


For training set



Classification Report:                precision    recall  f1-score   support

           0       0.66      0.26      0.37      8634
           1       0.74      0.34      0.47      4511
           2       0.94      0.58      0.72     27043
           3       0.91      0.04      0.07      3519
           4       0.70      0.64      0.67     33953
           5       0.93      0.47      0.63      5190
           6       0.87      0.14      0.24      3466
           7       0.91      0.03      0.07     14304
           8       0.39      0.86      0.54     69027
           9       0.83      0.17      0.28      5056
          10       0.65      0.32      0.43     13918
          11       0.82      0.53      0.64      5112
          12       0.93      0.35      0.51      5160
          13       0.72      0.30      0.43     13464
          14       0.67      0.37      0.48     39113

    accuracy                           0.53    251470
   macro avg       0.78      0.36      0.44    251470
we


For validation set

Classification Report:                precision    recall  f1-score   support

           0       0.66      0.25      0.37      2957
           1       0.70      0.33      0.45      1539
           2       0.91      0.58      0.71      9189
           3       0.71      0.02      0.04      1166
           4       0.68      0.62      0.64     11583
           5       0.88      0.43      0.58      1792
           6       0.55      0.08      0.14      1185
           7       0.45      0.01      0.02      4869
           8       0.38      0.83      0.52     23495
           9       0.61      0.12      0.21      1681
          10       0.62      0.30      0.41      4628
          11       0.74      0.50      0.60      1747
          12       0.88      0.32      0.47      1745
          13       0.66      0.29      0.40      4646
          14       0.58      0.33      0.42     13078

    accuracy                           0.50     85300
   macro avg       0.67      0.33  


For test set

Classification Report:                precision    recall  f1-score   support

           0       0.65      0.27      0.38      2879
           1       0.70      0.34      0.46      1515
           2       0.91      0.57      0.70      9085
           3       0.53      0.02      0.03      1190
           4       0.67      0.62      0.64     11428
           5       0.87      0.44      0.59      1784
           6       0.60      0.08      0.15      1129
           7       0.48      0.01      0.02      4915
           8       0.38      0.84      0.52     23580
           9       0.62      0.12      0.21      1788
          10       0.58      0.29      0.38      4778
          11       0.75      0.52      0.61      1722
          12       0.85      0.33      0.47      1705
          13       0.66      0.28      0.39      4595
          14       0.59      0.32      0.42     13207

    accuracy                           0.50     85300
   macro avg       0.66      0.34      0.


For training set



Classification Report:                precision    recall  f1-score   support

           0       0.72      0.59      0.65     70256
           1       0.90      0.67      0.77     70256
           2       0.91      0.62      0.74     70256
           3       0.85      0.75      0.79     70256
           4       0.48      0.58      0.52     70256
           5       0.96      0.75      0.84     70256
           6       0.87      0.66      0.75     70256
           7       0.43      0.07      0.12     70256
           8       0.53      0.26      0.35     70256
           9       0.81      0.53      0.64     70256
          10       0.75      0.45      0.56     70256
          11       0.91      0.74      0.82     70256
          12       0.97      0.50      0.66     70256
          13       0.72      0.49      0.58     70256
          14       0.15      0.77      0.25     70256

    accuracy                           0.56   1053840
   macro avg       0.73      0.56      0.60   1053840
we


For validation set

Classification Report:                precision    recall  f1-score   support

           0       0.45      0.46      0.46      2957
           1       0.56      0.44      0.49      1539
           2       0.91      0.52      0.66      9189
           3       0.42      0.63      0.51      1166
           4       0.67      0.52      0.59     11583
           5       0.78      0.59      0.67      1792
           6       0.38      0.45      0.42      1185
           7       0.20      0.02      0.03      4869
           8       0.74      0.25      0.38     23495
           9       0.35      0.30      0.32      1681
          10       0.51      0.34      0.41      4628
          11       0.62      0.54      0.58      1747
          12       0.81      0.35      0.49      1745
          13       0.51      0.35      0.41      4646
          14       0.22      0.75      0.34     13078

    accuracy                           0.43     85300
   macro avg       0.54      0.43  


For test set

Classification Report:                precision    recall  f1-score   support

           0       0.43      0.46      0.45      2879
           1       0.57      0.43      0.49      1515
           2       0.91      0.51      0.66      9085
           3       0.41      0.62      0.49      1190
           4       0.68      0.53      0.59     11428
           5       0.79      0.60      0.68      1784
           6       0.38      0.42      0.40      1129
           7       0.20      0.02      0.03      4915
           8       0.74      0.25      0.38     23580
           9       0.36      0.29      0.32      1788
          10       0.48      0.33      0.39      4778
          11       0.62      0.57      0.59      1722
          12       0.78      0.36      0.50      1705
          13       0.50      0.33      0.40      4595
          14       0.22      0.75      0.34     13207

    accuracy                           0.42     85300
   macro avg       0.54      0.43      0.


For training set



Classification Report:                precision    recall  f1-score   support

           0       0.67      0.44      0.53     70256
           1       0.88      0.54      0.67     70256
           2       0.92      0.52      0.66     70256
           3       0.76      0.72      0.74     70256
           4       0.49      0.51      0.50     70256
           5       0.96      0.62      0.76     70256
           6       0.87      0.51      0.64     70256
           7       0.11      0.75      0.20     70256
           8       0.43      0.27      0.33     70256
           9       0.78      0.36      0.50     70256
          10       0.74      0.34      0.47     70256
          11       0.91      0.56      0.69     70256
          12       0.98      0.38      0.55     70256
          13       0.72      0.34      0.47     70256
          14       0.55      0.17      0.26     70256

    accuracy                           0.47   1053840
   macro avg       0.72      0.47      0.53   1053840
we


For validation set

Classification Report:                precision    recall  f1-score   support

           0       0.47      0.45      0.46      2957
           1       0.55      0.50      0.53      1539
           2       0.91      0.52      0.66      9189
           3       0.40      0.69      0.51      1166
           4       0.68      0.50      0.58     11583
           5       0.80      0.58      0.68      1792
           6       0.42      0.45      0.44      1185
           7       0.09      0.74      0.16      4869
           8       0.72      0.26      0.39     23495
           9       0.34      0.32      0.33      1681
          10       0.54      0.34      0.41      4628
          11       0.63      0.54      0.58      1747
          12       0.81      0.36      0.49      1745
          13       0.53      0.34      0.42      4646
          14       0.60      0.16      0.26     13078

    accuracy                           0.38     85300
   macro avg       0.56      0.45  


For test set

Classification Report:                precision    recall  f1-score   support

           0       0.45      0.46      0.45      2879
           1       0.56      0.49      0.52      1515
           2       0.91      0.51      0.66      9085
           3       0.39      0.69      0.50      1190
           4       0.69      0.51      0.58     11428
           5       0.80      0.60      0.69      1784
           6       0.41      0.43      0.42      1129
           7       0.09      0.73      0.16      4915
           8       0.72      0.27      0.39     23580
           9       0.36      0.32      0.34      1788
          10       0.50      0.32      0.39      4778
          11       0.62      0.57      0.59      1722
          12       0.80      0.37      0.50      1705
          13       0.52      0.33      0.40      4595
          14       0.60      0.16      0.26     13207

    accuracy                           0.38     85300
   macro avg       0.56      0.45      0.


For training set



Classification Report:                precision    recall  f1-score   support

           0       0.73      0.59      0.65     70256
           1       0.90      0.66      0.76     70256
           2       0.91      0.62      0.74     70256
           3       0.84      0.75      0.79     70256
           4       0.49      0.58      0.53     70256
           5       0.96      0.75      0.84     70256
           6       0.88      0.60      0.71     70256
           7       0.13      0.70      0.23     70256
           8       0.53      0.26      0.35     70256
           9       0.80      0.54      0.64     70256
          10       0.74      0.45      0.56     70256
          11       0.91      0.74      0.82     70256
          12       0.97      0.50      0.66     70256
          13       0.72      0.49      0.59     70256
          14       0.56      0.16      0.25     70256

    accuracy                           0.56   1053840
   macro avg       0.74      0.56      0.61   1053840
we


For validation set

Classification Report:                precision    recall  f1-score   support

           0       0.45      0.46      0.46      2957
           1       0.58      0.43      0.49      1539
           2       0.91      0.52      0.66      9189
           3       0.42      0.64      0.51      1166
           4       0.67      0.51      0.58     11583
           5       0.79      0.59      0.68      1792
           6       0.38      0.38      0.38      1185
           7       0.09      0.75      0.16      4869
           8       0.74      0.25      0.38     23495
           9       0.34      0.31      0.32      1681
          10       0.51      0.34      0.41      4628
          11       0.63      0.55      0.59      1747
          12       0.81      0.35      0.49      1745
          13       0.51      0.35      0.41      4646
          14       0.61      0.16      0.25     13078

    accuracy                           0.38     85300
   macro avg       0.56      0.44  


For test set

Classification Report:                precision    recall  f1-score   support

           0       0.43      0.47      0.45      2879
           1       0.58      0.42      0.49      1515
           2       0.91      0.51      0.66      9085
           3       0.40      0.63      0.49      1190
           4       0.68      0.52      0.59     11428
           5       0.79      0.60      0.68      1784
           6       0.39      0.36      0.38      1129
           7       0.09      0.75      0.16      4915
           8       0.74      0.26      0.38     23580
           9       0.34      0.30      0.32      1788
          10       0.48      0.33      0.39      4778
          11       0.63      0.57      0.60      1722
          12       0.79      0.36      0.50      1705
          13       0.50      0.34      0.40      4595
          14       0.62      0.15      0.25     13207

    accuracy                           0.37     85300
   macro avg       0.56      0.44      0.