In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from collections import Counter
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense
from keras.models import Sequential
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

In [2]:
def get_metrics(y_true, y_pred):
    result1 = classification_report(y_true, y_pred)
    print('Classification Report: ', result1)
    result2 = accuracy_score(y_true, y_pred)
    print('Accuracy: ', result2, "\n\n")

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home2/tgv2002/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
 stop_words = set(stopwords.words('english'))

In [5]:
def clean_post(post):
    post = post.lower()
    post = re.sub(r"\n", " ", post)
    post = re.sub("[\<\[].*?[\>\]]", " ", post)
    post = re.sub(r"[^a-z ]", " ", post)
    post = re.sub(r"\b\w{1,3}\b", " ", post)
    return " ".join([x for x in post.split() if x not in stop_words])

In [6]:
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek

def balance_data(x, y, _type):
    if _type == 0:
        ros = RandomOverSampler(random_state=42)
        return ros.fit_resample(x, y)
    elif _type == 1:
        rus = RandomUnderSampler(random_state=42, replacement=True)
        return rus.fit_resample(x, y)
    elif _type == 2:
        smote = SMOTE()
        return smote.fit_resample(x, y)
    elif _type == 3:
        nm = NearMiss()
        return nm.fit_resample(x, y)
    elif _type == 4:
        smt = SMOTETomek(ratio='auto')
        return smt.fit_resample(x, y)
    elif _type == 5:
        cc = ClusterCentroids()
        return cc.fit_resample(x, y)
    elif _type == 6:
        tl = TomekLinks()
        return tl.fit_resample(x, y)

    smote = SMOTE()
    return smote.fit_resample(x, y)

In [7]:
data = pd.read_csv('/home2/tgv2002/reddit_mental_health_dataset/reddit_dataset.csv')
data = shuffle(data)

# Class split stats
print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())
x = data['post'].apply(lambda post: clean_post(post))

# Vectorizing text data
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(x)
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X_counts)

                mental_disorder                              
                          count unique            top    freq
mental_disorder                                              
EDAnonymous               14577      1    EDAnonymous   14577
addiction                  7641      1      addiction    7641
adhd                      45631      1           adhd   45631
alcoholism                 5911      1     alcoholism    5911
anxiety                   57671      1        anxiety   57671
autism                     8869      1         autism    8869
bipolarreddit              5780      1  bipolarreddit    5780
bpd                       24294      1            bpd   24294
depression               117331      1     depression  117331
healthanxiety              8648      1  healthanxiety    8648
lonely                    23635      1         lonely   23635
ptsd                       8643      1           ptsd    8643
schizophrenia              8712      1  schizophrenia    8712
socialan

In [8]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(np.array(data['mental_disorder']))
# y = to_categorical(y1)

# 60-20-20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=321)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=321)

In [9]:
import gc

X_tr, y_tr = X_train, y_train

for _type in [1, 3, 6, -1, 0, 2]:
    print('#'*110)
    print()
    if _type == -1:
        print('Without any undersampling/oversampling')
    else:
        print(f'With sampling type: {_type}')
    model = DecisionTreeClassifier(max_depth=15)
    X_train, y_train = balance_data(X_tr, y_tr, _type)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    print()
    print("For training set")
    print()
    get_metrics(y_train, y_train_pred)

    y_valid_pred = model.predict(X_valid)
    print()
    print("For validation set")
    print()
    get_metrics(y_valid, y_valid_pred)

    y_test_pred = model.predict(X_test)
    print()
    print("For test set")
    print()
    get_metrics(y_test, y_test_pred)

##############################################################################################################

With sampling type: 1



For training set

Classification Report:                precision    recall  f1-score   support

           0       0.71      0.47      0.56      3494
           1       0.88      0.54      0.67      3494
           2       0.94      0.52      0.67      3494
           3       0.78      0.56      0.65      3494
           4       0.49      0.55      0.52      3494
           5       0.97      0.62      0.76      3494
           6       0.89      0.39      0.54      3494
           7       0.11      0.78      0.20      3494
           8       0.46      0.28      0.35      3494
           9       0.87      0.30      0.45      3494
          10       0.76      0.35      0.48      3494
          11       0.92      0.57      0.70      3494
          12       0.98      0.40      0.56      3494
          13       0.74      0.33      0.46      3494
          14       0.59      0.22      0.32      3494

    accuracy                           0.46     52410
   macro avg       0.74      0.46    

Classification Report:                precision    recall  f1-score   support

           0       0.45      0.42      0.44      2887
           1       0.51      0.46      0.48      1528
           2       0.92      0.49      0.64      9116
           3       0.37      0.54      0.44      1215
           4       0.68      0.50      0.57     11428
           5       0.76      0.59      0.66      1807
           6       0.38      0.34      0.36      1160
           7       0.08      0.75      0.15      4805
           8       0.72      0.24      0.36     23672
           9       0.42      0.26      0.32      1740
          10       0.50      0.33      0.40      4763
          11       0.61      0.53      0.56      1788
          12       0.77      0.36      0.49      1743
          13       0.54      0.32      0.40      4599
          14       0.55      0.18      0.27     13049

    accuracy                           0.36     85300
   macro avg       0.55      0.42      0.44     85300
we

Classification Report:                precision    recall  f1-score   support

           0       0.46      0.44      0.45      2884
           1       0.51      0.46      0.49      1549
           2       0.93      0.50      0.65      9053
           3       0.34      0.50      0.41      1165
           4       0.69      0.50      0.58     11618
           5       0.76      0.59      0.67      1824
           6       0.37      0.32      0.34      1126
           7       0.09      0.76      0.16      4955
           8       0.72      0.24      0.36     23335
           9       0.38      0.27      0.32      1652
          10       0.49      0.33      0.39      4797
          11       0.57      0.52      0.54      1674
          12       0.77      0.34      0.47      1744
          13       0.54      0.31      0.40      4526
          14       0.54      0.18      0.27     13398

    accuracy                           0.36     85300
   macro avg       0.54      0.42      0.43     85300
we


For training set

Classification Report:                precision    recall  f1-score   support

           0       0.76      0.59      0.67      3494
           1       0.90      0.55      0.68      3494
           2       0.93      0.55      0.69      3494
           3       0.76      0.66      0.71      3494
           4       0.50      0.55      0.52      3494
           5       0.96      0.64      0.76      3494
           6       0.78      0.38      0.51      3494
           7       0.11      0.80      0.20      3494
           8       0.33      0.06      0.11      3494
           9       0.85      0.35      0.50      3494
          10       0.78      0.34      0.47      3494
          11       0.93      0.61      0.74      3494
          12       0.97      0.38      0.55      3494
          13       0.82      0.37      0.51      3494
          14       0.49      0.17      0.26      3494

    accuracy                           0.47     52410
   macro avg       0.72      0.47    

Classification Report:                precision    recall  f1-score   support

           0       0.44      0.43      0.43      2887
           1       0.48      0.49      0.48      1528
           2       0.93      0.49      0.64      9116
           3       0.37      0.62      0.47      1215
           4       0.67      0.48      0.56     11428
           5       0.66      0.60      0.63      1807
           6       0.38      0.33      0.35      1160
           7       0.07      0.80      0.14      4805
           8       0.55      0.04      0.08     23672
           9       0.34      0.30      0.32      1740
          10       0.51      0.32      0.39      4763
          11       0.62      0.53      0.57      1788
          12       0.76      0.35      0.48      1743
          13       0.51      0.33      0.40      4599
          14       0.39      0.02      0.03     13049

    accuracy                           0.29     85300
   macro avg       0.51      0.41      0.40     85300
we


For training set



Classification Report:                precision    recall  f1-score   support

           0       0.65      0.27      0.38      8697
           1       0.73      0.36      0.48      4478
           2       0.95      0.58      0.72     27165
           3       0.66      0.07      0.13      3486
           4       0.71      0.63      0.67     33917
           5       0.94      0.47      0.63      5153
           6       0.81      0.14      0.23      3494
           7       0.54      0.05      0.10     14339
           8       0.39      0.87      0.54     69165
           9       0.76      0.18      0.30      5137
          10       0.66      0.30      0.42     13755
          11       0.84      0.53      0.65      5102
          12       0.93      0.35      0.51      5123
          13       0.72      0.30      0.42     13572
          14       0.67      0.36      0.47     38983

    accuracy                           0.53    251566
   macro avg       0.73      0.36      0.44    251566
we


For test set

Classification Report:                precision    recall  f1-score   support

           0       0.64      0.25      0.36      2884
           1       0.70      0.32      0.44      1549
           2       0.92      0.57      0.70      9053
           3       0.48      0.05      0.09      1165
           4       0.68      0.60      0.64     11618
           5       0.86      0.43      0.58      1824
           6       0.48      0.07      0.12      1126
           7       0.32      0.03      0.05      4955
           8       0.37      0.83      0.51     23335
           9       0.54      0.15      0.24      1652
          10       0.60      0.27      0.37      4797
          11       0.73      0.47      0.57      1674
          12       0.86      0.30      0.45      1744
          13       0.65      0.26      0.38      4526
          14       0.57      0.31      0.40     13398

    accuracy                           0.49     85300
   macro avg       0.63      0.33      0.


For training set



Classification Report:                precision    recall  f1-score   support

           0       0.71      0.59      0.64     70324
           1       0.90      0.65      0.75     70324
           2       0.91      0.62      0.74     70324
           3       0.80      0.79      0.80     70324
           4       0.49      0.56      0.52     70324
           5       0.97      0.75      0.84     70324
           6       0.87      0.59      0.70     70324
           7       0.14      0.70      0.23     70324
           8       0.55      0.25      0.34     70324
           9       0.82      0.51      0.63     70324
          10       0.77      0.43      0.55     70324
          11       0.92      0.76      0.83     70324
          12       0.97      0.50      0.66     70324
          13       0.70      0.50      0.59     70324
          14       0.55      0.17      0.26     70324

    accuracy                           0.56   1054860
   macro avg       0.74      0.56      0.61   1054860
we

Classification Report:                precision    recall  f1-score   support

           0       0.40      0.44      0.42      2887
           1       0.56      0.40      0.47      1528
           2       0.92      0.51      0.66      9116
           3       0.39      0.63      0.48      1215
           4       0.68      0.51      0.58     11428
           5       0.82      0.59      0.68      1807
           6       0.39      0.38      0.38      1160
           7       0.09      0.74      0.15      4805
           8       0.73      0.24      0.36     23672
           9       0.37      0.29      0.33      1740
          10       0.52      0.33      0.41      4763
          11       0.63      0.53      0.58      1788
          12       0.81      0.36      0.49      1743
          13       0.50      0.35      0.41      4599
          14       0.61      0.16      0.25     13049

    accuracy                           0.37     85300
   macro avg       0.56      0.43      0.44     85300
we


For training set



Classification Report:                precision    recall  f1-score   support

           0       0.69      0.45      0.54     70324
           1       0.88      0.47      0.61     70324
           2       0.91      0.52      0.66     70324
           3       0.75      0.66      0.70     70324
           4       0.47      0.52      0.50     70324
           5       0.97      0.62      0.76     70324
           6       0.88      0.43      0.58     70324
           7       0.11      0.76      0.19     70324
           8       0.43      0.26      0.33     70324
           9       0.81      0.32      0.46     70324
          10       0.73      0.34      0.46     70324
          11       0.91      0.57      0.70     70324
          12       0.97      0.39      0.55     70324
          13       0.71      0.33      0.45     70324
          14       0.56      0.17      0.26     70324

    accuracy                           0.45   1054860
   macro avg       0.72      0.45      0.52   1054860
we

Classification Report:                precision    recall  f1-score   support

           0       0.46      0.42      0.44      2887
           1       0.53      0.39      0.45      1528
           2       0.90      0.52      0.66      9116
           3       0.38      0.62      0.47      1215
           4       0.68      0.51      0.58     11428
           5       0.82      0.59      0.69      1807
           6       0.41      0.37      0.39      1160
           7       0.08      0.74      0.15      4805
           8       0.71      0.26      0.38     23672
           9       0.42      0.30      0.35      1740
          10       0.52      0.34      0.41      4763
          11       0.63      0.53      0.57      1788
          12       0.80      0.36      0.50      1743
          13       0.53      0.33      0.40      4599
          14       0.60      0.17      0.26     13049

    accuracy                           0.37     85300
   macro avg       0.56      0.43      0.45     85300
we


For training set



Classification Report:                precision    recall  f1-score   support

           0       0.70      0.59      0.64     70324
           1       0.89      0.64      0.75     70324
           2       0.91      0.62      0.74     70324
           3       0.81      0.76      0.78     70324
           4       0.49      0.57      0.53     70324
           5       0.97      0.75      0.84     70324
           6       0.86      0.59      0.70     70324
           7       0.13      0.69      0.22     70324
           8       0.54      0.25      0.34     70324
           9       0.81      0.52      0.63     70324
          10       0.75      0.43      0.55     70324
          11       0.92      0.76      0.83     70324
          12       0.97      0.51      0.67     70324
          13       0.71      0.49      0.58     70324
          14       0.55      0.17      0.26     70324

    accuracy                           0.56   1054860
   macro avg       0.73      0.56      0.60   1054860
we

Classification Report:                precision    recall  f1-score   support

           0       0.40      0.44      0.42      2887
           1       0.55      0.40      0.46      1528
           2       0.91      0.51      0.66      9116
           3       0.40      0.63      0.49      1215
           4       0.68      0.51      0.58     11428
           5       0.81      0.59      0.68      1807
           6       0.37      0.38      0.38      1160
           7       0.09      0.74      0.15      4805
           8       0.73      0.24      0.36     23672
           9       0.37      0.29      0.33      1740
          10       0.52      0.33      0.40      4763
          11       0.63      0.54      0.58      1788
          12       0.81      0.36      0.50      1743
          13       0.51      0.35      0.41      4599
          14       0.60      0.16      0.25     13049

    accuracy                           0.37     85300
   macro avg       0.56      0.43      0.44     85300
we