In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from collections import Counter
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense
from keras.models import Sequential
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

In [2]:
def get_metrics(y_true, y_pred):
    """Print the classification report and the accuracy for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Nothing is returned; both metrics are written to stdout, the accuracy
    line being followed by blank lines as a visual separator.
    """
    # Report first, accuracy second -- same order as the printed output expects.
    print('Classification Report: ', classification_report(y_true, y_pred))
    print('Accuracy: ', accuracy_score(y_true, y_pred), "\n\n")

In [3]:
# Make sure the NLTK 'stopwords' corpus is available locally; it is read
# when the English stop-word set is built for text cleaning.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home2/tgv2002/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
 stop_words = set(stopwords.words('english'))  # English stop words as a set; clean_post drops tokens found here

In [5]:
def clean_post(post, stopword_set=None):
    """Normalise one raw post into a space-separated string of kept tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace newlines with spaces;
      3. drop anything enclosed in ``<...>`` or ``[...]`` (shortest match);
      4. replace every character that is not ``a-z`` or a space with a space;
      5. drop words of one to three characters;
      6. drop tokens that appear in the stop-word set.

    Parameters
    ----------
    post : str
        Raw post text. Any non-string value (e.g. the float NaN pandas uses
        for a missing cell, or ``None``) is treated as an empty post.
    stopword_set : collection of str, optional
        Tokens to discard in the final step. Defaults to the module-level
        ``stop_words`` set.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (possibly ``""``).
    """
    # Missing values reach this function as non-strings when applied over a
    # DataFrame column; return an empty document instead of crashing on .lower().
    if not isinstance(post, str):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    post = post.lower()
    post = re.sub(r"\n", " ", post)
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences (\<, \>, ...), which newer Python versions warn about.
    # The compiled pattern is unchanged.
    post = re.sub(r"[<\[].*?[>\]]", " ", post)
    post = re.sub(r"[^a-z ]", " ", post)
    post = re.sub(r"\b\w{1,3}\b", " ", post)
    return " ".join(word for word in post.split() if word not in stopword_set)

In [6]:
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek

def balance_data(x, y, _type):
    """Resample ``(x, y)`` with the class-balancing strategy chosen by ``_type``.

    Strategy codes
    --------------
    -1 : no resampling -- ``x`` and ``y`` are returned unchanged
     0 : RandomOverSampler
     1 : RandomUnderSampler (sampling with replacement)
     2 : SMOTE
     3 : NearMiss
     4 : SMOTETomek
     5 : ClusterCentroids
     6 : TomekLinks

    Any other value falls back to SMOTE.

    Parameters
    ----------
    x : array-like or sparse matrix
        Feature matrix.
    y : array-like
        Class labels aligned with the rows of ``x``.
    _type : int
        One of the strategy codes above.

    Returns
    -------
    tuple
        ``(x_resampled, y_resampled)``.
    """
    # -1 is the "without any undersampling/oversampling" baseline used by the
    # experiment loop; it previously fell through to the SMOTE fallback.
    if _type == -1:
        return x, y

    seed = 42  # same seed for every stochastic sampler so runs are repeatable

    # Factories are lazy so only the selected sampler is ever constructed.
    factories = {
        0: lambda: RandomOverSampler(random_state=seed),
        1: lambda: RandomUnderSampler(random_state=seed, replacement=True),
        2: lambda: SMOTE(random_state=seed),
        3: lambda: NearMiss(),
        # `ratio` was replaced by `sampling_strategy` in imbalanced-learn;
        # passing `ratio=` raises TypeError on current releases.
        4: lambda: SMOTETomek(sampling_strategy='auto', random_state=seed),
        5: lambda: ClusterCentroids(random_state=seed),
        6: lambda: TomekLinks(),
    }
    make_sampler = factories.get(_type, factories[2])
    return make_sampler().fit_resample(x, y)

In [7]:
# NOTE(review): absolute, user-specific path -- consider a configurable DATA_DIR.
data = pd.read_csv('/home2/tgv2002/reddit_mental_health_dataset/reddit_dataset.csv')
# Seed the shuffle: the train/validation/test split performed later uses a fixed
# random_state, which only yields the same partitions if the row order it
# receives is itself reproducible.
data = shuffle(data, random_state=42)

# Class split stats
print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())
x = data['post'].apply(clean_post)

# Vectorizing text data: raw term counts first, then TF-IDF re-weighting.
# NOTE(review): vocabulary and IDF weights are fitted on the whole corpus, i.e.
# before the data is split, so held-out posts influence the features.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(x)
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X_counts)

                mental_disorder                              
                          count unique            top    freq
mental_disorder                                              
EDAnonymous               14577      1    EDAnonymous   14577
addiction                  7641      1      addiction    7641
adhd                      45631      1           adhd   45631
alcoholism                 5911      1     alcoholism    5911
anxiety                   57671      1        anxiety   57671
autism                     8869      1         autism    8869
bipolarreddit              5780      1  bipolarreddit    5780
bpd                       24294      1            bpd   24294
depression               117331      1     depression  117331
healthanxiety              8648      1  healthanxiety    8648
lonely                    23635      1         lonely   23635
ptsd                       8643      1           ptsd    8643
schizophrenia              8712      1  schizophrenia    8712
socialan

In [8]:
# Map the mental_disorder labels to integer class ids (kept as plain integers,
# not one-hot encoded).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['mental_disorder'].to_numpy())

# 60-20-20 split: hold back 40% of the rows first, then cut that remainder
# in half to obtain the test and validation sets.
X_train, X_rest, y_train, y_rest = train_test_split(
    X, y, test_size=0.4, random_state=321
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_rest, y_rest, test_size=0.5, random_state=321
)

In [9]:
import gc

# Reference to the original (un-resampled) training split. The loop below never
# rebinds X_train / y_train, so this cell can be re-run without the baseline
# silently becoming the previous run's resampled data.
X_tr, y_tr = X_train, y_train

for _type in [1, 3, 6, -1, 0, 2]:
    print('#'*110)
    print()
    if _type == -1:
        print('Without any undersampling/oversampling')
        # Baseline: train on the split exactly as it is, with no resampler.
        X_bal, y_bal = X_tr, y_tr
    else:
        print(f'With sampling type: {_type}')
        X_bal, y_bal = balance_data(X_tr, y_tr, _type)

    # The wrapped estimator is passed positionally: its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn, while the first
    # positional slot is accepted by both old and new releases.
    model = BaggingClassifier(DecisionTreeClassifier(max_depth=15), random_state=42)
    model.fit(X_bal, y_bal)

    # Same report for each split; training metrics are computed on the
    # (possibly resampled) data the model was actually fitted on.
    for split_name, X_split, y_split in (
        ("training", X_bal, y_bal),
        ("validation", X_valid, y_valid),
        ("test", X_test, y_test),
    ):
        print()
        print(f"For {split_name} set")
        print()
        get_metrics(y_split, model.predict(X_split))

    # Release the resampled matrices before the next sampler allocates new ones.
    del X_bal, y_bal
    gc.collect()

##############################################################################################################

With sampling type: 1

For training set

Classification Report:                precision    recall  f1-score   support

           0       0.88      0.94      0.91      3511
           1       0.91      0.91      0.91      3511
           2       0.89      0.90      0.89      3511
           3       0.92      0.94      0.93      3511
           4       0.76      0.71      0.73      3511
           5       0.94      0.90      0.92      3511
           6       0.87      0.77      0.81      3511
           7       0.73      0.72      0.73      3511
           8       0.68      0.61      0.64      3511
           9       0.88      0.92      0.90      3511
          10       0.74      0.83      0.78      3511
          11       0.91      0.89      0.90      3511
          12       0.91      0.88      0.89      3511
          13       0.78      0.81      0.79      3511
          14


For test set

Classification Report:                precision    recall  f1-score   support

           0       0.85      0.79      0.82      2915
           1       0.81      0.68      0.74      1450
           2       0.85      0.86      0.86      9083
           3       0.79      0.73      0.76      1191
           4       0.70      0.73      0.71     11564
           5       0.90      0.69      0.78      1828
           6       0.75      0.45      0.56      1182
           7       0.63      0.49      0.55      4877
           8       0.59      0.74      0.66     23400
           9       0.76      0.57      0.65      1740
          10       0.63      0.51      0.57      4720
          11       0.81      0.63      0.71      1748
          12       0.84      0.61      0.70      1680
          13       0.63      0.55      0.59      4573
          14       0.64      0.58      0.61     13349

    accuracy                           0.68     85300
   macro avg       0.75      0.64      0.


For validation set

Classification Report:                precision    recall  f1-score   support

           0       0.78      0.85      0.81      2923
           1       0.68      0.78      0.73      1525
           2       0.87      0.84      0.86      9280
           3       0.71      0.81      0.75      1159
           4       0.75      0.63      0.68     11539
           5       0.64      0.80      0.71      1762
           6       0.39      0.60      0.47      1087
           7       0.46      0.58      0.51      4842
           8       0.71      0.51      0.60     23492
           9       0.56      0.78      0.65      1719
          10       0.49      0.68      0.57      4753
          11       0.60      0.75      0.67      1694
          12       0.57      0.75      0.64      1794
          13       0.53      0.66      0.59      4614
          14       0.57      0.62      0.59     13117

    accuracy                           0.64     85300
   macro avg       0.62      0.71  