In [1]:
# Required imports
import matplotlib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, accuracy_score
import re
# from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
# English stop words held as a set; clean_post filters tokens against it.
# NOTE(review): stopwords.words() presumably needs the NLTK "stopwords" corpus
# to be available locally already — confirm for fresh environments.
stop_words = set(stopwords.words('english'))

from numpy import loadtxt
from xgboost import XGBClassifier

In [2]:
# cleaning data
def clean_post(post, stopword_set=None):
    """Normalise a raw post into a space-separated string of content words.

    The steps run in this order:

    1. lower-case the text;
    2. replace newlines with spaces;
    3. replace every span that opens with ``<`` or ``[`` and closes with the
       nearest ``>`` or ``]`` by a space;
    4. replace every character other than ``a-z`` and space by a space;
    5. replace whole words of one to three characters by a space;
    6. split on whitespace and drop tokens found in the stop-word set.

    Parameters
    ----------
    post : str
        Raw post text.
    stopword_set : container of str, optional
        Tokens to discard in the final step. When omitted, the module-level
        ``stop_words`` set is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``""`` if none remain).
    """
    if stopword_set is None:
        stopword_set = stop_words

    post = post.lower()
    post = re.sub(r"\n", " ", post)
    # Raw string on purpose: in a normal string literal "\<", "\[", "\>" and
    # "\]" are invalid escape sequences, which newer Python versions warn about.
    post = re.sub(r"[<\[].*?[>\]]", " ", post)
    post = re.sub(r"[^a-z ]", " ", post)
    post = re.sub(r"\b\w{1,3}\b", " ", post)
    return " ".join(word for word in post.split() if word not in stopword_set)

In [3]:
# Different techniques for tackling class imbalance
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE

def balance_data(x, y, _type, random_state=42):
    """Resample ``(x, y)`` with the class-balancing strategy chosen by ``_type``.

    Strategy codes:

    * ``0`` -- ``RandomOverSampler``
    * ``1`` -- ``RandomUnderSampler`` (sampling with replacement)
    * ``2`` -- ``SMOTE``
    * ``3`` -- ``NearMiss``
    * ``6`` -- ``TomekLinks``

    Any other code leaves the data as it is.

    Parameters
    ----------
    x, y
        Features and labels, passed unchanged to the sampler's
        ``fit_resample``.
    _type : int
        Strategy code from the list above.
    random_state : int, optional
        Seed given to the samplers that accept one here (codes 0, 1 and 2).
        Defaults to 42.

    Returns
    -------
    tuple
        ``(x_resampled, y_resampled)``, or the original ``(x, y)`` objects
        when ``_type`` is not a known code.

    Notes
    -----
    Another technique is penalizing the algorithm with
    ``class_weight='balanced'`` and using stratified cross validation.
    """
    # Each entry is a zero-argument factory so that only the requested sampler
    # is ever constructed.
    sampler_factories = {
        0: lambda: RandomOverSampler(random_state=random_state),
        1: lambda: RandomUnderSampler(random_state=random_state, replacement=True),
        # Seeded like the other randomised samplers so that the synthetic
        # samples are reproducible from run to run.
        2: lambda: SMOTE(random_state=random_state),
        3: lambda: NearMiss(),
        6: lambda: TomekLinks(),
    }

    make_sampler = sampler_factories.get(_type)
    if make_sampler is None:
        return x, y
    return make_sampler().fit_resample(x, y)

In [4]:
# Load data
data = pd.read_csv('../reddit_dataset.csv')
# Seed the shuffle: the row order feeds the seeded train_test_split in the
# next cell, so an unseeded shuffle would still change the split on every run.
data = shuffle(data, random_state=42)

# Class split stats
print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())

# Clean every post before vectorizing
x = data['post'].apply(clean_post)

# Vectorizing text data: raw token counts first, then TF-IDF weighting
# NOTE(review): both transformers are fitted on the full corpus, i.e. before
# the train/validation/test split in the next cell, so the vocabulary and IDF
# weights are computed from held-out posts as well. Consider fitting them on
# the training split only.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(x)
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X_counts)

                mental_disorder                              
                          count unique            top    freq
mental_disorder                                              
EDAnonymous               14577      1    EDAnonymous   14577
addiction                  7641      1      addiction    7641
adhd                      45631      1           adhd   45631
alcoholism                 5911      1     alcoholism    5911
anxiety                   57671      1        anxiety   57671
autism                     8869      1         autism    8869
bipolarreddit              5780      1  bipolarreddit    5780
bpd                       24294      1            bpd   24294
depression               117331      1     depression  117331
healthanxiety              8648      1  healthanxiety    8648
lonely                    23635      1         lonely   23635
ptsd                       8643      1           ptsd    8643
schizophrenia              8712      1  schizophrenia    8712
socialan

In [5]:
# Encode the string labels in `mental_disorder` as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['mental_disorder'].to_numpy())

# 60-20-20 split: hold out 40% of the rows, then cut the hold-out in half to
# obtain the test and validation sets
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=321
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=321
)
# The hold-out intermediates are not needed once they have been split
del X_holdout, y_holdout

In [6]:
def get_metrics(y_true, y_pred):
    """Print the classification report and the accuracy of a set of predictions.

    Both arguments are handed unchanged to ``classification_report`` and
    ``accuracy_score``: ``y_true`` as the reference labels and ``y_pred`` as
    the predicted labels. Nothing is returned; the results are only printed.
    """
    print('Classification Report: ', classification_report(y_true, y_pred))
    print('Accuracy: ', accuracy_score(y_true, y_pred), "\n\n")

In [7]:

import gc

# Train one XGBoost model per resampling strategy (under-sampling,
# over-sampling and no resampling) and print its metrics on every split.
# The codes are the ones balance_data understands; -1 means "no resampling".
# NOTE(review): the test set is scored for every strategy; if a strategy is
# later picked from these numbers, the test metrics stop being an unbiased
# estimate. Consider selecting on the validation set only.
X_tr, y_tr = X_train, y_train


def _report_split(model, features, labels, split_name):
    """Predict ``features`` with ``model`` and print metrics under a heading."""
    predictions = model.predict(features)
    print()
    print(f"For {split_name} set")
    print()
    get_metrics(labels, predictions)


for _type in [1, 3, 6, -1, 0, 2]:
    print('#'*110)
    print()
    if _type == -1:
        print('Without any undersampling/oversampling')
    else:
        print(f'With sampling type: {_type}')
    print()
    print()
    model = XGBClassifier()
    # Keep the resampled data under its own names: rebinding (and later
    # deleting) X_train / y_train would make this cell fail when re-run and
    # would remove the training split for any later cell.
    X_bal, y_bal = balance_data(X_tr, y_tr, _type)
    model.fit(X_bal, y_bal)

    # Metrics on the (resampled) training data, then validation, then test
    _report_split(model, X_bal, y_bal, "training")
    _report_split(model, X_valid, y_valid, "validation")
    _report_split(model, X_test, y_test, "test")

    # Release the per-strategy objects before the next iteration
    del model, X_bal, y_bal
    gc.collect()

    print()
    print()
    print('#'*110)

##############################################################################################################

With sampling type: 1







For training set

Classification Report:                precision    recall  f1-score   support

           0       0.90      0.94      0.92      3451
           1       0.96      0.96      0.96      3451
           2       0.95      0.93      0.94      3451
           3       0.95      0.97      0.96      3451
           4       0.85      0.86      0.86      3451
           5       0.97      0.93      0.95      3451
           6       0.95      0.86      0.90      3451
           7       0.82      0.82      0.82      3451
           8       0.84      0.81      0.82      3451
           9       0.94      0.95      0.94      3451
          10       0.83      0.91      0.86      3451
          11       0.97      0.94      0.96      3451
          12       0.96      0.91      0.93      3451
          13       0.88      0.88      0.88      3451
          14       0.83      0.89      0.86      3451

    accuracy                           0.90     51765
   macro avg       0.91      0.90    




For training set

Classification Report:                precision    recall  f1-score   support

           0       0.93      0.92      0.93      3451
           1       0.96      0.95      0.95      3451
           2       0.95      0.94      0.95      3451
           3       0.95      0.96      0.95      3451
           4       0.85      0.86      0.86      3451
           5       0.98      0.92      0.95      3451
           6       0.90      0.85      0.88      3451
           7       0.77      0.78      0.78      3451
           8       0.79      0.75      0.77      3451
           9       0.95      0.93      0.94      3451
          10       0.80      0.90      0.85      3451
          11       0.96      0.93      0.95      3451
          12       0.94      0.88      0.91      3451
          13       0.88      0.87      0.87      3451
          14       0.75      0.87      0.81      3451

    accuracy                           0.89     51765
   macro avg       0.89      0.89    




For training set

Classification Report:                precision    recall  f1-score   support

           0       0.89      0.82      0.85      8648
           1       0.91      0.84      0.88      4573
           2       0.92      0.85      0.89     27226
           3       0.89      0.90      0.90      3471
           4       0.79      0.77      0.78     33928
           5       0.94      0.80      0.87      5202
           6       0.87      0.68      0.76      3451
           7       0.81      0.51      0.63     14315
           8       0.62      0.80      0.70     69020
           9       0.90      0.72      0.80      5073
          10       0.69      0.63      0.66     13825
          11       0.89      0.79      0.84      5143
          12       0.92      0.75      0.83      5145
          13       0.76      0.62      0.68     13632
          14       0.72      0.65      0.68     38934

    accuracy                           0.74    251586
   macro avg       0.84      0.74    




For training set

Classification Report:                precision    recall  f1-score   support

           0       0.89      0.82      0.85      8759
           1       0.91      0.84      0.88      4645
           2       0.92      0.85      0.88     27505
           3       0.89      0.89      0.89      3510
           4       0.79      0.77      0.78     34663
           5       0.94      0.80      0.86      5288
           6       0.87      0.68      0.76      3451
           7       0.81      0.51      0.63     14504
           8       0.62      0.80      0.70     70205
           9       0.89      0.71      0.79      5204
          10       0.69      0.63      0.66     14109
          11       0.89      0.79      0.84      5211
          12       0.92      0.75      0.83      5237
          13       0.76      0.62      0.68     13928
          14       0.72      0.65      0.68     39681

    accuracy                           0.74    255900
   macro avg       0.83      0.74    




For training set

Classification Report:                precision    recall  f1-score   support

           0       0.89      0.92      0.91     70205
           1       0.94      0.97      0.96     70205
           2       0.92      0.87      0.90     70205
           3       0.94      0.99      0.96     70205
           4       0.75      0.71      0.73     70205
           5       0.96      0.94      0.95     70205
           6       0.93      0.93      0.93     70205
           7       0.71      0.70      0.70     70205
           8       0.65      0.52      0.58     70205
           9       0.91      0.95      0.93     70205
          10       0.73      0.84      0.78     70205
          11       0.93      0.94      0.94     70205
          12       0.95      0.90      0.92     70205
          13       0.78      0.79      0.79     70205
          14       0.68      0.72      0.70     70205

    accuracy                           0.85   1053075
   macro avg       0.85      0.85    




For training set

Classification Report:                precision    recall  f1-score   support

           0       0.95      0.95      0.95     70205
           1       0.97      0.98      0.97     70205
           2       0.92      0.90      0.91     70205
           3       0.97      0.99      0.98     70205
           4       0.80      0.75      0.77     70205
           5       0.98      0.95      0.96     70205
           6       0.95      0.95      0.95     70205
           7       0.83      0.76      0.79     70205
           8       0.61      0.63      0.62     70205
           9       0.95      0.95      0.95     70205
          10       0.80      0.86      0.83     70205
          11       0.96      0.96      0.96     70205
          12       0.97      0.94      0.95     70205
          13       0.84      0.84      0.84     70205
          14       0.67      0.73      0.70     70205

    accuracy                           0.88   1053075
   macro avg       0.88      0.88    