In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from collections import Counter
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense
from keras.models import Sequential
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

In [None]:
def get_metrics(y_true, y_pred):
    """Print the classification report and the accuracy for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        None. Both metrics are written to stdout only.
    """
    report_text = classification_report(y_true, y_pred)
    print('Classification Report: ', report_text)

    overall_accuracy = accuracy_score(y_true, y_pred)
    # The trailing "\n\n" leaves a visual gap before whatever is printed next.
    print('Accuracy: ', overall_accuracy, "\n\n")

In [None]:
# Make sure the NLTK 'stopwords' corpus is available locally before
# stopwords.words(...) is called in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# English stop words held as a set, so the per-token `not in stop_words`
# check in clean_post is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

In [None]:
def clean_post(post, stopword_set=None):
  """Normalise a raw post into a space-separated string of content words.

  Steps, in order: lowercase; turn newlines into spaces; drop every span that
  opens with ``<`` or ``[`` and closes with ``>`` or ``]``; replace every
  character other than ``a-z`` and space with a space; drop words of one to
  three characters; drop stop words.

  Args:
    post: Raw post text. Non-string values (e.g. None, or the NaN pandas uses
      for a missing cell) are treated as an empty post.
    stopword_set: Optional container of words to remove. When omitted, the
      notebook-level ``stop_words`` set is used.

  Returns:
    The cleaned text with tokens joined by single spaces ("" if nothing is
    left).
  """
  if not isinstance(post, str):
    return ""
  if stopword_set is None:
    stopword_set = stop_words

  post = post.lower()
  post = post.replace("\n", " ")
  # Raw string: the previous non-raw literal relied on invalid escape
  # sequences (\<, \[ ...), which newer Python versions warn about.
  # Non-greedy so each bracketed span is removed on its own.
  post = re.sub(r"[<\[].*?[>\]]", " ", post)
  post = re.sub(r"[^a-z ]", " ", post)
  post = re.sub(r"\b\w{1,3}\b", " ", post)
  return " ".join(word for word in post.split() if word not in stopword_set)

In [None]:
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek

def balance_data(x, y, _type):
  """Resample ``(x, y)`` with the class-balancing strategy selected by ``_type``.

  Supported values of ``_type``:
    -1: no resampling; ``x`` and ``y`` are returned untouched
     0: RandomOverSampler
     1: RandomUnderSampler (drawing with replacement)
     2: SMOTE
     3: NearMiss
     4: SMOTETomek
     5: ClusterCentroids
     6: TomekLinks
  Any other value falls back to SMOTE.

  Args:
    x: Feature matrix accepted by the imbalanced-learn samplers.
    y: Class labels aligned with the rows of ``x``.
    _type: Integer id of the strategy (see the table above).

  Returns:
    Tuple ``(x_resampled, y_resampled)``.
  """
  # -1 is the id the experiment loop labels "Without any undersampling/
  # oversampling"; it must not fall through to the SMOTE default.
  if _type == -1:
    return x, y

  # Every sampler that accepts a seed gets the same one so runs are repeatable.
  seed = 42
  if _type == 0:
    sampler = RandomOverSampler(random_state=seed)
  elif _type == 1:
    sampler = RandomUnderSampler(random_state=seed, replacement=True)
  elif _type == 3:
    sampler = NearMiss()
  elif _type == 4:
    # `ratio` was removed from imbalanced-learn; `sampling_strategy` replaces it.
    sampler = SMOTETomek(sampling_strategy='auto', random_state=seed)
  elif _type == 5:
    sampler = ClusterCentroids(random_state=seed)
  elif _type == 6:
    sampler = TomekLinks()
  else:
    # _type == 2 and any unrecognised id: default to SMOTE.
    sampler = SMOTE(random_state=seed)
  return sampler.fit_resample(x, y)

In [None]:
# Load the posts; the path is relative to the notebook's working directory.
data = pd.read_csv('../../reddit_mental_health_dataset/reddit_dataset.csv')
# Seeded shuffle: without a fixed random_state the row order (and therefore
# the seeded train/valid/test split made later) changes on every kernel restart.
data = shuffle(data, random_state=321)

# Class split stats
print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())
x = data['post'].apply(clean_post)

# Vectorizing text data: raw token counts first, then TF-IDF re-weighting.
# NOTE(review): both transformers are fitted on the whole corpus, i.e. before
# the train/valid/test split, so the vocabulary and IDF weights see held-out
# posts. Consider fitting on the training rows only.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(x)
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X_counts)

                mental_disorder                              
                          count unique            top    freq
mental_disorder                                              
EDAnonymous               14577      1    EDAnonymous   14577
addiction                  7641      1      addiction    7641
adhd                      45631      1           adhd   45631
alcoholism                 5911      1     alcoholism    5911
anxiety                   57671      1        anxiety   57671
autism                     8869      1         autism    8869
bipolarreddit              5780      1  bipolarreddit    5780
bpd                       24294      1            bpd   24294
depression               117331      1     depression  117331
healthanxiety              8648      1  healthanxiety    8648
lonely                    23635      1         lonely   23635
ptsd                       8643      1           ptsd    8643
schizophrenia              8712      1  schizophrenia    8712
socialan

In [None]:
# Map the string labels in `mental_disorder` to integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['mental_disorder'])

# 60-20-20 train/test/validation split.
# Stratified because the classes are heavily imbalanced (roughly 5.8k to 117k
# posts per class): every split keeps the same class proportions.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=321, stratify=y)
# Split the 40% hold-out in half to obtain the test and validation sets.
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=321, stratify=y_holdout)

In [None]:
import gc

# Aliases for the untouched training split. The loop below never rebinds
# X_train / y_train, so re-running this cell always starts from the same data.
X_tr, y_tr = X_train, y_train

# Train one depth-limited decision tree per resampling strategy and report
# metrics on the (resampled) training set, the validation set and the test set.
for _type in [1, 3, 6, -1, 0, 2]:
  print('#'*110)
  print()
  if _type == -1:
    print('Without any undersampling/oversampling')
    # Use the original split directly instead of routing through
    # balance_data, so this baseline is guaranteed to be un-resampled.
    X_fit, y_fit = X_tr, y_tr
  else:
    print(f'With sampling type: {_type}')
    X_fit, y_fit = balance_data(X_tr, y_tr, _type)

  # Fixed seed so tie-breaking between equally good splits is repeatable.
  model = DecisionTreeClassifier(max_depth=15, random_state=42)
  model.fit(X_fit, y_fit)

  evaluation_sets = (
    ("training", X_fit, y_fit),
    ("validation", X_valid, y_valid),
    ("test", X_test, y_test),
  )
  for split_name, X_split, y_split in evaluation_sets:
    y_split_pred = model.predict(X_split)
    print()
    print(f"For {split_name} set")
    print()
    get_metrics(y_split, y_split_pred)

##############################################################################################################

With sampling type: 1





For training set

Classification Report:                precision    recall  f1-score   support

           0       0.70      0.45      0.55      3438
           1       0.89      0.51      0.65      3438
           2       0.95      0.52      0.67      3438
           3       0.78      0.67      0.72      3438
           4       0.52      0.53      0.52      3438
           5       0.98      0.63      0.76      3438
           6       0.88      0.40      0.55      3438
           7       0.11      0.78      0.19      3438
           8       0.44      0.31      0.36      3438
           9       0.88      0.30      0.44      3438
          10       0.75      0.34      0.46      3438
          11       0.91      0.56      0.69      3438
          12       0.98      0.39      0.55      3438
          13       0.72      0.34      0.46      3438
          14       0.56      0.19      0.29      3438

    accuracy                           0.46     51570
   macro avg       0.74      0.46    




For training set

Classification Report:                precision    recall  f1-score   support

           0       0.77      0.57      0.66      3438
           1       0.90      0.54      0.68      3438
           2       0.92      0.63      0.75      3438
           3       0.78      0.66      0.72      3438
           4       0.49      0.53      0.51      3438
           5       0.98      0.63      0.77      3438
           6       0.76      0.38      0.51      3438
           7       0.92      0.05      0.10      3438
           8       0.36      0.07      0.12      3438
           9       0.81      0.35      0.48      3438
          10       0.78      0.47      0.59      3438
          11       0.93      0.60      0.73      3438
          12       0.98      0.38      0.55      3438
          13       0.80      0.38      0.51      3438
          14       0.13      0.89      0.22      3438

    accuracy                           0.48     51570
   macro avg       0.75      0.48    