In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from collections import Counter
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense
from keras.models import Sequential
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

In [2]:
def get_metrics(y_true, y_pred):
    """Print the classification report and the accuracy for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        None. Both metrics are written to stdout.
    """
    print('Classification Report: ', classification_report(y_true, y_pred))
    # The trailing newlines visually separate consecutive reports in the cell output.
    print('Accuracy: ', accuracy_score(y_true, y_pred), "\n\n")

In [3]:
# Fetch the NLTK stop-word corpus; the `stopwords` corpus reader used in the
# next cell needs it to be present locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
 # English stop words as a set: clean_post() tests every token for membership,
 # and set lookups keep that check cheap.
 stop_words = set(stopwords.words('english'))

In [5]:
def clean_post(post, stopword_set=None):
  """Normalize a raw post into a space-separated string of content words.

  Steps, in order: lowercase the text; turn newlines into spaces; drop every
  span that runs from a ``<`` or ``[`` up to the nearest ``>`` or ``]``
  (non-greedy); replace every character other than a-z and space with a
  space; drop words of three letters or fewer; drop stop words.

  Args:
    post: Raw post text. Non-string values (for example the NaN that pandas
      produces for an empty CSV cell) are treated as an empty post.
    stopword_set: Optional collection of words to remove. When omitted, the
      module-level ``stop_words`` is used.

  Returns:
    The surviving words joined by single spaces, or "" if nothing survives.
  """
  if not isinstance(post, str):
    # Missing values have no .lower(); an empty document is the neutral result.
    return ""
  if stopword_set is None:
    stopword_set = stop_words

  post = post.lower()
  post = re.sub(r"\n", " ", post)
  # Raw string on purpose: in a plain literal "\<" and "\[" are invalid escape
  # sequences (a warning today, an error in a future Python). Inside a
  # character class "<" and ">" need no escaping.
  post = re.sub(r"[<\[].*?[>\]]", " ", post)
  post = re.sub(r"[^a-z ]", " ", post)
  # Only a-z and spaces remain here, so \w{1,3} matches short letter runs.
  post = re.sub(r"\b\w{1,3}\b", " ", post)
  return " ".join(word for word in post.split() if word not in stopword_set)

In [6]:
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek

def balance_data(x, y, _type):
  """Resample ``(x, y)`` with the imbalanced-learn strategy selected by ``_type``.

  Strategy codes:
    0: RandomOverSampler
    1: RandomUnderSampler (drawing with replacement)
    2: SMOTE
    3: NearMiss
    4: SMOTETomek
    5: ClusterCentroids
    6: TomekLinks
  Any other value (including -1) falls back to SMOTE; a caller that wants no
  resampling at all must not call this function.

  Args:
    x: Feature matrix passed to the sampler's ``fit_resample``.
    y: Labels aligned with ``x``.
    _type: Strategy code from the table above.

  Returns:
    The ``(x_resampled, y_resampled)`` pair returned by ``fit_resample``.
  """
  # One seed for every sampler that accepts random_state, so repeated runs
  # produce the same resampled data.
  seed = 42
  # Factories rather than instances: only the selected sampler is constructed.
  sampler_factories = {
    0: lambda: RandomOverSampler(random_state=seed),
    1: lambda: RandomUnderSampler(random_state=seed, replacement=True),
    2: lambda: SMOTE(random_state=seed),
    3: NearMiss,
    # `ratio` was replaced by `sampling_strategy` in imbalanced-learn; passing
    # the old keyword raises TypeError on current releases.
    4: lambda: SMOTETomek(sampling_strategy='auto', random_state=seed),
    5: lambda: ClusterCentroids(random_state=seed),
    6: TomekLinks,
  }
  make_sampler = sampler_factories.get(_type, sampler_factories[2])
  return make_sampler().fit_resample(x, y)

In [8]:
# Load the Reddit posts and shuffle the rows. The shuffle is seeded so that the
# row order, and therefore any seeded split made from it, is reproducible.
data = pd.read_csv('/content/drive/MyDrive/reddit_dataset.csv')
data = shuffle(data, random_state=42)

# Class split stats
print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())
x = data['post'].apply(clean_post)

# Vectorizing text data: raw token counts first, then TF-IDF re-weighting.
# NOTE(review): both transformers are fit on the full corpus, before the
# train/validation/test split, so the vocabulary and IDF weights are learned
# from held-out posts too. Consider fitting on the training portion only.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(x)
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X_counts)

                mental_disorder                              
                          count unique            top    freq
mental_disorder                                              
EDAnonymous               14577      1    EDAnonymous   14577
addiction                  7641      1      addiction    7641
adhd                      45631      1           adhd   45631
alcoholism                 5911      1     alcoholism    5911
anxiety                   57671      1        anxiety   57671
autism                     8869      1         autism    8869
bipolarreddit              5780      1  bipolarreddit    5780
bpd                       24294      1            bpd   24294
depression               117331      1     depression  117331
healthanxiety              8648      1  healthanxiety    8648
lonely                    23635      1         lonely   23635
ptsd                       8643      1           ptsd    8643
schizophrenia              8712      1  schizophrenia    8712
socialan

In [9]:
# Encode the disorder names as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(np.array(data['mental_disorder']))

# 60-20-20 split: hold out 40% of the rows, then cut the hold-out in half to
# obtain the test and validation sets.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=321
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=321
)
# The hold-out pair was only a stepping stone; do not leave it in the namespace.
del X_holdout, y_holdout

In [None]:
import gc

# Handle on the untouched training split: every strategy resamples from this
# same starting point. X_train / y_train are no longer overwritten inside the
# loop, so re-running this cell does not resample already-resampled data.
X_tr, y_tr = X_train, y_train

for _type in [1, 3, 6, -1, 0, 2]:
  print('#'*110)
  print()
  if _type == -1:
    print('Without any undersampling/oversampling')
    # balance_data() falls back to SMOTE for unknown codes such as -1, so it
    # is bypassed here to get a genuine no-resampling baseline.
    X_bal, y_bal = X_tr, y_tr
  else:
    print(f'With sampling type: {_type}')
    X_bal, y_bal = balance_data(X_tr, y_tr, _type)

  # The base estimator is passed positionally because its keyword name differs
  # between scikit-learn releases (base_estimator vs. estimator).
  # max_iter is raised above the default because the default run stopped at
  # the iteration limit; raise it further if the warning persists.
  model = BaggingClassifier(LogisticRegression(max_iter=1000))
  model.fit(X_bal, y_bal)

  # Same report for each split; the training metrics are computed on the
  # (possibly resampled) data the model was fitted on.
  for split_name, X_split, y_split in (
      ('training', X_bal, y_bal),
      ('validation', X_valid, y_valid),
      ('test', X_test, y_test),
  ):
    y_split_pred = model.predict(X_split)
    print()
    print(f"For {split_name} set")
    print()
    get_metrics(y_split, y_split_pred)

  # Drop this iteration's resampled copies before building the next ones.
  del X_bal, y_bal
  gc.collect()

##############################################################################################################

With sampling type: 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist