In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from collections import Counter
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense
from keras.models import Sequential
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, accuracy_score

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

In [31]:
def get_metrics(y_true, y_pred):
    result1 = classification_report(y_true, y_pred)
    print('Classification Report: ', result1)
    result2 = accuracy_score(y_true, y_pred)
    print('Accuracy: ', result2, "\n\n")

In [32]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
stop_words = set(stopwords.words('english'))

In [34]:
def clean_post(post):
  post = post.lower()
  post = re.sub(r"\n", " ", post)
  post = re.sub("[\<\[].*?[\>\]]", " ", post)
  post = re.sub(r"[^a-z ]", " ", post)
  post = re.sub(r"\b\w{1,3}\b", " ", post)
  return " ".join([x for x in post.split() if x not in stop_words])

In [35]:
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek

def balance_data(x, y, _type):
  if _type == 0:
    ros = RandomOverSampler(random_state=42)
    return ros.fit_resample(x, y)
  elif _type == 1:
    rus = RandomUnderSampler(random_state=42, replacement=True)
    return rus.fit_resample(x, y)
  elif _type == 2:
    smote = SMOTE()
    return smote.fit_resample(x, y)
  elif _type == 3:
    nm = NearMiss()
    return nm.fit_resample(x, y)
  elif _type == 4:
    smt = SMOTETomek(ratio='auto')
    return smt.fit_resample(x, y)
  elif _type == 5:
    cc = ClusterCentroids()
    return cc.fit_resample(x, y)
  elif _type == 6:
    tl = TomekLinks()
    return tl.fit_resample(x, y)
  # default
  smote = SMOTE()
  return smote.fit_resample(x, y)

In [36]:
data = pd.read_csv('/content/drive/MyDrive/reddit_dataset.csv')
data = shuffle(data)

# Class split stats
print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())
X = data['post'].apply(lambda post: clean_post(post))
label_encoder = LabelEncoder()
y1 = label_encoder.fit_transform(np.array(data['mental_disorder']))
y = to_categorical(y1)

# 60-20-20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=321)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=321)

                mental_disorder                              
                          count unique            top    freq
mental_disorder                                              
EDAnonymous               14577      1    EDAnonymous   14577
addiction                  7641      1      addiction    7641
adhd                      45631      1           adhd   45631
alcoholism                 5911      1     alcoholism    5911
anxiety                   57671      1        anxiety   57671
autism                     8869      1         autism    8869
bipolarreddit              5780      1  bipolarreddit    5780
bpd                       24294      1            bpd   24294
depression               117331      1     depression  117331
healthanxiety              8648      1  healthanxiety    8648
lonely                    23635      1         lonely   23635
ptsd                       8643      1           ptsd    8643
schizophrenia              8712      1  schizophrenia    8712
socialan

In [37]:
MAX_WORDS_LIMIT = 30000
tokenizer = Tokenizer(num_words=MAX_WORDS_LIMIT, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print(f'Unique tokens found: {len(word_index)}')

Unique tokens found: 102208


In [38]:
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_valid = tokenizer.texts_to_sequences(X_valid)

X_train = pad_sequences(sequences_train, maxlen=200)
X_valid = pad_sequences(sequences_valid, maxlen=X_train.shape[1])
print('Shape of X train and X validation tensor: ', X_train.shape, X_valid.shape)

y_train, y_valid = np.asarray(y_train), np.asarray(y_valid)
print('Shape of y train and y validation tensor:', y_train.shape, y_valid.shape)

Shape of X train and X validation tensor:  (255900, 200) (85300, 200)
Shape of y train and y validation tensor: (255900, 15) (85300, 15)


In [42]:
prob_dict = {}
model_dict = {}

In [48]:
def get_models(i):
  model = Sequential()
  model.add(Dense(1,activation='sigmoid',input_shape=(X_train[0].shape)))
  print("\n\n\n starting classification for",i)
  model.summary()
  model.compile(optimizer='adam',loss = keras.losses.BinaryCrossentropy(from_logits=False),metrics=['accuracy'])
  history = model.fit(x=X_train,y=y_train[:,i],batch_size=64,verbose=1,validation_data=(X_valid,y_valid[:,i]))
  y_prob = model.predict(X_valid)
  y_prob[y_prob>0.5] = 1
  y_prob[y_prob<=0.5] = 0
  prob_dict[i] = y_prob
  model_dict[i] = model

In [49]:
for i in range(y_train.shape[1]):
  get_models(i)




 starting classification for 0
Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 1)                 201       
Total params: 201
Trainable params: 201
Non-trainable params: 0
_________________________________________________________________



 starting classification for 1
Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 1)                 201       
Total params: 201
Trainable params: 201
Non-trainable params: 0
_________________________________________________________________



 starting classification for 2
Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_20 (Dense)             (None, 1)   

In [50]:
y_hat = np.zeros(y_valid.shape)

for i in range(15):
  y_hat[:,i] = prob_dict[i].reshape(y_hat[:,i].shape)

In [56]:
y_diff = np.abs(y_hat - y_valid)

In [59]:
acc = 1 - np.sum(y_diff/2)/y_valid.shape[0]

In [60]:
acc

0.1465123094958969