In [None]:
# Required installations (uncomment and run once per environment)
# !pip cache purge
# !python3 -m pip install -U scikit-learn scipy
# !pip install nltk
# !pip install keras
# !pip install gensim
# !pip install matplotlib
# !pip install pandas
# !pip install tensorflow
# !pip install imbalanced-learn  # provides the `imblearn` module imported below

In [None]:
# Required imports
import matplotlib
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, accuracy_score
import re
import imblearn
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
# English stop words used by the text-cleaning step.
# On a fresh environment the NLTK stop-word corpus is not installed and
# `stopwords.words` raises LookupError, so fetch it once and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

In [None]:
# cleaning data
def clean_post(post):
    """Normalise one raw post into a space-separated string of tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace newlines with spaces;
      3. blank out any span that starts with ``<`` or ``[`` and runs to the
         nearest ``>`` or ``]`` (markup / bracketed tags);
      4. replace every character other than ``a-z`` and space with a space;
      5. blank out words of one to three characters;
      6. drop tokens found in the module-level ``stop_words`` set.

    Parameters
    ----------
    post : str
        Raw post text. Non-string values (e.g. the NaN/None pandas uses for
        missing text) are treated as an empty post.

    Returns
    -------
    str
        Remaining tokens joined by single spaces; ``""`` if nothing survives.
    """
    # Missing text shows up as float NaN / None, which has no .lower().
    if not isinstance(post, str):
        return ""
    post = post.lower()
    post = re.sub(r"\n", " ", post)
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences ("\<", "\["), which newer Python versions warn about.
    post = re.sub(r"[\<\[].*?[\>\]]", " ", post)
    post = re.sub(r"[^a-z ]", " ", post)
    post = re.sub(r"\b\w{1,3}\b", " ", post)
    return " ".join([x for x in post.split() if x not in stop_words])

In [None]:
# Different techniques for tackling class imbalance
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss

def balance_data(x, y, _type):
    """Optionally undersample ``(x, y)`` with the technique chosen by ``_type``.

    ``_type`` codes:
      * ``1`` -- ``RandomUnderSampler(random_state=42, replacement=True)``
      * ``3`` -- ``NearMiss()``
      * ``6`` -- ``TomekLinks()``
      * anything else -- no resampling.

    Returns whatever the chosen sampler's ``fit_resample(x, y)`` returns, or
    the inputs unchanged as the tuple ``(x, y)`` when no sampler is selected.
    """
    # Factories are wrapped in lambdas so a sampler is only built when selected.
    sampler_factories = {
        1: lambda: RandomUnderSampler(random_state=42, replacement=True),
        3: lambda: NearMiss(),
        6: lambda: TomekLinks(),
    }
    build_sampler = sampler_factories.get(_type)
    if build_sampler is None:
        return x, y
    return build_sampler().fit_resample(x, y)
    # Alternative not implemented here: penalise the algorithm with
    # class_weight=balanced and use stratified cross validation.

In [None]:
# Load data
data = pd.read_csv('../reddit_mental_health_dataset/reddit_dataset.csv')
# Seed the shuffle: the later train/test split is seeded by position, so an
# unseeded shuffle here would still put different rows in each split on every
# run and make the reported metrics non-reproducible.
SHUFFLE_SEED = 42
data = shuffle(data, random_state=SHUFFLE_SEED)
# data = data[:500]  # uncomment for a quick run on a small sample

# Class split stats
print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())
# Clean every post; `x` holds the cleaned text, one string per row.
x = data['post'].apply(clean_post)

# Vectorizing text data: raw token counts first, then TF-IDF weighting.
# NOTE(review): both transformers are fitted on the full dataset before it is
# split, so vocabulary and IDF weights are learned from validation/test rows
# as well. Fit them on the training rows only if unbiased estimates are needed.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(x)
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X_counts)

In [None]:
# Encode the disorder names as integer class ids (labels are not one-hot encoded).
label_encoder = LabelEncoder()
disorder_names = np.array(data['mental_disorder'])
y = label_encoder.fit_transform(disorder_names)

# 60-20-20 split: hold out 40% first, then cut that hold-out in half
# into the test and validation sets. Both cuts share the same seed.
SPLIT_SEED = 321
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=SPLIT_SEED
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SPLIT_SEED
)
# The intermediate 40% hold-out is not needed once it has been divided.
del X_holdout, y_holdout

In [None]:
def get_metrics(y_true, y_pred):
    """Print the classification report and the accuracy for a set of predictions.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels, aligned with ``y_true``.

    Returns nothing; both metrics are written to stdout.
    """
    report_text = classification_report(y_true, y_pred)
    print('Classification Report: ', report_text)

    # Accuracy is computed only after the report has been printed.
    print('Accuracy: ', accuracy_score(y_true, y_pred), "\n\n")

In [None]:
import gc

# Number of neighbours requested for KNN. It is clamped per run below because
# scikit-learn rejects n_neighbors larger than the number of fitted samples,
# which can happen after undersampling or on a small debug slice of the data.
N_NEIGHBORS = 301

# Creating the model and checking it for various undersampled cases.
# X_tr / y_tr alias the original training split. The loop works on separate
# per-iteration names so X_train / y_train are never rebound or deleted and
# this cell can be re-run without re-running the split cell.
X_tr, y_tr = X_train, y_train


def _evaluate_split(model, split_name, X_split, y_split):
    """Predict on one data split and print its metrics under a heading."""
    y_pred = model.predict(X_split)
    print()
    print(f"For {split_name} set")
    print()
    get_metrics(y_split, y_pred)


# Sampling codes understood by balance_data; -1 means "no resampling".
for _type in [1, 3, 6, -1]:
    print('#'*110)
    print()
    if _type == -1:
        print('Without any undersampling/oversampling')
    else:
        print(f'With sampling type: {_type}')
    print()
    print()

    # Resample the training split only; validation and test stay untouched.
    X_bal, y_bal = balance_data(X_tr, y_tr, _type)

    n_neighbors = min(N_NEIGHBORS, X_bal.shape[0])
    if n_neighbors < N_NEIGHBORS:
        print(f'n_neighbors reduced from {N_NEIGHBORS} to {n_neighbors} '
              f'(training set has only {n_neighbors} samples)')
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_bal, y_bal)

    # NOTE(review): the test set is scored for every sampling configuration;
    # choose the configuration on the validation scores only, otherwise the
    # test score no longer estimates performance on unseen data.
    _evaluate_split(model, "training", X_bal, y_bal)
    _evaluate_split(model, "validation", X_valid, y_valid)
    _evaluate_split(model, "test", X_test, y_test)

    # Release the per-iteration objects before the next configuration.
    del model, X_bal, y_bal
    gc.collect()

    print()
    print()
    print('#'*110)