# Import Libraries

In [None]:
# Import libraries
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras.constraints import max_norm
from keras.wrappers.scikit_learn import KerasClassifier

import tensorflow as tf
from tensorflow.keras.layers import Dropout
from tensorflow.keras import regularizers

In [None]:
# !pip install transformers

# Load Data


In [None]:
# Load the raw dataset from the CSV file in the working directory
df = pd.read_csv('mental_health.csv')

# A notebook cell only renders its final expression, so the shape is printed
# explicitly and the preview is left as the last line. Previously `df.head()`
# sat in the middle of the cell and its output was silently discarded.
print("Shape:", df.shape)
df.head()

# Data Preprocessing




In [None]:
# Check data types
# Displays the dtype pandas assigned to each column of `df` when it was read.
df.dtypes

In [None]:
# Count the missing entries in every column
df.isna().sum()

The dataset does not contain any missing values.

In [None]:
# Class balance: number of rows carrying each mental health label (0 and 1)
label_0_count, label_1_count = (
    len(df[df['label'] == label_value]) for label_value in (0, 1)
)
print("Number of entries with label = 0: {}".format(label_0_count))
print("Number of entries with label = 1: {}".format(label_1_count))

# Experimenting with Data

## Vectorize "text" variable

In [None]:
# Vectorize "text" using CountVectorizer (one column of token counts per vocabulary entry)
# Documentation 1: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# Documentation 2: https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/

vectorizer = CountVectorizer()

# Learn the vocabulary and build the document-term matrix in a single pass
# instead of a separate fit() followed by transform() over the same column.
features = vectorizer.fit_transform(df['text'])

# fit() returns the vectorizer itself; `X` stays bound to it as in the original cell.
X = vectorizer

# Report the vocabulary size rather than dumping the whole token -> index dict,
# which floods the cell output with one entry per distinct token.
print("Vocabulary size: ", len(vectorizer.vocabulary_))

# Dense copy of the sparse count matrix; later cells index and split this array.
# NOTE(review): this may be very memory-hungry for a large vocabulary — confirm it fits.
features = features.toarray()

In [None]:
# Shape of the dense count matrix built from the "text" column
print(features.shape)

In [None]:
# 80/20 train/test split of the count matrix and labels, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    features,
    df['label'],
    train_size=0.8,
    random_state=0,
)
print("Number of entries in the training set: {}".format(len(x_train)))
print("Number of entries in the testing set: {}".format(len(x_test)))

In [None]:
# Token belonging to each column of the count matrix
vocabulary = vectorizer.get_feature_names_out()

# Corpus-wide frequency of every token (column totals)
word_counts = features.sum(axis=0)

# Column indices ordered from most to least frequent
sorted_idx = np.flip(np.argsort(word_counts))

# The 20 most frequent tokens and their totals
top_words = list(vocabulary[sorted_idx[:20]])
top_counts = list(word_counts[sorted_idx[:20]])

# Horizontal bar chart of those 20 tokens
plt.barh(top_words, top_counts)
plt.xlabel('Frequency')
plt.ylabel('Word')
plt.title('Frequency of Top 20 Words')
plt.show()

In [None]:
# Convert the label Series to plain NumPy arrays.
# np.asarray leaves an existing array untouched, so this cell can be re-run
# safely; a second `.to_numpy()` call would fail because ndarrays have no such method.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [None]:
# Store both splits in compressed sparse row (CSR) format
x_train_csr, x_test_csr = (csr_matrix(dense_split) for dense_split in (x_train, x_test))

In [None]:
# Logistic regression on the sparse bag-of-words counts (fit() returns the estimator)
reg = LogisticRegression(max_iter=1000).fit(x_train_csr, y_train)

# Predicted labels for the held-out documents
y_pred = reg.predict(x_test_csr)

In [None]:
# accuracy_score is declared as (y_true, y_pred): pass the ground truth first
accuracy_score(y_test, y_pred)

## Dimensionality reduction with PCA

In [None]:
# Number of principal components to keep
n_components = 1000

# Fit PCA on the training split only, then project both splits with it.
# random_state pins the randomized SVD solver (when PCA picks it) so that
# re-running the notebook reproduces the same components.
pca = PCA(n_components=n_components, random_state=0)
x_train_selected = pca.fit_transform(x_train)
x_test_selected = pca.transform(x_test)

In [None]:
# Save the PCA model and transformed data
# Writes three files into the working directory so the projection does not
# have to be recomputed: the fitted PCA object and both projected splits.
joblib.dump(pca, 'pca_model.pkl')
np.save('x_train_pca.npy', x_train_selected)
np.save('x_test_pca.npy', x_test_selected)

In [None]:
# Load the saved PCA model
# NOTE: joblib.load unpickles the file, so only load a 'pca_model.pkl' you created yourself.
pca = joblib.load('pca_model.pkl')

# Load the saved transformed data
# Rebinds x_train_selected / x_test_selected to the arrays stored on disk.
x_train_selected = np.load('x_train_pca.npy')
x_test_selected = np.load('x_test_pca.npy')

In [None]:
# Logistic regression on the PCA-projected features (fit() returns the estimator)
reg = LogisticRegression(max_iter=1000).fit(x_train_selected, y_train)

# Predicted labels for the projected test split
y_pred = reg.predict(x_test_selected)

In [None]:
# accuracy_score is declared as (y_true, y_pred): pass the ground truth first
accuracy_score(y_test, y_pred)

## Random Forest

In [None]:
# Hyperparameter grid for the random forest: 3 x 3 x 4 = 36 combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [100, 200, 300],
    'min_samples_split': [50, 100, 200, 300]
}

# Seed the forest so every candidate is compared under the same randomness
# and the selected hyperparameters are reproducible across runs.
rfc = RandomForestClassifier(random_state=0)

# Exhaustive search over the grid with 5-fold cross-validation
grid_search = GridSearchCV(rfc, param_grid, cv=5)

# Fit the grid search object on the PCA-projected training data
grid_search.fit(x_train_selected, y_train)

# Print the best hyperparameters found by the grid search
print("Best hyperparameters:", grid_search.best_params_)

In [None]:
# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on the whole training set, so reuse that estimator instead of
# fitting a second forest with the same hyperparameters.
best_rfc = grid_search.best_estimator_

# Use the classifier to make predictions on the testing data
predictions = best_rfc.predict(x_test_selected)

# Accuracy from the predictions computed above (score() would predict a second time)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Print the confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(y_test, predictions)
print('Confusion matrix:\n', cm)

## Neural Networks

In [None]:
# Feed-forward binary classifier on the PCA features: 64 -> 32 -> 1 (sigmoid)
model = keras.Sequential()
model.add(keras.layers.Dense(64, activation='relu', input_shape=(x_train_selected.shape[1],)))
model.add(keras.layers.Dense(32, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# Plain SGD with a small learning rate, binary cross-entropy loss
model.compile(
    optimizer=SGD(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 50 epochs; the test split is also reported as validation data each epoch
model.fit(
    x_train_selected,
    y_train,
    validation_data=(x_test_selected, y_test),
    epochs=50,
    batch_size=32,
)

# Final loss and accuracy on the test split
test_loss, test_acc = model.evaluate(x_test_selected, y_test)
print('Test accuracy:', test_acc)

## Random Forest with Word2Vec

In [None]:
# 80/20 split of the raw text and labels, with a fixed seed
train_data, test_data, train_labels, test_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=0,
)

# Whitespace-tokenise the training documents and fit Word2Vec on them only
corpus = [document.split() for document in train_data]
model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=5, workers=4)

In [None]:
# Define a function to generate feature vectors for each comment
def get_vector(text, w2v_model=None):
    """Return the mean Word2Vec embedding of the whitespace-separated words in `text`.

    Parameters
    ----------
    text : str
        Document to embed; it is split on whitespace.
    w2v_model : optional
        Object exposing `.wv` with `key_to_index`, `vector_size` and item
        lookup by word. Defaults to the notebook-level `model`, which keeps
        existing `get_vector(text)` calls working. Passing it explicitly lets
        the function keep working after `model` is rebound to something else.

    Returns
    -------
    numpy.ndarray
        1-D float array of length `vector_size`: the average of the vectors of
        all in-vocabulary words, or all zeros when no word is in the vocabulary.
    """
    if w2v_model is None:
        w2v_model = model
    keyed_vectors = w2v_model.wv

    # Size the accumulator from the embeddings instead of hard-coding 100,
    # so a model trained with a different vector_size still works.
    vec = np.zeros((keyed_vectors.vector_size,))
    count = 0
    for word in text.split():
        # Out-of-vocabulary words are skipped and do not count towards the mean
        if word in keyed_vectors.key_to_index:
            vec += keyed_vectors[word]
            count += 1

    # Guard against division by zero when nothing was in the vocabulary
    return vec / count if count > 0 else vec

In [None]:
# Generate feature vectors for the training and testing data (one row per document)
train_features = np.vstack([get_vector(text) for text in train_data])
test_features = np.vstack([get_vector(text) for text in test_data])

# Train a random forest classifier on the training data.
# random_state makes the forest, and therefore the reported metrics, reproducible.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(train_features, train_labels)

# Use the classifier to make predictions on the testing data
predictions = rfc.predict(test_features)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(test_labels, predictions)
print('Accuracy:', accuracy)

# Print the confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(test_labels, predictions)
print('Confusion matrix:\n', cm)

In [None]:
# Feed-forward binary classifier on the averaged Word2Vec features: 64 -> 32 -> 1 (sigmoid)
# NOTE: this rebinds `model`, the name get_vector() reads for the Word2Vec
# vectors, so the Word2Vec training cell must be re-run before building
# features again.
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(train_features.shape[1],)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# change learning rate
model.compile(loss='binary_crossentropy',
              optimizer=SGD(learning_rate=0.001),
              metrics=['accuracy'])

# train model
# Validate against `test_labels`, the labels produced by the same split as
# `test_features`; the original used `y_test`, which comes from the earlier
# bag-of-words split.
model.fit(train_features, train_labels, epochs=50, batch_size=32,
          validation_data=(test_features, test_labels))

# evaluate model on test set
test_loss, test_acc = model.evaluate(test_features, test_labels)
print('Test accuracy:', test_acc)

## BERT with NN

In [None]:
# BERT Embedding
# The script below is disabled: it is wrapped in a triple-quoted string, so the
# cell only evaluates a string expression and none of the code inside executes.
# NOTE(review): presumably a variant of this script produced the embedding
# files loaded in the following cells; the file names saved here differ from
# the ones loaded there, so confirm before relying on it.
'''
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import TFBertModel
import tensorflow as tf
from tensorflow import keras
from transformers import TFDistilBertModel

def main():
    # Load the dataset
    df = pd.read_csv('mental_health.csv')
    df = df.iloc[:1500]

    # Split the dataset into training and testing sets
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Initialize the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize the text data
    train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True)
    test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True)

    # Perform feature selection
    selector = SelectKBest(chi2, k='all')
    train_features = np.array(train_encodings['input_ids'])
    train_features = selector.fit_transform(train_features, train_df['label'])
    train_attention_mask = np.array(train_encodings['attention_mask'])
    train_attention_mask = selector.transform(train_attention_mask)
    test_features = np.array(test_encodings['input_ids'])
    test_features = selector.transform(test_features)
    test_attention_mask = np.array(test_encodings['attention_mask'])
    test_attention_mask = selector.transform(test_attention_mask)

    # Load the BERT model
    #bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
    bert_model = TFBertModel.from_pretrained('bert-base-uncased')

    # Generate the BERT embeddings
    train_embeddings = bert_model([train_features, train_attention_mask])[0][:, 0, :]
    test_embeddings = bert_model([test_features, test_attention_mask])[0][:, 0, :]

    # Save the train and test embeddings to files
    np.save('train_embeddings.npy', train_embeddings)
    np.save('test_embeddings.npy', test_embeddings)

    # Load the saved embeddings from files
    #train_embeddings = np.load('train_embeddings.npy')
    #test_embeddings = np.load('test_embeddings.npy')


    # Define the neural network model
    model = keras.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=(768,)),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the neural network model
    model.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),
                metrics=['accuracy'])

    # Train the neural network model
    model.fit(train_embeddings, train_df['label'], epochs=100, batch_size=32)

    # Evaluate the neural network model on the test data
    test_loss, test_acc = model.evaluate(test_embeddings, test_df['label'])
    print('Test accuracy:', test_acc)

if __name__ == "__main__":
    main()
'''

In [None]:
# Load training and testing sets
# These frames supply the 'label' column used as the target for the DistilBERT embeddings below.
train_df = pd.read_csv('train_df_distilbert.csv')
test_df = pd.read_csv('test_df_distilbert.csv')

In [None]:
# Load the saved embeddings from files
# NOTE(review): presumably row-aligned with train_df / test_df and 768 columns wide
# (the network below declares input_shape=(768,)) — confirm.
train_embeddings_distil = np.load('train_embeddings_distilbert_1500rows.npy')
test_embeddings_distil = np.load('test_embeddings_distilbert_1500rows.npy')

In [None]:
# Binary classifier on the DistilBERT embeddings: 64 -> 32 -> 16 -> 1 (sigmoid),
# each hidden layer L2-regularised and followed by 10% dropout
model = keras.Sequential()
model.add(keras.layers.Dense(64, activation='relu', input_shape=(768,),
                             kernel_regularizer=regularizers.l2(0.001)))
model.add(Dropout(0.1))
model.add(keras.layers.Dense(32, activation='relu',
                             kernel_regularizer=regularizers.l2(0.001)))
model.add(Dropout(0.1))
model.add(keras.layers.Dense(16, activation='relu',
                             kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.1))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# Adam (legacy implementation) with a small learning rate, binary cross-entropy loss
model.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 50 epochs; the test split is also reported as validation data each epoch
model.fit(
    train_embeddings_distil,
    train_df['label'],
    validation_data=(test_embeddings_distil, test_df['label']),
    epochs=50,
    batch_size=32,
)

# Final loss and accuracy on the test embeddings
test_loss, test_acc = model.evaluate(test_embeddings_distil, test_df['label'])
print('Test accuracy:', test_acc)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 0.8700000047683716


In [None]:
# Load training and testing sets
# Rebinds train_df / test_df (previously the DistilBERT frames) to the BERT
# frames, whose 'label' column is the target for the BERT embeddings below.
train_df = pd.read_csv('train_df_bert.csv')
test_df = pd.read_csv('test_df_bert.csv')

In [None]:
# Load the saved embeddings from files
# NOTE(review): presumably row-aligned with train_df / test_df and 768 columns wide
# (the network below declares input_shape=(768,)) — confirm.
train_embeddings_non_distil = np.load('train_embeddings_bert_1500rows.npy')
test_embeddings_non_distil = np.load('test_embeddings_bert_1500rows.npy')

In [None]:
# Binary classifier on the BERT embeddings: 64 -> 64 -> 32 -> 16 -> 1 (sigmoid),
# each hidden layer L2-regularised and followed by dropout
model = keras.Sequential()
model.add(keras.layers.Dense(64, activation='relu', input_shape=(768,),
                             kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.3))
model.add(keras.layers.Dense(64, activation='relu',
                             kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.2))
model.add(keras.layers.Dense(32, activation='relu',
                             kernel_regularizer=regularizers.l2(0.1)))
model.add(Dropout(0.2))
model.add(keras.layers.Dense(16, activation='relu',
                             kernel_regularizer=regularizers.l2(0.1)))
model.add(Dropout(0.2))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# Adam (legacy implementation) with a small learning rate, binary cross-entropy loss
model.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 100 epochs; the test split is also reported as validation data each epoch
model.fit(
    train_embeddings_non_distil,
    train_df['label'],
    validation_data=(test_embeddings_non_distil, test_df['label']),
    epochs=100,
    batch_size=32,
)

# Final loss and accuracy on the test embeddings
test_loss, test_acc = model.evaluate(test_embeddings_non_distil, test_df['label'])
print('Test accuracy:', test_acc)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Test accuracy: 0.8999999761581421
