<a href="https://colab.research.google.com/github/tsholofelo-mokheleli/SACAIR-Conference-Proceedings-2023/blob/main/Experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [140]:
# Load the libraries
import pandas  as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Warning filter
import warnings
warnings.filterwarnings('ignore')
cmap=sns.color_palette('Blues_r')

# Metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score, confusion_matrix, balanced_accuracy_score
from imblearn.metrics import geometric_mean_score

# Classifiers
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Boosting Classifiers
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

# Imputation
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.impute import KNNImputer

# Plot Theme
sns.set_style("darkgrid")
plt.style.use("seaborn-deep")

### **Load Data**

In [141]:
data = pd.read_csv("Clean Mental Health.csv")

In [142]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   int64  
 1   no_employees              2723 non-null   float64
 2   tech_company              2723 non-null   float64
 3   company_role              1840 non-null   float64
 4   benefits                  2723 non-null   float64
 5   care_options              2427 non-null   float64
 6   wellness_program          2723 non-null   float64
 7   seek_help                 2723 non-null   float64
 8   anonymity                 2723 non-null   float64
 9   leave                     2723 non-null   float64
 10  mental_importance         2723 non-null   float64
 11  neg_consequence_coworker  2980 non-null   float64
 12  discuss_mh                1859 non-null   float64
 13  work_interfere            546 non-null    float64
 14  coworker

### **Initial Data Imputation**

In [143]:
columns_to_impute = data.columns[data.columns != "mental_health_diagnosed"]

# Impute NaN values with -1 in the selected columns
data[columns_to_impute] = data[columns_to_impute].fillna(-1)

In [144]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   int64  
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [145]:
#Get the numbers of missing values in each column
missing = data.isnull().sum()

#Get the percentage of missing values in each column
missing_pct = round(data.isnull().sum()/len(data) * 100, 1)

data_missing = pd.concat([missing, missing_pct], axis=1)
data_missing.columns = ["Num", "%"]
print(data_missing)

                           Num     %
self_employed                0   0.0
no_employees                 0   0.0
tech_company                 0   0.0
company_role                 0   0.0
benefits                     0   0.0
care_options                 0   0.0
wellness_program             0   0.0
seek_help                    0   0.0
anonymity                    0   0.0
leave                        0   0.0
mental_importance            0   0.0
neg_consequence_coworker     0   0.0
discuss_mh                   0   0.0
work_interfere               0   0.0
coworkers                    0   0.0
supervisor                   0   0.0
mental_health_interview      0   0.0
family_history               0   0.0
past_mental_health           0   0.0
mental_health                0   0.0
mental_health_diagnosed   1080  33.0
treatment                    0   0.0
age                          0   0.0
gender                       0   0.0
country                      0   0.0


In [146]:
data = data.dropna()

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2189 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             2189 non-null   int64
 1   no_employees              2189 non-null   int64
 2   tech_company              2189 non-null   int64
 3   company_role              2189 non-null   int64
 4   benefits                  2189 non-null   int64
 5   care_options              2189 non-null   int64
 6   wellness_program          2189 non-null   int64
 7   seek_help                 2189 non-null   int64
 8   anonymity                 2189 non-null   int64
 9   leave                     2189 non-null   int64
 10  mental_importance         2189 non-null   int64
 11  neg_consequence_coworker  2189 non-null   int64
 12  discuss_mh                2189 non-null   int64
 13  work_interfere            2189 non-null   int64
 14  coworkers                 2189 non-null 

### **Split the data**

In [147]:
X = data.drop(["mental_health_diagnosed"], axis=1)
y = data['mental_health_diagnosed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

Using ***stratify*** is especially important when dealing with imbalanced datasets, where one class is much more frequent than the others. It helps prevent your training or testing set from having a significantly different class distribution than the original data, which could lead to biased model performance evaluation.

### **Define Classifiers**

In [148]:
# Define a list of classifiers
classifiers = {
    "Support Vector Machine": SVC(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(),
    "AdaBoost": AdaBoostClassifier()
}

### **Baseline Models**

In [149]:
# Dictionary to store evaluation metrics
results = {}

# Loop through each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    geometric_mean = geometric_mean_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    # Store the results in the dictionary
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Kappa": kappa,
        "Geometric Mean": geometric_mean,
        "Balanced Accuracy": balanced_accuracy,
        "Confusion Matrix": confusion,
    }

# Display the results
for name, metrics in results.items():
    print(f"--- {name} ---")
    print("Accuracy: {:.3f}".format(metrics["Accuracy"]))
    print("Precision: {:.3f}".format(metrics["Precision"]))
    print("Recall: {:.3f}".format(metrics["Recall"]))
    print("F1 Score: {:.3f}".format(metrics["F1 Score"]))
    print("ROC AUC: {:.3f}".format(metrics["ROC AUC"]))
    print("Kappa: {:.3f}".format(metrics["Kappa"]))
    print("Geometric Mean: {:.3f}".format(metrics["Geometric Mean"]))
    print("Balanced Accuracy: {:.3f}".format(metrics["Balanced Accuracy"]))
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("\n")

--- Support Vector Machine ---
Accuracy: 0.662
Precision: 0.662
Recall: 1.000
F1 Score: 0.797
ROC AUC: 0.500
Kappa: 0.000
Geometric Mean: 0.000
Balanced Accuracy: 0.500
Confusion Matrix:
[[  0 148]
 [  0 290]]


--- Naive Bayes ---
Accuracy: 0.877
Precision: 0.924
Recall: 0.886
F1 Score: 0.905
ROC AUC: 0.872
Kappa: 0.730
Geometric Mean: 0.872
Balanced Accuracy: 0.872
Confusion Matrix:
[[127  21]
 [ 33 257]]


--- Random Forest ---
Accuracy: 0.920
Precision: 0.918
Recall: 0.966
F1 Score: 0.941
ROC AUC: 0.898
Kappa: 0.817
Geometric Mean: 0.896
Balanced Accuracy: 0.898
Confusion Matrix:
[[123  25]
 [ 10 280]]


--- Logistic Regression ---
Accuracy: 0.893
Precision: 0.886
Recall: 0.962
F1 Score: 0.922
ROC AUC: 0.859
Kappa: 0.750
Geometric Mean: 0.853
Balanced Accuracy: 0.859
Confusion Matrix:
[[112  36]
 [ 11 279]]


--- K-Nearest Neighbors ---
Accuracy: 0.728
Precision: 0.745
Recall: 0.897
F1 Score: 0.814
ROC AUC: 0.648
Kappa: 0.327
Geometric Mean: 0.598
Balanced Accuracy: 0.648
Confusion

### **Mode Imputation**

In [150]:
data = pd.read_csv("Clean Mental Health.csv")

In [151]:
columns_to_impute = data.columns[data.columns != "mental_health_diagnosed"]

# Iterate through columns to impute missing values with mode
for column in columns_to_impute:
    mode_value = data[column].mode()[0]
    data[column].fillna(mode_value, inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   int64  
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [152]:
data = data.dropna()

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2189 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             2189 non-null   int64
 1   no_employees              2189 non-null   int64
 2   tech_company              2189 non-null   int64
 3   company_role              2189 non-null   int64
 4   benefits                  2189 non-null   int64
 5   care_options              2189 non-null   int64
 6   wellness_program          2189 non-null   int64
 7   seek_help                 2189 non-null   int64
 8   anonymity                 2189 non-null   int64
 9   leave                     2189 non-null   int64
 10  mental_importance         2189 non-null   int64
 11  neg_consequence_coworker  2189 non-null   int64
 12  discuss_mh                2189 non-null   int64
 13  work_interfere            2189 non-null   int64
 14  coworkers                 2189 non-null 

In [153]:
X = data.drop(["mental_health_diagnosed"], axis=1)
y = data['mental_health_diagnosed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

In [154]:
# Dictionary to store evaluation metrics
results = {}

# Loop through each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    geometric_mean = geometric_mean_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    # Store the results in the dictionary
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Kappa": kappa,
        "Geometric Mean": geometric_mean,
        "Balanced Accuracy": balanced_accuracy,
        "Confusion Matrix": confusion,
    }

# Display the results
for name, metrics in results.items():
    print(f"--- {name} ---")
    print("Accuracy: {:.3f}".format(metrics["Accuracy"]))
    print("Precision: {:.3f}".format(metrics["Precision"]))
    print("Recall: {:.3f}".format(metrics["Recall"]))
    print("F1 Score: {:.3f}".format(metrics["F1 Score"]))
    print("ROC AUC: {:.3f}".format(metrics["ROC AUC"]))
    print("Kappa: {:.3f}".format(metrics["Kappa"]))
    print("Geometric Mean: {:.3f}".format(metrics["Geometric Mean"]))
    print("Balanced Accuracy: {:.3f}".format(metrics["Balanced Accuracy"]))
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("\n")

--- Support Vector Machine ---
Accuracy: 0.662
Precision: 0.662
Recall: 1.000
F1 Score: 0.797
ROC AUC: 0.500
Kappa: 0.000
Geometric Mean: 0.000
Balanced Accuracy: 0.500
Confusion Matrix:
[[  0 148]
 [  0 290]]


--- Naive Bayes ---
Accuracy: 0.886
Precision: 0.929
Recall: 0.897
F1 Score: 0.912
ROC AUC: 0.881
Kappa: 0.749
Geometric Mean: 0.881
Balanced Accuracy: 0.881
Confusion Matrix:
[[128  20]
 [ 30 260]]


--- Random Forest ---
Accuracy: 0.913
Precision: 0.917
Recall: 0.955
F1 Score: 0.936
ROC AUC: 0.893
Kappa: 0.802
Geometric Mean: 0.891
Balanced Accuracy: 0.893
Confusion Matrix:
[[123  25]
 [ 13 277]]


--- Logistic Regression ---
Accuracy: 0.893
Precision: 0.886
Recall: 0.962
F1 Score: 0.922
ROC AUC: 0.859
Kappa: 0.750
Geometric Mean: 0.853
Balanced Accuracy: 0.859
Confusion Matrix:
[[112  36]
 [ 11 279]]


--- K-Nearest Neighbors ---
Accuracy: 0.733
Precision: 0.736
Recall: 0.931
F1 Score: 0.822
ROC AUC: 0.638
Kappa: 0.316
Geometric Mean: 0.566
Balanced Accuracy: 0.638
Confusion

### **Multiple Imputation by Chained Equations (MICE)**

In [155]:
data = pd.read_csv("Clean Mental Health.csv")

In [156]:
# Initialize the MICE imputer
mice_imputer = IterativeImputer()

# Exclude the 'mental_health_diagnosed' column from imputation
columns_to_impute = [col for col in data.columns if col != 'mental_health_diagnosed']

# Perform MICE imputation on the selected columns
data[columns_to_impute] = mice_imputer.fit_transform(data[columns_to_impute])

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   float64
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [157]:
data = data.dropna()

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2189 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             2189 non-null   int64
 1   no_employees              2189 non-null   int64
 2   tech_company              2189 non-null   int64
 3   company_role              2189 non-null   int64
 4   benefits                  2189 non-null   int64
 5   care_options              2189 non-null   int64
 6   wellness_program          2189 non-null   int64
 7   seek_help                 2189 non-null   int64
 8   anonymity                 2189 non-null   int64
 9   leave                     2189 non-null   int64
 10  mental_importance         2189 non-null   int64
 11  neg_consequence_coworker  2189 non-null   int64
 12  discuss_mh                2189 non-null   int64
 13  work_interfere            2189 non-null   int64
 14  coworkers                 2189 non-null 

In [158]:
X = data.drop(["mental_health_diagnosed"], axis=1)
y = data['mental_health_diagnosed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

In [159]:
# Dictionary to store evaluation metrics
results = {}

# Loop through each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    geometric_mean = geometric_mean_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    # Store the results in the dictionary
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Kappa": kappa,
        "Geometric Mean": geometric_mean,
        "Balanced Accuracy": balanced_accuracy,
        "Confusion Matrix": confusion,
    }

# Display the results
for name, metrics in results.items():
    print(f"--- {name} ---")
    print("Accuracy: {:.3f}".format(metrics["Accuracy"]))
    print("Precision: {:.3f}".format(metrics["Precision"]))
    print("Recall: {:.3f}".format(metrics["Recall"]))
    print("F1 Score: {:.3f}".format(metrics["F1 Score"]))
    print("ROC AUC: {:.3f}".format(metrics["ROC AUC"]))
    print("Kappa: {:.3f}".format(metrics["Kappa"]))
    print("Geometric Mean: {:.3f}".format(metrics["Geometric Mean"]))
    print("Balanced Accuracy: {:.3f}".format(metrics["Balanced Accuracy"]))
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("\n")

--- Support Vector Machine ---
Accuracy: 0.662
Precision: 0.662
Recall: 1.000
F1 Score: 0.797
ROC AUC: 0.500
Kappa: 0.000
Geometric Mean: 0.000
Balanced Accuracy: 0.500
Confusion Matrix:
[[  0 148]
 [  0 290]]


--- Naive Bayes ---
Accuracy: 0.886
Precision: 0.920
Recall: 0.907
F1 Score: 0.913
ROC AUC: 0.876
Kappa: 0.747
Geometric Mean: 0.875
Balanced Accuracy: 0.876
Confusion Matrix:
[[125  23]
 [ 27 263]]


--- Random Forest ---
Accuracy: 0.913
Precision: 0.912
Recall: 0.962
F1 Score: 0.936
ROC AUC: 0.890
Kappa: 0.801
Geometric Mean: 0.887
Balanced Accuracy: 0.890
Confusion Matrix:
[[121  27]
 [ 11 279]]


--- Logistic Regression ---
Accuracy: 0.890
Precision: 0.885
Recall: 0.959
F1 Score: 0.921
ROC AUC: 0.858
Kappa: 0.745
Geometric Mean: 0.852
Balanced Accuracy: 0.858
Confusion Matrix:
[[112  36]
 [ 12 278]]


--- K-Nearest Neighbors ---
Accuracy: 0.724
Precision: 0.732
Recall: 0.921
F1 Score: 0.815
ROC AUC: 0.629
Kappa: 0.295
Geometric Mean: 0.558
Balanced Accuracy: 0.629
Confusion

### **Hot Deck Imputation**

In [160]:
data = pd.read_csv("Clean Mental Health.csv")

In [161]:
columns_to_impute = data.columns[data.columns != "mental_health_diagnosed"]
imputed_data = data.copy()

# Iterate through columns to impute missing values with mode
for column in columns_to_impute:
  vars_to_impute = [column]

  for var in vars_to_impute:
      # Create a donor pool consisting of cases without missing values for the current variable
      donor_pool = imputed_data.dropna(subset=[var])

      # Iterate over each case with a missing value for the current variable
      for index, row in imputed_data[imputed_data[var].isnull()].iterrows():
          # Check if there are any donor cases available
          if not donor_pool.empty:
              # Find the closest donor case based on a chosen similarity measure
              similarity_scores = np.abs(donor_pool[vars_to_impute] - row[vars_to_impute])
              closest_donor = donor_pool.iloc[similarity_scores.sum(axis=1).idxmin()]

              # Impute the missing value with the value from the closest donor case
              imputed_data.at[index, var] = closest_donor[var]
          else:
              # Handle the case where there are no donor cases available
              # You can choose to skip imputation or use another imputation method
              imputed_data.at[index, var] = np.nan  # or any other imputation approach

data = imputed_data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   int64  
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [162]:
data = data.dropna()

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2189 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             2189 non-null   int64
 1   no_employees              2189 non-null   int64
 2   tech_company              2189 non-null   int64
 3   company_role              2189 non-null   int64
 4   benefits                  2189 non-null   int64
 5   care_options              2189 non-null   int64
 6   wellness_program          2189 non-null   int64
 7   seek_help                 2189 non-null   int64
 8   anonymity                 2189 non-null   int64
 9   leave                     2189 non-null   int64
 10  mental_importance         2189 non-null   int64
 11  neg_consequence_coworker  2189 non-null   int64
 12  discuss_mh                2189 non-null   int64
 13  work_interfere            2189 non-null   int64
 14  coworkers                 2189 non-null 

In [163]:
X = data.drop(["mental_health_diagnosed"], axis=1)
y = data['mental_health_diagnosed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

In [164]:
# Dictionary to store evaluation metrics
results = {}

# Loop through each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    geometric_mean = geometric_mean_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    # Store the results in the dictionary
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Kappa": kappa,
        "Geometric Mean": geometric_mean,
        "Balanced Accuracy": balanced_accuracy,
        "Confusion Matrix": confusion,
    }

# Display the results
for name, metrics in results.items():
    print(f"--- {name} ---")
    print("Accuracy: {:.3f}".format(metrics["Accuracy"]))
    print("Precision: {:.3f}".format(metrics["Precision"]))
    print("Recall: {:.3f}".format(metrics["Recall"]))
    print("F1 Score: {:.3f}".format(metrics["F1 Score"]))
    print("ROC AUC: {:.3f}".format(metrics["ROC AUC"]))
    print("Kappa: {:.3f}".format(metrics["Kappa"]))
    print("Geometric Mean: {:.3f}".format(metrics["Geometric Mean"]))
    print("Balanced Accuracy: {:.3f}".format(metrics["Balanced Accuracy"]))
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("\n")

--- Support Vector Machine ---
Accuracy: 0.662
Precision: 0.662
Recall: 1.000
F1 Score: 0.797
ROC AUC: 0.500
Kappa: 0.000
Geometric Mean: 0.000
Balanced Accuracy: 0.500
Confusion Matrix:
[[  0 148]
 [  0 290]]


--- Naive Bayes ---
Accuracy: 0.856
Precision: 0.945
Recall: 0.831
F1 Score: 0.884
ROC AUC: 0.868
Kappa: 0.696
Geometric Mean: 0.867
Balanced Accuracy: 0.868
Confusion Matrix:
[[134  14]
 [ 49 241]]


--- Random Forest ---
Accuracy: 0.916
Precision: 0.920
Recall: 0.955
F1 Score: 0.937
ROC AUC: 0.897
Kappa: 0.808
Geometric Mean: 0.895
Balanced Accuracy: 0.897
Confusion Matrix:
[[124  24]
 [ 13 277]]


--- Logistic Regression ---
Accuracy: 0.890
Precision: 0.883
Recall: 0.962
F1 Score: 0.921
ROC AUC: 0.856
Kappa: 0.744
Geometric Mean: 0.849
Balanced Accuracy: 0.856
Confusion Matrix:
[[111  37]
 [ 11 279]]


--- K-Nearest Neighbors ---
Accuracy: 0.758
Precision: 0.754
Recall: 0.941
F1 Score: 0.837
ROC AUC: 0.670
Kappa: 0.386
Geometric Mean: 0.613
Balanced Accuracy: 0.670
Confusion

### **K-Nearest Neighbors Imputation**

In [165]:
data = pd.read_csv("Clean Mental Health.csv")

In [166]:
target_column = 'mental_health_diagnosed'
y = data[target_column]

# Remove the target column from the DataFrame for imputation
X = data.drop(columns=[target_column])

# Perform KNN imputation
imputer = KNNImputer(n_neighbors=5)  # You can adjust the number of neighbors (k) as needed
X_imputed = imputer.fit_transform(X)

# Convert the imputed array back to a DataFrame
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

# Combine the imputed data with the target column
data_imputed = pd.concat([X_imputed_df, y], axis=1)

data = data_imputed
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   float64
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [167]:
data = data.dropna()

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2189 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             2189 non-null   int64
 1   no_employees              2189 non-null   int64
 2   tech_company              2189 non-null   int64
 3   company_role              2189 non-null   int64
 4   benefits                  2189 non-null   int64
 5   care_options              2189 non-null   int64
 6   wellness_program          2189 non-null   int64
 7   seek_help                 2189 non-null   int64
 8   anonymity                 2189 non-null   int64
 9   leave                     2189 non-null   int64
 10  mental_importance         2189 non-null   int64
 11  neg_consequence_coworker  2189 non-null   int64
 12  discuss_mh                2189 non-null   int64
 13  work_interfere            2189 non-null   int64
 14  coworkers                 2189 non-null 

In [168]:
X = data.drop(["mental_health_diagnosed"], axis=1)
y = data['mental_health_diagnosed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

In [169]:
# Dictionary to store evaluation metrics
results = {}

# Loop through each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    geometric_mean = geometric_mean_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    # Store the results in the dictionary
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Kappa": kappa,
        "Geometric Mean": geometric_mean,
        "Balanced Accuracy": balanced_accuracy,
        "Confusion Matrix": confusion,
    }

# Display the results
for name, metrics in results.items():
    print(f"--- {name} ---")
    print("Accuracy: {:.3f}".format(metrics["Accuracy"]))
    print("Precision: {:.3f}".format(metrics["Precision"]))
    print("Recall: {:.3f}".format(metrics["Recall"]))
    print("F1 Score: {:.3f}".format(metrics["F1 Score"]))
    print("ROC AUC: {:.3f}".format(metrics["ROC AUC"]))
    print("Kappa: {:.3f}".format(metrics["Kappa"]))
    print("Geometric Mean: {:.3f}".format(metrics["Geometric Mean"]))
    print("Balanced Accuracy: {:.3f}".format(metrics["Balanced Accuracy"]))
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("\n")

--- Support Vector Machine ---
Accuracy: 0.662
Precision: 0.662
Recall: 1.000
F1 Score: 0.797
ROC AUC: 0.500
Kappa: 0.000
Geometric Mean: 0.000
Balanced Accuracy: 0.500
Confusion Matrix:
[[  0 148]
 [  0 290]]


--- Naive Bayes ---
Accuracy: 0.890
Precision: 0.929
Recall: 0.903
F1 Score: 0.916
ROC AUC: 0.884
Kappa: 0.758
Geometric Mean: 0.884
Balanced Accuracy: 0.884
Confusion Matrix:
[[128  20]
 [ 28 262]]


--- Random Forest ---
Accuracy: 0.916
Precision: 0.912
Recall: 0.966
F1 Score: 0.938
ROC AUC: 0.892
Kappa: 0.806
Geometric Mean: 0.888
Balanced Accuracy: 0.892
Confusion Matrix:
[[121  27]
 [ 10 280]]


--- Logistic Regression ---
Accuracy: 0.888
Precision: 0.883
Recall: 0.959
F1 Score: 0.919
ROC AUC: 0.854
Kappa: 0.739
Geometric Mean: 0.848
Balanced Accuracy: 0.854
Confusion Matrix:
[[111  37]
 [ 12 278]]


--- K-Nearest Neighbors ---
Accuracy: 0.724
Precision: 0.723
Recall: 0.945
F1 Score: 0.819
ROC AUC: 0.618
Kappa: 0.276
Geometric Mean: 0.524
Balanced Accuracy: 0.618
Confusion