<a href="https://colab.research.google.com/github/tsholofelo-mokheleli/SACAIR-Conference-Proceedings-2023/blob/main/Experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load the libraries
import pandas  as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Warning filter
import warnings
warnings.filterwarnings('ignore')
cmap=sns.color_palette('Blues_r')

# Metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score, confusion_matrix, balanced_accuracy_score
from imblearn.metrics import geometric_mean_score

# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Boosting Classifiers
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

# Imputation
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.impute import KNNImputer

# Plot Theme
sns.set_style("darkgrid")
plt.style.use("seaborn-deep")

### **Load Data**

In [2]:
data = pd.read_csv("Clean Mental Health.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   int64  
 1   no_employees              2723 non-null   float64
 2   tech_company              2723 non-null   float64
 3   company_role              1840 non-null   float64
 4   benefits                  2723 non-null   float64
 5   care_options              2427 non-null   float64
 6   wellness_program          2723 non-null   float64
 7   seek_help                 2723 non-null   float64
 8   anonymity                 2723 non-null   float64
 9   leave                     2723 non-null   float64
 10  mental_importance         2723 non-null   float64
 11  neg_consequence_coworker  2980 non-null   float64
 12  discuss_mh                1859 non-null   float64
 13  work_interfere            546 non-null    float64
 14  coworker

### **Initial Data Imputation**

In [4]:
columns_to_impute = data.columns[data.columns != "mental_health_diagnosed"]

# Impute NaN values with -1 in the selected columns
data[columns_to_impute] = data[columns_to_impute].fillna(-1)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   int64  
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [6]:
#Get the numbers of missing values in each column
missing = data.isnull().sum()

#Get the percentage of missing values in each column
missing_pct = round(data.isnull().sum()/len(data) * 100, 1)

data_missing = pd.concat([missing, missing_pct], axis=1)
data_missing.columns = ["Num", "%"]
print(data_missing)

                           Num     %
self_employed                0   0.0
no_employees                 0   0.0
tech_company                 0   0.0
company_role                 0   0.0
benefits                     0   0.0
care_options                 0   0.0
wellness_program             0   0.0
seek_help                    0   0.0
anonymity                    0   0.0
leave                        0   0.0
mental_importance            0   0.0
neg_consequence_coworker     0   0.0
discuss_mh                   0   0.0
work_interfere               0   0.0
coworkers                    0   0.0
supervisor                   0   0.0
mental_health_interview      0   0.0
family_history               0   0.0
past_mental_health           0   0.0
mental_health                0   0.0
mental_health_diagnosed   1080  33.0
treatment                    0   0.0
age                          0   0.0
gender                       0   0.0
country                      0   0.0


In [7]:
data = data.dropna()

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2189 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             2189 non-null   int64
 1   no_employees              2189 non-null   int64
 2   tech_company              2189 non-null   int64
 3   company_role              2189 non-null   int64
 4   benefits                  2189 non-null   int64
 5   care_options              2189 non-null   int64
 6   wellness_program          2189 non-null   int64
 7   seek_help                 2189 non-null   int64
 8   anonymity                 2189 non-null   int64
 9   leave                     2189 non-null   int64
 10  mental_importance         2189 non-null   int64
 11  neg_consequence_coworker  2189 non-null   int64
 12  discuss_mh                2189 non-null   int64
 13  work_interfere            2189 non-null   int64
 14  coworkers                 2189 non-null 

### **Split the data**

In [8]:
X = data.drop(["mental_health_diagnosed"], axis=1)
y = data['mental_health_diagnosed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

Using ***stratify*** is especially important when dealing with imbalanced datasets, where one class is much more frequent than the others. It helps prevent your training or testing set from having a significantly different class distribution than the original data, which could lead to biased model performance evaluation.

### **Define Classifiers**

In [9]:
# Define a list of classifiers
classifiers = {
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(),
    "AdaBoost": AdaBoostClassifier()
}

### **Baseline Models**

In [10]:
# Dictionary to store evaluation metrics
results = {}

# Loop through each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    geometric_mean = geometric_mean_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    # Store the results in the dictionary
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Kappa": kappa,
        "Geometric Mean": geometric_mean,
        "Balanced Accuracy": balanced_accuracy,
        "Confusion Matrix": confusion,
    }

# Display the results
for name, metrics in results.items():
    print(f"--- {name} ---")
    print("Accuracy: {:.2f}".format(metrics["Accuracy"]))
    print("Precision: {:.2f}".format(metrics["Precision"]))
    print("Recall: {:.2f}".format(metrics["Recall"]))
    print("F1 Score: {:.2f}".format(metrics["F1 Score"]))
    print("ROC AUC: {:.2f}".format(metrics["ROC AUC"]))
    print("Kappa: {:.2f}".format(metrics["Kappa"]))
    print("Geometric Mean: {:.2f}".format(metrics["Geometric Mean"]))
    print("Balanced Accuracy: {:.2f}".format(metrics["Balanced Accuracy"]))
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("\n")

--- Naive Bayes ---
Accuracy: 0.88
Precision: 0.92
Recall: 0.89
F1 Score: 0.90
ROC AUC: 0.87
Kappa: 0.73
Geometric Mean: 0.87
Balanced Accuracy: 0.87
Confusion Matrix:
[[127  21]
 [ 33 257]]


--- Random Forest ---
Accuracy: 0.92
Precision: 0.92
Recall: 0.96
F1 Score: 0.94
ROC AUC: 0.90
Kappa: 0.81
Geometric Mean: 0.89
Balanced Accuracy: 0.90
Confusion Matrix:
[[123  25]
 [ 11 279]]


--- Logistic Regression ---
Accuracy: 0.89
Precision: 0.89
Recall: 0.96
F1 Score: 0.92
ROC AUC: 0.86
Kappa: 0.75
Geometric Mean: 0.85
Balanced Accuracy: 0.86
Confusion Matrix:
[[112  36]
 [ 11 279]]


--- K-Nearest Neighbors ---
Accuracy: 0.73
Precision: 0.74
Recall: 0.90
F1 Score: 0.81
ROC AUC: 0.65
Kappa: 0.33
Geometric Mean: 0.60
Balanced Accuracy: 0.65
Confusion Matrix:
[[ 59  89]
 [ 30 260]]


--- XGBoost ---
Accuracy: 0.91
Precision: 0.93
Recall: 0.94
F1 Score: 0.93
ROC AUC: 0.90
Kappa: 0.80
Geometric Mean: 0.90
Balanced Accuracy: 0.90
Confusion Matrix:
[[127  21]
 [ 17 273]]


--- AdaBoost ---
Accu

### **Mode Imputation**

In [11]:
data = pd.read_csv("Clean Mental Health.csv")

In [12]:
columns_to_impute = data.columns[data.columns != "mental_health_diagnosed"]

# Iterate through columns to impute missing values with mode
for column in columns_to_impute:
    mode_value = data[column].mode()[0]
    data[column].fillna(mode_value, inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   int64  
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [13]:
data = data.dropna()

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2189 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             2189 non-null   int64
 1   no_employees              2189 non-null   int64
 2   tech_company              2189 non-null   int64
 3   company_role              2189 non-null   int64
 4   benefits                  2189 non-null   int64
 5   care_options              2189 non-null   int64
 6   wellness_program          2189 non-null   int64
 7   seek_help                 2189 non-null   int64
 8   anonymity                 2189 non-null   int64
 9   leave                     2189 non-null   int64
 10  mental_importance         2189 non-null   int64
 11  neg_consequence_coworker  2189 non-null   int64
 12  discuss_mh                2189 non-null   int64
 13  work_interfere            2189 non-null   int64
 14  coworkers                 2189 non-null 

In [14]:
X = data.drop(["mental_health_diagnosed"], axis=1)
y = data['mental_health_diagnosed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

In [15]:
# Dictionary to store evaluation metrics
results = {}

# Loop through each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    geometric_mean = geometric_mean_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    # Store the results in the dictionary
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Kappa": kappa,
        "Geometric Mean": geometric_mean,
        "Balanced Accuracy": balanced_accuracy,
        "Confusion Matrix": confusion,
    }

# Display the results
for name, metrics in results.items():
    print(f"--- {name} ---")
    print("Accuracy: {:.2f}".format(metrics["Accuracy"]))
    print("Precision: {:.2f}".format(metrics["Precision"]))
    print("Recall: {:.2f}".format(metrics["Recall"]))
    print("F1 Score: {:.2f}".format(metrics["F1 Score"]))
    print("ROC AUC: {:.2f}".format(metrics["ROC AUC"]))
    print("Kappa: {:.2f}".format(metrics["Kappa"]))
    print("Geometric Mean: {:.2f}".format(metrics["Geometric Mean"]))
    print("Balanced Accuracy: {:.2f}".format(metrics["Balanced Accuracy"]))
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("\n")

--- Naive Bayes ---
Accuracy: 0.89
Precision: 0.93
Recall: 0.90
F1 Score: 0.91
ROC AUC: 0.88
Kappa: 0.75
Geometric Mean: 0.88
Balanced Accuracy: 0.88
Confusion Matrix:
[[128  20]
 [ 30 260]]


--- Random Forest ---
Accuracy: 0.92
Precision: 0.91
Recall: 0.97
F1 Score: 0.94
ROC AUC: 0.89
Kappa: 0.81
Geometric Mean: 0.89
Balanced Accuracy: 0.89
Confusion Matrix:
[[121  27]
 [ 10 280]]


--- Logistic Regression ---
Accuracy: 0.89
Precision: 0.89
Recall: 0.96
F1 Score: 0.92
ROC AUC: 0.86
Kappa: 0.75
Geometric Mean: 0.85
Balanced Accuracy: 0.86
Confusion Matrix:
[[112  36]
 [ 11 279]]


--- K-Nearest Neighbors ---
Accuracy: 0.73
Precision: 0.74
Recall: 0.93
F1 Score: 0.82
ROC AUC: 0.64
Kappa: 0.32
Geometric Mean: 0.57
Balanced Accuracy: 0.64
Confusion Matrix:
[[ 51  97]
 [ 20 270]]


--- XGBoost ---
Accuracy: 0.92
Precision: 0.92
Recall: 0.96
F1 Score: 0.94
ROC AUC: 0.90
Kappa: 0.81
Geometric Mean: 0.90
Balanced Accuracy: 0.90
Confusion Matrix:
[[124  24]
 [ 12 278]]


--- AdaBoost ---
Accu

### **Multiple Imputation by Chained Equations (MICE)**

In [16]:
data = pd.read_csv("Clean Mental Health.csv")

In [17]:
# Initialize the MICE imputer
mice_imputer = IterativeImputer()

# Exclude the 'mental_health_diagnosed' column from imputation
columns_to_impute = [col for col in data.columns if col != 'mental_health_diagnosed']

# Perform MICE imputation on the selected columns
data[columns_to_impute] = mice_imputer.fit_transform(data[columns_to_impute])

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   float64
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [18]:
data = data.dropna()

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2189 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             2189 non-null   int64
 1   no_employees              2189 non-null   int64
 2   tech_company              2189 non-null   int64
 3   company_role              2189 non-null   int64
 4   benefits                  2189 non-null   int64
 5   care_options              2189 non-null   int64
 6   wellness_program          2189 non-null   int64
 7   seek_help                 2189 non-null   int64
 8   anonymity                 2189 non-null   int64
 9   leave                     2189 non-null   int64
 10  mental_importance         2189 non-null   int64
 11  neg_consequence_coworker  2189 non-null   int64
 12  discuss_mh                2189 non-null   int64
 13  work_interfere            2189 non-null   int64
 14  coworkers                 2189 non-null 

In [19]:
X = data.drop(["mental_health_diagnosed"], axis=1)
y = data['mental_health_diagnosed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

In [20]:
# Dictionary to store evaluation metrics
results = {}

# Loop through each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    geometric_mean = geometric_mean_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    # Store the results in the dictionary
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Kappa": kappa,
        "Geometric Mean": geometric_mean,
        "Balanced Accuracy": balanced_accuracy,
        "Confusion Matrix": confusion,
    }

# Display the results
for name, metrics in results.items():
    print(f"--- {name} ---")
    print("Accuracy: {:.2f}".format(metrics["Accuracy"]))
    print("Precision: {:.2f}".format(metrics["Precision"]))
    print("Recall: {:.2f}".format(metrics["Recall"]))
    print("F1 Score: {:.2f}".format(metrics["F1 Score"]))
    print("ROC AUC: {:.2f}".format(metrics["ROC AUC"]))
    print("Kappa: {:.2f}".format(metrics["Kappa"]))
    print("Geometric Mean: {:.2f}".format(metrics["Geometric Mean"]))
    print("Balanced Accuracy: {:.2f}".format(metrics["Balanced Accuracy"]))
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("\n")

--- Naive Bayes ---
Accuracy: 0.89
Precision: 0.92
Recall: 0.91
F1 Score: 0.91
ROC AUC: 0.88
Kappa: 0.75
Geometric Mean: 0.88
Balanced Accuracy: 0.88
Confusion Matrix:
[[125  23]
 [ 27 263]]


--- Random Forest ---
Accuracy: 0.91
Precision: 0.91
Recall: 0.96
F1 Score: 0.94
ROC AUC: 0.89
Kappa: 0.80
Geometric Mean: 0.89
Balanced Accuracy: 0.89
Confusion Matrix:
[[122  26]
 [ 12 278]]


--- Logistic Regression ---
Accuracy: 0.89
Precision: 0.89
Recall: 0.96
F1 Score: 0.92
ROC AUC: 0.86
Kappa: 0.74
Geometric Mean: 0.85
Balanced Accuracy: 0.86
Confusion Matrix:
[[112  36]
 [ 12 278]]


--- K-Nearest Neighbors ---
Accuracy: 0.72
Precision: 0.73
Recall: 0.92
F1 Score: 0.82
ROC AUC: 0.63
Kappa: 0.30
Geometric Mean: 0.56
Balanced Accuracy: 0.63
Confusion Matrix:
[[ 50  98]
 [ 23 267]]


--- XGBoost ---
Accuracy: 0.91
Precision: 0.92
Recall: 0.95
F1 Score: 0.94
ROC AUC: 0.89
Kappa: 0.80
Geometric Mean: 0.89
Balanced Accuracy: 0.89
Confusion Matrix:
[[124  24]
 [ 14 276]]


--- AdaBoost ---
Accu

### **Hot Deck Imputation**

In [21]:
data = pd.read_csv("Clean Mental Health.csv")

In [22]:
columns_to_impute = data.columns[data.columns != "mental_health_diagnosed"]
imputed_data = data.copy()

# Iterate through columns to impute missing values with mode
for column in columns_to_impute:
  vars_to_impute = [column]

  for var in vars_to_impute:
      # Create a donor pool consisting of cases without missing values for the current variable
      donor_pool = imputed_data.dropna(subset=[var])

      # Iterate over each case with a missing value for the current variable
      for index, row in imputed_data[imputed_data[var].isnull()].iterrows():
          # Check if there are any donor cases available
          if not donor_pool.empty:
              # Find the closest donor case based on a chosen similarity measure
              similarity_scores = np.abs(donor_pool[vars_to_impute] - row[vars_to_impute])
              closest_donor = donor_pool.iloc[similarity_scores.sum(axis=1).idxmin()]

              # Impute the missing value with the value from the closest donor case
              imputed_data.at[index, var] = closest_donor[var]
          else:
              # Handle the case where there are no donor cases available
              # You can choose to skip imputation or use another imputation method
              imputed_data.at[index, var] = np.nan  # or any other imputation approach

data = imputed_data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   int64  
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [23]:
data = data.dropna()

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2189 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             2189 non-null   int64
 1   no_employees              2189 non-null   int64
 2   tech_company              2189 non-null   int64
 3   company_role              2189 non-null   int64
 4   benefits                  2189 non-null   int64
 5   care_options              2189 non-null   int64
 6   wellness_program          2189 non-null   int64
 7   seek_help                 2189 non-null   int64
 8   anonymity                 2189 non-null   int64
 9   leave                     2189 non-null   int64
 10  mental_importance         2189 non-null   int64
 11  neg_consequence_coworker  2189 non-null   int64
 12  discuss_mh                2189 non-null   int64
 13  work_interfere            2189 non-null   int64
 14  coworkers                 2189 non-null 

In [24]:
X = data.drop(["mental_health_diagnosed"], axis=1)
y = data['mental_health_diagnosed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

In [25]:
# Dictionary to store evaluation metrics
results = {}

# Loop through each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    geometric_mean = geometric_mean_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    # Store the results in the dictionary
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Kappa": kappa,
        "Geometric Mean": geometric_mean,
        "Balanced Accuracy": balanced_accuracy,
        "Confusion Matrix": confusion,
    }

# Display the results
for name, metrics in results.items():
    print(f"--- {name} ---")
    print("Accuracy: {:.2f}".format(metrics["Accuracy"]))
    print("Precision: {:.2f}".format(metrics["Precision"]))
    print("Recall: {:.2f}".format(metrics["Recall"]))
    print("F1 Score: {:.2f}".format(metrics["F1 Score"]))
    print("ROC AUC: {:.2f}".format(metrics["ROC AUC"]))
    print("Kappa: {:.2f}".format(metrics["Kappa"]))
    print("Geometric Mean: {:.2f}".format(metrics["Geometric Mean"]))
    print("Balanced Accuracy: {:.2f}".format(metrics["Balanced Accuracy"]))
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("\n")

--- Naive Bayes ---
Accuracy: 0.86
Precision: 0.95
Recall: 0.83
F1 Score: 0.88
ROC AUC: 0.87
Kappa: 0.70
Geometric Mean: 0.87
Balanced Accuracy: 0.87
Confusion Matrix:
[[134  14]
 [ 49 241]]


--- Random Forest ---
Accuracy: 0.92
Precision: 0.92
Recall: 0.96
F1 Score: 0.94
ROC AUC: 0.90
Kappa: 0.81
Geometric Mean: 0.90
Balanced Accuracy: 0.90
Confusion Matrix:
[[124  24]
 [ 12 278]]


--- Logistic Regression ---
Accuracy: 0.89
Precision: 0.88
Recall: 0.96
F1 Score: 0.92
ROC AUC: 0.86
Kappa: 0.74
Geometric Mean: 0.85
Balanced Accuracy: 0.86
Confusion Matrix:
[[111  37]
 [ 11 279]]


--- K-Nearest Neighbors ---
Accuracy: 0.76
Precision: 0.75
Recall: 0.94
F1 Score: 0.84
ROC AUC: 0.67
Kappa: 0.39
Geometric Mean: 0.61
Balanced Accuracy: 0.67
Confusion Matrix:
[[ 59  89]
 [ 17 273]]


--- XGBoost ---
Accuracy: 0.91
Precision: 0.92
Recall: 0.95
F1 Score: 0.94
ROC AUC: 0.89
Kappa: 0.80
Geometric Mean: 0.89
Balanced Accuracy: 0.89
Confusion Matrix:
[[124  24]
 [ 14 276]]


--- AdaBoost ---
Accu

### **K-Nearest Neighbors Imputation**

In [26]:
data = pd.read_csv("Clean Mental Health.csv")

In [27]:
target_column = 'mental_health_diagnosed'
y = data[target_column]

# Remove the target column from the DataFrame for imputation
X = data.drop(columns=[target_column])

# Perform KNN imputation
imputer = KNNImputer(n_neighbors=5)  # You can adjust the number of neighbors (k) as needed
X_imputed = imputer.fit_transform(X)

# Convert the imputed array back to a DataFrame
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

# Combine the imputed data with the target column
data_imputed = pd.concat([X_imputed_df, y], axis=1)

data = data_imputed
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   float64
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [28]:
data = data.dropna()

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2189 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             2189 non-null   int64
 1   no_employees              2189 non-null   int64
 2   tech_company              2189 non-null   int64
 3   company_role              2189 non-null   int64
 4   benefits                  2189 non-null   int64
 5   care_options              2189 non-null   int64
 6   wellness_program          2189 non-null   int64
 7   seek_help                 2189 non-null   int64
 8   anonymity                 2189 non-null   int64
 9   leave                     2189 non-null   int64
 10  mental_importance         2189 non-null   int64
 11  neg_consequence_coworker  2189 non-null   int64
 12  discuss_mh                2189 non-null   int64
 13  work_interfere            2189 non-null   int64
 14  coworkers                 2189 non-null 

In [29]:
X = data.drop(["mental_health_diagnosed"], axis=1)
y = data['mental_health_diagnosed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

In [30]:
# Dictionary to store evaluation metrics
results = {}

# Loop through each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    geometric_mean = geometric_mean_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    # Store the results in the dictionary
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Kappa": kappa,
        "Geometric Mean": geometric_mean,
        "Balanced Accuracy": balanced_accuracy,
        "Confusion Matrix": confusion,
    }

# Display the results
for name, metrics in results.items():
    print(f"--- {name} ---")
    print("Accuracy: {:.2f}".format(metrics["Accuracy"]))
    print("Precision: {:.2f}".format(metrics["Precision"]))
    print("Recall: {:.2f}".format(metrics["Recall"]))
    print("F1 Score: {:.2f}".format(metrics["F1 Score"]))
    print("ROC AUC: {:.2f}".format(metrics["ROC AUC"]))
    print("Kappa: {:.2f}".format(metrics["Kappa"]))
    print("Geometric Mean: {:.2f}".format(metrics["Geometric Mean"]))
    print("Balanced Accuracy: {:.2f}".format(metrics["Balanced Accuracy"]))
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("\n")

--- Naive Bayes ---
Accuracy: 0.89
Precision: 0.93
Recall: 0.90
F1 Score: 0.92
ROC AUC: 0.88
Kappa: 0.76
Geometric Mean: 0.88
Balanced Accuracy: 0.88
Confusion Matrix:
[[128  20]
 [ 28 262]]


--- Random Forest ---
Accuracy: 0.91
Precision: 0.91
Recall: 0.96
F1 Score: 0.93
ROC AUC: 0.88
Kappa: 0.79
Geometric Mean: 0.88
Balanced Accuracy: 0.88
Confusion Matrix:
[[119  29]
 [ 11 279]]


--- Logistic Regression ---
Accuracy: 0.89
Precision: 0.88
Recall: 0.96
F1 Score: 0.92
ROC AUC: 0.85
Kappa: 0.74
Geometric Mean: 0.85
Balanced Accuracy: 0.85
Confusion Matrix:
[[111  37]
 [ 12 278]]


--- K-Nearest Neighbors ---
Accuracy: 0.72
Precision: 0.72
Recall: 0.94
F1 Score: 0.82
ROC AUC: 0.62
Kappa: 0.28
Geometric Mean: 0.52
Balanced Accuracy: 0.62
Confusion Matrix:
[[ 43 105]
 [ 16 274]]


--- XGBoost ---
Accuracy: 0.91
Precision: 0.92
Recall: 0.94
F1 Score: 0.93
ROC AUC: 0.89
Kappa: 0.79
Geometric Mean: 0.89
Balanced Accuracy: 0.89
Confusion Matrix:
[[124  24]
 [ 17 273]]


--- AdaBoost ---
Accu