<a href="https://colab.research.google.com/github/tsholofelo-mokheleli/SACAIR-Conference-Proceedings-2023/blob/main/Experiment_II.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Load the libraries
import pandas  as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Warning filter
import warnings
warnings.filterwarnings('ignore')
cmap=sns.color_palette('Blues_r')

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Imputation
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.impute import KNNImputer

# Plot Theme
sns.set_style("darkgrid")
plt.style.use("seaborn-deep")

### **Load Data**

In [13]:
data = pd.read_csv("Clean Mental Health.csv")

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   int64  
 1   no_employees              2723 non-null   float64
 2   tech_company              2723 non-null   float64
 3   company_role              1840 non-null   float64
 4   benefits                  2723 non-null   float64
 5   care_options              2427 non-null   float64
 6   wellness_program          2723 non-null   float64
 7   seek_help                 2723 non-null   float64
 8   anonymity                 2723 non-null   float64
 9   leave                     2723 non-null   float64
 10  mental_importance         2723 non-null   float64
 11  neg_consequence_coworker  2980 non-null   float64
 12  discuss_mh                1859 non-null   float64
 13  work_interfere            546 non-null    float64
 14  coworker

### **Initial Data Imputation**

In [15]:
columns_to_impute = data.columns[data.columns != "mental_health_diagnosed"]

# Impute NaN values with -1 in the selected columns
data[columns_to_impute] = data[columns_to_impute].fillna(-1)

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   int64  
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [17]:
#Get the numbers of missing values in each column
missing = data.isnull().sum()

#Get the percentage of missing values in each column
missing_pct = round(data.isnull().sum()/len(data) * 100, 1)

data_missing = pd.concat([missing, missing_pct], axis=1)
data_missing.columns = ["Num", "%"]
print(data_missing)

                           Num     %
self_employed                0   0.0
no_employees                 0   0.0
tech_company                 0   0.0
company_role                 0   0.0
benefits                     0   0.0
care_options                 0   0.0
wellness_program             0   0.0
seek_help                    0   0.0
anonymity                    0   0.0
leave                        0   0.0
mental_importance            0   0.0
neg_consequence_coworker     0   0.0
discuss_mh                   0   0.0
work_interfere               0   0.0
coworkers                    0   0.0
supervisor                   0   0.0
mental_health_interview      0   0.0
family_history               0   0.0
past_mental_health           0   0.0
mental_health                0   0.0
mental_health_diagnosed   1080  33.0
treatment                    0   0.0
age                          0   0.0
gender                       0   0.0
country                      0   0.0


In [18]:
data = data.drop(["mental_health_diagnosed"], axis=1)

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')


incomplete = data
incomplete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             3269 non-null   int64
 1   no_employees              3269 non-null   int64
 2   tech_company              3269 non-null   int64
 3   company_role              3269 non-null   int64
 4   benefits                  3269 non-null   int64
 5   care_options              3269 non-null   int64
 6   wellness_program          3269 non-null   int64
 7   seek_help                 3269 non-null   int64
 8   anonymity                 3269 non-null   int64
 9   leave                     3269 non-null   int64
 10  mental_importance         3269 non-null   int64
 11  neg_consequence_coworker  3269 non-null   int64
 12  discuss_mh                3269 non-null   int64
 13  work_interfere            3269 non-null   int64
 14  coworkers                 3269 non-null 

### **Mode Imputation**

In [19]:
data = pd.read_csv("Clean Mental Health.csv")

In [20]:
columns_to_impute = data.columns[data.columns != "mental_health_diagnosed"]

# Iterate through columns to impute missing values with mode
for column in columns_to_impute:
    mode_value = data[column].mode()[0]
    data[column].fillna(mode_value, inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   int64  
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [21]:
data = data.drop(["mental_health_diagnosed"], axis=1)

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')


mode_imputed = data
mode_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             3269 non-null   int64
 1   no_employees              3269 non-null   int64
 2   tech_company              3269 non-null   int64
 3   company_role              3269 non-null   int64
 4   benefits                  3269 non-null   int64
 5   care_options              3269 non-null   int64
 6   wellness_program          3269 non-null   int64
 7   seek_help                 3269 non-null   int64
 8   anonymity                 3269 non-null   int64
 9   leave                     3269 non-null   int64
 10  mental_importance         3269 non-null   int64
 11  neg_consequence_coworker  3269 non-null   int64
 12  discuss_mh                3269 non-null   int64
 13  work_interfere            3269 non-null   int64
 14  coworkers                 3269 non-null 

### **Multiple Imputation by Chained Equations (MICE)**

In [22]:
data = pd.read_csv("Clean Mental Health.csv")

In [23]:
# Initialize the MICE imputer
mice_imputer = IterativeImputer()

# Exclude the 'mental_health_diagnosed' column from imputation
columns_to_impute = [col for col in data.columns if col != 'mental_health_diagnosed']

# Perform MICE imputation on the selected columns
data[columns_to_impute] = mice_imputer.fit_transform(data[columns_to_impute])

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   float64
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [24]:
data = data.drop(["mental_health_diagnosed"], axis=1)

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')


mice_imputed = data
mice_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             3269 non-null   int64
 1   no_employees              3269 non-null   int64
 2   tech_company              3269 non-null   int64
 3   company_role              3269 non-null   int64
 4   benefits                  3269 non-null   int64
 5   care_options              3269 non-null   int64
 6   wellness_program          3269 non-null   int64
 7   seek_help                 3269 non-null   int64
 8   anonymity                 3269 non-null   int64
 9   leave                     3269 non-null   int64
 10  mental_importance         3269 non-null   int64
 11  neg_consequence_coworker  3269 non-null   int64
 12  discuss_mh                3269 non-null   int64
 13  work_interfere            3269 non-null   int64
 14  coworkers                 3269 non-null 

### **Hot Deck Imputation**

In [25]:
data = pd.read_csv("Clean Mental Health.csv")

In [26]:
columns_to_impute = data.columns[data.columns != "mental_health_diagnosed"]
imputed_data = data.copy()

# Iterate through columns to impute missing values with mode
for column in columns_to_impute:
  vars_to_impute = [column]

  for var in vars_to_impute:
      # Create a donor pool consisting of cases without missing values for the current variable
      donor_pool = imputed_data.dropna(subset=[var])

      # Iterate over each case with a missing value for the current variable
      for index, row in imputed_data[imputed_data[var].isnull()].iterrows():
          # Check if there are any donor cases available
          if not donor_pool.empty:
              # Find the closest donor case based on a chosen similarity measure
              similarity_scores = np.abs(donor_pool[vars_to_impute] - row[vars_to_impute])
              closest_donor = donor_pool.iloc[similarity_scores.sum(axis=1).idxmin()]

              # Impute the missing value with the value from the closest donor case
              imputed_data.at[index, var] = closest_donor[var]
          else:
              # Handle the case where there are no donor cases available
              # You can choose to skip imputation or use another imputation method
              imputed_data.at[index, var] = np.nan  # or any other imputation approach

data = imputed_data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   int64  
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [27]:
data = data.drop(["mental_health_diagnosed"], axis=1)

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')


hotdeck_imputed = data
hotdeck_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             3269 non-null   int64
 1   no_employees              3269 non-null   int64
 2   tech_company              3269 non-null   int64
 3   company_role              3269 non-null   int64
 4   benefits                  3269 non-null   int64
 5   care_options              3269 non-null   int64
 6   wellness_program          3269 non-null   int64
 7   seek_help                 3269 non-null   int64
 8   anonymity                 3269 non-null   int64
 9   leave                     3269 non-null   int64
 10  mental_importance         3269 non-null   int64
 11  neg_consequence_coworker  3269 non-null   int64
 12  discuss_mh                3269 non-null   int64
 13  work_interfere            3269 non-null   int64
 14  coworkers                 3269 non-null 

### **K-Nearest Neighbors Imputation**

In [28]:
data = pd.read_csv("Clean Mental Health.csv")

In [29]:
target_column = 'mental_health_diagnosed'
y = data[target_column]

# Remove the target column from the DataFrame for imputation
X = data.drop(columns=[target_column])

# Perform KNN imputation
imputer = KNNImputer(n_neighbors=5)  # You can adjust the number of neighbors (k) as needed
X_imputed = imputer.fit_transform(X)

# Convert the imputed array back to a DataFrame
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

# Combine the imputed data with the target column
data_imputed = pd.concat([X_imputed_df, y], axis=1)

data = data_imputed
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   self_employed             3269 non-null   float64
 1   no_employees              3269 non-null   float64
 2   tech_company              3269 non-null   float64
 3   company_role              3269 non-null   float64
 4   benefits                  3269 non-null   float64
 5   care_options              3269 non-null   float64
 6   wellness_program          3269 non-null   float64
 7   seek_help                 3269 non-null   float64
 8   anonymity                 3269 non-null   float64
 9   leave                     3269 non-null   float64
 10  mental_importance         3269 non-null   float64
 11  neg_consequence_coworker  3269 non-null   float64
 12  discuss_mh                3269 non-null   float64
 13  work_interfere            3269 non-null   float64
 14  coworker

In [30]:
data = data.drop(["mental_health_diagnosed"], axis=1)

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype('int64')


knn_imputed = data
knn_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   self_employed             3269 non-null   int64
 1   no_employees              3269 non-null   int64
 2   tech_company              3269 non-null   int64
 3   company_role              3269 non-null   int64
 4   benefits                  3269 non-null   int64
 5   care_options              3269 non-null   int64
 6   wellness_program          3269 non-null   int64
 7   seek_help                 3269 non-null   int64
 8   anonymity                 3269 non-null   int64
 9   leave                     3269 non-null   int64
 10  mental_importance         3269 non-null   int64
 11  neg_consequence_coworker  3269 non-null   int64
 12  discuss_mh                3269 non-null   int64
 13  work_interfere            3269 non-null   int64
 14  coworkers                 3269 non-null 

### **Check for Nulls**

In [31]:
datasets = {
    "Incomplete":incomplete,
    "Mode": mode_imputed,
    "Mice": mice_imputed,
    "Hot Deck": hotdeck_imputed,
    "KNN": knn_imputed
}

for name, dataset in datasets.items():
    nan_mask = dataset.isna()
    nan_count = nan_mask.sum()
    columns_with_nan = nan_count[nan_count > 0]

    print(f"Columns with NaN values in {name}:")
    print(columns_with_nan)

    total_nan_count = nan_count.sum()
    print("Total NaN count:", total_nan_count)
    print("\n")

Columns with NaN values in Incomplete:
Series([], dtype: int64)
Total NaN count: 0


Columns with NaN values in Mode:
Series([], dtype: int64)
Total NaN count: 0


Columns with NaN values in Mice:
Series([], dtype: int64)
Total NaN count: 0


Columns with NaN values in Hot Deck:
Series([], dtype: int64)
Total NaN count: 0


Columns with NaN values in KNN:
Series([], dtype: int64)
Total NaN count: 0




### **1. Preservation of relationships**

In [32]:
incomplete_corr = incomplete.corr()

mode_complete_corr = mode_imputed.corr()
mice_complete_corr = mice_imputed.corr()
hd_complete_corr = hotdeck_imputed.corr()
knn_complete_corr = knn_imputed.corr()

mode_complete_corr_diff = np.abs(incomplete_corr - mode_complete_corr)
mice_complete_corr_diff = np.abs(incomplete_corr - mice_complete_corr)
hd_complete_corr_diff = np.abs(incomplete_corr - hd_complete_corr)
knn_complete_corr_diff = np.abs(incomplete_corr - knn_complete_corr)

mode_average_diff = mode_complete_corr_diff.mean().mean()
mice_average_diff = mice_complete_corr_diff.mean().mean()
hd_average_diff = hd_complete_corr_diff.mean().mean()
knn_average_diff = knn_complete_corr_diff .mean().mean()

print("Mode Average difference in correlation matrices:", mode_average_diff)
print("Mice Average difference in correlation matrices:", mice_average_diff)
print("Hot Deck Average difference in correlation matrices:", hd_average_diff)
print("KNN Average difference in correlation matrices:", knn_average_diff)

Mode Average difference in correlation matrices: 0.18903101469090622
Mice Average difference in correlation matrices: 0.15162621126541576
Hot Deck Average difference in correlation matrices: 0.195172813749415
KNN Average difference in correlation matrices: 0.1605950241013949


### **2. Mean Absolute Error**

In [33]:
mode_mae = mean_absolute_error(incomplete, mode_imputed)
multiple_mae = mean_absolute_error(incomplete, mice_imputed)
hot_deck_mae = mean_absolute_error(incomplete, hotdeck_imputed)
knn_mae = mean_absolute_error(incomplete, knn_imputed)

print("Mode Imputation MAE:", mode_mae)
print("Mice Imputation MAE:", multiple_mae)
print("Hot Deck Imputation MAE:", hot_deck_mae)
print("KNN Imputation MAE:", knn_mae)

Mode Imputation MAE: 0.328604568165596
Mice Imputation MAE: 0.24399663505659222
Hot Deck Imputation MAE: 0.39025695931477516
KNN Imputation MAE: 0.26990924849597225


### **3. Root Mean Squared Error**

In [34]:
# Define a function to calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Calculate RMSE for each dataset
mode_rmse = rmse(incomplete, mode_imputed)
multiple_rmse = rmse(incomplete, mice_imputed)
hot_deck_rmse = rmse(incomplete, hotdeck_imputed)
knn_rmse = rmse(incomplete, knn_imputed)

print("Mode Imputation RMSE:", mode_rmse)
print("Mice Imputation RMSE:", multiple_rmse)
print("Hot Deck Imputation RMSE:", hot_deck_rmse)
print("KNN Imputation RMSE:", knn_rmse)

Mode Imputation RMSE: 1.0112740133164588
Mice Imputation RMSE: 0.8297439778801579
Hot Deck Imputation RMSE: 1.1250460262726025
KNN Imputation RMSE: 0.840769311753721
