In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from pylab import rcParams
import plotly.graph_objects as go
import missingno as msno
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [2]:
df = pd.read_csv('datasets/heart_cdc.csv')

In [3]:
# Define a dictionary for column renaming
columns_name_mapping = {
    "cvdcrhd4": "HeartDisease",
    "weight2": "BMI1", 
    "height3":"BMI2", 
    "_smoker3":"Smoking", 
    "drnkany5":"AlcoholDrinking", 
    "cvdstrk3":"Stroke", 
    "physhlth":"PhysicalHealth",
    "menthlth":"MentalHealth", 
    "diffwalk":"DiffWalking", 
    "_sex":"Sex", 
    "_ageg5yr":"AgeCategory", 
    "_race":"Race", 
    "diabete4":"Diabetic", 
    "exerany2":"PhysicalActivity", 
    "genhlth":"GenHealth", 
    "sleptim1":"SleepTime", 
    "_asthms1":"Asthma", 
    "chckdny2":"KidneyDisease", 
    "chcscncr":"SkinCancer",
    "_bmi5":"BMI_calc"
}

# Define a list of the columns you want to keep
columns_to_keep = [ 
    "HeartDisease",
    "BMI1", 
    "BMI2", 
    "Smoking", 
    "AlcoholDrinking", 
    "Stroke", 
    "PhysicalHealth",
    "MentalHealth", 
    "DiffWalking", 
    "Sex", 
    "AgeCategory", 
    "Race", 
    "Diabetic", 
    "PhysicalActivity", 
    "GenHealth", 
    "SleepTime", 
    "Asthma", 
    "KidneyDisease", 
    "SkinCancer",
    "BMI_calc"
]

# Rename columns and select the columns to keep in one step
df = df.rename(columns=columns_name_mapping)[columns_to_keep]

In [4]:
value_mapping = {
    'HeartDisease': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'AlcoholDrinking': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'Stroke': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'PhysicalActivity': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'DiffWalking': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'SkinCancer': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'KidneyDisease': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'Asthma': {1:'Current', 2:'Former', 3:'Never', 9:'Not Clear'},
    'Smoking': {1:'Smoker', 2:'Approximate Smoker', 3:'Former Smoker', 4:'Never', 9:'Not Clear'},
    'Sex': {1:'Male', 2:'Female'},
    'AgeCategory': {1:'18-24', 2:'25-29', 3:'30-34', 4:'35-39', 5:'40-44', 6:'45-49', 7:'50-54', 8:'55-59', 9:'60-64', 10:'65-69', 11:'70-74', 12:'75-79', 13:'80+', 14:'Not Clear'},
    'Race': {1:'White', 2:'Black', 3:'Native', 4:'Asian', 5:'Hawaian', 6:'Other', 7:'Multiracial', 8:'Hispanic', 9:'Not Clear'},
    'Diabetic': {1:'Yes', 2:'Yes(Pregnant)', 3:'No', 4:'Border Line', 7:'Not Clear', 9:'Refused'},
    'GenHealth': {1:'Excellent', 2:'Very Good', 3:'Good', 4:'Fair', 5:'Poor', 7:'Not Clear', 9:'Refused'},
}

# Use the replace() method to change the values in the DataFrame
df.replace(value_mapping, inplace=True)

In [4]:
value_mapping = {
    'HeartDisease': {1:1, 2:2, 7:3, 9:4},
    'AlcoholDrinking': {1:1, 2:2, 7:3, 9:4},
    'Stroke': {1:1, 2:2, 7:3, 9:4},
    'PhysicalActivity': {1:1, 2:2, 7:3, 9:4},
    'DiffWalking': {1:1, 2:2, 7:3, 9:4},
    'SkinCancer': {1:1, 2:2, 7:3, 9:4},
    'KidneyDisease': {1:1, 2:2, 7:3, 9:4},
    'Asthma': {1:1, 2:2, 3:3, 9:4},
    'Smoking': {1:1, 2:2, 3:3, 4:4, 9:5},
    'Diabetic': {1:1, 2:2, 3:3, 4:4, 7:5, 9:6},
    'GenHealth': {1:1, 2:2, 3:3, 4:4, 5:5, 7:6, 9:7},
}

# Use the replace() method to change the values in the DataFrame
df.replace(value_mapping, inplace=True)

In [5]:
cat_var = ['HeartDisease',
    'Stroke',
    'PhysicalActivity',
    'DiffWalking',
    'SkinCancer',
    'KidneyDisease',
    'Race',
    'Diabetic',
    'GenHealth']

In [6]:
num_var = ["BMI1",
              "BMI2",
              "PhysicalHealth",
              "MentalHealth",
              "SleepTime",
              "BMI_calc"]

In [None]:
cat_var.head()

In [None]:
num_var.head()

In [5]:
df.head()

Unnamed: 0,HeartDisease,BMI1,BMI2,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,BMI_calc
0,No,106.0,507.0,Smoker,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very Good,5.0,Current,No,Yes,1660.0
1,No,170.0,504.0,Not Clear,Refused,No,88.0,88.0,No,Female,65-69,Black,No,Yes,Good,7.0,Former,No,No,2918.0
2,No,7777.0,508.0,Never,No,No,88.0,88.0,No,Female,65-69,Black,No,Yes,Good,7.0,Never,No,No,
3,No,9999.0,9999.0,Never,No,No,88.0,88.0,No,Female,80+,White,No,No,Excellent,6.0,Never,No,No,
4,No,126.0,506.0,Never,No,Yes,88.0,88.0,No,Female,80+,White,No,Yes,Very Good,7.0,Never,No,No,2034.0


In [6]:
df.isna().sum()

HeartDisease            3
BMI1                 9852
BMI2                10824
Smoking                 0
AlcoholDrinking         0
Stroke                  3
PhysicalHealth          5
MentalHealth            5
DiffWalking         15280
Sex                     0
AgeCategory             0
Race                    1
Diabetic                6
PhysicalActivity        3
GenHealth               8
SleepTime               3
Asthma                  0
KidneyDisease           6
SkinCancer              3
BMI_calc            41357
dtype: int64

In [10]:
cat_var_known = df.dropna(subset=['DiffWalking'])
cat_var_missing = df[df['DiffWalking'].isna()]

In [11]:
X_known = cat_var_known.drop(columns=['DiffWalking'])
y_known = cat_var_known['DiffWalking']

In [8]:
def preprocess_categorical(df):
    label_encoders = {}
    for col in df.select_dtypes(include=['object']):
        label_encoders[col] = LabelEncoder()
        df[col] = label_encoders[col].fit_transform(df[col])
    return df, label_encoders

In [9]:
df, label_encoders = preprocess_categorical(df)

In [17]:
df.head()

Unnamed: 0,HeartDisease,BMI1,BMI2,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,BMI_calc
0,0,106.0,507.0,4,0,0,3.0,30.0,0,0,7,8,4,3,6,5.0,0,0,3,1660.0
1,0,170.0,504.0,3,2,0,88.0,88.0,0,0,9,1,1,3,2,7.0,1,0,0,2918.0
2,0,7777.0,508.0,2,0,0,88.0,88.0,0,0,9,1,1,3,2,7.0,2,0,0,
3,0,9999.0,9999.0,2,0,0,88.0,88.0,0,0,12,8,1,0,0,6.0,2,0,0,
4,0,126.0,506.0,2,0,3,88.0,88.0,0,0,12,8,1,3,6,7.0,2,0,0,2034.0


In [12]:
cat_tree = DecisionTreeClassifier(random_state=0)
cat_tree.fit(X_known, y_known)

In [18]:
missing_values = cat_var_missing.drop(columns=['DiffWalking'])

In [20]:
missing_values['DiffWalking'] = cat_tree.predict(missing_values.drop(columns=['DiffWalking']))

KeyError: "['DiffWalking'] not found in axis"

In [None]:
def imputation_process(df2, feature_var, row_no, pos, num_var, bin_var, class_var, old_model_fit = "", old_model="",  previous_var= ""):
  '''Creates an adaboost ensemble based on 3 types of adaboost functionality (depending on variable type) to be used for prediction
  on first missing value, which is then imputed'''

  complete_df = df2.iloc[0:row_no]                        
  y = complete_df[feature_var]
  complete_df.drop(feature_var, axis = 1, inplace = True)
  X = complete_df
  prediction_feat = df2.iloc[row_no].copy()
  prediction_feat.drop(feature_var, inplace = True)
  
  #As a temporary fix for multiple missing values, will use mean imputation for a secondary, tertiary etc missing value temporarily 
  for series_name in X.columns:                                   
    if math.isnan(prediction_feat[series_name]):
      if series_name in bin_var+class_var:
        prediction_feat[series_name] = round(statistics.mean(df2[series_name].notna()),0)
      else:
        prediction_feat[series_name] = round(statistics.mean(df2[series_name].notna()),0)

  imp_time_start = time.time()
  if feature_var in num_var:
    if feature_var != previous_var:
      model = AdaBoostRegressor(random_state = 42)
      model_fit = model.fit(X.values,y.values)
      yhat = model.predict([prediction_feat])
    else:
      yhat = old_model_fit.predict([prediction_feat])
      model, model_fit = old_model, old_model_fit
  
  elif feature_var in bin_var:
    if feature_var != previous_var:
      model = AdaBoostClassifier(random_state = 42)
      model_fit = model.fit(X.values,y.values)
      yhat = model.predict([prediction_feat])
    else:
      yhat = old_model_fit.predict([prediction_feat])
      model, model_fit = old_model, old_model_fit

  elif feature_var in class_var:
    if feature_var != previous_var:
      model = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3),  random_state = 42)
      model_fit = model.fit(X.values,y.values)
      yhat = model.predict([prediction_feat]) 
    else:
      yhat = old_model_fit.predict([prediction_feat])
      model, model_fit = old_model, old_model_fit
  else: 
    print("Error, found variable missing from variable lists")

  df2.iloc[row_no, pos-1] = yhat[0]
  previous_var = feature_var

  return model_fit, model, previous_var


In [None]:
 model_fit_1, model_1, previous_var_1 = imputation_process(df2, feature_var, row_no, pos,  num_var, bin_var, class_var)