### A4: Implementing Advanced feature engineering and selection techniques

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [39]:
data = pd.read_csv("./healthcare-dataset-stroke-data.csv")
data.replace(['Unknown', 'None', 'N/A', 'NaN', 'NULL', '?',''], pd.NA, inplace=True)

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke,cardiovascular_health_score,BMI_Glucose,overall_health
0,Male,67.0,0,1,228.69,36.6,formerly smoked,1,1,8370.054,851.8
1,Female,61.0,0,0,202.21,,never smoked,1,0,,
2,Male,80.0,0,1,105.92,32.5,never smoked,1,1,3442.4,757.5
3,Female,49.0,0,0,171.23,34.4,smokes,1,0,5890.312,791.2
4,Female,79.0,1,0,174.12,24.0,never smoked,1,1,4178.88,562.0


In [40]:
df.dropna(subset=["stroke"], inplace=True)
X = data.iloc[:, :-1]
Y = data.iloc[:, -1:]

In [41]:
categorical_cols = ['gender', 'hypertension', 'heart_disease', 'smoking_status', ]
num_cols = ['age', 'avg_glucose_level', 'bmi']

In [42]:
KNN_imputer = KNNImputer(n_neighbors = 3)
scaled_cols = pd.DataFrame(KNN_imputer.fit_transform(X[num_cols]),columns=num_cols)

In [43]:
categorical_imputer = SimpleImputer(strategy='most_frequent',  missing_values=pd.NA)

categorical_imputer. fit(X[categorical_cols])
X_categorical_imputed = pd.DataFrame(categorical_imputer.transform(X[categorical_cols]), columns=categorical_cols)

X = pd.concat([scaled_cols, X_categorical_imputed], axis=1)

In [44]:
upper_limit = round(X["age"].mean() + 1.5 * X["age"].std(), 4)
lower_limit = round(X["age"].mean() - 1.5 * X["age"].std(), 4)
X["age"] = np.where(X["age"] < lower_limit, lower_limit, X["age"])
X["age"] = np.where(X["age"] > upper_limit, upper_limit, X["age"])

In [45]:
Q1 = X["avg_glucose_level"].quantile(0.25)
Q3 = X["avg_glucose_level"].quantile(0.75)
IQR = Q3 - Q1

upper_limit = Q3 + 1.5 * IQR
lower_limit = Q1 - 1.5 * IQR

X["avg_glucose_level"] = np.where(X["avg_glucose_level"] < lower_limit, lower_limit, X["avg_glucose_level"])
X["avg_glucose_level"] = np.where(X["avg_glucose_level"] > upper_limit, upper_limit, X["avg_glucose_level"])

In [46]:
Q1 = X["bmi"].quantile(0.25)
Q3 = X["bmi"].quantile(0.75)
IQR = Q3 - Q1

upper_limit = Q3 + 1.5 * IQR
lower_limit = Q1 - 1.5 * IQR

X["bmi"] = np.where(X["bmi"] < lower_limit, lower_limit, X["bmi"])
X["bmi"] = np.where(X["bmi"] > upper_limit, upper_limit, X["bmi"])

In [50]:
sc_x = StandardScaler()
X_standardized = pd.DataFrame(sc_x.fit_transform(X[num_cols]), columns=num_cols)

In [48]:
X = pd.concat([X[categorical_cols], scaled_cols], axis = 1)

In [51]:
categorical_cols = ['gender', 'smoking_status', ]
cat_dummies = pd.get_dummies(X[categorical_cols])

df = pd.concat([cat_dummies, X], axis=1)
df.drop(["gender_Other", "gender", "smoking_status"], axis=1, inplace=True) 
df

Unnamed: 0,gender_Female,gender_Male,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,hypertension,heart_disease,age,avg_glucose_level,bmi
0,False,True,True,False,False,0,1,67.0,228.69,36.6
1,True,False,False,True,False,0,0,61.0,202.21,30.5
2,False,True,False,True,False,0,1,80.0,105.92,32.5
3,True,False,False,False,True,0,0,49.0,171.23,34.4
4,True,False,False,True,False,1,0,79.0,174.12,24.0
...,...,...,...,...,...,...,...,...,...,...
5105,True,False,False,True,False,1,0,80.0,83.75,26.8
5106,True,False,False,True,False,0,0,81.0,125.20,40.0
5107,True,False,False,True,False,0,0,35.0,82.99,30.6
5108,False,True,True,False,False,0,0,51.0,166.29,25.6


### 1. Feature engineering
we will add the following features to the dataset:

1, cardiovascular health score: this will be just the addition of the 2 cardiovascular health metrics (hypertension and heart disease)

2, BMI_Glucose interactions : this will be the multiplication of the BMI and Glucose features

3, smoking_bmi interactions: this will be the multiplication of the weighted smoking status  and BMI features

4, overall_health index: a weighted sum of bmi and cardiovascular health score

In [52]:
# 1, cardiovascular health score
df['cardiovascular_health_score'] = df ['heart_disease'] + df['hypertension']
print(df.shape)
df.head()

(5110, 11)


Unnamed: 0,gender_Female,gender_Male,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,hypertension,heart_disease,age,avg_glucose_level,bmi,cardiovascular_health_score
0,False,True,True,False,False,0,1,67.0,228.69,36.6,1
1,True,False,False,True,False,0,0,61.0,202.21,30.5,0
2,False,True,False,True,False,0,1,80.0,105.92,32.5,1
3,True,False,False,False,True,0,0,49.0,171.23,34.4,0
4,True,False,False,True,False,1,0,79.0,174.12,24.0,1


In [53]:
# 2. BMI Glucose interactions

df["BMI_Glucose"] = df["bmi"] * df["avg_glucose_level"]
df.head()


Unnamed: 0,gender_Female,gender_Male,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,hypertension,heart_disease,age,avg_glucose_level,bmi,cardiovascular_health_score,BMI_Glucose
0,False,True,True,False,False,0,1,67.0,228.69,36.6,1,8370.054
1,True,False,False,True,False,0,0,61.0,202.21,30.5,0,6167.405
2,False,True,False,True,False,0,1,80.0,105.92,32.5,1,3442.4
3,True,False,False,False,True,0,0,49.0,171.23,34.4,0,5890.312
4,True,False,False,True,False,1,0,79.0,174.12,24.0,1,4178.88


In [54]:
# 3. SMoking_Bmi_interactions
df["smoking_bmi_interaction"] = 10*df["smoking_status_formerly smoked"].astype(int) + -4*df["smoking_status_never smoked"].astype(int) + 17*df["smoking_status_smokes"].astype(int) + 23*df["BMI_Glucose"]

df.head()

Unnamed: 0,gender_Female,gender_Male,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,hypertension,heart_disease,age,avg_glucose_level,bmi,cardiovascular_health_score,BMI_Glucose,smoking_bmi_interaction
0,False,True,True,False,False,0,1,67.0,228.69,36.6,1,8370.054,192521.242
1,True,False,False,True,False,0,0,61.0,202.21,30.5,0,6167.405,141846.315
2,False,True,False,True,False,0,1,80.0,105.92,32.5,1,3442.4,79171.2
3,True,False,False,False,True,0,0,49.0,171.23,34.4,0,5890.312,135494.176
4,True,False,False,True,False,1,0,79.0,174.12,24.0,1,4178.88,96110.24


In [55]:
# 4,overall_health_index

df["overall_health"] = 23*df["bmi"] + 10*df["cardiovascular_health_score"]

df.head()

Unnamed: 0,gender_Female,gender_Male,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,hypertension,heart_disease,age,avg_glucose_level,bmi,cardiovascular_health_score,BMI_Glucose,smoking_bmi_interaction,overall_health
0,False,True,True,False,False,0,1,67.0,228.69,36.6,1,8370.054,192521.242,851.8
1,True,False,False,True,False,0,0,61.0,202.21,30.5,0,6167.405,141846.315,701.5
2,False,True,False,True,False,0,1,80.0,105.92,32.5,1,3442.4,79171.2,757.5
3,True,False,False,False,True,0,0,49.0,171.23,34.4,0,5890.312,135494.176,791.2
4,True,False,False,True,False,1,0,79.0,174.12,24.0,1,4178.88,96110.24,562.0


##### At this point we have implemented our own advanced feature engineering with important features extracted by combining variables which affect stroke 

In [56]:
# save the new preprocessed and feature added dataset
df.to_csv("final_dataset.csv",index = False)