### A3: Implementing basic data preprocessing techniques like normalization and encoding

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

In [2]:
data = pd.read_csv("./healthcare-dataset-stroke-data.csv")
data.replace(['Unknown', 'None', 'N/A', 'NaN', 'NULL', '?',''], pd.NA, inplace=True)

In [3]:
nulls = data.isna().sum()
print("Number of null values", nulls)

Number of null values gender                  0
age                     0
hypertension            0
heart_disease           0
avg_glucose_level       0
bmi                   201
smoking_status       1544
stroke                  0
dtype: int64


In [4]:
data.dropna(subset=["stroke"], inplace=True)
X = data.iloc[:, :-1]
Y = data.iloc[:, -1:]

In [5]:
num_cols = ['age', 'avg_glucose_level', 'bmi']
categorical_cols = ['gender', 'hypertension', 'heart_disease', 'smoking_status', ]

In [6]:
KNN_imputer = KNNImputer(n_neighbors = 3)
scaled_cols = pd.DataFrame(KNN_imputer.fit_transform(X[num_cols]),columns=num_cols)

In [7]:
categorical_imputer = SimpleImputer(strategy='most_frequent',  missing_values=pd.NA)

categorical_imputer. fit(X[categorical_cols])
X_categorical_imputed = pd.DataFrame(categorical_imputer.transform(X[categorical_cols]), columns=categorical_cols)

X = pd.concat([scaled_cols, X_categorical_imputed], axis=1)

In [8]:
upper_limit = round(X["age"].mean() + 1.5 * X["age"].std(), 4)
lower_limit = round(X["age"].mean() - 1.5 * X["age"].std(), 4)
X["age"] = np.where(X["age"] < lower_limit, lower_limit, X["age"])
X["age"] = np.where(X["age"] > upper_limit, upper_limit, X["age"])

In [9]:
Q1 = X["avg_glucose_level"].quantile(0.25)
Q3 = X["avg_glucose_level"].quantile(0.75)
IQR = Q3 - Q1

upper_limit = Q3 + 1.5 * IQR
lower_limit = Q1 - 1.5 * IQR

X["avg_glucose_level"] = np.where(X["avg_glucose_level"] < lower_limit, lower_limit, X["avg_glucose_level"])
X["avg_glucose_level"] = np.where(X["avg_glucose_level"] > upper_limit, upper_limit, X["avg_glucose_level"])

In [10]:
Q1 = X["bmi"].quantile(0.25)
Q3 = X["bmi"].quantile(0.75)
IQR = Q3 - Q1

upper_limit = Q3 + 1.5 * IQR
lower_limit = Q1 - 1.5 * IQR

X["bmi"] = np.where(X["bmi"] < lower_limit, lower_limit, X["bmi"])
X["bmi"] = np.where(X["bmi"] > upper_limit, upper_limit, X["bmi"])

In [11]:
X.describe()

Unnamed: 0,age,avg_glucose_level,bmi
count,5110.0,5110.0,5110.0
mean,43.505968,100.996204,28.798545
std,21.528474,33.214738,7.2658
min,9.3076,55.12,10.3
25%,25.0,77.245,23.7
50%,45.0,91.885,28.2
75%,61.0,114.09,33.1
max,77.1456,169.3575,47.2


### we handled missing values and outliers as A2 and we will proceed to normalization and encoding

# Normalization for numerical columns

## Standardization 

In [12]:
sc_x = StandardScaler()
X_standardized = pd.DataFrame(sc_x.fit_transform(X[num_cols]), columns=num_cols)
X_standardized

Unnamed: 0,age,avg_glucose_level,bmi
0,1.091407,2.058363,1.073828
1,0.812679,2.058363,0.234196
2,1.562718,0.148256,0.509485
3,0.255223,2.058363,0.771010
4,1.562718,2.058363,-0.660494
...,...,...,...
5105,1.562718,-0.519284,-0.275089
5106,1.562718,0.728778,1.541819
5107,-0.395142,-0.542168,0.247960
5108,0.348133,1.966000,-0.440262


## Min-max scaler

In [13]:
min_max_x = MinMaxScaler()
X_scaled = pd.DataFrame(min_max_x.fit_transform(X[num_cols]), columns=num_cols)
X_scaled

Unnamed: 0,age,avg_glucose_level,bmi
0,0.850444,1.000000,0.712737
1,0.761998,1.000000,0.547425
2,1.000000,0.444688,0.601626
3,0.585106,1.000000,0.653117
4,1.000000,1.000000,0.371274
...,...,...,...
5105,1.000000,0.250618,0.447154
5106,1.000000,0.613459,0.804878
5107,0.378732,0.243965,0.550136
5108,0.614588,0.973148,0.414634


# Encoding for categorical columns

In [14]:
X = pd.concat([X[categorical_cols], scaled_cols], axis = 1)
X

Unnamed: 0,gender,hypertension,heart_disease,smoking_status,age,avg_glucose_level,bmi
0,Male,0,1,formerly smoked,67.0,228.69,36.6
1,Female,0,0,never smoked,61.0,202.21,30.5
2,Male,0,1,never smoked,80.0,105.92,32.5
3,Female,0,0,smokes,49.0,171.23,34.4
4,Female,1,0,never smoked,79.0,174.12,24.0
...,...,...,...,...,...,...,...
5105,Female,1,0,never smoked,80.0,83.75,26.8
5106,Female,0,0,never smoked,81.0,125.20,40.0
5107,Female,0,0,never smoked,35.0,82.99,30.6
5108,Male,0,0,formerly smoked,51.0,166.29,25.6


## One Hot Encoding

In [15]:
categorical_cols = ['gender', 'smoking_status', ]
cat_dummies = pd.get_dummies(X[categorical_cols])

df = pd.concat([cat_dummies, X], axis=1)
df.drop(["gender_Other", "gender", "smoking_status"], axis=1, inplace=True) 
df

Unnamed: 0,gender_Female,gender_Male,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,hypertension,heart_disease,age,avg_glucose_level,bmi
0,False,True,True,False,False,0,1,67.0,228.69,36.6
1,True,False,False,True,False,0,0,61.0,202.21,30.5
2,False,True,False,True,False,0,1,80.0,105.92,32.5
3,True,False,False,False,True,0,0,49.0,171.23,34.4
4,True,False,False,True,False,1,0,79.0,174.12,24.0
...,...,...,...,...,...,...,...,...,...,...
5105,True,False,False,True,False,1,0,80.0,83.75,26.8
5106,True,False,False,True,False,0,0,81.0,125.20,40.0
5107,True,False,False,True,False,0,0,35.0,82.99,30.6
5108,False,True,True,False,False,0,0,51.0,166.29,25.6


## Label encoding

In [16]:
label_encoder = LabelEncoder()

df["gender_Female"] = label_encoder.fit_transform(df["gender_Female"])
df["gender_Male"] = label_encoder.fit_transform(df["gender_Male"])
df["smoking_status_formerly smoked"] = label_encoder.fit_transform(df["smoking_status_formerly smoked"])
df["smoking_status_never smoked"] = label_encoder.fit_transform(df["smoking_status_never smoked"])
df["smoking_status_smokes"] = label_encoder.fit_transform(df["smoking_status_smokes"])

df

Unnamed: 0,gender_Female,gender_Male,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,hypertension,heart_disease,age,avg_glucose_level,bmi
0,0,1,1,0,0,0,1,67.0,228.69,36.6
1,1,0,0,1,0,0,0,61.0,202.21,30.5
2,0,1,0,1,0,0,1,80.0,105.92,32.5
3,1,0,0,0,1,0,0,49.0,171.23,34.4
4,1,0,0,1,0,1,0,79.0,174.12,24.0
...,...,...,...,...,...,...,...,...,...,...
5105,1,0,0,1,0,1,0,80.0,83.75,26.8
5106,1,0,0,1,0,0,0,81.0,125.20,40.0
5107,1,0,0,1,0,0,0,35.0,82.99,30.6
5108,0,1,1,0,0,0,0,51.0,166.29,25.6
