## A1: Data Cleaning and Preprocessing  

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("./healthcare-dataset-stroke-data.csv")
data.replace(['Unknown', 'None', 'N/A', 'NaN', 'NULL', '?',''], pd.NA, inplace=True)

In [3]:
data.head(5)

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,202.21,,never smoked,1
2,Male,80.0,0,1,105.92,32.5,never smoked,1
3,Female,49.0,0,0,171.23,34.4,smokes,1
4,Female,79.0,1,0,174.12,24.0,never smoked,1


In [4]:
data.tail(5)


Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
5105,Female,80.0,1,0,83.75,,never smoked,0
5106,Female,81.0,0,0,125.2,40.0,never smoked,0
5107,Female,35.0,0,0,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,166.29,25.6,formerly smoked,0
5109,Female,44.0,0,0,85.28,26.2,,0


In [5]:
data.sample(5)

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
1627,Female,53.0,1,0,202.66,34.1,smokes,0
4158,Male,25.0,0,0,65.36,24.7,never smoked,0
210,Female,39.0,0,0,83.24,26.3,never smoked,1
2633,Female,82.0,0,0,243.59,24.3,never smoked,0
4821,Female,34.0,0,0,68.53,29.7,never smoked,0


In [6]:
data.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.5,0.0
50%,45.0,0.0,0.0,91.885,28.1,0.0
75%,61.0,0.0,0.0,114.09,33.1,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   avg_glucose_level  5110 non-null   float64
 5   bmi                4909 non-null   float64
 6   smoking_status     3566 non-null   object 
 7   stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 319.5+ KB


In [8]:
data[((data["age"] > 50) & ( data["heart_disease"] == 1 ))]

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,105.92,32.5,never smoked,1
6,Male,74.0,1,1,70.09,27.4,never smoked,1
11,Female,61.0,0,1,120.46,36.8,smokes,1
13,Male,78.0,0,1,219.84,,,1
...,...,...,...,...,...,...,...,...
4784,Female,81.0,1,1,126.34,27.4,smokes,0
4855,Female,57.0,1,1,231.72,45.7,formerly smoked,0
4921,Male,72.0,0,1,238.27,,smokes,0
4986,Male,58.0,0,1,225.35,26.5,smokes,0


In [9]:
nulls = data.isna().sum()
print("Number of null values", nulls)

Number of null values gender                  0
age                     0
hypertension            0
heart_disease           0
avg_glucose_level       0
bmi                   201
smoking_status       1544
stroke                  0
dtype: int64


## Divde the data to dependent and independent variables

In [10]:
data.dropna(subset=["stroke"], inplace=True)
X = data.iloc[:, :-1]
Y = data.iloc[:, -1:]

In [11]:
categorical_cols = ['gender', 'hypertension', 'heart_disease', 'smoking_status', ]
num_cols = ['age', 'avg_glucose_level', 'bmi']

In [12]:
KNN_imputer = KNNImputer(n_neighbors = 3)
scaled_cols = pd.DataFrame(KNN_imputer.fit_transform(X[num_cols]),columns=num_cols)

In [13]:
categorical_imputer = SimpleImputer(strategy='most_frequent',  missing_values=pd.NA)

categorical_imputer. fit(X[categorical_cols])
X_categorical_imputed = pd.DataFrame(categorical_imputer.transform(X[categorical_cols]), columns=categorical_cols)

X = pd.concat([scaled_cols, X_categorical_imputed], axis=1)

In [14]:
upper_limit = round(X["age"].mean() + 1.5 * X["age"].std(), 4)
lower_limit = round(X["age"].mean() - 1.5 * X["age"].std(), 4)
X["age"] = np.where(X["age"] < lower_limit, lower_limit, X["age"])
X["age"] = np.where(X["age"] > upper_limit, upper_limit, X["age"])

In [15]:
Q1 = X["avg_glucose_level"].quantile(0.25)
Q3 = X["avg_glucose_level"].quantile(0.75)
IQR = Q3 - Q1

upper_limit = Q3 + 1.5 * IQR
lower_limit = Q1 - 1.5 * IQR

X["avg_glucose_level"] = np.where(X["avg_glucose_level"] < lower_limit, lower_limit, X["avg_glucose_level"])
X["avg_glucose_level"] = np.where(X["avg_glucose_level"] > upper_limit, upper_limit, X["avg_glucose_level"])

In [16]:
Q1 = X["bmi"].quantile(0.25)
Q3 = X["bmi"].quantile(0.75)
IQR = Q3 - Q1

upper_limit = Q3 + 1.5 * IQR
lower_limit = Q1 - 1.5 * IQR

X["bmi"] = np.where(X["bmi"] < lower_limit, lower_limit, X["bmi"])
X["bmi"] = np.where(X["bmi"] > upper_limit, upper_limit, X["bmi"])

In [17]:
X.describe()

Unnamed: 0,age,avg_glucose_level,bmi
count,5110.0,5110.0,5110.0
mean,43.505968,100.996204,28.798545
std,21.528474,33.214738,7.2658
min,9.3076,55.12,10.3
25%,25.0,77.245,23.7
50%,45.0,91.885,28.2
75%,61.0,114.09,33.1
max,77.1456,169.3575,47.2


In [18]:
sc_x = StandardScaler()
X_standardized = pd.DataFrame(sc_x.fit_transform(X[num_cols]), columns=num_cols)
X_standardized

Unnamed: 0,age,avg_glucose_level,bmi
0,1.091407,2.058363,1.073828
1,0.812679,2.058363,0.234196
2,1.562718,0.148256,0.509485
3,0.255223,2.058363,0.771010
4,1.562718,2.058363,-0.660494
...,...,...,...
5105,1.562718,-0.519284,-0.275089
5106,1.562718,0.728778,1.541819
5107,-0.395142,-0.542168,0.247960
5108,0.348133,1.966000,-0.440262


In [19]:
X = pd.concat([X[categorical_cols], scaled_cols], axis = 1)

In [20]:
categorical_cols = ['gender', 'smoking_status', ]
cat_dummies = pd.get_dummies(X[categorical_cols])

df = pd.concat([cat_dummies, X], axis=1)
df.drop(["gender_Other", "gender", "smoking_status"], axis=1, inplace=True) 
df

Unnamed: 0,gender_Female,gender_Male,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,hypertension,heart_disease,age,avg_glucose_level,bmi
0,False,True,True,False,False,0,1,67.0,228.69,36.6
1,True,False,False,True,False,0,0,61.0,202.21,30.5
2,False,True,False,True,False,0,1,80.0,105.92,32.5
3,True,False,False,False,True,0,0,49.0,171.23,34.4
4,True,False,False,True,False,1,0,79.0,174.12,24.0
...,...,...,...,...,...,...,...,...,...,...
5105,True,False,False,True,False,1,0,80.0,83.75,26.8
5106,True,False,False,True,False,0,0,81.0,125.20,40.0
5107,True,False,False,True,False,0,0,35.0,82.99,30.6
5108,False,True,True,False,False,0,0,51.0,166.29,25.6


In [21]:
label_encoder = LabelEncoder()

df["gender_Female"] = label_encoder.fit_transform(df["gender_Female"])
df["gender_Male"] = label_encoder.fit_transform(df["gender_Male"])
df["smoking_status_formerly smoked"] = label_encoder.fit_transform(df["smoking_status_formerly smoked"])
df["smoking_status_never smoked"] = label_encoder.fit_transform(df["smoking_status_never smoked"])
df["smoking_status_smokes"] = label_encoder.fit_transform(df["smoking_status_smokes"])

df

Unnamed: 0,gender_Female,gender_Male,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,hypertension,heart_disease,age,avg_glucose_level,bmi
0,0,1,1,0,0,0,1,67.0,228.69,36.6
1,1,0,0,1,0,0,0,61.0,202.21,30.5
2,0,1,0,1,0,0,1,80.0,105.92,32.5
3,1,0,0,0,1,0,0,49.0,171.23,34.4
4,1,0,0,1,0,1,0,79.0,174.12,24.0
...,...,...,...,...,...,...,...,...,...,...
5105,1,0,0,1,0,1,0,80.0,83.75,26.8
5106,1,0,0,1,0,0,0,81.0,125.20,40.0
5107,1,0,0,1,0,0,0,35.0,82.99,30.6
5108,0,1,1,0,0,0,0,51.0,166.29,25.6


# Split the data into train and test datas

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3832 entries, 2954 to 2732
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             3832 non-null   object 
 1   hypertension       3832 non-null   object 
 2   heart_disease      3832 non-null   object 
 3   smoking_status     3832 non-null   object 
 4   age                3832 non-null   float64
 5   avg_glucose_level  3832 non-null   float64
 6   bmi                3832 non-null   float64
dtypes: float64(3), object(4)
memory usage: 239.5+ KB


In [23]:
Y_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3832 entries, 2954 to 2732
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   stroke  3832 non-null   int64
dtypes: int64(1)
memory usage: 59.9 KB


In [24]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1278 entries, 42 to 4803
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             1278 non-null   object 
 1   hypertension       1278 non-null   object 
 2   heart_disease      1278 non-null   object 
 3   smoking_status     1278 non-null   object 
 4   age                1278 non-null   float64
 5   avg_glucose_level  1278 non-null   float64
 6   bmi                1278 non-null   float64
dtypes: float64(3), object(4)
memory usage: 79.9+ KB


In [25]:
Y_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1278 entries, 42 to 4803
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   stroke  1278 non-null   int64
dtypes: int64(1)
memory usage: 20.0 KB
