# Preprocessing

## Loading the data

In [94]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
data = pd.read_csv('data/healthcare-dataset-stroke-data.csv')
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Exploring the data

In [95]:
data.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [96]:
data["gender"].value_counts()

gender
Female    2994
Male      2115
Other        1
Name: count, dtype: int64

In [97]:
data.age.value_counts()

age
78.00    102
57.00     95
52.00     90
54.00     87
51.00     86
        ... 
1.40       3
0.48       3
0.16       3
0.08       2
0.40       2
Name: count, Length: 104, dtype: int64

In [98]:
data.hypertension.value_counts()

hypertension
0    4612
1     498
Name: count, dtype: int64

In [99]:
data.heart_disease.value_counts()

heart_disease
0    4834
1     276
Name: count, dtype: int64

In [100]:
data.ever_married.value_counts()

ever_married
Yes    3353
No     1757
Name: count, dtype: int64

In [101]:
data.work_type.value_counts()

work_type
Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: count, dtype: int64

In [102]:
data.Residence_type.value_counts()

Residence_type
Urban    2596
Rural    2514
Name: count, dtype: int64

In [103]:
data.avg_glucose_level.value_counts()

avg_glucose_level
93.88     6
73.00     5
72.49     5
91.85     5
91.68     5
         ..
61.98     1
91.01     1
120.03    1
138.51    1
118.41    1
Name: count, Length: 3979, dtype: int64

In [104]:
data.bmi.value_counts()

bmi
28.7    41
28.4    38
27.7    37
27.6    37
26.7    37
        ..
47.9     1
13.0     1
13.9     1
54.1     1
14.9     1
Name: count, Length: 418, dtype: int64

In [105]:
data.smoking_status.value_counts()

smoking_status
never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: count, dtype: int64

In [106]:
data.stroke.value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

## Removing NaNs

In [107]:
Imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
data["bmi"] = Imputer.fit_transform(data[["bmi"]])
data.drop("id",axis=1,inplace=True)
data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.000000,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,formerly smoked,0


# Removing outliers

In [108]:
def replace_iqr_outliers(df, cols):
    cleaned_df = df.copy()
    for col in cols:
        Q1, Q3 = cleaned_df[col].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
        mean_val = cleaned_df[col].mean()
        cleaned_df.loc[(cleaned_df[col] < lower) | (cleaned_df[col] > upper), col] = mean_val
    return cleaned_df


In [109]:
numerical_cols = data.select_dtypes(include=np.number).columns.tolist()
numerical_cols.remove('stroke')
data = data.drop(data[data["gender"] == "Other"].index)

data = replace_iqr_outliers(data, numerical_cols)
data

  cleaned_df.loc[(cleaned_df[col] < lower) | (cleaned_df[col] > upper), col] = mean_val
  cleaned_df.loc[(cleaned_df[col] < lower) | (cleaned_df[col] > upper), col] = mean_val


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0.000000,0.054022,Yes,Private,Urban,106.140399,36.600000,formerly smoked,1
1,Female,61.0,0.000000,0.000000,Yes,Self-employed,Rural,106.140399,28.893237,never smoked,1
2,Male,80.0,0.000000,0.054022,Yes,Private,Rural,105.920000,32.500000,never smoked,1
3,Female,49.0,0.000000,0.000000,Yes,Private,Urban,106.140399,34.400000,smokes,1
4,Female,79.0,0.097475,0.000000,Yes,Self-employed,Rural,106.140399,24.000000,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,0.097475,0.000000,Yes,Private,Urban,83.750000,28.893237,never smoked,0
5106,Female,81.0,0.000000,0.000000,Yes,Self-employed,Urban,125.200000,40.000000,never smoked,0
5107,Female,35.0,0.000000,0.000000,Yes,Self-employed,Rural,82.990000,30.600000,never smoked,0
5108,Male,51.0,0.000000,0.000000,Yes,Private,Rural,166.290000,25.600000,formerly smoked,0


## Data Discretization

### Discretizing age into 8 categories and bmi into 4 categories

In [110]:
from sklearn.preprocessing import KBinsDiscretizer

age_estimator = KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='uniform')
age_data = age_estimator.fit_transform(data[["age"]])
print(age_estimator.bin_edges_)
data["age"] = age_data

bmi_estimator = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
bmi_data = bmi_estimator.fit_transform(data[["bmi"]])
print(bmi_estimator.bin_edges_)
data["bmi"] = bmi_data

data


[array([8.000e-02, 1.032e+01, 2.056e+01, 3.080e+01, 4.104e+01, 5.128e+01,
        6.152e+01, 7.176e+01, 8.200e+01])                                ]
[array([11.3  , 20.025, 28.75 , 37.475, 46.2  ])]


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,6.0,0.000000,0.054022,Yes,Private,Urban,106.140399,2.0,formerly smoked,1
1,Female,5.0,0.000000,0.000000,Yes,Self-employed,Rural,106.140399,2.0,never smoked,1
2,Male,7.0,0.000000,0.054022,Yes,Private,Rural,105.920000,2.0,never smoked,1
3,Female,4.0,0.000000,0.000000,Yes,Private,Urban,106.140399,2.0,smokes,1
4,Female,7.0,0.097475,0.000000,Yes,Self-employed,Rural,106.140399,1.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,7.0,0.097475,0.000000,Yes,Private,Urban,83.750000,2.0,never smoked,0
5106,Female,7.0,0.000000,0.000000,Yes,Self-employed,Urban,125.200000,3.0,never smoked,0
5107,Female,3.0,0.000000,0.000000,Yes,Self-employed,Rural,82.990000,2.0,never smoked,0
5108,Male,4.0,0.000000,0.000000,Yes,Private,Rural,166.290000,1.0,formerly smoked,0


### Data Scaling

In [111]:
scaler = StandardScaler()
features_to_scale = ["avg_glucose_level"]
data[features_to_scale] =scaler.fit_transform(data[features_to_scale])
data


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,6.0,0.000000,0.054022,Yes,Private,Urban,0.594096,2.0,formerly smoked,1
1,Female,5.0,0.000000,0.000000,Yes,Self-employed,Rural,0.594096,2.0,never smoked,1
2,Male,7.0,0.000000,0.054022,Yes,Private,Rural,0.583954,2.0,never smoked,1
3,Female,4.0,0.000000,0.000000,Yes,Private,Urban,0.594096,2.0,smokes,1
4,Female,7.0,0.097475,0.000000,Yes,Self-employed,Rural,0.594096,1.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,7.0,0.097475,0.000000,Yes,Private,Urban,-0.436210,2.0,never smoked,0
5106,Female,7.0,0.000000,0.000000,Yes,Self-employed,Urban,1.471133,3.0,never smoked,0
5107,Female,3.0,0.000000,0.000000,Yes,Self-employed,Rural,-0.471182,2.0,never smoked,0
5108,Male,4.0,0.000000,0.000000,Yes,Private,Rural,3.361911,1.0,formerly smoked,0


# Categorical Data Encoding

In [112]:
encoder = LabelEncoder()
object_columns= data.select_dtypes(include=['object']).columns.tolist()

for col in object_columns:
    data[col] = encoder.fit_transform(data[col])
data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,6.0,0.000000,0.054022,1,2,1,0.594096,2.0,1,1
1,0,5.0,0.000000,0.000000,1,3,0,0.594096,2.0,2,1
2,1,7.0,0.000000,0.054022,1,2,0,0.583954,2.0,2,1
3,0,4.0,0.000000,0.000000,1,2,1,0.594096,2.0,3,1
4,0,7.0,0.097475,0.000000,1,3,0,0.594096,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,0,7.0,0.097475,0.000000,1,2,1,-0.436210,2.0,2,0
5106,0,7.0,0.000000,0.000000,1,3,1,1.471133,3.0,2,0
5107,0,3.0,0.000000,0.000000,1,3,0,-0.471182,2.0,2,0
5108,1,4.0,0.000000,0.000000,1,2,0,3.361911,1.0,1,0


## Saveing to a new CSV

In [113]:
data.to_csv('data/processed_data.csv', index=False)