# Capstone 2: Preprocessing

### 1. Sourcing and Loading

#### 1a. Import Relevant Libraries

In [1]:
#Import pandas, matplotlib.pyplot, and seaborn 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

from library.sb_utils import save_file

#### 1b. Load the Data

In [2]:
# Build the path with os.path so the parent-directory lookup works on any OS;
# a literal '..\' separator is only understood on Windows.
stroke_data = pd.read_csv(os.path.join(os.pardir, 'healthcare-dataset-stroke-data.csv'))

In [3]:
# Preview the first five rows of the freshly loaded data
stroke_data.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


#### 1c. Data Wrangling

I'm now going to do a few quick data wrangling steps. Some of these steps are repeated from the Data Wrangling notebook. I am repeating some of these steps because I want to recreate the dummy variables in a different way. 

In [4]:
# Tally missing values per column: absolute count and share of rows (in percent)
null_mask = stroke_data.isnull()
missing = pd.DataFrame({'count': null_mask.sum(), '%': 100 * null_mask.mean()})
missing.sort_values(by='count')

Unnamed: 0,count,%
id,0,0.0
gender,0,0.0
age,0,0.0
hypertension,0,0.0
heart_disease,0,0.0
ever_married,0,0.0
work_type,0,0.0
Residence_type,0,0.0
avg_glucose_level,0,0.0
smoking_status,0,0.0


In [5]:
# List the object-dtype (string) columns, which are the categorical features
stroke_data.select_dtypes(include=['object'])

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
0,Male,Yes,Private,Urban,formerly smoked
1,Female,Yes,Self-employed,Rural,never smoked
2,Male,Yes,Private,Rural,never smoked
3,Female,Yes,Private,Urban,smokes
4,Female,Yes,Self-employed,Rural,never smoked
...,...,...,...,...,...
5105,Female,Yes,Private,Urban,never smoked
5106,Female,Yes,Self-employed,Urban,never smoked
5107,Female,Yes,Self-employed,Rural,never smoked
5108,Male,Yes,Private,Rural,formerly smoked


In [6]:
# Count how many distinct levels each categorical column has
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
stroke_data[categorical_cols].nunique()

gender            3
ever_married      2
work_type         5
Residence_type    2
smoking_status    4
dtype: int64

In [7]:
# Since there is only one 'Other' gender value, I will remove it.
# reset_index closes the gap the dropped row leaves in the index, so every frame
# derived from stroke_data shares one contiguous 0..n-1 index and the index-keyed
# merges further down pair each row with itself rather than with a neighbour.
stroke_data = stroke_data[stroke_data['gender'] != 'Other'].reset_index(drop=True)
stroke_data['gender'].value_counts()

Female    2994
Male      2115
Name: gender, dtype: int64

### 2. Encode Categorical Variables:

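I will do two separate encodings here. For 'gender', 'ever_married', and 'Residence_type', which are binary, I will use scikit-learn's OneHotEncoder with the first level dropped, so each becomes a single 0/1 column. For the remaining categorical variables ('work_type' and 'smoking_status'), I will use pandas' get_dummies, which creates one indicator column per category.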

In [8]:
# Dummy encoding: one indicator column per level of each multi-level feature.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version (pandas >= 2.0 would otherwise default to bool columns).
dummy_df = pd.get_dummies(stroke_data[['work_type', 'smoking_status']], dtype=np.uint8)
print(dummy_df.shape)
print(type(dummy_df))
dummy_df.head(-10)

(5109, 9)
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,0,1,0,0,0,1,0,0
1,0,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,1
4,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...
5095,0,0,0,0,1,1,0,0,0
5096,1,0,0,0,0,0,0,1,0
5097,0,0,0,1,0,1,0,0,0
5098,0,0,0,0,1,1,0,0,0


In [9]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
# One hot encoding of the binary features. drop='first' removes the first level
# of each feature, so every feature ends up as a single 0/1 column.
binary_cols = ['gender', 'ever_married', 'Residence_type']
data_slice = stroke_data[binary_cols]
enc = OneHotEncoder(drop='first')
one_hot_data = enc.fit_transform(data_slice)
print(one_hot_data.shape)

# The encoder returns a sparse matrix; densify it and restore labels and index
one_hot_data = pd.DataFrame(
    one_hot_data.toarray(),
    columns=binary_cols,
    index=dummy_df.index,
    dtype=np.uint8,
)
print(one_hot_data.shape)
print(type(one_hot_data))
one_hot_data.head(-10)

(5109, 3)
(5109, 3)
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,gender,ever_married,Residence_type
0,1,1,1
1,0,1,0
2,1,1,0
3,0,1,1
4,0,1,0
...,...,...,...
5095,1,0,0
5096,1,1,0
5097,1,1,1
5098,1,0,1


In [11]:
# Combine the two encoded frames column-wise.
# Both were built row-for-row from stroke_data and carry its index, so an index
# join pairs each row with itself and keeps that index. (Merging on the index
# values passed as key arrays returned a fresh 0..n-1 index, which no longer
# matched stroke_data's index once a row had been dropped from it.)
cat_data_encoded = one_hot_data.join(dummy_df)

# Sanity check: every input row must appear exactly once in the result
assert len(cat_data_encoded) == len(one_hot_data) == len(dummy_df)

print(cat_data_encoded.shape)
cat_data_encoded.head(-10)

(5109, 12)


Unnamed: 0,gender,ever_married,Residence_type,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1,1,1,0,0,1,0,0,0,1,0,0
1,0,1,0,0,0,0,1,0,0,0,1,0
2,1,1,0,0,0,1,0,0,0,0,1,0
3,0,1,1,0,0,1,0,0,0,0,0,1
4,0,1,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5094,1,0,0,0,0,0,0,1,1,0,0,0
5095,1,1,0,1,0,0,0,0,0,0,1,0
5096,1,1,1,0,0,0,1,0,1,0,0,0
5097,1,0,1,0,0,0,0,1,1,0,0,0


##### Merge all the data back together:

In [13]:
# merge categorical and numerical variables back together
# NOTE(review): 'age' is not selected here, so it is absent from the final
# feature set — confirm this omission is intentional.
stroke_data_num = stroke_data[['id', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']]

# cat_data_encoded and stroke_data_num both hold one row per stroke_data row, in
# the same order, but their index labels can differ (the earlier row drop leaves
# a gap in stroke_data's index). Matching on index labels would then pair a
# patient's categorical values with another patient's numeric values and target,
# so align the two frames by position instead.
assert len(cat_data_encoded) == len(stroke_data_num), "row counts diverged before merge"
stroke_data_encoded = pd.concat(
    [cat_data_encoded.reset_index(drop=True), stroke_data_num.reset_index(drop=True)],
    axis=1,
)
stroke_data_encoded.head()

Unnamed: 0,gender,ever_married,Residence_type,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,id,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,1,1,1,0,0,1,0,0,0,1,0,0,9046,0,1,228.69,36.6,1
1,0,1,0,0,0,0,1,0,0,0,1,0,51676,0,0,202.21,,1
2,1,1,0,0,0,1,0,0,0,0,1,0,31112,0,1,105.92,32.5,1
3,0,1,1,0,0,1,0,0,0,0,0,1,60182,0,0,171.23,34.4,1
4,0,1,0,0,0,0,1,0,0,0,1,0,1665,1,0,174.12,24.0,1


##### Remove NaNs from the data

In [14]:
# Since bmi appears to be normally distributed (besides the couple outliers), I will replace NaNs with the median value.
# Assign the filled column back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment, which newer pandas versions warn about
# and which does not update the parent frame under Copy-on-Write.
bmi_median = stroke_data_encoded['bmi'].median()
stroke_data_encoded['bmi'] = stroke_data_encoded['bmi'].fillna(bmi_median)

# check that all nulls are removed
stroke_data_encoded['bmi'].isnull().sum()

0

### 3. Scale the Data

Now, I need to scale the data so that it is properly prepared for machine learning algorithms. We will be using scikit-learn's StandardScaler to accomplish this. The only columns that need to be scaled after the encoding are 'avg_glucose_level' and 'bmi'. All of the other variables are binary, except 'id', which doesn't make sense to scale.

In [15]:
from sklearn.preprocessing import StandardScaler

In [21]:
# Standardise the two continuous columns (avg_glucose_level and bmi)
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test rows contribute to the fitted mean/std — confirm this is acceptable.
cols_to_scale = ['avg_glucose_level', 'bmi']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(stroke_data_encoded[cols_to_scale])

# Wrap the returned array in a frame that keeps the original row index
stroke_columns_scaled = pd.DataFrame(
    scaled_values,
    columns=cols_to_scale,
    index=stroke_data_encoded.index,
)
stroke_columns_scaled.head()

Unnamed: 0,avg_glucose_level,bmi
0,2.706151,1.004738
1,2.121398,-0.099202
2,-0.004957,0.47225
3,1.437272,0.719013
4,1.501091,-0.63169


In [23]:
# Overwrite the raw continuous columns with their standardised counterparts
for scaled_col in ('avg_glucose_level', 'bmi'):
    stroke_data_encoded[scaled_col] = stroke_columns_scaled[scaled_col]
stroke_data_encoded.head()

Unnamed: 0,gender,ever_married,Residence_type,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,id,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,1,1,1,0,0,1,0,0,0,1,0,0,9046,0,1,2.706151,1.004738,1
1,0,1,0,0,0,0,1,0,0,0,1,0,51676,0,0,2.121398,-0.099202,1
2,1,1,0,0,0,1,0,0,0,0,1,0,31112,0,1,-0.004957,0.47225,1
3,0,1,1,0,0,1,0,0,0,0,0,1,60182,0,0,1.437272,0.719013,1
4,0,1,0,0,0,0,1,0,0,0,1,0,1665,1,0,1.501091,-0.63169,1


### 4. Split into Training and Testing Sets

Finally, we can split the data into training and testing sets to be used in the modeling phase. For the split, we are going to do a simple 80:20 ratio.

In [24]:
from sklearn.model_selection import train_test_split

In [27]:
# Separate the target ('stroke') from the feature matrix
# NOTE(review): 'id' is still a column of X at this point — confirm that it is
# dropped before modelling, or that keeping it here is intentional.
X = stroke_data_encoded.drop(columns=['stroke'])
y = stroke_data_encoded['stroke']

# Preview the features so a fresh run reproduces the output recorded for this cell
X.head()

Unnamed: 0,gender,ever_married,Residence_type,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,id,hypertension,heart_disease,avg_glucose_level,bmi
0,1,1,1,0,0,1,0,0,0,1,0,0,9046,0,1,2.706151,1.004738
1,0,1,0,0,0,0,1,0,0,0,1,0,51676,0,0,2.121398,-0.099202
2,1,1,0,0,0,1,0,0,0,0,1,0,31112,0,1,-0.004957,0.47225
3,0,1,1,0,0,1,0,0,0,0,0,1,60182,0,0,1.437272,0.719013
4,0,1,0,0,0,0,1,0,0,0,1,0,1665,1,0,1.501091,-0.63169


In [28]:
# use train_test_split to split into training and testing sets (80:20).
# stratify=y keeps the proportion of stroke / non-stroke rows the same in both
# partitions, so neither split ends up with a skewed share of the target class.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

##### 4a. Save the data to file

In [30]:
import hickle as hkl

# Bundle the four partitions under descriptive keys and write them to one file
data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
hkl.dump(data, 'stroke_traintest_data.hkl')

pandas 1.5.3


