# Heart Disease Model

This notebook shows how to quickly train a scikit-learn ML model using pipelines and column transformers, and how to save the complete pipeline as a single model.

In [41]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
import joblib

In [18]:
# Load the dataset; pandas infers zip decompression from the '.zip' extension.
df = pd.read_csv('./data/heart-data.zip')

In [19]:
# Number of rows and columns in the loaded data.
df.shape

(918, 12)

In [20]:
# Count missing values per column; the preprocessing below has no imputation step.
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [21]:
# Preview the first rows to see the raw feature values.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [22]:
# Inspect dtypes; object columns are treated as categorical features later on.
df.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [23]:
# Separate the target column from the predictor columns.
y = df['HeartDisease']
X = df.drop(labels=['HeartDisease'], axis='columns')

In [24]:
# Preview the feature matrix (target column removed).
X.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up


In [25]:
# Preview the target values.
y.head()

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

## Model Pipeline

In [26]:
# Split the feature names by dtype: object columns are categorical,
# everything else is numeric.
numeric_columns = X.select_dtypes(exclude=[object]).columns.tolist()
cat_columns = X.select_dtypes(include=[object]).columns.tolist()

In [27]:
# Columns that will be one-hot encoded.
cat_columns

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [28]:
# Columns that will be standard-scaled.
numeric_columns

['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

In [29]:
# Scale the numeric features and one-hot encode the categorical ones.
# handle_unknown='ignore' makes categories unseen during fitting encode as
# all zeros at prediction time instead of raising an error.
preprocessing_transformer = make_column_transformer(
    (StandardScaler(), numeric_columns),
    (OneHotEncoder(handle_unknown='ignore', categories='auto'), cat_columns),
)

In [36]:
# Chain preprocessing and the classifier so both are fitted and saved together.
model = make_pipeline(
    preprocessing_transformer,
    LogisticRegression(),
)

## Explore Model Options

In [37]:
# 3-fold cross-validation of the whole pipeline, scored with the
# estimator's default metric (accuracy for classifiers).
scores = cross_val_score(estimator=model, X=X, y=y, cv=3)
scores

array([0.87254902, 0.85947712, 0.77124183])

In [38]:
# Average score across the folds.
scores.mean()

0.8344226579520697

## Train Model

Train the model on all of the available data.  Recall that the model is actually the entire pipeline, so all of the encodings and scaling will be taken care of.

In [39]:
# Fit the preprocessing steps and the classifier together on the full dataset.
model.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['Age', 'RestingBP',
                                                   'Cholesterol', 'FastingBS',
                                                   'MaxHR', 'Oldpeak']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Sex', 'ChestPainType',
                                                   'RestingECG',
                                                   'ExerciseAngina',
                                                   'ST_Slope'])])),
                ('logisticregression', LogisticRegression())])

## Save Model

In [42]:
# Persist the fitted pipeline (preprocessing + classifier) to a single file.
joblib.dump(model, 'heart_model.pkl')

['heart_model.pkl']

## Load Model

Load the model, as we would if we deployed the model and were running in production.

Use a row from the training data. Predicting on data the model was trained on is not a best practice, but here we are only trying out the save/load/predict process.

In [43]:
# Reload the saved pipeline. joblib.load unpickles the file, which can execute
# arbitrary code, so only load model files that come from a trusted source.
heart_model = joblib.load('heart_model.pkl')

In [65]:
# Use a sample from the initial dataframe.
# Double brackets keep X_test a one-row DataFrame, which the pipeline's
# column transformer needs in order to select columns by name.
X_test = X.iloc[[4]]
# Select the label by position as well, so it always refers to the same row
# as X_test even when the index is not the default 0..n-1 range.
y_test = y.iloc[4]

In [70]:
# Show the single-row frame that will be passed to the model.
X_test

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up


In [72]:
# Raw values of the row; the hand-built new_sample below uses the same values.
X_test.values

array([[54, 'M', 'NAP', 150, 195, 0, 'Normal', 122, 'N', 0.0, 'Up']],
      dtype=object)

In [71]:
# Column names, in the same order as the training features.
X_test.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope'],
      dtype='object')

In [47]:
# Known label for the sampled row, to compare against the prediction.
y_test

0

Make a prediction for a row taken from the dataframe that was read in initially.

In [68]:
# The reloaded pipeline applies scaling and encoding before classifying the row.
y_pred = heart_model.predict(X_test)

In [69]:
# Predicted class for the sampled row.
y_pred

array([0])

Create a new record, as if we read it from a REST request, and create a prediction dataframe.

In [77]:
# One record, laid out in the same column order as the training features.
new_sample = [
    [
        54,        # Age
        'M',       # Sex
        'NAP',     # ChestPainType
        150,       # RestingBP
        195,       # Cholesterol
        0,         # FastingBS
        'Normal',  # RestingECG
        122,       # MaxHR
        'N',       # ExerciseAngina
        0.0,       # Oldpeak
        'Up',      # ST_Slope
    ],
]

In [78]:
# Wrap the record in a DataFrame with the training column names so the
# pipeline can select columns by name.
new_sample_df = pd.DataFrame(
    data=new_sample,
    columns=[
        'Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
        'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
    ],
)

In [79]:
# Check that the hand-built frame has the expected columns and values.
new_sample_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,54,M,NAP,150,195,0,Normal,122,N,0.0,Up


In [80]:
# Predict for the hand-built record using the reloaded pipeline.
y_pred = heart_model.predict(new_sample_df)

In [81]:
# Predicted class for the hand-built record.
y_pred

array([0])