In [41]:
import os
import uuid
import pickle
import warnings
import numpy as np
import pandas as pd 
from pprint import pprint
from collections import Counter

# preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from mlxtend.classifier import StackingCVClassifier

# metrics
from scipy.stats import zscore
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, classification_report, roc_auc_score, f1_score

# plotting
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

# Contents

- Util functions
- Part One: Intial POC
    - Data import & examination
    - Feature exploration
    - Feature dictionary
    - Model training
    - League table
- Part Two: Pipeline

# - Util functions

The code below would normally be in a util library and imported, however I have copied the relevant functions here so that the proper notebook content below runs from top to bottom.

In [42]:
def zscore_outliers_continuous_feature(df, feature, threshold):
    """return the number of records outside the zscore threshold"""
    tmp = df.copy()
    tmp["zscore"] = (tmp[feature] - tmp[feature].mean())/tmp[feature].std(ddof=0)
    return len(tmp[tmp["zscore"] > threshold])

def iqr_outliers_continuous_feature(df, feature):
    """return the number of records outside 1.5 x the interquartile range"""
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    return len(df[(df[feature] < (Q1-1.5*IQR)) | (df[feature] > (Q3+1.5*IQR))])

def distplot_continuous_feature(df, feature):
    """plotly histogram of continuous feature from dataframe"""
    fig = px.histogram(df, x=feature, marginal="box", barmode="overlay")
    fig.write_image("outputs/features/{}/displot.png".format(feature), width=1080, height=720, scale=3)
    fig.show()
    
def distplot_continuous_feature_vs_target(df, feature):
    """plotly histogram of continuous feature from dataframe verses target variable"""
    fig = px.histogram(df, x=feature, color="target", marginal="box", barmode="overlay")
    fig.write_image("outputs/features/{}/displot_target.png".format(feature), width=1080, height=720, scale=3)
    fig.show()  
    
def barplot_categorical_feature(df, feature):
    """barplot of categorial feature from dataframe"""
    tmp = df.copy()
    tmp[feature] = tmp[feature].astype(str)
    tmp['target'] = tmp['target'].astype(str)
    fig = px.bar(tmp, x=feature)
    fig.write_image("outputs/features/{}/barplot.png".format(feature), width=1080, height=720, scale=3)
    fig.show()
    
def barplot_categorical_feature_vs_target(df, feature):
    """barplot of categorial feature from dataframe verse target variable"""
    tmp = df.copy()
    tmp[feature] = tmp[feature].astype(str)
    tmp['target'] = tmp['target'].astype(str)
    fig = px.bar(tmp, x=feature, color="target")
    fig.write_image("outputs/features/{}/barplot_target.png".format(feature), width=1080, height=720, scale=3)
    fig.show()
    
def plot_league_table(df):
    "scatter plot of ranked league table accuracy"
    tmp = df.copy()
    tmp.loc[tmp['mean_cv_accuracy'].isnull(), 'mean_cv_accuracy'] = tmp['accuracy']
    tmp.loc[tmp['auc'].isnull(), 'auc'] = tmp['accuracy']
    tmp = tmp.sort_values(['mean_cv_accuracy'])
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=tmp['name'], y=tmp['mean_cv_accuracy'],error_y=dict(type='data', array=tmp['std_cv_accuracy'],visible=True),
                             mode='lines+markers', name='CV accuracy'))
    fig.add_trace(go.Scatter(x=tmp['name'], y=tmp["auc"], name='AUC', mode='markers', marker=dict(color='red')))
    fig.add_trace(go.Scatter(x=tmp['name'], y=100.*tmp["tpr"], name='TPR', mode='markers', marker=dict(color='green')))
    fig.update_traces(marker_line_width=2, marker_size=8)
    fig.update_layout(xaxis=dict(title="Model"),yaxis=dict(title="Accuracy %")) 
    fig.update_layout(yaxis_range=[40,100])
    fig.write_image("outputs/models/league_table.png", width=1080, height=720, scale=3)
    fig.show()
    
def plot_confusion_matrix(cm, labels, title):
    "heatmap plot of confusion matrix values"
    data = go.Heatmap(z=cm, y=labels, x=labels, colorscale="aggrnyl")
    annotations = []
    for i, row in enumerate(cm):
        for j, value in enumerate(row):
            annotations.append({"x": labels[i], "y": labels[j], "font": {"color": "white"}, "text": str(value),
                                "xref": "x1", "yref": "y1", "showarrow": False})
    layout = {"title": title,
              "xaxis": {"title": "Predicted value"},
              "yaxis": {"title": "Real value"},
              "annotations": annotations}
    fig = go.Figure(data=data, layout=layout)
    fig.write_image("outputs/models/{}/confusion_matrix.png".format(title), width=1080, height=720, scale=3)
    fig.show()
    
def create_feature_folders(features):
    """create a folder in the output directory for each feature"""
    for feature in features:
        if feature != "target":
            path = os.path.join("outputs", "features", feature)
            if not os.path.exists(path):
                os.makedirs(path)

def create_model_folder(model):
    """create a folder in the output directory for this specific model"""
    path = os.path.join("outputs","models", model)
    if not os.path.exists(path):
        os.makedirs(path)

# - PART ONE: INITIAL POC

## - - - Data import & examination

 The code below simply loads the data, examines example records and summarises each field/column
 
 > *I can see there are a reasonably small number of records but also a correspondingly reasonable number of features. I also note there is a binary target value (i.e. a binary classification problem) along with features that have outliers and missing values.* 

In [43]:
# load
df = pd.read_csv('inputs\heart.csv')

In [44]:
# preview
df.head(10)

Unnamed: 0,age,sex,chest pain type,resting blood pressure,chol,fasting blood sugar,resting ECG,max heart rate,exang,oldpeak,slope,number vessels flourosopy,thal,target
0,63,1,3,145,233,1,0,150.0,0,-99.99,0,0,1,1
1,37,1,2,130,250,0,1,187.0,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178.0,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163.0,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148.0,0,-99.99,1,0,1,1
6,56,0,1,140,294,0,0,153.0,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173.0,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162.0,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174.0,0,1.6,2,0,2,1


In [45]:
# shape
df.shape

(303, 14)

In [46]:
# info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        303 non-null    int64  
 1   sex                        303 non-null    int64  
 2   chest pain type            303 non-null    int64  
 3   resting blood pressure     303 non-null    int64  
 4   chol                       303 non-null    int64  
 5   fasting blood sugar        303 non-null    int64  
 6   resting ECG                303 non-null    int64  
 7   max heart rate             274 non-null    float64
 8   exang                      303 non-null    int64  
 9   oldpeak                    303 non-null    float64
 10  slope                      303 non-null    int64  
 11  number vessels flourosopy  303 non-null    int64  
 12  thal                       303 non-null    int64  
 13  target                     303 non-null    int64  

In [47]:
# unique values
df.nunique()

age                           41
sex                            2
chest pain type                4
resting blood pressure        49
chol                         152
fasting blood sugar            2
resting ECG                    3
max heart rate                88
exang                          2
oldpeak                       39
slope                          3
number vessels flourosopy      5
thal                           4
target                         2
dtype: int64

## - - - Feature exploration

The code below explores each feature in turn programmatically generating a feature dictionary to record the findings.
 
 > *I often find that insight from feature exploration is buried inside sandbox type code and not readily accessible to other colleagues/co-workers and so I always put emphasis on the creation of a feature dictionary. Depending on the sophistiaction of the desired architecture it can be used to define data validation and pre-processing steps. For brevity I have limited it here to simple documentation of feature properties.* 

> *I also put an emphasis on saving high-quality static image of any relevant plots so that they can readily be used in client presentations*

In [48]:
# initialise feature dictionary
feature_lst = []

# create feature folders
create_feature_folders(list(df))

### - - - - - "age"

In [49]:
feature = "age"

# distplot the feature
distplot_continuous_feature(df, feature)

# distplot the feature vs the target
distplot_continuous_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "continuous",
                    "scaling": "standardise",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "sex"

In [50]:
feature = "sex"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "Sex of the patient",
                    "data_type": "binary",
                    "scaling": None,
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round( df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "chest pain type"

In [51]:
feature = "chest pain type"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "Suspected ranked spectrum of chest pain",
                    "data_type": "categorical, suspected ordinal",
                    "scaling": "minmax",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "resting blood pressure"

In [52]:
feature = "resting blood pressure"

# distplot the feature
distplot_continuous_feature(df, feature)

# distplot the feature vs the target
distplot_continuous_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "Resting blood pressure value",
                    "data_type": "continuous",
                    "scaling": "standard",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "chol"

In [53]:
feature = "chol"

# distplot the feature
distplot_continuous_feature(df, feature)

# distplot the feature vs the target
distplot_continuous_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "Suspected, cholesterol",
                    "data_type": "continuous",
                    "scaling": "standard",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "fasting blood sugar"

In [54]:
feature = "fasting blood sugar"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "binary",
                    "scaling": None,
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "resting ECG"

In [55]:
feature = "resting ECG"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "categorical, suspected ordinal",
                    "scaling": None,
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "max heart rate"

In [56]:
feature = "max heart rate"

# distplot the feature
distplot_continuous_feature(df, feature)

# distplot the feature vs the target
distplot_continuous_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "A",
                    "description": "",
                    "data_type": "categorical, suspected ordinal",
                    "scaling": "standard",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": "median_value",
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "exang"

In [57]:
feature = "exang"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "binary",
                    "scaling": None,
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "oldpeak"

In [58]:
feature = "oldpeak"

# distplot the feature
distplot_continuous_feature(df, feature)

# distplot the feature vs the target
distplot_continuous_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "A",
                    "description": "",
                    "data_type": "continuous",
                    "scaling": "standard",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": "median_value",
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "slope"

In [59]:
feature = "slope"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "categorical",
                    "scaling": "minmax",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "number vessels flourosopy"

In [60]:
feature = "number vessels flourosopy"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "categorical, suspected ordinal",
                    "scaling": "minmax",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "thal"

In [61]:
feature = "thal"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "categorical, suspected ordinal",
                    "scaling": "minmax",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

## - - - Feature dictionary

The code below allows us to export/publish the feature dictionary as definitive output from our feature exploration.
 
 > *In the feature dictionary I have included the following fields*
 
 > - feature: *the name of the relevant feature*
 > - veracity: *a RAG indication of usability & reliability of the feature, not overly relevant here*
 > - description: *a description of the field, not overly relevant here at the fields are submitted as anonymous*
 > - data_type: *inferred data type of the relevant feature*
 > - scaling: *recommended scaling consideration*
 > - nulls: *count of null values*
 > - outliers-zscore: *count of values outside z-score threshold*
 > - outliers_iqr: *count of values outside 1.5 x interquartile range*
 > - imputation: *recommended imputation consideration*
 > - target_correlation_pearson: *correlation coefficient between feature and target*
 > - comments: *additional notes on the relevant feature*
 
 > *From the feature exploration the data looks reasonable. There are missing-value issues with "max heart rate" and erroneous values with "oldpeak" which need consideration. It is also clear that there are strong correlations between the target and both "chest pain type" and "max heart rate".*

In [62]:
feature_dictionary = pd.DataFrame.from_dict(feature_lst)
feature_dictionary.head(15)

Unnamed: 0,feature,veracity,description,data_type,scaling,nulls,outliers_zscore,outliers_iqr,imputation,target_correlation_pearson,comments
0,age,G,,continuous,standardise,0,0,0,,-0.225,
1,sex,G,Sex of the patient,binary,,0,0,0,,-0.281,
2,chest pain type,G,Suspected ranked spectrum of chest pain,"categorical, suspected ordinal",minmax,0,0,0,,0.434,
3,resting blood pressure,G,Resting blood pressure value,continuous,standard,0,2,9,,-0.145,
4,chol,G,"Suspected, cholesterol",continuous,standard,0,4,5,,-0.085,
5,fasting blood sugar,G,,binary,,0,0,45,,-0.028,
6,resting ECG,G,,"categorical, suspected ordinal",,0,0,0,,0.137,
7,max heart rate,A,,"categorical, suspected ordinal",standard,29,0,3,median_value,0.404,
8,exang,G,,binary,,0,0,0,,-0.437,
9,oldpeak,A,,continuous,standard,0,0,28,median_value,-0.079,


## - - - Model training

In the code below we split the data into training and test set and then pre-process it before training a handful of different models to give a broad indication of performance.

For each trained model we generate an entry in the model dictionary including both the properties of the model and the relevant accuracy measures. We also package the model and its preprocessing steps into the dictionary, then assigined the entry a unique ID we then use this to deploy the model into the pipeline. 

> *In the first instance the model dictionary serves an excellent purpose as a "league table" publishing the performance of the different models to a wider audience. However, more than that, the model dictionary is fundamental to the pipeline architecture. It allows us to isolate and protect the production pipeline from the R&D activity and simply redploy newly trained models using just the model's unique identifier.*

> *I have also deliberately included some terrible models, to baseline model performance against.*

### - - - - - Initialisation

For simplicity in the first instance, we have used minmax scaling across the dataset rather than consider scaling features indepently. Also I have applied to median value imputation across "max heart rate" and "oldpeak".

In [63]:
# create testing and training sets
x = df.loc[:, df.columns != 'target']
y = df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=2)

In [64]:
# missing values
print(x_train['oldpeak'][x_train['oldpeak']>=0.].median())
print(x_train['max heart rate'].median())

def missing_values(df):
    """return the dataframe with erroneous/missing values processed as required"""
    
    # oldpeak
    df['oldpeak'] = df['oldpeak'].replace(-99.99, np.nan)
    df['oldpeak'] = df['oldpeak'].fillna(0.8)
    
    # max heart rate
    df['max heart rate'] = df['max heart rate'].fillna(152.)
    
    return df

x_train = missing_values(x_train)
x_test = missing_values(x_test)

0.8
152.0


In [65]:
# scaling
sc = MinMaxScaler()
features = list(x_train)
x_train[features] = sc.fit_transform(x_train[features])
x_test[features] = sc.transform(x_test[features])

In [66]:
# initialise model list
model_lst = []

### - - - - - Model: "random baseline"
 > *This is a baseline model, used to compare performance against*

In [67]:
# model creation and fit
dist = Counter(y_train)
level = dist[0]/len(y_train)
pred = (np.random.rand(len(x_test))>=level).astype(int)
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": str(uuid.uuid4()),
             "name": "Random baseline",
             "model_type": "Target probability",
             "features": np.nan,
             "accuracy": np.round(100.*accuracy_score(y_test, pred),2)}
model_lst.append(model_dct)

              precision    recall  f1-score   support

           0       0.44      0.51      0.47        41
           1       0.53      0.46      0.49        50

    accuracy                           0.48        91
   macro avg       0.49      0.49      0.48        91
weighted avg       0.49      0.48      0.48        91



### - - - - - Model: "key feature - sex"

> *This is another baseline model, used to compare performance against*

In [68]:
# model creation and fit
features = ["sex"]
clf = GaussianNB()
clf.fit(x_train[features], y_train)
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": str(uuid.uuid4()),
             "name": "Key feature - sex",
             "model_type": "Naive Bayes",
             "model": pickle.dumps(clf),
             "features": features,
             "accuracy": np.round(100.*accuracy_score(y_test, pred),2)}
model_lst.append(model_dct)

              precision    recall  f1-score   support

           0       0.44      0.51      0.47        41
           1       0.53      0.46      0.49        50

    accuracy                           0.48        91
   macro avg       0.49      0.49      0.48        91
weighted avg       0.49      0.48      0.48        91



### - - - - - Model: "key feature - number vessels flourosopy"

> *This is another baseline model, used to compare performance against*

In [69]:
# model creation and fit
features = ["number vessels flourosopy"]
clf = GaussianNB()
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": str(uuid.uuid4()),
             "name": "Key feature - number vessels flourosopy",
             "model_type": "Naive Bayes",
             "features": features,
             "accuracy": np.round(100.*accuracy_score(y_test, pred),2)}
model_lst.append(model_dct)

              precision    recall  f1-score   support

           0       0.82      0.34      0.48        41
           1       0.64      0.94      0.76        50

    accuracy                           0.67        91
   macro avg       0.73      0.64      0.62        91
weighted avg       0.72      0.67      0.63        91



### - - - - - Model: "key feature - chest pain type"
> *This is another baseline model, used to compare performance against*

In [70]:
# model creation and fit
features = ["chest pain type"]
clf = GaussianNB()
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": str(uuid.uuid4()),
             "name": "Key feature - chest pain type",
             "model_type": "Naive Bayes",
             "features": features,
             "accuracy": np.round(100.*accuracy_score(y_test, pred),2)}
model_lst.append(model_dct)

              precision    recall  f1-score   support

           0       0.77      0.73      0.75        41
           1       0.79      0.82      0.80        50

    accuracy                           0.78        91
   macro avg       0.78      0.78      0.78        91
weighted avg       0.78      0.78      0.78        91



### - - - - - Model: "random forest"

In [71]:
# model creation and fit
features = list(x_train)
clf = RandomForestClassifier(max_depth=8, n_estimators=16, max_features=3)
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
conf_matrix = confusion_matrix(y_test, pred)
cv_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": str(uuid.uuid4()),
             "name": "Random forest on all features",
             "model_type": "Random forest",
             "model": pickle.dumps(clf),
             "missing_values": missing_values,
             "scaling": pickle.dumps(sc),
             "features": features,
             "auc": np.round(100.*roc_auc_score(y_test, pred),2),
             "tpr": conf_matrix[1][1]/sum(y_test),
             "accuracy": np.round(100.*accuracy_score(y_test, pred),2),
             "mean_cv_accuracy": np.round(100.*cv_score.mean(),2),
             "std_cv_accuracy": np.round(100.*cv_score.std(),2)}
model_lst.append(model_dct)

# model plots
create_model_folder(model_dct["name"])
plot_confusion_matrix(conf_matrix, ['_0', '_1'], model_dct["name"])

              precision    recall  f1-score   support

           0       0.92      0.80      0.86        41
           1       0.85      0.94      0.90        50

    accuracy                           0.88        91
   macro avg       0.89      0.87      0.88        91
weighted avg       0.88      0.88      0.88        91



### - - - - - "logistic regression"

In [72]:
# model creation and fit
features = list(x_train)
clf = LogisticRegression()
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
conf_matrix = confusion_matrix(y_test, pred)
cv_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": str(uuid.uuid4()),
             "name": "Logistic regression on all features",
             "model_type": "Logistic regression",
             "model": pickle.dumps(clf),
             "missing_values": missing_values,
             "scaling": pickle.dumps(sc),
             "features": features,
             "auc": np.round(100.*roc_auc_score(y_test, pred), 2),
             "tpr": conf_matrix[1][1]/sum(y_test),
             "accuracy": np.round(100.*accuracy_score(y_test, pred), 2),
             "mean_cv_accuracy": np.round(100.*cv_score.mean(), 2),
             "std_cv_accuracy": np.round(100.*cv_score.std(), 2)}
model_lst.append(model_dct)

# model plots
create_model_folder(model_dct["name"])
plot_confusion_matrix(conf_matrix, ['_0', '_1'], model_dct["name"])

              precision    recall  f1-score   support

           0       0.89      0.76      0.82        41
           1       0.82      0.92      0.87        50

    accuracy                           0.85        91
   macro avg       0.85      0.84      0.84        91
weighted avg       0.85      0.85      0.84        91



### - - - - - "k-Nearest neighbours"

In [73]:
# model creation and fit
features = list(x_train)
clf = KNeighborsClassifier(n_neighbors=12)
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
conf_matrix = confusion_matrix(y_test, pred)
cv_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": str(uuid.uuid4()),
             "name": "k-Nearest neighbours on all features",
             "model_type": "kNN",
             "model": pickle.dumps(clf),
             "missing_values": missing_values,
             "scaling": pickle.dumps(sc),
             "features": features,
             "auc": np.round(100.*roc_auc_score(y_test, pred), 2),
             "tpr": conf_matrix[1][1]/sum(y_test),
             "accuracy": np.round(100.*accuracy_score(y_test, pred), 2),
             "mean_cv_accuracy": np.round(100.*cv_score.mean(), 2),
             "std_cv_accuracy": np.round(100.*cv_score.std(), 2)}
model_lst.append(model_dct)

# model plots
create_model_folder(model_dct["name"])
plot_confusion_matrix(conf_matrix, ['_0', '_1'], model_dct["name"])

              precision    recall  f1-score   support

           0       0.79      0.76      0.77        41
           1       0.81      0.84      0.82        50

    accuracy                           0.80        91
   macro avg       0.80      0.80      0.80        91
weighted avg       0.80      0.80      0.80        91



### - - - - - "Support Vector Classifier"

In [74]:
# model creation and fit
features = list(x_train)
clf = SVC(kernel='rbf', C=3)
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
conf_matrix = confusion_matrix(y_test, pred)
cv_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": str(uuid.uuid4()),
             "name": "SVC on all features",
             "model_type": "SVC",
             "model": pickle.dumps(clf),
             "missing_values": missing_values,
             "scaling": pickle.dumps(sc),
             "features": features,
             "auc": np.round(100.*roc_auc_score(y_test, pred), 2),
             "tpr": conf_matrix[1][1]/sum(y_test),
             "accuracy": np.round(100.*accuracy_score(y_test, pred), 2),
             "mean_cv_accuracy": np.round(100.*cv_score.mean(), 2),
             "std_cv_accuracy": np.round(100.*cv_score.std(), 2)}
model_lst.append(model_dct)

# model plots
create_model_folder(model_dct["name"])
plot_confusion_matrix(conf_matrix, ['_0', '_1'], model_dct["name"])

              precision    recall  f1-score   support

           0       0.85      0.85      0.85        41
           1       0.88      0.88      0.88        50

    accuracy                           0.87        91
   macro avg       0.87      0.87      0.87        91
weighted avg       0.87      0.87      0.87        91



### - - - - - "XGB"

In [75]:
# model creation and fit
features = list(x_train)
clf = XGBClassifier(learning_rate=0.01, n_estimators=5)
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
conf_matrix = confusion_matrix(y_test, pred)
cv_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": str(uuid.uuid4()),
             "name": "XGB on all features",
             "model_type": "XGB",
             "model": pickle.dumps(clf),
             "missing_values": missing_values,
             "scaling": pickle.dumps(sc),
             "features": features,
             "auc": np.round(100.*roc_auc_score(y_test, pred), 2),
             "tpr": conf_matrix[1][1]/sum(y_test),
             "accuracy": np.round(100.*accuracy_score(y_test, pred), 2),
             "mean_cv_accuracy": np.round(100.*cv_score.mean(), 2),
             "std_cv_accuracy": np.round(100.*cv_score.std(), 2)}
model_lst.append(model_dct)

# model plots
create_model_folder(model_dct["name"])
plot_confusion_matrix(conf_matrix, ['_0', '_1'], model_dct["name"])

              precision    recall  f1-score   support

           0       0.85      0.83      0.84        41
           1       0.86      0.88      0.87        50

    accuracy                           0.86        91
   macro avg       0.86      0.85      0.86        91
weighted avg       0.86      0.86      0.86        91



### - - - - - "Model stacking"

In [76]:
# model creation and fit
features = list(x_train)

clf_svc = SVC(kernel='rbf', C=3)
clf_rf = RandomForestClassifier(max_depth=8, n_estimators=16, max_features=3)
clf_xgb = XGBClassifier(learning_rate=0.01, n_estimators=20)
clf_knn = KNeighborsClassifier(n_neighbors=12)
clf_lr = LogisticRegression(max_iter=1000)
clf=StackingCVClassifier(classifiers=[clf_svc, clf_knn, clf_xgb], meta_classifier=clf_svc, random_state=42)
clf.fit(x_train,y_train)

pred = clf.predict(x_test[features])
conf_matrix = confusion_matrix(y_test, pred)
cv_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": str(uuid.uuid4()),
             "name": "Stacking classifiers",
             "model_type": "Stacking",
             "model": pickle.dumps(clf),
             "missing_values": missing_values,
             "scaling": pickle.dumps(sc),
             "features": features,
             "auc": np.round(100.*roc_auc_score(y_test, pred), 2),
             "tpr": conf_matrix[1][1]/sum(y_test),
             "accuracy": np.round(100.*accuracy_score(y_test, pred), 2),
             "mean_cv_accuracy": np.round(100.*cv_score.mean(), 2),
             "std_cv_accuracy": np.round(100.*cv_score.std(), 2)}
model_lst.append(model_dct)

# model plots
create_model_folder(model_dct["name"])
plot_confusion_matrix(conf_matrix, ['_0', '_1'], model_dct["name"])

              precision    recall  f1-score   support

           0       0.94      0.71      0.81        41
           1       0.80      0.96      0.87        50

    accuracy                           0.85        91
   macro avg       0.87      0.83      0.84        91
weighted avg       0.86      0.85      0.84        91



# - - - Model dictionary a.k.a "League Table"

From the data dictionary below we can see that progressive improvement from simple baseline models to higher performing ML models. In the plot below we have considered the AUC score, the true-positive-rate and the average cross-validation score to assess model performance. Interestingly there is a strong trade-off between model stability and peak-performance, which suggest room for improvement some of the models. 

In [77]:
model_dictionary = pd.DataFrame.from_dict(model_lst)
model_dictionary = model_dictionary.set_index('model_id')
lst = ['name', 'model_type',
       'accuracy', 'auc', 'mean_cv_accuracy', 'std_cv_accuracy', 'tpr',
       'model', 'features', 'missing_values', 'scaling']
model_dictionary = model_dictionary[lst]
model_dictionary.head(11)

Unnamed: 0_level_0,name,model_type,accuracy,auc,mean_cv_accuracy,std_cv_accuracy,tpr,model,features,missing_values,scaling
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
4a7aee56-fddc-475a-9520-f32a10fe6134,Random baseline,Target probability,48.35,,,,,,,,
1bc4f06b-b61f-4d14-aec3-809dc9a08da6,Key feature - sex,Naive Bayes,48.35,,,,,b'\x80\x03csklearn.naive_bayes\nGaussianNB\nq\...,[sex],,
0da5f1df-c1f3-4d67-8b2e-9382d3c1d3fb,Key feature - number vessels flourosopy,Naive Bayes,67.03,,,,,,[number vessels flourosopy],,
d6de4945-93fe-4247-9d1a-f36438a36107,Key feature - chest pain type,Naive Bayes,78.02,,,,,,[chest pain type],,
202e3a00-164c-420e-9af4-97de2c19ad87,Random forest on all features,Random forest,87.91,87.24,76.9,6.79,0.94,b'\x80\x03csklearn.ensemble._forest\nRandomFor...,"[age, sex, chest pain type, resting blood pres...",<function missing_values at 0x000001E9A20042F0>,b'\x80\x03csklearn.preprocessing._data\nMinMax...
6495729a-3324-471e-abfe-ff3a9ca538c2,Logistic regression on all features,Logistic regression,84.62,83.8,80.15,6.03,0.92,b'\x80\x03csklearn.linear_model._logistic\nLog...,"[age, sex, chest pain type, resting blood pres...",<function missing_values at 0x000001E9A20042F0>,b'\x80\x03csklearn.preprocessing._data\nMinMax...
4f549b80-a780-4446-b715-83ef609d376c,k-Nearest neighbours on all features,kNN,80.22,79.8,79.22,9.29,0.84,b'\x80\x03csklearn.neighbors._classification\n...,"[age, sex, chest pain type, resting blood pres...",<function missing_values at 0x000001E9A20042F0>,b'\x80\x03csklearn.preprocessing._data\nMinMax...
e879c698-4062-4135-b263-9d2701b22099,SVC on all features,SVC,86.81,86.68,78.31,7.05,0.88,b'\x80\x03csklearn.svm._classes\nSVC\nq\x00)\x...,"[age, sex, chest pain type, resting blood pres...",<function missing_values at 0x000001E9A20042F0>,b'\x80\x03csklearn.preprocessing._data\nMinMax...
9374962f-1c3b-419a-ae17-fa00449f8dc7,XGB on all features,XGB,85.71,85.46,77.45,7.5,0.88,b'\x80\x03cxgboost.sklearn\nXGBClassifier\nq\x...,"[age, sex, chest pain type, resting blood pres...",<function missing_values at 0x000001E9A20042F0>,b'\x80\x03csklearn.preprocessing._data\nMinMax...
e17245bc-f8db-4286-9a22-4907ca70c238,Stacking classifiers,Stacking,84.62,83.37,78.77,9.59,0.96,b'\x80\x03cmlxtend.classifier.stacking_cv_clas...,"[age, sex, chest pain type, resting blood pres...",<function missing_values at 0x000001E9A20042F0>,b'\x80\x03csklearn.preprocessing._data\nMinMax...


In [78]:
plot_league_table(model_dictionary)

# - Part Two: Pipeline

In the simple code below we generate simple toy production data and pass this through our productionised pipeline, in which, the model is defined/deployed by its unique identifier. 

In [79]:
# generate single production data record
prod_data = {'age': 63.00,
             'sex': 1.00,
             'chest pain type': 3.00,
             'resting blood pressure': 145.00,
             'chol': 233.00,
             'fasting blood sugar': 1.00,
             'resting ECG': 0.00,
             'max heart rate': 150.00,
             'exang': 0.00,
             'oldpeak': -99.99,
             'slope': 0.00,
             'number vessels flourosopy': 0.00,
             'thal': 1.00}

In [81]:
# --------------
# INITIALISATION
# --------------

# model version
model_id = "6495729a-3324-471e-abfe-ff3a9ca538c2"

# model functions
features = model_dictionary.loc[model_id, "features"]
missing_values = model_dictionary.loc[model_id, "missing_values"]
scaler = pickle.loads(model_dictionary.loc[model_id, "scaling"])
classifier = pickle.loads(model_dictionary.loc[model_id, "model"])

# --------------------
# LOAD PRODUCTION DATA
# -------------------- 
df = pd.DataFrame(data=prod_data, index=[0])

# TODO: apply input validation

# --------------
# PRE-PROCESSING
# --------------

# missing values
df = missing_values(df)

# scaling
df[features] = scaler.transform(df[features])

# TODO: apply cleaning validation

# ---------------
# CLASSIFICATTION
# ---------------
pred = classifier.predict(df[features])

# TODO: apply output validation

# ------
# OUTPUT
# ------
print(pred)

[1]
