In [436]:
import os
import pickle
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from collections import Counter
import uuid
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as ltb
from mlxtend.classifier import StackingCVClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, classification_report, roc_auc_score, f1_score
from sklearn.model_selection import cross_val_score
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import zscore

import warnings
warnings.filterwarnings("ignore")

# Contents

- Util functions
- Part One: Intial POC
    - Data import & examination
    - Feature exploration
    - Feature dictionary
    - Model training
    - League table
- Part Two: Pipeline

# - Util functions

The code below would normally be in a util library and imported, however I have copied the relevant functions here so that the proper notebook content below runs from top to bottom.

 > *I would emphasise the benefit of reusuable utils like this and encourage team members to develop their own personalised 'cheat sheets' with helpful functions like this.* 

In [437]:
def zscore_outliers_continuous_feature(df, feature, threshold):
    """return the number of records outside the zscore threshold"""
    tmp = df.copy()
    tmp["zscore"] = (tmp[feature] - tmp[feature].mean())/tmp[feature].std(ddof=0)
    return len(tmp[tmp["zscore"] > threshold])

def iqr_outliers_continuous_feature(df, feature):
    """return the number of records outside 1.5 x the interquartile range"""
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    return len(df[(df[feature] < (Q1-1.5*IQR)) | (df[feature] > (Q3+1.5*IQR))])

def distplot_continuous_feature(df, feature):
    """plotly histogram of continuous feature from dataframe"""
    fig = px.histogram(df, x=feature, marginal="box", barmode="overlay")
    fig.write_image("outputs/features/{}/displot.png".format(feature), width=1080, height=720, scale=3)
    fig.show()
    
def distplot_continuous_feature_vs_target(df, feature):
    """plotly histogram of continuous feature from dataframe verses target variable"""
    fig = px.histogram(df, x=feature, color="target", marginal="box", barmode="overlay")
    fig.write_image("outputs/features/{}/displot_target.png".format(feature), width=1080, height=720, scale=3)
    fig.show()  
    
def barplot_categorical_feature(df, feature):
    """barplot of categorial feature from dataframe"""
    tmp = df.copy()
    tmp[feature] = tmp[feature].astype(str)
    tmp['target'] = tmp['target'].astype(str)
    fig = px.bar(tmp, x=feature)
    fig.write_image("outputs/features/{}/barplot.png".format(feature), width=1080, height=720, scale=3)
    fig.show()
    
def barplot_categorical_feature_vs_target(df, feature):
    """barplot of categorial feature from dataframe verse target variable"""
    tmp = df.copy()
    tmp[feature] = tmp[feature].astype(str)
    tmp['target'] = tmp['target'].astype(str)
    fig = px.bar(tmp, x=feature, color="target")
    fig.write_image("outputs/features/{}/barplot_target.png".format(feature), width=1080, height=720, scale=3)
    fig.show()
    
def plot_league_table(df):
    "scatter plot of ranked league table accuracy"
    tmp = df.copy()
    tmp.loc[tmp['mean_cv_accuracy'].isnull(), 'mean_cv_accuracy'] = tmp['accuracy']
    tmp = tmp.sort_values(['mean_cv_accuracy'])
    fig = px.scatter(tmp, x="name", y="mean_cv_accuracy",
                     hover_data=["name", "model_type", "mean_cv_accuracy", "std_cv_accuracy"])
    fig.update_layout(yaxis_range=[40,100])
    fig.update_traces(marker=dict(size=12, line=dict(width=2, color='DarkSlateGrey')), selector=dict(mode='markers'))
    fig.write_image("outputs/models/league_table.png", width=1080, height=720, scale=3)
    fig.show()
    
def plot_confusion_matrix(cm, labels, title):
    "heatmap plot of confusion matrix values"
    data = go.Heatmap(z=cm, y=labels, x=labels, colorscale="aggrnyl")
    annotations = []
    for i, row in enumerate(cm):
        for j, value in enumerate(row):
            annotations.append({"x": labels[i], "y": labels[j], "font": {"color": "white"}, "text": str(value),
                                "xref": "x1", "yref": "y1", "showarrow": False})
    layout = {"title": title,
              "xaxis": {"title": "Predicted value"},
              "yaxis": {"title": "Real value"},
              "annotations": annotations}
    fig = go.Figure(data=data, layout=layout)
    fig.write_image("outputs/models/{}/confusion_matrix.png".format(title), width=1080, height=720, scale=3)
    fig.show()
    
def create_feature_folders(features):
    """create a folder in the output directory for each feature"""
    for feature in features:
        if feature != "target":
            path = os.path.join("outputs", "features", feature)
            if not os.path.exists(path):
                os.makedirs(path)

# - PART ONE: INITIAL POC

## - - - Data import & examination

 The code below simply loads the data, examines example records and summarises each field/column
 
 > *I can see there are a reasonably small number of records but also a correspondingly reasonable number of features. I also note there is a binary target value (i.e. a binary classification problem) along with features that have outliers and missing values.* 

In [438]:
# load
df = pd.read_csv('heart.csv')

In [439]:
# preview
df.head(10)

Unnamed: 0,age,sex,chest pain type,resting blood pressure,chol,fasting blood sugar,resting ECG,max heart rate,exang,oldpeak,slope,number vessels flourosopy,thal,target
0,63,1,3,145,233,1,0,150.0,0,-99.99,0,0,1,1
1,37,1,2,130,250,0,1,187.0,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178.0,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163.0,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148.0,0,-99.99,1,0,1,1
6,56,0,1,140,294,0,0,153.0,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173.0,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162.0,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174.0,0,1.6,2,0,2,1


In [440]:
# shape
df.shape

(303, 14)

In [441]:
# info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        303 non-null    int64  
 1   sex                        303 non-null    int64  
 2   chest pain type            303 non-null    int64  
 3   resting blood pressure     303 non-null    int64  
 4   chol                       303 non-null    int64  
 5   fasting blood sugar        303 non-null    int64  
 6   resting ECG                303 non-null    int64  
 7   max heart rate             274 non-null    float64
 8   exang                      303 non-null    int64  
 9   oldpeak                    303 non-null    float64
 10  slope                      303 non-null    int64  
 11  number vessels flourosopy  303 non-null    int64  
 12  thal                       303 non-null    int64  
 13  target                     303 non-null    int64  

In [442]:
# unique values
df.nunique()

age                           41
sex                            2
chest pain type                4
resting blood pressure        49
chol                         152
fasting blood sugar            2
resting ECG                    3
max heart rate                88
exang                          2
oldpeak                       39
slope                          3
number vessels flourosopy      5
thal                           4
target                         2
dtype: int64

## - - - Feature exploration

The code below explores each feature in turn programmatically generating a feature dictionary to record the findings.
 
 > *I often find that insight from feature exploration is buried inside sandbox type code and not readily accessible to other colleagues/co-workers and so I always put emphasis on the creation of a feature dictionary.* 

> *I also put an emphasis on saving high-quality static image of any relevant plots so that they can readily be used in client presentations*

In [443]:
# initialise feature dictionary
feature_lst = []

# create feature folders
create_feature_folders(list(df))

### - - - - - "age"

In [444]:
feature = "age"

# distplot the feature
distplot_continuous_feature(df, feature)

# distplot the feature vs the target
distplot_continuous_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "continuous",
                    "scaling": "standardise",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "sex"

In [445]:
feature = "sex"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "Sex of the patient",
                    "data_type": "binary",
                    "scaling": None,
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round( df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "chest pain type"

In [446]:
feature = "chest pain type"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "Suspected ranked spectrum of chest pain",
                    "data_type": "categorical, suspected ordinal",
                    "scaling": "minmax",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "resting blood pressure"

In [447]:
feature = "resting blood pressure"

# distplot the feature
distplot_continuous_feature(df, feature)

# distplot the feature vs the target
distplot_continuous_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "Resting blood pressure value",
                    "data_type": "continuous",
                    "scaling": "standard",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "chol"

In [448]:
feature = "chol"

# distplot the feature
distplot_continuous_feature(df, feature)

# distplot the feature vs the target
distplot_continuous_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "Suspected, cholesterol",
                    "data_type": "continuous",
                    "scaling": "standard",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "fasting blood sugar"

In [449]:
feature = "fasting blood sugar"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "binary",
                    "scaling": None,
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "resting ECG"

In [450]:
feature = "resting ECG"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "categorical, suspected ordinal",
                    "scaling": None,
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "max heart rate"

In [451]:
feature = "max heart rate"

# distplot the feature
distplot_continuous_feature(df, feature)

# distplot the feature vs the target
distplot_continuous_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "A",
                    "description": "",
                    "data_type": "categorical, suspected ordinal",
                    "scaling": "standard",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": "median_value",
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "exang"

In [452]:
feature = "exang"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "binary",
                    "scaling": None,
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "oldpeak"

In [453]:
feature = "oldpeak"

# distplot the feature
distplot_continuous_feature(df, feature)

# distplot the feature vs the target
distplot_continuous_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "A",
                    "description": "",
                    "data_type": "continuous",
                    "scaling": "standard",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": "median_value",
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "slope"

In [454]:
feature = "slope"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "categorical",
                    "scaling": "minmax",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "number vessels flourosopy"

In [455]:
feature = "number vessels flourosopy"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "categorical, suspected ordinal",
                    "scaling": "minmax",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

### - - - - - "thal"

In [456]:
feature = "thal"

# barplot the feature
barplot_categorical_feature(df, feature)

# barplot the feature vs the target
barplot_categorical_feature_vs_target(df, feature)

# update feature dictionary
feature_lst.append({"feature": feature,
                    "veracity": "G",
                    "description": "",
                    "data_type": "categorical, suspected ordinal",
                    "scaling": "minmax",
                    "nulls": len(df[df[feature].isnull()]),
                    "outliers_zscore": zscore_outliers_continuous_feature(df, feature, 3.0),
                    "outliers_iqr": iqr_outliers_continuous_feature(df, feature),
                    "imputation": None,
                    "target_correlation_pearson": np.round(df[feature].corr(df["target"], method="pearson"), 3),
                    "comments": ""})

## - - - Feature dictionary

The code below allows us to export/publish the feature dictionary as definitive output from our feature exploration.
 

In [457]:
feature_dictionary = pd.DataFrame.from_dict(feature_lst)
feature_dictionary.head(15)

Unnamed: 0,feature,veracity,description,data_type,scaling,nulls,outliers_zscore,outliers_iqr,imputation,target_correlation_pearson,comments
0,age,G,,continuous,standardise,0,0,0,,-0.225,
1,sex,G,Sex of the patient,binary,,0,0,0,,-0.281,
2,chest pain type,G,Suspected ranked spectrum of chest pain,"categorical, suspected ordinal",minmax,0,0,0,,0.434,
3,resting blood pressure,G,Resting blood pressure value,continuous,standard,0,2,9,,-0.145,
4,chol,G,"Suspected, cholesterol",continuous,standard,0,4,5,,-0.085,
5,fasting blood sugar,G,,binary,,0,0,45,,-0.028,
6,resting ECG,G,,"categorical, suspected ordinal",,0,0,0,,0.137,
7,max heart rate,A,,"categorical, suspected ordinal",standard,29,0,3,median_value,0.404,
8,exang,G,,binary,,0,0,0,,-0.437,
9,oldpeak,A,,continuous,standard,0,0,28,median_value,-0.079,


## - - - Model training

### - - - - - Initialisation

In [477]:
# create testing and training sets
x = df.loc[:, df.columns != 'target']
y = df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=0)

In [478]:
# missing values

def missing_values(df):
    """return the dataframe with erroneous/missing values processed as required"""
    df['oldpeak'] = df['oldpeak'].replace(-99.99, np.nan)
    df = df.fillna(df.median())
    return df

df = missing_values(df)

In [479]:
# scaling
sc = MinMaxScaler()
features = list(x_train)
x_train[features] = sc.fit_transform(x_train[features])
x_test[features] = sc.transform(x_test[features])

In [480]:
# initialise model list
model_lst = []

### - - - - - Model: "random baseline"

In [490]:
# model creation and fit
dist = Counter(y_train)
level = dist[0]/len(y_train)
pred = (np.random.rand(len(x_test))>=level).astype(int)
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": uuid.uuid4(),
             "name": "Random baseline",
             "model_type": "Target probability",
             "features": np.nan,
             "accuracy": np.round(100.*accuracy_score(y_test, pred),2)}
model_lst.append(model_dct)

              precision    recall  f1-score   support

           0       0.45      0.45      0.45        33
           1       0.58      0.58      0.58        43

    accuracy                           0.53        76
   macro avg       0.52      0.52      0.52        76
weighted avg       0.53      0.53      0.53        76



### - - - - - Model: "key feature - sex"

In [491]:
# model creation and fit
features = ["sex"]
clf = GaussianNB()
clf.fit(x_train[features], y_train)
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": uuid.uuid4(),
             "name": "Key feature - sex",
             "model_type": "Naive Bayes",
             "model": pickle.dumps(clf),
             "features": features,
             "accuracy": np.round(100.*accuracy_score(y_test, pred),2)}
model_lst.append(model_dct)

              precision    recall  f1-score   support

           0       0.45      0.45      0.45        33
           1       0.58      0.58      0.58        43

    accuracy                           0.53        76
   macro avg       0.52      0.52      0.52        76
weighted avg       0.53      0.53      0.53        76



### - - - - - Model: "key feature - number vessels flourosopy"

In [492]:
# model creation and fit
features = ["number vessels flourosopy"]
clf = GaussianNB()
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": uuid.uuid4(),
             "name": "Key feature - number vessels flourosopy",
             "model_type": "Naive Bayes",
             "features": features,
             "accuracy": np.round(100.*accuracy_score(y_test, pred),2)}
model_lst.append(model_dct)

              precision    recall  f1-score   support

           0       0.71      0.30      0.43        33
           1       0.63      0.91      0.74        43

    accuracy                           0.64        76
   macro avg       0.67      0.61      0.58        76
weighted avg       0.67      0.64      0.61        76



### - - - - - Model: "key feature - chest pain type"

In [493]:
# model creation and fit
features = ["chest pain type"]
clf = GaussianNB()
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": uuid.uuid4(),
             "name": "Key feature - chest pain type",
             "model_type": "Naive Bayes",
             "features": features,
             "accuracy": np.round(100.*accuracy_score(y_test, pred),2)}
model_lst.append(model_dct)

              precision    recall  f1-score   support

           0       0.73      0.73      0.73        33
           1       0.79      0.79      0.79        43

    accuracy                           0.76        76
   macro avg       0.76      0.76      0.76        76
weighted avg       0.76      0.76      0.76        76



### - - - - - Model: "random forest"

In [494]:
# model creation and fit
features = list(x_train)
clf = RandomForestClassifier(max_depth=8, n_estimators=16, max_features=3)
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
cv_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)
print(classification_report(y_test, pred))

# add to the model list
model_dct = {"model_id": uuid.uuid4(),
             "name": "Random forest on all features",
             "model_type": "Random forest",
             "features": features,
             "accuracy": np.round(100.*accuracy_score(y_test, pred),2),
             "mean_cv_accuracy": np.round(100.*cv_score.mean(),2),
             "std_cv_accuracy": np.round(100.*cv_score.std(),2)}
model_lst.append(model_dct)

# create model folder
path = os.path.join("outputs","models", model_dct["name"])
if not os.path.exists(path):
    os.makedirs(path)
            
# plot confusion matrix
plot_confusion_matrix(conf_matrix, ['_0', '_1'], model_dct["name"])

              precision    recall  f1-score   support

           0       0.83      0.76      0.79        33
           1       0.83      0.88      0.85        43

    accuracy                           0.83        76
   macro avg       0.83      0.82      0.82        76
weighted avg       0.83      0.83      0.83        76



### 03.05 --- "logistic regression"

In [468]:
# model creation and fit
features = list(x_train)

clf = LogisticRegression()
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
acc_score = accuracy_score(y_test, pred)
auc_score = roc_auc_score(y_test, pred)
cv_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)
print(acc_score)
print(auc_score)
print(cv_score.mean())
print(cv_score.std())

model_dct = {"model_id": uuid.uuid4(),
             "name": "Logistic regression on all features",
             "model_type": "Logistic regression",
             "features": features,
             "accuracy": np.round(100.*acc_score,2),
             "mean_cv_accuracy": np.round(100.*cv_score.mean(),2),
             "std_cv_accuracy": np.round(100.*cv_score.std(),2)}
league_table.append(model_dct)

0.8026315789473685
0.7903453136011276
0.8104743083003951
0.06624209951220952


### 03.06 --- "k-Nearest neighbours"

In [469]:
# model creation and fit
features = list(x_train)

clf = KNeighborsClassifier(n_neighbors=12)
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
acc_score = accuracy_score(y_test, pred)
auc_score = roc_auc_score(y_test, pred)
cv_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)
print(acc_score)
print(auc_score)
print(cv_score.mean())
print(cv_score.std())

model_dct = {"model_id": uuid.uuid4(),
             "name": "k-Nearest neighbours on all features",
             "model_type": "kNN",
             "features": features,
             "accuracy": np.round(100.*acc_score,2),
             "mean_cv_accuracy": np.round(100.*cv_score.mean(),2),
             "std_cv_accuracy": np.round(100.*cv_score.std(),2)}
league_table.append(model_dct)

0.7894736842105263
0.7787174066243834
0.7891304347826087
0.05599585366616302


### 03.07 --- "Support Vector Classifier"

In [470]:
# model creation and fit
features = list(x_train)

clf = SVC(kernel='rbf', C=3)
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
acc_score = accuracy_score(y_test, pred)
auc_score = roc_auc_score(y_test, pred)
cv_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)
print(acc_score)
print(auc_score)
print(cv_score.mean())
print(cv_score.std())

model_dct = {"model_id": uuid.uuid4(),
             "name": "SVC on all features",
             "model_type": "SVC",
             "features": features,
             "accuracy": np.round(100.*acc_score,2),
             "mean_cv_accuracy": np.round(100.*cv_score.mean(),2),
             "std_cv_accuracy": np.round(100.*cv_score.std(),2)}
league_table.append(model_dct)

0.8947368421052632
0.8893587033121917
0.8021739130434783
0.05139441961479929


### 03.08 --- "XGB"

In [471]:
# model creation and fit
features = list(x_train)

clf = XGBClassifier(learning_rate=0.01, n_estimators=20)
clf.fit(x_train[features], y_train)
pred = clf.predict(x_test[features])
acc_score = accuracy_score(y_test, pred)
auc_score = roc_auc_score(y_test, pred)
cv_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)
print(acc_score)
print(auc_score)
print(cv_score.mean())
print(cv_score.std())

model_dct = {"model_id": uuid.uuid4(),
             "name": "XGB on all features",
             "model_type": "XGB",
             "features": features,
             "accuracy": np.round(100.*acc_score, 2),
             "mean_cv_accuracy": np.round(100.*cv_score.mean(),2),
             "std_cv_accuracy": np.round(100.*cv_score.std(),2)}
league_table.append(model_dct)

0.8421052631578947
0.828752642706131
0.7976284584980237
0.05833354721655432


### 03.09 --- Model stacking

In [473]:
# model creation and fit
features = list(x_train)

clf_svc = SVC(kernel='rbf', C=3)
clf_rf = RandomForestClassifier(max_depth=8, n_estimators=16, max_features=3)
clf_xgb = XGBClassifier(learning_rate=0.01, n_estimators=20)
clf_knn = KNeighborsClassifier(n_neighbors=12)
clf_lr = LogisticRegression(max_iter=1000)
clf=StackingCVClassifier(classifiers=[clf_svc, clf_rf, clf_knn, clf_xgb], meta_classifier=clf_svc, random_state=42)
clf.fit(x_train,y_train)

pred = clf.predict(x_test[features])
acc_score = accuracy_score(y_test, pred)
auc_score = roc_auc_score(y_test, pred)
cv_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)
print(acc_score)
print(auc_score)
print(cv_score.mean())
print(cv_score.std())

model_dct = {"model_id": uuid.uuid4(),
             "name": "Stacking classifiers",
             "model_type": "Stacking",
             "features": features,
             "accuracy": np.round(100.*acc_score, 2),
             "mean_cv_accuracy": np.round(100.*cv_score.mean(),2),
             "std_cv_accuracy": np.round(100.*cv_score.std(),2)}
league_table.append(model_dct)

0.868421052631579
0.8625792811839322
0.824110671936759
0.037663649801016885


# - - - Model dictionary a.k.a "League Table"

In [474]:
league_table = pd.DataFrame.from_dict(league_table)
league_table.head(11)

Unnamed: 0,model_id,name,model_type,features,accuracy,model,mean_cv_accuracy,std_cv_accuracy
0,ba0e0df4-2971-4763-b103-ff2cbe6a5dd0,Random baseline,Target probability,,50.0,,,
1,67e7de27-9fe7-4bc9-99e0-c1af0004c093,Key feature - sex,Naive Bayes,[sex],50.0,b'\x80\x03csklearn.naive_bayes\nGaussianNB\nq\...,,
2,f09a1b01-24a8-4073-b6d1-c9de8e7e918e,Key feature - number vessels flourosopy,Naive Bayes,[number vessels flourosopy],64.47,,,
3,3c50ad11-b9f1-4d12-bb35-7bd6d60e6d3c,Key feature - chest pain type,Naive Bayes,[chest pain type],76.32,,,
4,487cdcc4-4691-4bc6-b96b-3f128f93cab3,Random forest on all features,Random forest,"[age, sex, chest pain type, resting blood pres...",85.53,,80.26,5.41
5,ff236a2e-0fff-4965-9432-4efd9a3e87bb,Logistic regression on all features,Logistic regression,"[age, sex, chest pain type, resting blood pres...",80.26,,81.05,6.62
6,81866008-e368-4205-b4b9-ffdd876e4572,k-Nearest neighbours on all features,kNN,"[age, sex, chest pain type, resting blood pres...",78.95,,78.91,5.6
7,692fde48-aaff-41e7-8d90-2ac83fd3a506,SVC on all features,SVC,"[age, sex, chest pain type, resting blood pres...",89.47,,80.22,5.14
8,14dc2a4c-8b2b-4f59-be5f-fa6e60743500,XGB on all features,XGB,"[age, sex, chest pain type, resting blood pres...",84.21,,79.76,5.83
9,2447ba60-2bf0-4ebc-b293-0f2570142b2d,LGBM on all features,LGBM,"[age, sex, chest pain type, resting blood pres...",84.21,,78.1,8.52


In [475]:
plot_league_table(league_table)