In [27]:
# import pandas, numpy, and necessary sklearn functions

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score, plot_precision_recall_curve, plot_confusion_matrix

In [28]:
# explore and clean data
df = pd.read_csv('data/cardiac_data.csv',na_values='?')
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num       '],
      dtype='object')

In [29]:
# rename num to target
df = df.rename(columns={'num       ': 'target'})

df['target'].value_counts(dropna=False)

0    188
1    106
Name: target, dtype: int64

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    int64  
 1   sex       294 non-null    int64  
 2   cp        294 non-null    int64  
 3   trestbps  293 non-null    float64
 4   chol      271 non-null    float64
 5   fbs       286 non-null    float64
 6   restecg   293 non-null    float64
 7   thalach   293 non-null    float64
 8   exang     293 non-null    float64
 9   oldpeak   294 non-null    float64
 10  slope     104 non-null    float64
 11  ca        3 non-null      float64
 12  thal      28 non-null     float64
 13  target    294 non-null    int64  
dtypes: float64(10), int64(4)
memory usage: 32.3 KB


In [31]:
df = df.drop(['slope','ca','thal'], axis=1)

df = df.dropna().copy()

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 261 entries, 0 to 293
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       261 non-null    int64  
 1   sex       261 non-null    int64  
 2   cp        261 non-null    int64  
 3   trestbps  261 non-null    float64
 4   chol      261 non-null    float64
 5   fbs       261 non-null    float64
 6   restecg   261 non-null    float64
 7   thalach   261 non-null    float64
 8   exang     261 non-null    float64
 9   oldpeak   261 non-null    float64
 10  target    261 non-null    int64  
dtypes: float64(7), int64(4)
memory usage: 24.5 KB


The ten features we’ll be using are:

 1. **age:** age in years
 2. **sex:** sex (1 = male; 0 = female)
 3. **cp:** chest pain type
     - 1 = typical angina
     - 2 = atypical angina
     - 3 = non-anginal pain
     - 4 = asymptomatic
 4. **trestbps:** resting blood pressure (in mm Hg on admission to the hospital)
 5.  **chol:** serum cholesterol in mg/dl
 6. **fbs:** (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
 7. **restecg:** resting electrocardiographic results
    - 0 = normal 
    - 1 = having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    - 2 = showing probable or definite left ventricular hypertrophy by Estes’ criteria
 8. **thalach:** maximum heart rate achieved
 9. **exang:** exercise-induced angina (1 = yes; 0 = no)
 10. **oldpeak:** ST depression induced by exercise relative to rest

In [33]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,target
0,28,1,2,130.0,132.0,0.0,2.0,185.0,0.0,0.0,0
1,29,1,2,120.0,243.0,0.0,0.0,160.0,0.0,0.0,0
3,30,0,1,170.0,237.0,0.0,1.0,170.0,0.0,0.0,0
4,31,0,2,100.0,219.0,0.0,1.0,150.0,0.0,0.0,0
5,32,0,2,105.0,198.0,0.0,0.0,165.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
289,52,1,4,160.0,331.0,0.0,0.0,94.0,1.0,2.5,1
290,54,0,3,130.0,294.0,0.0,1.0,100.0,1.0,0.0,1
291,56,1,4,155.0,342.0,1.0,0.0,150.0,1.0,3.0,1
292,58,0,2,180.0,393.0,0.0,0.0,110.0,1.0,1.0,1


In [34]:
 ## Transform the Categorical Variables:
 ### Creating Dummy Variables

In [35]:
df['cp'].value_counts(dropna=False)

df['restecg'].value_counts(dropna=False)

0.0    208
1.0     47
2.0      6
Name: restecg, dtype: int64

In [36]:
df = pd.get_dummies(df,columns=['cp','restecg'],drop_first=True)

numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
cat_cols = list(set(df.columns) - set(numeric_cols) - {'target'})
cat_cols.sort()

print(numeric_cols)
print(cat_cols)

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
['cp_2', 'cp_3', 'cp_4', 'exang', 'fbs', 'restecg_1.0', 'restecg_2.0', 'sex']


In [37]:
    ## Split Training and Test Datasets
random_seed = 888
df_train, df_test = train_test_split(df,test_size=0.2, random_state=random_seed, stratify=df['target'])

In [38]:
print(df_train.shape)
print(df_test.shape)
print()
print(df_train['target'].value_counts(normalize=True))
print()
print(df_test['target'].value_counts(normalize=True))

(208, 14)
(53, 14)

0    0.625
1    0.375
Name: target, dtype: float64

0    0.622642
1    0.377358
Name: target, dtype: float64


## Transform the Numerical Variables
### Scaling

In [46]:
scaler = StandardScaler()
scaler.fit(df_train['numeric_cols'])

def get_fetures_and_target_arrays(df, 
                                  numeric_cols,
                                  cat_cols,
                                  scaler):
    X_numeric_scaled = scaler.transform(df[numeric_cols])
    X_categorical = df[cat_cols].to_numpy
    X = np.hstack((X_categorical,X_numeric_scaled))
    y = df['target']
    return X, y

X, y = get_features_and_arrays(df_train, numeric_cols, cat_cols, scaler)

KeyError: 'numeric_cols'

In [41]:
 ## fit the logistic regression model
clf = LogisticRegression(penalty='none')

clf.fit(X,y)

NameError: name 'X' is not defined

In [42]:
 ## Evaluate the model
X_test, y_test = get_features_and_target_arrays(df_test, numeric_cols, cat_cols, scaler)



NameError: name 'get_features_and_target_arrays' is not defined

In [43]:
plot_roc_curve(clf, X_test, y_test)

NameError: name 'plot_roc_curve' is not defined

In [44]:
plot_precision_recall_curve(clf, X_test, y_test)

NameError: name 'X_test' is not defined

In [45]:
test_prob = clf.predict_proba(X_test)[:,1]
test_pred = clf.predict(X_test)

NameError: name 'X_test' is not defined

In [None]:
print('Log loss = {:.5f}'.format(log_loss(y_test, test_prob)))
print('AUC = {:.5f}'.format(roc_auc_score(y_test, test_prob)))
print('Average Precision = {:.5f}'.format(average_precision_score(y_test, test_prob)))
print('\nUsing 0.5 as threshold:')
print('Accuracy = {:.5f}'.format(accuracy_score(y_test, test_pred)))
print('Precision = {:.5f}'.format(precision_score(y_test, test_pred)))
print('Recall = {:.5f}'.format(recall_score(y_test, test_pred)))
print('F1 score = {:.5f}'.format(f1_score(y_test, test_pred)))

print('\nClassification Report')
print(classification_report(y_test, test_pred))

In [None]:
print('Confusion Matrix')
plot_confusion_matrix(clf, X_test, y_test)

In [None]:
## Interpret the Results

In [None]:
coefficients = np.hstack((clf.intercept_,clf.coef_[0]))

pd.DataFrame(data={'variable':['intercept'] + cat_cols + numeric_cols, 'coefficient':coefficients})

In [None]:
pd.DataFrame(data={'variable': numeric_cols, 'unit': np.sqrt(scaler.var_)})