In [1]:
import pandas as pd
import numpy as np
from random import randint

## Step 1: Getting Data Ready

In [2]:
# Load the heart-disease CSV (path is relative to the working directory)
# into a pandas DataFrame.
heart_disease_df = pd.read_csv(filepath_or_buffer="datasets/heart-disease.csv")

In [3]:
# Preview the first five rows to confirm the CSV was loaded as expected.
heart_disease_df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# Seed NumPy's global random generator so the results are reproducible.
np.random.seed(42)

# Separate the predictors from the label:
#   X -> every column except 'target' (the features used for predicting)
#   Y -> the 'target' column (the value to be predicted)
X = heart_disease_df.drop(columns='target')
Y = heart_disease_df.loc[:, 'target']

# Display both shapes as a quick sanity check.
(X.shape, Y.shape)

((303, 13), (303,))

### Splitting dataset into train, validation, and test sets using the train_test_split method twice:

In [5]:
from sklearn.model_selection import train_test_split

# First of two splits: hold out 20% of the rows as the test set. The remaining
# 80% (the *_train_valid variables) is divided into train and validation sets
# by a second call to train_test_split.
(X_train_valid, X_test,
 Y_train_valid, Y_test) = train_test_split(X, Y, test_size=0.2)

In [6]:
# Second split: 70% of the non-test rows become the training set,
# the rest become the validation set.
(X_train, X_valid,
 Y_train, Y_valid) = train_test_split(
    X_train_valid, Y_train_valid, train_size=0.7
)

In [7]:
# Sanity check: show the shape of every piece of the train / validation / test split.
tuple(
    part.shape
    for part in (X_train, Y_train, X_valid, Y_valid, X_test, Y_test)
)

((169, 13), (169,), (73, 13), (73,), (61, 13), (61,))

## Step 2: Choosing the Model
The model is chosen by following the Model Selection Map. In this example, the task is a classification problem.
<br>In practice, this step involves trying several different models.
<br>Here, `RandomForestClassifier` from `sklearn.ensemble` is used.

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters. random_state=None is the default,
# spelled out here to make explicit that no estimator-level seed is set.
model = RandomForestClassifier(random_state=None)

## Step 3: Fitting Model to Data & Making Predictions

In [9]:
# Train the forest on the training split only, using fit().
model.fit(X=X_train, y=Y_train)

Instantiating Calibration Model:

In [10]:
from sklearn.calibration import CalibratedClassifierCV

# Wrap the forest in a sigmoid calibrator that uses 35-fold cross-validation.
# NOTE(review): with an integer `cv`, CalibratedClassifierCV fits clones of
# `model` inside its own fit(); the forest already trained on X_train is not
# reused. If the intent is to calibrate that pre-fitted forest on the
# validation split, cv="prefit" (or FrozenEstimator on scikit-learn >= 1.6)
# is the matching setup -- confirm, and note that the later cross_val_score
# call on `cal_model` would then need its own unfitted estimator.
cal_model = CalibratedClassifierCV(
    model,
    method="sigmoid",
    cv=35,
)

In [11]:
# Fit the calibration model on the validation split using fit().
cal_model.fit(X=X_valid, y=Y_valid)

In [12]:
# Class-label predictions of the uncalibrated forest on the test split.
Y_pred = model.predict(X=X_test)
Y_pred

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [13]:
# Class-label predictions of the calibrated model on the same test split.
Y_cal_pred = cal_model.predict(X=X_test)
Y_cal_pred

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [14]:
# Per-class probabilities from the uncalibrated forest via predict_proba();
# only the first 10 rows are displayed.
Y_prob_pred = model.predict_proba(X=X_test)
Y_prob_pred[:10, :]

array([[0.78, 0.22],
       [0.44, 0.56],
       [0.39, 0.61],
       [0.92, 0.08],
       [0.17, 0.83],
       [0.25, 0.75],
       [0.18, 0.82],
       [0.95, 0.05],
       [0.95, 0.05],
       [0.51, 0.49]])

In [15]:
# Per-class probabilities from the calibrated model via predict_proba();
# only the first 10 rows are displayed.
Y_prob_cal_pred = cal_model.predict_proba(X=X_test)
Y_prob_cal_pred[:10, :]

array([[0.74268875, 0.25731125],
       [0.35870389, 0.64129611],
       [0.4216634 , 0.5783366 ],
       [0.69406025, 0.30593975],
       [0.39844504, 0.60155496],
       [0.2589538 , 0.7410462 ],
       [0.47266703, 0.52733297],
       [0.75980122, 0.24019878],
       [0.72980034, 0.27019966],
       [0.41124629, 0.58875371]])

## Step 4: Evaluating Model

### 4.1: Using `score()` method:

In [16]:
# score() result of the uncalibrated forest on the held-out test split.
model.score(X=X_test, y=Y_test)

0.8360655737704918

In [17]:
# score() result of the calibrated model on the same test split, for comparison.
cal_model.score(X=X_test, y=Y_test)

0.8688524590163934

### 4.2: Using **Scoring** parameters:

#### a) Cross-Validation Accuracy:

In [18]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the forest over the full dataset (X, Y);
# the mean of the five fold scores is displayed.
cross_val_score_array = cross_val_score(model, X, Y, cv=5)
cross_val_score_array.mean()

0.8181967213114755

In [19]:
# 5-fold cross-validation of the calibrated model over the full dataset (X, Y).
# cross_val_score fits a fresh clone of `cal_model` on each training fold, so
# the instance fitted on the validation split earlier is left untouched.
cross_val_score_array_cal = cross_val_score(cal_model, X, Y, cv=5)

# Report the mean fold score (previously the scores were computed but never
# shown), mirroring the cell that evaluates the uncalibrated model.
np.mean(cross_val_score_array_cal)

#### b) Area Under the ROC Curve (ROC AUC):
- `predict_proba()` is used for this metric

In [20]:
# Finding Prediction Probability:
# Y_probs_pred = model.predict_proba(X_test)

# Printing first 10 Probabilities:
# Y_probs_pred[:10]


In [21]:
# Left column represents probability of prediction being *False* while right column represents probability of prediction being *True*
# Y_positive = Y_probs_pred[:,1]
# Y_positive

- Calculating FPR, TPR, and thresholds:

In [22]:
# Importing roc_curve:
# from sklearn.metrics import roc_curve
# fpr,tpr,thresholds = roc_curve(Y_test,Y_positive)

# Printing out fpr:
# fpr

## Step 5: Improving Model


In [23]:
# Try different numbers of estimators with cross-validation and no cross-validation

#from sklearn.model_selection import cross_val_score

#for i in range(10, 100, 10):
#    print(f"Trying model with {i} estimators...")
#    model = RandomForestClassifier(n_estimators=i).fit(X_train, Y_train)
#    print(f"Model accuracy on test set: {model.score(X_test, Y_test)}")
#    print(f"Cross-validation score: {np.mean(cross_val_score(model, X, Y, cv=5)) * 100}%")
#    print("")

## Step 6: Saving Trained Model

In [24]:
# import pickle

# Save trained model to file
#pickle.dump(model, open("random_forest_model_1.pkl", "wb"))