In [1]:
import pandas as pd
import numpy as np
from random import randint

## Step 1: Getting Data Ready

In [2]:
# Read the heart-disease CSV (path is relative to the notebook's working directory)
# into a pandas DataFrame.
heart_disease_df = pd.read_csv(filepath_or_buffer="datasets/heart-disease.csv")

In [3]:
# Sanity check: show the first five rows to confirm the file was parsed as expected.
heart_disease_df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# Seed NumPy's global random number generator so later cells are repeatable.
np.random.seed(seed=42)

# Separate the data into:
#   X - every column except 'target' (the features used for predicting)
#   Y - the 'target' column (the value to be predicted)
X = heart_disease_df.drop(columns=['target'])
Y = heart_disease_df.loc[:, 'target']

In [5]:
# Splitting dataset into test & train sets using train_test_split method.
# random_state is passed explicitly so that re-running this cell on its own always
# produces the same split, instead of depending on how much of the global NumPy
# random state (seeded in an earlier cell) has already been consumed.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

In [6]:
# Confirm the split sizes: shapes of X_train, Y_train, X_test, Y_test (in that order).
tuple(part.shape for part in (X_train, Y_train, X_test, Y_test))

((227, 13), (227,), (76, 13), (76,))

In [7]:
# Looking at the row (index) labels that ended up in the X_test set.
# These are labels from the original DataFrame, not positions 0..n-1.
print (X_test.index)

Int64Index([179, 228, 111, 246,  60,   9, 119, 223, 268,  33,   5, 101,  45,
            175, 118,  46, 125, 192, 285, 279, 152, 269, 272,  25, 146, 283,
            254,  73, 231, 109, 139, 284, 198,  42,  17, 168,  76,  90,  24,
             57,  92,  77, 137, 116,   7, 251, 281,  78, 292, 232, 219, 255,
             63,  82, 236, 204, 249, 104, 300, 193, 184, 132, 202, 196,  75,
            176,  59,  93,   6, 177,  30,  22, 258,  56, 242, 114],
           dtype='int64')


## Step 2: Choosing the Model
The model is chosen by following the Model Selection Map. This example is a classification problem.
<br>This step normally involves trying several different models.
<br>Here, `RandomForestClassifier` from `sklearn.ensemble` is used.

In [8]:
# Create the classifier with its default hyperparameters (no arguments are passed).
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

## Step 3: Fitting Model to Data & Making Predictions

In [9]:
# Train the model on the training split only (features X_train, labels Y_train).
model.fit(X=X_train, y=Y_train)

In [10]:
# Predict a class label for every row of the held-out test features.
Y_pred = model.predict(X=X_test)
Y_pred

array([0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1], dtype=int64)

In [11]:
# Predict per-class probabilities for the test features, then show the first 10 rows.
Y_prob_pred = model.predict_proba(X=X_test)
Y_prob_pred[:10, :]

array([[0.91, 0.09],
       [0.45, 0.55],
       [0.51, 0.49],
       [0.85, 0.15],
       [0.25, 0.75],
       [0.05, 0.95],
       [0.25, 0.75],
       [0.97, 0.03],
       [0.98, 0.02],
       [0.52, 0.48]])

In [12]:
# Getting a random index label from the X_test set.
# random.randint includes BOTH endpoints, so the upper bound must be the last valid
# position (number of rows - 1); an upper bound of X_test.shape[0] can pick a
# position one past the end and raise IndexError.
# Note: np.random.seed() does not seed the stdlib `random` module, so the chosen
# row changes from run to run.
random_index = X_test.index[randint(0, X_test.shape[0] - 1)]

In [13]:
# Making a prediction on a single sample (the model needs a 2-D input).
# Selecting with a list of labels ([[...]]) returns a DataFrame rather than a Series,
# so the input is already 2-D and keeps the same column names the model was fitted
# with; no conversion to a bare NumPy array or reshape is needed.
model.predict(X_test.loc[[random_index]])



array([0], dtype=int64)

In [14]:
# Show the feature values of the randomly chosen row from the X_test DataFrame
# (X was built without the 'target' column, so only features appear here).
X_test.loc[random_index]

age          58.0
sex           1.0
cp            0.0
trestbps    125.0
chol        300.0
fbs           0.0
restecg       0.0
thalach     171.0
exang         0.0
oldpeak       0.0
slope         2.0
ca            2.0
thal          3.0
Name: 236, dtype: float64

In [15]:
# Show the true label of the same randomly chosen row from the Y_test Series,
# for comparison with the single-sample prediction made earlier.
Y_test.loc[random_index]

0

In [16]:
# Print the same row from the full heart_disease_df (features plus 'target')
# to cross-check it against the X_test / Y_test look-ups.
print(heart_disease_df.loc[random_index])

age          58.0
sex           1.0
cp            0.0
trestbps    125.0
chol        300.0
fbs           0.0
restecg       0.0
thalach     171.0
exang         0.0
oldpeak       0.0
slope         2.0
ca            2.0
thal          3.0
target        0.0
Name: 236, dtype: float64


## Step 4: Evaluating Model

### 4.1: Using `score()` method:

In [17]:
# Score the model on the data it was trained on first (compare with the test-set score).
model.score(X=X_train, y=Y_train)

1.0

In [18]:
# Score the model on the held-out test split, which it did not see during fitting.
model.score(X=X_test, y=Y_test)

0.8289473684210527

### 4.2: Using **Scoring** parameters:

#### a) Cross-Validation Accuracy:

In [19]:
from sklearn.model_selection import cross_val_score
# Run 5-fold cross-validation over the full X and Y, then average the per-fold scores.
cross_val_score_array = cross_val_score(model, X, Y, cv=5)
cross_val_score_array.mean()

0.811639344262295

#### b) Area Under the Curve (AUC/ROC):
- `predict_proba()` is used for this metric

In [20]:
# Per-class prediction probabilities for the test features.
Y_probs_pred = model.predict_proba(X=X_test)

# Display only the first 10 rows of probabilities.
Y_probs_pred[:10, :]


array([[0.91, 0.09],
       [0.45, 0.55],
       [0.51, 0.49],
       [0.85, 0.15],
       [0.25, 0.75],
       [0.05, 0.95],
       [0.25, 0.75],
       [0.97, 0.03],
       [0.98, 0.02],
       [0.52, 0.48]])

In [21]:
# Each row of Y_probs_pred holds two probabilities: the left column is the probability of the
# *False* class and the right column the probability of the *True* class
# (presumably ordered as in model.classes_ - confirm if the labels ever change).
# Keep only the right-hand column (index 1), i.e. the positive-class probability.
Y_positive = Y_probs_pred[:,1]
Y_positive

array([0.09, 0.55, 0.49, 0.15, 0.75, 0.95, 0.75, 0.03, 0.02, 0.48, 0.85,
       0.31, 0.94, 0.14, 0.95, 0.99, 1.  , 0.14, 0.05, 0.05, 0.54, 0.07,
       0.69, 0.71, 0.64, 0.72, 0.8 , 0.77, 0.1 , 0.79, 0.05, 0.13, 0.02,
       0.35, 0.58, 0.06, 0.66, 0.71, 0.65, 0.86, 0.87, 0.82, 0.84, 0.75,
       0.75, 0.28, 0.64, 0.98, 0.11, 0.05, 0.21, 0.18, 0.83, 0.74, 0.05,
       0.11, 0.31, 1.  , 0.11, 0.01, 0.22, 0.96, 0.24, 0.45, 0.93, 0.13,
       0.35, 0.53, 0.85, 0.86, 0.81, 0.92, 0.58, 0.87, 0.22, 0.93])

- Calculating FPR, TPR, and thresholds:

In [22]:
# Bring in roc_curve and compute the ROC points from the true test labels
# and the positive-class probabilities.
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_true=Y_test, y_score=Y_positive)

# Display the false-positive rates.
fpr

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.02857143, 0.02857143,
       0.02857143, 0.05714286, 0.05714286, 0.05714286, 0.05714286,
       0.08571429, 0.08571429, 0.11428571, 0.11428571, 0.17142857,
       0.2       , 0.2       , 0.22857143, 0.22857143, 0.25714286,
       0.31428571, 0.37142857, 0.45714286, 0.57142857, 0.65714286,
       0.77142857, 0.88571429, 0.91428571, 0.97142857, 1.        ])

## Step 5: Improving Model


In [23]:
# Try different numbers of estimators, reporting both the hold-out (test set) accuracy
# and the 5-fold cross-validation score for each.

from sklearn.model_selection import cross_val_score

# Each candidate is kept in its own variable so the loop does not silently replace
# `model` with whichever forest happened to be trained last. The candidate with the
# highest cross-validation score is assigned to `model` once the loop has finished.
# Selection uses the cross-validation score rather than the test score so the test
# split is not used for choosing the model.
best_cv_score = -np.inf
best_model = None

for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    candidate = RandomForestClassifier(n_estimators=i).fit(X_train, Y_train)
    test_accuracy = candidate.score(X_test, Y_test)
    # Cross-validation is run on the full X and Y, not only on the training split
    cv_accuracy = np.mean(cross_val_score(candidate, X, Y, cv=5))
    # Both metrics are printed as percentages so they can be compared directly
    print(f"Model accuracy on test set: {test_accuracy * 100:.2f}%")
    print(f"Cross-validation score: {cv_accuracy * 100:.2f}%")
    print("")
    if cv_accuracy > best_cv_score:
        best_cv_score = cv_accuracy
        best_model = candidate

# `model` now refers to the best candidate found above
model = best_model

Trying model with 10 estimators...
Model accruacy on test set: 0.8289473684210527
Cross-validation score: 80.5191256830601%

Trying model with 20 estimators...
Model accruacy on test set: 0.8026315789473685
Cross-validation score: 79.85245901639344%

Trying model with 30 estimators...
Model accruacy on test set: 0.8157894736842105
Cross-validation score: 81.16939890710381%

Trying model with 40 estimators...
Model accruacy on test set: 0.8421052631578947
Cross-validation score: 81.83060109289617%

Trying model with 50 estimators...
Model accruacy on test set: 0.8289473684210527
Cross-validation score: 82.1639344262295%

Trying model with 60 estimators...
Model accruacy on test set: 0.8421052631578947
Cross-validation score: 83.8032786885246%

Trying model with 70 estimators...
Model accruacy on test set: 0.8157894736842105
Cross-validation score: 81.81967213114754%

Trying model with 80 estimators...
Model accruacy on test set: 0.8552631578947368
Cross-validation score: 82.497267759562

## Step 6: Saving Trained Model

In [24]:
import pickle

# Save trained model to file.
# The `with` block guarantees the file handle is flushed and closed, even if
# pickle.dump raises part-way through; a bare open(...) would leave it open.
with open("random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(model, model_file)