In [1]:
import pandas as pd
import numpy as np
from random import randint

Step 1: Getting Data Ready

In [2]:
# Storing dataset into a Pandas DataFrame
heart_disease_df = pd.read_csv("datasets/heart-disease.csv")

In [3]:
# Printing head of DataFrame to make sure the data has been loadede correctly
heart_disease_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# Creating X & Y, where:
# X: Feature columns in DataFrame (whatever is needed for predicting)
# Y: Column to be predicted
X = heart_disease_df.drop('target',axis = 1)
Y = heart_disease_df['target']

In [5]:
# Splitting dataset into test & train sets using train_test_split method:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y)

In [6]:
# Looking at shapes to ensure that the dataset was split properly:
X_train.shape,Y_train.shape,X_test.shape,Y_test.shape

((227, 13), (227,), (76, 13), (76,))

In [7]:
# Looking at labels in X_test set:
print (X_test.index)

Int64Index([214,  71, 267, 250,   5, 103, 137, 151, 278, 191,  35, 212, 170,
            224, 231, 192,  68,  54,  15, 167, 282, 106, 120,  98,  94, 184,
             26, 238, 133, 230,  73, 229, 244, 223, 186, 157, 285, 197, 276,
            296, 194,  91, 264,  25, 122, 266, 255, 113,  34, 207, 105,  83,
            200, 188,   1, 165,  64, 193, 134, 130, 112, 270, 292, 166, 288,
              6, 275, 287, 119, 141, 158,  45,  16, 178, 109, 102],
           dtype='int64')


Step 2: Choosing the Model
Done by following the Model Selection Map. In this example, it's a Classification problem.
Involves trying different models.
RandomForestClassifier from sklearn.ensemble is used.

In [8]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

Step 3: Fitting Model to Data & Making Predictions

In [9]:
# Fitting data to model using fit () method on train sets
model.fit(X_train, Y_train)

In [10]:
# Making predictions using predict() method:
Y_pred = model.predict(X_test)

In [11]:
# Getting a random index from the X_test set
random_index = X_test.index[randint(0,X_test.shape[0])]

In [12]:
# Making a prediction on a single sample (has to be array)
X_test.loc[random_index]
model.predict(np.array(X_test.loc[random_index]).reshape(1, -1))
# (-1) makes NumPy figure out how many columns this array has



array([0], dtype=int64)

In [13]:
# Printing the random element from X_test DF
X_test.loc[random_index]

age          56.0
sex           1.0
cp            0.0
trestbps    125.0
chol        249.0
fbs           1.0
restecg       0.0
thalach     144.0
exang         1.0
oldpeak       1.2
slope         1.0
ca            1.0
thal          2.0
Name: 214, dtype: float64

In [14]:
# Printing the random element from Y_test Series
Y_test.loc[random_index]

0

In [15]:
# Printing the random element from heart_disease_df
print(heart_disease_df.loc[random_index])

age          56.0
sex           1.0
cp            0.0
trestbps    125.0
chol        249.0
fbs           1.0
restecg       0.0
thalach     144.0
exang         1.0
oldpeak       1.2
slope         1.0
ca            1.0
thal          2.0
target        0.0
Name: 214, dtype: float64


Step 4: Evaluating Model

In [16]:
# Perform .score on train set first:
model.score(X_train, Y_train)

1.0

In [17]:
# Perform .score on test set:
model.score(X_test, Y_test)

0.8157894736842105

Step 5: Improving Model


In [18]:
# Try different numbers of estimators with cross-validation and no cross-validation

from sklearn.model_selection import cross_val_score

for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    model = RandomForestClassifier(n_estimators=i).fit(X_train, Y_train)
    print(f"Model accruacy on test set: {model.score(X_test, Y_test)}")
    print(f"Cross-validation score: {np.mean(cross_val_score(model, X, Y, cv=5)) * 100}%")
    print("")

Trying model with 10 estimators...
Model accruacy on test set: 0.7763157894736842
Cross-validation score: 81.18032786885246%

Trying model with 20 estimators...
Model accruacy on test set: 0.7763157894736842
Cross-validation score: 83.14207650273224%

Trying model with 30 estimators...
Model accruacy on test set: 0.8289473684210527
Cross-validation score: 80.50819672131148%

Trying model with 40 estimators...
Model accruacy on test set: 0.8157894736842105
Cross-validation score: 80.50819672131148%

Trying model with 50 estimators...
Model accruacy on test set: 0.8157894736842105
Cross-validation score: 81.84153005464483%

Trying model with 60 estimators...
Model accruacy on test set: 0.7894736842105263
Cross-validation score: 83.1584699453552%

Trying model with 70 estimators...
Model accruacy on test set: 0.8026315789473685
Cross-validation score: 81.81967213114754%

Trying model with 80 estimators...
Model accruacy on test set: 0.8157894736842105
Cross-validation score: 83.1475409836

Step 6: Saving Trained Model

In [19]:
import pickle

# Save trained model to file
pickle.dump(model, open("random_forest_model_1.pkl", "wb"))