In [99]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter

## Load Data

In [100]:
# Read the heart-disease dataset (path is relative to this notebook) and preview it
csv_path = '../Heart_Disease_Prediction.csv'
heart_disease_df = pd.read_csv(csv_path)
heart_disease_df.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


## Pre-processing

In [101]:
# Count the rows in each target class to see how balanced the dataset is
target_labels = heart_disease_df['Heart Disease']
target_labels.value_counts()

Absence     150
Presence    120
Name: Heart Disease, dtype: int64

In [102]:
# Convert the string labels of the target column into integer codes.
# Judging by the previews, 'Absence' is encoded as 0 and 'Presence' as 1.
le = LabelEncoder()
encoded_target = le.fit_transform(heart_disease_df['Heart Disease'])

# Work on a copy so the raw frame keeps its original string labels
encoded_df = heart_disease_df.copy()
encoded_df['Heart Disease'] = encoded_target
encoded_df.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,1
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,0
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,1
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,0
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,0


### ^^ Heart Disease Note:
#### 1 = Presence
#### 0 = Absence

In [103]:
# Define the target and features
# The target is extracted as a 1-D NumPy array. `Series.to_numpy()` is used
# because `Series.ravel()` is deprecated in recent pandas releases.
y = encoded_df['Heart Disease'].to_numpy()

# Every remaining column is used as a model feature
X = encoded_df.drop(columns='Heart Disease')

In [104]:
# Split into train and test sets; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=22,
)

In [105]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Fit the Random Forest Model

In [106]:
# Configure the random forest: 128 trees and a fixed seed for repeatable results
model = RandomForestClassifier(
    random_state=20,
    n_estimators=128,
)

In [107]:
# Fit the model
# Train on the scaled training features. scikit-learn's `fit` returns the
# fitted estimator, so `model` is simply rebound to the fitted classifier.
model = model.fit(X_train_scaled, y_train)

## Make Predictions

In [108]:
# Make predictions using the testing data
# The test features were transformed with the scaler fitted on the training
# split, so they are in the same representation the model was trained on.
predictions = model.predict(X_test_scaled)

## Evaluate the Model

In [109]:
# Build the confusion matrix for the test-set predictions and display it
matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
matrix

array([[34,  2],
       [ 9, 23]], dtype=int64)

In [110]:
# Calculate accuracy score
# The result gets its own name: rebinding `accuracy_score` would shadow the
# imported scikit-learn function, so re-running this cell would raise
# "TypeError: 'float' object is not callable".
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy score: {accuracy}")

Accuracy score: 0.8382352941176471


In [111]:
# Build the per-class precision / recall / F1 report for the test set.
# The text is stored under its own name: rebinding `classification_report`
# would shadow the imported scikit-learn function, so re-running this cell
# would raise "TypeError: 'str' object is not callable".
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.94      0.86        36
           1       0.92      0.72      0.81        32

    accuracy                           0.84        68
   macro avg       0.86      0.83      0.83        68
weighted avg       0.85      0.84      0.84        68



## Rank Importance of the Features

In [112]:
# Calculate feature importance
# Read the importances from the fitted forest: one value per feature column.
# They are paired with `X.columns` in the next cell.
importances = model.feature_importances_
importances

array([0.09127707, 0.03004352, 0.10262158, 0.0801181 , 0.08460157,
       0.01043904, 0.02083626, 0.10881749, 0.07312787, 0.12528118,
       0.056498  , 0.09641122, 0.1199271 ])

In [113]:
# Pair each importance with its column name, then list them from most to
# least important
importance_by_feature = zip(model.feature_importances_, X.columns)
sorted(importance_by_feature, reverse=True)

[(0.12528117990303572, 'ST depression'),
 (0.1199270969285885, 'Thallium'),
 (0.10881748998297755, 'Max HR'),
 (0.10262158374449802, 'Chest pain type'),
 (0.09641122180658694, 'Number of vessels fluro'),
 (0.091277065486575, 'Age'),
 (0.0846015658249309, 'Cholesterol'),
 (0.08011810222129855, 'BP'),
 (0.07312787456646674, 'Exercise angina'),
 (0.0564980002198541, 'Slope of ST'),
 (0.03004351854214346, 'Sex'),
 (0.020836263015885457, 'EKG results'),
 (0.010439037757158988, 'FBS over 120')]

## Testing if Scaling Makes any Difference in this Model

In [114]:
# Train the model with unscaled data
# A separate estimator is created with the same hyperparameters as `model`.
# Calling `model.fit(...)` here would refit that same object in place, which
# would silently replace the forest trained on scaled data and make
# `unscaled_model` just another name for `model`.
unscaled_model = RandomForestClassifier(n_estimators=128, random_state=20)
unscaled_model = unscaled_model.fit(X_train, y_train)

In [115]:
# Make predictions
# Predict on the raw (unscaled) test features with the estimator fitted on
# unscaled data. `unscaled_model` is the name bound in the previous cell;
# the earlier `model2` was never defined and raised a NameError on a
# fresh kernel.
unscaled_predictions = unscaled_model.predict(X_test)

In [116]:
# Confusion matrix for the predictions made from unscaled features
matrix = confusion_matrix(y_true=y_test, y_pred=unscaled_predictions)
matrix

array([[34,  2],
       [ 9, 23]], dtype=int64)

### The same confusion matrix is produced with and without scaling, so scaling the features made no difference for this random forest model