In [2]:
# import the library
import pandas as pd

# Read the CSV 

In [3]:
csv_file2 = "./resources/heart.csv"
heart_df = pd.read_csv(csv_file2)
heart_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [8]:

# Drop the null columns where all values are null
heart_df = heart_df.dropna(axis='columns', how='all')

# Drop the null rows
heart_df = heart_df.dropna()
heart_df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


# Select features (columns)

In [9]:
# Set features
# selected_features = df[['names', 'of', 'selected', 'features', 'here']]
selected_features = heart_df

# Create a Train Test Split

In [10]:
# Assign x and y values
X = heart_df.drop("ca", axis=1)
y = heart_df["ca"]

print(X.shape, y.shape)


(303, 13) (303,)


In [11]:
# Import Dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=115, stratify=y)

In [12]:
X_train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,thal,target
58,34,1,3,118,182,0,0,174,0,0.0,2,2,1
285,46,1,0,140,311,0,1,120,1,1.8,1,3,0
270,46,1,0,120,249,0,0,144,0,0.8,2,3,0
189,41,1,0,110,172,0,0,158,0,0.0,2,3,0
184,50,1,0,150,243,0,0,128,0,2.6,1,3,0


# Pre-processing
Scale the data using the MinMaxScaler and perform some feature selection

In [13]:
# Scale the data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model - Random Forest¶
Use Random Forest model

In [14]:
# Create the Random Forest Model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf = rf.fit(X_train_scaled, y_train)

In [15]:
print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.6447368421052632


In [16]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.15809054, 0.02710844, 0.04706406, 0.11619289, 0.13941327,
       0.02715629, 0.03525888, 0.13981572, 0.02067033, 0.12976527,
       0.03301972, 0.03818745, 0.08825713])

In [17]:
# Sort the features by their importance
sorted(zip(rf.feature_importances_, selected_features), reverse=True)

[(0.15809054405795103, 'age'),
 (0.13981571951243288, 'thalach'),
 (0.13941327029597103, 'chol'),
 (0.1297652662845454, 'oldpeak'),
 (0.11619289076324346, 'trestbps'),
 (0.08825712609222622, 'thal'),
 (0.047064061525920624, 'cp'),
 (0.03818745452098729, 'ca'),
 (0.03525888363720419, 'restecg'),
 (0.03301971950802296, 'slope'),
 (0.02715629468683196, 'fbs'),
 (0.027108437172156448, 'sex'),
 (0.020670331942506715, 'exang')]

# Hyperparameter tuning
Use GridSearchCV to tune the Random Forest model's parameters

In [18]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [250, 300, 350],
              'max_depth': [125, 150, 175]}
grid = GridSearchCV(rf, param_grid, verbose=3)

In [19]:
# Fit the Random Forest model using the grid search estimator. 
# This will take the Random Forest model and try each combination of parameters
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] max_depth=125, n_estimators=250 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..... max_depth=125, n_estimators=250, score=0.565, total=   0.3s
[CV] max_depth=125, n_estimators=250 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ..... max_depth=125, n_estimators=250, score=0.522, total=   0.3s
[CV] max_depth=125, n_estimators=250 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


[CV] ..... max_depth=125, n_estimators=250, score=0.600, total=   0.3s
[CV] max_depth=125, n_estimators=250 .................................
[CV] ..... max_depth=125, n_estimators=250, score=0.533, total=   0.4s
[CV] max_depth=125, n_estimators=250 .................................
[CV] ..... max_depth=125, n_estimators=250, score=0.556, total=   0.3s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.630, total=   0.4s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.522, total=   0.4s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.600, total=   0.4s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.556, total=   0.4s
[CV] max_depth=125, n_estimators=300 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   17.9s finished


GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [125, 150, 175],
                         'n_estimators': [250, 300, 350]},
             verbose=3)

In [20]:
# List the best parameters for this dataset
print(f"Best Parameters: {grid.best_params_}")

# List the best score
print(f"Best Score: {grid.best_score_}")

Best Parameters: {'max_depth': 175, 'n_estimators': 350}
Best Score: 0.5945893719806764


In [21]:
print(f"Training Grid Score: {grid.score(X_train_scaled, y_train)}")
print(f"Testing Grid Score: {grid.score(X_test_scaled, y_test)}")

Training Grid Score: 1.0
Testing Grid Score: 0.618421052631579


In [23]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)

In [24]:
#Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.74      0.91      0.82        44
           1       0.33      0.25      0.29        16
           2       0.22      0.20      0.21        10
           3       0.00      0.00      0.00         5
           4       1.00      1.00      1.00         1

    accuracy                           0.62        76
   macro avg       0.46      0.47      0.46        76
weighted avg       0.54      0.62      0.57        76



  _warn_prf(average, modifier, msg_start, len(result))


# Save the Model

In [25]:
# Save the model 

import joblib
filename = 'MYCROFTXTREEM_RF.sav'
joblib.dump(rf, filename)

['MYCROFTXTREEM_RF.sav']