# Random Forest Model

In [1]:
# import required libraries
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import os
import imblearn
from imblearn.over_sampling import SMOTE
import joblib

In [2]:
# load stroke positive data subset
stroke_subset = pd.read_csv("data/combined_subsets.csv")
stroke_subset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,0,0,228.69,36.6,1,1
1,31112,1,80.0,0,1,1,0,1,105.92,32.5,0,1
2,60182,0,49.0,0,0,1,0,0,171.23,34.4,2,1
3,1665,0,79.0,1,0,1,1,1,174.12,24.0,0,1
4,56669,1,81.0,0,0,1,0,0,186.21,29.0,1,1


In [17]:
# split dataset in features and target variables

# target
target = stroke_subset["stroke"]
target_names = ["no", "yes"]

# features
stroke_data = stroke_subset.drop(["stroke","id"], axis=1)
feature_names = stroke_data.columns

stroke_data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1,67.0,0,1,1,0,0,228.69,36.6,1
1,1,80.0,0,1,1,0,1,105.92,32.5,0
2,0,49.0,0,0,1,0,0,171.23,34.4,2
3,0,79.0,1,0,1,1,1,174.12,24.0,0
4,1,81.0,0,0,1,0,0,186.21,29.0,1


In [18]:
print("**********************")
print(f"Before X: {stroke_data.shape}")
print(f"Before y: {target.shape}")

sm = SMOTE()
stroke_data, target = sm.fit_resample(stroke_data,target)

print("**********************")
print(f"SMOTED X: {stroke_data.shape}")
print(f"SMOTED y: {target.shape}")

**********************
Before X: (29072, 10)
Before y: (29072,)
**********************
SMOTED X: (57048, 10)
SMOTED y: (57048,)


## Used All Parameters

In [19]:
# split dataset into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(stroke_data, target, random_state=42)

In [20]:
# create Random Forest Classifier object
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
difference = train_score - score

print(f"Accuracy using the Random Forest Model for training: {train_score}")
print(f"Accuracy using the Random Forest Model for testing: {score}")
print(f"Difference in accuracy: {difference}")

Accuracy using the Random Forest Model for training: 1.0
Accuracy using the Random Forest Model for testing: 0.970551114850652
Difference in accuracy: 0.02944888514934796


In [21]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.4337738418542013, 'age'),
 (0.19909193357696503, 'avg_glucose_level'),
 (0.13911794395306618, 'bmi'),
 (0.05747410280806864, 'work_type'),
 (0.05433376011240886, 'smoking_status'),
 (0.04768907620421162, 'Residence_type'),
 (0.02819502858363839, 'gender'),
 (0.015715171405726702, 'hypertension'),
 (0.015673874066330522, 'ever_married'),
 (0.008935267435382772, 'heart_disease')]

In [22]:
from sklearn.metrics import classification_report

predictions = rf.predict(stroke_data)

print(classification_report(target, predictions,
                            target_names=["No Stroke", "Stroke"]))

              precision    recall  f1-score   support

   No Stroke       1.00      0.99      0.99     28524
      Stroke       0.99      1.00      0.99     28524

    accuracy                           0.99     57048
   macro avg       0.99      0.99      0.99     57048
weighted avg       0.99      0.99      0.99     57048



In [23]:
filename = 'RF_model.sav'

#dump model to file
joblib.dump(rf, filename)

['RF_model.sav']

## With Max Depth = 3

In [None]:
# create Random Forest Classifier object
rf = RandomForestClassifier(n_estimators=200, max_depth=3)
rf = rf.fit(X_train, y_train)

max_depth3_train_score = rf.score(X_train, y_train)
max_depth3_score = rf.score(X_test, y_test)
difference = max_depth3_train_score - max_depth3_score

print(f"Accuracy using the Random Forest Model with max depth of 3 for training: {max_depth3_train_score}")
print(f"Accuracy using the Random Forest Model with max depth of 3 for testing: {max_depth3_score}")
print(f"Difference in accuracy: {difference}")

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

## With Min Sample Split = 10

In [None]:
# create Random Forest Classifier object
rf = RandomForestClassifier(n_estimators=100, min_samples_split=10)
rf = rf.fit(X_train, y_train)

min_sample10_train_score = rf.score(X_train, y_train)
min_sample10_score = rf.score(X_test, y_test)
difference = min_sample10_train_score - min_sample10_score

print(f"Accuracy using the Random Forest Model with minimum sample of 10 for training: {min_sample10_train_score}")
print(f"Accuracy using the Random Forest Model with minimum sample of 10 for testing: {min_sample10_score}")
print(f"Difference in accuracy: {difference}")

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

## Gender, Age, Hypertension, Heart Disease, Ever Married, BMI, Smoking Status

In [None]:
# split dataset in features and target variables

# target
target = stroke_subset["stroke"]
target_names = ["no", "yes"]

# features
stroke_data = stroke_subset.drop(["stroke","id","work_type","Residence_type", "avg_glucose_level"], axis=1)
feature_names = stroke_data.columns

stroke_data.head()

In [None]:
# split dataset into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(stroke_data, target, random_state=42)

In [None]:
# create Random Forest Classifier object
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)

train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
difference = train_score - score

print(f"Accuracy using the Random Forest Model without work type, residence type, or average glucose level for training: {train_score}")
print(f"Accuracy using the Random Forest Model without work type, residence type, or average glucose level for testing: {score}")
print(f"Difference in accuracy: {difference}")

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

## Gender, Age, Hypertension, Heart Disease, Ever Married, Avg Glucose Level, BMI, Smoking Status

In [None]:
# split dataset in features and target variables

# target
target = stroke_subset["stroke"]
target_names = ["no", "yes"]

# features
stroke_data = stroke_subset.drop(["stroke","id","work_type","Residence_type"], axis=1)
feature_names = stroke_data.columns

stroke_data.head()

In [None]:
# split dataset into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(stroke_data, target, random_state=42)

In [None]:
# create Random Forest Classifier object
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)

train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
difference = train_score - score

print(f"Accuracy using the Random Forest Model without work type and residence type for training: {train_score}")
print(f"Accuracy using the Random Forest Model without work type and residence type for testing: {score}")
print(f"Difference in accuracy: {difference}")

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

## Gender, Age, Hypertension, Heart Disease, Avg Glucose Level, BMI, Smoking Status

In [None]:
# split dataset in features and target variables

# target
target = stroke_subset["stroke"]
target_names = ["no", "yes"]

# features
stroke_data = stroke_subset.drop(["stroke","id","work_type","Residence_type", "ever_married"], axis=1)
feature_names = stroke_data.columns

stroke_data.head()

In [None]:
# split dataset into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(stroke_data, target, random_state=42)

In [None]:
# create Random Forest Classifier object
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)

train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
difference = train_score - score

print(f"Accuracy using the Random Forest Model without work type, residence type, or ever married for training: {train_score}")
print(f"Accuracy using the Random Forest Model without work type, residence type, or ever married for testing: {score}")
print(f"Difference in accuracy: {difference}")

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

## Gender, Age, Heart Disease, BMI

In [None]:
# split dataset in features and target variables

# target
target = stroke_subset["stroke"]
target_names = ["no", "yes"]

# features
stroke_data = stroke_subset.drop(["stroke","id","work_type","Residence_type", "ever_married", "hypertension", "avg_glucose_level", "smoking_status"], axis=1)
feature_names = stroke_data.columns

stroke_data.head()

In [None]:
# split dataset into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(stroke_data, target, random_state=42)

In [None]:
# create Random Forest Classifier object
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)

train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
difference = train_score - score

print(f"Accuracy using the Random Forest Model with gender, age, heart disease, and bmi for training: {train_score}")
print(f"Accuracy using the Random Forest Model with gender, age, heart disease, and bmi for testing: {score}")
print(f"Difference in accuracy: {difference}")

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

## Age, Hypertension, Heart Disease, Avg Glucose Level, BMI, Smoking Status

In [None]:
# split dataset in features and target variables

# target
target = stroke_subset["stroke"]
target_names = ["no", "yes"]

# features
stroke_data = stroke_subset.drop(["stroke","id", "gender","work_type","Residence_type", "ever_married"], axis=1)
feature_names = stroke_data.columns

stroke_data.head()

In [None]:
# split dataset into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(stroke_data, target, random_state=42)

In [None]:
# create Random Forest Classifier object
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)

train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
difference = train_score - score

print(f"Accuracy using the Random Forest Model with age, hypertension, heart disease, avg glucose level, bmi, and smoking status for training: {train_score}")
print(f"Accuracy using the Random Forest Model with age, hypertension, heart disease, avg glucose level, bmi, and smoking status for testing: {score}")
print(f"Difference in accuracy: {difference}")

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

## Gender and Age

In [None]:
# split dataset in features and target variables

# target
target = stroke_subset["stroke"]
target_names = ["no", "yes"]

# features
stroke_data = stroke_subset.drop(["stroke","id", "work_type","Residence_type", "ever_married", "hypertension", "heart_disease", "avg_glucose_level", "bmi", "smoking_status"], axis=1)
feature_names = stroke_data.columns

stroke_data.head()

# split dataset into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(stroke_data, target, random_state=42)

# create Random Forest Classifier object
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)

train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
difference = train_score - score

print(f"Accuracy using the Random Forest Model with gender and age for training: {train_score}")
print(f"Accuracy using the Random Forest Model with gender and for testing: {score}")
print(f"Difference in accuracy: {difference}")


In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)