# Random Forest Model

In [None]:
# import required libraries
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import os

In [None]:
# read the combined subsets CSV into a DataFrame and preview its first rows
stroke_subset = pd.read_csv(filepath_or_buffer="data/combined_subsets.csv")
stroke_subset.head(n=5)

In [None]:
# separate the label column from the predictor columns

# label: the "stroke" column
# NOTE(review): presumably "no"/"yes" name the two stroke classes — confirm
target_names = ["no", "yes"]
target = stroke_subset["stroke"]

# predictors: every column except the label and the "id" column
stroke_data = stroke_subset.drop(columns=["stroke", "id"])
feature_names = stroke_data.columns

# preview the feature table
stroke_data.head()

## Using All Features

In [None]:
# hold out part of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    stroke_data,
    target,
    random_state=42,
)

In [None]:
# fit a 200-tree random forest on the training split; the fixed seed makes the
# forest (and therefore the scores and feature importances) repeatable across
# runs, matching the seeded train/test split
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# accuracy on the rows the model was fit on vs. the held-out rows
train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
# gap between training and testing accuracy
difference = train_score - score

print(f"Accuracy using the Random Forest Model for training: {train_score}")
print(f"Accuracy using the Random Forest Model for testing: {score}")
print(f"Difference in accuracy: {difference}")

In [None]:
# pair each importance with its feature name, largest importance first
sorted(
    zip(rf.feature_importances_, feature_names),
    reverse=True,
)

## With Max Depth = 3

In [None]:
# fit a 200-tree random forest whose trees are capped at depth 3; the fixed
# seed makes the scores and feature importances repeatable across runs
rf = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=42)
rf.fit(X_train, y_train)

# accuracy on the rows the model was fit on vs. the held-out rows
max_depth3_train_score = rf.score(X_train, y_train)
max_depth3_score = rf.score(X_test, y_test)
# gap between training and testing accuracy
difference = max_depth3_train_score - max_depth3_score

print(f"Accuracy using the Random Forest Model with max depth of 3 for training: {max_depth3_train_score}")
print(f"Accuracy using the Random Forest Model with max depth of 3 for testing: {max_depth3_score}")
print(f"Difference in accuracy: {difference}")

In [None]:
# pair each importance with its feature name, largest importance first
sorted(
    zip(rf.feature_importances_, feature_names),
    reverse=True,
)

## With Min Samples Split = 10

In [None]:
# fit a random forest that only splits nodes holding at least 10 samples.
# n_estimators is kept at 200 like the other experiments so that
# min_samples_split is the only hyperparameter that differs from the baseline;
# the fixed seed makes the scores and feature importances repeatable
rf = RandomForestClassifier(n_estimators=200, min_samples_split=10, random_state=42)
rf.fit(X_train, y_train)

# accuracy on the rows the model was fit on vs. the held-out rows
min_sample10_train_score = rf.score(X_train, y_train)
min_sample10_score = rf.score(X_test, y_test)
# gap between training and testing accuracy
difference = min_sample10_train_score - min_sample10_score

print(f"Accuracy using the Random Forest Model with minimum sample of 10 for training: {min_sample10_train_score}")
print(f"Accuracy using the Random Forest Model with minimum sample of 10 for testing: {min_sample10_score}")
print(f"Difference in accuracy: {difference}")

In [None]:
# pair each importance with its feature name, largest importance first
sorted(
    zip(rf.feature_importances_, feature_names),
    reverse=True,
)

## Gender, Age, Hypertension, Heart Disease, Ever Married, BMI, Smoking Status

In [None]:
# separate the label column from the predictor columns

# label: the "stroke" column
# NOTE(review): presumably "no"/"yes" name the two stroke classes — confirm
target_names = ["no", "yes"]
target = stroke_subset["stroke"]

# predictors: drop the label, the id, and the columns excluded from this run
stroke_data = stroke_subset.drop(
    columns=["stroke", "id", "work_type", "Residence_type", "avg_glucose_level"]
)
feature_names = stroke_data.columns

# preview the feature table
stroke_data.head()

In [None]:
# hold out part of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    stroke_data,
    target,
    random_state=42,
)

In [None]:
# fit a 200-tree random forest on the reduced feature set; the fixed seed
# makes the scores and feature importances repeatable across runs
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# accuracy on the rows the model was fit on vs. the held-out rows
train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
# gap between training and testing accuracy
difference = train_score - score

print(f"Accuracy using the Random Forest Model without work type, residence type, or average glucose level for training: {train_score}")
print(f"Accuracy using the Random Forest Model without work type, residence type, or average glucose level for testing: {score}")
print(f"Difference in accuracy: {difference}")

In [None]:
# pair each importance with its feature name, largest importance first
sorted(
    zip(rf.feature_importances_, feature_names),
    reverse=True,
)

## Gender, Age, Hypertension, Heart Disease, Ever Married, Avg Glucose Level, BMI, Smoking Status

In [None]:
# separate the label column from the predictor columns

# label: the "stroke" column
# NOTE(review): presumably "no"/"yes" name the two stroke classes — confirm
target_names = ["no", "yes"]
target = stroke_subset["stroke"]

# predictors: drop the label, the id, and the columns excluded from this run
stroke_data = stroke_subset.drop(
    columns=["stroke", "id", "work_type", "Residence_type"]
)
feature_names = stroke_data.columns

# preview the feature table
stroke_data.head()

In [None]:
# hold out part of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    stroke_data,
    target,
    random_state=42,
)

In [None]:
# fit a 200-tree random forest on the reduced feature set; the fixed seed
# makes the scores and feature importances repeatable across runs
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# accuracy on the rows the model was fit on vs. the held-out rows
train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
# gap between training and testing accuracy
difference = train_score - score

print(f"Accuracy using the Random Forest Model without work type and residence type for training: {train_score}")
print(f"Accuracy using the Random Forest Model without work type and residence type for testing: {score}")
print(f"Difference in accuracy: {difference}")

In [None]:
# pair each importance with its feature name, largest importance first
sorted(
    zip(rf.feature_importances_, feature_names),
    reverse=True,
)

## Gender, Age, Hypertension, Heart Disease, Avg Glucose Level, BMI, Smoking Status

In [None]:
# separate the label column from the predictor columns

# label: the "stroke" column
# NOTE(review): presumably "no"/"yes" name the two stroke classes — confirm
target_names = ["no", "yes"]
target = stroke_subset["stroke"]

# predictors: drop the label, the id, and the columns excluded from this run
stroke_data = stroke_subset.drop(
    columns=["stroke", "id", "work_type", "Residence_type", "ever_married"]
)
feature_names = stroke_data.columns

# preview the feature table
stroke_data.head()

In [None]:
# hold out part of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    stroke_data,
    target,
    random_state=42,
)

In [None]:
# fit a 200-tree random forest on the reduced feature set; the fixed seed
# makes the scores and feature importances repeatable across runs
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# accuracy on the rows the model was fit on vs. the held-out rows
train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
# gap between training and testing accuracy
difference = train_score - score

print(f"Accuracy using the Random Forest Model without work type, residence type, or ever married for training: {train_score}")
print(f"Accuracy using the Random Forest Model without work type, residence type, or ever married for testing: {score}")
print(f"Difference in accuracy: {difference}")

In [None]:
# pair each importance with its feature name, largest importance first
sorted(
    zip(rf.feature_importances_, feature_names),
    reverse=True,
)

## Gender, Age, Heart Disease, BMI

In [None]:
# separate the label column from the predictor columns

# label: the "stroke" column
# NOTE(review): presumably "no"/"yes" name the two stroke classes — confirm
target_names = ["no", "yes"]
target = stroke_subset["stroke"]

# predictors: drop the label, the id, and the columns excluded from this run
stroke_data = stroke_subset.drop(
    columns=[
        "stroke",
        "id",
        "work_type",
        "Residence_type",
        "ever_married",
        "hypertension",
        "avg_glucose_level",
        "smoking_status",
    ]
)
feature_names = stroke_data.columns

# preview the feature table
stroke_data.head()

In [None]:
# hold out part of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    stroke_data,
    target,
    random_state=42,
)

In [None]:
# fit a 200-tree random forest on the reduced feature set; the fixed seed
# makes the scores and feature importances repeatable across runs
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# accuracy on the rows the model was fit on vs. the held-out rows
train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
# gap between training and testing accuracy
difference = train_score - score

print(f"Accuracy using the Random Forest Model with gender, age, heart disease, and bmi for training: {train_score}")
print(f"Accuracy using the Random Forest Model with gender, age, heart disease, and bmi for testing: {score}")
print(f"Difference in accuracy: {difference}")

In [None]:
# pair each importance with its feature name, largest importance first
sorted(
    zip(rf.feature_importances_, feature_names),
    reverse=True,
)

## Age, Hypertension, Heart Disease, Avg Glucose Level, BMI, Smoking Status

In [None]:
# separate the label column from the predictor columns

# label: the "stroke" column
# NOTE(review): presumably "no"/"yes" name the two stroke classes — confirm
target_names = ["no", "yes"]
target = stroke_subset["stroke"]

# predictors: drop the label, the id, and the columns excluded from this run
stroke_data = stroke_subset.drop(
    columns=[
        "stroke",
        "id",
        "gender",
        "work_type",
        "Residence_type",
        "ever_married",
    ]
)
feature_names = stroke_data.columns

# preview the feature table
stroke_data.head()

In [None]:
# hold out part of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    stroke_data,
    target,
    random_state=42,
)

In [None]:
# fit a 200-tree random forest on the reduced feature set; the fixed seed
# makes the scores and feature importances repeatable across runs
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# accuracy on the rows the model was fit on vs. the held-out rows
train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
# gap between training and testing accuracy
difference = train_score - score

print(f"Accuracy using the Random Forest Model with age, hypertension, heart disease, avg glucose level, bmi, and smoking status for training: {train_score}")
print(f"Accuracy using the Random Forest Model with age, hypertension, heart disease, avg glucose level, bmi, and smoking status for testing: {score}")
print(f"Difference in accuracy: {difference}")

In [None]:
# pair each importance with its feature name, largest importance first
sorted(
    zip(rf.feature_importances_, feature_names),
    reverse=True,
)

## Gender and Age

In [None]:
# split dataset in features and target variables

# target: the "stroke" column
target = stroke_subset["stroke"]
target_names = ["no", "yes"]

# features: whatever remains after dropping the label, the id, and the
# columns excluded from this run
stroke_data = stroke_subset.drop(["stroke","id", "work_type","Residence_type", "ever_married", "hypertension", "heart_disease", "avg_glucose_level", "bmi", "smoking_status"], axis=1)
feature_names = stroke_data.columns

stroke_data.head()

# split dataset into training set and test set; the fixed seed keeps the
# split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(stroke_data, target, random_state=42)

# fit a 200-tree random forest; the fixed seed makes the scores and feature
# importances repeatable across runs, matching the seeded split
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# accuracy on the rows the model was fit on vs. the held-out rows
train_score = rf.score(X_train, y_train)
score = rf.score(X_test, y_test)
# gap between training and testing accuracy
difference = train_score - score

print(f"Accuracy using the Random Forest Model with gender and age for training: {train_score}")
# message previously read "with gender and for testing" (missing "age")
print(f"Accuracy using the Random Forest Model with gender and age for testing: {score}")
print(f"Difference in accuracy: {difference}")


In [None]:
# pair each importance with its feature name, largest importance first
sorted(
    zip(rf.feature_importances_, feature_names),
    reverse=True,
)