In [None]:
# Import dependencies
import os

import numpy as np
import pandas as pd
import psycopg2
import tensorflow as tf
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [None]:
# Set up connection to database
# Connection settings are read from environment variables (libpq-style names) so
# that no credentials are stored in the notebook or its version history.
# PGPASSWORD is required; the other settings fall back to the project defaults.
engine = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "stroke_dataset"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ["PGPASSWORD"],  # raises KeyError if the secret is not configured
    host=os.environ.get("PGHOST", "final-project.cpi4gz0jwsu4.us-west-1.rds.amazonaws.com"),
    port=os.environ.get("PGPORT", "5432"),
)

cursor = engine.cursor()

In [None]:
# Query the working dataset: identifiers, the Num_Per_100k target and the
# candidate predictor columns from public."All_Data"
sql = """
SELECT "FIPS", "Num_Per_100k", "State", "County", "Percent_Smokers", "Percent_Obese", "Food_Environment_Index", "Percent_Access_Exercise", "Primary_Care_Physician_Rate", "Mental_Health_Providers_Rate", "Percent_Some_College", "Percent_Unemployed", "Income_Inequality_Ratio", "Violent_Crime_Rate", "Average_Daily_Air_Polution_Particle_Matter", "Percent_Long_Commute_Drives_Alone", "Percent_Rural"
FROM public."All_Data"
"""

# Run the query over the open connection and load the result set into a DataFrame
all_data_df = pd.read_sql(sql=sql, con=engine)
all_data_df.head()

In [None]:
# Count missing values per column (summing the null mask down the rows)
all_data_df.isnull().sum(axis=0)

FIPS                                          0
Num_Per_100k                                  0
State                                         0
County                                        0
Percent_Smokers                               0
Percent_Obese                                 0
Food_Environment_Index                        0
Percent_Access_Exercise                       0
Primary_Care_Physician_Rate                   0
Mental_Health_Providers_Rate                  0
Percent_Some_College                          0
Percent_Unemployed                            0
Income_Inequality_Ratio                       0
Violent_Crime_Rate                            0
Average_Daily_Air_Polution_Particle_Matter    0
Percent_Long_Commute_Drives_Alone             0
Percent_Rural                                 0
dtype: int64

In [None]:
# Remove every row that contains at least one missing value
all_data_df = all_data_df.dropna(axis="index")

In [None]:
# Drop identifier columns (FIPS code, state and county names are not predictors).
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally was deprecated in pandas 1.3 and raises TypeError in pandas 2.0.
all_data_usable_df = all_data_df.drop(columns=["FIPS", "State", "County"])
all_data_usable_df.head()

  


Unnamed: 0,Num_Per_100k,Percent_Smokers,Percent_Obese,Food_Environment_Index,Percent_Access_Exercise,Primary_Care_Physician_Rate,Mental_Health_Providers_Rate,Percent_Some_College,Percent_Unemployed,Income_Inequality_Ratio,Violent_Crime_Rate,Average_Daily_Air_Polution_Particle_Matter,Percent_Long_Commute_Drives_Alone,Percent_Rural
0,107.0,19.0,36.0,7.1,71.0,45.0,13.0,61.9,5.3,4.4,266.0,10.8,41.0,42.0
1,82.5,17.0,29.0,7.9,69.0,73.0,89.0,63.8,5.4,4.6,217.0,9.6,40.0,42.3
2,96.1,22.0,44.0,5.5,53.0,42.0,8.0,39.7,8.6,5.9,329.0,10.1,35.0,67.8
3,112.3,20.0,38.0,7.6,49.0,53.0,9.0,49.8,6.6,4.2,147.0,10.5,49.0,68.4
4,96.1,20.0,36.0,8.5,32.0,21.0,9.0,53.9,5.5,4.1,212.0,11.3,60.0,90.0


Random Forest Regressor - all features

In [None]:
# https://stackabuse.com/random-forest-algorithm-with-python-and-scikit-learn/
# Divide data into attributes and labels

# Select by column name rather than by position: the previous positional slice
# (1:13) stopped one column short and left the last feature, Percent_Rural,
# out of the "all features" model.
X = all_data_usable_df.drop(columns="Num_Per_100k").values
y = all_data_usable_df["Num_Per_100k"].values

In [None]:
# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, random_state=0, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize features: statistics are learned from the training split only and
# then applied to both splits (tuple elements are evaluated left to right, so
# the scaler is fitted before the test split is transformed)
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

In [None]:
# Fit a 20-tree random forest regressor and predict on the held-out split
regressor = RandomForestRegressor(random_state=0, n_estimators=20).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Error metrics; MSE is computed once and reused for the RMSE
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Same model with the forest grown to 50 trees
regressor2 = RandomForestRegressor(random_state=0, n_estimators=50).fit(X_train, y_train)
y_pred2 = regressor2.predict(X_test)

# Error metrics; MSE is computed once and reused for the RMSE
mae = metrics.mean_absolute_error(y_test, y_pred2)
mse = metrics.mean_squared_error(y_test, y_pred2)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Same model with 30 trees
regressor3 = RandomForestRegressor(random_state=0, n_estimators=30).fit(X_train, y_train)
y_pred3 = regressor3.predict(X_test)

# Error metrics; MSE is computed once and reused for the RMSE
mae = metrics.mean_absolute_error(y_test, y_pred3)
mse = metrics.mean_squared_error(y_test, y_pred3)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Random Forest Regressor - health features

In [None]:
# Drop non-health feature columns, keeping the target plus the health features.
# `columns=` replaces the positional axis argument, which was deprecated in
# pandas 1.3 and raises TypeError in pandas 2.0.
health_features_df = all_data_usable_df.drop(
    columns=[
        "Percent_Rural",
        "Percent_Long_Commute_Drives_Alone",
        "Average_Daily_Air_Polution_Particle_Matter",
        "Violent_Crime_Rate",
        "Income_Inequality_Ratio",
        "Percent_Unemployed",
        "Percent_Some_College",
    ]
)
health_features_df.head()

In [None]:
# Divide into attributes and labels
# Build X from the health-only frame (previously this sliced the full frame by
# position, which matched the health columns only because of their ordering).
X = health_features_df.drop(columns="Num_Per_100k").values
y = health_features_df["Num_Per_100k"].values

In [None]:
# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, random_state=0, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize features: statistics are learned from the training split only and
# then applied to both splits (tuple elements are evaluated left to right, so
# the scaler is fitted before the test split is transformed)
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

In [None]:
# Fit a 20-tree random forest regressor and predict on the held-out split
regressor = RandomForestRegressor(random_state=0, n_estimators=20).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Error metrics; MSE is computed once and reused for the RMSE
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Same model with the forest grown to 50 trees
regressor2 = RandomForestRegressor(random_state=0, n_estimators=50).fit(X_train, y_train)
y_pred2 = regressor2.predict(X_test)

# Error metrics; MSE is computed once and reused for the RMSE
mae = metrics.mean_absolute_error(y_test, y_pred2)
mse = metrics.mean_squared_error(y_test, y_pred2)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Same model with 30 trees
regressor3 = RandomForestRegressor(random_state=0, n_estimators=30).fit(X_train, y_train)
y_pred3 = regressor3.predict(X_test)

# Error metrics; MSE is computed once and reused for the RMSE
mae = metrics.mean_absolute_error(y_test, y_pred3)
mse = metrics.mean_squared_error(y_test, y_pred3)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Random Forest Regressor - environment / economic features

In [None]:
# Drop the health feature columns, keeping the target plus the
# environment / economic features.
# `columns=` replaces the positional axis argument, which was deprecated in
# pandas 1.3 and raises TypeError in pandas 2.0.
ee_features_df = all_data_usable_df.drop(
    columns=[
        "Percent_Smokers",
        "Percent_Obese",
        "Food_Environment_Index",
        "Percent_Access_Exercise",
        "Primary_Care_Physician_Rate",
        "Mental_Health_Providers_Rate",
    ]
)
ee_features_df.head()

In [None]:
# Divide into attributes and labels
# Build X from the environment / economic frame. The previous code sliced
# columns 1:6 of the full frame, which are health features, so this section
# was not actually modelling the environment / economic predictors.
X = ee_features_df.drop(columns="Num_Per_100k").values
y = ee_features_df["Num_Per_100k"].values

In [None]:
# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, random_state=0, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize features: statistics are learned from the training split only and
# then applied to both splits (tuple elements are evaluated left to right, so
# the scaler is fitted before the test split is transformed)
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

In [None]:
# Fit a 20-tree random forest regressor and predict on the held-out split
regressor = RandomForestRegressor(random_state=0, n_estimators=20).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Error metrics; MSE is computed once and reused for the RMSE
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Same model with the forest grown to 50 trees
regressor2 = RandomForestRegressor(random_state=0, n_estimators=50).fit(X_train, y_train)
y_pred2 = regressor2.predict(X_test)

# Error metrics; MSE is computed once and reused for the RMSE
mae = metrics.mean_absolute_error(y_test, y_pred2)
mse = metrics.mean_squared_error(y_test, y_pred2)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Same model with 30 trees
regressor3 = RandomForestRegressor(random_state=0, n_estimators=30).fit(X_train, y_train)
y_pred3 = regressor3.predict(X_test)

# Error metrics; MSE is computed once and reused for the RMSE
mae = metrics.mean_absolute_error(y_test, y_pred3)
mse = metrics.mean_squared_error(y_test, y_pred3)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Random Forest Regressor - features of importance