In [40]:
import numpy as np
import os
import pandas as pd
import nbimporter
from pred import preprocess_data
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV

In [41]:
# Load the merged dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../cleaned_data/merged_data.csv')

In [42]:
# Columns of `df` used as model inputs, kept in their original order.
feature_columns = [
    'County',
    'State',
    'Total',
    'Less than $10,000',
    '$10,000 to $14,999',
    '$15,000 to $24,999',
    '$25,000 to $34,999',
    '$35,000 to $49,999',
    '$50,000 to $74,999',
    '$75,000 to $99,999',
    '$100,000 to $149,999',
    '$150,000 to $199,999',
    '$200,000 or more',
    'Median income',
    'Mean income',
    'Sex',
    'ethnicity',
]
# Column of `df` used as the regression target.
target_column = 'Heart Disease Mortality'

X = df[feature_columns]
y = df[target_column]

In [43]:
# The train/test split comes from `preprocess_data(df)` in the next cell, which
# binds X_train, X_test, y_train, y_test and `preprocessor` in one call.
# Splitting the raw X / y here as well would be overwritten immediately and never
# reach the model, so no split is done in this cell.


In [44]:
# `preprocess_data` comes from the local `pred` notebook (imported via nbimporter).
# Its result is unpacked into five values: the four train/test pieces and a
# `preprocessor` object that is later placed as the first pipeline step.
# NOTE(review): the exact contents of each piece depend on `pred.preprocess_data`,
# which is not visible here — confirm against that notebook.
preprocessed = preprocess_data(df)
X_train, X_test, y_train, y_test, preprocessor = preprocessed

In [45]:
# Random forest with a fixed seed so repeated runs build the same trees.
forest = RandomForestRegressor(random_state=42)

# Chain preprocessing and the forest. `make_pipeline` names each step after its
# lowercased class name, so the forest step is "randomforestregressor" — the
# prefix the hyperparameter grid keys rely on.
rf_regressor = make_pipeline(preprocessor, forest)

In [46]:
# Candidate hyperparameter values for the random forest step of the pipeline.
# Keys use the "<step name>__<parameter>" form so the search can route each
# value to the "randomforestregressor" step.
randomforest_search_grid = dict(
    randomforestregressor__n_estimators=[50, 100, 200, 300],
    randomforestregressor__max_depth=[None, 3, 5, 7],
    randomforestregressor__max_features=[None, "sqrt", "log2"],
    randomforestregressor__min_samples_split=range(2, 5),  # 2, 3, 4
    randomforestregressor__min_samples_leaf=range(1, 5),  # 1, 2, 3, 4
)

In [47]:
# Settings for the randomized hyperparameter search over the pipeline.
search_settings = {
    "estimator": rf_regressor,
    "param_distributions": randomforest_search_grid,
    "n_iter": 50,  # number of parameter combinations sampled
    "scoring": "neg_root_mean_squared_error",  # candidates are ranked by (negated) RMSE
    "cv": 3,
    "n_jobs": -1,  # use all available processors
    "random_state": 42,  # fixes which combinations get sampled
}
model = RandomizedSearchCV(**search_settings)

In [48]:
# Run the randomized search on the training split. The call stays the last
# expression of the cell so the notebook displays the fitted search object.
model.fit(X=X_train, y=y_train)

In [49]:
# Predict on the training split first, then the test split, with the fitted search.
y_hat_train, y_hat_test = (
    model.predict(features) for features in (X_train, X_test)
)

In [50]:
# Mean squared error on each split. Note this is MSE, whereas the search
# itself ranked candidates by RMSE.
train_mse, test_mse = (
    mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_hat_train), (y_test, y_hat_test))
)

# One value per line: train MSE, test MSE, then the winning hyperparameters.
print(train_mse, test_mse, model.best_params_, sep="\n")

4840.610296505682
6868.6167798630395
{'randomforestregressor__n_estimators': 200, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__min_samples_leaf': 4, 'randomforestregressor__max_features': None, 'randomforestregressor__max_depth': 7}
