In [119]:
import numpy as np
import os
import pandas as pd
import nbimporter
from pred import preprocess_data
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [120]:
# Read the cleaned CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer="../cleaned_data/heart_disease_mortality_cleaned.csv"
)

In [121]:
# Predictor columns selected from df (one per line so additions/removals diff cleanly).
X = df[
    [
        "Year",
        "LocationAbbr",
        "LocationDesc",
        "GeographicLevel",
        "Data_Value_Unit",
        "Data_Value_Type",
        "Sex",
        "ethnicity",
        "LocationID",
        "Y_lat",
        "X_lon",
        "Georeference",
    ]
]

# Regression target column.
y = df["Heart Disease Mortality"]

In [122]:
# Seeded hold-out split of (X, y).
# NOTE(review): when the cells run in order, the next cell rebinds X_train,
# X_test, y_train and y_test from preprocess_data(df), so this split is not the
# one the models below are fitted on — confirm whether this cell is still needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)


In [123]:
# preprocess_data is imported from the local `pred` notebook; it hands back the
# four split objects plus the preprocessor that both pipelines below reuse.
(
    X_train,
    X_test,
    y_train,
    y_test,
    preprocessor,
) = preprocess_data(df)

In [124]:
# Baseline pipeline: preprocessing followed by a seeded random forest.
# make_pipeline names the steps automatically; the forest step is looked up
# further down under the key "randomforestregressor".
select_feature_model = make_pipeline(
    preprocessor,
    RandomForestRegressor(random_state=42),
)

In [125]:
# Fit the baseline pipeline on the training split (the fitted pipeline is the cell's output).
select_feature_model.fit(X_train, y=y_train)

In [126]:
# Baseline predictions for the train and test splits, in that order.
y_hat_train, y_hat_test = (
    select_feature_model.predict(features) for features in (X_train, X_test)
)

In [127]:
# Pull the fitted forest out of the pipeline by its auto-generated step name
# and print its importance vector (one value per feature the forest was fitted on).
forest_step = select_feature_model.named_steps["randomforestregressor"]
print(forest_step.feature_importances_)

[0.00000000e+00 1.96053581e-01 1.99248335e-01 1.59459400e-04
 7.16769133e-04 9.69767563e-04 4.95641803e-05 1.78227717e-04
 7.15149682e-04 1.81673259e-03 1.04075715e-04 1.89303371e-04
 1.79547013e-04 1.57421242e-03 1.05426813e-03 4.10396604e-04
 2.22119222e-04 6.61586724e-04 2.56710231e-04 5.89426057e-04
 3.39215408e-04 7.62713177e-03 1.17672463e-03 1.45356934e-03
 2.73738741e-04 5.02020855e-04 1.75696226e-05 1.49303097e-03
 3.37324666e-03 1.80861871e-03 1.02418592e-05 1.57888258e-03
 1.49610656e-03 1.64561383e-03 3.91600015e-03 6.03985806e-04
 5.18872457e-05 3.63982114e-04 3.36429845e-04 2.43827036e-03
 1.11568776e-03 1.13421148e-03 1.19811179e-02 9.30150874e-04
 9.95990828e-04 3.18359771e-04 3.76345740e-05 5.20615509e-04
 8.29322238e-04 7.51532476e-04 4.20404971e-03 1.40248215e-05
 4.61957068e-04 1.26398217e-03 9.47505283e-06 6.20036747e-05
 4.38983300e-04 1.15924853e-03 1.03954409e-03 5.81952130e-04
 1.00148141e-03 1.56624371e-05 1.00844409e-03 1.56902183e-02
 1.52910454e-01 1.972382

In [128]:
# Same preprocessing + seeded forest, but with explicit step names so the
# search grid can address the forest's parameters via the "regressor__" prefix.
rf_regressor = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
)

In [129]:
# Hyper-parameter grid for the "regressor" pipeline step:
# 3 forest sizes x 4 depth limits x 3 max_features settings = 36 candidates.
randomforest_search_grid = dict(
    regressor__n_estimators=[50, 100, 200],
    regressor__max_depth=[None, 1, 2, 3],
    regressor__max_features=[None, "sqrt", "log2"],
)

In [130]:
# Exhaustive 3-fold search over the grid, ranked by negated RMSE
# (scores are negative; closer to zero is better). verbose=3 logs every fold.
model = GridSearchCV(
    estimator=rf_regressor,
    param_grid=randomforest_search_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
    verbose=3,
)

In [131]:
# Run the grid search on the training split (36 candidates x 3 folds = 108 fits).
model.fit(X_train, y=y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV 1/3] END regressor__max_depth=None, regressor__max_features=None, regressor__n_estimators=50;, score=-82.377 total time=  10.1s
[CV 2/3] END regressor__max_depth=None, regressor__max_features=None, regressor__n_estimators=50;, score=-72.856 total time=   9.8s
[CV 3/3] END regressor__max_depth=None, regressor__max_features=None, regressor__n_estimators=50;, score=-85.589 total time=  10.3s
[CV 1/3] END regressor__max_depth=None, regressor__max_features=None, regressor__n_estimators=100;, score=-81.853 total time=  19.8s
[CV 2/3] END regressor__max_depth=None, regressor__max_features=None, regressor__n_estimators=100;, score=-72.393 total time=  19.6s
[CV 3/3] END regressor__max_depth=None, regressor__max_features=None, regressor__n_estimators=100;, score=-85.152 total time=  20.0s
[CV 1/3] END regressor__max_depth=None, regressor__max_features=None, regressor__n_estimators=200;, score=-81.447 total time=  39.5s
[CV 2/3] E

In [132]:
# Predictions from the fitted search object for the train and test splits, in
# that order; these replace the baseline predictions held under the same names.
y_hat_train, y_hat_test = (
    model.predict(features) for features in (X_train, X_test)
)

In [133]:
# Error of the tuned model's predictions on each split.
train_mse = mean_squared_error(y_train, y_hat_train)
test_mse = mean_squared_error(y_test, y_hat_test)

# The grid search is scored with "neg_root_mean_squared_error", so its fold
# scores are RMSE values. Report RMSE next to MSE so the hold-out numbers can be
# compared directly with the cross-validation log instead of on a squared scale.
train_rmse = train_mse ** 0.5
test_rmse = test_mse ** 0.5

# Label every value: bare numbers are easy to misread once the cell scrolls away.
print(f"Train MSE: {train_mse} (RMSE: {train_rmse})")
print(f"Test MSE: {test_mse} (RMSE: {test_rmse})")
print(f"Best params: {model.best_params_}")

776.5474802171799
6099.035328371916
{'regressor__max_depth': None, 'regressor__max_features': None, 'regressor__n_estimators': 200}
