In [11]:
import numpy as np
import os
import pandas as pd
import nbimporter
from pred import preprocess_data
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV

In [12]:
df = pd.read_csv('../cleaned_data/merged_data.csv')

In [13]:
X = df[['County', 'State', 'Median income', 'Mean income', 'Sex', 'ethnicity']]
y = df['Heart Disease Mortality']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [15]:
X_train, X_test, y_train, y_test, preprocessor = preprocess_data(df)

In [16]:
rf_regressor = make_pipeline(preprocessor, PolynomialFeatures(), RandomForestRegressor(random_state = 42))

In [17]:
randomforest_search_grid = {
    "polynomialfeatures__degree": range(0, 3),
    "randomforestregressor__n_estimators": [50, 100, 200, 300],
    "randomforestregressor__max_depth": [None, 3, 5, 7],
    "randomforestregressor__max_features": [None, "sqrt", "log2"],
    "randomforestregressor__min_samples_split": range(2, 5),
    "randomforestregressor__min_samples_leaf": range(1, 5)
}

In [18]:
model = RandomizedSearchCV(estimator = rf_regressor, 
                                         param_distributions = randomforest_search_grid, 
                                         n_iter = 50, scoring = "neg_root_mean_squared_error", 
                                         cv = 3, n_jobs = -1, random_state = 42)

In [19]:
model.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
y_hat_train = model.predict(X_train)
y_hat_test = model.predict(X_test)

In [None]:
train_mse = mean_squared_error(y_train, y_hat_train)
test_mse = mean_squared_error(y_test, y_hat_test)

print(train_mse)
print(test_mse)
print(model.best_params_)

5102.346092519032
6902.595358688841
{'randomforestregressor__n_estimators': 100, 'randomforestregressor__min_samples_split': 3, 'randomforestregressor__min_samples_leaf': 3, 'randomforestregressor__max_features': None, 'randomforestregressor__max_depth': None}
