In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import linear_model

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor

sns.set(style="white")

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('./df_foodaccess_allmortality_SVI.csv')

In [3]:
df.isnull().mean().sort_values(ascending = False).head(39)

Deaths Thyroid                          0.994603
Crude Rate Thyroid                      0.994603
Age Adjusted Rate Thyroid               0.994603
Crude Rate MalignantEndocrine           0.989206
Age Adjusted Rate MalignantEndocrine    0.989206
Deaths MalignantEndocrine               0.989206
Deaths Obesity                          0.964127
Age Adjusted Rate Obesity               0.964127
Crude Rate Obesity                      0.964127
Deaths Psychoactive                     0.939365
Crude Rate Psychoactive                 0.939365
Age Adjusted Rate Psychoactive          0.939365
Age Adjusted Rate Assault               0.917143
Crude Rate Assault                      0.917143
Deaths Assault                          0.917143
Deaths Pulmonary                        0.902857
Crude Rate Pulmonary                    0.902857
Age Adjusted Rate Pulmonary             0.902857
Age Adjusted Rate Metabolic             0.863175
Deaths Metabolic                        0.863175
Crude Rate Metabolic

## <font color = darkorchid> Feature Selection </font>

In [4]:
# Create feature list with percentage metrics
features_proportional_share = ['GroupQuartersFlag', 
                               'PovertyRate',
                               'LILATracts_1And10', 'LILATracts_1And20',
                               'LILATracts_Vehicle', 'LILATracts_halfAnd10', 
                               'LowIncomeTracts',
                               'LA1and10', 'LA1and20', 'LAhalfand10',
                               'lablack10share', 'lablack20share', 
                               'lahisp10share', 'lahisp20share', 
                               'lahunv10share', 'lahunv20share', 
                               'lalowi10share', 'lalowi20share', 
                               'lapop10share', 'lapop20share',
                               'lasnap10share', 'lasnap20share',
                               'lawhite10share', 'lawhite20share' 
                              ]

In [5]:
# Create feature list with population metrics
features_population = ['NUMGQTRS',
                       'MedianFamilyIncome', 
                       'LALOWI05_10', 'LALOWI1_10', 'LALOWI1_20',
                       'LAPOP05_10', 'LAPOP1_10', 'LAPOP1_20', 
                       'TractBlack', 'TractHUNV', 'TractHispanic',
                       'TractLOWI', 'TractSNAP', 'TractWhite', 
                       'lablack10','lablack20', 'lahisp10', 'lahisp20',
                       'lahunv10', 'lahunv20','lalowi10', 'lalowi20',
                       'lapop10', 'lapop20','lasnap10', 'lasnap20',
                       'lawhite10', 'lawhite20', 
                 ]

In [6]:
features_combined = features_population + features_proportional_share

In [7]:
# Set age adjusted rates as targets
target_list = ['Age Adjusted Rate Endocrine',
               'Age Adjusted Rate Obesity', 'Age Adjusted Rate Metabolic', 'Age Adjusted Rate Diabetes', 
               'Age Adjusted Rate Circulatory',
               'Age Adjusted Rate Pulmonary', 'Age Adjusted Rate Arteries', 'Age Adjusted Rate Hypertension',
               'Age Adjusted Rate External',
               'Age Adjusted Rate Assault', 'Age Adjusted Rate Intentional', 'Age Adjusted Rate Transport',
               'Age Adjusted Rate Mental',
               'Age Adjusted Rate Psychoactive',
               'Age Adjusted Rate Neoplasms', 
               'Age Adjusted Rate MalignantDigestive', 'Age Adjusted Rate MalignantBreast', 'Age Adjusted Rate MalignantFemale', 'Age Adjusted Rate MalignantMale', 'Age Adjusted Rate MalignantEndocrine',
               'Age Adjusted Rate All']

In [8]:
# Define function to consolidate r2 & MAE
def make_scores_df(r2_train, r2_test, mae_train, mae_test, model):
    r2 = pd.DataFrame(data = np.array([[np.mean(r2_train)],[np.mean(r2_test)]]), 
                 index = ['R^2 Train', 'R^2 Test'], columns = [model])

    mae = pd.DataFrame(data = np.array([[np.mean(mae_train)],[np.mean(mae_test)]]), 
                 index = ['MAE Train', 'MAE Test'], columns = [model])

    combined_scores = pd.merge(r2, mae, on = model, how = 'outer',
                               left_index = True, right_index = True)
    
    return combined_scores

## <font color = darkorchid> RandomForest </font>

In [9]:
# Drop if target value is not available
df_rf = df.dropna(subset = ['Age Adjusted Rate All'])

# Bootstrap observations
df_rf_sample = df_rf.sample(n = df_rf.shape[0], replace = True)

df_rf = pd.concat([df_rf, df_rf_sample])

# Set features to percentage metrics
X = df_rf[features_combined]
y = df_rf['Age Adjusted Rate All']

# Train / Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# StandardScale features
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [10]:
# Instantiate and fit Random Forest
rf = RandomForestRegressor(n_jobs = -1, verbose = 1)
rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.2s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=1, warm_start=False)

In [11]:
params = [{'n_estimators': [200],
           'criterion': ['mse'],
           'min_samples_split': [2, 3],
           'min_impurity_split': [0.0, 0.1],
           'max_depth': [None],
           'max_features': ['sqrt', 'auto'],
           'min_samples_leaf': [1],
           'bootstrap': [False],
           'warm_start': [False]
         }]

rf_GS = GridSearchCV(rf, params,  verbose = 1).fit(X_train, y_train)

In [13]:
print(rf_GS.best_score_)
print(rf_GS.best_params_)

0.728588310551414
{'bootstrap': False, 'criterion': 'mse', 'max_depth': None, 'max_features': 'sqrt', 'min_impurity_split': 0.1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200, 'warm_start': False}
