# A basic template to apply any models to train on the collected data

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import zscore
from sklearn import preprocessing
pd.set_option('display.max_columns', None)
import graphviz
plt.rcParams['figure.figsize'] = [18, 10]

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split



In [2]:
trainDF = pd.read_csv('../outputs/trainXY.csv',parse_dates=['date_implement'])
trainDF = trainDF.sample(frac=1)

In [3]:
trainDF.head()
trainDF = trainDF.dropna()

# Selecting a unique policy

In [4]:
policies = trainDF['policy'].unique()
policy_selected = policies[6]
print(f'Selected Policy: {policy_selected}')
trainDF = trainDF[trainDF['policy']==policy_selected]

Selected Policy: C4_Restrictions on gatherings


In [5]:
# Finalizing Indexes so we can drop state names 
trainDF = trainDF.reset_index(drop=True)

# Get a list of columns with features 
columns = trainDF.columns.to_list()

In [6]:
Y_columns = [x for x in columns if 'Y_' in x]

In [7]:
X_columns = [x for x in columns if 'Y_' not in x and 'Unnamed' not in x]

X_columns.remove('date_implement')
# X_columns.remove('policy')
X_columns.remove('state_x')
X_columns.remove('policy_type')
X_columns.remove('stateName')
X_columns.remove('CEN_stateCode')
X_columns.remove('submission_date')
X_columns.remove('state_y')

X_columns.remove('caseInterpolate_gauss3')
X_columns.remove('caseInterpolate_MA7')
X_columns.remove('caseInterpolate_savitzky31_3')
X_columns.remove('caseInterpolate_gauss8')
X_columns.remove('new_case_zscore')
X_columns.remove('new_case')
X_columns.remove('policy')



In [8]:
X = trainDF[X_columns]
y = trainDF[Y_columns]


In [9]:
y

Unnamed: 0,Y_TREND_caseInterpolate_MA7_14,Y_TRENDQUANT_caseInterpolate_MA7_14,Y_SLOPE_caseInterpolate_MA7_14,Y_TREND_caseInterpolate_MA7_21,Y_TRENDQUANT_caseInterpolate_MA7_21,Y_SLOPE_caseInterpolate_MA7_21,Y_TREND_caseInterpolate_MA7_28,Y_TRENDQUANT_caseInterpolate_MA7_28,Y_SLOPE_caseInterpolate_MA7_28,Y_TREND_caseInterpolate_gauss8_14,Y_TRENDQUANT_caseInterpolate_gauss8_14,Y_SLOPE_caseInterpolate_gauss8_14,Y_TREND_caseInterpolate_gauss8_21,Y_TRENDQUANT_caseInterpolate_gauss8_21,Y_SLOPE_caseInterpolate_gauss8_21,Y_TREND_caseInterpolate_gauss8_28,Y_TRENDQUANT_caseInterpolate_gauss8_28,Y_SLOPE_caseInterpolate_gauss8_28,Y_TREND_caseInterpolate_gauss3_14,Y_TRENDQUANT_caseInterpolate_gauss3_14,Y_SLOPE_caseInterpolate_gauss3_14,Y_TREND_caseInterpolate_gauss3_21,Y_TRENDQUANT_caseInterpolate_gauss3_21,Y_SLOPE_caseInterpolate_gauss3_21,Y_TREND_caseInterpolate_gauss3_28,Y_TRENDQUANT_caseInterpolate_gauss3_28,Y_SLOPE_caseInterpolate_gauss3_28
0,-1.0,-3.530612,-6.877802,-1.0,-6.748299,-6.877802,-1.0,-3.663265,-6.877802,-1.0,-2.714286,-4.133846,-1.0,-3.666667,-4.133846,-1.0,-3.892857,-4.133846,-1.0,-1.857143,-5.726923,-1.0,-5.428571,-5.726923,-1.0,-4.357143,-5.726923
1,-1.0,-21.908163,-31.682088,-1.0,-22.462585,-31.682088,-1.0,-29.535714,-31.682088,-1.0,-23.714286,-28.477692,-1.0,-27.190476,-28.477692,-1.0,-31.321429,-28.477692,-1.0,-20.928571,-26.100000,-1.0,-21.476190,-26.100000,-1.0,-27.750000,-26.100000
2,1.0,6.551020,3.431538,1.0,4.367347,3.431538,1.0,3.275510,3.431538,1.0,9.214286,6.052308,1.0,7.047619,6.052308,1.0,5.500000,6.052308,1.0,8.285714,5.103077,1.0,5.952381,5.103077,1.0,4.464286,5.103077
3,-1.0,-27.959184,-18.735604,-1.0,-18.918367,-18.735604,-1.0,-15.887755,-18.735604,-1.0,-17.714286,-16.282308,-1.0,-16.476190,-16.282308,-1.0,-13.321429,-16.282308,-1.0,-21.500000,-21.546154,-1.0,-20.761905,-21.546154,-1.0,-16.857143,-21.546154
4,-1.0,-8.959184,-9.266374,-1.0,-8.809524,-9.266374,-1.0,-7.091837,-9.266374,-1.0,-3.000000,-5.043077,-1.0,-4.333333,-5.043077,-1.0,-4.392857,-5.043077,-1.0,-2.357143,-7.695385,-1.0,-6.000000,-7.695385,-1.0,-4.785714,-7.695385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,1.0,2.061224,-3.515495,-1.0,-1.721088,-3.515495,-1.0,-3.770408,-3.515495,1.0,4.214286,-0.455385,1.0,0.952381,-0.455385,-1.0,-1.678571,-0.455385,1.0,4.071429,-0.981538,1.0,0.285714,-0.981538,-1.0,-2.607143,-0.981538
227,1.0,0.336735,0.071099,1.0,0.224490,0.071099,1.0,0.168367,0.071099,1.0,6.357143,2.991538,1.0,4.333333,2.991538,1.0,3.250000,2.991538,1.0,1.714286,0.530000,1.0,1.142857,0.530000,1.0,0.857143,0.530000
228,-1.0,-99.306122,-97.343297,-1.0,-107.741497,-97.343297,-1.0,-61.984694,-97.343297,-1.0,-96.285714,-85.182308,-1.0,-87.571429,-85.182308,-1.0,-69.928571,-85.182308,-1.0,-101.285714,-104.553077,-1.0,-110.142857,-104.553077,-1.0,-76.285714,-104.553077
229,1.0,6.132653,17.615275,1.0,12.265306,17.615275,1.0,20.642857,17.615275,1.0,7.714286,13.109231,1.0,11.571429,13.109231,1.0,14.035714,13.109231,1.0,4.500000,12.485385,1.0,9.904762,12.485385,1.0,15.285714,12.485385


# Encoding Policy Names 

In [10]:
# X['policy'].unique()

In [11]:
# le = preprocessing.LabelEncoder()
# le.fit(X['policy'])
# print(le.classes_)

In [12]:
# X['policy'] = pd.Series(le.transform(X['policy']))
# original_policies_decoded = pd.Series(le.inverse_transform(X['policy']))

In [13]:
# for i in range(len(le.classes_)):
#     print(f'{i}: {le.classes_[i]}')

In [14]:
# unique_policy = X['policy'].unique()

------

-------

# Train Models Here

In [15]:
y

Unnamed: 0,Y_TREND_caseInterpolate_MA7_14,Y_TRENDQUANT_caseInterpolate_MA7_14,Y_SLOPE_caseInterpolate_MA7_14,Y_TREND_caseInterpolate_MA7_21,Y_TRENDQUANT_caseInterpolate_MA7_21,Y_SLOPE_caseInterpolate_MA7_21,Y_TREND_caseInterpolate_MA7_28,Y_TRENDQUANT_caseInterpolate_MA7_28,Y_SLOPE_caseInterpolate_MA7_28,Y_TREND_caseInterpolate_gauss8_14,Y_TRENDQUANT_caseInterpolate_gauss8_14,Y_SLOPE_caseInterpolate_gauss8_14,Y_TREND_caseInterpolate_gauss8_21,Y_TRENDQUANT_caseInterpolate_gauss8_21,Y_SLOPE_caseInterpolate_gauss8_21,Y_TREND_caseInterpolate_gauss8_28,Y_TRENDQUANT_caseInterpolate_gauss8_28,Y_SLOPE_caseInterpolate_gauss8_28,Y_TREND_caseInterpolate_gauss3_14,Y_TRENDQUANT_caseInterpolate_gauss3_14,Y_SLOPE_caseInterpolate_gauss3_14,Y_TREND_caseInterpolate_gauss3_21,Y_TRENDQUANT_caseInterpolate_gauss3_21,Y_SLOPE_caseInterpolate_gauss3_21,Y_TREND_caseInterpolate_gauss3_28,Y_TRENDQUANT_caseInterpolate_gauss3_28,Y_SLOPE_caseInterpolate_gauss3_28
0,-1.0,-3.530612,-6.877802,-1.0,-6.748299,-6.877802,-1.0,-3.663265,-6.877802,-1.0,-2.714286,-4.133846,-1.0,-3.666667,-4.133846,-1.0,-3.892857,-4.133846,-1.0,-1.857143,-5.726923,-1.0,-5.428571,-5.726923,-1.0,-4.357143,-5.726923
1,-1.0,-21.908163,-31.682088,-1.0,-22.462585,-31.682088,-1.0,-29.535714,-31.682088,-1.0,-23.714286,-28.477692,-1.0,-27.190476,-28.477692,-1.0,-31.321429,-28.477692,-1.0,-20.928571,-26.100000,-1.0,-21.476190,-26.100000,-1.0,-27.750000,-26.100000
2,1.0,6.551020,3.431538,1.0,4.367347,3.431538,1.0,3.275510,3.431538,1.0,9.214286,6.052308,1.0,7.047619,6.052308,1.0,5.500000,6.052308,1.0,8.285714,5.103077,1.0,5.952381,5.103077,1.0,4.464286,5.103077
3,-1.0,-27.959184,-18.735604,-1.0,-18.918367,-18.735604,-1.0,-15.887755,-18.735604,-1.0,-17.714286,-16.282308,-1.0,-16.476190,-16.282308,-1.0,-13.321429,-16.282308,-1.0,-21.500000,-21.546154,-1.0,-20.761905,-21.546154,-1.0,-16.857143,-21.546154
4,-1.0,-8.959184,-9.266374,-1.0,-8.809524,-9.266374,-1.0,-7.091837,-9.266374,-1.0,-3.000000,-5.043077,-1.0,-4.333333,-5.043077,-1.0,-4.392857,-5.043077,-1.0,-2.357143,-7.695385,-1.0,-6.000000,-7.695385,-1.0,-4.785714,-7.695385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,1.0,2.061224,-3.515495,-1.0,-1.721088,-3.515495,-1.0,-3.770408,-3.515495,1.0,4.214286,-0.455385,1.0,0.952381,-0.455385,-1.0,-1.678571,-0.455385,1.0,4.071429,-0.981538,1.0,0.285714,-0.981538,-1.0,-2.607143,-0.981538
227,1.0,0.336735,0.071099,1.0,0.224490,0.071099,1.0,0.168367,0.071099,1.0,6.357143,2.991538,1.0,4.333333,2.991538,1.0,3.250000,2.991538,1.0,1.714286,0.530000,1.0,1.142857,0.530000,1.0,0.857143,0.530000
228,-1.0,-99.306122,-97.343297,-1.0,-107.741497,-97.343297,-1.0,-61.984694,-97.343297,-1.0,-96.285714,-85.182308,-1.0,-87.571429,-85.182308,-1.0,-69.928571,-85.182308,-1.0,-101.285714,-104.553077,-1.0,-110.142857,-104.553077,-1.0,-76.285714,-104.553077
229,1.0,6.132653,17.615275,1.0,12.265306,17.615275,1.0,20.642857,17.615275,1.0,7.714286,13.109231,1.0,11.571429,13.109231,1.0,14.035714,13.109231,1.0,4.500000,12.485385,1.0,9.904762,12.485385,1.0,15.285714,12.485385


In [16]:
X = X.apply(zscore)
y = y.apply(zscore)

In [17]:
from sklearn.datasets import load_iris
from sklearn import tree

In [18]:
clf = tree.DecisionTreeRegressor()

In [19]:
y_selected = y[['Y_TRENDQUANT_caseInterpolate_MA7_14']]

In [20]:
clf = clf.fit(X, y_selected)

In [21]:
# dot_data = tree.export_graphviz(clf, out_file=None) 
# graph = graphviz.Source(dot_data) 
# graph.render("tree") 

In [22]:
dot_data = tree.export_graphviz(clf, out_file=None, 
                     feature_names=X.columns.to_list(), 
                     filled=True, rounded=True,  
                     special_characters=True)  
graph = graphviz.Source(dot_data)  
graph.render(f'../outputs/TREEOP_{policy_selected}')

'../outputs/TREEOP_C4_Restrictions on gatherings.pdf'

# Most Important Features 

In [23]:
# Random Forest 

regr = RandomForestRegressor(max_depth=100, random_state=0)
regr.fit(X, y_selected)

  regr.fit(X, y_selected)


RandomForestRegressor(max_depth=100, random_state=0)

In [24]:
def imp_df(column_names, importances):
    data = {
        'Feature': column_names,
        'Importance': importances,
    }
    df = pd.DataFrame(data) \
        .set_index('Feature') \
        .sort_values('Importance', ascending=False)

    return df

In [25]:
base_imp = imp_df(X.columns, regr.feature_importances_)
print(base_imp)

                                                 Importance
Feature                                                    
metric_change                                      0.441014
CEN_Without Health Care Coverage                   0.047322
FD_adminAndFireResponse                            0.029803
FD_totalFireDepartments                            0.028892
FD_moreThan10kCitizens                             0.026201
S4_people_vaccinated                               0.024684
OB_Prevalence                                      0.024538
CEN_Total Housing Units                            0.023080
CEN_Employment Rate                                0.022389
FD_lessThan10kCitizens                             0.021530
CEN_Total Population                               0.018907
S3_people_fully_vaccinated                         0.018509
S2_people_vaccinated                               0.016685
S1_people_vaccinated                               0.016037
S1_people_vaccinated_per_hundred        