In [14]:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import zscore
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

plt.rcParams['figure.figsize'] = [18, 10]

In [2]:
DF = pd.read_csv('../outputs/trainXY.csv')

# Little Bit of Cleanup

### `One Important Detail`
I try to preserve state code as it is easier to later draw some conclustions, so a little bit effort but i think it will be worth it  


In [3]:
# Removing unnamed columns - remnants of previous merge 
DF = DF.loc[:, ~DF.columns.str.contains('^Unnamed')]


In [4]:
# Dropping NA to be able to standardize easily later 

print('Size before dropna : ',len(DF))
DF = DF.dropna()
print('Size after dropna : ',len(DF))

Size before dropna :  1788
Size after dropna :  1634


In [5]:
columns = DF.columns.to_list()

In [6]:
DF.head()

Unnamed: 0,date_implement,policy,metric_change,state_x,policy_type,stateName,CEN_stateCode,CEN_Bachelor's Degree or Higher,CEN_Employment Rate,CEN_Hispanic or Latino (of any race),CEN_Median Household Income,CEN_Total Employer Establishments,CEN_Total Households,CEN_Total Housing Units,CEN_Total Population,CEN_Unnamed: 0,CEN_Unnamed: 0.1,CEN_Without Health Care Coverage,CP_0_diff,CP_1_diff,CP_2_diff,FD_adminAndFireResponse,FD_fireResponseOnly,FD_lessThan10kCitizens,FD_moreThan10kCitizens,FD_totalFireDepartments,HB_bed_For-Profit,HB_bed_Non-Profit,HB_bed_State/Local Government,HB_bed_Total,OB_Prevalence,SP_Governor Political Affiliation,SP_State Attorney General Political Affiliation,SP_State House Majority Political Affiliation,SP_State Senate Majority Political Affiliation,submission_date,state_y,new_case,caseInterpolate_MA7,caseInterpolate_savitzky31_3,caseInterpolate_gauss8,caseInterpolate_gauss3,new_case_zscore,caseInterpolate_MA7_7_trend,caseInterpolate_MA7_14_trend,caseInterpolate_MA7_21_trend,caseInterpolate_MA7_28_trend,caseInterpolate_gauss8_7_trend,caseInterpolate_gauss8_14_trend,caseInterpolate_gauss8_21_trend,caseInterpolate_gauss8_28_trend,caseInterpolate_gauss3_7_trend,caseInterpolate_gauss3_14_trend,caseInterpolate_gauss3_21_trend,caseInterpolate_gauss3_28_trend
0,2020-03-13,C1_School closing,1.0,Alaska,0,Alaska,AK,30.2,59.3,49824.0,75463.0,21399.0,252199.0,326200.0,733391.0,1.0,0.0,12.2,230.0,353.0,655.0,181.0,0.0,181.0,8.0,189.0,0.36,1.55,0.29,2.2,31.9,0.0,0.0,0.0,0.0,2020-03-13,AK,1.0,0.142857,-21.030578,2.0,0.0,-0.752799,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
1,2020-03-16,C1_School closing,2.0,Alaska,0,Alaska,AK,30.2,59.3,49824.0,75463.0,21399.0,252199.0,326200.0,733391.0,1.0,0.0,12.2,230.0,353.0,655.0,181.0,0.0,181.0,8.0,189.0,0.36,1.55,0.29,2.2,31.9,0.0,0.0,0.0,0.0,2020-03-16,AK,0.0,0.142857,21.332012,3.0,1.0,-0.75635,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,2020-03-12,C2_Workplace closing,1.0,Alaska,0,Alaska,AK,30.2,59.3,49824.0,75463.0,21399.0,252199.0,326200.0,733391.0,1.0,0.0,12.2,230.0,353.0,655.0,181.0,0.0,181.0,8.0,189.0,0.36,1.55,0.29,2.2,31.9,0.0,0.0,0.0,0.0,2020-03-12,AK,0.0,0.0,-9.073048,1.0,0.0,-0.75635,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
3,2020-04-24,C2_Workplace closing,-1.0,Alaska,0,Alaska,AK,30.2,59.3,49824.0,75463.0,21399.0,252199.0,326200.0,733391.0,1.0,0.0,12.2,230.0,353.0,655.0,181.0,0.0,181.0,8.0,189.0,0.36,1.55,0.29,2.2,31.9,0.0,0.0,0.0,0.0,2020-04-24,AK,2.0,4.285714,861.998143,4.0,3.0,-0.749248,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,2020-03-13,C3_Cancel public events,1.0,Alaska,0,Alaska,AK,30.2,59.3,49824.0,75463.0,21399.0,252199.0,326200.0,733391.0,1.0,0.0,12.2,230.0,353.0,655.0,181.0,0.0,181.0,8.0,189.0,0.36,1.55,0.29,2.2,31.9,0.0,0.0,0.0,0.0,2020-03-13,AK,1.0,0.142857,-21.030578,2.0,0.0,-0.752799,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0


In [7]:
# Feature Columns
X_COLUMNS = ['metric_change',
 "CEN_Bachelor's Degree or Higher",
 'CEN_Employment Rate',
 'CEN_Hispanic or Latino (of any race)',
 'CEN_Median Household Income',
 'CEN_Total Employer Establishments',
 'CEN_Total Households',
 'CEN_Total Housing Units',
 'CEN_Total Population',
 'CEN_Without Health Care Coverage',
 'CP_0_diff',
 'CP_1_diff',
 'CP_2_diff',
 'FD_adminAndFireResponse',
 'FD_fireResponseOnly',
 'FD_lessThan10kCitizens',
 'FD_moreThan10kCitizens',
 'FD_totalFireDepartments',
 'HB_bed_For-Profit',
 'HB_bed_Non-Profit',
 'HB_bed_State/Local Government',
 'HB_bed_Total',
 'OB_Prevalence',
 'SP_Governor Political Affiliation',
 'SP_State Attorney General Political Affiliation',
 'SP_State House Majority Political Affiliation',
 'SP_State Senate Majority Political Affiliation'] 

In [8]:
# Prediction Columns
Y_COLUMNS = ['caseInterpolate_MA7_7_trend',
 'caseInterpolate_MA7_14_trend',
 'caseInterpolate_MA7_21_trend',
 'caseInterpolate_MA7_28_trend',
 'caseInterpolate_gauss8_7_trend',
 'caseInterpolate_gauss8_14_trend',
 'caseInterpolate_gauss8_21_trend',
 'caseInterpolate_gauss8_28_trend',
 'caseInterpolate_gauss3_7_trend',
 'caseInterpolate_gauss3_14_trend',
 'caseInterpolate_gauss3_21_trend',
 'caseInterpolate_gauss3_28_trend']

In [9]:
# Generating a separate array for each y column 

yAr = []
for y_col in Y_COLUMNS:
    y = DF[[y_col]]
    yAr.append(y)
    

In [10]:
# Generating a feature dataframe 

X = DF[X_COLUMNS]

In [11]:
X = X.apply(zscore)

# Train Test Split 

In [15]:
X_train, X_test, y_train, y_test = train_test_split( X, yAr[0], test_size=0.33, random_state=42)

In [62]:
from sklearn import linear_model
from sklearn.metrics import accuracy_score

In [63]:
# reg = linear_model.BayesianRidge()
reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
reg.fit(X_train, y_train)

RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
       1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]))

In [64]:
y_pred = reg.predict(X_test)

In [65]:
# accuracy_score(y_test, y_pred)

In [66]:
y_pred.shape

(540, 1)

In [67]:
y_test.shape

(540, 1)

In [68]:
# results = pd.DataFrame({'test': y_test, 'pred': y_pred}, columns=['test', 'pred'])