In [26]:
import warnings
# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [27]:
import os

import numpy as np
import pandas as pd
import scipy as sp
import sklearn
from matplotlib.pyplot import hist
from sklearn import linear_model, metrics
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [28]:
# Load the main Stata dataset; value-labelled columns are left as their stored
# codes instead of being converted to pandas Categoricals.
x = pd.read_stata(filepath_or_buffer="maindata.dta", convert_categoricals=False)

In [29]:
# Load the law-timing table and prepare it for joining onto the main data.
laws_csv = (
    pd.read_csv("When_Were_Laws.csv")
    .loc[lambda laws: ~np.isnan(laws["FIPS"])]  # keep rows with a FIPS code (FIPS codes identify states)
    .drop(columns="State_Name")  # not needed once the FIPS code is available
    .rename(columns={"FIPS": "stfips"})  # same key name as in the main data
)

# Outer merge on the state FIPS code: rows from either side without a match are kept.
merged = laws_csv.merge(x, on="stfips", how="outer")

In [30]:
basic_merged = merged.copy()  # Fresh copy of the merged frame so this cell can be re-run

# Keep respondents aged 25 or younger (the cutoff can be changed later).
basic_merged = basic_merged[basic_merged["a_age"] <= 25]

# Drop states that were treated before 1997 (i.e. they always had programs).
# This follows Callaway + Sant'Anna, as we cannot meaningfully do any inference
# using those states, although comparing them later may be an interesting
# robustness check. Rows whose Year_Implemented is missing are dropped as well
# (na=True), exactly as the earlier `== False` comparison did.
basic_merged = basic_merged[
    ~basic_merged["Year_Implemented"].str.contains("always", na=True)
].copy()

# "Never-treated" states are coded as treated at T = infinity (a far-future year).
# The replacement is applied to Year_Implemented only, so a literal "never" in
# any other column is left untouched.
basic_merged["Year_Implemented"] = (
    basic_merged["Year_Implemented"].replace("never", "1000000").astype(int)
)

# Treatment indicator. This is currently just the under19 flag; the interaction
# with a year indicator (under19 * year_indic) is NOT applied here.
basic_merged["treatment"] = basic_merged["under19"]

# Candidate confounders of interest; these are not necessarily optimal.
list_of_confounders = [
    "year", "stfips", "fownu18", "faminctm1", "a_maritl", "female", "fpovcut", "povll",
    "anykids", "disability", "noemp_insured",
]


In [31]:
# ML checks for one year (Year_Implemented == 1997 and year == 1997)

In [32]:
# One-year sample: rows with Year_Implemented == 1997 and year == 1997.
sub_merged = basic_merged[
    (basic_merged["Year_Implemented"] == 1997) & (basic_merged["year"] == 1997)
].copy()

# Feature matrix: the treatment indicator followed by the shared confounder
# list, so the columns used here cannot drift from list_of_confounders.
confounders_and_treat = sub_merged[["treatment"] + list_of_confounders]
y_var = sub_merged["pubonly"]  # outcome the models below try to predict

In [33]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and therefore every metric computed from it) repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    confounders_and_treat, y_var, test_size=0.2, random_state=1
)

In [34]:
# Random forest with default hyperparameters. The fixed random_state makes the
# fitted forest repeatable for a given train/test split.
Q1 = RandomForestClassifier(random_state=1)
Q1.fit(x_train, y_train)
y_pred = Q1.predict(x_test)

# mean_squared_error is documented as (y_true, y_pred); pass both by keyword so
# the roles are explicit. If the outcome is coded 0/1 this is the error rate.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test MSE of fit model {test_mse}")

# For a classifier, score() is the mean accuracy on the held-out rows.
score2 = Q1.score(x_test, y_test)
print("Model Score: " + str(score2))

Test MSE of fit model 0.06748466257668712
Model Score: 0.9325153374233128




In [35]:
# k-nearest-neighbours classifier with default hyperparameters.
Q2 = KNeighborsClassifier()
Q2.fit(x_train, y_train)
y_pred = Q2.predict(x_test)

# mean_squared_error is documented as (y_true, y_pred); pass both by keyword so
# the roles are explicit. If the outcome is coded 0/1 this is the error rate.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test MSE of fit model {test_mse}")

# For a classifier, score() is the mean accuracy on the held-out rows.
score2 = Q2.score(x_test, y_test)
print("Model Score: " + str(score2))

Test MSE of fit model 0.09406952965235174
Model Score: 0.9059304703476483


In [36]:
# Logistic regression with the liblinear solver. multi_class is not passed:
# with solver='liblinear' the default already resolves to one-vs-rest, and the
# explicit argument is deprecated in recent scikit-learn releases.
Q3 = linear_model.LogisticRegression(solver='liblinear')
Q3.fit(x_train, y_train)
y_pred = Q3.predict(x_test)

# mean_squared_error is documented as (y_true, y_pred); pass both by keyword so
# the roles are explicit. If the outcome is coded 0/1 this is the error rate.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test MSE of fit model {test_mse}")

# For a classifier, score() is the mean accuracy on the held-out rows.
score2 = Q3.score(x_test, y_test)
print("Model Score: " + str(score2))

Test MSE of fit model 0.07975460122699386
Model Score: 0.9202453987730062


In [37]:
# Small neural network: one hidden layer of 15 units, Adam optimiser, seeded
# (random_state=1) and allowed up to 500 training iterations.
Q4 = MLPClassifier(solver='adam', hidden_layer_sizes=(15,), random_state=1, max_iter=500)
Q4.fit(x_train, y_train)
y_pred = Q4.predict(x_test)

# mean_squared_error is documented as (y_true, y_pred); pass both by keyword so
# the roles are explicit. If the outcome is coded 0/1 this is the error rate.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test MSE of fit model {test_mse}")

# For a classifier, score() is the mean accuracy on the held-out rows.
score2 = Q4.score(x_test, y_test)
print("Model Score: " + str(score2))

Test MSE of fit model 0.07770961145194274
Model Score: 0.9222903885480572


In [38]:
# Gradient-boosted trees with default hyperparameters. The fixed random_state
# makes the fitted model repeatable for a given train/test split.
Q5 = sklearn.ensemble.GradientBoostingClassifier(random_state=1)
Q5.fit(x_train, y_train)
y_pred = Q5.predict(x_test)

# mean_squared_error is documented as (y_true, y_pred); pass both by keyword so
# the roles are explicit. If the outcome is coded 0/1 this is the error rate.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test MSE of fit model {test_mse}")

# For a classifier, score() is the mean accuracy on the held-out rows.
score2 = Q5.score(x_test, y_test)
print("Model Score: " + str(score2))

Test MSE of fit model 0.05725971370143149
Model Score: 0.9427402862985685


In [40]:
# ML checks for all years (every row of basic_merged)

In [41]:
# Feature matrix over every row of basic_merged: the treatment indicator
# followed by the shared confounder list, so the columns used here cannot
# drift from list_of_confounders.
confounders_and_treat = basic_merged[["treatment"] + list_of_confounders]
y_var = basic_merged["pubonly"]  # outcome the models below try to predict

In [42]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and therefore every metric computed from it) repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    confounders_and_treat, y_var, test_size=0.2, random_state=1
)

In [43]:
# Random forest with default hyperparameters, refit on the current
# x_train / y_train. The fixed random_state makes the fitted forest repeatable
# for a given train/test split.
Q1 = RandomForestClassifier(random_state=1)
Q1.fit(x_train, y_train)
y_pred = Q1.predict(x_test)

# mean_squared_error is documented as (y_true, y_pred); pass both by keyword so
# the roles are explicit. If the outcome is coded 0/1 this is the error rate.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test MSE of fit model {test_mse}")

# For a classifier, score() is the mean accuracy on the held-out rows.
score2 = Q1.score(x_test, y_test)
print("Model Score: " + str(score2))



Test MSE of fit model 0.0937240365565284
Model Score: 0.9062759634434716


In [44]:
# k-nearest-neighbours classifier with default hyperparameters, refit on the
# current x_train / y_train.
Q2 = KNeighborsClassifier()
Q2.fit(x_train, y_train)
y_pred = Q2.predict(x_test)

# mean_squared_error is documented as (y_true, y_pred); pass both by keyword so
# the roles are explicit. If the outcome is coded 0/1 this is the error rate.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test MSE of fit model {test_mse}")

# For a classifier, score() is the mean accuracy on the held-out rows.
score2 = Q2.score(x_test, y_test)
print("Model Score: " + str(score2))

Test MSE of fit model 0.11306746767218423
Model Score: 0.8869325323278158


In [45]:
# Logistic regression with the liblinear solver, refit on the current
# x_train / y_train. multi_class is not passed: with solver='liblinear' the
# default already resolves to one-vs-rest, and the explicit argument is
# deprecated in recent scikit-learn releases.
Q3 = linear_model.LogisticRegression(solver='liblinear')
Q3.fit(x_train, y_train)
y_pred = Q3.predict(x_test)

# mean_squared_error is documented as (y_true, y_pred); pass both by keyword so
# the roles are explicit. If the outcome is coded 0/1 this is the error rate.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test MSE of fit model {test_mse}")

# For a classifier, score() is the mean accuracy on the held-out rows.
score2 = Q3.score(x_test, y_test)
print("Model Score: " + str(score2))

Test MSE of fit model 0.11387699452504207
Model Score: 0.8861230054749579


In [46]:
# Small neural network refit on the current x_train / y_train: one hidden layer
# of 15 units, Adam optimiser, seeded (random_state=1), up to 500 iterations.
Q4 = MLPClassifier(solver='adam', hidden_layer_sizes=(15,), random_state=1, max_iter=500)
Q4.fit(x_train, y_train)
y_pred = Q4.predict(x_test)

# mean_squared_error is documented as (y_true, y_pred); pass both by keyword so
# the roles are explicit. If the outcome is coded 0/1 this is the error rate.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test MSE of fit model {test_mse}")

# For a classifier, score() is the mean accuracy on the held-out rows.
score2 = Q4.score(x_test, y_test)
print("Model Score: " + str(score2))

Test MSE of fit model 0.10450352570247758
Model Score: 0.8954964742975224


In [47]:
# Gradient-boosted trees with default hyperparameters, refit on the current
# x_train / y_train. The fixed random_state makes the fitted model repeatable
# for a given train/test split.
Q5 = sklearn.ensemble.GradientBoostingClassifier(random_state=1)
Q5.fit(x_train, y_train)
y_pred = Q5.predict(x_test)

# mean_squared_error is documented as (y_true, y_pred); pass both by keyword so
# the roles are explicit. If the outcome is coded 0/1 this is the error rate.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test MSE of fit model {test_mse}")

# For a classifier, score() is the mean accuracy on the held-out rows.
score2 = Q5.score(x_test, y_test)
print("Model Score: " + str(score2))

Test MSE of fit model 0.09519396689461239
Model Score: 0.9048060331053877
