In [55]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [56]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
import sklearn
import os
from matplotlib.pyplot import hist

In [57]:
x = pd.read_stata("maindata.dta", convert_categoricals=False)

In [58]:
laws_csv = pd.read_csv("When_Were_Laws.csv")
laws_csv = laws_csv[np.logical_not(np.isnan(laws_csv["FIPS"]))]  # FIPS codes identify states
laws_csv = laws_csv.drop("State_Name", axis=1)  # Dropping as useless
laws_csv = laws_csv.rename({'FIPS': 'stfips'}, axis=1) 

# Merging
merged = pd.merge(laws_csv, x, on='stfips', how='outer')

In [59]:
basic_merged = merged.copy()  # To allow for re-running 

basic_merged = basic_merged[basic_merged["a_age"] <= 25]  # Can be changed later, but for now useful I think
#age_subset = np.logical_and(np.greater_equal(basic_merged["a_age"],18), np.greater_equal(19,basic_merged["a_age"]))
# 17 <= age <= 21 (maybe should be like 22)
#basic_merged = basic_merged[age_subset]
#print(basic_merged.shape)

# Dropping states who were treated < 97 (i.e. they always had programs)
# This is following Callaway + Sant'anna, as we cannot meaningfully 
# do any inference using those states. Although we can compare them later as a 
# robustness check, which may be interesting
basic_merged = basic_merged[basic_merged["Year_Implemented"].str.contains("always")==False]  

# I also drop the never states, as they may be substantively different from others, although this can be relaxed later.
basic_merged = basic_merged.replace("never", "1000000") 
basic_merged["Year_Implemented"] = basic_merged["Year_Implemented"].astype(int)  # converting to intbasic_merged = basic_merged[basic_merged["Year_Implemented"].str.contains("never")==False]  # Only want to look at one for now. 

# As we are treating >19 as the never-treated group, we set their year implemented as 1000000 >> 1999
year_implemented_vector = basic_merged["Year_Implemented"].copy()
year_implemented_vector[basic_merged["under19"] == 0] = 1000000
basic_merged["group"] = year_implemented_vector  # Equals the year you were first treated. If >=19 then treated at t = infty

# Drop Arizona since they implemented late and later repealed policy
basic_merged = basic_merged[basic_merged["stfips"] != 5]

# Generating list of confounders of interest, these are not necessarily optimal. 
list_of_confounders = ["fownu18", "a_maritl", "female" , "povll"]#, "stfips"]
list_of_confounders += ["anykids", "disability", "collgrad", "hsgrad"] # coll + hs are extra for now. 

In [60]:
#ML Checks for Q Model

In [29]:
outcome = basic_merged['pubonly']
list_of_confounders_treatment = list_of_confounders + ["group"]
confounders_and_treat = basic_merged[list_of_confounders_treatment]

In [30]:
x_train, x_test, y_train, y_test = train_test_split(confounders_and_treat, outcome, test_size=0.2)

In [31]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier

In [33]:
#function to fit model and calculate the accuracy score, mse, and baseline
def fit_model(model):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    test_mse=mean_squared_error(y_pred, y_test)
    score2 = model.score(x_test,y_test)
    baseline_mse=mean_squared_error(y_train.mean()*np.ones_like(y_test), y_test)
    
    return test_mse, score2, baseline_mse

In [42]:
#models to test
Rf = RandomForestClassifier(n_estimators=100, max_depth=10)
KNN = KNeighborsClassifier()
LogReg = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear')
XGBoost = sklearn.ensemble.GradientBoostingClassifier()

models = [Rf, KNN, LogReg, XGBoost]

In [43]:
fit_stat = pd.DataFrame()
model_lst = ["RandomForest", "KNN", "LogReg", "XGBoost"]
g_ce = []
score = []
baseline = []

for model in models:
    x, y, z = fit_model(model)
    g_ce.append(x)
    baseline.append(z)
    score.append(y)

fit_stat["model"] = model_lst
fit_stat["test_mse"] = g_ce
fit_stat["baseline_mse"] = baseline
fit_stat["accuracy_score"] = score

In [41]:
fit_stat

Unnamed: 0,model,test_mse,baseline_mse,accuracy_score
0,RandomForest,0.110675,0.098426,0.889325
1,KNN,0.116026,0.098426,0.883974
2,LogReg,0.110675,0.098426,0.889325
3,XGBoost,0.100619,0.098426,0.899381


In [36]:
Q1 = RandomForestClassifier(n_estimators=100, max_depth=2)
Q1.fit(x_train, y_train)
y_pred = Q1.predict(x_test)

test_mse=mean_squared_error(y_pred, y_test)
print(f"Test MSE of fit model {test_mse}") 

score2 = Q1.score(x_test,y_test)
print("Model Score: " + str(score2)) 

Test MSE of fit model 0.11067475131087759
Model Score: 0.8893252486891224
