In [7]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
import sklearn
import os
from matplotlib.pyplot import hist

In [8]:
# Single seed shared by numpy's global RNG and every seeded model/splitter
# below, so results are reproducible across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [11]:
# Load the main Stata dataset. convert_categoricals=False keeps labelled
# columns as their raw stored values instead of pandas Categoricals.
x = pd.read_stata(
    "maindata.dta",
    convert_categoricals=False,
)

In [13]:
# Load the table of law implementation dates and tidy it in one pass:
#   * keep only rows that carry a FIPS code (FIPS codes identify states),
#   * drop the state name column, which is not used,
#   * rename FIPS -> stfips so it shares a key name with the main dataset.
laws_csv = (
    pd.read_csv("When_Were_Laws.csv")
    .loc[lambda laws: ~np.isnan(laws["FIPS"])]
    .drop(columns="State_Name")
    .rename(columns={'FIPS': 'stfips'})
)

# Outer-join the law dates onto the main dataset by state FIPS code.
merged = laws_csv.merge(x, on='stfips', how='outer')

In [27]:
# Build the analysis frame from `merged` without modifying it, so this cell can
# be re-run on its own.

# Rows to discard up front:
#   * states treated before 1997 (Year_Implemented contains "always", i.e. they
#     always had programs);
#   * rows with a missing Year_Implemented. na=True makes this explicit; the
#     previous `== False` comparison dropped them only as a side effect.
always_or_missing = merged["Year_Implemented"].str.contains("always", na=True)

# Keep only years inside [1995, 2000]. Filtering first means the conversions
# below run on the rows we actually keep rather than the full merged frame.
in_year_window = (merged["year"] >= 1995) & (merged["year"] <= 2000)

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `merged` (avoids SettingWithCopyWarning).
basic_merged = merged.loc[~always_or_missing & in_year_window].copy()

# Make "never-treated" states treated at T = infinity (a very large year).
# The replacement is limited to Year_Implemented so no other column is touched.
basic_merged["Year_Implemented"] = (
    basic_merged["Year_Implemented"].replace("never", "1000000").astype(int)
)

# Indicator for whether treatment has occurred in the row's state by that year.
basic_merged["year_indic"] = basic_merged["year"] >= basic_merged["Year_Implemented"]

# Indicator for whether the individual was treated
# (i.e. under 19 and in a state that has already added a law).
basic_merged["treatment"] = basic_merged["under19"] * basic_merged["year_indic"]

# Confounders of interest; these are not necessarily optimal.
list_of_confounders = ["fownu18", "fpovcut", "povll", "faminctm1", "a_maritl"]
list_of_confounders += ["a_hga", "anykids", "year", "stfips", "disability", "elig"]

# Keep only the columns we need, then drop every row with a missing entry.
basic_merged = basic_merged[
    list_of_confounders + ["treatment", "pubonly", "insured", "privonly", "Year_Implemented"]
].dropna(axis=0)


In [28]:
# Display the cleaned analysis frame (the rendered output is truncated to the
# first and last rows).
basic_merged

Unnamed: 0,fownu18,fpovcut,povll,faminctm1,a_maritl,a_hga,anykids,year,stfips,disability,elig,treatment,pubonly,insured,privonly,Year_Implemented
2,0.0,8480.0,1.0,0.0,7.0,0.0,0.0,1998.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1998
4,3.0,19154.0,8.0,47500.0,7.0,0.0,0.0,1997.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1998
8,3.0,17524.0,14.0,108090.0,7.0,0.0,0.0,2000.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1998
10,3.0,15967.0,1.0,2736.0,7.0,0.0,0.0,1996.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1998
13,1.0,13861.0,14.0,78249.0,7.0,0.0,0.0,2000.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918463,0.0,11921.0,7.0,22465.0,7.0,36.0,0.0,1995.0,56.0,0.0,0.0,0.0,0.0,1.0,1.0,1999
918464,0.0,16813.0,14.0,116918.0,7.0,39.0,0.0,1998.0,56.0,0.0,0.0,0.0,0.0,1.0,1.0,1999
918465,1.0,12629.0,7.0,22300.0,7.0,37.0,0.0,1996.0,56.0,0.0,0.0,0.0,0.0,1.0,1.0,1999
918469,2.0,19634.0,8.0,45500.0,7.0,42.0,0.0,1997.0,56.0,0.0,0.0,0.0,0.0,1.0,1.0,1999


In [21]:
def outcome_k_fold_fit_and_predict(make_model, X:pd.DataFrame, y:np.array, A:np.array, n_splits:int, output_type:str):
    '''
    Implements K fold cross-fitting for the model predicting the outcome Y. 
    That is, 
    1. Split data into K folds
    2. For each fold j, the model is fit on the other K-1 folds
    3. The fitted model is used to make predictions for each data point in fold j
    Returns two arrays containing the predictions for all units untreated, all units treated  

    Rows of X, y and A are matched by position, so any index carried by X, y
    or A is ignored and does not need to be reset beforehand.

    Args:
    make_model: function that returns a new sklearn model (that implements fit and
        either predict_proba or predict)
    X: dataframe of variables to adjust for
    y: array (or Series) of outcomes, one per row of X
    A: array (or Series) of treatments, one per row of X
    n_splits: number of splits to use
    output_type: type of outcome, "binary" or "continuous"

    Returns:
    (predictions0, predictions1): float arrays of length len(X) holding the
        out-of-fold prediction for every row with the treatment set to 0 and
        to 1 respectively. For "binary" outcomes these are taken from column 1
        of predict_proba.

    Raises:
    ValueError: if output_type is not "binary" or "continuous", or if y or A
        does not have one entry per row of X.
    '''

    if output_type == 'binary':
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    elif output_type == 'continuous':
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    else:
        # Previously this fell through and failed later with an undefined `kf`.
        raise ValueError(
            "output_type must be 'binary' or 'continuous', got {!r}".format(output_type)
        )

    # Work with plain arrays so rows are matched by position, never by index label.
    y_values = np.asarray(y)
    A_values = np.asarray(A)
    n_samples = len(X)
    if len(y_values) != n_samples or len(A_values) != n_samples:
        raise ValueError("X, y and A must all have the same number of rows")

    predictions0 = np.full(n_samples, np.nan, dtype=float)
    predictions1 = np.full(n_samples, np.nan, dtype=float)

    # include the treatment as input feature
    X_w_treatment = X.copy()
    X_w_treatment["A"] = A_values

    # for predicting effect under treatment / control status for each data point 
    X0 = X_w_treatment.copy()
    X0["A"] = 0
    X1 = X_w_treatment.copy()
    X1["A"] = 1

    # kf.split yields positional indices, so rows are selected with .iloc / array
    # indexing; .loc would silently pick the wrong rows on a non-default index.
    for train_index, test_index in kf.split(X_w_treatment, y_values):
        q = make_model()
        q.fit(X_w_treatment.iloc[train_index], y_values[train_index])

        X0_test = X0.iloc[test_index]
        X1_test = X1.iloc[test_index]
        if output_type == 'binary':
            predictions0[test_index] = q.predict_proba(X0_test)[:, 1]
            predictions1[test_index] = q.predict_proba(X1_test)[:, 1]
        else:  # 'continuous'
            predictions0[test_index] = q.predict(X0_test)
            predictions1[test_index] = q.predict(X1_test)

    # Every row must have landed in exactly one test fold.
    assert np.isnan(predictions0).sum() == 0
    assert np.isnan(predictions1).sum() == 0
    return predictions0, predictions1

In [46]:
def make_Q_model():
    '''Build a new, unfitted outcome (Q) model for use in K-fold cross-fitting.

    Each call returns a separate random forest classifier seeded with
    RANDOM_SEED, with 100 trees and no depth limit.
    '''
    forest_settings = dict(n_estimators=100, max_depth=None, random_state=RANDOM_SEED)
    return RandomForestClassifier(**forest_settings)

In [37]:
# Renumber rows 0..n-1. drop=True discards the old index instead of inserting it
# as a new column, so re-running this cell is a no-op rather than piling up
# "index" / "level_0" columns and eventually raising.
basic_merged = basic_merged.reset_index(drop=True)

# Inputs for the outcome model: adjustment set, treatment indicator, outcome.
confounders = basic_merged[["fownu18", "fpovcut", "povll", "faminctm1", "a_maritl", "a_hga",  "anykids", "year", "stfips", "disability", "elig"]]
treatment = basic_merged['treatment']
outcome = basic_merged['pubonly']

In [40]:
# Cross-fitted outcome predictions for every row with the treatment set to 0
# (Q0) and to 1 (Q1), using 5 folds and a binary outcome.
Q0_lm, Q1_lm = outcome_k_fold_fit_and_predict(
    make_Q_model,
    X=confounders,
    y=outcome,
    A=treatment,
    n_splits=5,
    output_type="binary",
)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_folds = np.zeros(n_samples, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/

In [45]:
# Collect the cross-fitted predictions alongside the observed treatment and
# outcome, then display the resulting table.
data_and_estimates_ml = pd.DataFrame(
    {
        'Q0': Q0_lm,
        'Q1': Q1_lm,
        'A': treatment,
        'Y': outcome,
    }
)
data_and_estimates_ml

Unnamed: 0,Q0,Q1,A,Y
0,0.169171,0.275000,1.0,0.0
1,0.004000,0.020000,0.0,0.0
2,0.022000,0.016000,1.0,0.0
3,0.424000,0.524000,0.0,0.0
4,0.040000,0.068000,1.0,0.0
...,...,...,...,...
54121,0.076000,0.098000,0.0,0.0
54122,0.004000,0.002000,0.0,0.0
54123,0.044000,0.094000,0.0,0.0
54124,0.052000,0.070000,0.0,0.0
