In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from twiser import twiser

In [2]:
# Location of the study CSV. Set the STUDY1_DATA_CSV environment variable to
# point at your own copy; the fallback is the original author's local path.
fname = os.environ.get("STUDY1_DATA_CSV", "/Users/rturner/tmp/dataverse_files/Study1Data.csv")

In [3]:
# Column names in the study CSV, grouped by the role they play below.
response_var = "LoanSupp"    # outcome compared between the two arms
treatment_var = "LoanGroup"  # arm assignment column
sample_var = "Sample"        # column used to pick the analysed subset
pre_exp_vars = ["Sex", "Party", "Ideo"]  # covariates handed to the predictor

# Numeric codes: the first two are matched against `treatment_var`,
# the last against `sample_var`.
control_code, treatment_code, fb_sample_code = 1, 2, 6

In [4]:
# Every column the analysis touches: covariates first, then sample, arm and outcome.
cols_to_keep = [*pre_exp_vars, sample_var, treatment_var, response_var]

In [5]:
# Read the CSV (first column becomes the index) with ".", "-99", "?" and "-1"
# parsed as missing, keep only the analysis columns, and coerce each of them
# to numeric so that anything unparseable turns into NaN.
df_full = pd.read_csv(
    fname,
    header=0,
    index_col=0,
    na_values=[".", "-99", "?", "-1"],
    low_memory=False,
)[cols_to_keep].apply(pd.to_numeric, errors="coerce")

# Mean-impute the covariates only: fillna with a Series fills just the columns
# named in the Series index, so the other columns keep their NaNs.
df_full = df_full.fillna(df_full[pre_exp_vars].mean())

# Everything downstream assumes plain float columns.
assert (df_full.dtypes == float).all()

In [6]:
# Keep only the rows whose sample column equals `fb_sample_code`.
df = df_full.loc[df_full[sample_var].eq(fb_sample_code)]

# The loader lists "-1" and "-99" as NA markers; confirm neither value survived.
assert not df.eq(-1).to_numpy().any()
assert not df.eq(-99).to_numpy().any()

In [7]:
def _drop_missing_response(response, covariates):
    """Remove observations with a NaN response, keeping covariate rows aligned.

    Args:
        response: 1-D float array of outcomes.
        covariates: 2-D array with one row per entry of ``response``.

    Returns:
        ``(response, covariates)`` restricted to entries where ``response`` is not NaN.
    """
    observed = ~np.isnan(response)
    return response[observed], covariates[observed, :]


def _report(label, result, alpha):
    """Print one test result, the width of its interval, and a significance star.

    Args:
        label: Text printed at the start of the line.
        result: Value returned by a ``twiser.ztest*`` call. It is indexed as
            ``result[1][0]``, ``result[1][1]`` and ``result[2]``; judging by the
            recorded output this is (estimate, (lower, upper), p-value) --
            confirm against the twiser documentation.
        alpha: Threshold compared with ``result[2]``; "*" is printed when it is below.
    """
    lower, upper = result[1][0], result[1][1]
    print(label, result, upper - lower, "*" if result[2] < alpha else "")


# One significance level for both the tests and the "*" marker, so they cannot drift apart.
alpha = 0.05
# Each variance-reduced test gets its own RandomState built from this seed.
rng_seed = 123

predictor = RandomForestRegressor(criterion="squared_error", random_state=0)

is_control = df[treatment_var] == control_code
is_treatment = df[treatment_var] == treatment_code
assert not (is_control & is_treatment).any()

# x = treatment arm, y = control arm; rows without an observed response are dropped.
x, x_covariates = _drop_missing_response(
    df.loc[is_treatment, response_var].values,
    df.loc[is_treatment, pre_exp_vars].values,
)
y, y_covariates = _drop_missing_response(
    df.loc[is_control, response_var].values,
    df.loc[is_control, pre_exp_vars].values,
)

# Plain two-sample z-test without covariates.
R_basic = twiser.ztest(x, y, alpha=alpha)
_report("basic", R_basic, alpha)

# Covariate-based variants; each result keeps its own name so none is overwritten.
R_cv = twiser.ztest_cv_train(x, x_covariates, y, y_covariates, alpha=alpha, train_frac=0.3, predictor=predictor, random=np.random.RandomState(rng_seed))
_report("cv", R_cv, alpha)

R_stacked = twiser.ztest_stacked_train(x, x_covariates, y, y_covariates, alpha=alpha, k_fold=10, predictor=predictor, random=np.random.RandomState(rng_seed))
_report("vr", R_stacked, alpha)  # label kept as "vr" so the printed output is unchanged

R_in_sample = twiser.ztest_in_sample_train(x, x_covariates, y, y_covariates, alpha=alpha, predictor=predictor, random=np.random.RandomState(rng_seed))
_report("in_sample", R_in_sample, alpha)

# Backward-compatible alias: this name previously ended up bound to the in-sample result.
R_vr = R_in_sample

basic (0.8048128342245988, (-0.14116367806259345, 1.750789346511791), 0.09541747434590832) 1.8919530245743843 
cv (1.3976759834368528, (0.20459932380486157, 2.590752643068844), 0.02167154082472869) 2.3861533192639826 *
vr (1.3783350179974512, (0.5080398598051986, 2.2486301761897036), 0.001908559984656016) 1.740590316384505 *
in_sample (0.8648606904582837, (0.24239032838839314, 1.4873310525281744), 0.006465777380383715) 1.2449407241397812 *
