In [1]:
import pickle

import numpy as np
from scipy.stats import norm, t
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR, SVC

from data_generation import get_data
from data_generation import g_0, m_0
from dml_algorithm import mm_ate, dml_ate

In [2]:
def get_models(svm_params_dict):
    """Build the SVM learners configured by ``svm_params_dict``.

    Parameters
    ----------
    svm_params_dict : mapping
        Must provide the keys ``'g0'``, ``'g1'`` and ``'m'``; each value is a
        dict of keyword parameters forwarded to ``set_params``.

    Returns
    -------
    tuple
        ``(model_g, model_m)`` where ``model_g`` is a two-element list of
        ``SVR`` instances (configured from ``'g0'`` and ``'g1'``, in that
        order) and ``model_m`` is an ``SVC`` created with
        ``probability=True, random_state=42`` and then configured from ``'m'``.
    """
    # set_params returns the estimator itself, so construction and
    # configuration can be chained.
    outcome_models = [
        SVR().set_params(**svm_params_dict[f'g{arm}'])
        for arm in (0, 1)
    ]
    propensity_model = SVC(probability=True, random_state=42).set_params(
        **svm_params_dict['m']
    )
    return outcome_models, propensity_model

In [3]:
# Load the previously pickled results into `results_dict`.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# files that come from a trusted source.
with open('results_dict.pkl', 'rb') as pickle_file:
    results_dict = pickle.load(pickle_file)

In [4]:
# Take column 1 of the first array stored under key 4000 and display the
# row index/indices at which that column attains its minimum.
hi = results_dict[4000][0][:,1]
np.where(hi==hi.min())[0]

array([1388], dtype=int64)

In [5]:
# Load the pickled SVM parameter sets into `opt_params_svm`; they are later
# indexed as opt_params_svm[4000][1388] and passed to get_models.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# files that come from a trusted source.
with open('opt_params_svm.pkl', 'rb') as pickle_file:
    opt_params_svm = pickle.load(pickle_file)

In [9]:
# Display every column of row 1388, the row whose column-1 value was the minimum.
results_dict[4000][0][1388,:]

array([ 3.06703872e-01, -1.35229445e+03,  4.62589515e-01, -5.77611376e+03])

In [39]:
# Start from a freshly seeded generator and draw 1389 data sets in sequence.
# Each draw overwrites the previous one, so only the last draw (loop index
# 1388) remains in y_data / d_data / x_data once the loop has finished.
rng = np.random.default_rng(seed=123)
for j in range(1389):
    y_data, d_data, x_data = get_data(4000, rng)

# Standardize the covariates of the retained draw (index 1388).
scaler = StandardScaler()
x_data_stand = scaler.fit_transform(x_data)

In [40]:
# Evaluate mm_ate on the retained draw. The recorded output (0.3067038720...)
# agrees with entry 0 of results_dict[4000][0][1388, :] shown earlier.
print(mm_ate(y_data, d_data, x_data))

0.3067038720230787


In [50]:
# Inputs consumed by the cross-fitting cell that follows.
x_data_all=[x_data, x_data_stand]  # [covariates as drawn, standardized covariates]
K=5  # number of folds passed to StratifiedKFold(n_splits=K)
model_g, model_m = get_models(opt_params_svm[4000][1388])  # SVM learners for parameter set 1388
alpha=0.05  # the confidence interval uses the normal quantile at 1 - alpha/2
classical=True  # additionally compute the regression and IPW estimates
errors=True  # additionally compute RMSEs of the fitted models against g_0 / m_0

In [51]:
# Cross-fitted estimation, run at cell level so that the per-fold
# intermediates (d_eval, y_eval, x_eval, m_0_hat, eval_indices, ...) of the
# LAST fold stay available for inspection in later cells.
#
# Reads from earlier cells: y_data, d_data, x_data_all, K, model_g, model_m,
# alpha, classical, errors.
# Produces: theta_0_hat, sigma_hat, CI, N and, depending on the flags,
# reg_hat / ipw_hat (classical) and rmse (errors).

# Unpack the covariates. A separate name (x_data_fit) is used for the matrix
# the learners are trained on, so the notebook-level `x_data` is NOT rebound
# and re-running the setup cell keeps passing the original covariates.
if isinstance(x_data_all, list):
    x_data_orig, x_data_fit = x_data_all[0], x_data_all[1]
else:
    x_data_orig, x_data_fit = x_data_all, x_data_all

# Partition the data for cross-fitting
skf = StratifiedKFold(n_splits=K, shuffle=False)

# Compute respective ML estimators and thereupon auxiliary estimators
theta_0_check_list = []
scores_list = []
if classical:
    reg_check_list, ipw_check_list = [], []
if errors:
    rmse_list = []

for (train_indices, eval_indices) in skf.split(X=x_data_fit, y=d_data):
    y_train, d_train, x_train = y_data[train_indices], d_data[train_indices], x_data_fit[train_indices]
    y_eval, d_eval, x_eval, x_eval_orig = y_data[eval_indices], d_data[eval_indices], x_data_fit[eval_indices], x_data_orig[eval_indices]

    # Estimate outcome regression functions g_0(d), each on the training
    # observations with d_train == d only
    g_0_hat = []
    for d in [0, 1]:
        model_g[d].fit(X=x_train[d_train==d], y=y_train[d_train==d])
        g_0_hat.append(model_g[d].predict(x_eval))

    # Estimate propensity score m_0 (column 1 of predict_proba)
    model_m.fit(X=x_train, y=d_train)
    m_0_hat = model_m.predict_proba(x_eval)[:,1]

    # Compute auxiliary estimator.
    # No trimming is applied: values of m_0_hat close to 0 or 1 produce very
    # large inverse-propensity weights in the last two terms.
    scores = g_0_hat[1] - g_0_hat[0] + d_eval*(y_eval-g_0_hat[1])/m_0_hat - (1-d_eval)*(y_eval-g_0_hat[0])/(1-m_0_hat)
    theta_0_check_list.append(np.mean(scores))

    # For variance estimation
    scores_list.append(scores)

    # For regression & IPW estimators
    if classical:
        reg_check_list.append(np.mean(g_0_hat[1] - g_0_hat[0]))
        ipw_check_list.append(np.mean(d_eval*y_eval/m_0_hat - (1-d_eval)*y_eval/(1-m_0_hat)))

    # Assess RMSE of ML models on evaluation set; the reference functions
    # g_0 / m_0 are evaluated on the covariates from x_data_orig
    if errors:
        rmse_g0 = root_mean_squared_error(g_0(0, x_eval_orig), g_0_hat[0])
        rmse_g1 = root_mean_squared_error(g_0(1, x_eval_orig), g_0_hat[1])
        rmse_m = root_mean_squared_error(m_0(x_eval_orig), m_0_hat)
        rmse_list.append([rmse_g0, rmse_g1, rmse_m])

# Compute final estimator (unweighted average of the per-fold estimates)
theta_0_hat = np.mean(theta_0_check_list)
if classical:
    reg_hat, ipw_hat = np.mean(reg_check_list), np.mean(ipw_check_list)

# Inference: estimate standard deviation and construct confidence interval.
# The per-fold score vectors are concatenated rather than stacked with
# np.array, because stratified folds may differ in length and a ragged list
# cannot be turned into a rectangular array.
all_scores = np.concatenate(scores_list)
sigma_hat = np.sqrt(np.mean((all_scores-theta_0_hat)**2))
N = len(y_data)
quantile = norm.ppf(1-alpha/2)
CI = np.array([theta_0_hat-quantile*sigma_hat/np.sqrt(N), theta_0_hat+quantile*sigma_hat/np.sqrt(N)])

# Average RMSEs across folds
if errors:
    rmse = np.mean(rmse_list, axis=0)

In [55]:
# IPW average on the variables left over from the cross-fitting loop, i.e.
# the last evaluation fold only.
np.mean(d_eval*y_eval/m_0_hat - (1-d_eval)*y_eval/(1-m_0_hat))

-28882.0305850478

In [57]:
# Per-observation IPW terms of the last evaluation fold.
# Note: this rebinds `hi`, which earlier held a column of results_dict.
hi = d_eval*y_eval/m_0_hat - (1-d_eval)*y_eval/(1-m_0_hat)

In [65]:
# Value of d for observation 498 of the last evaluation fold.
d_eval[498]

0.0

In [68]:
# Reference propensity m_0 at observation 498 of the last evaluation fold.
# NOTE(review): x_eval is sliced from x_data_all[1] (the standardized
# covariates), whereas the RMSE code evaluates m_0 on x_eval_orig — confirm
# that the standardized row is the intended argument here.
m_0(x_eval[498])

array([0.99954653])

In [64]:
# Fitted propensity (SVC predict_proba, column 1) for the same observation.
m_0_hat[498]

0.9999987293794964

In [70]:
# Constants used by the data-generating functions below
mean = np.linspace(0.7, 0.0, 8)
cov = np.array([
    [round(0.6**abs(row-col)*((-1.01)**(row+col)), 3) for col in range(8)]
    for row in range(8)
])
beta = np.linspace(-0.8, 1.0, 10)
df = 10
gamma = np.array([1.0, 2.0, 2.0, 3.0])


def F(z):
    """Standard logistic function 1 / (1 + exp(-z))."""
    return 1.0 / (1.0 + np.exp(-z))


def m_0(x):
    """Propensity score.

    ``x`` is a NumPy array, either a single row (1-D, promoted to one row of
    a 2-D array) or a 2-D array with one observation per row; columns 0-9 are
    used. Returns the Student-t CDF (``df`` degrees of freedom) of
    ``x @ beta + 0.25*x[:,7]**2 - x[:,8]*x[:,9]``, one value per row.
    """
    rows = x.reshape(1, -1) if x.ndim == 1 else x
    index = rows @ beta + 0.25*rows[:,7]**2 - rows[:,8]*rows[:,9]
    return t.cdf(index, df)


def g_0(d, x):
    """Outcome regression function.

    ``d`` is a scalar or an array with one entry per row of ``x``; ``x`` is a
    NumPy array handled as in ``m_0`` (a 1-D input becomes a single row).
    Returns one value per row: a linear part in columns 0-4 plus a nonlinear
    part in columns 2, 5, 6, 8 and 9.
    """
    rows = x.reshape(1, -1) if x.ndim == 1 else x
    linear_part = rows[:,:4] @ gamma + rows[:,4]*(d+1)
    logistic_term = F(rows[:,5])*rows[:,6]**2
    interaction_term = rows[:,8]*(np.sqrt(rows[:,9])+2*rows[:,6])
    d_term = d*rows[:,2]*rows[:,8]**(3/2)
    # Same evaluation order as (logistic - interaction) + d_term
    nonlinear_part = logistic_term - interaction_term + d_term
    return linear_part + nonlinear_part

In [71]:
# Draw one data set step by step from the current state of `rng`, keeping the
# intermediate draws (x_normal, x_uniform, xi, u) available for inspection.
# Requires N, mean, cov, beta, df and g_0 from earlier cells and overwrites
# x_data, d_data and y_data.
# NOTE(review): presumably this mirrors what get_data does internally —
# confirm against data_generation. Which data set is produced depends on how
# far `rng` has been advanced when this cell runs.
x_normal = rng.multivariate_normal(mean=mean, cov=cov, size=N)  # N rows of correlated normal covariates
x_uniform = rng.uniform(size=(N,2))  # two further covariates per row (columns 8 and 9 of x_data)
x_data = np.concatenate((x_normal, x_uniform), axis=1)

# d is 1.0 where the index (the same expression m_0 uses) plus Student-t noise
# xi is non-negative, else 0.0
xi = rng.standard_t(df=df, size=N)
d_data = (x_data @ beta + 0.25*x_data[:,7]**2 - x_data[:,8]*x_data[:,9] + xi >= 0).astype(float)

# Outcome noise: normal with a per-row scale equal to the row's mean absolute
# covariate value
u = rng.normal(scale=np.mean(np.abs(x_data), axis=-1))
y_data = g_0(d_data, x_data) + u

In [83]:
# Smallest of 1000 logistic draws (default location and scale).
# Uses NumPy's global RNG rather than `rng`; no seed for it is set in the
# cells shown, so the displayed value is not reproducible.
np.random.logistic(size=1000).min()

-7.010832816932092

In [80]:
from scipy.stats import t
# Evaluate the propensity formula by hand for row 498 of the last evaluation
# fold, taken from the current x_data.
x = x_data[eval_indices][498]
x = x.reshape(1, -1) if x.ndim == 1 else x
# Index fed into the Student-t CDF (same expression as in m_0)
latent_index = x @ beta + 0.25*x[:,7]**2 - x[:,8]*x[:,9]
t.cdf(latent_index, df)

array([0.99845833])

In [81]:
# Smallest Student-t noise draw in the manually generated data set.
xi.min()

-6.279944554525133

In [69]:
# Re-seed the generator and replay draws from get_data.
# NOTE(review): range(1388) yields j = 0..1387, so the `j == 1388` branch
# below never executes as written. The cell therefore only advances `rng` by
# 1388 draws and leaves the draw with index 1387 in y_data / d_data / x_data.
# If the branch is meant to run, the loop needs range(1389); if the cell is
# only meant to position `rng` just before draw 1388, the branch is dead code.
# Confirm which is intended.
rng = np.random.default_rng(seed=123)
for j in range(1388):
    y_data, d_data, x_data = get_data(4000, rng)
    if j == 1388:
        scaler = StandardScaler()
        x_data_stand = scaler.fit_transform(x_data)
        print(mm_ate(y_data, d_data, x_data))
        model_g, model_m = get_models(opt_params_svm[4000][j])
        print(dml_ate(y_data, d_data, [x_data, x_data_stand], model_g, model_m))

In [35]:
# Draw the next data set from the current state of `rng` and standardize it.
# fit_transform re-fits the existing `scaler` on this new draw.
y_data, d_data, x_data = get_data(4000, rng)
x_data_stand = scaler.fit_transform(x_data)

In [45]:
import numpy as np
from joblib import Parallel, delayed
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import root_mean_squared_error
from sklearn.base import is_regressor
from scipy.stats import norm
from data_generation import g_0, m_0

In [36]:
# Largest reference propensity over the standardized covariates.
# NOTE(review): m_0 is applied to x_data_stand here, while the cross-fitting
# code evaluates m_0 on the unstandardized covariates — confirm that the
# standardized input is intended.
m_0(x_data_stand).max()

0.9999851166539763