In [1]:
import numpy as np
import pickle

from joblib import Parallel, delayed
from sklearn.base import is_regressor
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR, SVC
from data_generation import m_0, get_data
from dml_algorithm import dml_parallel_ate

In [2]:
with open('results_ate_svm.pkl', 'rb') as pickle_file:
    results_ate = pickle.load(pickle_file)

In [3]:
theta_0 = 0.5
N = 4000
squared_errors = list((results_ate[N][0][:, 1]-theta_0)**2)

In [4]:
w = squared_errors.index(max(squared_errors))
w

1388

In [5]:
results_ate[N][0][w, 1]

-1352.29445450176

In [6]:
results_ate[N][0][w, 4]

0.14885848734684504

In [7]:
with open('opt_params_svm.pkl', 'rb') as pickle_file:
    opt_params_svm = pickle.load(pickle_file)

In [8]:
# Get SVM models from hyperparameters
def get_models(svm_params_dict):
    model_g = []
    for d in [0, 1]:
        model = SVR()
        model.set_params(**svm_params_dict[f'g{d}'])
        model_g.append(model)
    model_m = SVC(probability=True, random_state=42)
    model_m.set_params(**svm_params_dict['m'])
    return model_g, model_m

In [9]:
rng = np.random.default_rng(seed=123)
for j in range(w+1):
    y_data, d_data, x_data = get_data(N, rng)
    if j == w:        
        scaler = StandardScaler()
        x_data_stand = scaler.fit_transform(x_data)
        model_g, model_m = get_models(opt_params_svm[N][j])

In [10]:
dml_parallel_ate(y_data, d_data, [x_data, x_data_stand], model_g, model_m)

(array([-1.35229445e+03,  4.62589515e-01, -5.77611376e+03]),
 85547.11935761635,
 array([-4003.37843479,  1298.78952579]),
 array([0.35830321, 0.36560822, 0.05270982]))

In [11]:
m_bounds = None

In [12]:
# Process one data split in the cross-fitting procedure
def process_single_split(train_indices, eval_indices):
    y_train, d_train, x_train = y_data[train_indices], d_data[train_indices], x_data_stand[train_indices]
    y_eval, d_eval, x_eval = y_data[eval_indices], d_data[eval_indices], x_data_stand[eval_indices]
    
    # Estimate single nuisance function
    def fit_predict(model, X, y):
        model.fit(X, y)
        if is_regressor(model):
            return model.predict(x_eval)
        else:
            return model.predict_proba(x_eval)[:, 1]

    # Estimate outcome regression functions g_0(d) and propensity score m_0 in parallel
    model_data_list = [(model_g[d], x_train[d_train==d], y_train[d_train==d]) for d in [0, 1]] + [(model_m, x_train, d_train)]
    eta_0_hat = Parallel(n_jobs=3)(delayed(fit_predict)(model, X, y) for model, X, y in model_data_list)
    g_0_hat, m_0_hat = eta_0_hat[:2], eta_0_hat[2]
    if m_bounds is not None:
        np.clip(m_0_hat, m_bounds[0], m_bounds[1], out=m_0_hat)

    # Compute auxiliary estimator
    scores = g_0_hat[1] - g_0_hat[0] + d_eval*(y_eval-g_0_hat[1])/m_0_hat - (1-d_eval)*(y_eval-g_0_hat[0])/(1-m_0_hat)
    theta_0_check = np.mean(scores)

    return theta_0_check, scores

# Partition the data for cross-fitting
K = 5
skf = StratifiedKFold(n_splits=K, shuffle=False)

# Cross-fitting, where the different splits are processed in parallel
results = Parallel(n_jobs=K)(delayed(process_single_split)(train_indices, eval_indices) for train_indices, eval_indices in skf.split(X=x_data, y=d_data))

# Collect results (in particular the auxiliary estimators)
theta_0_check_list = [result[0] for result in results]
scores_list = [result[1] for result in results]   # Needed for variance estimation

# Compute final estimator
theta_0_hat = np.mean(theta_0_check_list)
theta_0_hat

-1352.29445450176

In [13]:
theta_0_check_list

[0.5587124965541486,
 0.4735872401881259,
 0.6692981026652043,
 0.27118374203686496,
 -6763.4450540902435]

In [14]:
skf = StratifiedKFold(n_splits=K, shuffle=False)
indices_dict = {}

for k, (train_indices, eval_indices) in enumerate(skf.split(X=x_data, y=d_data)):
    indices_dict[k] = (train_indices, eval_indices)

train_indices, eval_indices = indices_dict[4]

In [15]:
y_train, d_train, x_train = y_data[train_indices], d_data[train_indices], x_data_stand[train_indices]
y_eval, d_eval, x_eval = y_data[eval_indices], d_data[eval_indices], x_data_stand[eval_indices]

# Estimate single nuisance function
def fit_predict(model, X, y):
    model.fit(X, y)
    if is_regressor(model):
        return model.predict(x_eval)
    else:
        return model.predict_proba(x_eval)[:, 1]

# Estimate outcome regression functions g_0(d) and propensity score m_0 in parallel
model_data_list = [(model_g[d], x_train[d_train==d], y_train[d_train==d]) for d in [0, 1]] + [(model_m, x_train, d_train)]
eta_0_hat = Parallel(n_jobs=3)(delayed(fit_predict)(model, X, y) for model, X, y in model_data_list)
g_0_hat, m_0_hat = eta_0_hat[:2], eta_0_hat[2]
if m_bounds is not None:
    np.clip(m_0_hat, m_bounds[0], m_bounds[1], out=m_0_hat)

# Compute auxiliary estimator
scores = g_0_hat[1] - g_0_hat[0] + d_eval*(y_eval-g_0_hat[1])/m_0_hat - (1-d_eval)*(y_eval-g_0_hat[0])/(1-m_0_hat)
theta_0_check = np.mean(scores)
theta_0_check

-6763.4450540902435

In [16]:
sorted(list(scores))

[-5411150.828504604,
 -16.821971534391906,
 -15.699014828635352,
 -13.03668543478901,
 -8.207365048423398,
 -7.272123091860534,
 -6.247049806229996,
 -6.211853986277343,
 -5.944535719030866,
 -5.830064683943018,
 -5.785888734053568,
 -5.613635295500147,
 -5.181124141469968,
 -5.16421099199615,
 -5.076503845575961,
 -5.003185778047177,
 -4.763383851694446,
 -4.748983528497509,
 -4.399268990422936,
 -4.382817284655438,
 -4.367708701431104,
 -4.280375624684269,
 -4.251698552628117,
 -4.203461163179374,
 -4.1650191845576785,
 -4.0979214723695065,
 -4.076717682026925,
 -4.073333762643828,
 -3.9859957618760165,
 -3.858164127103856,
 -3.8202093968945614,
 -3.6319477922759953,
 -3.5979730324089467,
 -3.595681084862091,
 -3.4063230922515757,
 -3.330221412893445,
 -3.282141132828843,
 -3.226651408645421,
 -3.2077558044295724,
 -3.1978283980138618,
 -3.165844750844802,
 -3.0716481296234592,
 -3.0448159038677574,
 -2.983744389690024,
 -2.8866871111624626,
 -2.853388192634715,
 -2.6619739846528825,

In [17]:
i = list(scores).index(min(list(scores)))
i

498

In [18]:
m_0_hat[i]

0.9999987293794964

In [19]:
m_0(x_data[eval_indices[i]])

array([0.99845833])

In [20]:
d_eval[i]

0.0

In [21]:
clipped_indices = [i for i, m in enumerate(m_0_hat) if m > 0.995]

In [22]:
m_0(x_data[eval_indices[clipped_indices]])

array([0.99929969, 0.99806271, 0.99845833, 0.99955671, 0.99594666])

In [23]:
d_eval[clipped_indices]

array([1., 1., 0., 1., 1.])

In [24]:
m_bounds = (0.005, 0.995)

In [25]:
y_train, d_train, x_train = y_data[train_indices], d_data[train_indices], x_data_stand[train_indices]
y_eval, d_eval, x_eval = y_data[eval_indices], d_data[eval_indices], x_data_stand[eval_indices]

# Estimate single nuisance function
def fit_predict(model, X, y):
    model.fit(X, y)
    if is_regressor(model):
        return model.predict(x_eval)
    else:
        return model.predict_proba(x_eval)[:, 1]

# Estimate outcome regression functions g_0(d) and propensity score m_0 in parallel
model_data_list = [(model_g[d], x_train[d_train==d], y_train[d_train==d]) for d in [0, 1]] + [(model_m, x_train, d_train)]
eta_0_hat = Parallel(n_jobs=3)(delayed(fit_predict)(model, X, y) for model, X, y in model_data_list)
g_0_hat, m_0_hat = eta_0_hat[:2], eta_0_hat[2]
if m_bounds is not None:
    np.clip(m_0_hat, m_bounds[0], m_bounds[1], out=m_0_hat)

# Compute auxiliary estimator
scores = g_0_hat[1] - g_0_hat[0] + d_eval*(y_eval-g_0_hat[1])/m_0_hat - (1-d_eval)*(y_eval-g_0_hat[0])/(1-m_0_hat)
theta_0_check = np.mean(scores)
theta_0_check

-1.228494246728636

In [26]:
scores[clipped_indices]

array([-1.55262023e+00,  1.09451886e+00, -1.37757994e+03,  2.87750296e-01,
        1.00405423e+00])

In [27]:
# Process one data split in the cross-fitting procedure
def process_single_split(train_indices, eval_indices):
    y_train, d_train, x_train = y_data[train_indices], d_data[train_indices], x_data_stand[train_indices]
    y_eval, d_eval, x_eval = y_data[eval_indices], d_data[eval_indices], x_data_stand[eval_indices]
    
    # Estimate single nuisance function
    def fit_predict(model, X, y):
        model.fit(X, y)
        if is_regressor(model):
            return model.predict(x_eval)
        else:
            return model.predict_proba(x_eval)[:, 1]

    # Estimate outcome regression functions g_0(d) and propensity score m_0 in parallel
    model_data_list = [(model_g[d], x_train[d_train==d], y_train[d_train==d]) for d in [0, 1]] + [(model_m, x_train, d_train)]
    eta_0_hat = Parallel(n_jobs=3)(delayed(fit_predict)(model, X, y) for model, X, y in model_data_list)
    g_0_hat, m_0_hat = eta_0_hat[:2], eta_0_hat[2]
    if m_bounds is not None:
        np.clip(m_0_hat, m_bounds[0], m_bounds[1], out=m_0_hat)

    # Compute auxiliary estimator
    scores = g_0_hat[1] - g_0_hat[0] + d_eval*(y_eval-g_0_hat[1])/m_0_hat - (1-d_eval)*(y_eval-g_0_hat[0])/(1-m_0_hat)
    theta_0_check = np.mean(scores)

    return theta_0_check, scores

# Partition the data for cross-fitting
K = 5
skf = StratifiedKFold(n_splits=K, shuffle=False)

# Cross-fitting, where the different splits are processed in parallel
results = Parallel(n_jobs=K)(delayed(process_single_split)(train_indices, eval_indices) for train_indices, eval_indices in skf.split(X=x_data, y=d_data))

# Collect results (in particular the auxiliary estimators)
theta_0_check_list = [result[0] for result in results]
scores_list = [result[1] for result in results]   # Needed for variance estimation

# Compute final estimator
theta_0_hat = np.mean(theta_0_check_list)
theta_0_hat

0.14885848734686516

In [28]:
theta_0_check_list

[0.5587124965541486,
 0.4735889811735801,
 0.6693018356524716,
 0.2711833700827615,
 -1.228494246728636]