In [16]:
import numpy as np
from doubleml import DoubleMLData, DoubleMLPLR
from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import scipy

# One-Sample of Size 1000 for Double ML

In [17]:
# Define parameters

# dimensions
# sample size
N = 1000

# Yi: R, Xi: R^P, Zi: R^q
p = 1
q = 10

# expanded design size
NNp_ratio = 0.1
Np = int(np.ceil(N / NNp_ratio))

# number of repetitions
num_reps = 500

In [18]:
# Define functions
def r0(Z):
    return np.cos(np.sum(Z**2, axis=1))


def g0(Z):
    return np.prod(np.exp(Z), axis=1)


def f0(X):
    beta_0 = 1
    return X * beta_0

In [19]:
# Generation variables
seed = np.random.randint(0, 100)
np.random.seed(seed)

# E: R^12, with Ej ~ Uniform[0, 1/2]
E = np.random.uniform(0, 1/q, size=(N+1, q+2))
# Zij = (Ej+1 + rho * E12) / (1 + rho)
rho = 1
Zt = ((E[:, 1:11] + rho * E[:, 11].reshape(-1, 1)) / (1 + rho))
Z = Zt[1:, :]
Z0 = Zt[0, :].reshape(1, -1)

# or Xi = r0(Zi) + Vi
V = np.random.normal(0, 1, size=(N,))
X = r0(Z) + V

# Yi = f0(Xi) + g0(Z_i) + Ui
U = np.random.normal(0, 1, size=(N,))
Y = f0(X) + g0(Z) + U

In [20]:
# Define data for double ML
dml_data = DoubleMLData.from_arrays(Z, Y, X)
print(dml_data)


------------------ Data summary      ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10']
Instrument variable(s): None
No. Observations: 1000

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 12 entries, X1 to d
dtypes: float64(12)
memory usage: 93.9 KB



In [21]:
# Double ML using linear regression
np.random.seed(seed)

# Define linear regression models
ols_learner = LinearRegression()
ols_g_hat = clone(ols_learner)  # Yi = f0(Xi) + g0(Z_i) + Ui
ols_r_hat = clone(ols_learner)  # Xi = r0(Zi) + Vi

# Initialize LinearDML with OLS models
dml_ols = DoubleMLPLR(dml_data, ml_l=ols_g_hat, ml_m=ols_r_hat, n_folds=10)

# Fit the model
dml_ols.fit(store_models=True)
print(dml_ols)


------------------ Data summary      ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10']
Instrument variable(s): None
No. Observations: 1000

------------------ Score & algorithm ------------------
Score function: partialling out

------------------ Machine learner   ------------------
Learner ml_l: LinearRegression()
Learner ml_m: LinearRegression()
Out-of-sample Performance:
Regression:
Learner ml_l RMSE: [[1.48709269]]
Learner ml_m RMSE: [[1.00309215]]

------------------ Resampling        ------------------
No. folds: 10
No. repeated sample splits: 1

------------------ Fit summary       ------------------
       coef   std err          t          P>|t|    2.5 %    97.5 %
d  1.040381  0.035486  29.318085  6.100561e-189  0.97083  1.109933


In [22]:
# Double ML using random forest
np.random.seed(seed)

# Define random forest models
rf_learner = RandomForestRegressor(n_estimators=q)
rf_g_hat = clone(rf_learner)  # Yi = f0(Xi) + g0(Z_i) + Ui
rf_r_hat = clone(rf_learner)  # Xi = r0(Zi) + Vi

# Initialize LinearDML with random forest models
dml_rf = DoubleMLPLR(dml_data, ml_l=rf_g_hat, ml_m=rf_r_hat, n_folds=10)

# Fit the model
dml_rf.fit(store_models=True)
print(dml_rf)


------------------ Data summary      ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10']
Instrument variable(s): None
No. Observations: 1000

------------------ Score & algorithm ------------------
Score function: partialling out

------------------ Machine learner   ------------------
Learner ml_l: RandomForestRegressor(n_estimators=10)
Learner ml_m: RandomForestRegressor(n_estimators=10)
Out-of-sample Performance:
Regression:
Learner ml_l RMSE: [[1.59688187]]
Learner ml_m RMSE: [[1.0847613]]

------------------ Resampling        ------------------
No. folds: 10
No. repeated sample splits: 1

------------------ Fit summary       ------------------
       coef   std err         t          P>|t|    2.5 %    97.5 %
d  0.908012  0.040053  22.67029  8.801177e-114  0.82951  0.986515


In [23]:
# Double ML using NN
nn_learner = MLPRegressor( 
    hidden_layer_sizes=(128, 64, 64, 128),
    activation="relu",
    solver="adam",
    max_iter=5000,
    random_state=seed,
)
# Define neural network models
nn_g_hat = clone(nn_learner)  # Yi = f0(Xi) + g0(Z_i) + Ui
nn_r_hat = clone(nn_learner)  # Xi = r0(Zi) + Vi

# Initialize LinearDML with neural networks
dml_nn = DoubleMLPLR(dml_data, ml_l=nn_g_hat, ml_m=nn_r_hat, n_folds=10)

# Fit the model
dml_nn.fit(store_models=True)
print(dml_nn)


------------------ Data summary      ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10']
Instrument variable(s): None
No. Observations: 1000

------------------ Score & algorithm ------------------
Score function: partialling out

------------------ Machine learner   ------------------
Learner ml_l: MLPRegressor(hidden_layer_sizes=(128, 64, 64, 128), max_iter=5000,
             random_state=4)
Learner ml_m: MLPRegressor(hidden_layer_sizes=(128, 64, 64, 128), max_iter=5000,
             random_state=4)
Out-of-sample Performance:
Regression:
Learner ml_l RMSE: [[1.47493188]]
Learner ml_m RMSE: [[0.99593571]]

------------------ Resampling        ------------------
No. folds: 10
No. repeated sample splits: 1

------------------ Fit summary       ------------------
       coef   std err         t          P>|t|     2.5 %    97.5 %
d  1.034341  0.035578  29.07237  8.026175e-186  0.964609  1.104073


In [24]:
print(dml_ols.coef[0], dml_rf.coef[0], dml_nn.coef[0])

1.0403813798715436 0.9080122783131876 1.0343408633525129


## Expanded Design

In [26]:
np.random.seed(seed)
# Zi' = Zi + epsi
Eps = np.random.normal(0, 1/q, (Np, q))
Zp = Z0 + Eps

In [27]:
def CI_r0_round(Z, X, Z0, Zp, r_hat, alpha=0.05, seed=0):
    r_hat.fit(Z, X)
    X_hat = r_hat.predict(Z)
    X_hatp = r_hat.predict(Zp)
    
    mean_X_hatp = np.mean(X_hatp)
    delta = np.mean(X_hat - X)
    sigma_hat1 = np.mean((X_hat - X - delta) ** 2)
    sigma_hat2 = np.mean((X_hatp - mean_X_hatp) ** 2)

    def w_theta(alpha):
        return scipy.stats.norm.ppf(1 - alpha / (2 * p)) * np.sqrt(
            sigma_hat1 / N + sigma_hat2 / Np
        )

    def CI_r0(alpha):
        w = w_theta(alpha)
        lower = mean_X_hatp - delta - w
        upper = mean_X_hatp - delta + w
        return [lower, upper]

    def does_CI_cover(CI, truth):
        if truth > CI[0] and truth < CI[1]:
            return True
        else:
            return False

    print(CI_r0(alpha))
    print(r0(Z0))
    return does_CI_cover(CI_r0(alpha), r0(Z0))

In [28]:
CI_r0_round(Z, X, Z0, Zp, ols_r_hat, alpha=0.05, seed=seed)

[1.0050560987509412, 1.1294946668400085]
[0.99985939]


False

In [29]:
CI_r0_round(Z, X, Z0, Zp, rf_r_hat, alpha=0.05, seed=seed)

[0.8137977499456142, 0.874283497983563]
[0.99985939]


False

In [30]:
CI_r0_round(Z, X, Z0, Zp, nn_r_hat, alpha=0.05, seed=seed)

[0.9434907677072256, 1.0668945144056228]
[0.99985939]


True

In [41]:
print(Z0)

[[0.03724587 0.05851847 0.04562505 0.04477069 0.02068873 0.05869798
  0.01019577 0.02253337 0.03162383 0.0488534 ]]
