In [1]:
import numpy as np
from doubleml import DoubleMLData, DoubleMLPLR
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

# Simulations

## Preliminary Numerical Results

In [2]:
# Define parameters

# dimensions
# sample size
N_set = (100, 300, 500, 700, 900)
N = N_set[0]

# expanded design size
NNp_ratio = 0.1
Np = N / NNp_ratio

# Yi: R, Xi: R^P, Zi: R^q
p = 1
q = 10

# number of repetitions
num_reps = 500

In [3]:
# Define functions
def r0(Z):
    return np.cos(np.sum(Z**2, axis=1))


def g0(Z):
    return np.prod(np.exp(Z), axis=1)


def f0(X):
    beta_0 = 1
    return X * beta_0

In [4]:
# Generation variables
seed = 36
np.random.seed(seed)

# E: R^12, with Ej ~ Uniform[0, 1]
E = np.random.uniform(size=(N, q + 2))
# Zij = (Ej+1 + rho * E12) / (1 + rho)
rho = 1
Z = (E[:, 1:11] + rho * E[:, 11].reshape(-1, 1)) / (1 + rho)

# Xi = (E1 + rho * E12) / (1 + rho)
# X = (E[:, 0] + rho * E[:, 11]) / (1 + rho)
# or Xi = r0(Zi) + Vi
V = np.random.normal(size=(N,))
X = r0(Z) + V

# Yi = f0(Xi) + g0(Z_i) + Ui
U = np.random.normal(size=(N,))
Y = f0(X) + g0(Z) + U

In [5]:
# Generate expanded design
# Zi' = Zi + epsi
Eps = np.random.normal(0, 1, (N, q))
Zp = Z + Eps

In [6]:
# Define data for double ML
dml_data = DoubleMLData.from_arrays(Z, Y, X)
print(dml_data)


------------------ Data summary      ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10']
Instrument variable(s): None
No. Observations: 100

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 12 entries, X1 to d
dtypes: float64(12)
memory usage: 9.5 KB



In [11]:
# Double ML using linear regression

# Define linear regression models
learner = LinearRegression()
f_hat = RandomForestRegressor(
    n_estimators=100, max_features=1, max_depth=5, min_samples_leaf=2
)
g_hat = clone(learner)  # Yi = f0(Xi) + g0(Z_i) + Ui
r_hat = clone(learner)  # Xi = r0(Zi) + Vi

# Initialize LinearDML with OLS models
dml_ols = DoubleMLPLR(dml_data, f_hat, ml_m=r_hat, ml_g=g_hat, score="IV-type")

# Fit the model
dml_ols.fit()
print(dml_ols.summary)
print(dml_ols.confint())

         coef    std err        t     P>|t|      2.5 %      97.5 %
d  111.562998  24.594584  4.53608  0.000006  63.358499  159.767497
       2.5 %      97.5 %
d  63.358499  159.767497


In [12]:
# Double ML using NN

# Define neural network models
learner = MLPRegressor(
    hidden_layer_sizes=(128, 64, 32, 16),
    activation="relu",
    solver="adam",
    max_iter=500,
    random_state=seed,
)
f_hat = RandomForestRegressor(
    n_estimators=100, max_features=1, max_depth=5, min_samples_leaf=2
)
g_hat = clone(learner)  # Yi = f0(Xi) + g0(Z_i) + Ui
r_hat = clone(learner)  # Xi = r0(Zi) + Vi

# Initialize LinearDML with neural networks
dml_nn = DoubleMLPLR(dml_data, f_hat, ml_m=r_hat, ml_g=g_hat, score="IV-type")

# Fit the model
dml_nn.fit()

print(dml_nn.summary)
print(dml_ols.confint())

        coef    std err         t     P>|t|     2.5 %      97.5 %
d  16.040267  47.786479  0.335665  0.737123 -77.61951  109.700044
       2.5 %      97.5 %
d  63.358499  159.767497
