In [53]:
import numpy as np
from numpy.random import randn
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression

def simulate(s, alpha, beta, gamma, delta, n_samples):
    np.random.default_rng(seed=s)
    I = randn(n_samples)
    W = randn(n_samples)
    X = alpha * I + gamma * W + randn(n_samples)
    Y = beta * X + delta * W + randn(n_samples)
    return pd.DataFrame({ "I": I, "W": W, "X": X,"Y":Y})

def instrumentalData(s, alpha, beta, gamma, delta, n_samples):
    df = simulate(1, alpha, beta, gamma, delta, n_samples)
    Y = df["Y"].values.reshape(-1, 1)
    X = df["X"].values.reshape(-1, 1)
    I = df["I"].values.reshape(-1, 1)
    return (I,X,Y)

We simulate data for the structural causal model for a set of values for $\alpha, \beta, \gamma, \delta$:
    \begin{aligned}
    I &= \epsilon_I \\
    W &= \epsilon_W\\
    X &= \alpha I + \gamma W + \epsilon_X \\
    Y &= \beta X + \delta W + \epsilon_Y \\
    \epsilon_I, \epsilon_W,  \epsilon_X, \epsilon_Y  &\sim N(0,1) 
    \end{aligned}

In [54]:
alpha=5
beta=2
gamma=3
delta=4
n_samples = 1000

(I,X,Y) = instrumentalData(1, alpha, beta, gamma, delta, n_samples)

For given values of alpha, beta, gamma and delta, we show that the standard regression always predicts beta in a biased way, while the Instrumental variable method is closer to the true value of beta. Try changing the alpha, beta, gamma and delta parameters and see how the linear regression coefficient varies. What happens if gamma = 0?

In [55]:
# Beta estimated with linear regression
linear_regressor = LinearRegression()
linear_regressor.fit(X, Y)  # we don't add W as W is unobserved which explains why regressing only Y on X produce bias
beta_ols = linear_regressor.coef_[0][0]
print ("Estimated beta from linear regression: ", beta_ols)

# Beta estimated with IV method:
beta_IV = np.cov(I.T, Y.T)[1,0]/np.cov(I.T, X.T)[1,0]
print("Beta from Instrumental variable method: ", beta_IV)

# True beta in this example:
print("True beta: ", beta)

Estimated beta from linear regression:  2.37347606001822
Beta from Instrumental variable method:  2.046492775111044
True beta:  2


We will now try a simplified version of Two Stage Least Squares:

In [56]:
# Simplified 2SLS

# Step 1: regress X on I 

linear_regressor = LinearRegression()
linear_regressor.fit(I, X)
alpha_hat = linear_regressor.coef_[0][0]
print("Alpha_hat", alpha_hat, " True alpha", alpha)

# Step 2: create X_hat
X_hat = alpha_hat * I

# Step 3: regress Y on X_hat
linear_regressor = LinearRegression()
linear_regressor.fit(X_hat, Y)
beta_2SLS = linear_regressor.coef_[0][0]
print("Beta from Two Stage Least Squares: ", beta_2SLS, "True beta", beta)

Alpha_hat 5.121894916006067  True alpha 5
Beta from Two Stage Least Squares:  2.0464927751110435 True beta 2


We can now see that even if the estimated beta with Instrumental Variables and Two Stage Least Squares is unbiased, for small values of alpha, its variance is usually larger than the variance of the OLS estimator. This is true even in settings with a lot of data (e.g. 10k). Try changing alpha to a larger number (e.g. 100) and see how the variance changes.

In [63]:
beta_ols_list = []
beta_IV_list = []
beta_2SLS_list = []

alpha=5
beta=2
gamma=3
delta=4
n_datasets = 100

# We can run comparisons between the estimators on multiple sampled datasets.
def run_comparison(nsamples):
    for i in range(1,n_datasets):
        (I,X,Y) = instrumentalData(i, alpha, beta, gamma, delta, nsamples)
        linear_regressor = LinearRegression()
        linear_regressor.fit(X, Y)
        beta_ols = linear_regressor.coef_[0][0]
        beta_ols_list.append(beta_ols)
        beta_IV_list.append(np.cov(I.T, Y.T)[1,0]/np.cov(I.T, X.T)[1,0])
        linear_regressor = LinearRegression()
        linear_regressor.fit(I, X)
        alpha_hat = linear_regressor.coef_[0][0]
        linear_regressor = LinearRegression()
        linear_regressor.fit(alpha_hat * I, Y)
        beta_2SLS_list.append(linear_regressor.coef_[0][0])
    print("OLS :", np.mean(beta_ols_list), np.var(beta_ols_list))
    print("IV ",np.mean(beta_IV_list), np.var(beta_IV_list))
    print("2SLS ", np.mean(beta_2SLS_list), np.var(beta_2SLS_list))
    print("True beta: ", beta)

In [64]:
run_comparison(10)

OLS : 2.001888926813252 0.00024379321177126411
IV  2.000469533137782 0.00025183142797404075
2SLS  2.000469533137782 0.00025183142797404086
True beta:  2


In [65]:
run_comparison(100)

OLS : 2.0013431178602463 0.00012904911240265874
IV  2.0000411498918855 0.00013282082992212342
2SLS  2.0000411498918855 0.00013282082992212356
True beta:  2


In [66]:
run_comparison(1000)

OLS : 2.001262042571669 8.672855320210838e-05
IV  1.9999963738016933 8.924606208669998e-05
2SLS  1.9999963738016933 8.924606208670009e-05
True beta:  2


In [67]:
run_comparison(10000)

OLS : 2.00123289564692 6.509777295510617e-05
IV  1.999983517436772 6.698367337297387e-05
2SLS  1.999983517436772 6.698367337297396e-05
True beta:  2


In [68]:
run_comparison(100000)

OLS : 2.0012264415085865 5.2081727286437575e-05
IV  1.9999873528693255 5.359039916000093e-05
2SLS  1.9999873528693253 5.359039916000102e-05
True beta:  2
