In [7]:
import numpy as np
import math
import pandas as pd 
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from collections import defaultdict
from scipy.special import expit
basic_path = '../Simulations/'

def _t_learner_ate(model, features, Y, T):
    """Estimate the ATE with one outcome regression per treatment arm.

    ``model`` is fitted on the control rows (``T == 0``) and then refitted on
    the treated rows (``T == 1``).  Each fit predicts the outcome of *every*
    unit, and the mean difference of the two prediction vectors is returned.
    """
    control = (T == 0)
    treated = (T == 1)
    y0 = model.fit(features[control], Y[control]).predict(features)
    y1 = model.fit(features[treated], Y[treated]).predict(features)
    return np.mean(y1 - y0)


def CalcRegressionError(gamma_T=1, estimator='ols', proxy_type='Gaussian',
                        n_reps=50, p=1500):
    """Simulate ATE estimates from noisy proxies of a hidden confounder.

    Every replication draws a hidden confounder ``U`` (N x 5), an extra
    variable ``I`` that affects the treatment and the proxies but not the
    outcome, a binary treatment ``T``, an outcome ``Y`` whose true treatment
    effect is ``tau = 2``, and a proxy matrix ``X`` (N x p) with
    ``N = p + 50``.  The chosen estimator is then applied and its ATE
    estimate recorded.

    The global NumPy RNG is re-seeded with 42 on every call, so two calls
    with different arguments are driven by the same random stream.

    Parameters
    ----------
    gamma_T : float, default 1
        Coefficient of ``I`` in the treatment-assignment logit (I -> T).
    estimator : str, default 'ols'
        How the ATE is estimated in each replication:

        * ``'ols'``       - per-arm ``LinearRegression`` on ``X`` (the
          estimator that was active in the original code);
        * ``'lasso'``     - per-arm ``Lasso(alpha=1e-3, max_iter=800)`` on ``X``;
        * ``'ridge'``     - per-arm ``Ridge(alpha=3.0)`` on ``X``;
        * ``'ols_joint'`` - a single ``LinearRegression`` of ``Y`` on
          ``[T, X]``; the estimate is the coefficient of ``T``;
        * ``'oracle'``    - per-arm ``LinearRegression`` on the unobserved
          ``[I, U]`` instead of the proxies.
    proxy_type : {'Gaussian', 'Binary', 'Poisson'}, default 'Gaussian'
        Distribution used to generate the proxies ``X``.
    n_reps : int, default 50
        Number of simulation replications.
    p : int, default 1500
        Number of proxies (columns of ``X``).

    Returns
    -------
    numpy.ndarray
        One ATE estimate per replication, shape ``(n_reps,)``.

    Raises
    ------
    ValueError
        If ``estimator`` or ``proxy_type`` is not one of the supported values.
    """
    estimators = ('ols', 'lasso', 'ridge', 'ols_joint', 'oracle')
    proxy_types = ('Gaussian', 'Binary', 'Poisson')
    # Validate before any simulation work so a typo fails immediately instead
    # of surfacing later as an unbound-variable error inside the loop.
    if estimator not in estimators:
        raise ValueError(
            "unknown estimator %r; expected one of %s" % (estimator, estimators))
    if proxy_type not in proxy_types:
        raise ValueError(
            "unknown proxy_type %r; expected one of %s" % (proxy_type, proxy_types))

    # Fixed seed: every call replays the same random stream.
    np.random.seed(42)

    r = 5         # confounder's dimension
    gamma_V = 1   # coefficient Ui -> X
    delta = 1     # coefficient U -> Y
    tau = 2       # causal effect
    alpha = np.array([-2, -3, -2, -3, -2]) * delta
    beta = np.array([1, 2, 2, 2, 2])
    N = p + 50    # number of samples

    # Only the model that is actually needed is constructed.
    if estimator == 'lasso':
        model = Lasso(alpha=1e-3, random_state=42, max_iter=800)
    elif estimator == 'ridge':
        model = Ridge(alpha=3.0, random_state=42)
    else:
        model = LinearRegression(n_jobs=-1)

    ate_vec = []

    V = np.random.randn(r, p)  # "We keep V fixed across the replications"
    Vi = gamma_V * np.random.randn(1, p)
    for _ in range(n_reps):
        # The order of the random draws below is kept exactly as in the
        # original code so that seeded results stay reproducible.
        U = np.random.randn(N, r)
        I = np.random.randn(N)
        T = np.random.binomial(np.ones(N, dtype=int),
                               expit(U.dot(beta) + gamma_T * I))
        Y = np.random.randn(N) + U.dot(alpha) + tau * T

        signal = U.dot(V) + np.outer(I, Vi)
        if proxy_type == 'Binary':
            X = np.random.binomial(1, expit(signal))
        elif proxy_type == 'Poisson':
            # NOTE(review): expit() lies in [0, 1], so ceil() maps essentially
            # every entry to 1 and the Poisson rate no longer depends on U or
            # I.  Kept as in the original -- confirm the intended rate.
            X = np.random.poisson(lam=np.ceil(expit(signal)))
        else:  # 'Gaussian'
            X = 5 * np.random.randn(N, p) + signal

        if estimator == 'ols_joint':
            TX = np.concatenate((T.reshape(N, 1), X), axis=1)
            model.fit(TX, Y)
            ate = model.coef_[0]
        elif estimator == 'oracle':
            IU = np.concatenate((I.reshape(N, 1), U), axis=1)
            ate = _t_learner_ate(model, IU, Y, T)
        else:  # 'ols', 'lasso', 'ridge'
            ate = _t_learner_ate(model, X, Y, T)

        ate_vec.append(ate)

    return np.array(ate_vec)


In [25]:
# Run the simulation for a weak (gamma_T=1) and a strong (gamma_T=10)
# effect of I on the treatment.
ate1 = CalcRegressionError(gamma_T=1)
ate10 = CalcRegressionError(gamma_T=10)


def _report(tag, estimates, truth=2):
    """Print MSE, bias and variance of the ATE estimates.

    `truth` is the value the estimates are compared against; `tag` is
    appended to each printed label.
    """
    deviation = estimates - truth
    print("mse" + tag + " = " + str(np.mean(deviation**2)))
    print("bias" + tag + " = " + str(np.mean(estimates) - truth))
    print("var" + tag + " = " + str(np.var(estimates)))


_report("1", ate1)
_report("10", ate10)

mse1 = 0.25169617603753375
bias1 = -0.48794915324854005
var1 = 0.0136017998815667
mse10 = 0.08533198202721946
bias10 = -0.2560699136332998
var10 = 0.019760181359053882
