# Estimator Benchmarks

Notebook for benchmarking known instrumental-variable (IV) estimators against different data-generating processes.

Current roster:
- Split-sample IV
- 2SLS
- Jackknife IV
- LIML **TODO**
- Mostly harmless ML **TODO**
- DeepIV **TODO**
- DoubleML **TODO**

# Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from linearmodels.iv import IV2SLS

In [4]:
# This assumes the notebook is run from inside the notebooks/ folder, so the
# relative path below is resolved against that working directory.
import sys

# Extend the import path only once: re-running this cell would otherwise
# append a duplicate entry to sys.path on every execution.
if '../src/data' not in sys.path:
    sys.path.append('../src/data')
from iv_data_generator import LinearNormalDataGenerator

# Example dataset drawn with the generator's default constructor arguments.
df = LinearNormalDataGenerator().generate()

# TabPFN-style data generating process

In [5]:
import torch
from torch import nn
import torch.nn.functional as F

In [31]:
# Width of the zero-padded input vectors fed to IVGenerator's linear layers;
# used as the default for its max_vars argument.
MAX_VARS = 100

class GaussianNoise(nn.Module):
    """
    Layer that perturbs its input with additive zero-mean Gaussian noise.

    Lifted from https://github.com/automl/TabPFN/blob/main/tabpfn/priors/mlp.py
    TODO introduce shared covariance matrix

    Args:
        std: standard deviation handed to ``torch.normal`` for the noise draw.
        device: kept on the instance; ``forward`` does not read it.
    """
    def __init__(self, std, device):
        super().__init__()
        self.device = device
        self.std = std

    def forward(self, x):
        """Return ``x`` plus a fresh noise draw of the same shape as ``x``."""
        noise = torch.normal(mean=torch.zeros_like(x), std=self.std)
        return x + noise


def causes_sampler_f(num_causes):
    """Draw a random mean and a non-negative spread for each cause.

    Means are standard-normal draws. Each spread is the absolute value of a
    second standard-normal draw multiplied by the matching mean.

    Returns:
        Tuple ``(means, std)`` of arrays with ``num_causes`` entries each.
    """
    centres = np.random.normal(loc=0, scale=1, size=num_causes)
    multipliers = np.random.normal(loc=0, scale=1, size=num_causes)
    spreads = np.abs(multipliers * centres)
    return centres, spreads

class IVGenerator(nn.Module):
    """
    Neural network to generate IV datasets.

    Each sample is built as follows:

    1. Draw ``n_confounds`` confounder values and ``n_instruments``
       instrument values (standard normal, or multivariate normal when a
       covariance matrix is supplied) and zero-pad each to ``max_vars``.
    2. Map each padded vector to a scalar through its own linear layer and
       activation.
    3. ``treat = confounds + instruments``
    4. ``outcome = tau * tau_activation(treat) + confounds + N(0, 1)``

    Args:
        n_instruments: number of active (non-padded) instruments.
        tau: coefficient on the (activated) treatment in the outcome equation.
        confound_str: confounding strength. NOTE(review): stored on the
            instance but not yet applied in any equation -- confirm where it
            should enter.
        tau_activation: name of the activation applied to the treatment.
        instrument_activation: name of the activation applied to the
            instrument index.
        confounder_covariance: optional ``(n_confounds, n_confounds)``
            covariance used to draw the confounders; identity when ``None``.
        instrument_covariance: optional ``(n_instruments, n_instruments)``
            covariance used to draw the instruments; identity when ``None``.
        max_vars: padded width of the confounder and instrument vectors.
        n_confounds: number of active (non-padded) confounders.
        confounder_activation: name of the activation applied to the
            confounder index.

    Raises:
        ValueError: if a count lies outside ``[0, max_vars]``, a covariance
            matrix has the wrong shape, or an activation name is unknown.

    TODO add ability to generate IV controls to cover the Angrist/Frandsen case
    """
    def __init__(self,
                n_instruments: int,
                tau: float,
                confound_str: float,
                tau_activation: str = 'identity',
                instrument_activation: str = 'identity',
                confounder_covariance: torch.Tensor = None,
                instrument_covariance: torch.Tensor = None,
                max_vars: int = MAX_VARS,
                n_confounds: int = 1,
                confounder_activation: str = 'identity',
                ):
        super().__init__()

        # Raise instead of assert so validation survives `python -O`.
        for label, count in (('n_confounds', n_confounds),
                             ('n_instruments', n_instruments)):
            if not 0 <= count <= max_vars:
                raise ValueError(
                    f"{label} must be between 0 and max_vars={max_vars}, got {count}"
                )
        self._check_covariance('confounder_covariance', confounder_covariance, n_confounds)
        self._check_covariance('instrument_covariance', instrument_covariance, n_instruments)

        self.max_vars = max_vars
        self.confounders = nn.Linear(max_vars, 1)
        self.instruments = nn.Linear(max_vars, 1)

        self.n_confounds = n_confounds
        self.n_instruments = n_instruments
        self.confounder_covariance = confounder_covariance
        self.instrument_covariance = instrument_covariance

        self.tau = tau
        self.confound_str = confound_str

        self.activations = {
            'identity': lambda x: x,
            'relu': F.relu,
            'sigmoid': torch.sigmoid,
            'tanh': torch.tanh,
            'softplus': F.softplus,
            'leaky_relu': F.leaky_relu,
            'elu': F.elu,
        }

        self.tau_activation = self._resolve_activation(tau_activation)
        self.confounder_activation = self._resolve_activation(confounder_activation)
        self.instrument_activation = self._resolve_activation(instrument_activation)

    @staticmethod
    def _check_covariance(label, covariance, dim):
        """Raise ValueError unless ``covariance`` is None or has shape (dim, dim)."""
        if covariance is not None and tuple(covariance.shape) != (dim, dim):
            raise ValueError(
                f"{label} must have shape ({dim}, {dim}), got {tuple(covariance.shape)}"
            )

    def _resolve_activation(self, name):
        """Look up an activation by name, listing the valid names on failure."""
        try:
            return self.activations[name]
        except KeyError:
            raise ValueError(
                f"Unknown activation {name!r}; choose from {sorted(self.activations)}"
            ) from None

    def _sample_padded(self, n_active, covariance):
        """Draw ``n_active`` normal values and zero-pad them to ``max_vars``."""
        # Match the linear layers' dtype/device so the sample can be fed to them
        # even after the module has been moved or cast.
        ref = self.confounders.weight
        if covariance is None:
            active = torch.randn(n_active, dtype=ref.dtype, device=ref.device)
        else:
            dist = torch.distributions.MultivariateNormal(
                torch.zeros(n_active, dtype=ref.dtype, device=ref.device),
                covariance_matrix=covariance.to(dtype=ref.dtype, device=ref.device),
            )
            active = dist.sample()
        padding = torch.zeros(self.max_vars - n_active, dtype=ref.dtype, device=ref.device)
        return torch.cat([active, padding])

    def forward(self):
        """Generates a single data sample.

        Returns:
            1-D tensor of length ``2 + 2 * max_vars`` laid out as
            ``[T, Y, X (padded confounders), Z (padded instruments)]``.
        """
        confound_sample = self._sample_padded(self.n_confounds, self.confounder_covariance)
        instrument_sample = self._sample_padded(self.n_instruments, self.instrument_covariance)

        # The same confounder index enters both treatment and outcome, which
        # is what makes the treatment endogenous.
        confounds = self.confounder_activation(self.confounders(confound_sample))
        treat = confounds + self.instrument_activation(self.instruments(instrument_sample))

        outcome_noise = torch.randn(1, dtype=treat.dtype, device=treat.device)
        outcome = self.tau * self.tau_activation(treat) + confounds + outcome_noise

        # return data matrix of T, Y, X, Z
        return torch.cat([treat, outcome, confound_sample, instrument_sample])

    def batch(self, batch_size: int):
        """Generate batch of examples.

        Returns:
            Tensor of shape ``(batch_size, 2 + 2 * max_vars)``.

        Raises:
            ValueError: if ``batch_size`` is negative.
        """
        if batch_size < 0:
            raise ValueError(f"batch_size must be non-negative, got {batch_size}")
        if batch_size == 0:
            # torch.stack rejects an empty list, so build the empty batch directly.
            return self.confounders.weight.new_empty((0, 2 + 2 * self.max_vars))
        return torch.stack([self() for _ in range(batch_size)])


In [33]:
# One active confounder and seven active instruments, with tau and
# confound_str both set to 1.
iv_gen = IVGenerator(
    n_confounds=1,
    n_instruments=7,
    tau=1,
    confound_str=1,
)

# Generate 100 samples, then detach them from autograd before converting
# to a NumPy array.
data = iv_gen.batch(100)
data = data.detach().numpy()

In [34]:
# Dimensions of the generated batch: (number of samples, values per sample).
data.shape

(100, 202)

In [8]:
# Scratch check of the zero-padding pattern: start from 7 standard-normal draws.
tensor = torch.randn(7)

In [10]:
# Appending 97 zeros to the 7 draws yields 104 entries.
# NOTE(review): padding up to MAX_VARS = 100 would take 93 zeros -- confirm which was intended.
torch.cat([tensor, torch.zeros(97)], dim=0).size()

torch.Size([104])

# Lennon et al. 2022 Figure 2 Replication

TODO