# Problem Set 1

Solutions to Computational Problems

## ATE under Exogeneity

---

# Problem 3

In [1]:
from collections.abc import Callable
from functools import partial

import jax
import jax.numpy as jnp
import numpy as np
import pandas as pd
from jax import random, vmap
from scipy.special import expit

In [2]:
def _simulate_from_model(
    treatment_effect: float | Callable,
    n_samples: int,
    n_sim: int,
    seed: int,
):
    """Simulate data from the model.

    Args:
        treatment_effect (float | Callable): Treatment effect. Must be a float or a
            function that depends on the regressors.
        n_samples (int): Number of samples per simulation.
        n_sim (int): Number of simulations.
        seed (int): Random seed that is passed to jax rng.

    Returns:
        - jax.Array: Outcomes of shape (n_sim, n_samples).
        - jax.Array: Regressors of shape (n_sim, n_samples).
        - jax.Array: Binary treatment status of shape (n_sim, n_samples).

    """
    key = jax.random.PRNGKey(seed)
    subkeys = jax.random.split(key, 3)

    e = random.normal(subkeys[0], (n_sim, n_samples))
    x = random.uniform(subkeys[1], (n_sim, n_samples))

    propensity_score = expit(-(2 * x - 0.5))
    d = random.binomial(subkeys[2], n=1, p=propensity_score)

    te = treatment_effect(x) if callable(treatment_effect) else treatment_effect

    y = te * d + 0.5 * x + e

    return y, x, d

In [3]:
# Define batched versions for least squares and matrix multiplication. These also work
# with an additional leading dimension which corresponds to the number of simulations
# here.
least_squares = vmap(jnp.linalg.lstsq, in_axes=0)
matmul = vmap(jnp.matmul, in_axes=0)


def _estimate_ate_via_naive_ols(y, x, d):
    """Estimate the ATE via the naive OLS estimator.

    This regresses y on (d, x), and returns the coefficient on d.

    """
    # build feature matrix for model y ~ d + x
    features = jnp.stack((d, x), axis=2)

    # get least squares coefficients and return the ate, which corresponds to the
    # coefficient on d
    coef, *_ = least_squares(features, y)
    return coef[:, 0]


@partial(vmap, in_axes=0)
def _estimate_conditional_expectations(y, features, d):
    """Estimate the conditional expectations of y given x for each treatment group."""
    # get least squares cofficients
    coef, *_ = jnp.linalg.lstsq(features, y)

    # predict y using least squares coefficients
    y_pred = features @ coef

    # calculate the estimated conditional means for each treatment group
    mu_hat_1 = (y_pred * d).sum() / d.sum()
    mu_hat_0 = (y_pred * (1 - d)).sum() / (1 - d).sum()
    return mu_hat_1, mu_hat_0


def _estimate_ate_via_conditional_means(y, x, d):
    """Estimate the ATE via the conditional means estimator.

    This estimates the conditional mean of y given x for each treatment group, and then
    takes the difference between the two.

    """
    # build feature matrix for model y ~ 1 + d + d*x + (1-d)*x
    intercept = jnp.ones_like(x)
    features = jnp.stack((intercept, d, d * x, (1 - d) * x), axis=2)

    # estimate conditional expectations
    mu_hat_1, mu_hat_0 = _estimate_conditional_expectations(y, features, d)

    # compute the ate as the difference between the two conditional expectations
    return mu_hat_1 - mu_hat_0

## Instrumental Variables

--- 

## Problem 4

In [4]:
data = pd.read_feather("data/angrist_and_evans.feather")

> If you don't have the data stored as a feather file you can use
> ```python
> pd.read_excel("path/to/excel/file")
> ```

In [5]:
dependent_variables_info = {
    "workedm": "Worked for pay",
    "weeksm1": "Weeks worked",
    "hourswm": "Hours worked",
    "incomem": "Labor income",
    "famincl": "Log family income",
}

independent_variables = [
    "morekids",
    "agem1",
    "agefstm",
    "boy1st",
    "boy2nd",
    "blackm",
    "hispm",
    "othracem",
]

instrumental_mapping = {"morekids": "samesex"}

### Functions

In [6]:
def _cov_sandwich_estimator_ols(x: np.ndarray, e: np.ndarray) -> np.ndarray:
    r"""Estimator for the asymptotic covariance matrix of OLS estimator.

    The code corresponds to the HC0 estimator of $V_\hat{\beta}}$. For reference see
    Section 7.8 in Econometrics by Bruce Hansen (version January 29, 2021).

    Args:
        x (np.ndarray): Regressors, shape (n, p).
        e (np.ndarray): Residuals, shape (n,)

    Returns:
        np.ndarray: Asymptotic covariance matrix, shape (p, p)

    """
    xtx_inverse = np.linalg.pinv(x.T @ x)
    scaling = (x.T * e**2) @ x
    return xtx_inverse @ scaling @ xtx_inverse


def _cov_sandwich_estimator_iv(
    x: np.ndarray,
    z: np.ndarray,
    e: np.ndarray,
) -> np.ndarray:
    r"""Estimator for the asymptotic covariance matrix of 2SLS IV estimator.

    The code corresponds to the heteroskedasticity-robust estimator of $V_\\hat{\beta}}$
    For reference see Section 12.18 in Econometrics by Bruce Hansen (version January 29,
    2021).

    Args:
        x (np.ndarray): Regressors, shape (n, p).
        z (np.ndarray): Instruments, shape (n, p).
        e (np.ndarray): Residuals, shape (n,)

    Returns:
        np.ndarray: Asymptotic covariance matrix, shape (p, p)

    """
    ztz_inv = np.linalg.pinv(z.T @ z)
    ztx = z.T @ x
    xtz = ztx.T
    outer = np.linalg.pinv(xtz @ ztz_inv @ ztx) @ xtz @ ztz_inv
    scaling = (z.T * e**2) @ z
    return outer @ scaling @ outer.T


def _format_entries(coef: str, se: str) -> str:
    """Paste together two strings."""
    return f"{coef:.4f} ({se:.4f})"


def _format_frame(result_frame: pd.DataFrame, name: str) -> pd.DataFrame:
    """Format coef and se column to single.

    Args:
        result_frame (pd.DataFrame): Result frame; has columns only "coef" and "se".
        name (str): Name of the new column.

    Returns:
        pd.DataFrame: Formatted frame.

    """
    str_repr = [_format_entries(*row[1]) for row in result_frame.iterrows()]
    return pd.DataFrame(str_repr, index=result_frame.index, columns=[name])

### OLS

In [7]:
# get y and x from data


y = data[dependent_variables_info.keys()]

features = data[independent_variables]

# add intercept
x = np.column_stack([np.ones(len(data)), features])

In [8]:
# run least squares regression for each dependent variable

coef_ols, *_ = np.linalg.lstsq(x, y, rcond=None)

In [9]:
# store the result with correct index and column names

coef_ols = pd.DataFrame(
    coef_ols,
    index=["intercept", *independent_variables],
    columns=y.columns,
)

In [10]:
# compute standard errors for each dependent variable

residuals = y - x @ coef_ols

se_ols = {}

for outcome in dependent_variables_info:
    residual_array = residuals[outcome].to_numpy().flatten()

    cov = _cov_sandwich_estimator_ols(x, e=residual_array)

    se_ols[outcome] = pd.Series(np.sqrt(np.diag(cov)), index=coef_ols.index)


se_ols = pd.DataFrame(se_ols, columns=y.columns)

In [11]:
# combine coefficients and standard error in one frame

result_ols = pd.concat(
    [coef_ols.loc["morekids"], se_ols.loc["morekids"]],
    axis=1,
    keys=["coef", "se"],
)

result_ols = result_ols.rename(mapper=dependent_variables_info, axis=0)

result_ols = _format_frame(result_ols, name="ols")

## IV 

In [12]:
# get z from data (y and x are already defined)

instruments_names = [
    instrumental_mapping.get(var, var) for var in independent_variables
]

instruments = data[instruments_names]

# add intercept
z = np.column_stack([np.ones(len(data)), instruments])

In [13]:
# run two-stage least squares regression

# first stage

first_stage_coef, *_ = np.linalg.lstsq(z, x, rcond=None)
x_predicted = z @ first_stage_coef

# second stage

coef_iv, *_ = np.linalg.lstsq(x_predicted, y, rcond=None)

In [14]:
# store the result with the correct index and column names

coef_iv = pd.DataFrame(
    coef_iv,
    index=["intercept", *independent_variables],
    columns=y.columns,
)

In [15]:
# compute standard errors for each dependent variable

residuals = y - x @ coef_ols

se_iv = {}

for outcome in dependent_variables_info:
    residual_array = residuals[outcome].to_numpy().flatten()

    cov = _cov_sandwich_estimator_iv(x, z=z, e=residual_array)

    se_iv[outcome] = pd.Series(np.sqrt(np.diag(cov)), index=coef_ols.index)


se_iv = pd.DataFrame(se_iv, columns=y.columns)

In [16]:
# combine coefficients and standard error in one frame

result_iv = pd.concat(
    [coef_iv.loc["morekids"], se_iv.loc["morekids"]],
    axis=1,
    keys=["coef", "se"],
)

result_iv = result_iv.rename(mapper=dependent_variables_info, axis=0)

result_iv = _format_frame(result_iv, name="iv")

### Result

In [17]:
_result = pd.concat([result_ols, result_iv], axis=1)

In [18]:
_result  # noqa: B018

Unnamed: 0,ols,iv
Worked for pay,-0.1764 (0.0016),-0.1173 (0.0251)
Weeks worked,-8.9782 (0.0706),-5.5588 (1.1147)
Hours worked,-6.6467 (0.0610),-4.5468 (0.9523)
Labor income,-3762.3826 (34.4127),-1902.9526 (544.5210)
Log family income,-0.1379 (0.0045),-0.0253 (0.0683)
