# Problem Set 1

Solutions to Computational Problems 

## Problem 2

In [30]:
import numpy as np
import pandas as pd

In [31]:
data = pd.read_feather("data/angrist_and_evans.feather")

> If you don't have the data stored as a feather file you can use
> ```python
> pd.read_excel("path/to/excel/file")
> ```

In [32]:
dependent_variables_info = {
    "workedm": "Worked for pay",
    "weeksm1": "Weeks worked",
    "hourswm": "Hours worked",
    "incomem": "Labor income",
    "famincl": "Log family income",
}

independent_variables = [
    "morekids",
    "agem1",
    "agefstm",
    "boy1st",
    "boy2nd",
    "blackm",
    "hispm",
    "othracem",
]

instrumental_mapping = {"morekids": "samesex"}

### Functions

In [33]:
def _cov_sandwich_estimator_ols(x: np.ndarray, e: np.ndarray) -> np.ndarray:
    r"""Estimator for the asymptotic covariance matrix of OLS estimator.

    The code corresponds to the HC0 estimator of $V_\hat{\beta}}$. For reference see
    Section 7.8 in Econometrics by Bruce Hansen (version January 29, 2021).

    Args:
        x (np.ndarray): Regressors, shape (n, p).
        e (np.ndarray): Residuals, shape (n,)

    Returns:
        np.ndarray: Asymptotic covariance matrix, shape (p, p)

    """
    xtx_inverse = np.linalg.pinv(x.T @ x)
    scaling = (x.T * e**2) @ x
    return xtx_inverse @ scaling @ xtx_inverse


def _cov_sandwich_estimator_iv(
    x: np.ndarray,
    z: np.ndarray,
    e: np.ndarray,
) -> np.ndarray:
    r"""Estimator for the asymptotic covariance matrix of 2SLS IV estimator.

    The code corresponds to the heteroskedasticity-robust estimator of $V_\\hat{\beta}}$
    For reference see Section 12.18 in Econometrics by Bruce Hansen (version January 29,
    2021).

    Args:
        x (np.ndarray): Regressors, shape (n, p).
        z (np.ndarray): Instruments, shape (n, p).
        e (np.ndarray): Residuals, shape (n,)

    Returns:
        np.ndarray: Asymptotic covariance matrix, shape (p, p)

    """
    ztz_inv = np.linalg.pinv(z.T @ z)
    ztx = z.T @ x
    xtz = ztx.T
    outer = np.linalg.pinv(xtz @ ztz_inv @ ztx) @ xtz @ ztz_inv
    scaling = (z.T * e**2) @ z
    return outer @ scaling @ outer.T


def _format_entries(coef: str, se: str) -> str:
    """Paste together two strings."""
    return f"{coef:.4f} ({se:.4f})"


def _format_frame(result_frame: pd.DataFrame, name: str) -> pd.DataFrame:
    """Format coef and se column to single.

    Args:
        result_frame (pd.DataFrame): Result frame; has columns only "coef" and "se".
        name (str): Name of the new column.

    Returns:
        pd.DataFrame: Formatted frame.

    """
    str_repr = [_format_entries(*row[1]) for row in result_frame.iterrows()]
    return pd.DataFrame(str_repr, index=result_frame.index, columns=[name])

### OLS

In [34]:
# get y and x from data


y = data[dependent_variables_info.keys()]

features = data[independent_variables]

# add intercept
x = np.column_stack([np.ones(len(data)), features])

In [35]:
# run least squares regression for each dependent variable

coef_ols, *_ = np.linalg.lstsq(x, y, rcond=None)

In [36]:
# store the result with correct index and column names

coef_ols = pd.DataFrame(
    coef_ols,
    index=["intercept", *independent_variables],
    columns=y.columns,
)

In [37]:
# compute standard errors for each dependent variable

residuals = y - x @ coef_ols

se_ols = {}

for outcome in dependent_variables_info:
    residual_array = residuals[outcome].to_numpy().flatten()

    cov = _cov_sandwich_estimator_ols(x, e=residual_array)

    se_ols[outcome] = pd.Series(np.sqrt(np.diag(cov)), index=coef_ols.index)


se_ols = pd.DataFrame(se_ols, columns=y.columns)

In [38]:
# combine coefficients and standard error in one frame

result_ols = pd.concat(
    [coef_ols.loc["morekids"], se_ols.loc["morekids"]],
    axis=1,
    keys=["coef", "se"],
)

result_ols = result_ols.rename(mapper=dependent_variables_info, axis=0)

result_ols = _format_frame(result_ols, name="ols")

## IV 

In [39]:
# get z from data (y and x are already defined)

instruments_names = [
    instrumental_mapping.get(var, var) for var in independent_variables
]

instruments = data[instruments_names]

# add intercept
z = np.column_stack([np.ones(len(data)), instruments])

In [40]:
# run two-stage least squares regression

# first stage

first_stage_coef, *_ = np.linalg.lstsq(z, x, rcond=None)
x_predicted = z @ first_stage_coef

# second stage

coef_iv, *_ = np.linalg.lstsq(x_predicted, y, rcond=None)

In [41]:
# store the result with the correct index and column names

coef_iv = pd.DataFrame(
    coef_iv,
    index=["intercept", *independent_variables],
    columns=y.columns,
)

In [42]:
# compute standard errors for each dependent variable

residuals = y - x @ coef_ols

se_iv = {}

for outcome in dependent_variables_info:
    residual_array = residuals[outcome].to_numpy().flatten()

    cov = _cov_sandwich_estimator_iv(x, z=z, e=residual_array)

    se_iv[outcome] = pd.Series(np.sqrt(np.diag(cov)), index=coef_ols.index)


se_iv = pd.DataFrame(se_iv, columns=y.columns)

In [43]:
# combine coefficients and standard error in one frame

result_iv = pd.concat(
    [coef_iv.loc["morekids"], se_iv.loc["morekids"]],
    axis=1,
    keys=["coef", "se"],
)

result_iv = result_iv.rename(mapper=dependent_variables_info, axis=0)

result_iv = _format_frame(result_iv, name="iv")

### Result

In [44]:
_result = pd.concat([result_ols, result_iv], axis=1)

In [45]:
_result  # noqa: B018

Unnamed: 0,ols,iv
Worked for pay,-0.1764 (0.0016),-0.1173 (0.0251)
Weeks worked,-8.9782 (0.0706),-5.5588 (1.1147)
Hours worked,-6.6467 (0.0610),-4.5468 (0.9523)
Labor income,-3762.3826 (34.4127),-1902.9526 (544.5210)
Log family income,-0.1379 (0.0045),-0.0253 (0.0683)


## Problem 4

In [46]:
from functools import partial
from itertools import product

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

### Functions

In [47]:
def _simulate_from_model(
    gamma: float,
    beta: float,
    n_samples: int,
    n_sim: int,
    rng: np.random.Generator,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Simulate data from the model.

    Args:
        gamma (float): Slope parameter of x onto z.
        beta (float): Slope parameter of y onto x.
        n_samples (int): Number of samples per simulation.
        n_sim (int): Number of simulations.
        rng (np.random.Generator): Random number generator.

    Returns:
        - np.ndarray: Outcomes of shape (n_sim, n_samples).
        - np.ndarray: Regressors of shape (n_sim, n_samples).
        - np.ndarray: Instruments of shape (n_sim, n_samples).

    """
    mean = np.array([1, 0, 0])
    cov = np.array(
        [
            [1, 0, 0],
            [0, 1, 0.8],
            [0, 0.8, 1],
        ],
    )

    mvnormal = rng.multivariate_normal(mean=mean, cov=cov, size=(n_samples, n_sim))

    z, e, v = mvnormal.swapaxes(0, 2)

    x = z * gamma + v
    y = x * beta + e

    return y, x, z

In [48]:
def _ols_1d(y: np.ndarray, x: np.ndarray) -> np.ndarray:
    """Fast estimation of coefficient in OLS model for 1d x and y.

    Args:
        y (np.ndarray): Outcomes of shape (n_sim, n_samples).
        x (np.ndarray): Regressors of shape (n_sim, n_samples).

    Returns:
        np.ndarray: Coefficients of shape (n_sim,).

    """
    return np.sum(x * y, axis=1) / np.sum(x**2, axis=1)

In [49]:
def _ols_coef_and_se(y: np.ndarray, x: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Fast estimation of coefficient and its se in OLS model for 1d x and y.

    Args:
        y (np.ndarray): Outcomes of shape (n_sim, n_samples).
        x (np.ndarray): Regressors of shape (n_sim, n_samples).

    Returns:
        - np.ndarray: Coefficients of shape (n_sim,).
        - np.ndarray: Standard errors of shape (n_sim,).

    """
    sum_x_squared = np.sum(x**2, axis=1)
    coef = np.sum(x * y, axis=1) / sum_x_squared
    residuals = y - x * coef.reshape(-1, 1)
    var = np.mean(residuals**2, axis=1) / sum_x_squared
    return coef, np.sqrt(var)

In [50]:
def _iv_coef_and_se(
    y: np.ndarray,
    x: np.ndarray,
    z: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """Fast estimation of coefficient and its se in IV model for 1d x, z, and y.

    Args:
        y (np.ndarray): Outcomes of shape (n_sim, n_samples).
        x (np.ndarray): Regressors of shape (n_sim, n_samples).
        z (np.ndarray): Instruments of shape (n_sim, n_samples).

    Returns:
        - np.ndarray: Coefficients of shape (n_sim,).
        - np.ndarray: Standard errors of shape (n_sim,).

    """
    sum_z_x = np.sum(z * x, axis=1)
    coef = np.sum(z * y, axis=1) / sum_z_x
    residuals = y - x * coef.reshape(-1, 1)
    var = sum_z_x ** (-2) * np.sum(z**2, axis=1) * np.mean(residuals**2, axis=1)
    return coef, np.sqrt(var)

In [51]:
def _simulation(
    gamma: float,
    n_samples: int,
    n_sim: int,
    rng: np.random.Generator,
) -> pd.DataFrame:
    """Perform a monte carlo simulation.

    Args:
        gamma (float): Slope parameter of x onto z.
        n_samples (int): Number of samples per simulation.
        n_sim (int): Number of simulations.
        rng (np.random.Generator): Random number generator.

    Returns:
        pd.DataFrame: Simulation result.

    """
    # simulate data
    y, x, z = _simulate_from_model(
        gamma=gamma,
        beta=1,
        n_samples=n_samples,
        n_sim=n_sim,
        rng=rng,
    )

    # estimate coefficients
    coef_ols = _ols_1d(y, x)

    coef_iv, coef_iv_se = _iv_coef_and_se(y, x, z)

    gamma_estimate, gamma_se = _ols_coef_and_se(x, z)

    # compute coverage of iv estimate
    lower = coef_iv - 1.96 * coef_iv_se
    upper = coef_iv + 1.96 * coef_iv_se

    coverage = np.logical_and(lower < 1, upper > 1)

    # compute f-test
    f_test_iv = (gamma_estimate / gamma_se) ** 2
    f_test_iv_larger_10 = f_test_iv > 10

    # compute conditional coverage
    _conditional_coverage = coverage[f_test_iv_larger_10].mean()

    # collect results
    result = {
        "Coefficient - OLS": coef_ols,
        "Coefficient - IV": coef_iv,
        "Standard Error - IV": coef_iv_se,
        "Coverage - IV": coverage,
        "Gamma estimate": gamma_estimate,
        "Gamma se": gamma_se,
        "Prob. F > 10": f_test_iv_larger_10,
    }

    # take mean across simulations
    result = pd.DataFrame(result).mean(axis=0)
    result["Cond. coverage"] = _conditional_coverage  # is already 'meaned'

    return result

### Computation

In [52]:
rng = np.random.default_rng(54321)

gamma_grid = np.linspace(0, 5 / 25, num=6)

simulation = partial(_simulation, n_sim=10_000, rng=rng)

In [53]:
parameter_grid = list(product(gamma_grid, [625, 2_500]))

raw_result = Parallel(n_jobs=8)(
    delayed(simulation)(gamma=gamma, n_samples=n_samples)
    for gamma, n_samples in parameter_grid
)

In [54]:
result = pd.DataFrame(
    raw_result,
    index=pd.MultiIndex.from_tuples(parameter_grid, names=["gamma", "No. samples"]),
).T

In [55]:
result.round(3)

gamma,0.00,0.00,0.04,0.04,0.08,0.08,0.12,0.12,0.16,0.16,0.20,0.20
No. samples,625,2500,625,2500,625,2500,625,2500,625,2500,625,2500
Coefficient - OLS,1.8,1.8,1.798,1.797,1.79,1.79,1.778,1.777,1.761,1.761,1.741,1.741
Coefficient - IV,2.766,0.052,0.614,0.769,0.565,0.973,0.945,0.989,0.973,0.994,0.983,0.996
Standard Error - IV,35190.751,39321.618,724.55,17.968,262.636,0.192,0.285,0.122,0.193,0.09,0.149,0.072
Coverage - IV,0.87,0.866,0.903,0.927,0.928,0.948,0.94,0.954,0.946,0.952,0.95,0.951
Gamma estimate,0.0,0.0,0.04,0.04,0.08,0.08,0.12,0.12,0.16,0.16,0.2,0.2
Gamma se,0.028,0.014,0.028,0.014,0.028,0.014,0.028,0.014,0.028,0.014,0.028,0.014
Prob. F > 10,0.001,0.001,0.042,0.377,0.376,0.995,0.86,1.0,0.992,1.0,1.0,1.0
Cond. coverage,0.0,0.0,0.383,0.817,0.82,0.947,0.93,0.954,0.946,0.952,0.95,0.951
