# Problem Set 1

Solutions to Computational Problems 

## Problem 2

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the data set for Problem 2 from a feather file (path is relative to the
# notebook's working directory).
data = pd.read_feather("data/angrist_and_evans.feather")

> If you don't have the data stored as a feather file, you can load it from the Excel file instead:
> ```python
> pd.read_excel("path/to/excel/file")
> ```

In [3]:
# Outcome columns of `data` (keys) mapped to the human-readable labels (values)
# that are used as row names in the final result tables.
dependent_variables_info = {
    "workedm": "Worked for pay",
    "weeksm1": "Weeks worked",
    "hourswm": "Hours worked",
    "incomem": "Labor income",
    "famincl": "Log family income",
}

# Regressor columns of `data`; an intercept is prepended when the design
# matrix is built. Only the "morekids" coefficient is reported in the end.
independent_variables = [
    "morekids",
    "agem1",
    "agefstm",
    "boy1st",
    "boy2nd",
    "blackm",
    "hispm",
    "othracem",
]

# Regressor -> instrument. Regressors without an entry are reused unchanged
# when the instrument matrix is built.
instrumental_mapping = {"morekids": "samesex"}

### Functions

In [4]:
def _cov_sandwich_estimator_ols(x, e):
    xtx_inverse = np.linalg.pinv(x.T @ x)
    scaling = (x.T * e**2) @ x
    return xtx_inverse @ scaling @ xtx_inverse


def _cov_sandwich_estimator_iv(x, z, e):
    ztz_inv = np.linalg.pinv(z.T @ z)
    ztx = z.T @ x
    xtz = ztx.T
    outer = np.linalg.pinv(xtz @ ztz_inv @ ztx) @ xtz @ ztz_inv
    scaling = (z.T * e**2) @ z
    return outer @ scaling @ outer.T


def _format_frame(result_data, name):
    """Collapse a two-column frame into a single column of formatted strings.

    Each row of ``result_data`` is unpacked into ``_format_entries``; the
    returned frame keeps the index of ``result_data`` and has the single
    column ``name``.
    """
    formatted = [
        _format_entries(*values) for _, values in result_data.iterrows()
    ]
    return pd.DataFrame({name: formatted}, index=result_data.index)


def _format_entries(coef, se):
    return f"{coef:.4f} ({se:.4f})"

### OLS

In [5]:
# Outcome columns, in the order in which they are listed in
# dependent_variables_info.
y = data[list(dependent_variables_info)]

In [6]:
features = data[independent_variables]
# Design matrix: a leading column of ones (intercept) followed by the features.
x = np.column_stack((np.ones(features.shape[0]), features.to_numpy()))

In [7]:
# Least-squares fit of all outcomes at once; only the coefficient matrix (the
# first element of the lstsq result) is needed.
# Indexing instead of unpacking into `*_` avoids rebinding `_`, which IPython
# uses for the most recent cell output.
coef = np.linalg.lstsq(x, y, rcond=None)[0]

In [8]:
# Label the raw coefficient array: one row per regressor (intercept first),
# one column per outcome.
coef = pd.DataFrame(
    data=coef,
    index=["intercept"] + independent_variables,
    columns=y.columns,
)

In [9]:
residuals = y - x @ coef

# One sandwich covariance matrix of the coefficients per outcome.
cov = {
    outcome: _cov_sandwich_estimator_ols(x, e=residuals[outcome].to_numpy().ravel())
    for outcome in dependent_variables_info
}

# Standard errors are the square roots of the covariance diagonals; rows
# follow the coefficient index, columns follow the outcomes.
se = pd.DataFrame(
    {
        outcome: pd.Series(np.sqrt(np.diag(matrix)), index=coef.index)
        for outcome, matrix in cov.items()
    },
    columns=y.columns,
)

In [10]:
# Keep only the "morekids" row: put coefficient and standard error side by
# side, relabel the outcomes, and render each row as "coef (se)".
result_ols = (
    pd.concat(
        [coef.loc["morekids"], se.loc["morekids"]],
        axis=1,
        keys=["coef", "se"],
    )
    .rename(index=dependent_variables_info)
    .pipe(_format_frame, name="ols")
)

### IV

In [11]:
# Swap every regressor that has an entry in instrumental_mapping for its
# instrument; all other regressors are kept as they are.
instruments_names = [
    instrumental_mapping[var] if var in instrumental_mapping else var
    for var in independent_variables
]
instruments = data[instruments_names]

# Instrument matrix: a leading column of ones (intercept) followed by the
# instruments.
z = np.column_stack((np.ones(instruments.shape[0]), instruments.to_numpy()))

In [12]:
# first stage: regress every column of x on the instrument matrix z and keep
# the fitted values
#
# The lstsq result is indexed rather than unpacked into `*_` so that `_`,
# which IPython uses for the most recent cell output, is not rebound.
first_stage_coef = np.linalg.lstsq(z, x, rcond=None)[0]
x_predicted = z @ first_stage_coef

# second stage: regress the outcomes on the first-stage fitted values
coef = np.linalg.lstsq(x_predicted, y, rcond=None)[0]

In [13]:
# Label the raw second-stage coefficient array: one row per regressor
# (intercept first), one column per outcome.
coef = pd.DataFrame(
    data=coef,
    index=["intercept"] + independent_variables,
    columns=y.columns,
)

In [14]:
# Residuals are computed with the original regressors x, not with the
# first-stage fitted values.
residuals = y - x @ coef

# One sandwich covariance matrix of the coefficients per outcome.
cov = {
    outcome: _cov_sandwich_estimator_iv(x, z, e=residuals[outcome].to_numpy().ravel())
    for outcome in dependent_variables_info
}

# Standard errors are the square roots of the covariance diagonals; rows
# follow the coefficient index, columns follow the outcomes.
se = pd.DataFrame(
    {
        outcome: pd.Series(np.sqrt(np.diag(matrix)), index=coef.index)
        for outcome, matrix in cov.items()
    },
    columns=y.columns,
)

In [15]:
# Keep only the "morekids" row: put coefficient and standard error side by
# side, relabel the outcomes, and render each row as "coef (se)".
result_iv = (
    pd.concat(
        [coef.loc["morekids"], se.loc["morekids"]],
        axis=1,
        keys=["coef", "se"],
    )
    .rename(index=dependent_variables_info)
    .pipe(_format_frame, name="iv")
)

### Result

In [16]:
# Put the formatted OLS and IV columns next to each other.
result = pd.concat([result_ols, result_iv], axis="columns")

In [17]:
# Bare expression on purpose: the combined table is shown as the cell output.
result  # noqa: B018

Unnamed: 0,ols,iv
Worked for pay,-0.1764 (0.0016),-0.1173 (0.0252)
Weeks worked,-8.9782 (0.0706),-5.5588 (1.1178)
Hours worked,-6.6467 (0.0610),-4.5468 (0.9536)
Labor income,-3762.3826 (34.4127),-1902.9526 (546.4186)
Log family income,-0.1379 (0.0045),-0.0253 (0.0683)


## Problem 4

In [18]:
from functools import partial

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

In [19]:
def _simulate_model(gamma, beta, n_samples, n_sim, rng):
    mean = np.array([1, 0, 0])
    cov = np.array(
        [
            [1, 0, 0],
            [0, 1, 0.8],
            [0, 0.8, 1],
        ],
    )

    mvnormal = rng.multivariate_normal(mean=mean, cov=cov, size=(n_samples, n_sim))

    z, e, v = mvnormal.swapaxes(0, 2)

    x = z * gamma + v
    y = x * beta + e

    return y, x, z

In [20]:
def _ols_1d(y, x):
    return np.sum(x * y, axis=1) / np.sum(x**2, axis=1)

In [21]:
def _ols_coef_and_se(y, x):
    coef = np.sum(x * y, axis=1) / np.sum(x**2, axis=1)
    residuals = y - x * coef.reshape(-1, 1)
    var = np.mean(residuals**2, axis=1) / np.sum(x**2, axis=1)
    return coef, np.sqrt(var)

In [22]:
def _iv_coef_and_se(y, x, z):
    coef = np.sum(z * y, axis=1) / np.sum(z * x, axis=1)
    residuals = y - x * coef.reshape(-1, 1)
    var = (
        np.sum(z * x, axis=1) ** (-2)
        * np.sum(z**2, axis=1)
        * np.mean(residuals**2, axis=1)
    )
    return coef, np.sqrt(var)

In [23]:
def _coverage(lower, upper, true):
    return np.mean(np.logical_and(lower < true, true < upper))

In [24]:
# Seeded generator so that the simulation draws are reproducible.
rng = np.random.default_rng(seed=12345)

# Six equally spaced values for gamma between 0 and 5/25 (inclusive).
gamma_grid = np.linspace(start=0, stop=5 / 25, num=6)


def simulate(gamma, n_samples, beta=1, n_sim=10_000):
    """Run the Monte Carlo experiment for a single value of ``gamma``.

    Simulates ``n_sim`` data sets with ``_simulate_model`` (using the
    module-level ``rng``), estimates ``beta`` by OLS and by IV on each of
    them, and summarises the estimates across simulations.

    Parameters
    ----------
    gamma : coefficient of the instrument in the simulated first stage.
    n_samples : number of observations per simulated data set.
    beta : true coefficient used for simulating and for the coverage check.
    n_sim : number of simulated data sets.

    Returns
    -------
    pandas.Series with the across-simulation mean of each quantity:
    ``beta_estimate_ols``, ``beta_estimate_iv``, ``beta_estimate_se_iv``,
    ``coverage`` (share of intervals containing ``beta``),
    ``gamma_estimate``, ``gamma_se``, ``f_test_iv``,
    ``f_test_iv_larger_10`` (share of simulations with F > 10) and
    ``conditional_coverage`` (coverage among simulations with F > 10;
    NaN if there is no such simulation).

    """
    y, x, z = _simulate_model(
        gamma=gamma,
        beta=beta,
        n_samples=n_samples,
        n_sim=n_sim,
        rng=rng,
    )

    beta_estimate_ols = _ols_1d(y, x)
    beta_estimate_iv, beta_estimate_se_iv = _iv_coef_and_se(y, x, z)

    # Normal-approximation 95% confidence interval around the IV estimate.
    lower = beta_estimate_iv - 1.96 * beta_estimate_se_iv
    upper = beta_estimate_iv + 1.96 * beta_estimate_se_iv
    coverage = np.logical_and(lower < beta, upper > beta)

    # First stage: regress x on z. With a single instrument the F statistic
    # is the squared t statistic of its coefficient.
    gamma_estimate, gamma_se = _ols_coef_and_se(x, z)
    f_test_iv = (gamma_estimate / gamma_se) ** 2
    f_test_iv_larger_10 = f_test_iv > 10

    per_simulation = pd.DataFrame(
        {
            "beta_estimate_ols": beta_estimate_ols,
            "beta_estimate_iv": beta_estimate_iv,
            "beta_estimate_se_iv": beta_estimate_se_iv,
            "coverage": coverage,
            "gamma_estimate": gamma_estimate,
            "gamma_se": gamma_se,
            "f_test_iv": f_test_iv,
            "f_test_iv_larger_10": f_test_iv_larger_10,
        },
    )
    _result = per_simulation.mean(axis=0)

    # Coverage among the simulations that pass the F > 10 rule of thumb. An
    # empty selection is reported as NaN explicitly instead of taking the
    # mean of an empty array, which would emit a RuntimeWarning.
    if f_test_iv_larger_10.any():
        conditional_coverage = coverage[f_test_iv_larger_10].mean()
    else:
        conditional_coverage = np.nan
    _result["conditional_coverage"] = conditional_coverage

    return _result

In [None]:
# Run one simulation task per gamma value, using as many joblib workers as
# there are grid points. The result is a list with one entry per gamma, in
# grid order.
# NOTE(review): `simulate` reads the module-level `rng`; how that generator is
# copied or shared between workers depends on the joblib backend — confirm
# that the draws for different gamma values are meant to be related this way.
result = Parallel(n_jobs=len(gamma_grid))(
    delayed(simulate)(gamma, n_samples=2_500) for gamma in gamma_grid
)

In [None]:
# Stack the per-gamma summaries and transpose, so that every column belongs to
# one gamma value and every row to one statistic.
result = pd.DataFrame(data=result, index=gamma_grid).transpose()

In [None]:
# Show the summary table rounded to three decimals.
result.round(decimals=3)