# Handout 1

---

*Tim Mensinger*

In [1]:
import numpy as np
import pandas as pd
import scipy as sp

from pathlib import Path
from functools import partial

## Problem 1.1

In [2]:
# Location of the Angrist & Evans (1980 census) extract, relative to the
# notebook's working directory.
DATA_PATH = Path("data") / "angrist_evans_1980.csv"

In [3]:
# Column names of interest in the data set.
# NOTE(review): `cols` is not referenced by any visible cell; in particular it
# is not passed to `pd.read_csv` below, so `df` contains every column of the
# CSV. Presumably `usecols=cols` (or `df[cols]`) was intended -- confirm.
cols = [
    "workedm",
    "weeksm1",
    "hourswm",
    "incomem",
    "famincl",
    "agem1",
    "agefstm",
    "boy1st",
    "boy2nd",
    "blackm",
    "hispm",
    "othracem",
    "morekids",
    "samesex",
]
# Load the full CSV located at DATA_PATH.
df = pd.read_csv(DATA_PATH)

In [4]:
# Category labels; not referenced by any visible cell -- presumably the
# subsamples to be tabulated for Problem 1.1 (TODO confirm).
categories = ["all", "married", "husband"]

## Problem 1.2

In [5]:
def simulate_model(gamma, beta, n_samples, n_sim, rng=None):
    """Simulate a linear model with an endogenous regressor and an instrument.

    Draws (z, e, v) jointly normal with mean (1, 0, 0), unit variances and
    Cov(e, v) = 0.8 (z is uncorrelated with both errors), then builds

        x = gamma * z + v    (first stage)
        y = beta * x + e     (structural equation)

    Args:
        gamma: First-stage coefficient on the instrument z.
        beta: Structural coefficient on x.
        n_samples: Number of observations per simulated data set.
        n_sim: Number of simulated data sets.
        rng: Optional ``np.random.Generator`` used for the draws. If None,
            the legacy global ``np.random`` state is used, so
            ``np.random.seed`` keeps working as before.

    Returns:
        Tuple ``(y, x, z)`` of arrays, each of shape ``(n_sim, n_samples)``.
    """
    mean = np.array([1.0, 0.0, 0.0])

    # The covariance matrix must be a float array: np.diag([1, 1, 1]) is an
    # integer array, and assigning 0.8 into it silently truncates to 0, which
    # would remove the correlation between e and v (i.e. the endogeneity).
    cov = np.eye(3)
    cov[1, 2] = cov[2, 1] = 0.8

    sampler = np.random if rng is None else rng
    draws = sampler.multivariate_normal(mean, cov, size=(n_samples, n_sim))

    # (n_samples, n_sim, 3) -> (3, n_sim, n_samples), then split by variable.
    z, e, v = draws.swapaxes(0, 2)

    x = z * gamma + v
    y = x * beta + e
    return y, x, z

In [6]:
def ols_1d(y, x):
    """Row-wise OLS slope of y on x (regression with an intercept).

    Each row of ``y`` and ``x`` is treated as one data set; the slope is the
    sample covariance of x and y divided by the sample variance of x, both
    computed along axis 1.

    Returns:
        Array with one slope per row.
    """
    mean_xy = np.mean(x * y, axis=1)
    mean_x = np.mean(x, axis=1)
    mean_y = np.mean(y, axis=1)

    covariance_xy = mean_xy - mean_x * mean_y
    variance_x = np.var(x, axis=1)
    return covariance_xy / variance_x

In [7]:
def iv_1d(y, x, z):
    """Row-wise simple IV slope of y on x, using z as the instrument.

    Each row is treated as one data set; the estimate is the sample
    covariance of z and y divided by the sample covariance of z and x, both
    computed along axis 1.

    Returns:
        Array with one slope per row.
    """
    mean_z = np.mean(z, axis=1)

    covariance_zy = np.mean(z * y, axis=1) - mean_z * np.mean(y, axis=1)
    covariance_zx = np.mean(z * x, axis=1) - mean_z * np.mean(x, axis=1)
    return covariance_zy / covariance_zx

In [8]:
# Six evenly spaced first-stage coefficients: 0, 1/25, ..., 5/25.
gamma_grid = np.linspace(start=0, stop=5 / 25, num=6)

In [9]:
# Fix the sample size and the structural coefficient; gamma and n_sim are
# supplied at call time.
model = partial(simulate_model, n_samples=625, beta=1)

In [10]:
# Draw 5,000 simulated data sets with first-stage coefficient gamma=10.
# NOTE(review): `gamma_grid` is defined above but not used here -- confirm
# whether the simulation was meant to loop over the grid.
y, x, z = model(gamma=10, n_sim=5_000)

In [11]:
# One OLS slope estimate per simulated data set (per row of y and x).
ols_estimate = ols_1d(y, x)

In [12]:
# One IV slope estimate per simulated data set, using z as the instrument.
iv_estimate = iv_1d(y, x, z)

## Problem 1.5

Monte Carlo Simulation

In [189]:
# Monte Carlo design: observations per replication and number of replications.
n_samples = 200
n_sim = 100_000

# True coefficients: beta[0] is the intercept and beta[1] the slope used to
# generate y below.
beta = np.array([3, 2])

In [190]:
# Use a dedicated, seeded generator so that Restart & Run All reproduces the
# same draws (and hence the same summary table) on every run.
rng = np.random.default_rng(12345)

# Three independent noise arrays with variance 0.5, each (n_sim, n_samples):
# e is the error in y, v and w are the errors added to x_star below.
e, v, w = rng.normal(0, np.sqrt(0.5), size=(3, n_sim, n_samples))

# Latent regressor with mean 1 and unit variance.
x_star = rng.normal(1, 1, (n_sim, n_samples))

In [191]:
# The outcome depends on the latent regressor x_star, not on its noisy proxies.
y = beta[0] + beta[1] * x_star + e

# Two noisy measurements of x_star with independent errors: x is used as the
# regressor and z as the instrument in the cells below.
x = x_star + v
z = x_star + w

In [192]:
# Centre regressor and instrument within each replication (row).
x_demeaned = x - x.mean(axis=1, keepdims=True)
z_demeaned = z - z.mean(axis=1, keepdims=True)

# Slope estimates per replication: OLS (beta_hat) and IV with z (beta_tilde).
beta_hat = np.sum(x_demeaned * y, axis=1) / np.sum(x_demeaned * x, axis=1)
beta_tilde = np.sum(z_demeaned * y, axis=1) / np.sum(z_demeaned * x, axis=1)

In [193]:
# Intercepts implied by each slope estimate: mean(y) - slope * mean(x).
intercept_hat = np.mean(y, axis=1) - beta_hat * np.mean(x, axis=1)
intercept_tilde = np.mean(y, axis=1) - beta_tilde * np.mean(x, axis=1)

In [194]:
# Residuals per replication; the per-row coefficients are broadcast across
# the observations of that row.
res_hat = y - intercept_hat[:, None] - beta_hat[:, None] * x
res_tilde = y - intercept_tilde[:, None] - beta_tilde[:, None] * x

In [195]:
# Estimates of the asymptotic variance of sqrt(n) * (slope estimate - slope).
# Both are built from sample *means*, so dividing by n_samples below yields
# the variance of the slope estimate itself.

# OLS: HC1 heteroskedasticity-robust sandwich for the slope. Because the
# regression includes an intercept, the demeaned regressor must be used, and
# n / (n - 2) is the HC1 degrees-of-freedom correction for two coefficients.
# (Using sums here and dividing by n_samples again afterwards would scale the
# variance by 1/n twice and make the standard errors far too small.)
var_hat = (
    (n_samples / (n_samples - 2))
    * ((res_hat ** 2) * (x_demeaned ** 2)).mean(axis=1)
    / ((x_demeaned ** 2).mean(axis=1) ** 2)
)

# IV: residual variance times instrument variance over the squared
# instrument-regressor covariance (homoskedastic formula).
var_tilde = (
    (res_tilde ** 2).mean(axis=1)
    * (z_demeaned ** 2).mean(axis=1)
    / ((x_demeaned * z_demeaned).mean(axis=1) ** 2)
)

# Standard errors of the slope estimates.
sd_hat = np.sqrt(var_hat / n_samples)
sd_tilde = np.sqrt(var_tilde / n_samples)

In [196]:
# Share of replications whose interval estimate +/- 1.96 * standard error
# contains the true slope beta[1].
coverage_hat = (
    (beta_hat - 1.96 * sd_hat <= beta[1]) & (beta[1] <= beta_hat + 1.96 * sd_hat)
).mean()
coverage_tilde = (
    (beta_tilde - 1.96 * sd_tilde <= beta[1])
    & (beta[1] <= beta_tilde + 1.96 * sd_tilde)
).mean()

In [197]:
def evaluate(method, true, name, cvg):
    """Summarise the simulated estimates of one method.

    Args:
        method: Array of estimates, one per replication.
        true: True parameter value the estimates are compared against.
        name: Label of the method; passed through unchanged.
        cvg: Coverage figure; passed through unchanged.

    Returns:
        Tuple ``(name, mean, sd, mse, cvg)`` where mean and sd describe the
        estimates and mse is their mean squared deviation from ``true``.
    """
    errors = method - true
    return (
        name,
        method.mean(),
        np.sqrt(method.var()),
        (errors ** 2).mean(),
        cvg,
    )

In [198]:
# One summary record per estimator, each evaluated against the true slope.
data = [
    evaluate(beta_hat, beta[1], "ols", coverage_hat),
    evaluate(beta_tilde, beta[1], "iv", coverage_tilde),
]

In [199]:
# Collect the per-method summaries into a table, one row per estimator.
results = pd.DataFrame(
    data=data,
    columns=["method", "mean", "sd", "mse", "coverage"],
)

In [202]:
# Print the summary table as plain text.
print(results.to_string())

  method      mean        sd       mse  coverage
0    ols  1.333370  0.078818  0.450607   0.00000
1     iv  2.008254  0.140270  0.019744   0.95015
