# Homework 5

In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
# Code to create data for this question.
# A seeded Generator replaces the unseeded global np.random state so that
# "Restart Kernel -> Run All" regenerates exactly the same data set.
DATA_SEED = 0
num = 100000

data_rng = np.random.default_rng(DATA_SEED)

# Difficulty is drawn uniformly from [0, 1).
difficulty = data_rng.uniform(0, 1, (num,))

# Speed: Normal(15, 5) baseline lowered by 10 * difficulty, floored at 0.
speed = np.maximum(data_rng.normal(15, 5, (num,)) - difficulty * 10, 0)

# Accident: linear in speed and difficulty plus Normal(0, 0.3) noise,
# then clamped to the interval [0, 1].
accident = np.minimum(
    np.maximum(0.03 * speed + 0.4 * difficulty + data_rng.normal(0, 0.3, (num,)), 0),
    1,
)

df = pd.DataFrame({'difficulty': difficulty, 'speed': speed, 'accident': accident})

## Question 1

In [5]:
def simulate(n=100_000, seed=None):
    """Draw one synthetic data set of difficulty (X), speed (Y) and accident risk (Z).

    Parameters
    ----------
    n : int
        Number of observations to generate.
    seed : int or None
        Seed handed to ``np.random.default_rng``; ``None`` yields a fresh,
        non-reproducible stream.

    Returns
    -------
    pandas.DataFrame
        Columns ``'X'`` (uniform difficulty), ``'Y'`` (speed, floored at 0)
        and ``'Z'`` (accident risk, clamped to [0, 1]).
    """
    generator = np.random.default_rng(seed)

    # The draw order (difficulty, speed noise, risk noise) fixes the values
    # obtained for a given seed, so it must not be rearranged.
    difficulty = generator.uniform(0, 1, n)
    raw_speed = generator.normal(15, 5, n) - 10 * difficulty
    speed = np.maximum(raw_speed, 0)  # negative speeds are floored at 0

    # Latent (unbounded) accident risk before clamping.
    latent_risk = 0.03 * speed + 0.4 * difficulty + generator.normal(0, 0.3, n)
    risk = np.minimum(np.maximum(latent_risk, 0), 1)  # clamp to [0, 1]

    columns = {'X': difficulty, 'Y': speed, 'Z': risk}
    return pd.DataFrame(columns)

def ols(y, X):
    """Fit an ordinary-least-squares regression of ``y`` on ``X`` plus an intercept.

    Parameters
    ----------
    y : array-like
        Response variable.
    X : array-like or DataFrame
        Regressors; a constant column is added with ``sm.add_constant``.

    Returns
    -------
    The fitted results object produced by ``sm.OLS(...).fit()``.
    """
    design = sm.add_constant(X)
    model = sm.OLS(y, design)
    return model.fit()

# Seeded so the printed estimates are reproducible across kernel restarts.
df = simulate(seed=0)

# Note: simulate() floors Y at 0 and clamps Z to [0, 1], so the simulated
# relationships are only approximately linear and the OLS estimates below need
# not equal the structural coefficients used in simulate() exactly.

# 1) Biased: regress Z on Y only
b_y_only = ols(df['Z'], df[['Y']]).params['Y']

# 2) Identified: regress Z on Y and X.
#    The model is fitted once and both coefficients are read from that fit.
fit_adj = ols(df['Z'], df[['Y', 'X']])
b_y_adj = fit_adj.params['Y']
b_x_adj = fit_adj.params['X']

# 3) Causal effect of X on Y (speed reduction)
g_x = ols(df['Y'], df[['X']]).params['X']

print(f"Z ~ Y only (biased): beta_Y = {b_y_only:.4f}")
print(f"Z ~ Y + X (identified): beta_Y = {b_y_adj:.4f}, beta_X = {b_x_adj:.4f}")
print(f"Y ~ X: gamma_X (difficulty -> speed) = {g_x:.4f}")

Z ~ Y only (biased): beta_Y = 0.0170
Z ~ Y + X (identified): beta_Y = 0.0258, beta_X = 0.3444
Y ~ X: gamma_X (difficulty -> speed) = -9.6373


## Question 2

In [6]:

def simulate_one(num=100_000, seed=None):
    """Simulate one data set and return the X coefficient from regressing Y on X and Z.

    Parameters
    ----------
    num : int
        Number of observations to simulate.
    seed : int or None
        Seed forwarded to ``simulate``; ``None`` gives a non-reproducible draw.

    Returns
    -------
    float
        Estimated coefficient on ``X`` in the OLS fit ``Y ~ const + X + Z``.
    """
    # Reuse the notebook's shared data-generating process and OLS helper
    # instead of duplicating them here; simulate() performs the same draws in
    # the same order, so results for a given seed are unchanged.
    data = simulate(n=num, seed=seed)
    fit = ols(data['Y'], data[['X', 'Z']])
    return fit.params['X']

# Monte Carlo: rerun the simulation with seeds 0 .. reps-1 and summarise the
# coefficient on X across runs.
reps = 200
coefs = []
for run_seed in range(reps):
    coefs.append(simulate_one(seed=run_seed))

mean_coef = np.mean(coefs)
sd_coef = np.std(coefs)  # NumPy default: population standard deviation (ddof=0)
print(f"Average coef on X in Y ~ X + Z over {reps} runs: {mean_coef:.3f}")
print(f"Std dev across runs: {sd_coef:.3f}")

Average coef on X in Y ~ X + Z over 200 runs: -10.328
Std dev across runs: 0.045


## Reflection Code

In [None]:
def simulate_ecosystem(n=1000, seed=None):
    """Simulate lightning, bear, deer and flower levels for ``n`` observations.

    Each quantity after lightning is a linear function of the earlier ones plus
    Normal(0, 0.1) noise, clipped to [0, 1].

    Parameters
    ----------
    n : int
        Number of observations (rows) to generate.
    seed : int or None
        Seed for ``np.random.default_rng``; ``None`` gives a fresh stream.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Lightning"``, ``"Bears"``, ``"Deer"`` and ``"Flowers"``.
    """
    rng = np.random.default_rng(seed)

    def noise():
        # One fresh Normal(0, 0.1) disturbance vector per variable.
        return rng.normal(0, 0.1, n)

    def bounded(values):
        # Every simulated level is kept inside [0, 1].
        return np.clip(values, 0, 1)

    # Lightning storm intensity.
    lightning = rng.uniform(0, 1, n)
    # Bears: reduced by storms.
    bears = bounded(0.6 - 0.4 * lightning + noise())
    # Deer: reduced by storms and by bears.
    deer = bounded(0.7 - 0.3 * lightning - 0.5 * bears + noise())
    # Flowers: helped by storms, reduced by deer.
    flowers = bounded(0.3 + 0.5 * lightning - 0.6 * deer + noise())

    columns = {
        "Lightning": lightning,
        "Bears": bears,
        "Deer": deer,
        "Flowers": flowers,
    }
    return pd.DataFrame(columns)

# Example usage: simulate 100 observations with a fixed seed and print the frame.
df = simulate_ecosystem(n=100, seed=42)
print(df)

    Lightning     Bears      Deer   Flowers
0    0.773956  0.330395  0.302103  0.330044
1    0.438878  0.333901  0.385042  0.321214
2    0.858598  0.218745  0.366806  0.681951
3    0.697368  0.450976  0.406050  0.251668
4    0.094177  0.526703  0.417454  0.182999
..        ...       ...       ...       ...
95   0.630283  0.494031  0.007234  0.656740
96   0.361813  0.344570  0.395486  0.313810
97   0.087650  0.475467  0.453623  0.085476
98   0.118006  0.617130  0.385632  0.203637
99   0.961898  0.175780  0.286349  0.632061

[100 rows x 4 columns]
