In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from linearmodels.iv import IV2SLS
import wooldridge as woo

# ==========================================
# 1. LOAD DATA
# ==========================================
# Loading David Card's 1995 dataset from the wooldridge package.
# This saves me from having to download/clean the CSV manually.
df = woo.data('card')

# Dropping missing values just to be safe, though this dataset is usually clean.
df = df.dropna()

# ==========================================
# 2. DEFINE VARIABLES
# ==========================================
# Outcome (Y): log(wage) - The natural log of the person's wage.
# Treatment (D): educ - Years of education.
# Instrument (Z): nearc4 - Dummy variable (1 if grew up near a 4-year college, 0 otherwise).
# Controls (X): exper, race, smsa (urban), south.

# Adding a constant because statsmodels/linearmodels requires it for the intercept.
df['const'] = 1

# ==========================================
# 3. NAIVE OLS (The Biased Estimate)
# ==========================================
# I'm running this first to see what the correlation looks like.
# This is likely biased because 'ability' is omitted.
ols = sm.OLS(df['lwage'], df[['const', 'educ', 'exper', 'black', 'south', 'smsa']])
ols_res = ols.fit()

print("\n--- OLS Results (Biased) ---")
print(f"Return to education (OLS): {ols_res.params['educ']:.4f}")
# Expected result: ~0.07 (7% increase per year)

# ==========================================
# 4. MANUAL LATE CALCULATION (The Wald Estimator)
# ==========================================
# LATE = (Reduced Form) / (First Stage)
# This helps me understand what's happening under the hood.

# A. First Stage: Effect of Instrument (nearc4) on Treatment (educ)
# Checking relevance: Does living near a college actually increase education?
first_stage = sm.OLS(df['educ'], df[['const', 'nearc4', 'exper', 'black', 'south', 'smsa']])
fs_res = first_stage.fit()
alpha_1 = fs_res.params['nearc4']

# B. Reduced Form: Effect of Instrument (nearc4) on Outcome (lwage)
# Checking: Does living near a college increase wages directly (via education)?
reduced_form = sm.OLS(df['lwage'], df[['const', 'nearc4', 'exper', 'black', 'south', 'smsa']])
rf_res = reduced_form.fit()
beta_1 = rf_res.params['nearc4']

# C. Calculate LATE
wald_estimator = beta_1 / alpha_1

print("\n--- Manual Wald Estimator ---")
print(f"First Stage Effect (Alpha): {alpha_1:.4f}")
print(f"Reduced Form Effect (Beta): {beta_1:.4f}")
print(f"Calculated LATE (Beta/Alpha): {wald_estimator:.4f}")

# ==========================================
# 5. AUTOMATED 2SLS (The Proper Way)
# ==========================================
# Using linearmodels to get correct standard errors.
# Syntax: Dependent ~ Exogenous + [Endogenous ~ Instruments]
iv = IV2SLS(dependent=df['lwage'],
            exog=df[['const', 'exper', 'black', 'south', 'smsa']],
            endog=df['educ'],
            instruments=df['nearc4'])

iv_res = iv.fit()

print("\n--- 2SLS / IV Results ---")
print(iv_res.summary)

# Note to self: If IV > OLS, it means the "Compliers" (people who only went to college
# because it was close) have higher returns than the average person.
# This often indicates they were credit-constrained.


--- OLS Results (Biased) ---
Return to education (OLS): 0.0739

--- Manual Wald Estimator ---
First Stage Effect (Alpha): 0.2892
Reduced Form Effect (Beta): 0.0202
Calculated LATE (Beta/Alpha): 0.0697

--- 2SLS / IV Results ---
                          IV-2SLS Estimation Summary                          
Dep. Variable:                  lwage   R-squared:                      0.1961
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1936
No. Observations:                1600   F-statistic:                    203.39
Date:                Fri, Feb 13 2026   P-value (F-stat)                0.0000
Time:                        18:04:17   Distribution:                  chi2(5)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lo