In [2]:
import pandas as pd
import numpy as np
import wooldridge
from linearmodels.iv import IV2SLS
import statsmodels.api as sm

# 1. DATA LOADING
# ---------------------------------------------------------
# Source: the 'card' dataset shipped with the wooldridge package
# (David Card's 1995 paper on the return to schooling).
df = wooldridge.data('card')

# Sanity check: preview the first rows and confirm the key variables
# (educ, wage, nearc4) look right.
preview = df.head()
print(preview)

# 2. DATA PREPARATION
# ---------------------------------------------------------
# Every variable used by either model. Rows with a missing value in any of
# them are dropped up front so OLS and IV are estimated on the exact same
# set of observations.
model_columns = ['lwage', 'educ', 'exper', 'black', 'south', 'nearc4']

# The models below are built from explicit column selections, so the
# intercept has to be supplied as a column of ones ('const').
# .assign() returns a fresh DataFrame, which avoids writing a new column
# into the object returned by dropna() (a pattern that can trigger
# SettingWithCopyWarning on pandas versions with chained-assignment checks).
df = df.dropna(subset=model_columns).assign(const=1)

# 3. NAIVE OLS REGRESSION
# ---------------------------------------------------------
# Baseline: regress log wage on education plus controls by plain OLS.
# Hypothesis: the coefficient on 'educ' is BIASED because of omitted
# variable bias (e.g., unobserved 'ability').

# Dependent variable and regressors ('const' is the intercept column).
y_ols = df['lwage']
x_ols = df[['const', 'educ', 'exper', 'black', 'south']]

# Fit with White (HC0) heteroskedasticity-robust standard errors so that
# inference is on the same footing as the IV model, which is fitted with
# cov_type='robust'. Only the standard errors / test statistics change;
# the point estimates are identical to a default .fit().
# NOTE(review): HC0 is chosen to match linearmodels' 'robust' estimator
# without small-sample correction -- confirm if debiasing is ever enabled.
ols_model = sm.OLS(y_ols, x_ols).fit(cov_type='HC0')

print("--- My OLS Results ---")
print(ols_model.summary().tables[1])
# Note to self: The coefficient for 'educ' usually hovers around 0.07 (7% return).


# 4. INSTRUMENTAL VARIABLE REGRESSION (2SLS)
# ---------------------------------------------------------
# Instrument: 'nearc4' (grew up near a 4-year college).
#   Relevance: living close to a college lowers the cost of education.
#   Exclusion: proximity should not affect later wages directly, only
#              through its effect on education.
#
# Roles handed to IV2SLS (array interface; the formula-style shorthand
# for the same model is dep ~ exog + [endog ~ instrument]):
#   dependent   -> lwage
#   exog        -> const, exper, black, south   (exogenous controls)
#   endog       -> educ                         (endogenous regressor)
#   instruments -> nearc4                       (excluded instrument)
iv_spec = IV2SLS(
    dependent=df['lwage'],
    exog=df[['const', 'exper', 'black', 'south']],
    endog=df['educ'],
    instruments=df['nearc4'],
)

# Estimate with the 'robust' covariance option.
iv_model = iv_spec.fit(cov_type='robust')

print("\n--- My IV (2SLS) Results ---")
# In linearmodels `summary` is a property, so it is not called.
print(iv_model.summary)

# 5. POST-ESTIMATION ANALYSIS
# ---------------------------------------------------------
# Inspect the first stage to judge whether 'nearc4' is a strong instrument.
# Rule of thumb: a first-stage F-statistic below 10 points to a possible
# weak-instrument problem.
print("\nFirst Stage Diagnostics:")
first_stage = iv_model.first_stage
print(first_stage)

# Apply the rule of thumb explicitly instead of relying on reading the
# table by eye. `diagnostics` has one row per endogenous variable; the
# 'f.stat' column holds the instrument-relevance test statistic.
# NOTE(review): with a non-default covariance estimator linearmodels
# reports this as a Wald (chi2) statistic rather than a classical F --
# confirm the threshold interpretation is acceptable for this analysis.
WEAK_INSTRUMENT_F = 10
first_stage_f = first_stage.diagnostics.loc['educ', 'f.stat']
if first_stage_f < WEAK_INSTRUMENT_F:
    print(f"WARNING: first-stage F = {first_stage_f:.2f} < {WEAK_INSTRUMENT_F}; "
          "'nearc4' may be a weak instrument.")
else:
    print(f"First-stage F = {first_stage_f:.2f} (>= {WEAK_INSTRUMENT_F}); "
          "'nearc4' passes the rule-of-thumb strength check.")

   id  nearc2  nearc4  educ  age  fatheduc  motheduc    weight  momdad14  \
0   2       0       0     7   29       NaN       NaN  158413.0         1   
1   3       0       0    12   27       8.0       8.0  380166.0         1   
2   4       0       0    12   34      14.0      12.0  367470.0         1   
3   5       1       1    11   27      11.0      12.0  380166.0         1   
4   6       1       1    12   34       8.0       7.0  367470.0         1   

   sinmom14  ...  smsa66  wage  enroll   KWW     IQ  married  libcrd14  exper  \
0         0  ...       1   548       0  15.0    NaN      1.0       0.0     16   
1         0  ...       1   481       0  35.0   93.0      1.0       1.0      9   
2         0  ...       1   721       0  42.0  103.0      1.0       1.0     16   
3         0  ...       1   250       0  25.0   88.0      1.0       1.0     10   
4         0  ...       1   729       0  34.0  108.0      1.0       0.0     16   

      lwage  expersq  
0  6.306275      256  
1  6.17586