In [3]:
!pip install pandas numpy scikit-learn statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.1-cp38-cp38-macosx_10_9_x86_64.whl (10.5 MB)
[K     |████████████████████████████████| 10.5 MB 1.3 MB/s eta 0:00:011
Collecting patsy>=0.5.4
  Downloading patsy-1.0.2-py2.py3-none-any.whl (233 kB)
[K     |████████████████████████████████| 233 kB 63.9 MB/s eta 0:00:01
[?25hCollecting packaging>=21.3
  Using cached packaging-25.0-py3-none-any.whl (66 kB)
Installing collected packages: patsy, packaging, statsmodels
  Attempting uninstall: packaging
    Found existing installation: packaging 20.4
    Uninstalling packaging-20.4:
      Successfully uninstalled packaging-20.4
Successfully installed packaging-25.0 patsy-1.0.2 statsmodels-0.14.1
You should consider upgrading via the '/usr/local/opt/python@3.8/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [5]:
!pip install --upgrade statsmodels

You should consider upgrading via the '/usr/local/opt/python@3.8/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [2]:
print("Importing packages...")
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

Importing packages...


In [3]:
# LOAD AND PREPROCESS DATA
# --------------------------
print("Loading and preprocessing data...")
file_path = '401k.csv'

try:
    data = pd.read_csv(file_path)
    print(f"Successfully loaded '{file_path}'. Shape: {data.shape}")
except FileNotFoundError:
    print(f"Error: '{file_path}' not found.")
    print("Please make sure the dataset is in the same directory as this notebook/script.")
    data = None

if data is not None:
    # --- Feature Engineering & Variable Definition ---
    
    # Create log_inc
    # CONCEPT: We create log_inc (log of income) for a few reasons:
    # 1. It pulls in outliers. Income data is often highly skewed (a few very rich people).
    # 2. A 1-unit change in log_inc can be interpreted as a ~1% change in income,
    #    which is often more meaningful than a $1 change.
    # We use np.maximum(1, ...) to avoid log(0) errors if any incomes are $0.
    data['log_inc'] = np.log(np.maximum(1, data['inc']))
    
    # Define the key variables for our analysis
    
    # Y_VAR: The Outcome (Dependent) Variable
    # What we are trying to predict or explain.
    Y_VAR = 'tw'  # Total Wealth
    
    # T_VAR: The Treatment Variable
    # This is the variable whose causal effect we want to measure.
    T_VAR = 'p401' # Participation in 401(k)
    
    # Z_VAR: The Instrumental Variable
    # A special variable used to "fix" selection bias in the 2SLS model.
    Z_VAR = 'e401' # Eligibility for 401(k)
    
    # CONTROL_VARS: The Control Variables (Covariates)
    # All the other variables we specified that could also affect wealth.
    # We include these to "control for" their effects.
    CONTROL_VARS = [
        'log_inc', 'age', 'fsize', 'marr', 'twoearn', 'db', 'ira', 'pira', 
        'hown', 'male', 'educ', 'hmort', 'hequity', 'hval'
    ]
    
    # All variables needed for the analysis
    ALL_VARS = [Y_VAR, T_VAR, Z_VAR] + CONTROL_VARS
    
    # Data Cleaning: Drop rows with missing values
    # We only keep rows that have complete data for all variables we will use.
    original_rows = data.shape[0]
    data = data[ALL_VARS].dropna()
    print(f"Dropped {original_rows - data.shape[0]} rows with missing values.")
    print(f"Final dataset shape: {data.shape}")

Loading and preprocessing data...
Successfully loaded '401k.csv'. Shape: (9915, 24)
Dropped 0 rows with missing values.
Final dataset shape: (9915, 17)


In [4]:
# DEFINE FINAL VARIABLES FOR MODELS
# -----------------------------------
if 'data' in locals() and data is not None:
    # y: The outcome variable (a Series)
    y = data[Y_VAR]
    
    # t: The treatment variable (a Series)
    t = data[T_VAR]
    
    # X: The control variables (a DataFrame)
    # Note: This *excludes* the treatment 'p401'.
    # This is used for the causal models (DML and 2SLS).
    X = data[CONTROL_VARS]
    
    # Z: The instrumental variable (a Series)
    Z = data[Z_VAR]
    
    # X_with_treatment: All features for the *predictive* model
    # This *includes* the treatment 'p401' as a feature.
    # This is used for the "Naive LASSO" in Part 1.
    X_with_treatment = data[[T_VAR] + CONTROL_VARS]

    print("Data loading and preprocessing complete.")
    print(f"Target (y): {Y_VAR}")
    print(f"Treatment (t): {T_VAR}")
    print(f"Instrument (Z): {Z_VAR}")
    print(f"Controls (X): {X.columns.tolist()}")
    print(f"Features for Naive LASSO: {X_with_treatment.columns.tolist()}")
else:
    print("Data was not loaded. Fix the previous cell and re-run.")


Data loading and preprocessing complete.
Target (y): tw
Treatment (t): p401
Instrument (Z): e401
Controls (X): ['log_inc', 'age', 'fsize', 'marr', 'twoearn', 'db', 'ira', 'pira', 'hown', 'male', 'educ', 'hmort', 'hequity', 'hval']
Features for Naive LASSO: ['p401', 'log_inc', 'age', 'fsize', 'marr', 'twoearn', 'db', 'ira', 'pira', 'hown', 'male', 'educ', 'hmort', 'hequity', 'hval']


## Part 1: Naive LASSO Model (tw ~ p401 + controls)
We chose this as a standard machine learning model for **prediction**.

**Conceptual Clarification:** We call this "Naive" because it treats `p401` as just another feature. 
It doesn't know about "selection bias." The coefficient it finds for `p401` is **biased** and 
represents *correlation*, not *causation*. It answers: "How does `tw` change when `p401` is 1, 
*holding all else constant*?" This is biased because people who choose `p401=1` are different 
in unobserved ways.

In [5]:
# PART 1: NAIVE LASSO MODEL
# -------------------------
print("\n--- Part 1: Naive LASSO Model (tw ~ p401 + controls) ---")

# Split data into training and testing sets
# We train the model on 80% of the data and test its performance on 20%
X_train, X_test, y_train, y_test = train_test_split(
    X_with_treatment, y, test_size=0.2, random_state=42
)

# Create a 'Pipeline'
# Our pipeline will:
# 1. Impute: Fill any remaining missing values (as a safety step).
# 2. Scale: Standardize all features.
# 3. Lasso: Run the LASSO regression model.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    
    # CONCEPT: StandardScaler is CRITICAL for LASSO.
    # LASSO penalizes large coefficients. If features are on different scales
    # (e.g., age 25-64 vs hval 0-300000+), the model will unfairly
    # penalize the coefficients for 'hval'. Scaling puts all features
    # on a similar scale (mean 0, std dev 1).
    ('scaler', StandardScaler()),
    
    # CONCEPT: LassoCV (Cross-Validation)
    # This automatically tests many different 'alphas' (penalization strengths)
    # using 10-fold cross-validation (cv=10) and selects the best one.
    ('lasso', LassoCV(cv=10, random_state=42, n_jobs=-1))
])

# Train the model
pipeline.fit(X_train, y_train)

# Get the best alpha (penalty strength) found by CV
best_alpha = pipeline.named_steps['lasso'].alpha_
print(f"Best alpha selected by LassoCV: {best_alpha:.4f}")

# Evaluate on the unseen test set
y_pred = pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Test Set RMSE: {rmse:.2f}")
print(f"Test Set R-squared: {r2:.2f}")

# Get coefficients
lasso_model = pipeline.named_steps['lasso']
lasso_coefs = pd.Series(lasso_model.coef_, index=X_with_treatment.columns)

print("\nLASSO Coefficients:")
print(lasso_coefs.sort_values(ascending=False))

p401_coef_lasso = lasso_coefs.get('p401')
print("\n------------------------------------------------------")
print("Project Insight (Model Performance & Causal Effect):")
print(f"The coefficient for 'p401' in this *predictive* model is: {p401_coef_lasso:.2f}")
print("This is the 'Naive' (or 'Biased') estimate. It's likely high")
print("because it's capturing both the true effect of 401(k)s AND")
print("the fact that people who save a lot are more likely to sign up.")
print("This is **selection bias**.")
print("------------------------------------------------------")


--- Part 1: Naive LASSO Model (tw ~ p401 + controls) ---
Best alpha selected by LassoCV: 79.2591
Test Set RMSE: 66060.99
Test Set R-squared: 0.63

LASSO Coefficients:
hequity    52505.221108
ira        26943.488541
hval       16121.140435
log_inc    10949.202509
age         7137.367674
p401        4650.277035
marr        3687.592929
pira        3233.435161
educ        1501.808628
male         918.589618
hmort          0.000000
db         -1098.004872
fsize      -1635.398022
hown       -5883.377366
twoearn    -8592.614474
dtype: float64

------------------------------------------------------
Project Insight (Model Performance & Causal Effect):
The coefficient for 'p401' in this *predictive* model is: 4650.28
This is the 'Naive' (or 'Biased') estimate. It's likely high
because it's capturing both the true effect of 401(k)s AND
the fact that people who save a lot are more likely to sign up.
This is **selection bias**.
------------------------------------------------------


## Part 2: Double-Selection (DML) LASSO (For Causal Effect)

This is a modern econometric method using ML to estimate a **causal effect**.

**Conceptual Clarification:** DML (from Chernozhukov, et al.) "de-biases" the `p401` coefficient. 
It recognizes that the bias in Part 1 comes from omitting variables that are related to *both* 
participation (`p401`) and wealth (`tw`).

It works in 3 main steps (Steps 1, 2, 3 here are 2 stages in the paper):
1.  **Stage 1 (Outcome):** Use LASSO to find all controls `X` that are good at predicting the 
outcome `tw`.
2.  **Stage 2 (Treatment):** Use LASSO to find all controls `X` that are good at predicting 
the treatment `p401`. (This is the "selection" model).
3.  **Final OLS:** Run a simple OLS regression of `tw ~ p401 + union_of_controls`, where 
union_of_controls` is the *combination* of all variables selected in Step 1 *and* Step 2.

This ensures we control for *everything* that could be causing bias.

In [11]:
# HELPER FUNCTION FOR DML
# -----------------------
def run_lasso_selection(X, y, alpha=None):
    """Helper function to run LASSO for feature selection."""
    if alpha is None:
        # Use LassoCV to find the best alpha if one isn't provided
        model = LassoCV(cv=10, random_state=42, n_jobs=-1).fit(X, y)
    else:
        model = Lasso(alpha=alpha, random_state=42).fit(X, y)
    
    # Get coefficients and return the *indices* of non-zero ones
    coefs = np.abs(model.coef_)
    print(coefs)
    return model, np.where(coefs > 1e-6)[0] # 1e-6 to avoid tiny floating points

In [9]:
# PART 2: DOUBLE-SELECTION LASSO
# ------------------------------
print("\n--- Part 2: Double-Selection LASSO (For Causal Effect) ---")

# Standardize data for LASSO selection steps
# As before, scaling is critical for LASSO
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=CONTROL_VARS)

# Impute just in case. We'll use these imputed, scaled versions.
imputer = SimpleImputer(strategy='median')
X_scaled_imputed = imputer.fit_transform(X_scaled)
y_imputed = imputer.fit_transform(y.values.reshape(-1, 1)).ravel()
t_imputed = imputer.fit_transform(t.values.reshape(-1, 1)).ravel()

# --- Step 1: LASSO for outcome model (tw ~ controls) ---
# "What controls predict wealth?"
model_y, selected_y_indices = run_lasso_selection(X_scaled_imputed, y_imputed)
selected_y_vars = X.columns[selected_y_indices].tolist()
print(f"Controls selected by outcome model (tw ~ X): {selected_y_vars}")

# --- Step 2: LASSO for treatment model (p401 ~ controls) ---
# "What controls predict participation? (i.e., what causes selection bias?)"
# CONCEPT: Since p401 is binary (0/1), we can't use standard LASSO.
# We must use a classifier. LogisticRegression with 'l1' penalty *is* LASSO
# for classification.
logreg_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='median')),
    ('model', LogisticRegression(penalty='l1', solver='liblinear', random_state=42))
])
# We use GridSearchCV to find the best 'C' (which is 1/alpha, the penalty strength)
param_grid = {'model__C': np.logspace(-2, 2, 10)}
grid = GridSearchCV(logreg_pipe, param_grid, cv=10, n_jobs=-1)
grid.fit(X.values, t_imputed) # Use original X.values for this pipeline

logreg_model = grid.best_estimator_.named_steps['model']
selected_t_indices = np.where(np.abs(logreg_model.coef_[0]) > 1e-6)[0]
selected_t_vars = X.columns[selected_t_indices].tolist()
print(f"Controls selected by treatment model (p401 ~ X): {selected_t_vars}")

# --- Step 3: Union of selected controls ---
# This is the "Double-Selection" part. We combine both lists.
union_vars = list(set(selected_y_vars) | set(selected_t_vars))
print(f"Union of selected controls: {union_vars}")

if not union_vars:
    print("Warning: DML selected no control variables. Effect may be unreliable.")
    X_final = pd.DataFrame(index=X.index) # Empty dataframe with original index
else:
    X_final = X[union_vars] # Select only the union of controls
    
# --- Step 4: Final OLS ---
# Now we run a simple OLS (tw ~ p401 + union_of_controls)
# This gives the "debiased" causal effect of p401.
X_ols_dml = pd.concat([t.reset_index(drop=True), X_final.reset_index(drop=True)], axis=1)
X_ols_dml.columns = [T_VAR] + union_vars
X_ols_dml = sm.add_constant(X_ols_dml, prepend=False) # Add intercept
y_ols_dml = y.reset_index(drop=True)

dml_model = sm.OLS(y_ols_dml, X_ols_dml).fit()

print(dml_model.summary(xname=[T_VAR] + union_vars + ['const']))

dml_coef = dml_model.params[T_VAR]
dml_se = dml_model.bse[T_VAR]

print("\n------------------------------------------------------")
print("Project Insight (DML Causal Effect):")
print(f"The DML estimate for 'p401' is: {dml_coef:.2f} (SE: {dml_se:.2f})")
print("This is the 'unbiased' ML-based estimate. Compare this to the")
print("Naive LASSO coefficient. This is a more credible causal effect.")
print("------------------------------------------------------")


--- Part 2: Double-Selection LASSO (For Causal Effect) ---
[11814.86388836  6574.98674172  2111.34673033  3546.56256922
  8221.51099888   486.8124728  26544.62497005  3770.61983464
  5691.04689871  1442.96064232  1292.88114301     0.
 53610.0127981  15351.66119028]
Controls selected by outcome model (tw ~ X): ['log_inc', 'age', 'fsize', 'marr', 'twoearn', 'db', 'ira', 'pira', 'hown', 'male', 'educ', 'hequity', 'hval']
Controls selected by treatment model (p401 ~ X): ['log_inc', 'age', 'fsize', 'marr', 'twoearn', 'db', 'pira', 'hown', 'male', 'educ', 'hmort', 'hequity']
Union of selected controls: ['ira', 'db', 'hown', 'age', 'hval', 'hequity', 'hmort', 'marr', 'fsize', 'pira', 'educ', 'twoearn', 'male', 'log_inc']
                            OLS Regression Results                            
Dep. Variable:                     tw   R-squared:                       0.607
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares 

## Part 3: Replication of OLS and 2SLS Models
 
This is a required part of your project, replicating the "traditional" econometric counterparts.

* **OLS (Ordinary Least Squares):** This is the traditional version of Part 1. It will also be **biased** by selection, just like the Naive LASSO.
* **2SLS (Two-Stage Least Squares):** This is the traditional *causal* model. It's the ancestor  of the DML model. It "solves" selection bias using an **Instrumental Variable (IV)**.

In [8]:
# PART 3: REPLICATION OF OLS AND 2SLS
# -----------------------------------
print("\n--- Part 3: Replication of OLS and 2SLS Models ---")

# Prepare data for statsmodels (already done as y, t, X, Z)
# We must manually add a constant (intercept) for statsmodels
X_const = sm.add_constant(X, prepend=False)

# --- Model 1: OLS (tw ~ p401 + controls) ---
# This is the "traditional" econometric counterpart to your naive LASSO
X_ols_rep = pd.concat([t, X_const], axis=1)
ols_model = sm.OLS(y, X_ols_rep).fit()

ols_coef = ols_model.params[T_VAR]
ols_se = ols_model.bse[T_VAR]

print("\n--- OLS Model (Replication) ---")
print(f"OLS coefficient for 'p401': {ols_coef:.2f} (SE: {ols_se:.2f})")
print("This is the standard OLS estimate, which suffers from selection bias.")
print("This should be very similar to the Naive LASSO coefficient.")

# --- Model 2: 2SLS (tw ~ p401 + controls, using e401 as instrument) ---
print("\n--- 2SLS Model (Replication) ---")

# Instrumental Variable (IV)
# We have selection bias because 'p401' is chosen by people.
# An IV ('e401', eligibility) is a variable that:
# 1. (Relevance) Strongly predicts participation 'p401'.
#    (True: You can't participate if not eligible).
# 2. (Exclusion) Affects wealth 'tw' *only through* its effect on 'p401'.
#    (Plausible: Being eligible doesn't make you rich *unless* you participate).
#
# 2SLS uses 'e401' to isolate the "clean" part of 'p401' (the part
# "forced" by eligibility) and uses *only* that to estimate the causal effect.

# MOVED: Define IV variables *before* the try/except block
# This makes them available to both the 'try' and 'except' scopes.
# Note: X_const, T, and Z must have compatible indices
Y_iv = y.reset_index(drop=True)
X_const_iv = X_const.reset_index(drop=True) # Exogenous controls
T_iv = t.reset_index(drop=True)         # Endogenous treatment
Z_iv = Z.reset_index(drop=True)         # Instrument

try:
    # We will try to import and use the "modern" IV2SLS module
    from statsmodels.iv.api import IV2SLS
    
    # `dependent` = Y (tw)
    # `exog` = All *exogenous* controls (X_const)
    # `endog` = The *endogenous* treatment (t = p401)
    # `instruments` = The *instrument* (Z = e401)
    iv_model = IV2SLS(
        dependent=Y_iv,
        exog=X_const_iv,
        endog=T_iv,
        instruments=Z_iv
    ).fit()

    print(iv_model.summary(xname=X_const.columns.tolist() + [T_VAR]))

    iv_coef = iv_model.params[T_VAR]
    iv_se = iv_model.bse[T_VAR]
        
except Exception as e:
    # This 'except' block will run if you have an old statsmodels version
    print(f"\nCould not run statsmodels.iv.api.IV2SLS (error: {e})")
    print("Falling back to manual 2SLS for demonstration.")
    
    # --- Manual 2SLS ---
    # 1st Stage: Regress the endogenous variable (p401) on the
    #            instrument (e401) and all controls (X_const).
    X_stage1 = pd.concat([Z_iv, X_const_iv], axis=1)
    model_stage1 = sm.OLS(T_iv, X_stage1).fit()
    p401_predicted = model_stage1.predict(X_stage1)
    
    # 2nd Stage: Regress the outcome (tw) on the *predicted* p401
    #            (p401_predicted) and all controls (X_const).
    X_stage2 = pd.concat([p401_predicted, X_const_iv], axis=1)
    X_stage2.columns = [T_VAR] + X_const_iv.columns.tolist()
    model_stage2 = sm.OLS(Y_iv, X_stage2).fit()

    print("\n--- 2SLS Model (Manual) ---")
    print(model_stage2.summary())
    iv_coef = model_stage2.params[T_VAR]
    iv_se = model_stage2.bse[T_VAR] # NOTE: These SEs are technically incorrect
    iv_model = model_stage2 # Save for summary
    print("\n*** Warning: Manual 2SLS standard errors are incorrect. ***")
    print("*** The coefficient estimate, however, is consistent. ***")

print("\n------------------------------------------------------")
print("Project Insight (2SLS Causal Effect):")
print(f"The 2SLS (Instrumental Variable) estimate for 'p401' is: {iv_coef:.2f} (SE: {iv_se:.2f})")
print("This is the 'traditional' econometric method for finding the")
print("causal effect. You should compare this value to the DML (Part 2)")
print("estimate. They are two different ways of trying to get the 'true' causal number.")
print("------------------------------------------------------")


--- Part 3: Replication of OLS and 2SLS Models ---

--- OLS Model (Replication) ---
OLS coefficient for 'p401': 12463.10 (SE: 1693.41)
This is the standard OLS estimate, which suffers from selection bias.
This should be very similar to the Naive LASSO coefficient.

--- 2SLS Model (Replication) ---

Could not run statsmodels.iv.api.IV2SLS (error: No module named 'statsmodels.iv')
Falling back to manual 2SLS for demonstration.

--- 2SLS Model (Manual) ---
                            OLS Regression Results                            
Dep. Variable:                     tw   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.605
Method:                 Least Squares   F-statistic:                     1085.
Date:                Tue, 11 Nov 2025   Prob (F-statistic):               0.00
Time:                        19:22:43   Log-Likelihood:            -1.2469e+05
No. Observations:                9915   AIC:                       

## Part 4: Analyzing Heterogeneity

This is an optional but important part of your project.

**Conceptual Clarification:** "Heterogeneity" asks: "Is the effect of 401(k) participation 
*the same* for everyone?"

Maybe the effect is *larger* for high-income earners or *smaller* for those with less 
education. We can test this by adding **interaction terms**.

In [9]:
# PART 4: HETEROGENEITY ANALYSIS
# ------------------------------
print("\n--- Part 4: Analyzing Heterogeneity (Example) ---")
print("Testing for heterogeneity by income and education...")

# We will use the main dataset `data`
data_het = data.copy()

# Create interaction terms
# 'p401_x_log_inc' = p401 * log_inc
data_het['p401_x_log_inc'] = data_het[T_VAR] * data_het['log_inc']
data_het['p401_x_educ'] = data_het[T_VAR] * data_het['educ']

Y_het = data_het[Y_VAR]

# The new "treatment" variables are p401, AND the interaction terms
X_het_vars = [T_VAR, 'p401_x_log_inc', 'p401_x_educ'] + CONTROL_VARS
X_het = data_het[X_het_vars]
X_het = sm.add_constant(X_het, prepend=False) # Add intercept

try:
    # We run a simple OLS model.
    # Note: This OLS model is *also* biased, just like in Part 3.
    # A more advanced analysis would interact the *instrument* (2SLS)
    # or use a causal forest, but this is a good first step.
    model_het = sm.OLS(Y_het, X_het).fit()
    print("\n--- OLS Model with Heterogeneity ---")
    print(model_het.summary(
        xname=X_het_vars + ['const']
    ))
    
    print("\n------------------------------------------------------")
    print("Project Insight (Heterogeneity):")
    print("To interpret this model:")
    print(f" - 'p401' coef: The effect of p401 when log_inc and educ are 0 (not very useful).")
    print(f" - 'p401_x_log_inc' coef: How the effect of p401 *changes* with a one-unit increase in log_inc.")
    print(f" - 'p401_x_educ' coef: How the effect of p401 *changes* with one more year of education.")
    print("If the p-value (P>|t|) for an interaction term is significant (e.g., < 0.05),")
    print("it suggests the effect of 401(k) is *not* one-size-fits-all.")
    print("------------------------------------------------------")
except Exception as e:
    print(f"Could not run heterogeneity analysis: {e}")



--- Part 4: Analyzing Heterogeneity (Example) ---
Testing for heterogeneity by income and education...

--- OLS Model with Heterogeneity ---
                            OLS Regression Results                            
Dep. Variable:                     tw   R-squared:                       0.608
Model:                            OLS   Adj. R-squared:                  0.608
Method:                 Least Squares   F-statistic:                     960.3
Date:                Tue, 11 Nov 2025   Prob (F-statistic):               0.00
Time:                        19:23:27   Log-Likelihood:            -1.2466e+05
No. Observations:                9915   AIC:                         2.493e+05
Df Residuals:                    9898   BIC:                         2.495e+05
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.0

In [11]:
# FINAL SUMMARY
# ---------------
print("\n\n" + "="*70)
print("FINAL SUMMARY OF 401(k) PARTICIPATION EFFECT ('p401') ON WEALTH ('tw')")
print("="*70)

try:
    # This table collects the key 'p401' coefficient from all 4 models.
    # This is the central table for your project report.
    summary_df = pd.DataFrame({
        'Model': [
            '1. Naive LASSO', 
            '2. Replication OLS', 
            '3. Double-Selection LASSO (DML)', 
            '4. Replication 2SLS (IV)'
        ],
        'Method': [
            'ML Prediction (Biased)', 
            'Traditional (Biased)', 
            'ML Causal (Unbiased)', 
            'Traditional Causal (Unbiased)'
        ],
        'p401_Coefficient': [
            p401_coef_lasso,
            ols_model.params[T_VAR],
            dml_model.params[T_VAR],
            iv_model.params[T_VAR]
        ],
        'Standard_Error': [
            None, # LASSO doesn't provide easy SEs
            ols_model.bse[T_VAR],
            dml_model.bse[T_VAR],
            iv_model.bse[T_VAR] # May be incorrect if manual 2SLS was used
        ]
    })
    
    print(summary_df.to_string(index=False, float_format="%.2f"))
    
    print("\n--- Final Comparison ---")
    print("This table is the core of your project's comparison.")
    print("\n  GROUP 1: BIASED ASSOCIATION (Correlation)")
    print("   - (1) Naive LASSO & (2) Replication OLS.")
    print("   - These models show the *association* between p401 and tw.")
    print("   - They are biased by 'selection' (e.g., savers sign up more).")
    
    print("\n  GROUP 2: CAUSAL EFFECT (Causation)")
    print("   - (3) DML LASSO & (4) Replication 2SLS.")
    print("   - These models use different methods to try and *correct* for selection bias.")
    print("   - (3) uses ML to select the right controls.")
    print("   - (4) uses an 'instrumental variable' (e401).")
    
    print("\nWe should discuss why (1) & (2) differ from (3) & (4).")

except Exception as e:
    print(f"Could not generate final summary (models may not have run): {e}")
    print("Please review the output of each part above.")



FINAL SUMMARY OF 401(k) PARTICIPATION EFFECT ('p401') ON WEALTH ('tw')
                          Model                        Method  p401_Coefficient  Standard_Error
                 1. Naive LASSO        ML Prediction (Biased)           4650.28             NaN
             2. Replication OLS          Traditional (Biased)          12463.10         1693.41
3. Double-Selection LASSO (DML)          ML Causal (Unbiased)          12463.10         1693.41
       4. Replication 2SLS (IV) Traditional Causal (Unbiased)          10897.52         2267.94

--- Final Comparison ---
This table is the core of your project's comparison.

  GROUP 1: BIASED ASSOCIATION (Correlation)
   - (1) Naive LASSO & (2) Replication OLS.
   - These models show the *association* between p401 and tw.
   - They are biased by 'selection' (e.g., savers sign up more).

  GROUP 2: CAUSAL EFFECT (Causation)
   - (3) DML LASSO & (4) Replication 2SLS.
   - These models use different methods to try and *correct* for select