# Day 2: Regularized Regression (LASSO, Ridge, Elastic Net)

**WISE Workshop | Addis Ababa, Feb 2026**

In this notebook, you'll apply regularization techniques to prevent overfitting in supply chain demand prediction.

## Setup

In [None]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score

import warnings
# Install a blanket "ignore" filter so warnings do not clutter the workshop output.
# NOTE(review): this presumably also hides solver convergence warnings from the
# LASSO / Elastic Net fits later in the notebook - confirm that is acceptable.
warnings.filterwarnings('ignore')

# Confirmation message so participants can see that the setup cell finished.
print("Packages loaded!")

---

## Part 1: Data Preparation

We'll create a dataset with many features to see regularization in action.

In [None]:
# Build a synthetic supply chain demand dataset: one row per calendar day.
# The order of the random draws below (regions, facility types, noise) is kept
# fixed so the seeded results stay reproducible.
np.random.seed(42)
n_rows = 1000

# Calendar index plus randomly assigned region and facility type
dates = pd.date_range('2023-01-01', periods=n_rows, freq='D')
region_names = ['Addis Ababa', 'Oromia', 'Amhara', 'SNNP', 'Tigray']
regions = np.random.choice(region_names, n_rows)
facility_names = ['Hospital', 'Health Center', 'Clinic']
facility_types = np.random.choice(facility_names, n_rows, p=[0.2, 0.5, 0.3])

# Demand = baseline + facility uplift + region uplift + yearly cycle + noise
base_demand = 100
facility_effect = np.select(
    [facility_types == 'Hospital', facility_types == 'Health Center'],
    [80, 30],
    default=0,
)
region_effect = np.select(
    [regions == 'Addis Ababa', regions == 'Oromia'],
    [50, 20],
    default=0,
)
seasonal_effect = 25 * np.sin(2 * np.pi * dates.dayofyear / 365)
noise = np.random.normal(0, 15, n_rows)

demand = base_demand + facility_effect + region_effect + seasonal_effect + noise
demand = np.maximum(demand, 10)  # demand is floored at 10 units

# Assemble the raw table; demand is truncated to whole units
df = pd.DataFrame(dict(
    date=dates,
    region=regions,
    facility_type=facility_types,
    demand=demand.astype(int),
))

print(f"Data shape: {df.shape}")
df.head()

In [None]:
# Feature Engineering: Create MANY features (some useful, some noise)

# Reset to the four base columns first so this cell is safe to re-run.
# Without the reset, a second execution would concatenate the dummy columns
# again and leave duplicate column names in df (and, downstream, duplicate
# entries in the feature list).
base_cols = ['date', 'region', 'facility_type', 'demand']
df = df[base_cols].copy()

# Time-based features
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.dayofweek
df['quarter'] = df['date'].dt.quarter
df['day_of_year'] = df['date'].dt.dayofyear
df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)  # dayofweek 5 and 6 are Sat/Sun

# Cyclical encoding for month (sine/cosine)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

# Create dummy variables for categorical features.
# All levels are kept (drop_first=False), so each dummy group is collinear.
region_dummies = pd.get_dummies(df['region'], prefix='region', drop_first=False)
facility_dummies = pd.get_dummies(df['facility_type'], prefix='facility', drop_first=False)

# Add some noise features (to see if LASSO eliminates them)
for i in range(5):
    df[f'noise_{i}'] = np.random.normal(0, 1, n_rows)

# Combine all features
df = pd.concat([df, region_dummies, facility_dummies], axis=1)

# Fail loudly if a column name was produced twice
assert df.columns.is_unique, "Duplicate feature columns found - re-run the data creation cell"

print(f"After feature engineering: {df.shape}")
print(f"\nFeature columns:")
print([c for c in df.columns if c not in base_cols])

In [None]:
# Separate the predictors from the target column
excluded_cols = {'date', 'region', 'facility_type', 'demand'}
feature_cols = [col for col in df.columns if col not in excluded_cols]

X, y = df[feature_cols], df['demand']

# Hold out 20% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# IMPORTANT: Scale features for regularization.
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Number of features: {len(feature_cols)}")
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

---

## Part 2: Ridge Regression

Ridge regression adds an L2 penalty: $\lambda \sum \beta_j^2$

This **shrinks** all coefficients toward zero but never sets them exactly to zero.

In [None]:
# Sweep Ridge over a grid of alpha (lambda) values and record the coefficients.
# Result: ridge_coefs has one row per alpha and one column per feature.
alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
ridge_coefs = np.empty((len(alphas), X_train_scaled.shape[1]))

for row, alpha in enumerate(alphas):
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_scaled, y_train)
    ridge_coefs[row] = ridge.coef_

In [None]:
# Plot how every Ridge coefficient changes as the penalty grows
fig, ax = plt.subplots(figsize=(12, 6))

log_alphas = np.log10(alphas)
# One line per feature: each column of ridge_coefs is that feature's path
for coef_path in ridge_coefs.T:
    ax.plot(log_alphas, coef_path, linewidth=1)

# Dashed reference line at zero
ax.axhline(y=0, color='black', linestyle='--', alpha=0.5)
ax.set_xlabel('log10(alpha)', fontsize=12)
ax.set_ylabel('Coefficient Value', fontsize=12)
ax.set_title(
    'Ridge Regression: Coefficient Paths\n(All coefficients shrink, but none reach zero)',
    fontsize=14,
)
plt.tight_layout()
plt.show()

In [None]:
# Use cross-validation to find optimal alpha
# Candidate penalties: 50 values log-spaced between 1e-3 and 1e3
alphas_cv = np.logspace(-3, 3, 50)

# 5-fold CV; each alpha is scored by negative mean squared error
ridge_cv = RidgeCV(alphas=alphas_cv, cv=5, scoring='neg_mean_squared_error')
ridge_cv.fit(X_train_scaled, y_train)

print(f"Optimal alpha: {ridge_cv.alpha_:.4f}")

# Evaluate on test set
ridge_pred = ridge_cv.predict(X_test_scaled)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
ridge_r2 = r2_score(y_test, ridge_pred)

print(f"Ridge Test RMSE: {ridge_rmse:.2f}")
# The label uses the proper superscript-two character (U+00B2)
print(f"Ridge Test R²: {ridge_r2:.3f}")

---

## Part 3: LASSO Regression

LASSO adds an L1 penalty: $\lambda \sum |\beta_j|$

This **shrinks** coefficients AND can set them **exactly to zero** (feature selection!).

In [None]:
# Sweep LASSO over a grid of alpha values and record the coefficients.
# Result: lasso_coefs has one row per alpha and one column per feature.
alphas_lasso = [0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10]
lasso_coefs = np.empty((len(alphas_lasso), X_train_scaled.shape[1]))

for row, alpha in enumerate(alphas_lasso):
    # max_iter is set explicitly to 10000 for the solver
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X_train_scaled, y_train)
    lasso_coefs[row] = lasso.coef_

In [None]:
# Plot how every LASSO coefficient changes as the penalty grows
fig, ax = plt.subplots(figsize=(12, 6))

log_alphas_lasso = np.log10(alphas_lasso)
# One line per feature: each column of lasso_coefs is that feature's path
for coef_path in lasso_coefs.T:
    ax.plot(log_alphas_lasso, coef_path, linewidth=1)

# Dashed reference line at zero
ax.axhline(y=0, color='black', linestyle='--', alpha=0.5)
ax.set_xlabel('log10(alpha)', fontsize=12)
ax.set_ylabel('Coefficient Value', fontsize=12)
ax.set_title(
    'LASSO Regression: Coefficient Paths\n(Watch coefficients go to EXACTLY zero!)',
    fontsize=14,
)
plt.tight_layout()
plt.show()

In [None]:
# Use cross-validation to find optimal alpha for LASSO
# Candidate penalties: 50 values log-spaced between 1e-3 and 1e1, 5-fold CV
lasso_cv = LassoCV(alphas=np.logspace(-3, 1, 50), cv=5, max_iter=10000)
lasso_cv.fit(X_train_scaled, y_train)

print(f"Optimal alpha: {lasso_cv.alpha_:.4f}")

# Evaluate on test set
lasso_pred = lasso_cv.predict(X_test_scaled)
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
lasso_r2 = r2_score(y_test, lasso_pred)

print(f"LASSO Test RMSE: {lasso_rmse:.2f}")
# The label uses the proper superscript-two character (U+00B2)
print(f"LASSO Test R²: {lasso_r2:.3f}")

In [None]:
# Pair each feature name with its coefficient from the cross-validated LASSO
lasso_coef_df = pd.DataFrame({'feature': feature_cols, 'coefficient': lasso_cv.coef_})

# A feature counts as "selected" when its coefficient is not exactly zero
is_selected = lasso_coef_df['coefficient'].ne(0)
n_selected = is_selected.sum()
n_eliminated = (~is_selected).sum()

print(f"Features SELECTED by LASSO: {n_selected}")
print(f"Features ELIMINATED by LASSO: {n_eliminated}")
print(f"\nSelected features:")

# List the surviving features, largest absolute coefficient first
selected_features = lasso_coef_df.loc[is_selected]
print(selected_features.sort_values('coefficient', key=abs, ascending=False))

In [None]:
# Horizontal bar chart of LASSO coefficients, smallest magnitude at the bottom
fig, ax = plt.subplots(figsize=(12, 6))

# Order the features by absolute coefficient value
sorted_df = (
    lasso_coef_df
    .assign(abs_coef=lambda frame: frame['coefficient'].abs())
    .sort_values('abs_coef', ascending=True)
)

# Green marks a non-zero (selected) coefficient, gray an eliminated one
colors = np.where(sorted_df['coefficient'] != 0, 'green', 'gray').tolist()
bar_positions = range(len(sorted_df))

ax.barh(bar_positions, sorted_df['coefficient'], color=colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(sorted_df['feature'], fontsize=8)
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax.set_xlabel('Coefficient Value', fontsize=12)
ax.set_title('LASSO Feature Selection\n(Green = Selected, Gray = Eliminated)', fontsize=14)
plt.tight_layout()
plt.show()

### 🤔 Discussion

Look at which features LASSO selected:
- Did it keep the region and facility features (which have real effects)?
- Did it eliminate the noise features?
- What about the cyclical month encoding?

---

## Part 4: Elastic Net

Elastic Net combines L1 and L2 penalties:

$\text{Loss} = \text{MSE} + \lambda_1 \sum |\beta_j| + \lambda_2 \sum \beta_j^2$

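This gives you the best of both worlds: feature selection (L1) + grouped selection of correlated features (L2). In scikit-learn, the two penalties are controlled by `alpha` (overall penalty strength) and `l1_ratio` (the share of the penalty given to the L1 term).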

In [None]:
# Fit Elastic Net with cross-validation
# l1_ratio controls the mix: 1 = LASSO, 0 = Ridge
# The search covers every (l1_ratio, alpha) pair: 6 mixing ratios and
# 30 alphas log-spaced between 1e-3 and 1e1, with 5-fold CV.
elastic_cv = ElasticNetCV(
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99],
    alphas=np.logspace(-3, 1, 30),
    cv=5,
    max_iter=10000
)
elastic_cv.fit(X_train_scaled, y_train)

print(f"Optimal alpha: {elastic_cv.alpha_:.4f}")
print(f"Optimal l1_ratio: {elastic_cv.l1_ratio_:.2f}")

# Evaluate on test set
elastic_pred = elastic_cv.predict(X_test_scaled)
elastic_rmse = np.sqrt(mean_squared_error(y_test, elastic_pred))
elastic_r2 = r2_score(y_test, elastic_pred)

print(f"Elastic Net Test RMSE: {elastic_rmse:.2f}")
# The label uses the proper superscript-two character (U+00B2)
print(f"Elastic Net Test R²: {elastic_r2:.3f}")

In [None]:
# A feature counts as selected by Elastic Net when its coefficient is non-zero
elastic_nonzero_mask = elastic_cv.coef_ != 0
n_selected_elastic = elastic_nonzero_mask.sum()
print(f"Features selected by Elastic Net: {n_selected_elastic}")

---

## Part 5: Model Comparison

In [None]:
# Unregularized linear regression as the reference point for the comparison
ols = LinearRegression().fit(X_train_scaled, y_train)
ols_pred = ols.predict(X_test_scaled)

# Same test-set metrics as the regularized models
ols_mse = mean_squared_error(y_test, ols_pred)
ols_rmse = np.sqrt(ols_mse)
ols_r2 = r2_score(y_test, ols_pred)

In [None]:
# Summary comparison table: one row per model, with test-set error,
# test-set R² and the number of features each model uses.
# The R² column name uses the proper superscript-two character (U+00B2).
results = pd.DataFrame({
    'Model': ['OLS (Baseline)', 'Ridge', 'LASSO', 'Elastic Net'],
    'Test RMSE': [ols_rmse, ridge_rmse, lasso_rmse, elastic_rmse],
    'Test R²': [ols_r2, ridge_r2, lasso_r2, elastic_r2],
    'Features Used': [
        len(feature_cols),  # OLS is counted as using every feature
        len(feature_cols),  # Ridge uses all
        n_selected,         # non-zero LASSO coefficients
        n_selected_elastic  # non-zero Elastic Net coefficients
    ]
})

print("Model Comparison:")
display(results)

In [None]:
# Side-by-side bar charts: test error (left) and feature count (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# One colour per model, shared by both panels
colors = ['gray', 'blue', 'green', 'orange']

# (results column, y-axis label, panel title) for each panel
panels = [
    ('Test RMSE', 'Test RMSE', 'Model Performance (Lower is Better)'),
    ('Features Used', 'Number of Features', 'Model Complexity (Features Used)'),
]

for ax, (column, y_label, panel_title) in zip(axes, panels):
    ax.bar(results['Model'], results[column], color=colors)
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# 2x2 grid of predicted-vs-actual scatter plots, one panel per model
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

models = [
    ('OLS', ols_pred),
    ('Ridge', ridge_pred),
    ('LASSO', lasso_pred),
    ('Elastic Net', elastic_pred),
]

# Endpoints of the dashed y = x reference line (perfect predictions)
actual_lo, actual_hi = y_test.min(), y_test.max()

for ax, (name, pred) in zip(axes.flat, models):
    ax.scatter(y_test, pred, alpha=0.5)
    ax.plot([actual_lo, actual_hi], [actual_lo, actual_hi], 'r--', lw=2)
    ax.set_xlabel('Actual Demand')
    ax.set_ylabel('Predicted Demand')
    ax.set_title(name)

plt.tight_layout()
plt.show()

---

## Summary

In this notebook, you:

1. **Created features** including some noise features
2. **Applied Ridge regression** and saw all coefficients shrink
3. **Applied LASSO regression** and saw automatic feature selection
4. **Applied Elastic Net** combining both approaches
5. **Compared performance** across all methods

### Key Takeaways

| Method | What it does | When to use |
|--------|--------------|-------------|
| **Ridge** | Shrinks all coefficients | Many small/medium effects |
| **LASSO** | Sets some coefficients to zero | Feature selection needed |
| **Elastic Net** | Combines both | Correlated features + selection |

### Connection to This Afternoon

You now have a **LASSO baseline** to compare against:
- Decision Trees
- Random Forests
- Gradient Boosting

These tree-based methods handle nonlinearities differently but also have their own regularization (max_depth, min_samples_leaf, etc.).

---

**Next:** Tree-Based Methods notebook