# Day 2: Regularized Regression (LASSO, Ridge, Elastic Net) - R Version

**WISE Workshop | Addis Ababa, Feb 2026**

In this notebook, you'll apply regularization techniques to prevent overfitting in supply chain demand prediction using the tidymodels framework.

## Setup

::: {.callout-important}
## Google Colab R Runtime
Make sure you're using the R runtime: **Runtime -> Change runtime type -> R**
:::

In [None]:
# Install any missing packages (run once in Colab).
# Every package is checked on its own: a machine that already has tidymodels
# may still lack glmnet, or gridExtra (used later via gridExtra::grid.arrange),
# so testing for tidymodels alone is not enough.
required_pkgs <- c("tidymodels", "tidyverse", "glmnet", "gridExtra")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

# Load packages (library() stops with an error if one is still unavailable)
library(tidymodels)
library(tidyverse)
library(glmnet)

# Settings: fixed seed for reproducibility, minimal theme for every ggplot
set.seed(42)
theme_set(theme_minimal())

cat("Packages loaded!\n")

---

## Part 1: Data Preparation

We'll create a dataset with many features to see regularization in action.

In [None]:
# Simulate daily demand records for health facilities
set.seed(42)
n_rows <- 1000

# Calendar dates plus randomly assigned region and facility type.
# The order of the random draws (regions, facility types, noise) is part of
# what makes the data reproducible, so it must not be rearranged.
dates <- seq(as.Date("2023-01-01"), by = "day", length.out = n_rows)
regions <- sample(
  c("Addis Ababa", "Oromia", "Amhara", "SNNP", "Tigray"),
  size = n_rows, replace = TRUE
)
facility_types <- sample(
  c("Hospital", "Health Center", "Clinic"),
  size = n_rows, replace = TRUE, prob = c(0.2, 0.5, 0.3)
)

# Additive demand components: baseline, facility, region, season, noise
base_demand <- 100
facility_effect <- case_when(
  facility_types == "Hospital" ~ 80,
  facility_types == "Health Center" ~ 30,
  TRUE ~ 0
)
region_effect <- case_when(
  regions == "Addis Ababa" ~ 50,
  regions == "Oromia" ~ 20,
  TRUE ~ 0
)
day_of_year <- as.numeric(format(dates, "%j"))
seasonal_effect <- 25 * sin(2 * pi * day_of_year / 365)
noise <- rnorm(n_rows, mean = 0, sd = 15)

# Sum the components and never let demand fall below 10
demand <- pmax(
  base_demand + facility_effect + region_effect + seasonal_effect + noise,
  10
)

# Assemble the modelling table; demand is truncated to whole units
df <- tibble(
  date = dates,
  region = regions,
  facility_type = facility_types,
  demand = as.integer(demand)
)

cat("Data shape:", nrow(df), "rows x", ncol(df), "columns\n")
head(df)

In [None]:
# Feature Engineering: Create MANY features (some useful, some noise)

# Re-seed so the noise columns are identical every time this cell is run.
# The seed differs from the one used to simulate the data so that the noise
# features do not replay the same random stream.
set.seed(4242)

df <- df %>%
  mutate(
    # Time-based features. lubridate is namespace-qualified because
    # tidyverse releases before 2.0.0 install it but do not attach it.
    month = lubridate::month(date),
    day_of_week = lubridate::wday(date),
    quarter = lubridate::quarter(date),
    day_of_year = lubridate::yday(date),
    week_of_year = lubridate::isoweek(date),
    # With lubridate's default week start, 1 = Sunday and 7 = Saturday
    is_weekend = as.integer(day_of_week %in% c(1, 7)),

    # Cyclical encoding for month (sine/cosine) so December sits next to January
    month_sin = sin(2 * pi * month / 12),
    month_cos = cos(2 * pi * month / 12),

    # Pure-noise features (to see if LASSO eliminates them)
    noise_0 = rnorm(n()),
    noise_1 = rnorm(n()),
    noise_2 = rnorm(n()),
    noise_3 = rnorm(n()),
    noise_4 = rnorm(n())
  )

cat("After feature engineering:", nrow(df), "rows x", ncol(df), "columns\n")
# vapply() guarantees a logical index regardless of the input's shape
cat("\nNumeric features:", names(df)[vapply(df, is.numeric, logical(1))], "\n")

In [None]:
# Hold out a test set before any preprocessing is defined;
# categorical encoding and scaling are handled afterwards by a recipe

# 80% of rows go to training, the remainder to testing
set.seed(42)
data_split <- initial_split(df, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)

cat(sprintf("Training samples: %d \n", nrow(train_data)))
cat(sprintf("Test samples: %d \n", nrow(test_data)))

In [None]:
# Preprocessing recipe: demand is the outcome, every other column a predictor
demand_recipe <- recipe(demand ~ ., data = train_data)
# The raw date is dropped (not a predictor)
demand_recipe <- step_rm(demand_recipe, date)
# Categorical predictors become dummy (indicator) columns
demand_recipe <- step_dummy(demand_recipe, all_nominal_predictors())
# IMPORTANT: penalties act on coefficient size, so predictors are normalized
demand_recipe <- step_normalize(demand_recipe, all_numeric_predictors())

# Estimate the recipe on the training data and return the processed training set
prepped_recipe <- prep(demand_recipe)
train_processed <- bake(prepped_recipe, new_data = NULL)

cat("Number of features after preprocessing:", ncol(train_processed) - 1, "\n")
cat("Feature names:\n")
print(setdiff(names(train_processed), "demand"))

---

## Part 2: Ridge Regression

Ridge regression adds an L2 penalty: lambda * sum(beta_j^2)

This **shrinks** all coefficients toward zero but never sets them exactly to zero.

In tidymodels, we use `linear_reg()` with `mixture = 0` for Ridge.

In [None]:
# Fit Ridge over a whole path of penalty (lambda) values using glmnet directly,
# so the coefficient paths can be visualized
# Prepare data matrices: glmnet needs a numeric matrix and a response vector
X_train <- train_processed %>% select(-demand) %>% as.matrix()
y_train <- train_processed$demand

# Fit Ridge path
ridge_fit <- glmnet(X_train, y_train, alpha = 0)  # alpha = 0 is Ridge

# Visualize coefficient shrinkage with Ridge.
# plot.glmnet draws a second axis along the top of the plot, so the top margin
# is enlarged and the two-line title lifted above that axis to avoid overlap.
old_par <- par(mar = c(5, 4, 6.5, 2) + 0.1)
plot(ridge_fit, xvar = "lambda", label = TRUE)
title(
  "Ridge Regression: Coefficient Paths\n(All coefficients shrink, but none reach zero)",
  line = 3.5
)
par(old_par)  # restore the previous graphics settings

In [None]:
# Tune the Ridge penalty by cross-validation with tidymodels

# Ridge specification: mixture = 0, penalty left open for tuning
ridge_spec <- set_engine(
  linear_reg(penalty = tune(), mixture = 0),
  "glmnet"
)

# Bundle preprocessing and model into one workflow
ridge_workflow <- add_model(
  add_recipe(workflow(), demand_recipe),
  ridge_spec
)

# 5-fold cross-validation on the training data
set.seed(42)
cv_folds <- vfold_cv(train_data, v = 5)

# 50 candidate penalties, log-spaced from 1e-3 to 1e3
penalty_grid <- tibble(penalty = 10^seq(-3, 3, length.out = 50))

# Evaluate every candidate penalty on every fold
ridge_tune <- tune_grid(
  ridge_workflow,
  resamples = cv_folds,
  grid = penalty_grid,
  metrics = metric_set(rmse, rsq)
)

# Top three penalties ranked by cross-validated RMSE
cat("Best Ridge penalty (by RMSE):\n")
show_best(ridge_tune, metric = "rmse", n = 3)

In [None]:
# Refit Ridge at the cross-validated penalty and score it on the test set
best_ridge <- select_best(ridge_tune, metric = "rmse")
cat("Optimal penalty:", best_ridge$penalty, "\n")

# Plug the winning penalty into the workflow, then fit on the training data
final_ridge <- fit(
  finalize_workflow(ridge_workflow, best_ridge),
  data = train_data
)

# Test-set predictions side by side with the observed demand
ridge_pred <- bind_cols(
  predict(final_ridge, test_data),
  select(test_data, demand)
)

ridge_metrics <- metrics(ridge_pred, truth = demand, estimate = .pred)

# Pull the two headline metrics out of the metrics table
ridge_rmse <- ridge_metrics$.estimate[ridge_metrics$.metric == "rmse"]
ridge_rsq <- ridge_metrics$.estimate[ridge_metrics$.metric == "rsq"]

cat("Ridge Test RMSE:", round(ridge_rmse, 2), "\n")
cat("Ridge Test R-squared:", round(ridge_rsq, 3), "\n")

---

## Part 3: LASSO Regression

LASSO adds an L1 penalty: lambda * sum(|beta_j|)

This **shrinks** coefficients AND can set them **exactly to zero** (feature selection!).

In tidymodels, we use `linear_reg()` with `mixture = 1` for LASSO.

In [None]:
# Fit LASSO over a whole path of penalty values for visualization
# (reuses the X_train / y_train matrices built for the Ridge path)
lasso_fit <- glmnet(X_train, y_train, alpha = 1)  # alpha = 1 is LASSO

# Visualize coefficient paths with LASSO.
# plot.glmnet draws a second axis along the top of the plot, so the top margin
# is enlarged and the two-line title lifted above that axis to avoid overlap.
old_par <- par(mar = c(5, 4, 6.5, 2) + 0.1)
plot(lasso_fit, xvar = "lambda", label = TRUE)
title(
  "LASSO Regression: Coefficient Paths\n(Watch coefficients go to EXACTLY zero!)",
  line = 3.5
)
par(old_par)  # restore the previous graphics settings

In [None]:
# Tune the LASSO penalty by cross-validation with tidymodels

# LASSO specification: mixture = 1, penalty left open for tuning
lasso_spec <- set_engine(
  linear_reg(penalty = tune(), mixture = 1),
  "glmnet"
)

# Bundle preprocessing and model into one workflow
lasso_workflow <- add_model(
  add_recipe(workflow(), demand_recipe),
  lasso_spec
)

# Reuse the Ridge folds and penalty grid so the two models are comparable
lasso_tune <- tune_grid(
  lasso_workflow,
  resamples = cv_folds,
  grid = penalty_grid,
  metrics = metric_set(rmse, rsq)
)

# Top three penalties ranked by cross-validated RMSE
cat("Best LASSO penalty (by RMSE):\n")
show_best(lasso_tune, metric = "rmse", n = 3)

In [None]:
# Refit LASSO at the cross-validated penalty and score it on the test set
best_lasso <- select_best(lasso_tune, metric = "rmse")
cat("Optimal penalty:", best_lasso$penalty, "\n")

# Plug the winning penalty into the workflow, then fit on the training data
final_lasso <- fit(
  finalize_workflow(lasso_workflow, best_lasso),
  data = train_data
)

# Test-set predictions side by side with the observed demand
lasso_pred <- bind_cols(
  predict(final_lasso, test_data),
  select(test_data, demand)
)

lasso_metrics <- metrics(lasso_pred, truth = demand, estimate = .pred)

# Pull the two headline metrics out of the metrics table
lasso_rmse <- lasso_metrics$.estimate[lasso_metrics$.metric == "rmse"]
lasso_rsq <- lasso_metrics$.estimate[lasso_metrics$.metric == "rsq"]

cat("LASSO Test RMSE:", round(lasso_rmse, 2), "\n")
cat("LASSO Test R-squared:", round(lasso_rsq, 3), "\n")

In [None]:
# Inspect the fitted LASSO coefficients: non-zero means the feature was kept
lasso_coefs <- tidy(extract_fit_parsnip(final_lasso))

# The intercept is not a feature, so leave it out of the counts
lasso_coefs_no_int <- filter(lasso_coefs, term != "(Intercept)")
is_nonzero <- lasso_coefs_no_int$estimate != 0
n_selected <- sum(is_nonzero)
n_eliminated <- sum(!is_nonzero)

cat("Features SELECTED by LASSO:", n_selected, "\n")
cat("Features ELIMINATED by LASSO:", n_eliminated, "\n")
cat("\nSelected features:\n")

# Surviving features, largest absolute coefficient first
print(
  arrange(
    filter(lasso_coefs_no_int, estimate != 0),
    desc(abs(estimate))
  )
)

In [None]:
# Visualize LASSO feature selection
# The fill is mapped to an explicit "Selected"/"Eliminated" status and the
# colours are matched by name. Supplying two positional legend labels instead
# breaks the plot whenever LASSO keeps every feature or drops every feature,
# because only one category is then present in the data.
lasso_coefs_plot <- lasso_coefs_no_int %>%
  mutate(
    selected = estimate != 0,
    status = factor(
      if_else(selected, "Selected", "Eliminated"),
      levels = c("Eliminated", "Selected")
    ),
    # Order the bars by coefficient magnitude
    term = fct_reorder(term, abs(estimate))
  )

ggplot(lasso_coefs_plot, aes(x = estimate, y = term, fill = status)) +
  geom_col() +
  geom_vline(xintercept = 0, color = "black", linewidth = 0.5) +
  scale_fill_manual(
    values = c("Selected" = "forestgreen", "Eliminated" = "gray70")
  ) +
  labs(
    x = "Coefficient Value",
    y = "Feature",
    title = "LASSO Feature Selection",
    subtitle = "Green = Selected, Gray = Eliminated",
    fill = ""
  ) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))

### Discussion

Look at which features LASSO selected:
- Did it keep the region and facility features (which have real effects)?
- Did it eliminate the noise features?
- What about the cyclical month encoding?

---

## Part 4: Elastic Net

Elastic Net combines L1 and L2 penalties:

Loss = MSE + lambda1 * sum(|beta_j|) + lambda2 * sum(beta_j^2)

This gives you the best of both worlds: feature selection (L1) + grouped selection of correlated features (L2).

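In tidymodels, `penalty` sets the overall strength (lambda) and `mixture` controls the blend: 0 = Ridge, 1 = LASSO, 0.5 = balanced mix. In terms of the formula above, lambda1 = penalty * mixture and lambda2 = penalty * (1 - mixture) / 2 (glmnet's parameterization, which also halves the MSE term).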

In [None]:
# Tune Elastic Net by cross-validation over both penalty and mixture

# Elastic Net specification: both hyperparameters left open for tuning
elastic_spec <- set_engine(
  linear_reg(penalty = tune(), mixture = tune()),
  "glmnet"
)

# Bundle preprocessing and model into one workflow
elastic_workflow <- add_model(
  add_recipe(workflow(), demand_recipe),
  elastic_spec
)

# Regular grid: 30 penalty levels crossed with 6 mixture levels
elastic_grid <- grid_regular(
  penalty(range = c(-3, 1)),
  mixture(range = c(0.1, 1)),
  levels = c(30, 6)
)

# Evaluate every grid point on the shared cross-validation folds
elastic_tune <- tune_grid(
  elastic_workflow,
  resamples = cv_folds,
  grid = elastic_grid,
  metrics = metric_set(rmse, rsq)
)

# Top three parameter combinations ranked by cross-validated RMSE
cat("Best Elastic Net parameters (by RMSE):\n")
show_best(elastic_tune, metric = "rmse", n = 3)

In [None]:
# Refit Elastic Net at the cross-validated parameters and score it on the test set
best_elastic <- select_best(elastic_tune, metric = "rmse")
cat("Optimal penalty:", best_elastic$penalty, "\n")
cat("Optimal mixture:", best_elastic$mixture, "\n")

# Plug the winning parameters into the workflow, then fit on the training data
final_elastic <- fit(
  finalize_workflow(elastic_workflow, best_elastic),
  data = train_data
)

# Test-set predictions side by side with the observed demand
elastic_pred <- bind_cols(
  predict(final_elastic, test_data),
  select(test_data, demand)
)

elastic_metrics <- metrics(elastic_pred, truth = demand, estimate = .pred)

# Pull the two headline metrics out of the metrics table
elastic_rmse <- elastic_metrics$.estimate[elastic_metrics$.metric == "rmse"]
elastic_rsq <- elastic_metrics$.estimate[elastic_metrics$.metric == "rsq"]

cat("Elastic Net Test RMSE:", round(elastic_rmse, 2), "\n")
cat("Elastic Net Test R-squared:", round(elastic_rsq, 3), "\n")

# Coefficient table without the intercept; non-zero rows are the kept features
elastic_coefs <- filter(
  tidy(extract_fit_parsnip(final_elastic)),
  term != "(Intercept)"
)

n_selected_elastic <- sum(elastic_coefs$estimate != 0)
cat("Features selected by Elastic Net:", n_selected_elastic, "\n")

---

## Part 5: Model Comparison

In [None]:
# Unpenalized least-squares baseline, built with the same recipe as the
# regularized models
ols_spec <- set_engine(linear_reg(), "lm")

ols_workflow <- add_model(
  add_recipe(workflow(), demand_recipe),
  ols_spec
)

ols_fit <- fit(ols_workflow, data = train_data)

# Test-set predictions side by side with the observed demand
ols_pred <- bind_cols(
  predict(ols_fit, test_data),
  select(test_data, demand)
)

ols_metrics <- metrics(ols_pred, truth = demand, estimate = .pred)

# Pull the two headline metrics out of the metrics table
ols_rmse <- ols_metrics$.estimate[ols_metrics$.metric == "rmse"]
ols_rsq <- ols_metrics$.estimate[ols_metrics$.metric == "rsq"]

In [None]:
# One row per model: test error, test R-squared, and number of features used
total_features <- ncol(train_processed) - 1  # every column except the outcome

model_names <- c('OLS (Baseline)', 'Ridge', 'LASSO', 'Elastic Net')
rmse_values <- c(ols_rmse, ridge_rmse, lasso_rmse, elastic_rmse)
rsq_values <- c(ols_rsq, ridge_rsq, lasso_rsq, elastic_rsq)
# OLS and Ridge are counted as using every feature
feature_counts <- c(
  total_features, total_features, n_selected, n_selected_elastic
)

results <- tibble(
  Model = model_names,
  Test_RMSE = round(rmse_values, 2),
  Test_Rsq = round(rsq_values, 3),
  Features_Used = feature_counts
)

cat("Model Comparison:\n")
print(results)

In [None]:
# Visualize comparison
# Model is turned into a factor in table order and the colours are matched by
# name. With a plain character column ggplot2 sorts the models alphabetically,
# which reorders the bars and hands the unnamed colours to the wrong models.
plot_data <- results %>%
  mutate(Model = factor(Model, levels = unique(Model)))
model_colors <- setNames(
  c("gray60", "steelblue", "forestgreen", "coral"),
  levels(plot_data$Model)
)

# Theme tweaks shared by both panels
bar_theme <- theme(
  axis.text.x = element_text(angle = 45, hjust = 1),
  legend.position = "none"
)

p1 <- ggplot(plot_data, aes(x = Model, y = Test_RMSE, fill = Model)) +
  geom_col() +
  scale_fill_manual(values = model_colors) +
  labs(title = "Model Performance (Lower is Better)", y = "Test RMSE") +
  theme_minimal() +
  bar_theme

p2 <- ggplot(plot_data, aes(x = Model, y = Features_Used, fill = Model)) +
  geom_col() +
  scale_fill_manual(values = model_colors) +
  labs(title = "Model Complexity (Features Used)", y = "Number of Features") +
  theme_minimal() +
  bar_theme

# Display plots side by side; if gridExtra is not installed, show them one
# after the other instead of failing
if (requireNamespace("gridExtra", quietly = TRUE)) {
  gridExtra::grid.arrange(p1, p2, ncol = 2)
} else {
  print(p1)
  print(p2)
}

In [None]:
# Predicted vs. actual demand, one panel per model
# Stack the four prediction tables, tagging each row with its model name
all_preds <- bind_rows(
  list(
    "OLS" = ols_pred,
    "Ridge" = ridge_pred,
    "LASSO" = lasso_pred,
    "Elastic Net" = elastic_pred
  ),
  .id = "Model"
) %>%
  relocate(Model, .after = last_col())

# The dashed red diagonal marks perfect predictions
pred_plot <- ggplot(all_preds, aes(x = demand, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(
    slope = 1, intercept = 0,
    color = "red", linetype = "dashed", linewidth = 1
  ) +
  facet_wrap(~Model, nrow = 2)

pred_plot +
  labs(
    x = "Actual Demand",
    y = "Predicted Demand",
    title = "Predictions vs Actual by Model"
  ) +
  theme_minimal()

---

## Summary

In this notebook, you:

1. **Created features** including some noise features
2. **Applied Ridge regression** and saw all coefficients shrink
3. **Applied LASSO regression** and saw automatic feature selection
4. **Applied Elastic Net** combining both approaches
5. **Compared performance** across all methods

### Key Takeaways

| Method | What it does | When to use | tidymodels mixture |
|--------|--------------|-------------|--------------------|
| **Ridge** | Shrinks all coefficients | Many small/medium effects | `mixture = 0` |
| **LASSO** | Sets some coefficients to zero | Feature selection needed | `mixture = 1` |
| **Elastic Net** | Combines both | Correlated features + selection | `0 < mixture < 1` (e.g. `0.5`) |

### Connection to This Afternoon

You now have a **LASSO baseline** to compare against:
- Decision Trees
- Random Forests (using `rand_forest()` with `ranger` engine)
- Gradient Boosting (using `boost_tree()` with `xgboost` engine)

These tree-based methods handle nonlinearities differently but also have their own regularization (`tree_depth`, `min_n`, etc. in tidymodels).

---

**Next:** Tree-Based Methods notebook