# Day 2: Demand Forecasting with Machine Learning

**WISE Workshop | Addis Ababa, Feb 2026**

In this notebook, you'll build and compare your first machine learning models (linear regression, random forest, and gradient boosting) to forecast demand in the pharmaceutical supply chain.

## Setup

In [None]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
# Ignore every warning category for the rest of the kernel session
# (presumably to keep the workshop output uncluttered).
# NOTE(review): this also hides deprecation and data-quality warnings from
# pandas / scikit-learn - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Simple confirmation that the setup cell ran to completion
print("Packages loaded!")

## Part 1: Load and Prepare Data

In [None]:
# Build a synthetic demand table (placeholder until the real data URL is wired in)
np.random.seed(42)
n_rows = 1000

# One row per calendar day, each tagged with a randomly drawn region and facility type
dates = pd.date_range('2023-01-01', periods=n_rows, freq='D')
region_names = ['Addis Ababa', 'Oromia', 'Amhara', 'SNNP', 'Tigray']
regions = np.random.choice(region_names, n_rows)
facility_names = ['Hospital', 'Health Center', 'Clinic']
facility_types = np.random.choice(facility_names, n_rows, p=[0.2, 0.5, 0.3])

# Demand = baseline + facility uplift + yearly sine wave + Gaussian noise
base_demand = 100
facility_effect = np.select(
    [facility_types == 'Hospital', facility_types == 'Health Center'],
    [100, 50],
    default=0,  # clinics get no uplift
)
year_angle = 2 * np.pi * dates.dayofyear / 365
seasonal_effect = 20 * np.sin(year_angle)
noise = np.random.normal(0, 15, n_rows)

# Floor at 10 so demand is always positive
demand = np.maximum(base_demand + facility_effect + seasonal_effect + noise, 10)

sample_columns = {
    'date': dates,
    'region': regions,
    'facility_type': facility_types,
    'demand': demand.astype(int),  # truncates to whole units
}
df = pd.DataFrame(sample_columns)

print(f"Data shape: {df.shape}")
df.head()

## Part 2: Feature Engineering

In [None]:
# Calendar features derived from the date column
date_parts = df['date'].dt
df['month'] = date_parts.month
df['day_of_week'] = date_parts.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df['quarter'] = date_parts.quarter
df['day_of_year'] = date_parts.dayofyear

# Integer codes for the two categorical columns; the fitted encoders stay
# available as le_region / le_facility.
# NOTE(review): the codes are plain integers, so models will read them as
# ordered numbers even though the categories are nominal - confirm this is intended.
le_region, le_facility = LabelEncoder(), LabelEncoder()
df['region_encoded'] = le_region.fit_transform(df['region'])
df['facility_encoded'] = le_facility.fit_transform(df['facility_type'])

print("Features created:")
df.head()

In [None]:
# Model inputs (calendar features + encoded categoricals) and the column to predict
feature_cols = [
    'month',
    'day_of_week',
    'quarter',
    'region_encoded',
    'facility_encoded',
]
target_col = 'demand'

X, y = df[feature_cols], df[target_col]

print(f"Features: {feature_cols}")
print(f"X shape: {X.shape}, y shape: {y.shape}")

## Part 3: Train/Test Split

In [None]:
# Split data (80% train, 20% test) chronologically.
# A forecast is judged on periods that come after everything the model was
# trained on, so the most recent 20% of rows (by date) become the test set
# instead of a random sample that would interleave past and future days.
date_order = np.argsort(df['date'].to_numpy(), kind='stable')  # row positions, oldest first

X_train, X_test, y_train, y_test = train_test_split(
    X.iloc[date_order], y.iloc[date_order],
    test_size=0.2,
    shuffle=False,  # keep the time ordering: the last rows form the test set
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
# First date of the hold-out period, so readers can see where the cut falls
print(f"Test period starts: {df['date'].iloc[date_order[len(X_train)]]}")

## Part 4: Train Models

In [None]:
# Model 1: Linear Regression (Baseline)
lr_model = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator itself
lr_pred = lr_model.predict(X_test)

# Score on the held-out test set
lr_mse = mean_squared_error(y_test, lr_pred)
lr_rmse, lr_r2 = np.sqrt(lr_mse), r2_score(y_test, lr_pred)

print(
    "Linear Regression:",
    f"  RMSE: {lr_rmse:.2f}",
    f"  R²: {lr_r2:.3f}",
    sep="\n",
)

In [None]:
# Model 2: Random Forest
# 100 trees, fixed seed for repeatable results, n_jobs=-1 to use all CPU cores
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1).fit(
    X_train, y_train
)
rf_pred = rf_model.predict(X_test)

# Score on the held-out test set
rf_mse = mean_squared_error(y_test, rf_pred)
rf_rmse, rf_r2 = np.sqrt(rf_mse), r2_score(y_test, rf_pred)

print(
    "Random Forest:",
    f"  RMSE: {rf_rmse:.2f}",
    f"  R²: {rf_r2:.3f}",
    sep="\n",
)

In [None]:
# Model 3: Gradient Boosting
# 100 boosting stages with a fixed seed for repeatable results
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)
gb_pred = gb_model.predict(X_test)

# Score on the held-out test set
gb_mse = mean_squared_error(y_test, gb_pred)
gb_rmse, gb_r2 = np.sqrt(gb_mse), r2_score(y_test, gb_pred)

print(
    "Gradient Boosting:",
    f"  RMSE: {gb_rmse:.2f}",
    f"  R²: {gb_r2:.3f}",
    sep="\n",
)

## Part 5: Compare Models

In [None]:
# Summary table: one row per model, built from (name, RMSE, R²) records
score_rows = [
    ('Linear Regression', lr_rmse, lr_r2),
    ('Random Forest', rf_rmse, rf_r2),
    ('Gradient Boosting', gb_rmse, gb_r2),
]
results = pd.DataFrame(score_rows, columns=['Model', 'RMSE', 'R²'])

print("Model Comparison:")
display(results)

In [None]:
# Visualize predictions vs actual: one scatter panel per model
predictions_by_model = {
    'Linear Regression': lr_pred,
    'Random Forest': rf_pred,
    'Gradient Boosting': gb_pred,
}
actual_min, actual_max = y_test.min(), y_test.max()

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (name, pred) in zip(axes, predictions_by_model.items()):
    ax.scatter(y_test, pred, alpha=0.5)
    # Dashed red diagonal marks a perfect prediction (predicted == actual)
    ax.plot([actual_min, actual_max], [actual_min, actual_max], 'r--', lw=2)
    ax.set(xlabel='Actual Demand', ylabel='Predicted Demand', title=name)

plt.tight_layout()
plt.show()

## Part 6: Feature Importance

In [None]:
# Feature importance from Random Forest, largest first
importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': rf_model.feature_importances_}
)
importance = importance.sort_values(by='importance', ascending=False)

# Horizontal bars: one per feature, in the sorted order above
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=importance, x='importance', y='feature', ax=ax)
ax.set(xlabel='Importance', ylabel='Feature', title='Random Forest Feature Importance')
fig.tight_layout()
plt.show()

## Summary

In this notebook, you:
- Created features from raw data
- Trained three different ML models
- Compared model performance
- Analyzed feature importance

**Next:** Continue with the Model Tuning notebook, where you will optimize the best-performing model.