# 02 - Forecasting Model Experiments

This notebook develops and evaluates time-series forecasting models for rat complaint prediction.

**Models Evaluated:**
- XGBoost (gradient boosting)
- LSTM (deep learning)
- Prophet (Facebook's time-series library)
- Ensemble (weighted combination)

In [None]:
import sys
from pathlib import Path

sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from src.data_preprocessing import load_master_dataset, get_train_val_test_split
from src.feature_engineering import FeatureEngineer, create_sequences
from src.forecasting_models import (
    XGBoostForecaster, LSTMForecaster, ProphetForecaster,
    EnsembleForecaster, evaluate_model, compare_models
)
from src import config

# Apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet to every figure created below.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load and Prepare Data

In [None]:
# Read the master dataset through the project loader and report its dimensions.
df = load_master_dataset()
print("Dataset shape:", df.shape)

# Preview the first rows (rich display as the cell's output).
df.head()

In [None]:
# Partition the dataset into train / validation / test frames.
# NOTE(review): the split is described as chronological; the date ranges printed
# below let the reader confirm that the three windows do not overlap.
train_df, val_df, test_df = get_train_val_test_split(df)

# One summary line per split: row count and covered date range.
for split_name, split_df in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    first_date = split_df['date'].min()
    last_date = split_df['date'].max()
    print(f"{split_name}: {len(split_df)} ({first_date} to {last_date})")

## 2. Feature Engineering

In [None]:
# Build model features. The engineer is fitted on the training split only and
# then applied unchanged to the validation and test splits.
fe = FeatureEngineer()

train_df_fe, y_train = fe.fit_transform(train_df)
val_df_fe, y_val = fe.transform(val_df)
test_df_fe, y_test = fe.transform(test_df)

# Extract the feature matrix for each engineered frame, in train/val/test order.
X_train, X_val, X_test = (
    fe.get_feature_matrix(engineered)
    for engineered in (train_df_fe, val_df_fe, test_df_fe)
)

print("Features created:", len(fe.all_features))
print("Training shape:", X_train.shape)

## 3. Baseline Model

Simple baseline: predict the previous month's value (the `lag_1` feature, converted back to the original scale).

In [None]:
# Baseline forecast: reuse the lag-1 feature as the prediction.
# The feature is multiplied by the scaler's scale and shifted by its mean to
# return to the original units.
# NOTE(review): indexing scale_[0] / mean_[0] assumes `lag_1` is the first
# column handled by fe.scaler — confirm against FeatureEngineer.
lag1_scaled = test_df_fe['lag_1'].values
lag1_scale = fe.scaler.scale_[0]
lag1_mean = fe.scaler.mean_[0]
baseline_pred = lag1_scaled * lag1_scale + lag1_mean
baseline_metrics = evaluate_model(y_test, baseline_pred)

print("Baseline (Previous Month) Metrics:")
for metric_name in baseline_metrics:
    print(f"  {metric_name}: {baseline_metrics[metric_name]:.4f}")

## 4. XGBoost Model

In [None]:
# Fit the XGBoost forecaster; the validation split is handed to fit() together
# with the training split.
xgb_model = XGBoostForecaster()
xgb_model.fit(X_train, y_train, X_val, y_val)

# Score the fitted model on the held-out test split.
xgb_pred = xgb_model.predict(X_test)
xgb_metrics = evaluate_model(y_test, xgb_pred)

print("XGBoost Metrics:")
for metric_name in xgb_metrics:
    print(f"  {metric_name}: {xgb_metrics[metric_name]:.4f}")

In [None]:
# Rank features by the importance scores reported by the XGBoost forecaster.
# NOTE(review): assumes get_feature_importance() returns one score per entry of
# fe.all_features, in the same order — confirm.
importance = xgb_model.get_feature_importance()
importance_df = pd.DataFrame({'feature': fe.all_features, 'importance': importance})
importance_df = importance_df.sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features, largest at the top.
top_features = importance_df.head(15)
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel('Importance')
ax.set_title('XGBoost Feature Importance (Top 15)')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 5. LSTM Model

In [None]:
# Window length (in rows) of each LSTM input sequence.
seq_len = 12

# Build (sequence, target) arrays for each split. Every split is windowed
# independently of the others.
# NOTE(review): if the data holds several zip codes stacked together, windows
# may span more than one location — confirm how create_sequences handles this.
(X_train_seq, y_train_seq), (X_val_seq, y_val_seq), (X_test_seq, y_test_seq) = (
    create_sequences(features, targets, seq_len)
    for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

print("Sequence shape:", X_train_seq.shape)

In [None]:
# Configure the LSTM with one input per feature column and the window length
# used when building the sequences, then train for 50 epochs.
n_features = X_train.shape[1]
lstm_model = LSTMForecaster(input_size=n_features, sequence_length=seq_len)
lstm_model.fit(X_train_seq, y_train_seq, X_val_seq, y_val_seq, epochs=50)

# Score on the test sequences (targets are y_test_seq, not the full y_test).
lstm_pred = lstm_model.predict(X_test_seq)
lstm_metrics = evaluate_model(y_test_seq, lstm_pred)

print("LSTM Metrics:")
for metric_name in lstm_metrics:
    print(f"  {metric_name}: {lstm_metrics[metric_name]:.4f}")

In [None]:
# Two panels: loss curves per epoch (left) and actual-vs-predicted scatter (right).
fig, (loss_ax, scatter_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: training and validation loss recorded by the forecaster.
history = lstm_model.training_history
for history_key, curve_label in (('train_loss', 'Train'), ('val_loss', 'Validation')):
    loss_ax.plot(history[history_key], label=curve_label)
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('LSTM Training Loss')
loss_ax.legend()

# Right panel: predictions against actuals, with a dashed y = x reference line
# drawn from 0 up to the largest actual value.
actual_max = max(y_test_seq)
scatter_ax.scatter(y_test_seq, lstm_pred, alpha=0.5)
scatter_ax.plot([0, actual_max], [0, actual_max], 'r--')
scatter_ax.set_xlabel('Actual')
scatter_ax.set_ylabel('Predicted')
scatter_ax.set_title('LSTM: Actual vs Predicted')

plt.tight_layout()
plt.show()

## 6. Prophet Model

In [None]:
# Train Prophet on the raw (un-engineered) training frame, grouped by zip_code.
prophet_model = ProphetForecaster()
prophet_model.fit(train_df, date_col='date', target_col='complaint_count', group_col='zip_code')

# Forecast for the raw test frame.
prophet_pred = prophet_model.predict(test_df)

# The targets are trimmed to the forecast length before scoring. A length
# mismatch means the forecasts and y_test may not refer to the same rows, so
# surface it explicitly instead of letting the slice hide it.
# NOTE(review): equal lengths do not guarantee identical row order between
# prophet_pred and y_test — confirm against ProphetForecaster.predict.
if len(prophet_pred) != len(y_test):
    print(
        f"WARNING: Prophet returned {len(prophet_pred)} predictions for "
        f"{len(y_test)} test targets; metrics below may be misaligned."
    )
prophet_metrics = evaluate_model(y_test[:len(prophet_pred)], prophet_pred)

print("Prophet Metrics:")
for metric, value in prophet_metrics.items():
    print(f"  {metric}: {value:.4f}")

## 7. Model Comparison

In [None]:
# Collect every model's metric dictionary into one comparison table.
# NOTE(review): the LSTM metrics were computed against y_test_seq and the
# Prophet metrics against y_test[:len(prophet_pred)], so the rows are not
# necessarily scored on identical samples.
model_metrics = {
    'Baseline': baseline_metrics,
    'XGBoost': xgb_metrics,
    'LSTM': lstm_metrics,
    'Prophet': prophet_metrics,
}
# Table column header -> key in each metrics dictionary.
column_to_key = {'MAE': 'mae', 'RMSE': 'rmse', 'R²': 'r2', 'MAPE': 'mape'}

results = pd.DataFrame({
    'Model': list(model_metrics),
    **{
        column: [scores[key] for scores in model_metrics.values()]
        for column, key in column_to_key.items()
    },
})

print("Model Comparison:")
print(results.to_string(index=False))

In [None]:
# One bar chart per metric, side by side, with one bar per model.
metrics_to_plot = ['MAE', 'RMSE', 'R²']
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

for panel_idx, metric in enumerate(metrics_to_plot):
    panel = axes[panel_idx]
    panel.bar(results['Model'], results[metric])
    panel.set_title(metric)
    panel.set_ylabel(metric)
    # Tilt the model names so they do not overlap.
    plt.setp(panel.xaxis.get_majorticklabels(), rotation=45)

plt.tight_layout()
plt.show()

## 8. Ensemble Model

In [None]:
# Weighted ensemble of the three fitted forecasters; the weights sum to 1.0.
ensemble_members = {'xgboost': xgb_model, 'lstm': lstm_model, 'prophet': prophet_model}
ensemble_weights = {'xgboost': 0.45, 'lstm': 0.35, 'prophet': 0.20}
ensemble = EnsembleForecaster(models=ensemble_members, weights=ensemble_weights)

# The ensemble receives the tabular feature matrix and the raw test frame.
# NOTE(review): the LSTM member was trained on windowed sequences and Prophet
# on the raw frame; how predict() aligns the three outputs with y_test is not
# visible from this notebook — confirm in EnsembleForecaster.
ensemble_pred = ensemble.predict(X_test, test_df)
ensemble_metrics = evaluate_model(y_test, ensemble_pred)

print("Ensemble Metrics:")
for metric_name in ensemble_metrics:
    print(f"  {metric_name}: {ensemble_metrics[metric_name]:.4f}")

## 9. Error Analysis

In [None]:
# Residual diagnostics for the XGBoost forecasts.
# Errors are actual minus predicted, so positive values are under-predictions.
errors = y_test - xgb_pred

fig, axes = plt.subplots(1, 3, figsize=(14, 4))
hist_ax, resid_ax, qq_ax = axes

# Error distribution, with a dashed marker at zero error.
hist_ax.hist(errors, bins=30, edgecolor='black')
hist_ax.axvline(x=0, color='r', linestyle='--')
hist_ax.set_xlabel('Prediction Error')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Error Distribution')

# Residuals against predicted values, with a dashed zero-residual line.
resid_ax.scatter(xgb_pred, errors, alpha=0.5)
resid_ax.axhline(y=0, color='r', linestyle='--')
resid_ax.set_xlabel('Predicted Value')
resid_ax.set_ylabel('Residual')
resid_ax.set_title('Residuals vs Predicted')

# Q-Q plot of the errors against a normal distribution.
# `stats` (scipy.stats) comes from the notebook's top-level import cell.
stats.probplot(errors, dist='norm', plot=qq_ax)
qq_ax.set_title('Q-Q Plot')

plt.tight_layout()
plt.show()

## 10. Save Models

In [None]:
# Persist the fitted models and the feature engineer.
# `Path` (pathlib) comes from the notebook's top-level import cell.
save_dir = Path('../models/forecasting')
save_dir.mkdir(parents=True, exist_ok=True)

# Output file name -> object to save, written in this order.
artifacts = {
    'xgboost.joblib': xgb_model,
    'lstm.pt': lstm_model,
    'prophet.joblib': prophet_model,
    'feature_engineer.joblib': fe,
}
for filename, artifact in artifacts.items():
    artifact.save(str(save_dir / filename))

print("Models saved!")