## Setup

In [None]:
# Add src to path (for development)
# Makes the repository's `src/` directory importable when the notebook is
# launched from a sibling folder of `src/` (the parent of the current working
# directory is used as the repository root).
import sys
from pathlib import Path

src_path = Path.cwd().parent / 'src'
# Only insert once so re-running the cell does not grow sys.path.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
print(f"✓ Path configured: {src_path}")

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

# Install a global "ignore" filter so tutorial output stays compact.
# NOTE(review): this hides every warning category (including deprecation
# warnings) for the rest of the session — narrow the filter when debugging.
warnings.filterwarnings('ignore')

# Import autotsforecast components
from autotsforecast import (
    AutoForecaster,
    VARForecaster,
    LinearForecaster,
    MovingAverageForecaster,
    RandomForestForecaster,
    XGBoostForecaster
)
from autotsforecast.hierarchical import HierarchicalReconciler
from autotsforecast.interpretability import DriverAnalyzer

print("✓ All imports successful!")

## Part 1: Basic Forecasting with Single Models

Let's start with forecasting sales for 3 regions using different models.

In [None]:
# Generate synthetic daily sales data for 3 regions.
# `n_periods` is the single source of truth for the series length: the date
# index, the time axis and every noise vector are sized from it.
n_periods = 200
np.random.seed(42)  # legacy global seed so the noise below is reproducible
dates = pd.date_range('2023-01-01', periods=n_periods, freq='D')

# Create trend + seasonality + noise
t = np.arange(n_periods)                        # day counter 0..n_periods-1
trend = t * 0.5                                 # linear growth
seasonality = 20 * np.sin(2 * np.pi * t / 30)   # 30-day sine cycle

# Noise is drawn in North -> South -> East order; keeping that order is what
# makes the values identical from run to run for a given seed.
data = pd.DataFrame({
    'North': 100 + trend + seasonality + np.random.normal(0, 10, n_periods),
    'South': 120 + trend * 1.2 + seasonality + np.random.normal(0, 12, n_periods),
    'East': 80 + trend * 0.8 + seasonality * 0.8 + np.random.normal(0, 8, n_periods)
}, index=dates)

print(f"Data shape: {data.shape}")
print(data.head())

# Line chart of the three regional series on a single shared axis
fig, ax = plt.subplots(figsize=(12, 4))
data.plot(ax=ax)
ax.set_title('Sales by Region')
ax.set_ylabel('Sales')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Chronological hold-out: the first `train_size` rows are used for fitting,
# everything after them is kept aside for evaluation.
train_size = 150
train_data, test_data = data.iloc[:train_size], data.iloc[train_size:]

for split_label, split_frame in (("Train", train_data), ("Test", test_data)):
    print(f"{split_label}: {len(split_frame)} samples")

### 1.1 VAR (Vector AutoRegression) Model

VAR models capture dependencies between multiple time series.

In [None]:
# Create and fit VAR model
# The horizon is derived from the hold-out set instead of being hard-coded,
# so changing `train_size` cannot leave the forecast length out of step with
# `test_data` (later cells plot and score the two against each other).
# NOTE(review): presumably predict() returns `horizon` rows — confirm.
var_model = VARForecaster(lags=7, horizon=len(test_data))
var_model.fit(train_data)

# Generate forecasts
var_forecasts = var_model.predict()

print(f"VAR Forecast shape: {var_forecasts.shape}")
print(var_forecasts.head())

### 1.2 Random Forest Model

Random Forest uses ensemble learning with lag features.

In [None]:
# Create and fit Random Forest model
# The horizon is derived from the hold-out set instead of being hard-coded,
# so changing `train_size` cannot leave the forecast length out of step with
# `test_data` (later cells plot and score the two against each other).
# NOTE(review): presumably predict() returns `horizon` rows — confirm.
rf_model = RandomForestForecaster(n_lags=7, horizon=len(test_data), n_estimators=100)
rf_model.fit(train_data)

# Generate forecasts
rf_forecasts = rf_model.predict()

print(f"Random Forest Forecast shape: {rf_forecasts.shape}")
print(rf_forecasts.head())

### 1.3 Compare Models Visually

In [None]:
# North region: last 30 training points, held-out actuals and both forecasts
fig, ax = plt.subplots(figsize=(14, 5))

recent_history = train_data['North'].iloc[-30:]
ax.plot(recent_history.index, recent_history, 'o-', label='Historical', linewidth=2)
ax.plot(test_data.index, test_data['North'], 'o-', label='Actual', linewidth=2, color='green')

# Both forecasts are drawn against the test index, VAR first, then RF
for forecast_frame, series_label in ((var_forecasts, 'VAR Forecast'), (rf_forecasts, 'RF Forecast')):
    ax.plot(test_data.index, forecast_frame['North'], '--', label=series_label, linewidth=2)

# Vertical marker at the last training timestamp
ax.axvline(x=train_data.index[-1], color='red', linestyle='--', alpha=0.5, label='Train/Test Split')
ax.set_title('Sales Forecasts: North Region')
ax.set_ylabel('Sales')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Root-mean-squared error of each model on the held-out North series.
# Raw arrays are compared so the result does not depend on index alignment.
north_actual = test_data['North'].values


def _north_rmse(forecast_frame):
    """Return the RMSE between a forecast frame's 'North' column and the held-out actuals."""
    residuals = forecast_frame['North'].values - north_actual
    return np.sqrt(np.mean(residuals ** 2))


var_rmse = _north_rmse(var_forecasts)
rf_rmse = _north_rmse(rf_forecasts)

print(f"\nVAR Model RMSE: {var_rmse:.2f}")
print(f"Random Forest RMSE: {rf_rmse:.2f}")

## Part 2: AutoForecaster - Automatic Model Selection

Instead of manually trying models, let **AutoForecaster** automatically select the best one using backtesting.

In [None]:
# Define candidate models to evaluate
# Every candidate forecasts the same number of steps; deriving it once from
# the hold-out set keeps the eight constructors in sync with `test_data`.
forecast_horizon = len(test_data)
candidates = [
    MovingAverageForecaster(window=5, horizon=forecast_horizon),
    MovingAverageForecaster(window=7, horizon=forecast_horizon),
    VARForecaster(lags=3, horizon=forecast_horizon),
    VARForecaster(lags=5, horizon=forecast_horizon),
    VARForecaster(lags=7, horizon=forecast_horizon),
    LinearForecaster(n_lags=5, horizon=forecast_horizon),
    RandomForestForecaster(n_lags=7, horizon=forecast_horizon, n_estimators=50),
    XGBoostForecaster(n_lags=7, horizon=forecast_horizon)
]

# Create AutoForecaster
# NOTE(review): n_splits / test_size presumably configure the backtesting
# folds used to rank candidates by `metric` — confirm against the library docs.
auto = AutoForecaster(
    candidate_models=candidates,
    metric='rmse',
    n_splits=3,
    test_size=10,
    verbose=True
)

# Fit and select best model
auto.fit(train_data)

# Generate forecasts with best model
auto_forecasts = auto.forecast()

print(f"\nBest Model: {auto.best_model_name_}")
print(f"Forecast shape: {auto_forecasts.shape}")
print(auto_forecasts.head())

In [None]:
# Get detailed performance summary
# NOTE(review): the concatenation below requires get_summary() to return a
# str — confirm against the AutoForecaster API.
summary = auto.get_summary()
print("\n" + summary)  # leading newline separates the report from earlier output

In [None]:
# Visualize backtesting results
# Relies on `auto` having been fitted in the model-selection cell above.
# NOTE(review): presumably renders a matplotlib figure comparing the
# candidates' backtest scores — confirm against the library docs.
auto.plot_backtesting_results()

## Part 3: Using Covariates (External Variables)

Improve forecasts by including external factors like promotions, holidays, or weather.

In [None]:
# Generate synthetic covariates (e.g., marketing spend and temperature)
# Noise vectors are sized from the shared date index rather than a literal,
# so they always match `t` and `dates`.
# NOTE: no seed is set here — the draws continue from whatever state the
# global NumPy generator is in when this cell runs.
n_obs = len(dates)
covariates = pd.DataFrame({
    'marketing_spend': 500 + 200 * np.sin(2 * np.pi * t / 60) + np.random.normal(0, 50, n_obs),
    'temperature': 20 + 10 * np.sin(2 * np.pi * t / 365) + np.random.normal(0, 3, n_obs)
}, index=dates)

# Inject the covariate effects into the sales series
sales_with_covariates = pd.DataFrame({
    'North': data['North'] * (1 + covariates['marketing_spend'] / 5000),  # Marketing impact
    'South': data['South'] * (1 + covariates['temperature'] / 100),       # Temperature impact
    'East': data['East']                                                  # Unaffected control series
})

# Split targets and covariates at the same row so they stay aligned
train_y = sales_with_covariates.iloc[:train_size]
test_y = sales_with_covariates.iloc[train_size:]
train_X = covariates.iloc[:train_size]
test_X = covariates.iloc[train_size:]

print(f"Target shape: {train_y.shape}")
print(f"Covariates shape: {train_X.shape}")
print("\nCovariates preview:")
print(train_X.head())

In [None]:
# Both models forecast exactly as many steps as there are held-out rows, so
# the prediction arrays can be compared element-wise with `test_y`.
cov_horizon = len(test_y)

# Train model WITHOUT covariates
model_no_cov = RandomForestForecaster(n_lags=7, horizon=cov_horizon, n_estimators=100)
model_no_cov.fit(train_y)
pred_no_cov = model_no_cov.predict()

# Train model WITH covariates
# Future covariate values for the forecast window are passed to predict().
model_with_cov = RandomForestForecaster(n_lags=7, horizon=cov_horizon, n_estimators=100)
model_with_cov.fit(train_y, train_X)
pred_with_cov = model_with_cov.predict(test_X)

# Compare performance (RMSE pooled over all three regions)
rmse_no_cov = np.sqrt(np.mean((pred_no_cov.values - test_y.values)**2))
rmse_with_cov = np.sqrt(np.mean((pred_with_cov.values - test_y.values)**2))

print(f"RMSE without covariates: {rmse_no_cov:.2f}")
print(f"RMSE with covariates: {rmse_with_cov:.2f}")
print(f"Improvement: {((rmse_no_cov - rmse_with_cov) / rmse_no_cov * 100):.1f}%")

## Part 4: Hierarchical Reconciliation

Ensure forecasts are coherent across aggregation levels (e.g., Total = North + South + East).

In [None]:
# Define hierarchy structure
# Total -> [North, South, East]
hierarchy = {
    'Total': ['North', 'South', 'East']
}
region_cols = hierarchy['Total']

# Generate base forecasts.
# Only the child columns named in the hierarchy are summed, so any extra
# column in `auto_forecasts` cannot leak into the total.
# NOTE: because 'Total' is built by summing the regions, these base forecasts
# are coherent by construction and the difference printed below is 0. An
# independently forecast 'Total' would be needed to show real incoherence.
base_forecasts = auto_forecasts.copy()
base_forecasts['Total'] = base_forecasts[region_cols].sum(axis=1)

# First-row figures used for the coherence printout
first_region_sum = base_forecasts[region_cols].sum(axis=1).iloc[0]
first_total = base_forecasts['Total'].iloc[0]

print("Base Forecasts (potentially incoherent):")
print(base_forecasts.head())
print(f"\nSum of regions: {first_region_sum:.2f}")
print(f"Total forecast: {first_total:.2f}")
print(f"Difference: {abs(first_total - first_region_sum):.2f}")

In [None]:
# Build one reconciler for the hierarchy and apply two methods to the same
# base forecasts.
reconciler = HierarchicalReconciler(hierarchy=hierarchy)

# Method 1: Bottom-up (aggregate from lowest level)
reconciled_bu = reconciler.reconcile(base_forecasts, method='bottom_up')
print("Bottom-Up Reconciliation:")
print(reconciled_bu.head())

# Method 2: MinTrace with shrinkage (optimal)
reconciled_mint = reconciler.reconcile(base_forecasts, method='mint_shrink')
print("\nMinTrace Reconciliation:")
print(reconciled_mint.head())

# Verify coherence on the first forecast row of the bottom-up result
bu_region_sum = reconciled_bu[['North', 'South', 'East']].sum(axis=1).iloc[0]
bu_total = reconciled_bu['Total'].iloc[0]

print("\nCoherence check (Bottom-Up):")
print(f"Sum of regions: {bu_region_sum:.2f}")
print(f"Total forecast: {bu_total:.2f}")
print(f"Difference: {abs(bu_total - bu_region_sum):.10f}")

## Part 5: Model Interpretability with SHAP

Understand which features drive your forecasts using SHAP (SHapley Additive exPlanations).

In [None]:
# Wrap the covariate-aware Random Forest in a DriverAnalyzer
interpreter = DriverAnalyzer(model_with_cov)

# The feature matrix is rebuilt by hand below.
# NOTE(review): this assumes the model's internal features are the lagged
# targets followed by the preprocessed covariates — confirm.
from autotsforecast.utils.data import CovariatePreprocessor

# Preprocess the training covariates
preprocessor = CovariatePreprocessor()
X_processed = preprocessor.fit_transform(train_X)

# One shifted copy of the targets per lag, columns named "<region>_lag<k>"
n_lags = 7
features = [
    train_y.shift(lag).add_suffix(f"_lag{lag}")
    for lag in range(1, n_lags + 1)
]
features.append(X_processed)

# Shifting leaves NaNs in the leading rows; drop any row with a missing value
X_features = pd.concat(features, axis=1).dropna()

print(f"Feature matrix shape: {X_features.shape}")
print(f"Features: {list(X_features.columns)}")

In [None]:
# Calculate SHAP values
# Start from a defined "not computed" state so later cells can test for None
# instead of hitting a NameError when this cell fails.
shap_values = None
try:
    shap_values = interpreter.calculate_shap_values(
        X=X_features,
        max_samples=100  # Use subset for speed
    )

    print("✓ SHAP values calculated successfully!")
    print(f"Number of outputs: {len(shap_values)}")

    # Get feature importance
    importance = interpreter.get_shap_feature_importance(shap_values)
    print("\nTop 10 Most Important Features:")
    print(importance.head(10))

except ImportError as e:
    # Missing optional dependency: this is the case the install hint is for
    print(f"SHAP calculation not available: {e}")
    print("Make sure SHAP is installed: pip install shap")
except Exception as e:
    # Any other failure is reported with its type so it is not mistaken for
    # a missing package; the tutorial still continues past this cell.
    print(f"SHAP calculation failed ({type(e).__name__}): {e}")
    print("If the error mentions a missing module, install SHAP: pip install shap")

In [None]:
# Plot SHAP summary for North region
# `shap_values` is missing (or None) when the SHAP calculation cell did not
# succeed; check for that explicitly so the reader gets a clear message.
computed_shap = globals().get('shap_values')
if computed_shap is None:
    print("Plotting not available: SHAP values have not been computed "
          "(run the SHAP calculation cell first)")
else:
    try:
        interpreter.plot_shap_summary(
            X=X_features,
            shap_values_dict=computed_shap,
            target_name='North',
            plot_type='bar'
        )
    except Exception as e:
        # Best-effort: a plotting failure should not stop the tutorial
        print(f"Plotting not available: {e}")

## Summary

This tutorial covered the complete **autotsforecast** workflow:

### ✅ What You Learned

1. **Basic Forecasting**: Used VAR, Random Forest, and other models
2. **AutoForecaster**: Automated model selection with 8 candidate models
3. **Covariates**: Improved accuracy by including external variables
4. **Hierarchical Reconciliation**: Ensured forecast coherence across levels
5. **SHAP Interpretability**: Understood feature importance in predictions

### 📚 Key Features

- **Multiple Models**: VAR, Linear, Moving Average, Random Forest, XGBoost
- **Automatic Selection**: Backtesting-based model comparison
- **Covariate Support**: Include external variables for better forecasts
- **Hierarchical Methods**: Bottom-up, Top-down, MinTrace (OLS, WLS, Shrinkage)
- **Interpretability**: SHAP values for tree-based and linear models

### 🚀 Next Steps

- Try with your own data
- Experiment with different model parameters
- Use custom hierarchies for your business structure
- Install via pip (coming soon): `pip install autotsforecast`