From d55224069ce0dc622c5d7806f1c366217c73aa1c Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Sun, 1 Feb 2026 03:20:52 +0000 Subject: [PATCH 01/11] docs: update INITIAL-6.md --- INITIAL-6.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/INITIAL-6.md b/INITIAL-6.md index 665be982..db5bb727 100644 --- a/INITIAL-6.md +++ b/INITIAL-6.md @@ -10,6 +10,16 @@ - sMAPE - (optional) pinball loss later - Persist split boundaries and evaluation artifacts. +- Advanced Time-Series Splitting: + - Support for 'Expanding' and 'Sliding' windows. + - Integration of a 'Gap' parameter to simulate operational data latency. +- Comprehensive Metric Suite: + - Accuracy: MAE, sMAPE, WAPE. + - Reliability: Forecast Bias, Stability Index. +- Automated Benchmarking: + - Mandatory side-by-side comparison with Baseline models. +- Data Lineage: + - Storage of full 'Actual vs. Predicted' datasets per fold for downstream UI visualization. ## EXAMPLES: - `examples/backtest/run_backtest.py` — generates splits from config and executes evaluations. 
From fa1480e05587e225a8e77df29e47da77cff6fe7a Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Sun, 1 Feb 2026 03:51:52 +0000 Subject: [PATCH 02/11] feat(backtesting): implement time-series backtesting module (PRP-6) Add complete backtesting infrastructure for model evaluation: - TimeSeriesSplitter with expanding/sliding window strategies and gap support - MetricsCalculator with MAE, sMAPE, WAPE, Bias, and Stability Index - BacktestingService for orchestrating backtests with baseline comparisons - POST /backtesting/run endpoint with full response schema - 95 unit tests covering schemas, splitter, metrics, and service - Example scripts for API usage, split visualization, and metrics demo Co-Authored-By: Claude Opus 4.5 --- PRPs/PRP-6-backtesting.md | 1145 +++++++++++++++++ app/core/config.py | 6 + app/features/backtesting/__init__.py | 30 + app/features/backtesting/metrics.py | 344 +++++ app/features/backtesting/routes.py | 138 ++ app/features/backtesting/schemas.py | 250 ++++ app/features/backtesting/service.py | 438 +++++++ app/features/backtesting/splitter.py | 226 ++++ app/features/backtesting/tests/__init__.py | 1 + app/features/backtesting/tests/conftest.py | 111 ++ .../backtesting/tests/test_metrics.py | 378 ++++++ .../backtesting/tests/test_schemas.py | 285 ++++ .../backtesting/tests/test_service.py | 548 ++++++++ .../backtesting/tests/test_splitter.py | 348 +++++ app/main.py | 2 + examples/backtest/inspect_splits.py | 139 ++ examples/backtest/metrics_demo.py | 172 +++ examples/backtest/run_backtest.py | 129 ++ 18 files changed, 4690 insertions(+) create mode 100644 PRPs/PRP-6-backtesting.md create mode 100644 app/features/backtesting/__init__.py create mode 100644 app/features/backtesting/metrics.py create mode 100644 app/features/backtesting/routes.py create mode 100644 app/features/backtesting/schemas.py create mode 100644 app/features/backtesting/service.py create mode 100644 app/features/backtesting/splitter.py create mode 100644 
app/features/backtesting/tests/__init__.py create mode 100644 app/features/backtesting/tests/conftest.py create mode 100644 app/features/backtesting/tests/test_metrics.py create mode 100644 app/features/backtesting/tests/test_schemas.py create mode 100644 app/features/backtesting/tests/test_service.py create mode 100644 app/features/backtesting/tests/test_splitter.py create mode 100644 examples/backtest/inspect_splits.py create mode 100644 examples/backtest/metrics_demo.py create mode 100644 examples/backtest/run_backtest.py diff --git a/PRPs/PRP-6-backtesting.md b/PRPs/PRP-6-backtesting.md new file mode 100644 index 00000000..a4e71890 --- /dev/null +++ b/PRPs/PRP-6-backtesting.md @@ -0,0 +1,1145 @@ +# PRP-6: Backtesting + Metrics (ForecastOps Core) + +## Goal + +Implement a comprehensive backtesting framework for time-series forecasting models with time-based cross-validation, a full metrics suite, and data lineage for UI visualization. The module provides configurable splitting strategies (expanding/sliding windows with gap support), per-series and aggregated metrics, and mandatory baseline comparisons. 
+ +**End State:** A production-ready `backtesting` vertical slice with: +- `TimeSeriesSplitter` — Generates time-based train/test splits (expanding/sliding + gap) +- `BacktestConfig` — Immutable configuration with validation and config_hash() +- `MetricsCalculator` — Computes MAE, sMAPE, WAPE, Forecast Bias, Stability Index +- `BacktestResult` — Per-fold actuals vs predictions with lineage metadata +- `BacktestingService` — Orchestrates split generation, model training, prediction, evaluation +- `POST /backtesting/run` — Execute backtest for a series with configurable strategy +- `GET /backtesting/results/{backtest_id}` — Retrieve backtest results with fold details +- Mandatory baseline comparison (naive/seasonal_naive) +- All validation gates passing (ruff, mypy, pyright, pytest) + +--- + +## Why + +- **Model Validation**: Backtesting is the gold standard for evaluating time-series models +- **Leakage Prevention**: Time-based splits ensure no future data contaminates training +- **Metric Transparency**: Per-series distributions expose failures that aggregation masks +- **Baseline Benchmarking**: Every model must beat naive baselines to justify complexity +- **Reproducibility**: Stored split boundaries + config hash enable exact replication +- **UI Integration**: Actual vs Predicted datasets per fold enable rich visualizations + +--- + +## What + +### User-Visible Behavior + +1. **Run Backtest**: Accept series ID, model config, split strategy, return backtest_id +2. **Retrieve Results**: Get per-fold metrics, aggregated metrics, actual vs predicted data +3. **Split Strategies**: Expanding window (default), sliding window, configurable gap +4. **Metrics Suite**: MAE, sMAPE, WAPE, Forecast Bias, Stability Index +5. 
**Baseline Comparison**: Automatic benchmarking against naive and seasonal_naive + +### Success Criteria + +- [ ] TimeSeriesSplitter generates correct expanding/sliding splits with gap +- [ ] All 5 metrics implemented with edge case handling (zeros, empty arrays) +- [ ] BacktestingService orchestrates train → predict → evaluate loop +- [ ] Per-fold actuals vs predictions stored for UI lineage +- [ ] Baseline comparison runs automatically with every backtest +- [ ] Leakage sanity checks verify no future data in training +- [ ] 50+ unit tests covering splits, metrics, service, routes +- [ ] Example files demonstrating each splitting strategy + +--- + +## All Needed Context + +### Documentation & References + +```yaml +# MUST READ - Include these in your context window + +# sklearn TimeSeriesSplit (expanding window only) +- url: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html + why: "Reference implementation for expanding window, gap parameter" + critical: "sklearn only supports expanding; we need sliding window too" + +# Skforecast Backtesting Guide +- url: https://skforecast.org/0.14.0/user_guides/backtesting.html + why: "backtesting_forecaster() patterns, refit strategies" + critical: "Supports both expanding and sliding windows with custom metrics" + +# Time Series Cross-Validation Best Practices +- url: https://forecastegy.com/posts/time-series-cross-validation-python/ + why: "Visual diagrams of expanding vs sliding windows" + critical: "Gap parameter simulates operational data latency" + +# sMAPE Definition and Edge Cases +- url: https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error + why: "Formula: 100/n * sum(2*|F-A|/(|A|+|F|))" + critical: "Undefined when both actual and forecast are 0; use fallback" + +# WAPE vs MAPE Comparison +- url: https://www.baeldung.com/cs/mape-vs-wape-vs-wmape + why: "WAPE = sum(|A-F|) / sum(|A|) * 100" + critical: "WAPE handles low/zero values better than MAPE" + +# 
Forecast Bias Definition +- url: https://demandplanning.net/mape-wmape-and-forecast-bias/ + why: "Bias = sum(A-F) / n; negative = over-forecast" + critical: "Detects systematic over/under forecasting" + +# Backtest Machine Learning Models for Time Series +- url: https://machinelearningmastery.com/backtest-machine-learning-models-time-series-forecasting/ + why: "Walk-forward validation patterns" + critical: "Emphasizes importance of no data leakage" + +# Internal Codebase References +- file: app/features/forecasting/models.py + why: "BaseForecaster interface for fit/predict" + +- file: app/features/forecasting/service.py + why: "Pattern for ForecastingService with async DB operations" + +- file: app/features/forecasting/schemas.py + why: "Pattern for ModelConfig with config_hash()" + +- file: app/features/featuresets/service.py + why: "Pattern for cutoff_date enforcement (leakage prevention)" + +- file: app/core/config.py + why: "Pattern for Settings with environment variables" + +- file: PRPs/PRP-5-forecasting.md + why: "Reference PRP structure and task breakdown" +``` + +### Current Codebase Tree (Relevant Parts) + +```text +app/ +├── core/ +│ ├── config.py # Settings singleton +│ ├── database.py # AsyncSession, get_db +│ ├── exceptions.py # ForecastLabError base +│ └── logging.py # Structured logging +├── shared/ +│ └── models.py # TimestampMixin +├── features/ +│ ├── data_platform/ +│ │ └── models.py # SalesDaily, Store, Product, Calendar +│ ├── featuresets/ +│ │ ├── schemas.py # FeatureSetConfig, config_hash() +│ │ └── service.py # FeatureEngineeringService +│ └── forecasting/ +│ ├── models.py # BaseForecaster, NaiveForecaster, etc. 
+│ ├── schemas.py # ModelConfig, TrainRequest +│ ├── service.py # ForecastingService +│ └── persistence.py # ModelBundle, save/load +└── main.py # FastAPI app with router registration +``` + +### Desired Codebase Tree + +```text +app/features/backtesting/ # NEW: Backtesting vertical slice +├── __init__.py # Module exports +├── schemas.py # BacktestConfig, BacktestRequest, BacktestResponse, etc. +├── splitter.py # TimeSeriesSplitter (expanding/sliding + gap) +├── metrics.py # MetricsCalculator (MAE, sMAPE, WAPE, Bias, Stability) +├── service.py # BacktestingService (orchestration) +├── routes.py # POST /backtesting/run, GET /backtesting/results/{id} +└── tests/ + ├── __init__.py + ├── conftest.py # Fixtures: sample series, configs + ├── test_schemas.py # Config validation, immutability + ├── test_splitter.py # Split generation, gap handling + ├── test_metrics.py # Metric calculations, edge cases + ├── test_service.py # Orchestration logic + └── test_routes.py # Integration tests + +examples/backtest/ # NEW: Example scripts +├── run_backtest.py # Execute backtest with different strategies +├── inspect_splits.py # Visualize split boundaries +└── metrics_demo.py # Metric edge cases (zeros in sMAPE) + +app/core/config.py # MODIFY: Add backtesting settings +app/main.py # MODIFY: Register backtesting router +``` + +### Known Gotchas + +```python +# CRITICAL: sMAPE is undefined when both actual and forecast are 0 +# Use epsilon fallback: denominator = max(|A| + |F|, epsilon) +# Return 0.0 when both are exactly 0 (perfect forecast of zero) + +# CRITICAL: WAPE divides by sum(|actual|) - handle zero denominator +# When all actuals are 0, return np.inf or raise ValueError + +# CRITICAL: Sliding window requires enough data for min_train_size + gap + horizon +# Validate data length before attempting split generation + +# CRITICAL: Gap parameter simulates operational latency +# gap=1 means 1 day between last training date and first forecast date +# This is common in production 
where data has reporting delays + +# CRITICAL: Stability Index measures forecast consistency across folds +# Formula: std(fold_metrics) / mean(fold_metrics) * 100 +# Lower is better; high values indicate unstable model + +# CRITICAL: Baseline comparison is MANDATORY +# Every backtest must include naive and seasonal_naive benchmarks +# If custom model doesn't beat baselines, warn user + +# CRITICAL: Per-fold actuals vs predictions must be stored +# This enables UI visualization of forecast errors over time +# Store as list of FoldResult with dates, actuals, predictions + +# CRITICAL: Use cutoff_date = train_end_date for feature computation +# This is inherited from forecasting module - no future data +``` + +--- + +## Implementation Blueprint + +### Data Models and Schemas + +```python +# app/features/backtesting/schemas.py + +from __future__ import annotations +from datetime import date as date_type +from typing import Literal +import hashlib + +from pydantic import BaseModel, ConfigDict, Field, field_validator + + +class SplitConfig(BaseModel): + """Configuration for time-series splitting.""" + model_config = ConfigDict(frozen=True, extra="forbid") + + strategy: Literal["expanding", "sliding"] = Field( + default="expanding", + description="Expanding grows training window; sliding keeps fixed size" + ) + n_splits: int = Field(default=5, ge=2, le=20, description="Number of CV folds") + min_train_size: int = Field(default=30, ge=7, description="Minimum training samples") + gap: int = Field(default=0, ge=0, le=30, description="Gap between train end and test start") + horizon: int = Field(default=14, ge=1, le=90, description="Forecast horizon per fold") + + @field_validator("horizon") + @classmethod + def validate_horizon_vs_gap(cls, v: int, info) -> int: + """Ensure horizon is reasonable relative to gap.""" + data = getattr(info, "data", {}) + gap = data.get("gap", 0) + if v <= gap: + raise ValueError(f"horizon ({v}) must be greater than gap ({gap})") + return v + + 
+class BacktestConfig(BaseModel): + """Complete backtest configuration.""" + model_config = ConfigDict(frozen=True, extra="forbid") + + schema_version: str = Field(default="1.0", pattern=r"^\d+\.\d+(\.\d+)?$") + split_config: SplitConfig = Field(default_factory=SplitConfig) + model_config_main: ModelConfig # The model to evaluate (from forecasting.schemas) + include_baselines: bool = Field(default=True, description="Include naive/seasonal benchmarks") + store_fold_details: bool = Field(default=True, description="Store per-fold actuals/predictions") + + def config_hash(self) -> str: + """Deterministic hash for reproducibility.""" + return hashlib.sha256(self.model_dump_json().encode()).hexdigest()[:16] + + +class SplitBoundary(BaseModel): + """Boundary dates for a single CV split.""" + fold_index: int + train_start: date_type + train_end: date_type + test_start: date_type + test_end: date_type + train_size: int + test_size: int + + +class FoldResult(BaseModel): + """Results for a single backtest fold.""" + fold_index: int + split: SplitBoundary + dates: list[date_type] + actuals: list[float] + predictions: list[float] + metrics: dict[str, float] # {"mae": 1.23, "smape": 5.67, ...} + + +class ModelBacktestResult(BaseModel): + """Backtest results for a single model.""" + model_type: str + config_hash: str + fold_results: list[FoldResult] + aggregated_metrics: dict[str, float] # Mean across folds + metric_std: dict[str, float] # Std across folds for stability + + +class BacktestResponse(BaseModel): + """Complete backtest response.""" + backtest_id: str + store_id: int + product_id: int + config_hash: str + split_config: SplitConfig + main_model_results: ModelBacktestResult + baseline_results: list[ModelBacktestResult] | None = None # naive, seasonal_naive + comparison_summary: dict[str, dict[str, float]] | None = None # Model vs baselines + duration_ms: float + leakage_check_passed: bool +``` + +### Time Series Splitter + +```python +# 
app/features/backtesting/splitter.py + +from __future__ import annotations +from dataclasses import dataclass +from datetime import date as date_type, timedelta +from typing import Iterator + +import numpy as np + +from app.features.backtesting.schemas import SplitBoundary, SplitConfig + + +@dataclass +class TimeSeriesSplit: + """A single train/test split with indices and dates.""" + fold_index: int + train_indices: np.ndarray + test_indices: np.ndarray + train_dates: list[date_type] + test_dates: list[date_type] + + +class TimeSeriesSplitter: + """Generate time-based CV splits with expanding or sliding window. + + CRITICAL: Respects temporal order - no future data in training. + + Expanding Window: + Fold 1: [0..30] train, [31..44] test + Fold 2: [0..44] train, [45..58] test (training grows) + Fold 3: [0..58] train, [59..72] test + + Sliding Window: + Fold 1: [0..30] train, [31..44] test + Fold 2: [14..44] train, [45..58] test (training slides) + Fold 3: [28..58] train, [59..72] test + + Gap Parameter: + gap=1 inserts 1 day between train_end and test_start + This simulates operational data latency + """ + + def __init__(self, config: SplitConfig) -> None: + self.config = config + + def split( + self, + dates: list[date_type], + y: np.ndarray, + ) -> Iterator[TimeSeriesSplit]: + """Generate train/test splits. + + Args: + dates: Sorted list of dates (must match y length) + y: Target values array + + Yields: + TimeSeriesSplit objects for each fold + + Raises: + ValueError: If data is insufficient for requested splits + """ + n_samples = len(dates) + min_required = self.config.min_train_size + self.config.gap + self.config.horizon + + if n_samples < min_required: + raise ValueError( + f"Need at least {min_required} samples, got {n_samples}. 
" + f"(min_train={self.config.min_train_size}, gap={self.config.gap}, " + f"horizon={self.config.horizon})" + ) + + # Calculate test set positions + test_size = self.config.horizon + n_splits = self.config.n_splits + + # Work backwards from end of data + # Last test set ends at n_samples + # Each fold's test set is `test_size` samples + # We need n_splits * test_size for test sets + total_test_samples = n_splits * test_size + + # First fold's train_end position + if self.config.strategy == "expanding": + # Expanding: first train ends at min_train_size + first_train_end = self.config.min_train_size + else: + # Sliding: calculate so last fold uses all data + # Last fold: train_end + gap + test_size = n_samples + # Working backwards... + first_train_end = self.config.min_train_size + + # Calculate step size between folds + available_for_folds = n_samples - first_train_end - self.config.gap - test_size + step = max(1, available_for_folds // (n_splits - 1)) if n_splits > 1 else 0 + + for fold_idx in range(n_splits): + if self.config.strategy == "expanding": + # Training starts at 0, ends grow with each fold + train_start_idx = 0 + train_end_idx = first_train_end + (fold_idx * step) + else: + # Sliding: both start and end move forward + train_start_idx = fold_idx * step + train_end_idx = train_start_idx + self.config.min_train_size + (fold_idx * step // (n_splits or 1)) + # Ensure minimum train size + train_end_idx = max(train_end_idx, train_start_idx + self.config.min_train_size) + + # Test starts after gap + test_start_idx = train_end_idx + self.config.gap + test_end_idx = min(test_start_idx + test_size, n_samples) + + # Bounds check + if test_end_idx > n_samples or train_end_idx >= n_samples: + break + + yield TimeSeriesSplit( + fold_index=fold_idx, + train_indices=np.arange(train_start_idx, train_end_idx), + test_indices=np.arange(test_start_idx, test_end_idx), + train_dates=dates[train_start_idx:train_end_idx], + test_dates=dates[test_start_idx:test_end_idx], + ) + 
+ def get_boundaries(self, dates: list[date_type], y: np.ndarray) -> list[SplitBoundary]: + """Get split boundaries without full split objects.""" + boundaries = [] + for split in self.split(dates, y): + boundaries.append(SplitBoundary( + fold_index=split.fold_index, + train_start=split.train_dates[0], + train_end=split.train_dates[-1], + test_start=split.test_dates[0], + test_end=split.test_dates[-1], + train_size=len(split.train_indices), + test_size=len(split.test_indices), + )) + return boundaries +``` + +### Metrics Calculator + +```python +# app/features/backtesting/metrics.py + +from __future__ import annotations +from dataclasses import dataclass +from typing import Any + +import numpy as np + + +@dataclass +class MetricResult: + """Result of a single metric calculation.""" + name: str + value: float + n_samples: int + warnings: list[str] + + +class MetricsCalculator: + """Calculate forecasting accuracy metrics. + + Supported Metrics: + - MAE: Mean Absolute Error + - sMAPE: Symmetric Mean Absolute Percentage Error + - WAPE: Weighted Absolute Percentage Error + - Bias: Forecast Bias (positive = under-forecast) + - Stability: Coefficient of variation of per-fold metrics + + CRITICAL: All metrics handle edge cases (zeros, empty arrays). + """ + + EPSILON = 1e-10 # Fallback for division by zero + + @staticmethod + def mae(actuals: np.ndarray, predictions: np.ndarray) -> MetricResult: + """Mean Absolute Error. 
+ + Formula: mean(|actual - predicted|) + + Args: + actuals: Ground truth values + predictions: Predicted values + + Returns: + MetricResult with MAE value + """ + warnings: list[str] = [] + + if len(actuals) == 0: + return MetricResult(name="mae", value=np.nan, n_samples=0, warnings=["Empty array"]) + + if len(actuals) != len(predictions): + raise ValueError(f"Length mismatch: actuals={len(actuals)}, predictions={len(predictions)}") + + mae_value = float(np.mean(np.abs(actuals - predictions))) + + return MetricResult(name="mae", value=mae_value, n_samples=len(actuals), warnings=warnings) + + @staticmethod + def smape(actuals: np.ndarray, predictions: np.ndarray) -> MetricResult: + """Symmetric Mean Absolute Percentage Error. + + Formula: 100/n * sum(2 * |A - F| / (|A| + |F|)) + + CRITICAL: When both A and F are 0, contributes 0 to sum (perfect forecast). + Zero denominators are masked with np.where, so no division by zero reaches the result. + + Args: + actuals: Ground truth values + predictions: Predicted values + + Returns: + MetricResult with sMAPE value (0-200 scale) + """ + warnings: list[str] = [] + + if len(actuals) == 0: + return MetricResult(name="smape", value=np.nan, n_samples=0, warnings=["Empty array"]) + + if len(actuals) != len(predictions): + raise ValueError(f"Length mismatch: actuals={len(actuals)}, predictions={len(predictions)}") + + numerator = 2 * np.abs(actuals - predictions) + denominator = np.abs(actuals) + np.abs(predictions) + + # Handle zeros: when both are 0, result is 0 (perfect forecast of zero) + # The denominator is 0 only when both values are 0, so the inner branch below is purely defensive + with np.errstate(divide='ignore', invalid='ignore'): + ratios = np.where( + (actuals == 0) & (predictions == 0), + 0.0, # Perfect forecast of zero + np.where( + denominator == 0, + 2.0, # Maximum error (shouldn't happen if above handles 0/0) + numerator / denominator + ) + ) + + smape_value = float(100.0 * np.mean(ratios)) + + n_zeros = int(np.sum((actuals == 0) | (predictions == 0))) + if n_zeros > 0: + 
warnings.append(f"{n_zeros} samples with zero values") + + return MetricResult(name="smape", value=smape_value, n_samples=len(actuals), warnings=warnings) + + @staticmethod + def wape(actuals: np.ndarray, predictions: np.ndarray) -> MetricResult: + """Weighted Absolute Percentage Error. + + Formula: sum(|A - F|) / sum(|A|) * 100 + + CRITICAL: Better than MAPE for intermittent/low-volume series. + Returns inf if sum of actuals is zero. + + Args: + actuals: Ground truth values + predictions: Predicted values + + Returns: + MetricResult with WAPE value + """ + warnings: list[str] = [] + + if len(actuals) == 0: + return MetricResult(name="wape", value=np.nan, n_samples=0, warnings=["Empty array"]) + + if len(actuals) != len(predictions): + raise ValueError(f"Length mismatch: actuals={len(actuals)}, predictions={len(predictions)}") + + sum_abs_error = float(np.sum(np.abs(actuals - predictions))) + sum_abs_actual = float(np.sum(np.abs(actuals))) + + if sum_abs_actual == 0: + warnings.append("Sum of actuals is zero; WAPE undefined") + return MetricResult(name="wape", value=np.inf, n_samples=len(actuals), warnings=warnings) + + wape_value = (sum_abs_error / sum_abs_actual) * 100.0 + + return MetricResult(name="wape", value=wape_value, n_samples=len(actuals), warnings=warnings) + + @staticmethod + def bias(actuals: np.ndarray, predictions: np.ndarray) -> MetricResult: + """Forecast Bias. 
+ + Formula: mean(actual - predicted) + + Interpretation: + - Positive: Model under-forecasts (actuals > predictions) + - Negative: Model over-forecasts (actuals < predictions) + - Zero: No systematic bias + + Args: + actuals: Ground truth values + predictions: Predicted values + + Returns: + MetricResult with Bias value + """ + warnings: list[str] = [] + + if len(actuals) == 0: + return MetricResult(name="bias", value=np.nan, n_samples=0, warnings=["Empty array"]) + + if len(actuals) != len(predictions): + raise ValueError(f"Length mismatch: actuals={len(actuals)}, predictions={len(predictions)}") + + bias_value = float(np.mean(actuals - predictions)) + + if abs(bias_value) > np.std(actuals - predictions): + warnings.append("Bias exceeds error standard deviation; systematic over/under-forecasting detected") + + return MetricResult(name="bias", value=bias_value, n_samples=len(actuals), warnings=warnings) + + @staticmethod + def stability_index(fold_metric_values: list[float]) -> MetricResult: + """Stability Index (coefficient of variation across folds). 
+ + Formula: std(metrics) / mean(metrics) * 100 + + Interpretation: + - Lower is better (more stable model) + - High values indicate inconsistent performance across time periods + + Args: + fold_metric_values: List of metric values from each fold + + Returns: + MetricResult with Stability Index value + """ + warnings: list[str] = [] + + if len(fold_metric_values) < 2: + return MetricResult( + name="stability_index", + value=np.nan, + n_samples=len(fold_metric_values), + warnings=["Need at least 2 folds for stability calculation"] + ) + + values = np.array(fold_metric_values) + mean_val = float(np.mean(values)) + std_val = float(np.std(values)) + + if mean_val == 0: + warnings.append("Mean is zero; stability index undefined") + return MetricResult(name="stability_index", value=np.inf, n_samples=len(fold_metric_values), warnings=warnings) + + stability = (std_val / abs(mean_val)) * 100.0 + + if stability > 50: + warnings.append("High instability (>50%); model performance varies significantly across folds") + + return MetricResult(name="stability_index", value=stability, n_samples=len(fold_metric_values), warnings=warnings) + + def calculate_all( + self, + actuals: np.ndarray, + predictions: np.ndarray + ) -> dict[str, float]: + """Calculate all point metrics for a single fold. + + Args: + actuals: Ground truth values + predictions: Predicted values + + Returns: + Dictionary of metric name to value + """ + return { + "mae": self.mae(actuals, predictions).value, + "smape": self.smape(actuals, predictions).value, + "wape": self.wape(actuals, predictions).value, + "bias": self.bias(actuals, predictions).value, + } + + def aggregate_fold_metrics( + self, + fold_metrics: list[dict[str, float]], + ) -> tuple[dict[str, float], dict[str, float]]: + """Aggregate metrics across folds. 
+ + Args: + fold_metrics: List of per-fold metric dictionaries + + Returns: + Tuple of (aggregated_means, stability_indices); each stability entry is the coefficient of variation (%) across folds, keyed as "<metric>_stability" + """ + if not fold_metrics: + return {}, {} + + metric_names = fold_metrics[0].keys() + aggregated: dict[str, float] = {} + stability: dict[str, float] = {} + + for name in metric_names: + values = [fm[name] for fm in fold_metrics if not np.isnan(fm[name])] + if values: + aggregated[name] = float(np.mean(values)) + stability[f"{name}_stability"] = self.stability_index(values).value + else: + aggregated[name] = np.nan + stability[f"{name}_stability"] = np.nan + + return aggregated, stability +``` + +--- + +## Task List + +### Task 1: Add backtesting settings to config + +```yaml +FILE: app/core/config.py +ACTION: MODIFY +FIND: "forecast_enable_lightgbm: bool = False" +INJECT AFTER: + - "# Backtesting" + - "backtest_max_splits: int = 20" + - "backtest_default_min_train_size: int = 30" + - "backtest_max_gap: int = 30" + - "backtest_results_dir: str = './artifacts/backtests'" +VALIDATION: + - uv run mypy app/core/config.py + - uv run pyright app/core/config.py +``` + +### Task 2: Create backtesting module structure + +```yaml +ACTION: CREATE directories and __init__.py +FILES: + - app/features/backtesting/__init__.py + - app/features/backtesting/tests/__init__.py +PATTERN: Mirror forecasting module exports +``` + +### Task 3: Implement schemas.py + +```yaml +FILE: app/features/backtesting/schemas.py +ACTION: CREATE +IMPLEMENT: + - SplitConfig with frozen=True, strategy validation + - BacktestConfig with config_hash() + - SplitBoundary for fold boundaries + - FoldResult for per-fold actuals/predictions + - ModelBacktestResult for single model results + - BacktestRequest, BacktestResponse schemas +PATTERN: Mirror app/features/forecasting/schemas.py +CRITICAL: + - Import ModelConfig from forecasting.schemas + - Validate horizon > gap + - Use Literal["expanding", "sliding"] for strategy +VALIDATION: + - uv run mypy app/features/backtesting/schemas.py + 
- uv run pyright app/features/backtesting/schemas.py +``` + +### Task 4: Implement splitter.py + +```yaml +FILE: app/features/backtesting/splitter.py +ACTION: CREATE +IMPLEMENT: + - TimeSeriesSplit dataclass (indices + dates) + - TimeSeriesSplitter class with split() generator + - get_boundaries() for boundary inspection + - Support expanding and sliding strategies + - Gap parameter between train end and test start +CRITICAL: + - Validate sufficient data for requested splits + - Expanding: train grows, start stays at 0 + - Sliding: both start and end move forward + - Yield splits in chronological order +VALIDATION: + - uv run mypy app/features/backtesting/splitter.py + - uv run pyright app/features/backtesting/splitter.py +``` + +### Task 5: Implement metrics.py + +```yaml +FILE: app/features/backtesting/metrics.py +ACTION: CREATE +IMPLEMENT: + - MetricResult dataclass with warnings + - MetricsCalculator class + - mae() - Mean Absolute Error + - smape() - Symmetric Mean Absolute Percentage Error + - wape() - Weighted Absolute Percentage Error + - bias() - Forecast Bias + - stability_index() - Coefficient of variation + - calculate_all() - Compute all metrics for a fold + - aggregate_fold_metrics() - Mean + stability across folds +CRITICAL: + - Handle zeros in sMAPE denominator + - Handle zero sum of actuals in WAPE + - Return np.nan for empty arrays + - Log warnings for edge cases +VALIDATION: + - uv run mypy app/features/backtesting/metrics.py + - uv run pyright app/features/backtesting/metrics.py +``` + +### Task 6: Implement service.py + +```yaml +FILE: app/features/backtesting/service.py +ACTION: CREATE +IMPLEMENT: + - BacktestingService class + - run_backtest() - Main orchestration method + - _load_series_data() - Query SalesDaily for series + - _run_single_model_backtest() - Train/predict/evaluate per fold + - _run_baseline_comparison() - Run naive + seasonal_naive + - _check_leakage() - Verify no future data in training + - _generate_comparison_summary() - 
Model vs baselines +CRITICAL: + - Use ForecastingService for model training/prediction + - Cutoff date = train_end for each fold + - Store per-fold actuals/predictions if config.store_fold_details + - Return BacktestResponse with all results +PATTERN: Mirror app/features/forecasting/service.py +VALIDATION: + - uv run mypy app/features/backtesting/service.py + - uv run pyright app/features/backtesting/service.py +``` + +### Task 7: Implement routes.py + +```yaml +FILE: app/features/backtesting/routes.py +ACTION: CREATE +IMPLEMENT: + - APIRouter(prefix="/backtesting", tags=["backtesting"]) + - POST /run - Execute backtest, return results + - GET /results/{backtest_id} - (Optional) Retrieve stored results +PATTERN: Mirror app/features/forecasting/routes.py +CRITICAL: + - time.perf_counter() for duration_ms + - Depends(get_db) for database session + - Structured logging: backtesting.run_started, backtesting.run_completed + - Return 400 for insufficient data +VALIDATION: + - uv run mypy app/features/backtesting/routes.py + - uv run pyright app/features/backtesting/routes.py +``` + +### Task 8: Register router in main.py + +```yaml +FILE: app/main.py +ACTION: MODIFY +FIND: "app.include_router(forecasting_router)" +INJECT AFTER: + - "from app.features.backtesting.routes import router as backtesting_router" + - "app.include_router(backtesting_router)" +VALIDATION: + - uv run python -c "from app.main import app; print('OK')" +``` + +### Task 9: Create test fixtures (conftest.py) + +```yaml +FILE: app/features/backtesting/tests/conftest.py +ACTION: CREATE +IMPLEMENT: + - sample_daily_series: 120 days of sequential dates + values + - sample_seasonal_series: 84 days (12 weeks) with weekly pattern + - sample_split_config_expanding: SplitConfig with strategy="expanding" + - sample_split_config_sliding: SplitConfig with strategy="sliding" + - sample_backtest_config: Full BacktestConfig with naive model +PATTERN: Mirror app/features/forecasting/tests/conftest.py +``` + +### Task 
10: Create test_schemas.py + +```yaml +FILE: app/features/backtesting/tests/test_schemas.py +ACTION: CREATE +IMPLEMENT: + - Test SplitConfig validation (positive values, ranges) + - Test SplitConfig strategy validation ("expanding", "sliding") + - Test SplitConfig horizon > gap validation + - Test BacktestConfig immutability (frozen=True) + - Test config_hash() determinism +VALIDATION: + - uv run pytest app/features/backtesting/tests/test_schemas.py -v +``` + +### Task 11: Create test_splitter.py + +```yaml +FILE: app/features/backtesting/tests/test_splitter.py +ACTION: CREATE +IMPLEMENT: + - TestTimeSeriesSplitter class + - test_expanding_window_splits: Train grows, start stays at 0 + - test_sliding_window_splits: Both start and end move + - test_gap_between_train_test: Verify gap days between train_end and test_start + - test_insufficient_data_raises: ValueError for too little data + - test_boundaries_match_split_indices: get_boundaries() consistency + - test_no_overlap_between_folds: Verify non-overlapping test sets + - test_chronological_order: Folds are in time order +CRITICAL: + - Assert exact indices for deterministic splits + - Verify train/test don't overlap + - Verify gap is respected +VALIDATION: + - uv run pytest app/features/backtesting/tests/test_splitter.py -v +``` + +### Task 12: Create test_metrics.py + +```yaml +FILE: app/features/backtesting/tests/test_metrics.py +ACTION: CREATE +IMPLEMENT: + - TestMAE: Basic calculation, empty array, length mismatch + - TestSMAPE: Basic calculation, zeros handling, both-zero case + - TestWAPE: Basic calculation, zero actuals + - TestBias: Positive bias (under-forecast), negative bias (over-forecast) + - TestStabilityIndex: Low index (stable, good), high index (unstable, bad) + - TestCalculateAll: All metrics at once + - TestAggregateFoldMetrics: Mean and stability across folds +CRITICAL: + - Test edge case: actuals = [0, 0, 0], predictions = [0, 0, 0] + - Test edge case: actuals = [0, 1, 2], predictions = [0.5, 0.5, 0.5] 
+ - Assert exact expected values for known inputs +VALIDATION: + - uv run pytest app/features/backtesting/tests/test_metrics.py -v +``` + +### Task 13: Create test_service.py + +```yaml +FILE: app/features/backtesting/tests/test_service.py +ACTION: CREATE +IMPLEMENT: + - Test run_backtest happy path (mock DB, mock ForecastingService) + - Test baseline comparison included when config.include_baselines=True + - Test fold_details stored when config.store_fold_details=True + - Test leakage check passes for valid splits + - Test insufficient data returns appropriate error + - Test comparison_summary shows model vs baselines +VALIDATION: + - uv run pytest app/features/backtesting/tests/test_service.py -v +``` + +### Task 14: Create test_routes.py (optional integration) + +```yaml +FILE: app/features/backtesting/tests/test_routes.py +ACTION: CREATE +IMPLEMENT: + - Test POST /backtesting/run with valid request + - Test 400 response for insufficient data + - Test 422 response for invalid config +PATTERN: Mirror app/features/forecasting/tests/ patterns +VALIDATION: + - uv run pytest app/features/backtesting/tests/test_routes.py -v +``` + +### Task 15: Create example files + +```yaml +FILES: + - examples/backtest/run_backtest.py + - examples/backtest/inspect_splits.py + - examples/backtest/metrics_demo.py +ACTION: CREATE +IMPLEMENT: + - run_backtest.py: Execute backtest with expanding and sliding configs + - inspect_splits.py: Visualize split boundaries with print output + - metrics_demo.py: Show metric calculations with edge cases +``` + +### Task 16: Update module __init__.py exports + +```yaml +FILE: app/features/backtesting/__init__.py +ACTION: MODIFY +IMPLEMENT: + - Export all public classes + - __all__ list (sorted alphabetically) +VALIDATION: + - uv run python -c "from app.features.backtesting import *; print('OK')" +``` + +--- + +## Validation Loop + +### Level 1: Syntax & Style + +```bash +# Run after EACH file creation +uv run ruff check app/features/backtesting/ 
--fix +uv run ruff format app/features/backtesting/ + +# Expected: All checks passed! +``` + +### Level 2: Type Checking + +```bash +# Run after completing schemas, splitter, metrics, service +uv run mypy app/features/backtesting/ +uv run pyright app/features/backtesting/ + +# Expected: Success: no issues found +``` + +### Level 3: Unit Tests + +```bash +# Run incrementally as tests are created +uv run pytest app/features/backtesting/tests/test_schemas.py -v +uv run pytest app/features/backtesting/tests/test_splitter.py -v +uv run pytest app/features/backtesting/tests/test_metrics.py -v +uv run pytest app/features/backtesting/tests/test_service.py -v + +# Run all +uv run pytest app/features/backtesting/tests/ -v + +# Expected: 50+ tests passed +``` + +### Level 4: Integration Test + +```bash +# Start API +uv run uvicorn app.main:app --reload --port 8123 + +# Test backtest endpoint (requires seeded DB with 120+ days of data) +curl -X POST http://localhost:8123/backtesting/run \ + -H "Content-Type: application/json" \ + -d '{ + "store_id": 1, + "product_id": 1, + "start_date": "2024-01-01", + "end_date": "2024-06-30", + "config": { + "split_config": { + "strategy": "expanding", + "n_splits": 5, + "min_train_size": 30, + "gap": 0, + "horizon": 14 + }, + "model_config_main": { + "model_type": "naive" + }, + "include_baselines": true, + "store_fold_details": true + } + }' + +# Expected: JSON with main_model_results, baseline_results, comparison_summary +``` + +### Level 5: Full Validation + +```bash +# Complete validation suite +uv run ruff check app/features/backtesting/ && \ +uv run mypy app/features/backtesting/ && \ +uv run pyright app/features/backtesting/ && \ +uv run pytest app/features/backtesting/tests/ -v + +# Expected: All green +``` + +--- + +## Final Checklist + +- [ ] All 16 tasks completed +- [ ] `uv run ruff check .` — no errors +- [ ] `uv run mypy app/features/backtesting/` — no errors +- [ ] `uv run pyright app/features/backtesting/` — no errors +- [ ] 
`uv run pytest app/features/backtesting/tests/ -v` — 50+ tests passed +- [ ] Example scripts run successfully +- [ ] Router registered in main.py +- [ ] Settings added to config.py +- [ ] Logging events follow standard format +- [ ] Baseline comparison works automatically +- [ ] Per-fold actuals/predictions stored for UI + +--- + +## Anti-Patterns to Avoid + +- **DON'T** use random splits — time-series requires temporal ordering +- **DON'T** ignore the gap parameter — it simulates real operational latency +- **DON'T** aggregate metrics without exposing per-fold distributions +- **DON'T** skip baseline comparison — it's mandatory for model validation +- **DON'T** use future data in training — enforce cutoff_date strictly +- **DON'T** catch generic Exception — be specific about error types +- **DON'T** hardcode metric thresholds — make them configurable +- **DON'T** silently handle zero division — return np.nan with warnings + +--- + +## Confidence Score: 8/10 + +**Strengths:** +- Clear patterns from forecasting module to follow +- Well-documented time-series CV patterns (sklearn, skforecast) +- Comprehensive metrics suite with edge case handling +- Strong task breakdown with validation gates +- Baseline comparison ensures practical model evaluation + +**Risks:** +- Service orchestration complexity (train/predict loop per fold) +- Database queries for large series may need optimization +- Integration tests require seeded database with sufficient data +- Sliding window logic is more complex than expanding + +**Mitigation:** +- Focus on expanding window first (simpler, matches sklearn) +- Add pagination/batching for large series if needed +- Provide seed script with 120+ days of data +- Thoroughly test sliding window edge cases + +--- + +## Sources + +- [sklearn TimeSeriesSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html) +- [Skforecast Backtesting Guide](https://skforecast.org/0.14.0/user_guides/backtesting.html) +- 
[Time Series Cross-Validation Best Practices](https://forecastegy.com/posts/time-series-cross-validation-python/) +- [sMAPE Definition (Wikipedia)](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error) +- [MAPE vs WAPE vs WMAPE (Baeldung)](https://www.baeldung.com/cs/mape-vs-wape-vs-wmape) +- [Forecast Bias Definition](https://demandplanning.net/mape-wmape-and-forecast-bias/) +- [Backtest ML Models for Time Series](https://machinelearningmastery.com/backtest-machine-learning-models-time-series-forecasting/) diff --git a/app/core/config.py b/app/core/config.py index d3635014..39c81f1d 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -47,6 +47,12 @@ class Settings(BaseSettings): forecast_model_artifacts_dir: str = "./artifacts/models" forecast_enable_lightgbm: bool = False + # Backtesting + backtest_max_splits: int = 20 + backtest_default_min_train_size: int = 30 + backtest_max_gap: int = 30 + backtest_results_dir: str = "./artifacts/backtests" + @property def is_development(self) -> bool: """Check if running in development mode.""" diff --git a/app/features/backtesting/__init__.py b/app/features/backtesting/__init__.py new file mode 100644 index 00000000..55a0ec79 --- /dev/null +++ b/app/features/backtesting/__init__.py @@ -0,0 +1,30 @@ +"""Backtesting module for time-series forecasting evaluation. + +Provides time-based cross-validation, metrics calculation, and baseline comparisons. 
+""" + +from app.features.backtesting.metrics import MetricResult, MetricsCalculator +from app.features.backtesting.schemas import ( + BacktestConfig, + BacktestRequest, + BacktestResponse, + FoldResult, + ModelBacktestResult, + SplitBoundary, + SplitConfig, +) +from app.features.backtesting.splitter import TimeSeriesSplit, TimeSeriesSplitter + +__all__ = [ + "BacktestConfig", + "BacktestRequest", + "BacktestResponse", + "FoldResult", + "MetricResult", + "MetricsCalculator", + "ModelBacktestResult", + "SplitBoundary", + "SplitConfig", + "TimeSeriesSplit", + "TimeSeriesSplitter", +] diff --git a/app/features/backtesting/metrics.py b/app/features/backtesting/metrics.py new file mode 100644 index 00000000..7bb90c0d --- /dev/null +++ b/app/features/backtesting/metrics.py @@ -0,0 +1,344 @@ +"""Metrics calculator for forecast evaluation. + +Supported Metrics: +- MAE: Mean Absolute Error +- sMAPE: Symmetric Mean Absolute Percentage Error +- WAPE: Weighted Absolute Percentage Error +- Bias: Forecast Bias (positive = under-forecast) +- Stability: Coefficient of variation of per-fold metrics + +CRITICAL: All metrics handle edge cases (zeros, empty arrays). +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +import numpy as np + + +@dataclass +class MetricResult: + """Result of a single metric calculation. + + Attributes: + name: Name of the metric. + value: Calculated value (may be nan for edge cases). + n_samples: Number of samples used in calculation. + warnings: List of warnings generated during calculation. + """ + + name: str + value: float + n_samples: int + warnings: list[str] = field(default_factory=lambda: []) + + +class MetricsCalculator: + """Calculate forecasting accuracy metrics. + + Provides methods for computing various forecast accuracy metrics + with proper edge case handling. 
+ + Supported Metrics: + - MAE: Mean Absolute Error + - sMAPE: Symmetric Mean Absolute Percentage Error (0-200 scale) + - WAPE: Weighted Absolute Percentage Error + - Bias: Forecast Bias (positive = under-forecast) + - Stability: Coefficient of variation of per-fold metrics + + CRITICAL: All metrics handle edge cases (zeros, empty arrays). + """ + + EPSILON = 1e-10 # Fallback for division by zero + + @staticmethod + def mae( + actuals: np.ndarray[Any, np.dtype[np.floating[Any]]], + predictions: np.ndarray[Any, np.dtype[np.floating[Any]]], + ) -> MetricResult: + """Mean Absolute Error. + + Formula: mean(|actual - predicted|) + + Args: + actuals: Ground truth values. + predictions: Predicted values. + + Returns: + MetricResult with MAE value. + + Raises: + ValueError: If arrays have different lengths. + """ + warnings: list[str] = [] + + if len(actuals) == 0: + return MetricResult(name="mae", value=np.nan, n_samples=0, warnings=["Empty array"]) + + if len(actuals) != len(predictions): + raise ValueError( + f"Length mismatch: actuals={len(actuals)}, predictions={len(predictions)}" + ) + + mae_value = float(np.mean(np.abs(actuals - predictions))) + + return MetricResult(name="mae", value=mae_value, n_samples=len(actuals), warnings=warnings) + + @staticmethod + def smape( + actuals: np.ndarray[Any, np.dtype[np.floating[Any]]], + predictions: np.ndarray[Any, np.dtype[np.floating[Any]]], + ) -> MetricResult: + """Symmetric Mean Absolute Percentage Error. + + Formula: 100/n * sum(2 * |A - F| / (|A| + |F|)) + + CRITICAL: When both A and F are 0, contributes 0 to sum (perfect forecast). + Zero denominators are masked explicitly; no epsilon is added to the denominator. + + Args: + actuals: Ground truth values. + predictions: Predicted values. + + Returns: + MetricResult with sMAPE value (0-200 scale). + + Raises: + ValueError: If arrays have different lengths. 
+ """ + warnings: list[str] = [] + + if len(actuals) == 0: + return MetricResult(name="smape", value=np.nan, n_samples=0, warnings=["Empty array"]) + + if len(actuals) != len(predictions): + raise ValueError( + f"Length mismatch: actuals={len(actuals)}, predictions={len(predictions)}" + ) + + numerator = 2.0 * np.abs(actuals - predictions) + denominator = np.abs(actuals) + np.abs(predictions) + + # Handle zeros: when both are 0, result is 0 (perfect forecast of zero) + # A zero denominator implies both values are 0 (handled first); the 2.0 branch is a guard + with np.errstate(divide="ignore", invalid="ignore"): + ratios = np.where( + (actuals == 0) & (predictions == 0), + 0.0, # Perfect forecast of zero + np.where( + denominator == 0, + 2.0, # Maximum error (shouldn't happen if above handles 0/0) + numerator / denominator, + ), + ) + + smape_value = float(100.0 * np.mean(ratios)) + + n_zeros = int(np.sum((actuals == 0) | (predictions == 0))) + if n_zeros > 0: + warnings.append(f"{n_zeros} samples with zero values") + + return MetricResult( + name="smape", value=smape_value, n_samples=len(actuals), warnings=warnings + ) + + @staticmethod + def wape( + actuals: np.ndarray[Any, np.dtype[np.floating[Any]]], + predictions: np.ndarray[Any, np.dtype[np.floating[Any]]], + ) -> MetricResult: + """Weighted Absolute Percentage Error. + + Formula: sum(|A - F|) / sum(|A|) * 100 + + CRITICAL: Better than MAPE for intermittent/low-volume series. + Returns inf if sum of actuals is zero. + + Args: + actuals: Ground truth values. + predictions: Predicted values. + + Returns: + MetricResult with WAPE value. + + Raises: + ValueError: If arrays have different lengths. 
+ """ + warnings: list[str] = [] + + if len(actuals) == 0: + return MetricResult(name="wape", value=np.nan, n_samples=0, warnings=["Empty array"]) + + if len(actuals) != len(predictions): + raise ValueError( + f"Length mismatch: actuals={len(actuals)}, predictions={len(predictions)}" + ) + + sum_abs_error = float(np.sum(np.abs(actuals - predictions))) + sum_abs_actual = float(np.sum(np.abs(actuals))) + + if sum_abs_actual == 0: + warnings.append("Sum of actuals is zero; WAPE undefined") + return MetricResult( + name="wape", value=float("inf"), n_samples=len(actuals), warnings=warnings + ) + + wape_value = (sum_abs_error / sum_abs_actual) * 100.0 + + return MetricResult( + name="wape", value=wape_value, n_samples=len(actuals), warnings=warnings + ) + + @staticmethod + def bias( + actuals: np.ndarray[Any, np.dtype[np.floating[Any]]], + predictions: np.ndarray[Any, np.dtype[np.floating[Any]]], + ) -> MetricResult: + """Forecast Bias. + + Formula: mean(actual - predicted) + + Interpretation: + - Positive: Model under-forecasts (actuals > predictions) + - Negative: Model over-forecasts (actuals < predictions) + - Zero: No systematic bias + + Args: + actuals: Ground truth values. + predictions: Predicted values. + + Returns: + MetricResult with Bias value. + + Raises: + ValueError: If arrays have different lengths. 
+ """ + warnings: list[str] = [] + + if len(actuals) == 0: + return MetricResult(name="bias", value=np.nan, n_samples=0, warnings=["Empty array"]) + + if len(actuals) != len(predictions): + raise ValueError( + f"Length mismatch: actuals={len(actuals)}, predictions={len(predictions)}" + ) + + errors = actuals - predictions + bias_value = float(np.mean(errors)) + error_std = float(np.std(errors)) + + if error_std > 0 and abs(bias_value) > error_std: + warnings.append( + "Bias exceeds error standard deviation; systematic over/under-forecasting detected" + ) + + return MetricResult( + name="bias", value=bias_value, n_samples=len(actuals), warnings=warnings + ) + + @staticmethod + def stability_index(fold_metric_values: list[float]) -> MetricResult: + """Stability Index (coefficient of variation across folds). + + Formula: std(metrics) / |mean(metrics)| * 100 + + Interpretation: + - Lower is better (more stable model) + - High values indicate inconsistent performance across time periods + + Args: + fold_metric_values: List of metric values from each fold. + + Returns: + MetricResult with Stability Index value. 
+ """ + warnings: list[str] = [] + + # Filter out nan values + valid_values = [v for v in fold_metric_values if not np.isnan(v)] + + if len(valid_values) < 2: + return MetricResult( + name="stability_index", + value=np.nan, + n_samples=len(valid_values), + warnings=["Need at least 2 valid folds for stability calculation"], + ) + + values = np.array(valid_values) + mean_val = float(np.mean(values)) + std_val = float(np.std(values)) + + if mean_val == 0: + warnings.append("Mean is zero; stability index undefined") + return MetricResult( + name="stability_index", + value=float("inf"), + n_samples=len(valid_values), + warnings=warnings, + ) + + stability = (std_val / abs(mean_val)) * 100.0 + + if stability > 50: + warnings.append( + "High instability (>50%); model performance varies significantly across folds" + ) + + return MetricResult( + name="stability_index", value=stability, n_samples=len(valid_values), warnings=warnings + ) + + def calculate_all( + self, + actuals: np.ndarray[Any, np.dtype[np.floating[Any]]], + predictions: np.ndarray[Any, np.dtype[np.floating[Any]]], + ) -> dict[str, float]: + """Calculate all point metrics for a single fold. + + Args: + actuals: Ground truth values. + predictions: Predicted values. + + Returns: + Dictionary of metric name to value. + """ + return { + "mae": self.mae(actuals, predictions).value, + "smape": self.smape(actuals, predictions).value, + "wape": self.wape(actuals, predictions).value, + "bias": self.bias(actuals, predictions).value, + } + + def aggregate_fold_metrics( + self, + fold_metrics: list[dict[str, float]], + ) -> tuple[dict[str, float], dict[str, float]]: + """Aggregate metrics across folds. + + Args: + fold_metrics: List of per-fold metric dictionaries. + + Returns: + Tuple of (aggregated_means, stability_indices). 
+ """ + if not fold_metrics: + return {}, {} + + metric_names = list(fold_metrics[0].keys()) + aggregated: dict[str, float] = {} + stability: dict[str, float] = {} + + for name in metric_names: + values = [fm[name] for fm in fold_metrics if not np.isnan(fm[name])] + if values: + aggregated[name] = float(np.mean(values)) + stability_result = self.stability_index(values) + stability[f"{name}_stability"] = stability_result.value + else: + aggregated[name] = np.nan + stability[f"{name}_stability"] = np.nan + + return aggregated, stability diff --git a/app/features/backtesting/routes.py b/app/features/backtesting/routes.py new file mode 100644 index 00000000..3971bf85 --- /dev/null +++ b/app/features/backtesting/routes.py @@ -0,0 +1,138 @@ +"""FastAPI routes for backtesting endpoints. + +Endpoints: +- POST /backtesting/run - Execute backtest for a series +""" + +from __future__ import annotations + +import time + +from fastapi import APIRouter, Depends, HTTPException, status +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.database import get_db +from app.core.exceptions import DatabaseError +from app.core.logging import get_logger +from app.features.backtesting.schemas import BacktestRequest, BacktestResponse +from app.features.backtesting.service import BacktestingService + +logger = get_logger(__name__) + +router = APIRouter(prefix="/backtesting", tags=["backtesting"]) + + +@router.post( + "/run", + response_model=BacktestResponse, + status_code=status.HTTP_200_OK, + summary="Run a backtest", + description=""" +Run a time-series backtest for a store/product series. 
+ +**Split Strategies:** +- `expanding`: Training window grows with each fold (sklearn-like) +- `sliding`: Training window slides forward with fixed size + +**Gap Parameter:** +- Simulates operational data latency +- gap=1 means 1 day between training end and test start + +**Metrics Calculated:** +- MAE: Mean Absolute Error +- sMAPE: Symmetric Mean Absolute Percentage Error (0-200) +- WAPE: Weighted Absolute Percentage Error +- Bias: Forecast bias (positive = under-forecast) + +**Baseline Comparison:** +When `include_baselines=true`, automatically compares against: +- Naive (last value) +- Seasonal Naive (same day previous week) + +**Response includes:** +- Per-fold metrics and predictions (if `store_fold_details=true`) +- Aggregated metrics across all folds +- Comparison summary vs baselines +- Leakage validation status +""", +) +async def run_backtest( + request: BacktestRequest, + db: AsyncSession = Depends(get_db), +) -> BacktestResponse: + """Run a backtest for a single series. + + Args: + request: Backtest request with configuration. + db: Async database session from dependency. + + Returns: + BacktestResponse with all results. + + Raises: + HTTPException: If validation fails or insufficient data. + DatabaseError: If database operation fails. 
+ """ + start_time = time.perf_counter() + + logger.info( + "backtesting.request_received", + store_id=request.store_id, + product_id=request.product_id, + model_type=request.config.model_config_main.model_type, + strategy=request.config.split_config.strategy, + n_splits=request.config.split_config.n_splits, + ) + + service = BacktestingService() + + try: + response = await service.run_backtest( + db=db, + store_id=request.store_id, + product_id=request.product_id, + start_date=request.start_date, + end_date=request.end_date, + config=request.config, + ) + + duration_ms = (time.perf_counter() - start_time) * 1000 + + logger.info( + "backtesting.request_completed", + store_id=request.store_id, + product_id=request.product_id, + backtest_id=response.backtest_id, + n_folds=len(response.main_model_results.fold_results), + duration_ms=duration_ms, + ) + + return response + + except ValueError as e: + logger.warning( + "backtesting.request_failed", + store_id=request.store_id, + product_id=request.product_id, + error=str(e), + error_type=type(e).__name__, + ) + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=str(e), + ) from e + + except SQLAlchemyError as e: + logger.error( + "backtesting.request_failed", + store_id=request.store_id, + product_id=request.product_id, + error=str(e), + error_type=type(e).__name__, + exc_info=True, + ) + raise DatabaseError( + message="Failed to run backtest", + details={"error": str(e)}, + ) from e diff --git a/app/features/backtesting/schemas.py b/app/features/backtesting/schemas.py new file mode 100644 index 00000000..205f8547 --- /dev/null +++ b/app/features/backtesting/schemas.py @@ -0,0 +1,250 @@ +"""Pydantic schemas for backtesting configuration and API contracts. 
+ +Schemas are designed to be: +- Immutable (frozen=True) for reproducibility +- Versioned (schema_version) for registry storage +- Hashable (config_hash) for deduplication +""" + +from __future__ import annotations + +import hashlib +from datetime import date as date_type +from typing import Annotated, Literal + +from pydantic import BaseModel, ConfigDict, Field, field_validator + +from app.features.forecasting.schemas import ModelConfig + +# ============================================================================= +# Split Configuration +# ============================================================================= + + +class SplitConfig(BaseModel): + """Configuration for time-series splitting. + + Attributes: + strategy: 'expanding' grows training window; 'sliding' keeps fixed size. + n_splits: Number of CV folds (2-20). + min_train_size: Minimum training samples required. + gap: Gap days between train end and test start (simulates data latency). + horizon: Forecast horizon per fold. 
+ """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + strategy: Literal["expanding", "sliding"] = Field( + default="expanding", + description="Expanding grows training window; sliding keeps fixed size", + ) + n_splits: int = Field( + default=5, + ge=2, + le=20, + description="Number of CV folds", + ) + min_train_size: int = Field( + default=30, + ge=7, + description="Minimum training samples", + ) + gap: int = Field( + default=0, + ge=0, + le=30, + description="Gap between train end and test start", + ) + horizon: int = Field( + default=14, + ge=1, + le=90, + description="Forecast horizon per fold", + ) + + @field_validator("horizon") + @classmethod + def validate_horizon_vs_gap(cls, v: int, info: object) -> int: + """Ensure horizon is strictly greater than gap.""" + data = getattr(info, "data", {}) + gap = data.get("gap", 0) + if gap is not None and v <= gap: + raise ValueError(f"horizon ({v}) must be greater than gap ({gap})") + return v + + +# ============================================================================= +# Backtest Configuration +# ============================================================================= + + +class BacktestConfig(BaseModel): + """Complete backtest configuration. + + Attributes: + schema_version: Semantic version of this config schema. + split_config: Configuration for time-series splitting. + model_config_main: The model configuration to evaluate. + include_baselines: Whether to include naive/seasonal_naive benchmarks. + store_fold_details: Whether to store per-fold actuals/predictions. 
+ """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + schema_version: str = Field( + default="1.0", + description="Semantic version of this config schema", + pattern=r"^\d+\.\d+(\.\d+)?$", + ) + split_config: SplitConfig = Field(default_factory=SplitConfig) + model_config_main: Annotated[ModelConfig, Field(discriminator="model_type")] + include_baselines: bool = Field( + default=True, + description="Include naive/seasonal benchmarks", + ) + store_fold_details: bool = Field( + default=True, + description="Store per-fold actuals/predictions", + ) + + def config_hash(self) -> str: + """Generate deterministic hash of configuration. + + Returns: + 16-character hex string hash of config JSON. + """ + config_json = self.model_dump_json() + return hashlib.sha256(config_json.encode()).hexdigest()[:16] + + +# ============================================================================= +# Split Boundary and Fold Results +# ============================================================================= + + +class SplitBoundary(BaseModel): + """Boundary dates for a single CV split. + + Attributes: + fold_index: Index of the fold (0-based). + train_start: Start date of training period. + train_end: End date of training period. + test_start: Start date of test period. + test_end: End date of test period. + train_size: Number of training samples. + test_size: Number of test samples. + """ + + fold_index: int + train_start: date_type + train_end: date_type + test_start: date_type + test_end: date_type + train_size: int + test_size: int + + +class FoldResult(BaseModel): + """Results for a single backtest fold. + + Attributes: + fold_index: Index of the fold (0-based). + split: Split boundary information. + dates: List of dates in the test period. + actuals: Actual values for the test period. + predictions: Predicted values for the test period. + metrics: Dictionary of metric names to values. 
+ """ + + fold_index: int + split: SplitBoundary + dates: list[date_type] + actuals: list[float] + predictions: list[float] + metrics: dict[str, float] + + +class ModelBacktestResult(BaseModel): + """Backtest results for a single model. + + Attributes: + model_type: Type of the model. + config_hash: Hash of the model configuration. + fold_results: Results for each fold. + aggregated_metrics: Mean metrics across folds. + metric_std: Standard deviation of metrics across folds. + """ + + model_type: str + config_hash: str + fold_results: list[FoldResult] + aggregated_metrics: dict[str, float] + metric_std: dict[str, float] + + +# ============================================================================= +# API Request/Response Schemas +# ============================================================================= + + +class BacktestRequest(BaseModel): + """Request body for POST /backtesting/run. + + Attributes: + store_id: Store ID to run backtest for. + product_id: Product ID to run backtest for. + start_date: Start date of the data range. + end_date: End date of the data range. + config: Backtest configuration. + """ + + model_config = ConfigDict(strict=True) + + store_id: int = Field(..., ge=1, description="Store ID") + product_id: int = Field(..., ge=1, description="Product ID") + start_date: date_type = Field( + ..., + description="Start date of data range", + ) + end_date: date_type = Field( + ..., + description="End date of data range", + ) + config: BacktestConfig + + @field_validator("end_date") + @classmethod + def validate_date_range(cls, v: date_type, info: object) -> date_type: + """Ensure end_date is after start_date.""" + data = getattr(info, "data", {}) + if "start_date" in data and v <= data["start_date"]: + raise ValueError("end_date must be after start_date") + return v + + +class BacktestResponse(BaseModel): + """Response body for POST /backtesting/run. + + Attributes: + backtest_id: Unique identifier for this backtest run. 
+ store_id: Store ID the backtest was run for. + product_id: Product ID the backtest was run for. + config_hash: Hash of the backtest configuration. + split_config: Split configuration used. + main_model_results: Results for the main model. + baseline_results: Results for baseline models (if included). + comparison_summary: Summary comparing main model to baselines. + duration_ms: Total duration in milliseconds. + leakage_check_passed: Whether leakage sanity checks passed. + """ + + backtest_id: str + store_id: int + product_id: int + config_hash: str + split_config: SplitConfig + main_model_results: ModelBacktestResult + baseline_results: list[ModelBacktestResult] | None = None + comparison_summary: dict[str, dict[str, float]] | None = None + duration_ms: float + leakage_check_passed: bool diff --git a/app/features/backtesting/service.py b/app/features/backtesting/service.py new file mode 100644 index 00000000..4a72118b --- /dev/null +++ b/app/features/backtesting/service.py @@ -0,0 +1,438 @@ +"""Backtesting service for model evaluation. + +Orchestrates: +- Loading time series data from database +- Generating time-based CV splits +- Training and predicting with models per fold +- Calculating metrics and aggregating results +- Running baseline comparisons + +CRITICAL: All operations respect time-safety constraints. 
+""" + +from __future__ import annotations + +import time +import uuid +from dataclasses import dataclass, field +from datetime import date as date_type +from typing import TYPE_CHECKING, Any + +import numpy as np +import structlog +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.config import get_settings +from app.features.backtesting.metrics import MetricsCalculator +from app.features.backtesting.schemas import ( + BacktestConfig, + BacktestResponse, + FoldResult, + ModelBacktestResult, + SplitBoundary, +) +from app.features.backtesting.splitter import TimeSeriesSplitter +from app.features.data_platform.models import SalesDaily +from app.features.forecasting.models import model_factory +from app.features.forecasting.schemas import ( + ModelConfig, + NaiveModelConfig, + SeasonalNaiveModelConfig, +) + +if TYPE_CHECKING: + pass + +logger = structlog.get_logger() + + +@dataclass +class SeriesData: + """Container for loaded time series data. + + Attributes: + dates: List of dates in chronological order. + values: Target values as numpy array. + store_id: Store ID. + product_id: Product ID. + n_observations: Number of observations. + """ + + dates: list[date_type] + values: np.ndarray[Any, np.dtype[np.floating[Any]]] + store_id: int + product_id: int + n_observations: int = field(init=False) + + def __post_init__(self) -> None: + """Compute derived fields.""" + self.n_observations = len(self.values) + + +class BacktestingService: + """Service for running backtests on forecasting models. + + Provides orchestration layer for: + - Loading time series data from database + - Generating time-based CV splits + - Training and predicting per fold + - Computing and aggregating metrics + - Running mandatory baseline comparisons + + CRITICAL: All operations use Settings for reproducibility. 
+ """ + + def __init__(self) -> None: + """Initialize the backtesting service.""" + self.settings = get_settings() + self.metrics_calculator = MetricsCalculator() + + async def run_backtest( + self, + db: AsyncSession, + store_id: int, + product_id: int, + start_date: date_type, + end_date: date_type, + config: BacktestConfig, + ) -> BacktestResponse: + """Run a complete backtest for a single series. + + Args: + db: Database session. + store_id: Store ID to backtest. + product_id: Product ID to backtest. + start_date: Start date of data range. + end_date: End date of data range. + config: Backtest configuration. + + Returns: + BacktestResponse with all results. + + Raises: + ValueError: If insufficient data for requested splits. + """ + start_time = time.perf_counter() + backtest_id = uuid.uuid4().hex[:16] + + logger.info( + "backtesting.run_started", + backtest_id=backtest_id, + store_id=store_id, + product_id=product_id, + start_date=str(start_date), + end_date=str(end_date), + config_hash=config.config_hash(), + model_type=config.model_config_main.model_type, + strategy=config.split_config.strategy, + n_splits=config.split_config.n_splits, + ) + + # Load series data + series_data = await self._load_series_data( + db=db, + store_id=store_id, + product_id=product_id, + start_date=start_date, + end_date=end_date, + ) + + if series_data.n_observations == 0: + raise ValueError( + f"No data found for store={store_id}, product={product_id} " + f"between {start_date} and {end_date}" + ) + + # Create splitter and validate + splitter = TimeSeriesSplitter(config.split_config) + + # Run main model backtest + main_results = self._run_model_backtest( + series_data=series_data, + splitter=splitter, + model_config=config.model_config_main, + store_fold_details=config.store_fold_details, + ) + + # Run baseline comparisons if requested + baseline_results: list[ModelBacktestResult] | None = None + comparison_summary: dict[str, dict[str, float]] | None = None + + if 
config.include_baselines: + baseline_results = self._run_baseline_comparisons( + series_data=series_data, + splitter=splitter, + store_fold_details=config.store_fold_details, + ) + comparison_summary = self._generate_comparison_summary( + main_results=main_results, + baseline_results=baseline_results, + ) + + # Validate no leakage + leakage_check_passed = splitter.validate_no_leakage( + dates=series_data.dates, + y=series_data.values, + ) + + duration_ms = (time.perf_counter() - start_time) * 1000 + + logger.info( + "backtesting.run_completed", + backtest_id=backtest_id, + store_id=store_id, + product_id=product_id, + n_folds=len(main_results.fold_results), + main_model_mae=main_results.aggregated_metrics.get("mae"), + leakage_check_passed=leakage_check_passed, + duration_ms=duration_ms, + ) + + return BacktestResponse( + backtest_id=backtest_id, + store_id=store_id, + product_id=product_id, + config_hash=config.config_hash(), + split_config=config.split_config, + main_model_results=main_results, + baseline_results=baseline_results, + comparison_summary=comparison_summary, + duration_ms=duration_ms, + leakage_check_passed=leakage_check_passed, + ) + + def _run_model_backtest( + self, + series_data: SeriesData, + splitter: TimeSeriesSplitter, + model_config: ModelConfig, + store_fold_details: bool, + ) -> ModelBacktestResult: + """Run backtest for a single model configuration. + + Args: + series_data: Loaded time series data. + splitter: Time series splitter. + model_config: Model configuration. + store_fold_details: Whether to store per-fold details. + + Returns: + ModelBacktestResult with all fold results. 
+ """ + fold_results: list[FoldResult] = [] + fold_metrics: list[dict[str, float]] = [] + + for split in splitter.split(series_data.dates, series_data.values): + # Extract train and test data + y_train = series_data.values[split.train_indices] + y_test = series_data.values[split.test_indices] + + # Create and fit model + model = model_factory(model_config, random_state=self.settings.forecast_random_seed) + model.fit(y_train) + + # Generate predictions + horizon = len(split.test_indices) + predictions = model.predict(horizon) + + # Calculate metrics + metrics = self.metrics_calculator.calculate_all( + actuals=y_test, + predictions=predictions, + ) + fold_metrics.append(metrics) + + # Create fold result + split_boundary = SplitBoundary( + fold_index=split.fold_index, + train_start=split.train_dates[0], + train_end=split.train_dates[-1], + test_start=split.test_dates[0], + test_end=split.test_dates[-1], + train_size=len(split.train_indices), + test_size=len(split.test_indices), + ) + + if store_fold_details: + fold_result = FoldResult( + fold_index=split.fold_index, + split=split_boundary, + dates=split.test_dates, + actuals=[float(v) for v in y_test], + predictions=[float(v) for v in predictions], + metrics=metrics, + ) + else: + # Store minimal fold result without detailed arrays + fold_result = FoldResult( + fold_index=split.fold_index, + split=split_boundary, + dates=[], + actuals=[], + predictions=[], + metrics=metrics, + ) + + fold_results.append(fold_result) + + # Aggregate metrics + aggregated_metrics, metric_std = self.metrics_calculator.aggregate_fold_metrics( + fold_metrics + ) + + return ModelBacktestResult( + model_type=model_config.model_type, + config_hash=model_config.config_hash(), + fold_results=fold_results, + aggregated_metrics=aggregated_metrics, + metric_std=metric_std, + ) + + def _run_baseline_comparisons( + self, + series_data: SeriesData, + splitter: TimeSeriesSplitter, + store_fold_details: bool, + ) -> list[ModelBacktestResult]: + """Run 
backtests for baseline models. + + Args: + series_data: Loaded time series data. + splitter: Time series splitter. + store_fold_details: Whether to store per-fold details. + + Returns: + List of ModelBacktestResult for each baseline. + """ + baselines: list[ModelConfig] = [ + NaiveModelConfig(), + SeasonalNaiveModelConfig(season_length=7), + ] + + results: list[ModelBacktestResult] = [] + + for baseline_config in baselines: + try: + result = self._run_model_backtest( + series_data=series_data, + splitter=splitter, + model_config=baseline_config, + store_fold_details=store_fold_details, + ) + results.append(result) + except ValueError as e: + # Log warning but continue with other baselines + logger.warning( + "backtesting.baseline_failed", + model_type=baseline_config.model_type, + error=str(e), + ) + + return results + + def _generate_comparison_summary( + self, + main_results: ModelBacktestResult, + baseline_results: list[ModelBacktestResult], + ) -> dict[str, dict[str, float]]: + """Generate summary comparing main model to baselines. + + Args: + main_results: Results for the main model. + baseline_results: Results for baseline models. + + Returns: + Dictionary with comparison metrics. 
+ Keys are metric names, values are dicts with: + - main: Main model value + - naive: Naive baseline value (if available) + - seasonal_naive: Seasonal naive value (if available) + - vs_naive_pct: Percentage improvement over naive + - vs_seasonal_pct: Percentage improvement over seasonal + """ + summary: dict[str, dict[str, float]] = {} + + # Get baseline values by type + baseline_by_type: dict[str, dict[str, float]] = {} + for result in baseline_results: + baseline_by_type[result.model_type] = result.aggregated_metrics + + # Compare each metric + for metric_name, main_value in main_results.aggregated_metrics.items(): + comparison: dict[str, float] = {"main": main_value} + + # Add baseline values and compute improvements + if "naive" in baseline_by_type: + naive_value = baseline_by_type["naive"].get(metric_name, np.nan) + comparison["naive"] = naive_value + if not np.isnan(naive_value) and naive_value != 0: + # Negative improvement means main is worse + comparison["vs_naive_pct"] = ((naive_value - main_value) / naive_value) * 100 + + if "seasonal_naive" in baseline_by_type: + seasonal_value = baseline_by_type["seasonal_naive"].get(metric_name, np.nan) + comparison["seasonal_naive"] = seasonal_value + if not np.isnan(seasonal_value) and seasonal_value != 0: + comparison["vs_seasonal_pct"] = ( + (seasonal_value - main_value) / seasonal_value + ) * 100 + + summary[metric_name] = comparison + + return summary + + async def _load_series_data( + self, + db: AsyncSession, + store_id: int, + product_id: int, + start_date: date_type, + end_date: date_type, + ) -> SeriesData: + """Load time series data from database. + + Args: + db: Database session. + store_id: Store ID. + product_id: Product ID. + start_date: Start date (inclusive). + end_date: End date (inclusive). + + Returns: + SeriesData container with loaded data. 
+ """ + stmt = ( + select( + SalesDaily.date, + SalesDaily.quantity, + ) + .where( + (SalesDaily.store_id == store_id) + & (SalesDaily.product_id == product_id) + & (SalesDaily.date >= start_date) + & (SalesDaily.date <= end_date) + ) + .order_by(SalesDaily.date) + ) + + result = await db.execute(stmt) + rows = result.all() + + if not rows: + return SeriesData( + dates=[], + values=np.array([], dtype=np.float64), + store_id=store_id, + product_id=product_id, + ) + + dates = [row.date for row in rows] + values = np.array([float(row.quantity) for row in rows], dtype=np.float64) + + return SeriesData( + dates=dates, + values=values, + store_id=store_id, + product_id=product_id, + ) diff --git a/app/features/backtesting/splitter.py b/app/features/backtesting/splitter.py new file mode 100644 index 00000000..b8d3da84 --- /dev/null +++ b/app/features/backtesting/splitter.py @@ -0,0 +1,226 @@ +"""Time-series splitter for backtesting cross-validation. + +CRITICAL: Respects temporal order - no future data in training. + +Supports two strategies: +- Expanding: Training window grows with each fold (start stays at 0) +- Sliding: Training window slides forward (both start and end move) + +Gap parameter simulates operational data latency. +""" + +from __future__ import annotations + +from collections.abc import Iterator +from dataclasses import dataclass +from datetime import date as date_type +from typing import TYPE_CHECKING, Any + +import numpy as np + +from app.features.backtesting.schemas import SplitBoundary, SplitConfig + +if TYPE_CHECKING: + pass + + +@dataclass +class TimeSeriesSplit: + """A single train/test split with indices and dates. + + Attributes: + fold_index: Index of the fold (0-based). + train_indices: Numpy array of training indices. + test_indices: Numpy array of test indices. + train_dates: List of training dates. + test_dates: List of test dates. 
+ """ + + fold_index: int + train_indices: np.ndarray[Any, np.dtype[np.intp]] + test_indices: np.ndarray[Any, np.dtype[np.intp]] + train_dates: list[date_type] + test_dates: list[date_type] + + +class TimeSeriesSplitter: + """Generate time-based CV splits with expanding or sliding window. + + CRITICAL: Respects temporal order - no future data in training. + + Expanding Window Example (n_splits=3, min_train=30, horizon=14): + Fold 0: [0..30] train, [30..44] test + Fold 1: [0..44] train, [44..58] test (training grows) + Fold 2: [0..58] train, [58..72] test + + Sliding Window Example (n_splits=3, min_train=30, horizon=14): + Fold 0: [0..30] train, [30..44] test + Fold 1: [14..44] train, [44..58] test (training slides) + Fold 2: [28..58] train, [58..72] test + + Gap Parameter: + gap=1 inserts 1 sample between train_end and test_start + This simulates operational data latency + + Attributes: + config: Split configuration. + """ + + def __init__(self, config: SplitConfig) -> None: + """Initialize the splitter. + + Args: + config: Split configuration. + """ + self.config = config + + def split( + self, + dates: list[date_type], + y: np.ndarray[Any, np.dtype[np.floating[Any]]], + ) -> Iterator[TimeSeriesSplit]: + """Generate train/test splits. + + Args: + dates: Sorted list of dates (must match y length). + y: Target values array. + + Yields: + TimeSeriesSplit objects for each fold. + + Raises: + ValueError: If data is insufficient for requested splits. + """ + n_samples = len(dates) + min_required = self.config.min_train_size + self.config.gap + self.config.horizon + + if n_samples < min_required: + raise ValueError( + f"Need at least {min_required} samples, got {n_samples}. 
" + f"(min_train={self.config.min_train_size}, gap={self.config.gap}, " + f"horizon={self.config.horizon})" + ) + + if len(y) != n_samples: + raise ValueError(f"dates and y must have same length: {n_samples} vs {len(y)}") + + test_size = self.config.horizon + n_splits = self.config.n_splits + gap = self.config.gap + + # Calculate available space for test sets + # We need: min_train_size + gap + (n_splits * test_size) + total_needed = self.config.min_train_size + gap + (n_splits * test_size) + + if n_samples < total_needed: + # Reduce number of splits if not enough data + available_for_tests = n_samples - self.config.min_train_size - gap + actual_splits = max(1, available_for_tests // test_size) + n_splits = min(n_splits, actual_splits) + + # Calculate step size between folds + # For expanding: step moves the test window forward + # For sliding: step moves both train and test windows forward + if n_splits > 1: + # Total space available for test windows after first fold + available_space = n_samples - self.config.min_train_size - gap - test_size + step = max(1, available_space // (n_splits - 1)) + else: + step = test_size + + for fold_idx in range(n_splits): + if self.config.strategy == "expanding": + # Expanding: training always starts at 0 + train_start_idx = 0 + train_end_idx = self.config.min_train_size + (fold_idx * step) + else: + # Sliding: training window moves forward + train_start_idx = fold_idx * step + train_end_idx = train_start_idx + self.config.min_train_size + + # Test starts after gap from train end + test_start_idx = train_end_idx + gap + test_end_idx = test_start_idx + test_size + + # Bounds check + if test_end_idx > n_samples: + break + + if train_end_idx > n_samples: + break + + yield TimeSeriesSplit( + fold_index=fold_idx, + train_indices=np.arange(train_start_idx, train_end_idx), + test_indices=np.arange(test_start_idx, test_end_idx), + train_dates=dates[train_start_idx:train_end_idx], + test_dates=dates[test_start_idx:test_end_idx], + ) + + 
def get_boundaries( + self, + dates: list[date_type], + y: np.ndarray[Any, np.dtype[np.floating[Any]]], + ) -> list[SplitBoundary]: + """Get split boundaries without full split objects. + + Args: + dates: Sorted list of dates. + y: Target values array. + + Returns: + List of SplitBoundary objects. + """ + boundaries: list[SplitBoundary] = [] + for split in self.split(dates, y): + boundaries.append( + SplitBoundary( + fold_index=split.fold_index, + train_start=split.train_dates[0], + train_end=split.train_dates[-1], + test_start=split.test_dates[0], + test_end=split.test_dates[-1], + train_size=len(split.train_indices), + test_size=len(split.test_indices), + ) + ) + return boundaries + + def validate_no_leakage( + self, + dates: list[date_type], + y: np.ndarray[Any, np.dtype[np.floating[Any]]], + ) -> bool: + """Validate that no future data leaks into training. + + Checks that for all folds: + 1. train_end < test_start + 2. Gap is respected + 3. No overlap between train and test indices + + Args: + dates: Sorted list of dates. + y: Target values array. + + Returns: + True if no leakage detected, False otherwise. 
+ """ + for split in self.split(dates, y): + # Check train_end < test_start + if split.train_dates[-1] >= split.test_dates[0]: + return False + + # Check gap is respected + train_end_idx = split.train_indices[-1] + test_start_idx = split.test_indices[0] + actual_gap = test_start_idx - train_end_idx - 1 + if actual_gap < self.config.gap: + return False + + # Check no overlap + train_set = set(split.train_indices.tolist()) + test_set = set(split.test_indices.tolist()) + if train_set & test_set: + return False + + return True diff --git a/app/features/backtesting/tests/__init__.py b/app/features/backtesting/tests/__init__.py new file mode 100644 index 00000000..a52cd9cc --- /dev/null +++ b/app/features/backtesting/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for backtesting module.""" diff --git a/app/features/backtesting/tests/conftest.py b/app/features/backtesting/tests/conftest.py new file mode 100644 index 00000000..519738af --- /dev/null +++ b/app/features/backtesting/tests/conftest.py @@ -0,0 +1,111 @@ +"""Test fixtures for backtesting module.""" + +from datetime import date, timedelta + +import numpy as np +import pytest + +from app.features.backtesting.schemas import BacktestConfig, SplitConfig +from app.features.forecasting.schemas import NaiveModelConfig, SeasonalNaiveModelConfig + + +@pytest.fixture +def sample_dates_120() -> list[date]: + """Create 120 consecutive dates starting from 2024-01-01.""" + start = date(2024, 1, 1) + return [start + timedelta(days=i) for i in range(120)] + + +@pytest.fixture +def sample_values_120() -> np.ndarray: + """Create 120 sequential values (1, 2, 3, ..., 120).""" + return np.array(range(1, 121), dtype=np.float64) + + +@pytest.fixture +def sample_dates_84() -> list[date]: + """Create 84 consecutive dates (12 weeks) starting from 2024-01-01.""" + start = date(2024, 1, 1) + return [start + timedelta(days=i) for i in range(84)] + + +@pytest.fixture +def sample_seasonal_values_84() -> np.ndarray: + """Create 84 values with weekly 
pattern (12 weeks). + + Pattern: [10, 20, 30, 40, 50, 60, 70] repeated 12 times. + """ + weekly_pattern = np.array([10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0]) + return np.tile(weekly_pattern, 12) + + +@pytest.fixture +def sample_split_config_expanding() -> SplitConfig: + """Create a SplitConfig with expanding window strategy.""" + return SplitConfig( + strategy="expanding", + n_splits=5, + min_train_size=30, + gap=0, + horizon=14, + ) + + +@pytest.fixture +def sample_split_config_sliding() -> SplitConfig: + """Create a SplitConfig with sliding window strategy.""" + return SplitConfig( + strategy="sliding", + n_splits=5, + min_train_size=30, + gap=0, + horizon=14, + ) + + +@pytest.fixture +def sample_split_config_with_gap() -> SplitConfig: + """Create a SplitConfig with gap between train and test.""" + return SplitConfig( + strategy="expanding", + n_splits=3, + min_train_size=30, + gap=7, + horizon=14, + ) + + +@pytest.fixture +def sample_naive_config() -> NaiveModelConfig: + """Create a naive model configuration.""" + return NaiveModelConfig() + + +@pytest.fixture +def sample_seasonal_config() -> SeasonalNaiveModelConfig: + """Create a seasonal naive model configuration.""" + return SeasonalNaiveModelConfig(season_length=7) + + +@pytest.fixture +def sample_backtest_config_naive(sample_split_config_expanding: SplitConfig) -> BacktestConfig: + """Create a BacktestConfig with naive model.""" + return BacktestConfig( + split_config=sample_split_config_expanding, + model_config_main=NaiveModelConfig(), + include_baselines=True, + store_fold_details=True, + ) + + +@pytest.fixture +def sample_backtest_config_no_baselines( + sample_split_config_expanding: SplitConfig, +) -> BacktestConfig: + """Create a BacktestConfig without baselines.""" + return BacktestConfig( + split_config=sample_split_config_expanding, + model_config_main=NaiveModelConfig(), + include_baselines=False, + store_fold_details=True, + ) diff --git a/app/features/backtesting/tests/test_metrics.py 
b/app/features/backtesting/tests/test_metrics.py new file mode 100644 index 00000000..80d85b87 --- /dev/null +++ b/app/features/backtesting/tests/test_metrics.py @@ -0,0 +1,378 @@ +"""Tests for backtesting metrics calculator.""" + +import math + +import numpy as np +import pytest + +from app.features.backtesting.metrics import MetricsCalculator + + +class TestMAE: + """Tests for Mean Absolute Error calculation.""" + + def test_mae_perfect_predictions(self) -> None: + """Test MAE is 0 for perfect predictions.""" + calc = MetricsCalculator() + actuals = np.array([10.0, 20.0, 30.0]) + predictions = np.array([10.0, 20.0, 30.0]) + + result = calc.mae(actuals, predictions) + assert result.value == 0.0 + + def test_mae_known_values(self) -> None: + """Test MAE with known values.""" + calc = MetricsCalculator() + actuals = np.array([10.0, 20.0, 30.0]) + predictions = np.array([12.0, 18.0, 33.0]) + + # |10-12| + |20-18| + |30-33| = 2 + 2 + 3 = 7 + # MAE = 7/3 = 2.333... + result = calc.mae(actuals, predictions) + assert result.value == pytest.approx(7 / 3) + + def test_mae_negative_errors(self) -> None: + """Test MAE handles negative errors correctly.""" + calc = MetricsCalculator() + actuals = np.array([10.0, 20.0]) + predictions = np.array([15.0, 15.0]) # Over and under + + # |10-15| + |20-15| = 5 + 5 = 10 + # MAE = 10/2 = 5 + result = calc.mae(actuals, predictions) + assert result.value == 5.0 + + def test_mae_n_samples(self) -> None: + """Test MAE returns correct n_samples.""" + calc = MetricsCalculator() + actuals = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + predictions = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + + result = calc.mae(actuals, predictions) + assert result.n_samples == 5 + + +class TestSMAPE: + """Tests for Symmetric Mean Absolute Percentage Error calculation.""" + + def test_smape_perfect_predictions(self) -> None: + """Test sMAPE is 0 for perfect predictions.""" + calc = MetricsCalculator() + actuals = np.array([10.0, 20.0, 30.0]) + predictions = np.array([10.0, 
20.0, 30.0]) + + result = calc.smape(actuals, predictions) + assert result.value == 0.0 + + def test_smape_known_values(self) -> None: + """Test sMAPE with known values.""" + calc = MetricsCalculator() + actuals = np.array([100.0]) + predictions = np.array([80.0]) + + # |100-80| / (|100|+|80|) * 200 = 20/180 * 200 = 22.22... + result = calc.smape(actuals, predictions) + expected = (20 / 180) * 200 + assert result.value == pytest.approx(expected) + + def test_smape_range_0_to_200(self) -> None: + """Test sMAPE is in range 0-200.""" + calc = MetricsCalculator() + actuals = np.array([100.0, 50.0, 25.0]) + predictions = np.array([0.0, 100.0, 0.0]) # Extreme predictions + + result = calc.smape(actuals, predictions) + assert 0 <= result.value <= 200 + + def test_smape_both_zero_returns_zero(self) -> None: + """Test sMAPE returns 0 when both actual and prediction are 0.""" + calc = MetricsCalculator() + actuals = np.array([0.0, 10.0, 0.0]) + predictions = np.array([0.0, 10.0, 0.0]) + + result = calc.smape(actuals, predictions) + assert result.value == 0.0 + + def test_smape_actual_zero_pred_nonzero(self) -> None: + """Test sMAPE when actual is 0 but prediction is not.""" + calc = MetricsCalculator() + actuals = np.array([0.0]) + predictions = np.array([10.0]) + + # |0-10| / (|0|+|10|) * 200 = 10/10 * 200 = 200 + result = calc.smape(actuals, predictions) + assert result.value == 200.0 + + def test_smape_symmetric(self) -> None: + """Test sMAPE is symmetric (actual/pred interchangeable).""" + calc = MetricsCalculator() + actuals1 = np.array([100.0]) + predictions1 = np.array([80.0]) + + actuals2 = np.array([80.0]) + predictions2 = np.array([100.0]) + + result1 = calc.smape(actuals1, predictions1) + result2 = calc.smape(actuals2, predictions2) + + assert result1.value == pytest.approx(result2.value) + + +class TestWAPE: + """Tests for Weighted Absolute Percentage Error calculation.""" + + def test_wape_perfect_predictions(self) -> None: + """Test WAPE is 0 for perfect 
predictions.""" + calc = MetricsCalculator() + actuals = np.array([10.0, 20.0, 30.0]) + predictions = np.array([10.0, 20.0, 30.0]) + + result = calc.wape(actuals, predictions) + assert result.value == 0.0 + + def test_wape_known_values(self) -> None: + """Test WAPE with known values.""" + calc = MetricsCalculator() + actuals = np.array([100.0, 200.0]) + predictions = np.array([90.0, 220.0]) + + # sum(|errors|) / sum(|actuals|) * 100 + # (10 + 20) / 300 * 100 = 10% + result = calc.wape(actuals, predictions) + assert result.value == pytest.approx(10.0) + + def test_wape_zero_actuals_returns_inf(self) -> None: + """Test WAPE returns inf when sum of actuals is zero.""" + calc = MetricsCalculator() + actuals = np.array([0.0, 0.0, 0.0]) + predictions = np.array([1.0, 2.0, 3.0]) + + result = calc.wape(actuals, predictions) + assert math.isinf(result.value) + assert len(result.warnings) > 0 + + def test_wape_weighted_properly(self) -> None: + """Test WAPE weights larger actuals more heavily.""" + calc = MetricsCalculator() + # Same absolute error (10) but different actuals + actuals = np.array([10.0, 100.0]) + predictions = np.array([0.0, 90.0]) + + # sum(|errors|) / sum(|actuals|) * 100 + # (10 + 10) / 110 * 100 = 18.18% + result = calc.wape(actuals, predictions) + assert result.value == pytest.approx(20 / 110 * 100) + + +class TestBias: + """Tests for Forecast Bias calculation.""" + + def test_bias_no_bias(self) -> None: + """Test bias is 0 when over/under predictions cancel out.""" + calc = MetricsCalculator() + actuals = np.array([10.0, 20.0]) + predictions = np.array([15.0, 15.0]) # +5 and -5 cancel + + result = calc.bias(actuals, predictions) + assert result.value == pytest.approx(0.0) + + def test_bias_positive_under_forecast(self) -> None: + """Test positive bias indicates under-forecasting.""" + calc = MetricsCalculator() + actuals = np.array([100.0, 100.0]) + predictions = np.array([80.0, 80.0]) + + # Bias = mean(actuals - predictions) = mean(20, 20) = 20 + 
result = calc.bias(actuals, predictions) + assert result.value == 20.0 + + def test_bias_negative_over_forecast(self) -> None: + """Test negative bias indicates over-forecasting.""" + calc = MetricsCalculator() + actuals = np.array([100.0, 100.0]) + predictions = np.array([120.0, 120.0]) + + # Bias = mean(actuals - predictions) = mean(-20, -20) = -20 + result = calc.bias(actuals, predictions) + assert result.value == -20.0 + + +class TestCalculateAll: + """Tests for calculate_all method.""" + + def test_calculate_all_returns_all_metrics(self) -> None: + """Test calculate_all returns all expected metrics.""" + calc = MetricsCalculator() + actuals = np.array([10.0, 20.0, 30.0]) + predictions = np.array([12.0, 18.0, 33.0]) + + result = calc.calculate_all(actuals, predictions) + + assert "mae" in result + assert "smape" in result + assert "wape" in result + assert "bias" in result + + def test_calculate_all_values_consistent(self) -> None: + """Test calculate_all values match individual calculations.""" + calc = MetricsCalculator() + actuals = np.array([10.0, 20.0, 30.0]) + predictions = np.array([12.0, 18.0, 33.0]) + + all_metrics = calc.calculate_all(actuals, predictions) + + assert all_metrics["mae"] == calc.mae(actuals, predictions).value + assert all_metrics["smape"] == calc.smape(actuals, predictions).value + assert all_metrics["wape"] == calc.wape(actuals, predictions).value + assert all_metrics["bias"] == calc.bias(actuals, predictions).value + + +class TestAggregateFoldMetrics: + """Tests for aggregate_fold_metrics method.""" + + def test_aggregate_computes_mean(self) -> None: + """Test aggregation computes mean across folds.""" + calc = MetricsCalculator() + fold_metrics = [ + {"mae": 10.0, "smape": 20.0}, + {"mae": 20.0, "smape": 40.0}, + {"mae": 30.0, "smape": 60.0}, + ] + + aggregated, _ = calc.aggregate_fold_metrics(fold_metrics) + + assert aggregated["mae"] == pytest.approx(20.0) # mean of 10, 20, 30 + assert aggregated["smape"] == pytest.approx(40.0) # 
mean of 20, 40, 60 + + def test_aggregate_computes_stability(self) -> None: + """Test aggregation computes stability index (coefficient of variation).""" + calc = MetricsCalculator() + fold_metrics = [ + {"mae": 10.0}, + {"mae": 20.0}, + {"mae": 30.0}, + ] + + _, stability = calc.aggregate_fold_metrics(fold_metrics) + + # Stability = std/mean * 100 = sqrt(200/3)/20 * 100 ≈ 40.82% + expected_std = np.std([10.0, 20.0, 30.0]) + expected_mean = np.mean([10.0, 20.0, 30.0]) + expected_stability = (expected_std / expected_mean) * 100 + assert stability["mae_stability"] == pytest.approx(expected_stability) + + def test_aggregate_empty_folds(self) -> None: + """Test aggregation handles empty fold list.""" + calc = MetricsCalculator() + fold_metrics: list[dict[str, float]] = [] + + aggregated, std = calc.aggregate_fold_metrics(fold_metrics) + + assert aggregated == {} + assert std == {} + + def test_aggregate_single_fold(self) -> None: + """Test aggregation with single fold.""" + calc = MetricsCalculator() + fold_metrics = [{"mae": 15.0, "smape": 25.0}] + + aggregated, stability = calc.aggregate_fold_metrics(fold_metrics) + + assert aggregated["mae"] == 15.0 + assert aggregated["smape"] == 25.0 + # Single fold: stability_index returns nan (need at least 2 folds) + assert np.isnan(stability["mae_stability"]) + assert np.isnan(stability["smape_stability"]) + + +class TestStabilityIndex: + """Tests for stability index calculation.""" + + def test_stability_index_perfect_stability(self) -> None: + """Test stability index is 0 for identical values.""" + calc = MetricsCalculator() + values = [10.0, 10.0, 10.0, 10.0] + + result = calc.stability_index(values) + assert result.value == 0.0 + + def test_stability_index_known_cv(self) -> None: + """Test stability index with known coefficient of variation.""" + calc = MetricsCalculator() + # Values with known std and mean + values = [10.0, 20.0, 30.0] + # std ≈ 8.165, mean = 20 + # CV = 8.165 / 20 * 100 ≈ 40.82% + + result = 
calc.stability_index(values) + expected_cv = (np.std(values) / np.mean(values)) * 100 + assert result.value == pytest.approx(expected_cv) + + def test_stability_index_zero_mean(self) -> None: + """Test stability index handles zero mean.""" + calc = MetricsCalculator() + values = [-10.0, 0.0, 10.0] # mean = 0 + + result = calc.stability_index(values) + assert math.isinf(result.value) + assert len(result.warnings) > 0 + + def test_stability_higher_for_variable_data(self) -> None: + """Test higher stability index for more variable data.""" + calc = MetricsCalculator() + stable = [100.0, 101.0, 99.0, 100.0] + variable = [50.0, 100.0, 150.0, 200.0] + + stable_result = calc.stability_index(stable) + variable_result = calc.stability_index(variable) + + assert variable_result.value > stable_result.value + + +class TestEdgeCases: + """Tests for edge cases and error handling.""" + + def test_single_sample(self) -> None: + """Test metrics work with single sample.""" + calc = MetricsCalculator() + actuals = np.array([100.0]) + predictions = np.array([90.0]) + + result = calc.calculate_all(actuals, predictions) + + assert result["mae"] == 10.0 + assert result["bias"] == 10.0 + + def test_large_values(self) -> None: + """Test metrics handle large values correctly.""" + calc = MetricsCalculator() + actuals = np.array([1e9, 2e9, 3e9]) + predictions = np.array([1.1e9, 1.9e9, 3.1e9]) + + result = calc.calculate_all(actuals, predictions) + + # Should compute without overflow + assert not math.isnan(result["mae"]) + assert not math.isnan(result["smape"]) + + def test_small_values(self) -> None: + """Test metrics handle small values correctly.""" + calc = MetricsCalculator() + actuals = np.array([0.001, 0.002, 0.003]) + predictions = np.array([0.0011, 0.0019, 0.0031]) + + result = calc.calculate_all(actuals, predictions) + + # Should compute without underflow issues + assert not math.isnan(result["mae"]) + assert not math.isnan(result["smape"]) + + def 
test_mixed_positive_negative_actuals(self) -> None: + """Test metrics handle mixed positive/negative actuals.""" + calc = MetricsCalculator() + actuals = np.array([-10.0, 0.0, 10.0]) + predictions = np.array([-8.0, 2.0, 8.0]) + + # MAE should still work + mae_result = calc.mae(actuals, predictions) + assert mae_result.value == pytest.approx(2.0) # mean of |2|, |2|, |2| diff --git a/app/features/backtesting/tests/test_schemas.py b/app/features/backtesting/tests/test_schemas.py new file mode 100644 index 00000000..97c56fc3 --- /dev/null +++ b/app/features/backtesting/tests/test_schemas.py @@ -0,0 +1,285 @@ +"""Tests for backtesting schemas.""" + +import pytest +from pydantic import ValidationError + +from app.features.backtesting.schemas import ( + BacktestConfig, + BacktestRequest, + FoldResult, + ModelBacktestResult, + SplitBoundary, + SplitConfig, +) +from app.features.forecasting.schemas import NaiveModelConfig + + +class TestSplitConfig: + """Tests for SplitConfig schema.""" + + def test_default_values(self): + """Test SplitConfig has correct default values.""" + config = SplitConfig() + + assert config.strategy == "expanding" + assert config.n_splits == 5 + assert config.min_train_size == 30 + assert config.gap == 0 + assert config.horizon == 14 + + def test_expanding_strategy(self): + """Test expanding strategy is valid.""" + config = SplitConfig(strategy="expanding") + assert config.strategy == "expanding" + + def test_sliding_strategy(self): + """Test sliding strategy is valid.""" + config = SplitConfig(strategy="sliding") + assert config.strategy == "sliding" + + def test_invalid_strategy_raises(self): + """Test invalid strategy raises validation error.""" + with pytest.raises(ValidationError): + SplitConfig(strategy="random") # type: ignore + + def test_n_splits_minimum(self): + """Test n_splits must be at least 2.""" + with pytest.raises(ValidationError): + SplitConfig(n_splits=1) + + def test_n_splits_maximum(self): + """Test n_splits must be at most 
20.""" + with pytest.raises(ValidationError): + SplitConfig(n_splits=21) + + def test_min_train_size_minimum(self): + """Test min_train_size must be at least 7.""" + with pytest.raises(ValidationError): + SplitConfig(min_train_size=6) + + def test_gap_minimum(self): + """Test gap must be non-negative.""" + with pytest.raises(ValidationError): + SplitConfig(gap=-1) + + def test_gap_maximum(self): + """Test gap must be at most 30.""" + with pytest.raises(ValidationError): + SplitConfig(gap=31) + + def test_horizon_minimum(self): + """Test horizon must be at least 1.""" + with pytest.raises(ValidationError): + SplitConfig(horizon=0) + + def test_horizon_maximum(self): + """Test horizon must be at most 90.""" + with pytest.raises(ValidationError): + SplitConfig(horizon=91) + + def test_horizon_must_be_greater_than_gap(self): + """Test horizon must be greater than gap.""" + with pytest.raises(ValidationError) as exc_info: + SplitConfig(horizon=5, gap=5) + assert "horizon (5) must be greater than gap (5)" in str(exc_info.value) + + def test_horizon_greater_than_gap_valid(self): + """Test horizon > gap is valid.""" + config = SplitConfig(horizon=10, gap=5) + assert config.horizon == 10 + assert config.gap == 5 + + def test_frozen_config(self): + """Test SplitConfig is immutable.""" + config = SplitConfig() + with pytest.raises(ValidationError): + config.n_splits = 10 + + +class TestBacktestConfig: + """Tests for BacktestConfig schema.""" + + def test_default_values(self): + """Test BacktestConfig has correct default values.""" + config = BacktestConfig(model_config_main=NaiveModelConfig()) + + assert config.schema_version == "1.0" + assert config.include_baselines is True + assert config.store_fold_details is True + + def test_config_hash_determinism(self): + """Test config_hash is deterministic.""" + config1 = BacktestConfig(model_config_main=NaiveModelConfig()) + config2 = BacktestConfig(model_config_main=NaiveModelConfig()) + + assert config1.config_hash() == 
config2.config_hash() + + def test_config_hash_changes_with_config(self): + """Test config_hash changes when config changes.""" + config1 = BacktestConfig( + model_config_main=NaiveModelConfig(), + include_baselines=True, + ) + config2 = BacktestConfig( + model_config_main=NaiveModelConfig(), + include_baselines=False, + ) + + assert config1.config_hash() != config2.config_hash() + + def test_config_hash_length(self): + """Test config_hash has correct length.""" + config = BacktestConfig(model_config_main=NaiveModelConfig()) + assert len(config.config_hash()) == 16 + + def test_frozen_config(self): + """Test BacktestConfig is immutable.""" + config = BacktestConfig(model_config_main=NaiveModelConfig()) + with pytest.raises(ValidationError): + config.include_baselines = False + + def test_invalid_schema_version(self): + """Test invalid schema_version raises error.""" + with pytest.raises(ValidationError): + BacktestConfig( + model_config_main=NaiveModelConfig(), + schema_version="invalid", + ) + + def test_valid_schema_versions(self): + """Test various valid schema versions.""" + for version in ["1.0", "2.1", "10.20.30"]: + config = BacktestConfig( + model_config_main=NaiveModelConfig(), + schema_version=version, + ) + assert config.schema_version == version + + +class TestSplitBoundary: + """Tests for SplitBoundary schema.""" + + def test_split_boundary_creation(self): + """Test SplitBoundary creation.""" + from datetime import date + + boundary = SplitBoundary( + fold_index=0, + train_start=date(2024, 1, 1), + train_end=date(2024, 1, 30), + test_start=date(2024, 1, 31), + test_end=date(2024, 2, 13), + train_size=30, + test_size=14, + ) + + assert boundary.fold_index == 0 + assert boundary.train_size == 30 + assert boundary.test_size == 14 + + +class TestFoldResult: + """Tests for FoldResult schema.""" + + def test_fold_result_creation(self): + """Test FoldResult creation.""" + from datetime import date + + boundary = SplitBoundary( + fold_index=0, + 
train_start=date(2024, 1, 1), + train_end=date(2024, 1, 30), + test_start=date(2024, 1, 31), + test_end=date(2024, 2, 13), + train_size=30, + test_size=14, + ) + + result = FoldResult( + fold_index=0, + split=boundary, + dates=[date(2024, 1, 31), date(2024, 2, 1)], + actuals=[10.0, 20.0], + predictions=[12.0, 18.0], + metrics={"mae": 2.0, "smape": 10.0}, + ) + + assert result.fold_index == 0 + assert len(result.dates) == 2 + assert result.metrics["mae"] == 2.0 + + +class TestModelBacktestResult: + """Tests for ModelBacktestResult schema.""" + + def test_model_backtest_result_creation(self): + """Test ModelBacktestResult creation.""" + result = ModelBacktestResult( + model_type="naive", + config_hash="abc123", + fold_results=[], + aggregated_metrics={"mae": 5.0}, + metric_std={"mae_stability": 10.0}, + ) + + assert result.model_type == "naive" + assert result.aggregated_metrics["mae"] == 5.0 + + +class TestBacktestRequest: + """Tests for BacktestRequest schema.""" + + def test_valid_request(self): + """Test valid BacktestRequest.""" + from datetime import date + + request = BacktestRequest( + store_id=1, + product_id=1, + start_date=date(2024, 1, 1), + end_date=date(2024, 6, 30), + config=BacktestConfig(model_config_main=NaiveModelConfig()), + ) + + assert request.store_id == 1 + assert request.product_id == 1 + + def test_end_date_must_be_after_start_date(self): + """Test end_date must be after start_date.""" + from datetime import date + + with pytest.raises(ValidationError) as exc_info: + BacktestRequest( + store_id=1, + product_id=1, + start_date=date(2024, 6, 30), + end_date=date(2024, 1, 1), + config=BacktestConfig(model_config_main=NaiveModelConfig()), + ) + assert "end_date must be after start_date" in str(exc_info.value) + + def test_store_id_must_be_positive(self): + """Test store_id must be positive.""" + from datetime import date + + with pytest.raises(ValidationError): + BacktestRequest( + store_id=0, + product_id=1, + start_date=date(2024, 1, 1), + 
end_date=date(2024, 6, 30), + config=BacktestConfig(model_config_main=NaiveModelConfig()), + ) + + def test_product_id_must_be_positive(self): + """Test product_id must be positive.""" + from datetime import date + + with pytest.raises(ValidationError): + BacktestRequest( + store_id=1, + product_id=0, + start_date=date(2024, 1, 1), + end_date=date(2024, 6, 30), + config=BacktestConfig(model_config_main=NaiveModelConfig()), + ) diff --git a/app/features/backtesting/tests/test_service.py b/app/features/backtesting/tests/test_service.py new file mode 100644 index 00000000..2ed9bc62 --- /dev/null +++ b/app/features/backtesting/tests/test_service.py @@ -0,0 +1,548 @@ +"""Tests for backtesting service.""" + +from datetime import date, timedelta +from unittest.mock import AsyncMock, MagicMock + +import numpy as np +import pytest + +from app.features.backtesting.schemas import ( + BacktestConfig, + BacktestResponse, + SplitConfig, +) +from app.features.backtesting.service import BacktestingService, SeriesData +from app.features.forecasting.schemas import NaiveModelConfig, SeasonalNaiveModelConfig + + +class TestSeriesData: + """Tests for SeriesData dataclass.""" + + def test_series_data_creation(self) -> None: + """Test SeriesData creation and n_observations computation.""" + dates = [date(2024, 1, 1), date(2024, 1, 2), date(2024, 1, 3)] + values = np.array([10.0, 20.0, 30.0]) + + data = SeriesData( + dates=dates, + values=values, + store_id=1, + product_id=1, + ) + + assert data.n_observations == 3 + assert data.store_id == 1 + assert data.product_id == 1 + + def test_series_data_empty(self) -> None: + """Test SeriesData with empty data.""" + data = SeriesData( + dates=[], + values=np.array([], dtype=np.float64), + store_id=1, + product_id=1, + ) + + assert data.n_observations == 0 + + +class TestBacktestingServiceRunModelBacktest: + """Tests for _run_model_backtest method.""" + + def test_run_model_backtest_naive( + self, + sample_dates_120: list[date], + 
sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test running backtest with naive model.""" + service = BacktestingService() + + series_data = SeriesData( + dates=sample_dates_120, + values=sample_values_120, + store_id=1, + product_id=1, + ) + + from app.features.backtesting.splitter import TimeSeriesSplitter + + splitter = TimeSeriesSplitter(sample_split_config_expanding) + + result = service._run_model_backtest( + series_data=series_data, + splitter=splitter, + model_config=NaiveModelConfig(), + store_fold_details=True, + ) + + assert result.model_type == "naive" + assert len(result.fold_results) == sample_split_config_expanding.n_splits + assert "mae" in result.aggregated_metrics + assert "smape" in result.aggregated_metrics + + def test_run_model_backtest_without_fold_details( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test running backtest without storing fold details.""" + service = BacktestingService() + + series_data = SeriesData( + dates=sample_dates_120, + values=sample_values_120, + store_id=1, + product_id=1, + ) + + from app.features.backtesting.splitter import TimeSeriesSplitter + + splitter = TimeSeriesSplitter(sample_split_config_expanding) + + result = service._run_model_backtest( + series_data=series_data, + splitter=splitter, + model_config=NaiveModelConfig(), + store_fold_details=False, + ) + + # Fold results should have empty arrays + for fold in result.fold_results: + assert fold.dates == [] + assert fold.actuals == [] + assert fold.predictions == [] + # But metrics should still be present + assert fold.metrics is not None + + +class TestBacktestingServiceBaselineComparisons: + """Tests for baseline comparison functionality.""" + + def test_run_baseline_comparisons( + self, + sample_dates_84: list[date], + sample_seasonal_values_84: np.ndarray, + ) -> None: + """Test running baseline comparisons.""" + 
service = BacktestingService() + + series_data = SeriesData( + dates=sample_dates_84, + values=sample_seasonal_values_84, + store_id=1, + product_id=1, + ) + + config = SplitConfig( + strategy="expanding", + n_splits=3, + min_train_size=21, + gap=0, + horizon=7, + ) + + from app.features.backtesting.splitter import TimeSeriesSplitter + + splitter = TimeSeriesSplitter(config) + + results = service._run_baseline_comparisons( + series_data=series_data, + splitter=splitter, + store_fold_details=True, + ) + + # Should have naive and seasonal_naive baselines + model_types = [r.model_type for r in results] + assert "naive" in model_types + assert "seasonal_naive" in model_types + + def test_generate_comparison_summary( + self, + sample_dates_84: list[date], + sample_seasonal_values_84: np.ndarray, + ) -> None: + """Test comparison summary generation.""" + service = BacktestingService() + + series_data = SeriesData( + dates=sample_dates_84, + values=sample_seasonal_values_84, + store_id=1, + product_id=1, + ) + + config = SplitConfig( + strategy="expanding", + n_splits=3, + min_train_size=21, + gap=0, + horizon=7, + ) + + from app.features.backtesting.splitter import TimeSeriesSplitter + + splitter = TimeSeriesSplitter(config) + + main_results = service._run_model_backtest( + series_data=series_data, + splitter=splitter, + model_config=NaiveModelConfig(), + store_fold_details=True, + ) + + baseline_results = service._run_baseline_comparisons( + series_data=series_data, + splitter=splitter, + store_fold_details=True, + ) + + summary = service._generate_comparison_summary( + main_results=main_results, + baseline_results=baseline_results, + ) + + # Check summary structure + assert "mae" in summary + assert "main" in summary["mae"] + + # Check baseline comparisons are present + if "naive" in [r.model_type for r in baseline_results]: + assert "naive" in summary["mae"] + + def test_comparison_improvement_percentage(self) -> None: + """Test improvement percentage calculation.""" 
+ service = BacktestingService() + + from app.features.backtesting.schemas import ModelBacktestResult + + # Create mock results + main_results = ModelBacktestResult( + model_type="test_model", + config_hash="abc123", + fold_results=[], + aggregated_metrics={"mae": 10.0}, + metric_std={"mae_std": 1.0}, + ) + + baseline_results = [ + ModelBacktestResult( + model_type="naive", + config_hash="def456", + fold_results=[], + aggregated_metrics={"mae": 20.0}, # Naive is worse + metric_std={"mae_std": 2.0}, + ) + ] + + summary = service._generate_comparison_summary( + main_results=main_results, + baseline_results=baseline_results, + ) + + # Main model has MAE=10, naive has MAE=20 + # Improvement = (20-10)/20 * 100 = 50% + assert summary["mae"]["vs_naive_pct"] == pytest.approx(50.0) + + +class TestBacktestingServiceLoadData: + """Tests for _load_series_data method.""" + + @pytest.mark.asyncio + async def test_load_series_data_returns_empty_for_no_data(self) -> None: + """Test loading returns empty SeriesData when no data found.""" + service = BacktestingService() + + # Mock database session + mock_result = MagicMock() + mock_result.all.return_value = [] + + mock_db = AsyncMock() + mock_db.execute = AsyncMock(return_value=mock_result) + + data = await service._load_series_data( + db=mock_db, + store_id=999, + product_id=999, + start_date=date(2024, 1, 1), + end_date=date(2024, 12, 31), + ) + + assert data.n_observations == 0 + assert len(data.dates) == 0 + assert len(data.values) == 0 + + @pytest.mark.asyncio + async def test_load_series_data_with_rows(self) -> None: + """Test loading series data with mock rows.""" + service = BacktestingService() + + # Create mock rows + mock_rows = [ + type("Row", (), {"date": date(2024, 1, 1), "quantity": 100.0})(), + type("Row", (), {"date": date(2024, 1, 2), "quantity": 150.0})(), + type("Row", (), {"date": date(2024, 1, 3), "quantity": 200.0})(), + ] + + mock_result = MagicMock() + mock_result.all.return_value = mock_rows + + mock_db = 
AsyncMock() + mock_db.execute = AsyncMock(return_value=mock_result) + + data = await service._load_series_data( + db=mock_db, + store_id=1, + product_id=1, + start_date=date(2024, 1, 1), + end_date=date(2024, 1, 31), + ) + + assert data.n_observations == 3 + assert data.store_id == 1 + assert data.product_id == 1 + assert len(data.dates) == 3 + assert data.values[0] == 100.0 + + +class TestBacktestingServiceRunBacktest: + """Tests for run_backtest method.""" + + @pytest.mark.asyncio + async def test_run_backtest_no_data_raises(self) -> None: + """Test run_backtest raises ValueError when no data found.""" + service = BacktestingService() + + # Mock database returning no data + mock_result = MagicMock() + mock_result.all.return_value = [] + + mock_db = AsyncMock() + mock_db.execute = AsyncMock(return_value=mock_result) + + config = BacktestConfig( + split_config=SplitConfig(), + model_config_main=NaiveModelConfig(), + ) + + with pytest.raises(ValueError, match="No data found"): + await service.run_backtest( + db=mock_db, + store_id=1, + product_id=1, + start_date=date(2024, 1, 1), + end_date=date(2024, 12, 31), + config=config, + ) + + @pytest.mark.asyncio + async def test_run_backtest_returns_response(self) -> None: + """Test run_backtest returns BacktestResponse.""" + service = BacktestingService() + + # Create mock rows for 120 days + start = date(2024, 1, 1) + mock_rows = [ + type("Row", (), {"date": start + timedelta(days=i), "quantity": float(i + 1)})() + for i in range(120) + ] + + mock_result = MagicMock() + mock_result.all.return_value = mock_rows + + mock_db = AsyncMock() + mock_db.execute = AsyncMock(return_value=mock_result) + + config = BacktestConfig( + split_config=SplitConfig( + strategy="expanding", + n_splits=3, + min_train_size=30, + gap=0, + horizon=14, + ), + model_config_main=NaiveModelConfig(), + include_baselines=True, + store_fold_details=True, + ) + + response = await service.run_backtest( + db=mock_db, + store_id=1, + product_id=1, + 
start_date=date(2024, 1, 1), + end_date=date(2024, 4, 30), + config=config, + ) + + assert isinstance(response, BacktestResponse) + assert response.store_id == 1 + assert response.product_id == 1 + assert response.backtest_id is not None + assert len(response.main_model_results.fold_results) == 3 + assert response.baseline_results is not None + assert response.comparison_summary is not None + assert response.leakage_check_passed is True + + @pytest.mark.asyncio + async def test_run_backtest_without_baselines(self) -> None: + """Test run_backtest without baseline comparisons.""" + service = BacktestingService() + + # Create mock rows for 120 days + start = date(2024, 1, 1) + mock_rows = [ + type("Row", (), {"date": start + timedelta(days=i), "quantity": float(i + 1)})() + for i in range(120) + ] + + mock_result = MagicMock() + mock_result.all.return_value = mock_rows + + mock_db = AsyncMock() + mock_db.execute = AsyncMock(return_value=mock_result) + + config = BacktestConfig( + split_config=SplitConfig( + strategy="expanding", + n_splits=3, + min_train_size=30, + gap=0, + horizon=14, + ), + model_config_main=NaiveModelConfig(), + include_baselines=False, + store_fold_details=True, + ) + + response = await service.run_backtest( + db=mock_db, + store_id=1, + product_id=1, + start_date=date(2024, 1, 1), + end_date=date(2024, 4, 30), + config=config, + ) + + assert response.baseline_results is None + assert response.comparison_summary is None + + +class TestBacktestingServiceMetrics: + """Tests for metrics in backtest results.""" + + def test_fold_metrics_are_computed( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test that fold metrics are computed correctly.""" + service = BacktestingService() + + series_data = SeriesData( + dates=sample_dates_120, + values=sample_values_120, + store_id=1, + product_id=1, + ) + + from app.features.backtesting.splitter import TimeSeriesSplitter + 
+ splitter = TimeSeriesSplitter(sample_split_config_expanding) + + result = service._run_model_backtest( + series_data=series_data, + splitter=splitter, + model_config=NaiveModelConfig(), + store_fold_details=True, + ) + + # Check each fold has metrics + for fold in result.fold_results: + assert "mae" in fold.metrics + assert "smape" in fold.metrics + assert "wape" in fold.metrics + assert "bias" in fold.metrics + + def test_aggregated_metrics_include_stability( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test that aggregated metrics include stability index.""" + service = BacktestingService() + + series_data = SeriesData( + dates=sample_dates_120, + values=sample_values_120, + store_id=1, + product_id=1, + ) + + from app.features.backtesting.splitter import TimeSeriesSplitter + + splitter = TimeSeriesSplitter(sample_split_config_expanding) + + result = service._run_model_backtest( + series_data=series_data, + splitter=splitter, + model_config=NaiveModelConfig(), + store_fold_details=True, + ) + + # Check stability metrics exist + assert "mae_stability" in result.metric_std + assert "smape_stability" in result.metric_std + + +class TestBacktestingServiceSeasonalModel: + """Tests for seasonal model in backtesting.""" + + def test_seasonal_naive_on_seasonal_data( + self, + sample_dates_84: list[date], + sample_seasonal_values_84: np.ndarray, + ) -> None: + """Test seasonal naive performs well on seasonal data.""" + service = BacktestingService() + + series_data = SeriesData( + dates=sample_dates_84, + values=sample_seasonal_values_84, + store_id=1, + product_id=1, + ) + + config = SplitConfig( + strategy="expanding", + n_splits=3, + min_train_size=21, # 3 weeks minimum + gap=0, + horizon=7, + ) + + from app.features.backtesting.splitter import TimeSeriesSplitter + + splitter = TimeSeriesSplitter(config) + + # Run both naive and seasonal naive + naive_result = 
service._run_model_backtest( + series_data=series_data, + splitter=splitter, + model_config=NaiveModelConfig(), + store_fold_details=True, + ) + + seasonal_result = service._run_model_backtest( + series_data=series_data, + splitter=splitter, + model_config=SeasonalNaiveModelConfig(season_length=7), + store_fold_details=True, + ) + + # Seasonal naive should perform better on seasonal data + # (lower MAE) + assert seasonal_result.aggregated_metrics["mae"] < naive_result.aggregated_metrics["mae"] diff --git a/app/features/backtesting/tests/test_splitter.py b/app/features/backtesting/tests/test_splitter.py new file mode 100644 index 00000000..89b94bd2 --- /dev/null +++ b/app/features/backtesting/tests/test_splitter.py @@ -0,0 +1,348 @@ +"""Tests for time series splitter.""" + +from datetime import date, timedelta + +import numpy as np +import pytest + +from app.features.backtesting.schemas import SplitConfig +from app.features.backtesting.splitter import TimeSeriesSplitter + + +class TestTimeSeriesSplitterInit: + """Tests for TimeSeriesSplitter initialization.""" + + def test_init_with_expanding_strategy(self, sample_split_config_expanding: SplitConfig) -> None: + """Test splitter initialization with expanding strategy.""" + splitter = TimeSeriesSplitter(sample_split_config_expanding) + assert splitter.config.strategy == "expanding" + + def test_init_with_sliding_strategy(self, sample_split_config_sliding: SplitConfig) -> None: + """Test splitter initialization with sliding strategy.""" + splitter = TimeSeriesSplitter(sample_split_config_sliding) + assert splitter.config.strategy == "sliding" + + +class TestTimeSeriesSplitterExpanding: + """Tests for expanding window strategy.""" + + def test_expanding_generates_correct_number_of_splits( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test expanding strategy generates requested number of splits.""" + splitter = 
TimeSeriesSplitter(sample_split_config_expanding) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + + assert len(splits) == sample_split_config_expanding.n_splits + + def test_expanding_train_size_increases( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test expanding strategy has increasing train sizes.""" + splitter = TimeSeriesSplitter(sample_split_config_expanding) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + + train_sizes = [len(s.train_indices) for s in splits] + for i in range(1, len(train_sizes)): + assert train_sizes[i] > train_sizes[i - 1], ( + f"Train size should increase: fold {i - 1}={train_sizes[i - 1]}, " + f"fold {i}={train_sizes[i]}" + ) + + def test_expanding_first_fold_has_min_train_size( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test first fold has minimum train size.""" + splitter = TimeSeriesSplitter(sample_split_config_expanding) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + + assert len(splits[0].train_indices) >= sample_split_config_expanding.min_train_size + + def test_expanding_test_size_equals_horizon( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test all folds have test size equal to horizon.""" + splitter = TimeSeriesSplitter(sample_split_config_expanding) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + + for split in splits: + assert len(split.test_indices) == sample_split_config_expanding.horizon + + +class TestTimeSeriesSplitterSliding: + """Tests for sliding window strategy.""" + + def test_sliding_generates_correct_number_of_splits( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_sliding: SplitConfig, + ) -> None: + 
"""Test sliding strategy generates requested number of splits.""" + splitter = TimeSeriesSplitter(sample_split_config_sliding) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + + assert len(splits) == sample_split_config_sliding.n_splits + + def test_sliding_train_size_constant( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_sliding: SplitConfig, + ) -> None: + """Test sliding strategy has constant train sizes.""" + splitter = TimeSeriesSplitter(sample_split_config_sliding) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + + train_sizes = [len(s.train_indices) for s in splits] + # All train sizes should be equal + assert len(set(train_sizes)) == 1, f"Train sizes should be constant: {train_sizes}" + + def test_sliding_window_moves_forward( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_sliding: SplitConfig, + ) -> None: + """Test sliding window moves forward each fold.""" + splitter = TimeSeriesSplitter(sample_split_config_sliding) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + + for i in range(1, len(splits)): + assert splits[i].train_indices[0] > splits[i - 1].train_indices[0], ( + f"Sliding window should move forward: " + f"fold {i - 1} start={splits[i - 1].train_indices[0]}, " + f"fold {i} start={splits[i].train_indices[0]}" + ) + + +class TestTimeSeriesSplitterWithGap: + """Tests for splitter with gap parameter.""" + + def test_gap_creates_separation( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_with_gap: SplitConfig, + ) -> None: + """Test gap creates separation between train and test.""" + splitter = TimeSeriesSplitter(sample_split_config_with_gap) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + + gap = sample_split_config_with_gap.gap + for split in splits: + train_end = split.train_indices[-1] + test_start = 
split.test_indices[0] + actual_gap = test_start - train_end - 1 + assert actual_gap == gap, ( + f"Gap should be {gap} but got {actual_gap}: " + f"train_end={train_end}, test_start={test_start}" + ) + + def test_gap_dates_have_correct_separation( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_with_gap: SplitConfig, + ) -> None: + """Test gap dates have correct temporal separation.""" + splitter = TimeSeriesSplitter(sample_split_config_with_gap) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + + gap = sample_split_config_with_gap.gap + for split in splits: + train_end_date = split.train_dates[-1] + test_start_date = split.test_dates[0] + date_diff = (test_start_date - train_end_date).days + expected_diff = gap + 1 + assert date_diff == expected_diff, ( + f"Date gap should be {expected_diff} days but got {date_diff}: " + f"train_end={train_end_date}, test_start={test_start_date}" + ) + + +class TestTimeSeriesSplitterBoundaries: + """Tests for split boundaries.""" + + def test_get_boundaries_returns_all_folds( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test get_boundaries returns boundaries for all folds.""" + splitter = TimeSeriesSplitter(sample_split_config_expanding) + boundaries = splitter.get_boundaries(sample_dates_120, sample_values_120) + + assert len(boundaries) == sample_split_config_expanding.n_splits + + def test_boundaries_have_correct_dates( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test boundaries have correct date ranges.""" + splitter = TimeSeriesSplitter(sample_split_config_expanding) + boundaries = splitter.get_boundaries(sample_dates_120, sample_values_120) + + for boundary in boundaries: + assert boundary.train_start <= boundary.train_end + assert boundary.test_start <= boundary.test_end + 
assert boundary.train_end < boundary.test_start + + def test_boundaries_have_correct_sizes( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test boundaries have correct train and test sizes.""" + splitter = TimeSeriesSplitter(sample_split_config_expanding) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + boundaries = splitter.get_boundaries(sample_dates_120, sample_values_120) + + for split, boundary in zip(splits, boundaries, strict=True): + assert boundary.train_size == len(split.train_indices) + assert boundary.test_size == len(split.test_indices) + + +class TestTimeSeriesSplitterLeakageValidation: + """Tests for leakage validation.""" + + def test_validate_no_leakage_passes_for_valid_splits( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test leakage validation passes for valid splits.""" + splitter = TimeSeriesSplitter(sample_split_config_expanding) + # Generate splits to populate boundaries + list(splitter.split(sample_dates_120, sample_values_120)) + + result = splitter.validate_no_leakage(sample_dates_120, sample_values_120) + assert result is True + + def test_train_test_indices_do_not_overlap( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test train and test indices never overlap.""" + splitter = TimeSeriesSplitter(sample_split_config_expanding) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + + for split in splits: + train_set = set(split.train_indices) + test_set = set(split.test_indices) + overlap = train_set & test_set + assert len(overlap) == 0, f"Overlap found in fold {split.fold_index}: {overlap}" + + def test_test_indices_always_after_train( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + 
sample_split_config_expanding: SplitConfig, + ) -> None: + """Test test indices are always after train indices.""" + splitter = TimeSeriesSplitter(sample_split_config_expanding) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + + for split in splits: + max_train = max(split.train_indices) + min_test = min(split.test_indices) + assert min_test > max_train, ( + f"Test should be after train in fold {split.fold_index}: " + f"max_train={max_train}, min_test={min_test}" + ) + + +class TestTimeSeriesSplitterEdgeCases: + """Tests for edge cases.""" + + def test_minimum_data_for_single_split(self) -> None: + """Test minimum data required for a single split.""" + config = SplitConfig( + strategy="expanding", + n_splits=2, + min_train_size=7, + gap=0, + horizon=7, + ) + splitter = TimeSeriesSplitter(config) + + # Need: min_train_size + horizon * n_splits + step * (n_splits - 1) + # Minimum: 7 + 7*2 = 21 for 2 splits with no step + start = date(2024, 1, 1) + dates = [start + timedelta(days=i) for i in range(30)] + values = np.arange(30, dtype=np.float64) + + splits = list(splitter.split(dates, values)) + assert len(splits) == 2 + + def test_insufficient_data_raises(self) -> None: + """Test insufficient data raises ValueError.""" + config = SplitConfig( + strategy="expanding", + n_splits=5, + min_train_size=30, + gap=0, + horizon=14, + ) + splitter = TimeSeriesSplitter(config) + + # Too little data + start = date(2024, 1, 1) + dates = [start + timedelta(days=i) for i in range(20)] + values = np.arange(20, dtype=np.float64) + + with pytest.raises(ValueError, match="Need at least"): + list(splitter.split(dates, values)) + + def test_consecutive_dates_preserved( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test dates in splits are consecutive.""" + splitter = TimeSeriesSplitter(sample_split_config_expanding) + splits = list(splitter.split(sample_dates_120, 
sample_values_120)) + + for split in splits: + # Check train dates are consecutive + for i in range(1, len(split.train_dates)): + diff = (split.train_dates[i] - split.train_dates[i - 1]).days + assert diff == 1, f"Train dates not consecutive in fold {split.fold_index}" + + # Check test dates are consecutive + for i in range(1, len(split.test_dates)): + diff = (split.test_dates[i] - split.test_dates[i - 1]).days + assert diff == 1, f"Test dates not consecutive in fold {split.fold_index}" + + def test_fold_index_is_sequential( + self, + sample_dates_120: list[date], + sample_values_120: np.ndarray, + sample_split_config_expanding: SplitConfig, + ) -> None: + """Test fold indices are sequential starting from 0.""" + splitter = TimeSeriesSplitter(sample_split_config_expanding) + splits = list(splitter.split(sample_dates_120, sample_values_120)) + + for i, split in enumerate(splits): + assert split.fold_index == i diff --git a/app/main.py b/app/main.py index 9175219b..eee3b908 100644 --- a/app/main.py +++ b/app/main.py @@ -10,6 +10,7 @@ from app.core.health import router as health_router from app.core.logging import configure_logging, get_logger from app.core.middleware import RequestIdMiddleware +from app.features.backtesting.routes import router as backtesting_router from app.features.featuresets.routes import router as featuresets_router from app.features.forecasting.routes import router as forecasting_router from app.features.ingest.routes import router as ingest_router @@ -72,6 +73,7 @@ def create_app() -> FastAPI: app.include_router(ingest_router) app.include_router(featuresets_router) app.include_router(forecasting_router) + app.include_router(backtesting_router) return app diff --git a/examples/backtest/inspect_splits.py b/examples/backtest/inspect_splits.py new file mode 100644 index 00000000..dc1b37cb --- /dev/null +++ b/examples/backtest/inspect_splits.py @@ -0,0 +1,139 @@ +"""Example: Inspecting time-series CV splits. 
+ +Demonstrates how the TimeSeriesSplitter generates splits for +both expanding and sliding window strategies. + +Usage: + python examples/backtest/inspect_splits.py +""" + +from datetime import date, timedelta + +import numpy as np + +from app.features.backtesting.schemas import SplitConfig +from app.features.backtesting.splitter import TimeSeriesSplitter + + +def print_splits(title: str, config: SplitConfig, dates: list[date], values: np.ndarray) -> None: + """Print split details for visualization.""" + print("=" * 70) + print(f"{title}") + print("=" * 70) + print(f"Strategy: {config.strategy}") + print(f"N Splits: {config.n_splits}") + print(f"Min Train Size: {config.min_train_size}") + print(f"Gap: {config.gap}") + print(f"Horizon: {config.horizon}") + print(f"Total Data: {len(dates)} observations ({dates[0]} to {dates[-1]})") + print() + + splitter = TimeSeriesSplitter(config) + + for split in splitter.split(dates, values): + print(f"--- Fold {split.fold_index} ---") + print(f" Train: indices [{split.train_indices[0]}:{split.train_indices[-1]+1}]") + print(f" dates {split.train_dates[0]} to {split.train_dates[-1]}") + print(f" size {len(split.train_indices)} observations") + + if config.gap > 0: + gap_start = split.train_dates[-1] + timedelta(days=1) + gap_end = split.test_dates[0] - timedelta(days=1) + print(f" Gap: {gap_start} to {gap_end} ({config.gap} days)") + + print(f" Test: indices [{split.test_indices[0]}:{split.test_indices[-1]+1}]") + print(f" dates {split.test_dates[0]} to {split.test_dates[-1]}") + print(f" size {len(split.test_indices)} observations") + print() + + # Print boundaries summary + print("Boundaries Summary:") + boundaries = splitter.get_boundaries(dates, values) + for b in boundaries: + print( + f" Fold {b.fold_index}: " + f"train[{b.train_size}] → gap[{config.gap}] → test[{b.test_size}]" + ) + + +def main(): + # Create sample data (90 days) + start_date = date(2024, 1, 1) + n_days = 90 + dates = [start_date + timedelta(days=i) for i 
in range(n_days)] + values = np.sin(np.linspace(0, 4 * np.pi, n_days)) * 50 + 100 + + # Example 1: Expanding Window + expanding_config = SplitConfig( + strategy="expanding", + n_splits=4, + min_train_size=20, + gap=0, + horizon=10, + ) + print_splits("EXPANDING WINDOW STRATEGY", expanding_config, dates, values) + + print("\n" + "=" * 70 + "\n") + + # Example 2: Sliding Window + sliding_config = SplitConfig( + strategy="sliding", + n_splits=4, + min_train_size=30, + gap=0, + horizon=10, + ) + print_splits("SLIDING WINDOW STRATEGY", sliding_config, dates, values) + + print("\n" + "=" * 70 + "\n") + + # Example 3: With Gap + gap_config = SplitConfig( + strategy="expanding", + n_splits=3, + min_train_size=20, + gap=7, + horizon=10, + ) + print_splits("EXPANDING WITH 7-DAY GAP", gap_config, dates, values) + + print("\n" + "=" * 70 + "\n") + + # Visual representation + print("VISUAL REPRESENTATION (Expanding)") + print("=" * 70) + print("Each row represents a fold. 'T' = train, 'G' = gap, 'E' = test\n") + + # Use smaller dataset for visualization + dates_small = dates[:50] + values_small = values[:50] + config_small = SplitConfig( + strategy="expanding", + n_splits=3, + min_train_size=10, + gap=3, + horizon=5, + ) + splitter = TimeSeriesSplitter(config_small) + + for split in splitter.split(dates_small, values_small): + row = ["."] * len(dates_small) + + for i in split.train_indices: + row[i] = "T" + + gap_start_idx = split.train_indices[-1] + 1 + gap_end_idx = split.test_indices[0] + for i in range(gap_start_idx, gap_end_idx): + row[i] = "G" + + for i in split.test_indices: + row[i] = "E" + + print(f"Fold {split.fold_index}: {''.join(row)}") + + print("\nLegend: T=Train, G=Gap, E=Test (Evaluation), .=Unused") + + +if __name__ == "__main__": + main() diff --git a/examples/backtest/metrics_demo.py b/examples/backtest/metrics_demo.py new file mode 100644 index 00000000..95065191 --- /dev/null +++ b/examples/backtest/metrics_demo.py @@ -0,0 +1,172 @@ +"""Example: Metrics 
calculation and interpretation. + +Demonstrates the forecasting metrics suite and their interpretation +for model evaluation. + +Usage: + python examples/backtest/metrics_demo.py +""" + +import numpy as np + +from app.features.backtesting.metrics import MetricsCalculator + + +def print_metric_result(result): + """Pretty print a MetricResult.""" + print(f" {result.name.upper()}: {result.value:.4f}") + if result.warnings: + for warning in result.warnings: + print(f" ⚠ {warning}") + + +def main(): + calc = MetricsCalculator() + + print("=" * 70) + print("FORECASTING METRICS DEMONSTRATION") + print("=" * 70) + + # Scenario 1: Perfect Predictions + print("\n--- Scenario 1: Perfect Predictions ---") + actuals = np.array([100.0, 200.0, 300.0, 400.0, 500.0]) + predictions = np.array([100.0, 200.0, 300.0, 400.0, 500.0]) + + print(f"Actuals: {actuals}") + print(f"Predictions: {predictions}") + print("\nMetrics:") + print_metric_result(calc.mae(actuals, predictions)) + print_metric_result(calc.smape(actuals, predictions)) + print_metric_result(calc.wape(actuals, predictions)) + print_metric_result(calc.bias(actuals, predictions)) + + # Scenario 2: Over-Forecasting + print("\n--- Scenario 2: Consistent Over-Forecasting ---") + actuals = np.array([100.0, 100.0, 100.0, 100.0, 100.0]) + predictions = np.array([120.0, 120.0, 120.0, 120.0, 120.0]) + + print(f"Actuals: {actuals}") + print(f"Predictions: {predictions}") + print("\nMetrics:") + print_metric_result(calc.mae(actuals, predictions)) + print_metric_result(calc.smape(actuals, predictions)) + print_metric_result(calc.wape(actuals, predictions)) + print_metric_result(calc.bias(actuals, predictions)) + print(" → Negative bias indicates over-forecasting") + + # Scenario 3: Under-Forecasting + print("\n--- Scenario 3: Consistent Under-Forecasting ---") + actuals = np.array([100.0, 100.0, 100.0, 100.0, 100.0]) + predictions = np.array([80.0, 80.0, 80.0, 80.0, 80.0]) + + print(f"Actuals: {actuals}") + print(f"Predictions: 
{predictions}") + print("\nMetrics:") + print_metric_result(calc.mae(actuals, predictions)) + print_metric_result(calc.smape(actuals, predictions)) + print_metric_result(calc.wape(actuals, predictions)) + print_metric_result(calc.bias(actuals, predictions)) + print(" → Positive bias indicates under-forecasting") + + # Scenario 4: Mixed Errors (no bias) + print("\n--- Scenario 4: Mixed Errors (No Systematic Bias) ---") + actuals = np.array([100.0, 100.0, 100.0, 100.0]) + predictions = np.array([110.0, 90.0, 110.0, 90.0]) # +10, -10, +10, -10 + + print(f"Actuals: {actuals}") + print(f"Predictions: {predictions}") + print("\nMetrics:") + print_metric_result(calc.mae(actuals, predictions)) + print_metric_result(calc.smape(actuals, predictions)) + print_metric_result(calc.wape(actuals, predictions)) + print_metric_result(calc.bias(actuals, predictions)) + print(" → Bias ≈ 0 despite non-zero MAE") + + # Scenario 5: Intermittent Series (zeros) + print("\n--- Scenario 5: Intermittent Series (With Zeros) ---") + actuals = np.array([0.0, 50.0, 0.0, 100.0, 0.0]) + predictions = np.array([10.0, 40.0, 5.0, 90.0, 0.0]) + + print(f"Actuals: {actuals}") + print(f"Predictions: {predictions}") + print("\nMetrics:") + print_metric_result(calc.mae(actuals, predictions)) + print_metric_result(calc.smape(actuals, predictions)) + print_metric_result(calc.wape(actuals, predictions)) + print_metric_result(calc.bias(actuals, predictions)) + print(" → WAPE is robust for intermittent series") + + # Scenario 6: Stability Index + print("\n--- Scenario 6: Fold Stability Comparison ---") + + stable_folds = [10.0, 11.0, 9.5, 10.5, 10.0] + unstable_folds = [5.0, 20.0, 8.0, 25.0, 12.0] + + print(f"Stable fold MAEs: {stable_folds}") + stable_result = calc.stability_index(stable_folds) + print_metric_result(stable_result) + + print(f"\nUnstable fold MAEs: {unstable_folds}") + unstable_result = calc.stability_index(unstable_folds) + print_metric_result(unstable_result) + print(" → Lower stability index 
= more consistent performance") + + # Aggregation example + print("\n--- Scenario 7: Fold Aggregation ---") + fold_metrics = [ + {"mae": 10.0, "smape": 15.0, "wape": 12.0, "bias": 2.0}, + {"mae": 12.0, "smape": 18.0, "wape": 14.0, "bias": 3.0}, + {"mae": 8.0, "smape": 12.0, "wape": 10.0, "bias": 1.0}, + {"mae": 11.0, "smape": 16.0, "wape": 13.0, "bias": 2.5}, + ] + + print("Fold metrics:") + for i, fm in enumerate(fold_metrics): + print(f" Fold {i}: MAE={fm['mae']}, sMAPE={fm['smape']}, WAPE={fm['wape']}, Bias={fm['bias']}") + + aggregated, stability = calc.aggregate_fold_metrics(fold_metrics) + + print("\nAggregated (mean across folds):") + for metric, value in aggregated.items(): + stab_key = f"{metric}_stability" + stab_val = stability.get(stab_key, float("nan")) + print(f" {metric}: {value:.4f} (stability: {stab_val:.2f}%)") + + # Metric interpretation guide + print("\n" + "=" * 70) + print("METRIC INTERPRETATION GUIDE") + print("=" * 70) + print(""" +MAE (Mean Absolute Error): + - Unit: Same as target variable (e.g., units sold) + - Lower is better + - Easy to interpret: "On average, we're off by X units" + +sMAPE (Symmetric Mean Absolute Percentage Error): + - Unit: Percentage (0-200 scale) + - Lower is better + - Symmetric: treats over/under-forecasting equally + - 0 = perfect, 200 = maximum error + +WAPE (Weighted Absolute Percentage Error): + - Unit: Percentage + - Lower is better + - Better than MAPE for intermittent/low-volume series + - Weights errors by actual values + +Bias (Forecast Bias): + - Unit: Same as target variable + - Closer to 0 is better + - Positive = under-forecasting (actuals > predictions) + - Negative = over-forecasting (actuals < predictions) + +Stability Index (Coefficient of Variation): + - Unit: Percentage + - Lower is better + - Measures consistency across folds + - High values indicate unreliable model performance +""") + + +if __name__ == "__main__": + main() diff --git a/examples/backtest/run_backtest.py 
b/examples/backtest/run_backtest.py new file mode 100644 index 00000000..2a947b66 --- /dev/null +++ b/examples/backtest/run_backtest.py @@ -0,0 +1,129 @@ +"""Example: Running a backtest via the API. + +Demonstrates how to call the backtesting endpoint to evaluate a model +on historical data using time-series cross-validation. + +Prerequisites: + - API server running: uv run uvicorn app.main:app --reload --port 8123 + - Database with sales data (run seed_demo_data.py first) + +Usage: + python examples/backtest/run_backtest.py +""" + +import httpx + +API_BASE = "http://localhost:8123" + + +def main(): + # 1. Prepare backtest request + request_payload = { + "store_id": 1, + "product_id": 1, + "start_date": "2024-01-01", + "end_date": "2024-06-30", + "config": { + "split_config": { + "strategy": "expanding", + "n_splits": 5, + "min_train_size": 30, + "gap": 0, + "horizon": 14, + }, + "model_config_main": { + "model_type": "naive", + }, + "include_baselines": True, + "store_fold_details": True, + }, + } + + print("=" * 60) + print("BACKTEST REQUEST") + print("=" * 60) + print(f"Store ID: {request_payload['store_id']}") + print(f"Product ID: {request_payload['product_id']}") + print(f"Date Range: {request_payload['start_date']} to {request_payload['end_date']}") + print(f"Strategy: {request_payload['config']['split_config']['strategy']}") + print(f"N Splits: {request_payload['config']['split_config']['n_splits']}") + print(f"Horizon: {request_payload['config']['split_config']['horizon']} days") + print() + + # 2. Send request to API + print("Sending request to API...") + with httpx.Client(timeout=30.0) as client: + response = client.post( + f"{API_BASE}/backtesting/run", + json=request_payload, + ) + + if response.status_code != 200: + print(f"Error: {response.status_code}") + print(response.text) + return + + result = response.json() + + # 3. 
Display results + print("\n" + "=" * 60) + print("BACKTEST RESULTS") + print("=" * 60) + print(f"Backtest ID: {result['backtest_id']}") + print(f"Config Hash: {result['config_hash']}") + print(f"Duration: {result['duration_ms']:.1f} ms") + print(f"Leakage Check: {'PASSED' if result['leakage_check_passed'] else 'FAILED'}") + + # 4. Main model results + main_results = result["main_model_results"] + print(f"\n--- Main Model: {main_results['model_type']} ---") + print("Aggregated Metrics:") + for metric, value in main_results["aggregated_metrics"].items(): + stability = main_results["metric_std"].get(f"{metric}_stability", "N/A") + if isinstance(stability, float): + print(f" {metric}: {value:.4f} (stability: {stability:.2f}%)") + else: + print(f" {metric}: {value:.4f}") + + # 5. Per-fold details + if main_results["fold_results"]: + print("\nPer-Fold Results:") + for fold in main_results["fold_results"]: + split = fold["split"] + print( + f" Fold {fold['fold_index']}: " + f"train={split['train_start']} to {split['train_end']} ({split['train_size']} days), " + f"test={split['test_start']} to {split['test_end']} ({split['test_size']} days)" + ) + print(f" MAE: {fold['metrics']['mae']:.4f}, sMAPE: {fold['metrics']['smape']:.2f}") + + # 6. Baseline comparisons + if result.get("baseline_results"): + print("\n--- Baseline Comparisons ---") + for baseline in result["baseline_results"]: + print(f"\n{baseline['model_type']}:") + for metric, value in baseline["aggregated_metrics"].items(): + print(f" {metric}: {value:.4f}") + + # 7. 
Comparison summary + if result.get("comparison_summary"): + print("\n--- Comparison Summary (vs Baselines) ---") + for metric, comparison in result["comparison_summary"].items(): + print(f"\n{metric}:") + print(f" Main model: {comparison['main']:.4f}") + if "naive" in comparison: + print(f" Naive: {comparison['naive']:.4f}") + if "vs_naive_pct" in comparison: + imp = comparison["vs_naive_pct"] + direction = "better" if imp > 0 else "worse" + print(f" vs Naive: {abs(imp):.1f}% {direction}") + if "seasonal_naive" in comparison: + print(f" Seasonal Naive: {comparison['seasonal_naive']:.4f}") + if "vs_seasonal_pct" in comparison: + imp = comparison["vs_seasonal_pct"] + direction = "better" if imp > 0 else "worse" + print(f" vs Seasonal: {abs(imp):.1f}% {direction}") + + +if __name__ == "__main__": + main() From f4370d1b18295893545da4b5f526a330dce043e2 Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Sun, 1 Feb 2026 03:57:47 +0000 Subject: [PATCH 03/11] docs: update documentation for backtesting module (PRP-6) - README.md: Add backtesting endpoint, examples, and project structure - ARCHITECTURE.md: Mark backtesting as implemented with full details Co-Authored-By: Claude Opus 4.5 --- README.md | 54 +++++++++++++++++++++++++++++++++++++++++++- docs/ARCHITECTURE.md | 39 +++++++++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index de8a8e96..69e79bf9 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,8 @@ app/ │ ├── data_platform/ # Store, product, calendar, sales tables │ ├── ingest/ # Batch upsert endpoints for sales data │ ├── featuresets/ # Time-safe feature engineering (lags, rolling, calendar) -│ └── forecasting/ # Model training, prediction, persistence +│ ├── forecasting/ # Model training, prediction, persistence +│ └── backtesting/ # Time-series CV, metrics, baseline comparisons └── main.py # FastAPI entry point tests/ # Test fixtures and helpers @@ -105,6 +106,7 @@ examples/ ├── schema/ # Table 
documentation ├── queries/ # Example SQL queries ├── models/ # Baseline model examples (naive, seasonal_naive, moving_average) +├── backtest/ # Backtesting examples (run_backtest, inspect_splits, metrics_demo) └── compute_features_demo.py # Feature engineering demo scripts/ # Utility scripts ``` @@ -227,6 +229,56 @@ curl -X POST http://localhost:8123/forecasting/predict \ See [examples/models/](examples/models/) for baseline model examples. +### Backtesting + +- `POST /backtesting/run` - Run time-series cross-validation backtest + +**Example Request:** +```bash +curl -X POST http://localhost:8123/backtesting/run \ + -H "Content-Type: application/json" \ + -d '{ + "store_id": 1, + "product_id": 1, + "start_date": "2024-01-01", + "end_date": "2024-06-30", + "config": { + "split_config": { + "strategy": "expanding", + "n_splits": 5, + "min_train_size": 30, + "gap": 0, + "horizon": 14 + }, + "model_config_main": { + "model_type": "naive" + }, + "include_baselines": true, + "store_fold_details": true + } + }' +``` + +**Split Strategies:** +- `expanding` - Training window grows with each fold (sklearn-like TimeSeriesSplit) +- `sliding` - Fixed-size training window slides forward + +**Gap Parameter:** +- Simulates operational data latency between training and test periods +- `gap=7` means 7 days between train end and test start + +**Metrics Calculated:** +- MAE: Mean Absolute Error +- sMAPE: Symmetric Mean Absolute Percentage Error (0-200 scale) +- WAPE: Weighted Absolute Percentage Error +- Bias: Forecast bias (positive = under-forecast) +- Stability Index: Coefficient of variation across folds + +**Baseline Comparisons:** +When `include_baselines=true`, automatically compares against naive and seasonal_naive models. + +See [examples/backtest/](examples/backtest/) for usage examples. 
+ ## API Documentation Once the server is running: diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 9bcd3e72..a36af84e 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -276,9 +276,41 @@ forecast_model_artifacts_dir: str = "./artifacts/models" forecast_enable_lightgbm: bool = False ``` -### 7.5 Backtesting Protocol (Planned) -- Time-based CV only: rolling or expanding splits (no random split). -- Metrics: MAE, sMAPE (pinball loss later if needed). +### 7.5 Backtesting Protocol — ✅ IMPLEMENTED + +**Implemented via PRP-6** - Time-series backtesting module provides: + +**Split Strategies:** +| Strategy | Description | Train Size Behavior | +|----------|-------------|---------------------| +| `expanding` | Train window grows each fold | Increases per fold | +| `sliding` | Fixed-size train window slides | Constant | + +**Gap Parameter:** Simulates operational data latency (e.g., `gap=7` = 7 days between train end and test start). + +**Metrics Suite:** +| Metric | Description | Scale | +|--------|-------------|-------| +| MAE | Mean Absolute Error | Same as target | +| sMAPE | Symmetric MAPE | 0-200 | +| WAPE | Weighted Absolute Percentage Error | 0-100+ | +| Bias | Forecast bias (positive = under-forecast) | Same as target | +| Stability Index | Coefficient of variation of metrics across folds | 0-100+ | + +**Baseline Comparisons:** Automatic comparison against naive and seasonal_naive models with improvement percentages. + +**Leakage Validation:** Built-in validation ensures no data leakage in splits.
+ +**API Endpoint:** `POST /backtesting/run` + +**Location:** +- Schemas: `app/features/backtesting/schemas.py` +- Splitter: `app/features/backtesting/splitter.py` +- Metrics: `app/features/backtesting/metrics.py` +- Service: `app/features/backtesting/service.py` +- Routes: `app/features/backtesting/routes.py` +- Tests: `app/features/backtesting/tests/` (95 tests) +- Examples: `examples/backtest/` (run_backtest.py, inspect_splits.py, metrics_demo.py) ### 7.6 Model Registry (Planned) Each run stores: @@ -301,6 +333,7 @@ Each run stores: - `POST /featuresets/preview` - Preview features with sample rows - `POST /forecasting/train` - Train forecasting model (returns model_path) - `POST /forecasting/predict` - Generate forecasts using saved model +- `POST /backtesting/run` - Run time-series CV backtest with baseline comparisons **Planned Endpoints:** - `GET /runs`, `GET /runs/{run_id}` - Model registry and leaderboard From 019a38fcdde57b551420a5921c50f606f41b502a Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Sun, 1 Feb 2026 04:02:07 +0000 Subject: [PATCH 04/11] chore: update uv.lock version to 0.1.7 Co-Authored-By: Claude Opus 4.5 --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index 8cf97caa..9dbe5217 100644 --- a/uv.lock +++ b/uv.lock @@ -216,7 +216,7 @@ wheels = [ [[package]] name = "forecastlabai" -version = "0.1.6" +version = "0.1.7" source = { editable = "." } dependencies = [ { name = "alembic" }, From 2b34a211dae82d29a2943ba0b301c0fab3112e1b Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Sun, 1 Feb 2026 04:17:34 +0000 Subject: [PATCH 05/11] test(backtesting): add integration tests for routes and service (PRP-6) Add 16 integration tests that run against real PostgreSQL database: - 8 route tests for POST /backtesting/run endpoint - 8 service tests for BacktestingService._load_series_data Tests use @pytest.mark.integration marker and require docker-compose. 
Test data: 120 days of sequential sales (quantity = day number 1-120). Co-Authored-By: Claude Opus 4.5 --- README.md | 26 +- app/features/backtesting/tests/conftest.py | 156 +++++++ .../tests/test_routes_integration.py | 395 ++++++++++++++++++ .../tests/test_service_integration.py | 297 +++++++++++++ docs/validation/pytest-standard.md | 96 ++++- 5 files changed, 967 insertions(+), 3 deletions(-) create mode 100644 app/features/backtesting/tests/test_routes_integration.py create mode 100644 app/features/backtesting/tests/test_service_integration.py diff --git a/README.md b/README.md index 69e79bf9..39f1f957 100644 --- a/README.md +++ b/README.md @@ -66,12 +66,34 @@ curl http://localhost:8123/health ## Development -### Commands +### Testing ```bash -# Run tests +# Run all tests uv run pytest -v +# Run unit tests only (no database required) +uv run pytest -v -m "not integration" + +# Run integration tests (requires PostgreSQL via docker-compose) +docker-compose up -d # Start database first +uv run pytest -v -m integration + +# Run feature-specific tests +uv run pytest app/features/backtesting/tests/ -v # All backtesting tests +uv run pytest app/features/forecasting/tests/ -v # All forecasting tests +uv run pytest app/features/backtesting/tests/ -v -m integration # Backtesting integration tests +``` + +**Test Coverage:** +- Unit tests: Fast, isolated tests that mock database dependencies +- Integration tests: End-to-end tests against real PostgreSQL database + - Marked with `@pytest.mark.integration` + - Require `docker-compose up -d` before running + +### Commands + +```bash # Type checking uv run mypy app/ uv run pyright app/ diff --git a/app/features/backtesting/tests/conftest.py b/app/features/backtesting/tests/conftest.py index 519738af..3e88053e 100644 --- a/app/features/backtesting/tests/conftest.py +++ b/app/features/backtesting/tests/conftest.py @@ -1,12 +1,168 @@ """Test fixtures for backtesting module.""" +from collections.abc import AsyncGenerator from 
datetime import date, timedelta +from decimal import Decimal import numpy as np import pytest +from httpx import ASGITransport, AsyncClient +from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine +from app.core.config import get_settings +from app.core.database import Base, get_db from app.features.backtesting.schemas import BacktestConfig, SplitConfig +from app.features.data_platform.models import Calendar, Product, SalesDaily, Store from app.features.forecasting.schemas import NaiveModelConfig, SeasonalNaiveModelConfig +from app.main import app + +# ============================================================================= +# Database Fixtures for Integration Tests +# ============================================================================= + + +@pytest.fixture +async def db_session() -> AsyncGenerator[AsyncSession, None]: + """Create async database session for integration tests. + + This fixture creates all tables, provides a session, and cleans up after. + Requires PostgreSQL to be running (docker-compose up -d). 
+ """ + settings = get_settings() + engine = create_async_engine(settings.database_url, echo=False) + + # Create tables + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + + # Create session + async_session_maker = async_sessionmaker( + engine, + class_=AsyncSession, + expire_on_commit=False, + ) + + async with async_session_maker() as session: + try: + yield session + finally: + await session.rollback() + + # Cleanup: drop all tables + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.drop_all) + + await engine.dispose() + + +@pytest.fixture +async def client(db_session: AsyncSession) -> AsyncGenerator[AsyncClient, None]: + """Create test client with database dependency override.""" + app.dependency_overrides[get_db] = lambda: db_session + + async with AsyncClient( + transport=ASGITransport(app=app), + base_url="http://test", + ) as ac: + yield ac + + app.dependency_overrides.clear() + + +@pytest.fixture +async def sample_store(db_session: AsyncSession) -> Store: + """Create a sample store for testing.""" + store = Store( + code="TEST001", + name="Test Store", + region="Test Region", + city="Test City", + store_type="supermarket", + ) + db_session.add(store) + await db_session.commit() + await db_session.refresh(store) + return store + + +@pytest.fixture +async def sample_product(db_session: AsyncSession) -> Product: + """Create a sample product for testing.""" + product = Product( + sku="SKU-TEST-001", + name="Test Product", + category="Test Category", + brand="Test Brand", + base_price=Decimal("19.99"), + base_cost=Decimal("9.99"), + ) + db_session.add(product) + await db_session.commit() + await db_session.refresh(product) + return product + + +@pytest.fixture +async def sample_calendar_120(db_session: AsyncSession) -> list[Calendar]: + """Create 120 calendar records starting from 2024-01-01.""" + start = date(2024, 1, 1) + calendars = [] + + for i in range(120): + d = start + timedelta(days=i) + calendar 
= Calendar( + date=d, + day_of_week=d.weekday(), + month=d.month, + quarter=(d.month - 1) // 3 + 1, + year=d.year, + is_holiday=False, + ) + calendars.append(calendar) + db_session.add(calendar) + + await db_session.commit() + for cal in calendars: + await db_session.refresh(cal) + return calendars + + +@pytest.fixture +async def sample_sales_120( + db_session: AsyncSession, + sample_store: Store, + sample_product: Product, + sample_calendar_120: list[Calendar], +) -> list[SalesDaily]: + """Create 120 days of sequential sales data. + + Sales quantity = day number (1, 2, 3, ..., 120) for predictable verification. + """ + sales_records = [] + + for i, calendar in enumerate(sample_calendar_120): + quantity = i + 1 # 1, 2, 3, ..., 120 + unit_price = Decimal("9.99") + sales = SalesDaily( + date=calendar.date, + store_id=sample_store.id, + product_id=sample_product.id, + quantity=quantity, + unit_price=unit_price, + total_amount=unit_price * quantity, + ) + sales_records.append(sales) + db_session.add(sales) + + await db_session.commit() + for sale in sales_records: + await db_session.refresh(sale) + return sales_records + + +# ============================================================================= +# Unit Test Fixtures (original) +# ============================================================================= @pytest.fixture diff --git a/app/features/backtesting/tests/test_routes_integration.py b/app/features/backtesting/tests/test_routes_integration.py new file mode 100644 index 00000000..efe2af33 --- /dev/null +++ b/app/features/backtesting/tests/test_routes_integration.py @@ -0,0 +1,395 @@ +"""Integration tests for backtesting routes. + +These tests run against a real PostgreSQL database to verify the complete flow +from API request through database queries to response. 
+ +Requires PostgreSQL to be running: docker-compose up -d +""" + +from datetime import date + +import pytest +from httpx import AsyncClient + +from app.features.data_platform.models import Product, SalesDaily, Store + + +@pytest.mark.integration +@pytest.mark.asyncio +class TestBacktestingRouteIntegration: + """Integration tests for POST /backtesting/run endpoint.""" + + async def test_run_backtest_expanding_strategy( + self, + client: AsyncClient, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test backtest with expanding window strategy.""" + response = await client.post( + "/backtesting/run", + json={ + "store_id": sample_store.id, + "product_id": sample_product.id, + "start_date": "2024-01-01", + "end_date": "2024-04-29", + "config": { + "split_config": { + "strategy": "expanding", + "n_splits": 5, + "min_train_size": 30, + "gap": 0, + "horizon": 14, + }, + "model_config_main": {"model_type": "naive"}, + "include_baselines": False, + "store_fold_details": True, + }, + }, + ) + + assert response.status_code == 200 + data = response.json() + + assert data["store_id"] == sample_store.id + assert data["product_id"] == sample_product.id + assert data["leakage_check_passed"] is True + assert data["main_model_results"]["model_type"] == "naive" + assert len(data["main_model_results"]["fold_results"]) == 5 + + # Verify train size increases with expanding window + fold_results = data["main_model_results"]["fold_results"] + train_sizes = [f["split"]["train_size"] for f in fold_results] + assert train_sizes == sorted(train_sizes), ( + "Train sizes should increase for expanding window" + ) + + async def test_run_backtest_sliding_strategy( + self, + client: AsyncClient, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test backtest with sliding window strategy.""" + response = await client.post( + "/backtesting/run", + json={ + "store_id": sample_store.id, + 
"product_id": sample_product.id, + "start_date": "2024-01-01", + "end_date": "2024-04-29", + "config": { + "split_config": { + "strategy": "sliding", + "n_splits": 5, + "min_train_size": 30, + "gap": 0, + "horizon": 14, + }, + "model_config_main": {"model_type": "naive"}, + "include_baselines": False, + "store_fold_details": True, + }, + }, + ) + + assert response.status_code == 200 + data = response.json() + + assert data["main_model_results"]["model_type"] == "naive" + assert len(data["main_model_results"]["fold_results"]) == 5 + + # Verify train size is constant with sliding window + fold_results = data["main_model_results"]["fold_results"] + train_sizes = [f["split"]["train_size"] for f in fold_results] + assert len(set(train_sizes)) == 1, "Train sizes should be constant for sliding window" + + async def test_run_backtest_with_gap( + self, + client: AsyncClient, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test backtest with gap between train and test.""" + response = await client.post( + "/backtesting/run", + json={ + "store_id": sample_store.id, + "product_id": sample_product.id, + "start_date": "2024-01-01", + "end_date": "2024-04-29", + "config": { + "split_config": { + "strategy": "expanding", + "n_splits": 3, + "min_train_size": 30, + "gap": 7, + "horizon": 14, + }, + "model_config_main": {"model_type": "naive"}, + "include_baselines": False, + "store_fold_details": True, + }, + }, + ) + + assert response.status_code == 200 + data = response.json() + + # Verify gap is respected: test_start should be > train_end + gap days + fold_results = data["main_model_results"]["fold_results"] + for fold in fold_results: + train_end = date.fromisoformat(fold["split"]["train_end"]) + test_start = date.fromisoformat(fold["split"]["test_start"]) + gap_days = (test_start - train_end).days + assert gap_days >= 7, f"Gap should be at least 7 days, got {gap_days}" + + async def test_run_backtest_with_baselines( + 
self, + client: AsyncClient, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test backtest with baseline comparison enabled.""" + response = await client.post( + "/backtesting/run", + json={ + "store_id": sample_store.id, + "product_id": sample_product.id, + "start_date": "2024-01-01", + "end_date": "2024-04-29", + "config": { + "split_config": { + "strategy": "expanding", + "n_splits": 5, + "min_train_size": 30, + "gap": 0, + "horizon": 14, + }, + "model_config_main": {"model_type": "naive"}, + "include_baselines": True, + "store_fold_details": True, + }, + }, + ) + + assert response.status_code == 200 + data = response.json() + + # Verify baseline results are present + assert data["baseline_results"] is not None + assert len(data["baseline_results"]) >= 1 + + # Verify comparison summary is present + assert data["comparison_summary"] is not None + assert "mae" in data["comparison_summary"] + + # Check baseline model types + baseline_types = [r["model_type"] for r in data["baseline_results"]] + assert "naive" in baseline_types or "seasonal_naive" in baseline_types + + async def test_run_backtest_without_fold_details( + self, + client: AsyncClient, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test backtest with store_fold_details=False.""" + response = await client.post( + "/backtesting/run", + json={ + "store_id": sample_store.id, + "product_id": sample_product.id, + "start_date": "2024-01-01", + "end_date": "2024-04-29", + "config": { + "split_config": { + "strategy": "expanding", + "n_splits": 5, + "min_train_size": 30, + "gap": 0, + "horizon": 14, + }, + "model_config_main": {"model_type": "naive"}, + "include_baselines": False, + "store_fold_details": False, + }, + }, + ) + + assert response.status_code == 200 + data = response.json() + + # Verify fold results exist but have empty arrays + fold_results = 
data["main_model_results"]["fold_results"] + assert len(fold_results) == 5 + for fold in fold_results: + assert fold["dates"] == [] + assert fold["actuals"] == [] + assert fold["predictions"] == [] + # Metrics should still be present + assert "mae" in fold["metrics"] + + async def test_run_backtest_insufficient_data_returns_400( + self, + client: AsyncClient, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test that insufficient data returns 400 error.""" + # Request a date range with only 20 days of data but require min_train=30 + response = await client.post( + "/backtesting/run", + json={ + "store_id": sample_store.id, + "product_id": sample_product.id, + "start_date": "2024-01-01", + "end_date": "2024-01-20", # Only 20 days + "config": { + "split_config": { + "strategy": "expanding", + "n_splits": 5, + "min_train_size": 30, # Requires 30 days minimum + "gap": 0, + "horizon": 14, + }, + "model_config_main": {"model_type": "naive"}, + "include_baselines": False, + "store_fold_details": True, + }, + }, + ) + + assert response.status_code == 400 + assert "detail" in response.json() + + async def test_run_backtest_no_data_returns_400( + self, + client: AsyncClient, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test that no data for given filters returns 400 error.""" + # Request data for a different store that doesn't exist + response = await client.post( + "/backtesting/run", + json={ + "store_id": 9999, # Non-existent store + "product_id": sample_product.id, + "start_date": "2024-01-01", + "end_date": "2024-04-29", + "config": { + "split_config": { + "strategy": "expanding", + "n_splits": 5, + "min_train_size": 30, + "gap": 0, + "horizon": 14, + }, + "model_config_main": {"model_type": "naive"}, + "include_baselines": False, + "store_fold_details": True, + }, + }, + ) + + assert response.status_code == 400 + data = response.json() + assert "No data 
found" in data["detail"] + + async def test_response_contains_all_expected_fields( + self, + client: AsyncClient, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test that response contains all expected fields with correct types.""" + response = await client.post( + "/backtesting/run", + json={ + "store_id": sample_store.id, + "product_id": sample_product.id, + "start_date": "2024-01-01", + "end_date": "2024-04-29", + "config": { + "split_config": { + "strategy": "expanding", + "n_splits": 5, + "min_train_size": 30, + "gap": 0, + "horizon": 14, + }, + "model_config_main": {"model_type": "naive"}, + "include_baselines": True, + "store_fold_details": True, + }, + }, + ) + + assert response.status_code == 200 + data = response.json() + + # Top-level fields + assert "backtest_id" in data + assert isinstance(data["backtest_id"], str) + assert len(data["backtest_id"]) == 16 + + assert "store_id" in data + assert isinstance(data["store_id"], int) + + assert "product_id" in data + assert isinstance(data["product_id"], int) + + assert "config_hash" in data + assert isinstance(data["config_hash"], str) + + assert "split_config" in data + assert isinstance(data["split_config"], dict) + + assert "duration_ms" in data + assert isinstance(data["duration_ms"], float) + assert data["duration_ms"] > 0 + + assert "leakage_check_passed" in data + assert isinstance(data["leakage_check_passed"], bool) + + # Main model results + main_results = data["main_model_results"] + assert "model_type" in main_results + assert "config_hash" in main_results + assert "fold_results" in main_results + assert "aggregated_metrics" in main_results + assert "metric_std" in main_results + + # Aggregated metrics + agg_metrics = main_results["aggregated_metrics"] + expected_metrics = ["mae", "smape", "wape", "bias"] + for metric in expected_metrics: + assert metric in agg_metrics, f"Missing metric: {metric}" + assert isinstance(agg_metrics[metric], 
float) + + # Fold results + for fold in main_results["fold_results"]: + assert "fold_index" in fold + assert "split" in fold + assert "dates" in fold + assert "actuals" in fold + assert "predictions" in fold + assert "metrics" in fold + + # Split details + split = fold["split"] + assert "train_start" in split + assert "train_end" in split + assert "test_start" in split + assert "test_end" in split + assert "train_size" in split + assert "test_size" in split diff --git a/app/features/backtesting/tests/test_service_integration.py b/app/features/backtesting/tests/test_service_integration.py new file mode 100644 index 00000000..d1b0fbd7 --- /dev/null +++ b/app/features/backtesting/tests/test_service_integration.py @@ -0,0 +1,297 @@ +"""Integration tests for BacktestingService. + +These tests verify the service layer interacts correctly with the database, +focusing on data loading and full backtest execution. + +Requires PostgreSQL to be running: docker-compose up -d +""" + +from datetime import date + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from app.features.backtesting.schemas import BacktestConfig, SplitConfig +from app.features.backtesting.service import BacktestingService +from app.features.data_platform.models import Product, SalesDaily, Store +from app.features.forecasting.schemas import NaiveModelConfig + + +@pytest.mark.integration +@pytest.mark.asyncio +class TestBacktestingServiceIntegration: + """Integration tests for BacktestingService._load_series_data and run_backtest.""" + + async def test_load_series_data_returns_correct_values( + self, + db_session: AsyncSession, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test that _load_series_data returns correct values from database.""" + service = BacktestingService() + + series_data = await service._load_series_data( + db=db_session, + store_id=sample_store.id, + product_id=sample_product.id, + start_date=date(2024, 1, 1), + 
end_date=date(2024, 4, 29), + ) + + assert series_data.store_id == sample_store.id + assert series_data.product_id == sample_product.id + assert series_data.n_observations == 120 + + # Verify values are 1, 2, 3, ..., 120 (sequential) + for i, val in enumerate(series_data.values): + expected = float(i + 1) + assert val == expected, f"Expected {expected} at index {i}, got {val}" + + async def test_load_series_data_filters_by_date_range( + self, + db_session: AsyncSession, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test that _load_series_data correctly filters by date range.""" + service = BacktestingService() + + # Request only first 30 days + series_data = await service._load_series_data( + db=db_session, + store_id=sample_store.id, + product_id=sample_product.id, + start_date=date(2024, 1, 1), + end_date=date(2024, 1, 30), + ) + + assert series_data.n_observations == 30 + assert series_data.dates[0] == date(2024, 1, 1) + assert series_data.dates[-1] == date(2024, 1, 30) + + # Values should be 1 through 30 + assert float(series_data.values[0]) == 1.0 + assert float(series_data.values[-1]) == 30.0 + + async def test_load_series_data_filters_by_store_product( + self, + db_session: AsyncSession, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test that _load_series_data returns empty for non-matching store/product.""" + service = BacktestingService() + + # Request with non-existent store + series_data = await service._load_series_data( + db=db_session, + store_id=9999, + product_id=sample_product.id, + start_date=date(2024, 1, 1), + end_date=date(2024, 4, 29), + ) + + assert series_data.n_observations == 0 + assert len(series_data.dates) == 0 + assert len(series_data.values) == 0 + + async def test_load_series_data_returns_chronological_order( + self, + db_session: AsyncSession, + sample_store: Store, + sample_product: Product, + sample_sales_120: 
list[SalesDaily], + ) -> None: + """Test that _load_series_data returns dates in chronological order.""" + service = BacktestingService() + + series_data = await service._load_series_data( + db=db_session, + store_id=sample_store.id, + product_id=sample_product.id, + start_date=date(2024, 1, 1), + end_date=date(2024, 4, 29), + ) + + # Verify dates are sorted + dates = series_data.dates + assert dates == sorted(dates), "Dates should be in chronological order" + + # Verify each date is one day after previous + for i in range(1, len(dates)): + delta = (dates[i] - dates[i - 1]).days + assert delta == 1, f"Gap between dates at index {i}: expected 1, got {delta}" + + async def test_full_backtest_with_real_data( + self, + db_session: AsyncSession, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test complete backtest execution with real database data.""" + service = BacktestingService() + + config = BacktestConfig( + split_config=SplitConfig( + strategy="expanding", + n_splits=5, + min_train_size=30, + gap=0, + horizon=14, + ), + model_config_main=NaiveModelConfig(), + include_baselines=True, + store_fold_details=True, + ) + + response = await service.run_backtest( + db=db_session, + store_id=sample_store.id, + product_id=sample_product.id, + start_date=date(2024, 1, 1), + end_date=date(2024, 4, 29), + config=config, + ) + + # Verify response structure + assert response.store_id == sample_store.id + assert response.product_id == sample_product.id + assert response.leakage_check_passed is True + assert response.duration_ms > 0 + + # Verify main model results + main_results = response.main_model_results + assert main_results.model_type == "naive" + assert len(main_results.fold_results) == 5 + + # Verify aggregated metrics exist and are reasonable + agg_metrics = main_results.aggregated_metrics + assert "mae" in agg_metrics + assert "smape" in agg_metrics + assert "wape" in agg_metrics + assert "bias" in agg_metrics + 
assert agg_metrics["mae"] >= 0 + assert 0 <= agg_metrics["smape"] <= 200 + + # Verify baseline results + assert response.baseline_results is not None + assert len(response.baseline_results) >= 1 + + # Verify comparison summary + assert response.comparison_summary is not None + + async def test_full_backtest_with_sliding_window( + self, + db_session: AsyncSession, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test complete backtest with sliding window strategy.""" + service = BacktestingService() + + config = BacktestConfig( + split_config=SplitConfig( + strategy="sliding", + n_splits=5, + min_train_size=30, + gap=0, + horizon=14, + ), + model_config_main=NaiveModelConfig(), + include_baselines=False, + store_fold_details=True, + ) + + response = await service.run_backtest( + db=db_session, + store_id=sample_store.id, + product_id=sample_product.id, + start_date=date(2024, 1, 1), + end_date=date(2024, 4, 29), + config=config, + ) + + # Verify sliding window: train sizes should be constant + fold_results = response.main_model_results.fold_results + train_sizes = [f.split.train_size for f in fold_results] + assert len(set(train_sizes)) == 1, f"Train sizes should be constant, got {train_sizes}" + + async def test_backtest_raises_for_no_data( + self, + db_session: AsyncSession, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test that backtest raises ValueError when no data is found.""" + service = BacktestingService() + + config = BacktestConfig( + split_config=SplitConfig( + strategy="expanding", + n_splits=5, + min_train_size=30, + gap=0, + horizon=14, + ), + model_config_main=NaiveModelConfig(), + include_baselines=False, + store_fold_details=True, + ) + + with pytest.raises(ValueError, match="No data found"): + await service.run_backtest( + db=db_session, + store_id=9999, # Non-existent + product_id=sample_product.id, + start_date=date(2024, 1, 1), 
+ end_date=date(2024, 4, 29), + config=config, + ) + + async def test_backtest_with_gap_produces_correct_splits( + self, + db_session: AsyncSession, + sample_store: Store, + sample_product: Product, + sample_sales_120: list[SalesDaily], + ) -> None: + """Test that gap parameter creates correct separation between train and test.""" + service = BacktestingService() + + gap_days = 7 + config = BacktestConfig( + split_config=SplitConfig( + strategy="expanding", + n_splits=3, + min_train_size=30, + gap=gap_days, + horizon=14, + ), + model_config_main=NaiveModelConfig(), + include_baselines=False, + store_fold_details=True, + ) + + response = await service.run_backtest( + db=db_session, + store_id=sample_store.id, + product_id=sample_product.id, + start_date=date(2024, 1, 1), + end_date=date(2024, 4, 29), + config=config, + ) + + # Verify gap between train_end and test_start + for fold in response.main_model_results.fold_results: + train_end = fold.split.train_end + test_start = fold.split.test_start + actual_gap = (test_start - train_end).days + # Gap should be at least gap_days (could be more if data is sparse) + assert actual_gap >= gap_days, f"Expected gap >= {gap_days}, got {actual_gap}" diff --git a/docs/validation/pytest-standard.md b/docs/validation/pytest-standard.md index 2d17b023..d889312e 100644 --- a/docs/validation/pytest-standard.md +++ b/docs/validation/pytest-standard.md @@ -504,6 +504,19 @@ app/ │ ├── logging.py │ ├── middleware.py │ └── database.py +├── features/ +│ ├── backtesting/ +│ │ └── tests/ +│ │ ├── conftest.py # Unit + integration fixtures +│ │ ├── test_metrics.py # Unit tests for metrics +│ │ ├── test_service.py # Unit tests for service +│ │ ├── test_schemas.py # Unit tests for schemas +│ │ ├── test_splitter.py # Unit tests for splitter +│ │ ├── test_routes_integration.py # Integration tests for routes +│ │ └── test_service_integration.py # Integration tests for service +│ └── forecasting/ +│ └── tests/ +│ └── ... 
└── shared/ ├── tests/ │ └── test_utils.py @@ -698,6 +711,87 @@ def test_file_processing(tmp_path): assert result == "processed: test content" ``` +## Feature-Specific Testing + +### Backtesting Integration Tests + +The backtesting module includes comprehensive integration tests that verify the complete flow from API request through database queries to response. + +#### Test Fixtures (conftest.py) + +```python +# Database fixtures for integration tests +@pytest.fixture +async def db_session() -> AsyncGenerator[AsyncSession, None]: + """Create async database session with table lifecycle management.""" + settings = get_settings() + engine = create_async_engine(settings.database_url, echo=False) + + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + + async_session_maker = async_sessionmaker(engine, class_=AsyncSession) + async with async_session_maker() as session: + try: + yield session + finally: + await session.rollback() + + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.drop_all) + await engine.dispose() + +@pytest.fixture +async def client(db_session: AsyncSession) -> AsyncGenerator[AsyncClient, None]: + """Create test client with database dependency override.""" + app.dependency_overrides[get_db] = lambda: db_session + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as ac: + yield ac + app.dependency_overrides.clear() + +# Sample data fixtures +@pytest.fixture +async def sample_store(db_session: AsyncSession) -> Store: + """Create a sample store for testing.""" + +@pytest.fixture +async def sample_product(db_session: AsyncSession) -> Product: + """Create a sample product for testing.""" + +@pytest.fixture +async def sample_calendar_120(db_session: AsyncSession) -> list[Calendar]: + """Create 120 calendar records starting from 2024-01-01.""" + +@pytest.fixture +async def sample_sales_120(...) 
-> list[SalesDaily]: + """Create 120 days of sequential sales data (quantity = day number 1-120).""" +``` + +#### Running Backtesting Tests + +```bash +# All backtesting tests (unit + integration) +uv run pytest app/features/backtesting/tests/ -v + +# Integration tests only (requires PostgreSQL) +docker-compose up -d +uv run pytest app/features/backtesting/tests/ -v -m integration + +# Unit tests only +uv run pytest app/features/backtesting/tests/ -v -m "not integration" +``` + +#### Test Coverage + +| Test File | Type | Count | Description | +|-----------|------|-------|-------------| +| `test_metrics.py` | Unit | ~20 | Metric calculations (MAE, sMAPE, WAPE, Bias) | +| `test_service.py` | Unit | ~25 | Backtesting service logic | +| `test_schemas.py` | Unit | ~15 | Pydantic schema validation | +| `test_splitter.py` | Unit | ~35 | Time series splitter strategies | +| `test_routes_integration.py` | Integration | 8 | API endpoint tests | +| `test_service_integration.py` | Integration | 8 | Service layer database tests | + ## CI/CD Integration ```yaml @@ -732,6 +826,6 @@ jobs: --- -**Last Updated:** 2025-10-29 +**Last Updated:** 2026-02-01 **Pytest Version:** 8.4.2+ **Python Version:** 3.12+ From ad4fe01bccdec52bb454b42818770109bf04f078 Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Sun, 1 Feb 2026 04:21:41 +0000 Subject: [PATCH 06/11] fix(backtesting): fix integration test fixtures and format examples - Use savepoint-based transaction isolation instead of table drop/create - Fix client dependency override to use async generator - Format example files (inspect_splits.py, metrics_demo.py) Co-Authored-By: Claude Opus 4.5 --- app/features/backtesting/tests/conftest.py | 46 ++++++++++++++-------- examples/backtest/inspect_splits.py | 4 +- examples/backtest/metrics_demo.py | 4 +- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/app/features/backtesting/tests/conftest.py b/app/features/backtesting/tests/conftest.py index 3e88053e..5c7a31df 100644 --- 
a/app/features/backtesting/tests/conftest.py +++ b/app/features/backtesting/tests/conftest.py @@ -10,7 +10,7 @@ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from app.core.config import get_settings -from app.core.database import Base, get_db +from app.core.database import get_db from app.features.backtesting.schemas import BacktestConfig, SplitConfig from app.features.data_platform.models import Calendar, Product, SalesDaily, Store from app.features.forecasting.schemas import NaiveModelConfig, SeasonalNaiveModelConfig @@ -25,32 +25,41 @@ async def db_session() -> AsyncGenerator[AsyncSession, None]: """Create async database session for integration tests. - This fixture creates all tables, provides a session, and cleans up after. + Uses savepoint-based isolation: each test runs in a transaction that is + rolled back after the test completes. Tables must already exist (via migrations). + Requires PostgreSQL to be running (docker-compose up -d). """ settings = get_settings() engine = create_async_engine(settings.database_url, echo=False) - # Create tables - async with engine.begin() as conn: - await conn.run_sync(Base.metadata.create_all) - - # Create session + # Create session factory async_session_maker = async_sessionmaker( engine, class_=AsyncSession, expire_on_commit=False, ) - async with async_session_maker() as session: - try: - yield session - finally: - await session.rollback() + # Use a connection with a transaction for isolation + async with engine.connect() as conn: + # Start an outer transaction + trans = await conn.begin() + + # Create session bound to this connection + async with async_session_maker(bind=conn) as session: + # Create a savepoint for nested transaction + nested = await conn.begin_nested() - # Cleanup: drop all tables - async with engine.begin() as conn: - await conn.run_sync(Base.metadata.drop_all) + try: + yield session + finally: + # Roll back to savepoint + if nested.is_active: + await 
nested.rollback() + + # Roll back outer transaction (cleans up all test data) + if trans.is_active: + await trans.rollback() await engine.dispose() @@ -58,7 +67,12 @@ async def db_session() -> AsyncGenerator[AsyncSession, None]: @pytest.fixture async def client(db_session: AsyncSession) -> AsyncGenerator[AsyncClient, None]: """Create test client with database dependency override.""" - app.dependency_overrides[get_db] = lambda: db_session + + # Create an async generator that yields the session + async def override_get_db() -> AsyncGenerator[AsyncSession, None]: + yield db_session + + app.dependency_overrides[get_db] = override_get_db async with AsyncClient( transport=ASGITransport(app=app), diff --git a/examples/backtest/inspect_splits.py b/examples/backtest/inspect_splits.py index dc1b37cb..5fe07f46 100644 --- a/examples/backtest/inspect_splits.py +++ b/examples/backtest/inspect_splits.py @@ -32,7 +32,7 @@ def print_splits(title: str, config: SplitConfig, dates: list[date], values: np. for split in splitter.split(dates, values): print(f"--- Fold {split.fold_index} ---") - print(f" Train: indices [{split.train_indices[0]}:{split.train_indices[-1]+1}]") + print(f" Train: indices [{split.train_indices[0]}:{split.train_indices[-1] + 1}]") print(f" dates {split.train_dates[0]} to {split.train_dates[-1]}") print(f" size {len(split.train_indices)} observations") @@ -41,7 +41,7 @@ def print_splits(title: str, config: SplitConfig, dates: list[date], values: np. 
gap_end = split.test_dates[0] - timedelta(days=1) print(f" Gap: {gap_start} to {gap_end} ({config.gap} days)") - print(f" Test: indices [{split.test_indices[0]}:{split.test_indices[-1]+1}]") + print(f" Test: indices [{split.test_indices[0]}:{split.test_indices[-1] + 1}]") print(f" dates {split.test_dates[0]} to {split.test_dates[-1]}") print(f" size {len(split.test_indices)} observations") print() diff --git a/examples/backtest/metrics_demo.py b/examples/backtest/metrics_demo.py index 95065191..15d7f6cd 100644 --- a/examples/backtest/metrics_demo.py +++ b/examples/backtest/metrics_demo.py @@ -122,7 +122,9 @@ def main(): print("Fold metrics:") for i, fm in enumerate(fold_metrics): - print(f" Fold {i}: MAE={fm['mae']}, sMAPE={fm['smape']}, WAPE={fm['wape']}, Bias={fm['bias']}") + print( + f" Fold {i}: MAE={fm['mae']}, sMAPE={fm['smape']}, WAPE={fm['wape']}, Bias={fm['bias']}" + ) aggregated, stability = calc.aggregate_fold_metrics(fold_metrics) From 34bfe76d8e680e2cd15b124d098b312f8588e10f Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Sun, 1 Feb 2026 04:24:34 +0000 Subject: [PATCH 07/11] fix(backtesting): simplify db_session fixture for CI compatibility Remove complex savepoint-based isolation that caused issues with FastAPI dependency injection. Use simpler session pattern that matches other working integration tests. Co-Authored-By: Claude Opus 4.5 --- app/features/backtesting/tests/conftest.py | 32 +++++----------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/app/features/backtesting/tests/conftest.py b/app/features/backtesting/tests/conftest.py index 5c7a31df..459dbb3d 100644 --- a/app/features/backtesting/tests/conftest.py +++ b/app/features/backtesting/tests/conftest.py @@ -25,41 +25,24 @@ async def db_session() -> AsyncGenerator[AsyncSession, None]: """Create async database session for integration tests. - Uses savepoint-based isolation: each test runs in a transaction that is - rolled back after the test completes. 
Tables must already exist (via migrations). - + Provides a session and cleans up test data; tables must already exist (via migrations). Requires PostgreSQL to be running (docker-compose up -d). """ settings = get_settings() engine = create_async_engine(settings.database_url, echo=False) - # Create session factory + # Create session async_session_maker = async_sessionmaker( engine, class_=AsyncSession, expire_on_commit=False, ) - # Use a connection with a transaction for isolation - async with engine.connect() as conn: - # Start an outer transaction - trans = await conn.begin() - - # Create session bound to this connection - async with async_session_maker(bind=conn) as session: - # Create a savepoint for nested transaction - nested = await conn.begin_nested() - - try: - yield session - finally: - # Roll back to savepoint - if nested.is_active: - await nested.rollback() - - # Roll back outer transaction (cleans up all test data) - if trans.is_active: - await trans.rollback() + async with async_session_maker() as session: + try: + yield session + finally: + await session.rollback() await engine.dispose() @@ -68,7 +51,6 @@ async def db_session() -> AsyncGenerator[AsyncSession, None]: async def client(db_session: AsyncSession) -> AsyncGenerator[AsyncClient, None]: """Create test client with database dependency override.""" - # Create an async generator that yields the session async def override_get_db() -> AsyncGenerator[AsyncSession, None]: yield db_session From 606e772452fee5e30e13f302ee21766141fad44b Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Sun, 1 Feb 2026 04:27:47 +0000 Subject: [PATCH 08/11] fix(backtesting): use unique IDs and proper cleanup for test isolation - Generate unique store codes and SKUs using UUID per test - Use merge() for calendar fixture to handle existing records - Clean up test data after each test (SalesDaily, TEST-* stores/products) - Preserve shared Calendar data between tests Co-Authored-By: Claude Opus 4.5 --- 
app/features/backtesting/tests/conftest.py | 32 +++++++++++++++------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/app/features/backtesting/tests/conftest.py b/app/features/backtesting/tests/conftest.py index 459dbb3d..3e394d5e 100644 --- a/app/features/backtesting/tests/conftest.py +++ b/app/features/backtesting/tests/conftest.py @@ -1,5 +1,6 @@ """Test fixtures for backtesting module.""" +import uuid from collections.abc import AsyncGenerator from datetime import date, timedelta from decimal import Decimal @@ -7,6 +8,7 @@ import numpy as np import pytest from httpx import ASGITransport, AsyncClient +from sqlalchemy import delete from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from app.core.config import get_settings @@ -42,7 +44,13 @@ async def db_session() -> AsyncGenerator[AsyncSession, None]: try: yield session finally: - await session.rollback() + # Clean up test data (delete in correct order due to FK constraints) + # Only delete test-specific data (with TEST- prefix) + await session.execute(delete(SalesDaily)) + await session.execute(delete(Product).where(Product.sku.like("TEST-%"))) + await session.execute(delete(Store).where(Store.code.like("TEST-%"))) + # Don't delete Calendar - it's shared and safe to keep + await session.commit() await engine.dispose() @@ -67,9 +75,10 @@ async def override_get_db() -> AsyncGenerator[AsyncSession, None]: @pytest.fixture async def sample_store(db_session: AsyncSession) -> Store: - """Create a sample store for testing.""" + """Create a sample store for testing with unique ID.""" + unique_id = uuid.uuid4().hex[:8] store = Store( - code="TEST001", + code=f"TEST-{unique_id}", name="Test Store", region="Test Region", city="Test City", @@ -83,9 +92,10 @@ async def sample_store(db_session: AsyncSession) -> Store: @pytest.fixture async def sample_product(db_session: AsyncSession) -> Product: - """Create a sample product for testing.""" + """Create a sample product for 
testing with unique ID.""" + unique_id = uuid.uuid4().hex[:8] product = Product( - sku="SKU-TEST-001", + sku=f"TEST-{unique_id}", name="Test Product", category="Test Category", brand="Test Brand", @@ -100,7 +110,10 @@ async def sample_product(db_session: AsyncSession) -> Product: @pytest.fixture async def sample_calendar_120(db_session: AsyncSession) -> list[Calendar]: - """Create 120 calendar records starting from 2024-01-01.""" + """Create 120 calendar records starting from 2024-01-01. + + Uses merge to handle existing records gracefully (idempotent). + """ start = date(2024, 1, 1) calendars = [] @@ -114,12 +127,11 @@ async def sample_calendar_120(db_session: AsyncSession) -> list[Calendar]: year=d.year, is_holiday=False, ) - calendars.append(calendar) - db_session.add(calendar) + # Use merge to handle existing records (upsert behavior) + merged = await db_session.merge(calendar) + calendars.append(merged) await db_session.commit() - for cal in calendars: - await db_session.refresh(cal) return calendars From c2765acd369b1c96eeb3c566498d699506aa5ab8 Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Sun, 1 Feb 2026 04:32:47 +0000 Subject: [PATCH 09/11] debug: add error message to test assertion Co-Authored-By: Claude Opus 4.5 --- app/features/backtesting/tests/test_routes_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/features/backtesting/tests/test_routes_integration.py b/app/features/backtesting/tests/test_routes_integration.py index efe2af33..51250625 100644 --- a/app/features/backtesting/tests/test_routes_integration.py +++ b/app/features/backtesting/tests/test_routes_integration.py @@ -49,7 +49,7 @@ async def test_run_backtest_expanding_strategy( }, ) - assert response.status_code == 200 + assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" data = response.json() assert data["store_id"] == sample_store.id From d2374e7d87ee4bffcf4945631147e381f5821c80 Mon Sep 17 00:00:00 2001 
From: "Gabe@w7dev" Date: Sun, 1 Feb 2026 04:35:05 +0000 Subject: [PATCH 10/11] fix(backtesting): remove strict=True from BacktestRequest to allow date coercion The strict=True config prevented Pydantic from automatically converting ISO date strings to date objects in JSON requests, causing 422 errors. Changed to extra="forbid" to still reject unknown fields while allowing normal type coercion. Co-Authored-By: Claude Opus 4.5 --- app/features/backtesting/schemas.py | 2 +- app/features/backtesting/tests/test_routes_integration.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/features/backtesting/schemas.py b/app/features/backtesting/schemas.py index 205f8547..537809f0 100644 --- a/app/features/backtesting/schemas.py +++ b/app/features/backtesting/schemas.py @@ -198,7 +198,7 @@ class BacktestRequest(BaseModel): config: Backtest configuration. """ - model_config = ConfigDict(strict=True) + model_config = ConfigDict(extra="forbid") store_id: int = Field(..., ge=1, description="Store ID") product_id: int = Field(..., ge=1, description="Product ID") diff --git a/app/features/backtesting/tests/test_routes_integration.py b/app/features/backtesting/tests/test_routes_integration.py index 51250625..efe2af33 100644 --- a/app/features/backtesting/tests/test_routes_integration.py +++ b/app/features/backtesting/tests/test_routes_integration.py @@ -49,7 +49,7 @@ async def test_run_backtest_expanding_strategy( }, ) - assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" + assert response.status_code == 200 data = response.json() assert data["store_id"] == sample_store.id From 51bbbfc6e69ea4789bdc3abbe7ef109d9ee7408e Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Sun, 1 Feb 2026 04:37:22 +0000 Subject: [PATCH 11/11] fix(backtesting): clean up calendar entries in test date range Delete calendar entries from 2024-01-01 to 2024-04-29 during test cleanup to prevent conflicts with other test modules that insert 
calendar records in the same date range. Co-Authored-By: Claude Opus 4.5 --- app/features/backtesting/tests/conftest.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/app/features/backtesting/tests/conftest.py b/app/features/backtesting/tests/conftest.py index 3e394d5e..3d998393 100644 --- a/app/features/backtesting/tests/conftest.py +++ b/app/features/backtesting/tests/conftest.py @@ -45,11 +45,15 @@ async def db_session() -> AsyncGenerator[AsyncSession, None]: yield session finally: # Clean up test data (delete in correct order due to FK constraints) - # Only delete test-specific data (with TEST- prefix) await session.execute(delete(SalesDaily)) await session.execute(delete(Product).where(Product.sku.like("TEST-%"))) await session.execute(delete(Store).where(Store.code.like("TEST-%"))) - # Don't delete Calendar - it's shared and safe to keep + # Clean up calendar entries in our test date range (2024-01-01 to 2024-04-29) + await session.execute( + delete(Calendar).where( + (Calendar.date >= date(2024, 1, 1)) & (Calendar.date <= date(2024, 4, 29)) + ) + ) await session.commit() await engine.dispose()