Skip to content

Add plot_residuals #539

Merged
merged 9 commits into from
Feb 18, 2022
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
- Add find_change_points function ([#521](https://github.com/tinkoff-ai/etna/pull/521))
-
-
- Add plot_residuals ([#539](https://github.com/tinkoff-ai/etna/pull/539))

### Changed
- Change the way `ProphetModel` works with regressors ([#383](https://github.com/tinkoff-ai/etna/pull/383))
Expand Down
1 change: 1 addition & 0 deletions etna/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@
from etna.analysis.plotters import plot_clusters
from etna.analysis.plotters import plot_correlation_matrix
from etna.analysis.plotters import plot_forecast
from etna.analysis.plotters import plot_residuals
from etna.analysis.plotters import plot_time_series_with_change_points
134 changes: 105 additions & 29 deletions etna/analysis/plotters.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,41 @@
import math
from copy import deepcopy
from typing import TYPE_CHECKING
from typing import Callable
from typing import Dict
from typing import List
from typing import Literal
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union

import matplotlib.axes
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
import seaborn as sns

from etna.transforms import Transform

if TYPE_CHECKING:
from etna.datasets import TSDataset


def prepare_axes(segments: List[str], columns_num: int, figsize: Tuple[int, int]) -> Sequence[matplotlib.axes.Axes]:
    """Prepare a grid of axes according to segments, figure size and number of columns.

    Parameters
    ----------
    segments:
        segments to plot, only their number is used to size the grid
    columns_num:
        maximum number of columns in the grid, it is capped by the number of segments
    figsize:
        size of one subplot in inches, (width, height)

    Returns
    -------
    :
        flat array of ``rows_num * columns_num`` axes; it can be longer than ``segments``
        when the last row of the grid isn't full

    Raises
    ------
    ValueError:
        if there are no segments or ``columns_num`` isn't positive
    """
    segments_number = len(segments)
    # fail early with a clear message instead of an opaque ZeroDivisionError below
    if segments_number == 0:
        raise ValueError("There should be at least one segment to plot")
    if columns_num < 1:
        raise ValueError("Parameter columns_num should be positive")

    columns_num = min(columns_num, segments_number)
    rows_num = math.ceil(segments_number / columns_num)

    # figsize is given per subplot, so scale it up to the size of the whole grid
    grid_figsize = (figsize[0] * columns_num, figsize[1] * rows_num)
    # squeeze=False makes subplots return a 2D array of axes even for a 1x1 grid
    _, ax = plt.subplots(rows_num, columns_num, figsize=grid_figsize, constrained_layout=True, squeeze=False)
    return ax.ravel()


def plot_forecast(
forecast_ts: "TSDataset",
test_ts: Optional["TSDataset"] = None,
Expand Down Expand Up @@ -49,13 +67,8 @@ def plot_forecast(
"""
if not segments:
segments = list(set(forecast_ts.columns.get_level_values("segment")))
segments_number = len(segments)
columns_num = min(columns_num, len(segments))
rows_num = math.ceil(segments_number / columns_num)

figsize = (figsize[0] * columns_num, figsize[1] * rows_num)
_, ax = plt.subplots(rows_num, columns_num, figsize=figsize, constrained_layout=True)
ax = np.array([ax]).ravel()
ax = prepare_axes(segments=segments, columns_num=columns_num, figsize=figsize)

if train_ts is not None:
train_ts.df.sort_values(by="timestamp", inplace=True)
Expand Down Expand Up @@ -124,17 +137,12 @@ def plot_backtest(
if not segments:
segments = sorted(ts.segments)
df = ts.df
segments_number = len(segments)
columns_num = min(columns_num, len(segments))
rows_num = math.ceil(segments_number / columns_num)

ax = prepare_axes(segments=segments, columns_num=columns_num, figsize=figsize)

if not folds:
folds = sorted(set(forecast_df[segments[0]]["fold_number"]))

figsize = (figsize[0] * columns_num, figsize[1] * rows_num)
_, ax = plt.subplots(rows_num, columns_num, figsize=figsize, constrained_layout=True)
ax = np.array([ax]).ravel()

forecast_start = forecast_df.index.min()
history_df = df[df.index < forecast_start]
backtest_df = df[df.index >= forecast_start]
Expand Down Expand Up @@ -317,23 +325,17 @@ def plot_anomalies(
TSDataset of timeseries that was used to detect anomalies
anomaly_dict:
dictionary derived from anomaly detection function
segments: list of str, optional
segments:
segments to plot
columns_num: int
columns_num:
number of subplots columns
figsize:
size of the figure per subplot with one segment in inches
"""
if not segments:
segments = sorted(ts.segments)

segments_number = len(segments)
columns_num = min(columns_num, len(segments))
rows_num = math.ceil(segments_number / columns_num)

figsize = (figsize[0] * columns_num, figsize[1] * rows_num)
_, ax = plt.subplots(rows_num, columns_num, figsize=figsize, constrained_layout=True)
ax = np.array([ax]).ravel()
ax = prepare_axes(segments=segments, columns_num=columns_num, figsize=figsize)

for i, segment in enumerate(segments):
segment_df = ts[:, segment, :][segment]
Expand Down Expand Up @@ -563,13 +565,7 @@ def plot_time_series_with_change_points(
if not segments:
segments = sorted(ts.segments)

segments_number = len(segments)
columns_num = min(columns_num, len(segments))
rows_num = math.ceil(segments_number / columns_num)

figsize = (figsize[0] * columns_num, figsize[1] * rows_num)
_, ax = plt.subplots(rows_num, columns_num, figsize=figsize, constrained_layout=True)
ax = np.array([ax]).ravel()
ax = prepare_axes(segments=segments, columns_num=columns_num, figsize=figsize)

for i, segment in enumerate(segments):
segment_df = ts[:, segment, :][segment]
Expand All @@ -593,3 +589,83 @@ def plot_time_series_with_change_points(

ax[i].set_title(segment)
ax[i].tick_params("x", rotation=45)


def plot_residuals(
    forecast_df: pd.DataFrame,
    ts: "TSDataset",
    feature: Union[str, Literal["timestamp"]] = "timestamp",
    transforms: Sequence[Transform] = (),
    segments: Optional[List[str]] = None,
    columns_num: int = 2,
    figsize: Tuple[int, int] = (10, 5),
):
    """Plot residuals for predictions from backtest against some feature.

    Residuals are computed per segment as ``y_true - y_pred`` on the timestamps of ``forecast_df``
    and drawn as a scatter plot, one subplot per segment.

    Parameters
    ----------
    forecast_df:
        forecasted dataframe with timeseries data
    ts:
        dataset of timeseries that was used for backtest
    feature:
        feature name to draw against residuals, if "timestamp" plot residuals against the timestamp
    transforms:
        sequence of transforms to get feature column
    segments:
        segments to use, all segments of ``ts`` in sorted order if not set
    columns_num:
        number of columns in subplots
    figsize:
        size of the figure per subplot with one segment in inches

    Raises
    ------
    ValueError:
        if feature isn't present in the dataset after applying transformations

    Notes
    -----
    Parameter `transforms` is necessary because some pipelines don't save features in their forecasts,
    e.g. `etna.ensembles` pipelines.
    """
    if not segments:
        segments = sorted(ts.segments)

    # transforms are applied to a deep copy, the given dataset object itself isn't transformed
    ts_copy = deepcopy(ts)
    ts_copy.fit_transform(transforms=transforms)
    df = ts_copy.to_pandas()

    # check the feature before any figure is created, so a failure doesn't leave an empty figure behind
    if feature != "timestamp":
        all_features = set(df.columns.get_level_values("feature"))
        if feature not in all_features:
            raise ValueError("Given feature isn't present in the dataset after applying transformations")

    ax = prepare_axes(segments=segments, columns_num=columns_num, figsize=figsize)

    # the grid can hold more axes than segments, zip stops at the last segment
    for segment_ax, segment in zip(ax, segments):
        # true values and features are taken only on the forecasted timestamps
        segment_df = df.loc[forecast_df.index, pd.IndexSlice[segment, :]][segment].reset_index()
        segment_forecast_df = forecast_df.loc[:, pd.IndexSlice[segment, :]][segment].reset_index()
        segment_df.rename(columns={"target": "y_true"}, inplace=True)
        # both frames are indexed by forecast_df.index, so the values are aligned by position
        segment_df["y_pred"] = segment_forecast_df["target"].values

        residuals = (segment_df["y_true"] - segment_df["y_pred"]).values
        feature_values = segment_df[feature].values

        # highlight different backtest folds
        if feature == "timestamp":
            folds = sorted(set(segment_forecast_df["fold_number"]))
            for fold_number in folds:
                fold_timestamps = segment_forecast_df.loc[
                    segment_forecast_df["fold_number"] == fold_number, "timestamp"
                ]
                # even folds are shaded, odd folds get zero alpha, so neighbouring folds alternate
                segment_ax.axvspan(
                    fold_timestamps.min(),
                    fold_timestamps.max(),
                    alpha=0.15 * (int(fold_number + 1) % 2),
                    color="skyblue",
                )

        segment_ax.scatter(feature_values, residuals, c="b")

        segment_ax.set_title(segment)
        segment_ax.tick_params("x", rotation=45)
        segment_ax.set_xlabel(feature)
17 changes: 17 additions & 0 deletions tests/test_analysis/test_plotters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import pytest

from etna.analysis import plot_residuals
from etna.metrics import MAE
from etna.models import LinearPerSegmentModel
from etna.pipeline import Pipeline
from etna.transforms import LagTransform


def test_plot_residuals_fails_unkown_feature(example_tsdf):
    """Check that plot_residuals raises ValueError for a feature missing from the dataset."""
    lag_transform = LagTransform(in_column="target", lags=[5, 6, 7])
    pipeline = Pipeline(model=LinearPerSegmentModel(), transforms=[lag_transform], horizon=5)
    # only the forecast is needed here, metrics and fold info are ignored
    _, forecast_df, _ = pipeline.backtest(ts=example_tsdf, metrics=[MAE()], n_folds=3)
    with pytest.raises(ValueError, match="Given feature isn't present in the dataset"):
        plot_residuals(forecast_df=forecast_df, ts=example_tsdf, feature="unkown_feature")