Etna-760 #96

Merged 39 commits on Sep 28, 2021
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -12,9 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Examples to TSDataset methods with doctest ([#92](https://github.com/tinkoff-ai/etna-ts/pull/92))
- WandbLogger ([#71](https://github.com/tinkoff-ai/etna-ts/pull/71))
- Pipeline ([#78](https://github.com/tinkoff-ai/etna-ts/pull/78))
- Sequence anomalies ([#96](https://github.com/tinkoff-ai/etna-ts/pull/96))

### Changed
- SklearnTransform out column names ([#99](https://github.com/tinkoff-ai/etna-ts/pull/99))
- Update EDA notebook ([#96](https://github.com/tinkoff-ai/etna-ts/pull/96))

### Fixed
- Add more obvious Exception Error for forecasting with unfitted model ([#102](https://github.com/tinkoff-ai/etna-ts/pull/102))
2 changes: 1 addition & 1 deletion README.md
@@ -97,7 +97,7 @@ ETNA documentation is available [here](https://etna-docs.netlify.app/).
[Bunin Dmitriy](https://github.com/Mr-Geekman),
[Chikov Aleksandr](https://github.com/alex-hse-repository),
[Barinov Nikita](https://github.com/diadorer),
[Romantsov Nikolay](),
[Romantsov Nikolay](https://github.com/WinstonDovlatov),
[Makhin Artem](https://github.com/Ama16),
[Denisov Vladislav](https://github.com/v-v-denisov),
[Mitskovets Ivan](https://github.com/imitskovets),
1 change: 1 addition & 0 deletions etna/analysis/__init__.py
@@ -3,6 +3,7 @@
from etna.analysis.eda_utils import sample_pacf_plot
from etna.analysis.outliers.density_outliers import get_anomalies_density
from etna.analysis.outliers.median_outliers import get_anomalies_median
from etna.analysis.outliers.sequence_outliers import get_sequence_anomalies
from etna.analysis.plotters import get_correlation_matrix
from etna.analysis.plotters import plot_anomalies
from etna.analysis.plotters import plot_anomalies_interactive
1 change: 1 addition & 0 deletions etna/analysis/outliers/__init__.py
@@ -1,2 +1,3 @@
from etna.analysis.outliers.density_outliers import get_anomalies_density
from etna.analysis.outliers.median_outliers import get_anomalies_median
from etna.analysis.outliers.sequence_outliers import get_sequence_anomalies
102 changes: 102 additions & 0 deletions etna/analysis/outliers/sequence_outliers.py
@@ -0,0 +1,102 @@
import warnings
from typing import TYPE_CHECKING
from typing import Dict
from typing import List
from typing import Tuple

import numpy as np
import pandas as pd
from saxpy.hotsax import find_discords_hotsax

if TYPE_CHECKING:
from etna.datasets import TSDataset


def get_segment_sequence_anomalies(
series: np.ndarray, num_anomalies: int = 1, anomaly_lenght: int = 15, alphabet_size: int = 3, word_lenght: int = 3
) -> List[Tuple[int, int]]:
"""Get indices of start and end of sequence outliers for one segment using SAX HOT algorithm.
Parameters
----------
series:
array to find outliers in
num_anomalies:
number of outliers to be found
anomaly_lenght:
target length of outliers
alphabet_size:
the number of letters used to encode the subsequence
word_lenght:
the number of segments into which the subsequence is divided by the PAA algorithm
Returns
-------
list of tuples with start and end indices of outliers.
"""
start_points = find_discords_hotsax(
series=series, win_size=anomaly_lenght, num_discords=num_anomalies, a_size=alphabet_size, paa_size=word_lenght
)

result = [(pt[0], pt[0] + anomaly_lenght) for pt in start_points]

return result


def get_sequence_anomalies(
ts: "TSDataset",
num_anomalies: int = 1,
anomaly_lenght: int = 15,
alphabet_size: int = 3,
word_lenght: int = 3,
in_column: str = "target",
) -> Dict[str, List[pd.Timestamp]]:
"""Find the start and end of the sequence outliers for each segment using the SAX HOT algorithm.

We use saxpy under the hood.
Repository link: https://github.com/seninp/saxpy.
Parameters
----------
ts:
TSDataset with timeseries data
num_anomalies:
number of outliers to be found
anomaly_lenght:
target length of outliers
alphabet_size:
the number of letters used to encode the subsequence
word_lenght:
the number of segments into which the subsequence is divided by the PAA algorithm
in_column:
name of the column in which to search for outliers
Returns
-------
dict of sequence outliers in format {segment_name: [outliers]}, where outliers
is a list of pd.Timestamp.
"""
segments = ts.segments
outliers_per_segment = dict()

for seg in segments:
segment_df = ts[:, seg, :][seg]
if segment_df[in_column].isnull().sum():
warnings.warn(
f"Segment {seg} contains nan-s. They will be removed when calculating outliers."
+ "Make sure this behavior is acceptable",
RuntimeWarning,
)
segment_df = segment_df.dropna().reset_index()
outliers_idxs = get_segment_sequence_anomalies(
series=segment_df[in_column].values,
num_anomalies=num_anomalies,
anomaly_lenght=anomaly_lenght,
alphabet_size=alphabet_size,
word_lenght=word_lenght,
)

timestamps = segment_df["timestamp"].values
outliers_per_segment[seg] = []
for left_bound, right_bound in outliers_idxs:
outliers_per_segment[seg].extend(timestamps[left_bound:right_bound])
return outliers_per_segment


__all__ = ["get_sequence_anomalies"]
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
@@ -11,6 +11,7 @@
from etna.transforms.log import LogTransform
from etna.transforms.outliers import DensityOutliersTransform
from etna.transforms.outliers import MedianOutliersTransform
from etna.transforms.outliers import SAXOutliersTransform
from etna.transforms.power import BoxCoxTransform
from etna.transforms.power import YeoJohnsonTransform
from etna.transforms.pytorch_forecasting import PytorchForecastingTransform
67 changes: 54 additions & 13 deletions etna/transforms/outliers.py
@@ -9,6 +9,7 @@

from etna.analysis import get_anomalies_density
from etna.analysis import get_anomalies_median
from etna.analysis import get_sequence_anomalies
from etna.datasets import TSDataset
from etna.transforms.base import Transform

@@ -19,7 +20,6 @@ class OutliersTransform(Transform, ABC):
def __init__(self, in_column: str):
"""
Create instance of OutliersTransform.

Parameters
----------
in_column:
@@ -31,12 +31,10 @@ def __init__(self, in_column: str):
def fit(self, df: pd.DataFrame) -> "OutliersTransform":
"""
Find outliers using detection method.

Parameters
----------
df:
dataframe with series to find outliers

Returns
-------
result: OutliersTransform
@@ -49,12 +47,10 @@ def fit(self, df: pd.DataFrame) -> "OutliersTransform":
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Replace found outliers with NaNs.

Parameters
----------
df:
transform in_column series of given dataframe

Returns
-------
result: pd.DataFrame
@@ -68,12 +64,10 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
@abstractmethod
def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
"""Call function for detection outliers with self parameters.

Parameters
----------
ts:
dataset to process

Returns
-------
dict of outliers:
@@ -87,7 +81,6 @@ class MedianOutliersTransform(OutliersTransform):

def __init__(self, in_column: str, window_size: int = 10, alpha: float = 3):
"""Create instance of MedianOutliersTransform.

Parameters
----------
in_column:
@@ -104,12 +97,10 @@ def __init__(self, in_column: str, window_size: int = 10, alpha: float = 3):

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
"""Call `get_anomalies_median` function with self parameters.

Parameters
----------
ts:
dataset to process

Returns
-------
dict of outliers:
@@ -130,7 +121,6 @@ def __init__(
distance_func: Callable[[float, float], float] = lambda x, y: abs(x - y),
):
"""Create instance of DensityOutliersTransform.

Parameters
----------
in_column:
@@ -153,12 +143,10 @@ def __init__(

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
"""Call `get_anomalies_density` function with self parameters.

Parameters
----------
ts:
dataset to process

Returns
-------
dict of outliers:
@@ -167,4 +155,57 @@ def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
return get_anomalies_density(ts, self.window_size, self.distance_coef, self.n_neighbors, self.distance_func)


class SAXOutliersTransform(OutliersTransform):
"""Transform that uses get_sequence_anomalies to find anomalies in data and replaces them with NaN."""

def __init__(
self,
in_column: str,
num_anomalies: int = 1,
anomaly_lenght: int = 15,
alphabet_size: int = 3,
word_lenght: int = 3,
):
"""Create instance of SAXOutliersTransform.
Parameters
----------
in_column:
name of processed column
num_anomalies:
number of outliers to be found
anomaly_lenght:
target length of outliers
alphabet_size:
the number of letters used to encode the subsequence
word_lenght:
the number of segments into which the subsequence is divided by the PAA algorithm
"""
self.in_column = in_column
self.num_anomalies = num_anomalies
self.anomaly_lenght = anomaly_lenght
self.alphabet_size = alphabet_size
self.word_lenght = word_lenght
super().__init__(in_column=self.in_column)

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
"""Call `get_sequence_anomalies` function with self parameters.
Parameters
----------
ts:
dataset to process
Returns
-------
dict of outliers:
dict of outliers in format {segment: [outliers_timestamps]}
"""
return get_sequence_anomalies(
ts=ts,
num_anomalies=self.num_anomalies,
anomaly_lenght=self.anomaly_lenght,
alphabet_size=self.alphabet_size,
word_lenght=self.word_lenght,
in_column=self.in_column,
)


__all__ = ["MedianOutliersTransform", "DensityOutliersTransform"]