Etna-760 #96

Merged 39 commits on Sep 28, 2021
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -12,9 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Examples to TSDataset methods with doctest ([#92](https://github.com/tinkoff-ai/etna-ts/pull/92))
- WandbLogger ([#71](https://github.com/tinkoff-ai/etna-ts/pull/71))
- Pipeline ([#78](https://github.com/tinkoff-ai/etna-ts/pull/78))
- Sequence anomalies ([#96](https://github.com/tinkoff-ai/etna-ts/pull/96))

### Changed
- SklearnTransform out column names ([#99](https://github.com/tinkoff-ai/etna-ts/pull/99))
- Update EDA notebook ([#96](https://github.com/tinkoff-ai/etna-ts/pull/96))

### Fixed
- Add more obvious Exception Error for forecasting with unfitted model ([#102](https://github.com/tinkoff-ai/etna-ts/pull/102))
2 changes: 1 addition & 1 deletion README.md
@@ -97,7 +97,7 @@ ETNA documentation is available [here](https://etna-docs.netlify.app/).
[Bunin Dmitriy](https://github.com/Mr-Geekman),
[Chikov Aleksandr](https://github.com/alex-hse-repository),
[Barinov Nikita](https://github.com/diadorer),
[Romantsov Nikolay](),
[Romantsov Nikolay](https://github.com/WinstonDovlatov),
[Makhin Artem](https://github.com/Ama16),
[Denisov Vladislav](https://github.com/v-v-denisov),
[Mitskovets Ivan](https://github.com/imitskovets),
1 change: 1 addition & 0 deletions etna/analysis/__init__.py
@@ -3,6 +3,7 @@
from etna.analysis.eda_utils import sample_pacf_plot
from etna.analysis.outliers.density_outliers import get_anomalies_density
from etna.analysis.outliers.median_outliers import get_anomalies_median
from etna.analysis.outliers.sequence_outliers import get_sequence_anomalies
from etna.analysis.plotters import get_correlation_matrix
from etna.analysis.plotters import plot_anomalies
from etna.analysis.plotters import plot_anomalies_interactive
1 change: 1 addition & 0 deletions etna/analysis/outliers/__init__.py
@@ -1,2 +1,3 @@
from etna.analysis.outliers.density_outliers import get_anomalies_density
from etna.analysis.outliers.median_outliers import get_anomalies_median
from etna.analysis.outliers.sequence_outliers import get_sequence_anomalies
102 changes: 102 additions & 0 deletions etna/analysis/outliers/sequence_outliers.py
@@ -0,0 +1,102 @@
import warnings
from typing import TYPE_CHECKING
from typing import Dict
from typing import List
from typing import Tuple

import numpy as np
import pandas as pd
from saxpy.hotsax import find_discords_hotsax

if TYPE_CHECKING:
from etna.datasets import TSDataset


def get_segment_sequence_anomalies(
series: np.ndarray, num_anomalies: int = 1, anomaly_lenght: int = 15, alphabet_size: int = 3, word_lenght: int = 3
) -> List[Tuple[int, int]]:
"""Get indices of start and end of sequence outliers for one segment using SAX HOT algorithm.
Parameters
----------
series:
array to find outliers in
num_anomalies:
number of outliers to be found
anomaly_lenght:
target length of outliers
alphabet_size:
the number of letters used to encode the subsequence
word_lenght:
the number of segments into which the subsequence is divided by the PAA algorithm
Returns
-------
list of tuples with start and end indices of outliers.
"""
start_points = find_discords_hotsax(
series=series, win_size=anomaly_lenght, num_discords=num_anomalies, a_size=alphabet_size, paa_size=word_lenght
)

result = [(pt[0], pt[0] + anomaly_lenght) for pt in start_points]

return result


def get_sequence_anomalies(
ts: "TSDataset",
num_anomalies: int = 1,
anomaly_lenght: int = 15,
alphabet_size: int = 3,
word_lenght: int = 3,
in_column: str = "target",
) -> Dict[str, List[pd.Timestamp]]:
"""Find the start and end of the sequence outliers for each segment using the SAX HOT algorithm.

We use saxpy under the hood.
Repository link: https://github.com/seninp/saxpy.
Parameters
----------
ts:
TSDataset with timeseries data
num_anomalies:
number of outliers to be found
anomaly_lenght:
target length of outliers
alphabet_size:
the number of letters used to encode the subsequence
word_lenght:
the number of segments into which the subsequence is divided by the PAA algorithm
in_column:
name of the column in which to search for outliers
Returns
-------
dict of sequence outliers in format {segment_name: [outliers]}, where outliers
is a list of pd.Timestamp.
"""
segments = ts.segments
outliers_per_segment = dict()

for seg in segments:
segment_df = ts[:, seg, :][seg]
if segment_df[in_column].isnull().sum():
warnings.warn(
f"Segment {seg} contains nan-s. They will be removed when calculating outliers."
+ "Make sure this behavior is acceptable",
RuntimeWarning,
)
segment_df = segment_df.dropna().reset_index()
outliers_idxs = get_segment_sequence_anomalies(
series=segment_df[in_column].values,
num_anomalies=num_anomalies,
anomaly_lenght=anomaly_lenght,
alphabet_size=alphabet_size,
word_lenght=word_lenght,
)

timestamps = segment_df["timestamp"].values
outliers_per_segment[seg] = []
for left_bound, right_bound in outliers_idxs:
outliers_per_segment[seg].extend(timestamps[left_bound:right_bound])
return outliers_per_segment


__all__ = ["get_sequence_anomalies"]
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
@@ -11,6 +11,7 @@
from etna.transforms.log import LogTransform
from etna.transforms.outliers import DensityOutliersTransform
from etna.transforms.outliers import MedianOutliersTransform
from etna.transforms.outliers import SAXOutliersTransform
from etna.transforms.power import BoxCoxTransform
from etna.transforms.power import YeoJohnsonTransform
from etna.transforms.pytorch_forecasting import PytorchForecastingTransform
67 changes: 54 additions & 13 deletions etna/transforms/outliers.py
@@ -9,6 +9,7 @@

from etna.analysis import get_anomalies_density
from etna.analysis import get_anomalies_median
from etna.analysis import get_sequence_anomalies
from etna.datasets import TSDataset
from etna.transforms.base import Transform

@@ -19,7 +20,6 @@ class OutliersTransform(Transform, ABC):
def __init__(self, in_column: str):
"""
Create instance of OutliersTransform.

Parameters
----------
in_column:
@@ -31,12 +31,10 @@ def __init__(self, in_column: str):
def fit(self, df: pd.DataFrame) -> "OutliersTransform":
"""
Find outliers using detection method.

Parameters
----------
df:
dataframe with series to find outliers

Returns
-------
result: OutliersTransform
@@ -49,12 +47,10 @@ def fit(self, df: pd.DataFrame) -> "OutliersTransform":
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Replace found outliers with NaNs.

Parameters
----------
df:
transform in_column series of given dataframe

Returns
-------
result: pd.DataFrame
@@ -68,12 +64,10 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
@abstractmethod
def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
"""Call function for detection outliers with self parameters.

Parameters
----------
ts:
dataset to process

Returns
-------
dict of outliers:
@@ -87,7 +81,6 @@ class MedianOutliersTransform(OutliersTransform):

def __init__(self, in_column: str, window_size: int = 10, alpha: float = 3):
"""Create instance of MedianOutliersTransform.

Parameters
----------
in_column:
@@ -104,12 +97,10 @@ def __init__(self, in_column: str, window_size: int = 10, alpha: float = 3):

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
"""Call `get_anomalies_median` function with self parameters.

Parameters
----------
ts:
dataset to process

Returns
-------
dict of outliers:
@@ -130,7 +121,6 @@ def __init__(
distance_func: Callable[[float, float], float] = lambda x, y: abs(x - y),
):
"""Create instance of DensityOutliersTransform.

Parameters
----------
in_column:
@@ -153,12 +143,10 @@ def __init__(

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
"""Call `get_anomalies_density` function with self parameters.

Parameters
----------
ts:
dataset to process

Returns
-------
dict of outliers:
@@ -167,4 +155,57 @@ def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
return get_anomalies_density(ts, self.window_size, self.distance_coef, self.n_neighbors, self.distance_func)


class SAXOutliersTransform(OutliersTransform):
"""Transform that uses get_sequence_anomalies to find anomalies in data and replaces them with NaN."""

def __init__(
self,
in_column: str,
num_anomalies: int = 1,
anomaly_lenght: int = 15,
alphabet_size: int = 3,
word_lenght: int = 3,
):
"""Create instance of SAXOutliersTransform.
Parameters
----------
in_column:
name of processed column
num_anomalies:
number of outliers to be found
anomaly_lenght:
target length of outliers
alphabet_size:
the number of letters used to encode the subsequence
word_lenght:
the number of segments into which the subsequence is divided by the PAA algorithm
"""
self.in_column = in_column
self.num_anomalies = num_anomalies
self.anomaly_lenght = anomaly_lenght
self.alphabet_size = alphabet_size
self.word_lenght = word_lenght
super().__init__(in_column=self.in_column)

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
"""Call `get_sequence_anomalies` function with self parameters.
Parameters
----------
ts:
dataset to process
Returns
-------
dict of outliers:
dict of outliers in format {segment: [outliers_timestamps]}
"""
return get_sequence_anomalies(
ts=ts,
num_anomalies=self.num_anomalies,
anomaly_lenght=self.anomaly_lenght,
alphabet_size=self.alphabet_size,
word_lenght=self.word_lenght,
in_column=self.in_column,
)


__all__ = ["MedianOutliersTransform", "DensityOutliersTransform"]