tinkoff-ai · alex-hse-repository · Mar 1, 2022 · Feb 25, 2022 · Feb 25, 2022 · Feb 25, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -46,6 +46,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Rename `_SARIMAXModel` and `_ProphetModel`, make `SARIMAXModel` and `ProphetModel` inherit from `PerSegmentPredictionIntervalModel` ([#549](https://github.com/tinkoff-ai/etna/pull/549))
 - 
 - Make detrending polynomial ([#566](https://github.com/tinkoff-ai/etna/pull/566))
+- Update documentation about transforms that generate regressors, update examples with them ([#572](https://github.com/tinkoff-ai/etna/pull/572))
 - 
 - Make `LabelEncoderTransform` and `OneHotEncoderTransform` multi-segment ([#554](https://github.com/tinkoff-ai/etna/pull/554))
 ### Fixed

diff --git a/etna/datasets/utils.py b/etna/datasets/utils.py
@@ -50,19 +50,19 @@ def duplicate_data(df: pd.DataFrame, segments: Sequence[str], format: str = Data
     ... )
     >>> timestamp = pd.date_range("2020-03-10", periods=100, freq="D")
     >>> is_friday_13 = (timestamp.weekday == 4) & (timestamp.day == 13)
-    >>> df_exog_raw = pd.DataFrame({"timestamp": timestamp, "regressor_is_friday_13": is_friday_13})
+    >>> df_exog_raw = pd.DataFrame({"timestamp": timestamp, "is_friday_13": is_friday_13})
     >>> df_exog = duplicate_data(df_exog_raw, segments=["segment_0", "segment_1"], format="wide")
     >>> df_ts_format = TSDataset.to_dataset(df)
-    >>> ts = TSDataset(df=df_ts_format, df_exog=df_exog, freq="D")
+    >>> ts = TSDataset(df=df_ts_format, df_exog=df_exog, freq="D", known_future="all")
     >>> ts.head()
-    segment                 segment_0                     segment_1
-    feature    regressor_is_friday_13 target regressor_is_friday_13 target
+    segment       segment_0           segment_1
+    feature    is_friday_13 target is_friday_13 target
     timestamp
-    2020-03-10                  False    1.0                  False    1.0
-    2020-03-11                  False    1.0                  False    1.0
-    2020-03-12                  False    1.0                  False    1.0
-    2020-03-13                   True    1.0                   True    1.0
-    2020-03-14                  False    1.0                  False    1.0
+    2020-03-10        False   1.00        False   1.00
+    2020-03-11        False   1.00        False   1.00
+    2020-03-12        False   1.00        False   1.00
+    2020-03-13         True   1.00         True   1.00
+    2020-03-14        False   1.00        False   1.00
     """
     # check segments length
     if len(segments) == 0:

diff --git a/etna/loggers/wandb_logger.py b/etna/loggers/wandb_logger.py
@@ -126,7 +126,7 @@ def log_backtest_metrics(
         from etna.analysis import plot_backtest_interactive
         from etna.datasets import TSDataset
 
-        summary = dict()
+        summary: Dict[str, Any] = dict()
         if self.table:
             summary["metrics"] = wandb.Table(data=metrics_df)
             summary["forecast"] = wandb.Table(data=TSDataset.to_flatten(forecast_df))
@@ -158,7 +158,7 @@ def log_backtest_run(self, metrics: pd.DataFrame, forecast: pd.DataFrame, test:
         columns_name = list(metrics.columns)
         metrics = metrics.reset_index()
         metrics.columns = ["segment"] + columns_name
-        summary = dict()
+        summary: Dict[str, Any] = dict()
         if self.table:
             summary["metrics"] = wandb.Table(data=metrics)
             summary["forecast"] = wandb.Table(data=TSDataset.to_flatten(forecast))

diff --git a/etna/transforms/decomposition/trend.py b/etna/transforms/decomposition/trend.py
@@ -82,7 +82,7 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
 
 
 class _TrendTransform(PerSegmentWrapper):
-    """_TrendTransform adds trend as a feature. Creates column 'regressor_<in_column>_trend'."""
+    """_TrendTransform adds trend as a feature. Creates column '<in_column>_trend'."""
 
     def __init__(
         self,
@@ -150,7 +150,7 @@ def __init__(
             name of column to apply transform to
         out_column:
             name of added column.
-            If not given, use '{self.__repr__()}'
+            If not given, use `self.__repr__()`
         detrend_model:
             model to get trend in data
         model:

diff --git a/etna/transforms/encoders/mean_segment_encoder.py b/etna/transforms/encoders/mean_segment_encoder.py
@@ -7,12 +7,12 @@
 
 
 class MeanSegmentEncoderTransform(Transform, FutureMixin):
-    """Makes expanding mean target encoding of the segment. Creates column 'regressor_segment_mean'."""
+    """Makes expanding mean target encoding of the segment. Creates column 'segment_mean'."""
 
     idx = pd.IndexSlice
 
     def __init__(self):
-        self.mean_encoder = MeanTransform(in_column="target", window=-1, out_column="regressor_segment_mean")
+        self.mean_encoder = MeanTransform(in_column="target", window=-1, out_column="segment_mean")
         self.global_means: np.ndarray[float] = None
 
     def fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform":
@@ -48,5 +48,5 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
         df = self.mean_encoder.transform(df)
         segment = df.columns.get_level_values("segment").unique()[0]
         nan_timestamps = df[df.loc[:, self.idx[segment, "target"]].isna()].index
-        df.loc[nan_timestamps, self.idx[:, "regressor_segment_mean"]] = self.global_means
+        df.loc[nan_timestamps, self.idx[:, "segment_mean"]] = self.global_means
         return df
diff --git a/etna/transforms/encoders/segment_encoder.py b/etna/transforms/encoders/segment_encoder.py
@@ -6,7 +6,7 @@
 
 
 class SegmentEncoderTransform(Transform, FutureMixin):
-    """Encode segment label to categorical. Creates column 'regressor_segment_code'."""
+    """Encode segment label to categorical. Creates column 'segment_code'."""
 
     idx = pd.IndexSlice
 
@@ -47,9 +47,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
         encoded_matrix = encoded_matrix.reshape(len(self._le.classes_), -1).repeat(len(df), axis=1).T
         encoded_df = pd.DataFrame(
             encoded_matrix,
-            columns=pd.MultiIndex.from_product(
-                [self._le.classes_, ["regressor_segment_code"]], names=("segment", "feature")
-            ),
+            columns=pd.MultiIndex.from_product([self._le.classes_, ["segment_code"]], names=("segment", "feature")),
             index=df.index,
         )
         encoded_df = encoded_df.astype("category")

diff --git a/etna/transforms/math/add_constant.py b/etna/transforms/math/add_constant.py
@@ -95,7 +95,7 @@ def __init__(self, in_column: str, value: float, inplace: bool = True, out_colum
         inplace:
             if True, apply add constant transformation inplace to in_column, if False, add transformed column to dataset
         out_column:
-            name of added column. If not given, use self.__repr__()
+            name of added column. If not given, use `self.__repr__()`
         """
         self.in_column = in_column
         self.value = value

diff --git a/etna/transforms/math/differencing.py b/etna/transforms/math/differencing.py
@@ -39,9 +39,8 @@ def __init__(
         inplace:
             if True, apply transformation inplace to in_column, if False, add transformed column to dataset
         out_column:
-            if set, name of added column, the final name will be '{out_column}',
-            don't forget to add 'regressor_' prefix
-            if isn't set, name will be based on `self.__repr__`
+            if set, name of added column, the final name will be '{out_column}';
+            if not set, name will be based on `self.__repr__()`
 
         Raises
         ------
@@ -63,10 +62,7 @@ def __init__(
 
     def _get_column_name(self) -> str:
         if self.out_column is None:
-            prefix = ""
-            if self.in_column.startswith("regressor_"):
-                prefix = "regressor_"
-            return f"{prefix}{self.__repr__()}"
+            return self.__repr__()
         else:
             return self.out_column
 
@@ -264,9 +260,8 @@ def __init__(
         inplace:
             if True, apply transformation inplace to in_column, if False, add transformed column to dataset
         out_column:
-            if set, name of added column, the final name will be '{out_column}',
-            don't forget to add 'regressor_' prefix
-            if isn't set, name will be based on `self.__repr__`
+            if set, name of added column, the final name will be '{out_column}';
+            if not set, name will be based on `self.__repr__()`
 
         Raises
         ------
@@ -307,10 +302,7 @@ def _get_column_name(self) -> str:
         if self.inplace:
             return self.in_column
         if self.out_column is None:
-            prefix = ""
-            if self.in_column.startswith("regressor_"):
-                prefix = "regressor_"
-            return f"{prefix}{self.__repr__()}"
+            return self.__repr__()
         else:
             return self.out_column
 

diff --git a/etna/transforms/math/lags.py b/etna/transforms/math/lags.py
@@ -28,7 +28,7 @@ def __init__(self, in_column: str, lags: Union[List[int], int], out_column: Opti
     def _get_column_name(self, lag: int) -> str:
         if self.out_column is None:
             temp_transform = LagTransform(in_column=self.in_column, out_column=self.out_column, lags=[lag])
-            return f"regressor_{temp_transform.__repr__()}"
+            return repr(temp_transform)
         else:
             return f"{self.out_column}_{lag}"
 
@@ -56,8 +56,8 @@ def __init__(self, in_column: str, lags: Union[List[int], int], out_column: Opti
             int value or list of values for lags computation; if int, generate range of lags from 1 to given value
         out_column:
             base for the name of created columns;
-            if set the final name is '{out_column}_{lag_number}', don't forget to add 'regressor_' prefix if necessary;
-            if don't set, name will be 'regressor_{transform.__repr__()}',
+            if set, the final name is '{out_column}_{lag_number}';
+            if not set, name will be `transform.__repr__()`,
             repr will be made for transform that creates exactly this column
 
         Raises

diff --git a/etna/transforms/math/log.py b/etna/transforms/math/log.py
@@ -26,7 +26,7 @@ def __init__(self, in_column: str, base: int = 10, inplace: bool = True, out_col
         inplace:
             if True, apply logarithm transformation inplace to in_column, if False, add transformed column to dataset.
         out_column:
-            name of added column. If not given, use self.__repr__()
+            name of added column. If not given, use `self.__repr__()`
         """
         self.base = base
         self.in_column = in_column
@@ -103,7 +103,7 @@ def __init__(self, in_column: str, base: int = 10, inplace: bool = True, out_col
             if True, apply logarithm transformation inplace to in_column,
             if False, add column add transformed column to dataset
         out_column:
-            name of added column. If not given, use self.__repr__()
+            name of added column. If not given, use `self.__repr__()`
         """
         self.in_column = in_column
         self.base = base

diff --git a/etna/transforms/math/power.py b/etna/transforms/math/power.py
@@ -36,7 +36,7 @@ def __init__(
             if True, apply transformation inplace to in_column,
             if False, add column to dataset.
         out_column:
-            base for the names of generated columns, uses self.__repr__() if not given.
+            base for the names of generated columns, uses `self.__repr__()` if not given.
         standardize:
             Set to True to apply zero-mean, unit-variance normalization to the
             transformed output.
@@ -84,7 +84,7 @@ def __init__(
             if True, apply transformation inplace to in_column,
             if False, add column to dataset.
         out_column:
-            base for the names of generated columns, uses self.__repr__() if not given.
+            base for the names of generated columns, uses `self.__repr__()` if not given.
         standardize:
             Set to True to apply zero-mean, unit-variance normalization to the
             transformed output.

diff --git a/etna/transforms/math/scalers.py b/etna/transforms/math/scalers.py
@@ -42,7 +42,7 @@ def __init__(
         inplace:
             features are changed by scaled.
         out_column:
-            base for the names of generated columns, uses self.__repr__() if not given.
+            base for the names of generated columns, uses `self.__repr__()` if not given.
         with_mean:
             if True, center the data before scaling.
         with_std:
@@ -100,7 +100,7 @@ def __init__(
         inplace:
             features are changed by scaled.
         out_column:
-            base for the names of generated columns, uses self.__repr__() if not given.
+            base for the names of generated columns, uses `self.__repr__()` if not given.
         with_centering:
             if True, center the data before scaling.
         with_scaling:
@@ -171,7 +171,7 @@ def __init__(
         inplace:
             features are changed by scaled.
         out_column:
-            base for the names of generated columns, uses self.__repr__() if not given.
+            base for the names of generated columns, uses `self.__repr__()` if not given.
         feature_range:
             desired range of transformed data.
         clip:
@@ -224,7 +224,7 @@ def __init__(
         inplace:
             features are changed by scaled.
         out_column:
-            base for the names of generated columns, uses self.__repr__() if not given.
+            base for the names of generated columns, uses `self.__repr__()` if not given.
         mode:
             "macro" or "per-segment", way to transform features over segments.
             If "macro", transforms features globally, gluing the corresponding ones for all segments.

diff --git a/etna/transforms/math/sklearn.py b/etna/transforms/math/sklearn.py
@@ -44,7 +44,7 @@ def __init__(
         inplace:
             features are changed by transformed.
         out_column:
-            base for the names of generated columns, uses self.__repr__() if not given.
+            base for the names of generated columns, uses `self.__repr__()` if not given.
         mode:
             "macro" or "per-segment", way to transform features over segments.
             If "macro", transforms features globally, gluing the corresponding ones for all segments.
@@ -74,7 +74,7 @@ def _get_column_name(self, in_column: str) -> str:
         if self.out_column is None:
             new_transform = deepcopy(self)
             new_transform.in_column = [in_column]
-            return f"{new_transform.__repr__()}"
+            return repr(new_transform)
         else:
             return f"{self.out_column}_{in_column}"
 

diff --git a/etna/transforms/math/statistics.py b/etna/transforms/math/statistics.py
@@ -134,7 +134,7 @@ def __init__(
         fillna: float
             value to fill results NaNs with
         out_column: str, optional
-            result column name. If not given use __repr__()
+            result column name. If not given use `self.__repr__()`
         """
         self.window = window
         self.in_column = in_column
@@ -213,7 +213,7 @@ def __init__(
         fillna: float
             value to fill results NaNs with
         out_column: str, optional
-            result column name. If not given use __repr__()
+            result column name. If not given use `self.__repr__()`
         """
         self.in_column = in_column
         self.window = window
@@ -267,7 +267,7 @@ def __init__(
         fillna: float
             value to fill results NaNs with
         out_column: str, optional
-            result column name. If not given use __repr__()
+            result column name. If not given use `self.__repr__()`
         """
         self.in_column = in_column
         self.quantile = quantile
@@ -319,7 +319,7 @@ def __init__(
         fillna: float
             value to fill results NaNs with
         out_column: str, optional
-            result column name. If not given use __repr__()
+            result column name. If not given use `self.__repr__()`
         """
         self.in_column = in_column
         self.window = window
@@ -370,7 +370,7 @@ def __init__(
         fillna: float
             value to fill results NaNs with
         out_column: str, optional
-            result column name. If not given use __repr__()
+            result column name. If not given use `self.__repr__()`
         """
         self.in_column = in_column
         self.window = window
@@ -421,7 +421,7 @@ def __init__(
         fillna: float
             value to fill results NaNs with
         out_column: str, optional
-            result column name. If not given use __repr__()
+            result column name. If not given use `self.__repr__()`
         """
         self.in_column = in_column
         self.window = window
@@ -472,7 +472,7 @@ def __init__(
         fillna: float
             value to fill results NaNs with
         out_column: str, optional
-            result column name. If not given use __repr__()
+            result column name. If not given use `self.__repr__()`
         """
         self.in_column = in_column
         self.window = window

diff --git a/etna/transforms/missing_values/resample.py b/etna/transforms/missing_values/resample.py
@@ -24,7 +24,7 @@ def __init__(self, in_column: str, distribution_column: str, inplace: bool, out_
         inplace:
             if True, apply resampling inplace to in_column, if False, add transformed column to dataset
         out_column:
-            name of added column. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
+            name of added column. If not given, use `self.__repr__()`
         """
         self.in_column = in_column
         self.distribution_column = distribution_column

diff --git a/etna/transforms/timestamp/fourier.py b/etna/transforms/timestamp/fourier.py
@@ -33,9 +33,8 @@ def __init__(
             and sin and cos of the second order will be used,
             mods should be >= 1 and < period
         out_column:
-            if set, name of added column, the final name will be '{out_columnt}_{mod}',
-            don't forget to add 'regressor_' prefix;
-            if don't set, name will be 'regressor_{transform.repr}',
+            if set, name of added column, the final name will be '{out_column}_{mod}';
+            if not set, name will be `transform.__repr__()`,
             repr will be made for transform that creates exactly this column
 
         Raises

diff --git a/etna/transforms/timestamp/holiday.py b/etna/transforms/timestamp/holiday.py
@@ -10,7 +10,7 @@
 
 
 class HolidayTransform(Transform, FutureMixin):
-    """HolidayTransform generates series that indicates holidays in given dataframe. Creates column 'regressor_holidays'."""
+    """HolidayTransform generates series that indicates holidays in given dataframe."""
 
     def __init__(self, iso_code: str = "RUS", out_column: Optional[str] = None):
         """
@@ -20,12 +20,12 @@ def __init__(self, iso_code: str = "RUS", out_column: Optional[str] = None):
         iso_code:
             internationally recognised codes, designated to country for which we want to find the holidays
         out_column:
-            name of added column. Use self.__repr__() if not given.
+            name of added column. Use `self.__repr__()` if not given.
         """
         self.iso_code = iso_code
         self.holidays = holidays.CountryHoliday(iso_code)
         self.out_column = out_column
-        self.out_column = self.out_column if self.out_column is not None else f"regressor_{self.__repr__()}"
+        self.out_column = self.out_column if self.out_column is not None else self.__repr__()
 
     def fit(self, df: pd.DataFrame) -> "HolidayTransform":
         """
@@ -46,7 +46,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
             value series with index column in timestamp format
         Returns
         -------
-            pd.DataFrame with 'regressor_holidays' column
+            pd.DataFrame with added holidays
         """
         if (df.index[1] - df.index[0]) > datetime.timedelta(days=1):
             raise ValueError("Frequency of data should be no more than daily.")