Skip to content

Clean up after regressors-2.0 #572

Merged
merged 9 commits into from
Mar 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Rename `_SARIMAXModel` and `_ProphetModel`, make `SARIMAXModel` and `ProphetModel` inherit from `PerSegmentPredictionIntervalModel` ([#549](https://github.com/tinkoff-ai/etna/pull/549))
-
- Make detrending polynomial ([#566](https://github.com/tinkoff-ai/etna/pull/566))
- Update documentation about transforms that generate regressors, update examples with them ([#572](https://github.com/tinkoff-ai/etna/pull/572))
-
- Make `LabelEncoderTransform` and `OneHotEncoderTransform` multi-segment ([#554](https://github.com/tinkoff-ai/etna/pull/554))
### Fixed
Expand Down
18 changes: 9 additions & 9 deletions etna/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,19 +50,19 @@ def duplicate_data(df: pd.DataFrame, segments: Sequence[str], format: str = Data
... )
>>> timestamp = pd.date_range("2020-03-10", periods=100, freq="D")
>>> is_friday_13 = (timestamp.weekday == 4) & (timestamp.day == 13)
>>> df_exog_raw = pd.DataFrame({"timestamp": timestamp, "regressor_is_friday_13": is_friday_13})
>>> df_exog_raw = pd.DataFrame({"timestamp": timestamp, "is_friday_13": is_friday_13})
>>> df_exog = duplicate_data(df_exog_raw, segments=["segment_0", "segment_1"], format="wide")
>>> df_ts_format = TSDataset.to_dataset(df)
>>> ts = TSDataset(df=df_ts_format, df_exog=df_exog, freq="D")
>>> ts = TSDataset(df=df_ts_format, df_exog=df_exog, freq="D", known_future="all")
>>> ts.head()
segment segment_0 segment_1
feature regressor_is_friday_13 target regressor_is_friday_13 target
segment segment_0 segment_1
feature is_friday_13 target is_friday_13 target
timestamp
2020-03-10 False 1.0 False 1.0
2020-03-11 False 1.0 False 1.0
2020-03-12 False 1.0 False 1.0
2020-03-13 True 1.0 True 1.0
2020-03-14 False 1.0 False 1.0
2020-03-10 False 1.00 False 1.00
2020-03-11 False 1.00 False 1.00
2020-03-12 False 1.00 False 1.00
2020-03-13 True 1.00 True 1.00
2020-03-14 False 1.00 False 1.00
"""
# check segments length
if len(segments) == 0:
Expand Down
4 changes: 2 additions & 2 deletions etna/loggers/wandb_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def log_backtest_metrics(
from etna.analysis import plot_backtest_interactive
from etna.datasets import TSDataset

summary = dict()
summary: Dict[str, Any] = dict()
if self.table:
summary["metrics"] = wandb.Table(data=metrics_df)
summary["forecast"] = wandb.Table(data=TSDataset.to_flatten(forecast_df))
Expand Down Expand Up @@ -158,7 +158,7 @@ def log_backtest_run(self, metrics: pd.DataFrame, forecast: pd.DataFrame, test:
columns_name = list(metrics.columns)
metrics = metrics.reset_index()
metrics.columns = ["segment"] + columns_name
summary = dict()
summary: Dict[str, Any] = dict()
if self.table:
summary["metrics"] = wandb.Table(data=metrics)
summary["forecast"] = wandb.Table(data=TSDataset.to_flatten(forecast))
Expand Down
4 changes: 2 additions & 2 deletions etna/transforms/decomposition/trend.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:


class _TrendTransform(PerSegmentWrapper):
"""_TrendTransform adds trend as a feature. Creates column 'regressor_<in_column>_trend'."""
"""_TrendTransform adds trend as a feature. Creates column '<in_column>_trend'."""

def __init__(
self,
Expand Down Expand Up @@ -150,7 +150,7 @@ def __init__(
name of column to apply transform to
out_column:
name of added column.
If not given, use '{self.__repr__()}'
If not given, use `self.__repr__()`
detrend_model:
model to get trend in data
model:
Expand Down
6 changes: 3 additions & 3 deletions etna/transforms/encoders/mean_segment_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@


class MeanSegmentEncoderTransform(Transform, FutureMixin):
"""Makes expanding mean target encoding of the segment. Creates column 'regressor_segment_mean'."""
"""Makes expanding mean target encoding of the segment. Creates column 'segment_mean'."""

idx = pd.IndexSlice

def __init__(self):
self.mean_encoder = MeanTransform(in_column="target", window=-1, out_column="regressor_segment_mean")
self.mean_encoder = MeanTransform(in_column="target", window=-1, out_column="segment_mean")
self.global_means: np.ndarray[float] = None

def fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform":
Expand Down Expand Up @@ -48,5 +48,5 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
df = self.mean_encoder.transform(df)
segment = df.columns.get_level_values("segment").unique()[0]
nan_timestamps = df[df.loc[:, self.idx[segment, "target"]].isna()].index
df.loc[nan_timestamps, self.idx[:, "regressor_segment_mean"]] = self.global_means
df.loc[nan_timestamps, self.idx[:, "segment_mean"]] = self.global_means
return df
6 changes: 2 additions & 4 deletions etna/transforms/encoders/segment_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@


class SegmentEncoderTransform(Transform, FutureMixin):
"""Encode segment label to categorical. Creates column 'regressor_segment_code'."""
"""Encode segment label to categorical. Creates column 'segment_code'."""

idx = pd.IndexSlice

Expand Down Expand Up @@ -47,9 +47,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
encoded_matrix = encoded_matrix.reshape(len(self._le.classes_), -1).repeat(len(df), axis=1).T
encoded_df = pd.DataFrame(
encoded_matrix,
columns=pd.MultiIndex.from_product(
[self._le.classes_, ["regressor_segment_code"]], names=("segment", "feature")
),
columns=pd.MultiIndex.from_product([self._le.classes_, ["segment_code"]], names=("segment", "feature")),
index=df.index,
)
encoded_df = encoded_df.astype("category")
Expand Down
2 changes: 1 addition & 1 deletion etna/transforms/math/add_constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def __init__(self, in_column: str, value: float, inplace: bool = True, out_colum
inplace:
if True, apply add constant transformation inplace to in_column, if False, add transformed column to dataset
out_column:
name of added column. If not given, use self.__repr__()
name of added column. If not given, use `self.__repr__()`
"""
self.in_column = in_column
self.value = value
Expand Down
20 changes: 6 additions & 14 deletions etna/transforms/math/differencing.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,8 @@ def __init__(
inplace:
if True, apply transformation inplace to in_column, if False, add transformed column to dataset
out_column:
if set, name of added column, the final name will be '{out_column}',
don't forget to add 'regressor_' prefix
if isn't set, name will be based on `self.__repr__`
if set, name of added column, the final name will be '{out_column}';
if not set, the name will be based on `self.__repr__()`

Raises
------
Expand All @@ -63,10 +62,7 @@ def __init__(

def _get_column_name(self) -> str:
if self.out_column is None:
prefix = ""
if self.in_column.startswith("regressor_"):
prefix = "regressor_"
return f"{prefix}{self.__repr__()}"
return self.__repr__()
else:
return self.out_column

Expand Down Expand Up @@ -264,9 +260,8 @@ def __init__(
inplace:
if True, apply transformation inplace to in_column, if False, add transformed column to dataset
out_column:
if set, name of added column, the final name will be '{out_column}',
don't forget to add 'regressor_' prefix
if isn't set, name will be based on `self.__repr__`
if set, name of added column, the final name will be '{out_column}';
if not set, the name will be based on `self.__repr__()`

Raises
------
Expand Down Expand Up @@ -307,10 +302,7 @@ def _get_column_name(self) -> str:
if self.inplace:
return self.in_column
if self.out_column is None:
prefix = ""
if self.in_column.startswith("regressor_"):
prefix = "regressor_"
return f"{prefix}{self.__repr__()}"
return self.__repr__()
else:
return self.out_column

Expand Down
6 changes: 3 additions & 3 deletions etna/transforms/math/lags.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(self, in_column: str, lags: Union[List[int], int], out_column: Opti
def _get_column_name(self, lag: int) -> str:
if self.out_column is None:
temp_transform = LagTransform(in_column=self.in_column, out_column=self.out_column, lags=[lag])
return f"regressor_{temp_transform.__repr__()}"
return repr(temp_transform)
else:
return f"{self.out_column}_{lag}"

Expand Down Expand Up @@ -56,8 +56,8 @@ def __init__(self, in_column: str, lags: Union[List[int], int], out_column: Opti
int value or list of values for lags computation; if int, generate range of lags from 1 to given value
out_column:
base for the name of created columns;
if set the final name is '{out_column}_{lag_number}', don't forget to add 'regressor_' prefix if necessary;
if don't set, name will be 'regressor_{transform.__repr__()}',
if set, the final name is '{out_column}_{lag_number}';
if not set, the name will be `transform.__repr__()`,
repr will be made for transform that creates exactly this column

Raises
Expand Down
4 changes: 2 additions & 2 deletions etna/transforms/math/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __init__(self, in_column: str, base: int = 10, inplace: bool = True, out_col
inplace:
if True, apply logarithm transformation inplace to in_column, if False, add transformed column to dataset.
out_column:
name of added column. If not given, use self.__repr__()
name of added column. If not given, use `self.__repr__()`
"""
self.base = base
self.in_column = in_column
Expand Down Expand Up @@ -103,7 +103,7 @@ def __init__(self, in_column: str, base: int = 10, inplace: bool = True, out_col
if True, apply logarithm transformation inplace to in_column,
if False, add transformed column to dataset
out_column:
name of added column. If not given, use self.__repr__()
name of added column. If not given, use `self.__repr__()`
"""
self.in_column = in_column
self.base = base
Expand Down
4 changes: 2 additions & 2 deletions etna/transforms/math/power.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(
if True, apply transformation inplace to in_column,
if False, add column to dataset.
out_column:
base for the names of generated columns, uses self.__repr__() if not given.
base for the names of generated columns, uses `self.__repr__()` if not given.
standardize:
Set to True to apply zero-mean, unit-variance normalization to the
transformed output.
Expand Down Expand Up @@ -84,7 +84,7 @@ def __init__(
if True, apply transformation inplace to in_column,
if False, add column to dataset.
out_column:
base for the names of generated columns, uses self.__repr__() if not given.
base for the names of generated columns, uses `self.__repr__()` if not given.
standardize:
Set to True to apply zero-mean, unit-variance normalization to the
transformed output.
Expand Down
8 changes: 4 additions & 4 deletions etna/transforms/math/scalers.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(
inplace:
features are replaced by their scaled values.
out_column:
base for the names of generated columns, uses self.__repr__() if not given.
base for the names of generated columns, uses `self.__repr__()` if not given.
with_mean:
if True, center the data before scaling.
with_std:
Expand Down Expand Up @@ -100,7 +100,7 @@ def __init__(
inplace:
features are replaced by their scaled values.
out_column:
base for the names of generated columns, uses self.__repr__() if not given.
base for the names of generated columns, uses `self.__repr__()` if not given.
with_centering:
if True, center the data before scaling.
with_scaling:
Expand Down Expand Up @@ -171,7 +171,7 @@ def __init__(
inplace:
features are replaced by their scaled values.
out_column:
base for the names of generated columns, uses self.__repr__() if not given.
base for the names of generated columns, uses `self.__repr__()` if not given.
feature_range:
desired range of transformed data.
clip:
Expand Down Expand Up @@ -224,7 +224,7 @@ def __init__(
inplace:
features are replaced by their scaled values.
out_column:
base for the names of generated columns, uses self.__repr__() if not given.
base for the names of generated columns, uses `self.__repr__()` if not given.
mode:
"macro" or "per-segment", way to transform features over segments.
If "macro", transforms features globally, gluing the corresponding ones for all segments.
Expand Down
4 changes: 2 additions & 2 deletions etna/transforms/math/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(
inplace:
features are replaced by their transformed values.
out_column:
base for the names of generated columns, uses self.__repr__() if not given.
base for the names of generated columns, uses `self.__repr__()` if not given.
mode:
"macro" or "per-segment", way to transform features over segments.
If "macro", transforms features globally, gluing the corresponding ones for all segments.
Expand Down Expand Up @@ -74,7 +74,7 @@ def _get_column_name(self, in_column: str) -> str:
if self.out_column is None:
new_transform = deepcopy(self)
new_transform.in_column = [in_column]
return f"{new_transform.__repr__()}"
return repr(new_transform)
else:
return f"{self.out_column}_{in_column}"

Expand Down
14 changes: 7 additions & 7 deletions etna/transforms/math/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def __init__(
fillna: float
value to fill results NaNs with
out_column: str, optional
result column name. If not given use __repr__()
result column name. If not given use `self.__repr__()`
"""
self.window = window
self.in_column = in_column
Expand Down Expand Up @@ -213,7 +213,7 @@ def __init__(
fillna: float
value to fill results NaNs with
out_column: str, optional
result column name. If not given use __repr__()
result column name. If not given use `self.__repr__()`
"""
self.in_column = in_column
self.window = window
Expand Down Expand Up @@ -267,7 +267,7 @@ def __init__(
fillna: float
value to fill results NaNs with
out_column: str, optional
result column name. If not given use __repr__()
result column name. If not given use `self.__repr__()`
"""
self.in_column = in_column
self.quantile = quantile
Expand Down Expand Up @@ -319,7 +319,7 @@ def __init__(
fillna: float
value to fill results NaNs with
out_column: str, optional
result column name. If not given use __repr__()
result column name. If not given use `self.__repr__()`
"""
self.in_column = in_column
self.window = window
Expand Down Expand Up @@ -370,7 +370,7 @@ def __init__(
fillna: float
value to fill results NaNs with
out_column: str, optional
result column name. If not given use __repr__()
result column name. If not given use `self.__repr__()`
"""
self.in_column = in_column
self.window = window
Expand Down Expand Up @@ -421,7 +421,7 @@ def __init__(
fillna: float
value to fill results NaNs with
out_column: str, optional
result column name. If not given use __repr__()
result column name. If not given use `self.__repr__()`
"""
self.in_column = in_column
self.window = window
Expand Down Expand Up @@ -472,7 +472,7 @@ def __init__(
fillna: float
value to fill results NaNs with
out_column: str, optional
result column name. If not given use __repr__()
result column name. If not given use `self.__repr__()`
"""
self.in_column = in_column
self.window = window
Expand Down
2 changes: 1 addition & 1 deletion etna/transforms/missing_values/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(self, in_column: str, distribution_column: str, inplace: bool, out_
inplace:
if True, apply resampling inplace to in_column, if False, add transformed column to dataset
out_column:
name of added column. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
name of added column. If not given, use `self.__repr__()`
"""
self.in_column = in_column
self.distribution_column = distribution_column
Expand Down
5 changes: 2 additions & 3 deletions etna/transforms/timestamp/fourier.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,8 @@ def __init__(
and sin and cos of the second order will be used,
mods should be >= 1 and < period
out_column:
if set, name of added column, the final name will be '{out_columnt}_{mod}',
don't forget to add 'regressor_' prefix;
if don't set, name will be 'regressor_{transform.repr}',
if set, name of added column, the final name will be '{out_column}_{mod}';
if not set, the name will be `transform.__repr__()`,
repr will be made for transform that creates exactly this column

Raises
Expand Down
8 changes: 4 additions & 4 deletions etna/transforms/timestamp/holiday.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


class HolidayTransform(Transform, FutureMixin):
"""HolidayTransform generates series that indicates holidays in given dataframe. Creates column 'regressor_holidays'."""
"""HolidayTransform generates series that indicates holidays in given dataframe."""

def __init__(self, iso_code: str = "RUS", out_column: Optional[str] = None):
"""
Expand All @@ -20,12 +20,12 @@ def __init__(self, iso_code: str = "RUS", out_column: Optional[str] = None):
iso_code:
internationally recognised code of the country for which we want to find the holidays
out_column:
name of added column. Use self.__repr__() if not given.
name of added column. Use `self.__repr__()` if not given.
"""
self.iso_code = iso_code
self.holidays = holidays.CountryHoliday(iso_code)
self.out_column = out_column
self.out_column = self.out_column if self.out_column is not None else f"regressor_{self.__repr__()}"
self.out_column = self.out_column if self.out_column is not None else self.__repr__()

def fit(self, df: pd.DataFrame) -> "HolidayTransform":
"""
Expand All @@ -46,7 +46,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
value series with index column in timestamp format
Returns
-------
pd.DataFrame with 'regressor_holidays' column
pd.DataFrame with added holidays
"""
if (df.index[1] - df.index[0]) > datetime.timedelta(days=1):
raise ValueError("Frequency of data should be no more than daily.")
Expand Down
Loading