Skip to content

Create notebook for Auto and Tune #1285

Merged
merged 8 commits into from
Jun 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add default `params_to_tune` for `TreeFeatureSelectionTransform`, `MRMRFeatureSelectionTransform` and `GaleShapleyFeatureSelectionTransform` ([#1250](https://github.com/tinkoff-ai/etna/pull/1250))
- Add tuning stage into `Auto.fit` ([#1272](https://github.com/tinkoff-ai/etna/pull/1272))
- Add `params_to_tune` into `Tune` init ([#1282](https://github.com/tinkoff-ai/etna/pull/1282))
- Skip duplicates during `Tune.fit`, skip duplicates in `top_k`, add AutoML notebook ([#1285](https://github.com/tinkoff-ai/etna/pull/1285))
### Fixed
-
- Fix `BaseReconciliator` to work on `pandas==1.1.5` ([#1229](https://github.com/tinkoff-ai/etna/pull/1229))
Expand Down
76 changes: 60 additions & 16 deletions etna/auto/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ def summary(self) -> pd.DataFrame:
def top_k(self, k: int = 5) -> List[BasePipeline]:
"""Get top k pipelines with the best metric value.

Only complete and non-duplicate studies are taken into account.

Parameters
----------
k:
Expand Down Expand Up @@ -149,7 +151,7 @@ def __init__(
runner:
Runner to use for distributed training. By default, :py:class:`~etna.auto.runner.local.LocalRunner` is used.
storage:
Optuna storage to use. By default, sqlite storage is used.
Optuna storage to use. By default, sqlite storage is used with name "etna-auto.db".
metrics:
List of metrics to compute.
By default, :py:class:`~etna.metrics.metrics.Sign`, :py:class:`~etna.metrics.metrics.SMAPE`,
Expand All @@ -174,7 +176,8 @@ def __init__(

def _top_k(self, summary: pd.DataFrame, k: int) -> List[BasePipeline]:
metric_name = f"{self.target_metric.name}_{self.metric_aggregation}"
df = summary[~summary[metric_name].isna()]
df = summary[summary["state"].apply(lambda x: x is optuna.structs.TrialState.COMPLETE)]
df = df.drop_duplicates(subset=["hash"])
df = df.sort_values(
by=metric_name,
ascending=(not self.target_metric.greater_is_better),
Expand All @@ -184,6 +187,8 @@ def _top_k(self, summary: pd.DataFrame, k: int) -> List[BasePipeline]:
def top_k(self, k: int = 5) -> List[BasePipeline]:
"""Get top k pipelines with the best metric value.

Only complete and non-duplicate studies are taken into account.

Parameters
----------
k:
Expand Down Expand Up @@ -558,6 +563,14 @@ def _make_tune_summary(self, trials: List[FrozenTrial], pipeline: BasePipeline)
def summary(self) -> pd.DataFrame:
"""Get Auto trials summary.

There are columns:

- hash: hash of the pipeline;
- pipeline: pipeline object;
- metrics: columns with metrics' values;
- state: state of the trial;
- study: name of the study in which trial was made.

Returns
-------
study_dataframe:
Expand All @@ -582,6 +595,8 @@ class Tune(AutoBase):
"""Automatic tuning of custom pipeline.

This class takes the given pipeline and tries to optimize its hyperparameters by using `params_to_tune`.

Trials with duplicate parameters are skipped and previously computed results are returned.
"""

def __init__(
Expand Down Expand Up @@ -618,7 +633,7 @@ def __init__(
runner:
Runner to use for distributed training. By default, :py:class:`~etna.auto.runner.local.LocalRunner` is used.
storage:
Optuna storage to use. By default, sqlite storage is used.
Optuna storage to use. By default, sqlite storage is used with name "etna-auto.db".
metrics:
List of metrics to compute.
By default, :py:class:`~etna.metrics.metrics.Sign`, :py:class:`~etna.metrics.metrics.SMAPE`,
Expand All @@ -642,7 +657,7 @@ def __init__(
)
self.pipeline = pipeline
if sampler is None:
self.sampler: BaseSampler = TPESampler()
self.sampler: BaseSampler = TPESampler(seed=0)
else:
self.sampler = sampler
if params_to_tune is None:
Expand Down Expand Up @@ -760,6 +775,18 @@ def objective(
CategoricalDistribution: lambda x: ("suggest_categorical", {"choices": x.choices}),
}

def _find_duplicate_trial(trial: Trial, pipeline: BasePipeline) -> Optional[FrozenTrial]:
    """Look through the study of ``trial`` for a completed trial with the same pipeline hash.

    The hash is computed from the pipeline's dict config; only trials in the
    COMPLETE state are considered. Returns the first matching trial, or ``None``
    if no duplicate exists.
    """
    target_hash = config_hash(pipeline.to_dict())

    # Lazily filter down to completed trials, then pick the first hash match.
    completed_trials = (
        candidate
        for candidate in trial.study.trials
        if candidate.state == optuna.structs.TrialState.COMPLETE
    )
    return next(
        (candidate for candidate in completed_trials if candidate.user_attrs.get("hash") == target_hash),
        None,
    )

def _objective(trial: Trial) -> float:
# using received optuna.distribution objects to call corresponding trial.suggest_xxx
params_suggested = {}
Expand All @@ -771,23 +798,33 @@ def _objective(trial: Trial) -> float:
# create pipeline instance with the parameters to try
pipeline_trial_params: BasePipeline = pipeline.set_params(**params_suggested)

if initializer is not None:
initializer(pipeline=pipeline_trial_params)
duplicate_trial = _find_duplicate_trial(trial, pipeline_trial_params)
if duplicate_trial is not None:
for param_name, param_value in duplicate_trial.user_attrs.items():
trial.set_user_attr(param_name, param_value)

metrics_df, forecast_df, fold_info_df = pipeline_trial_params.backtest(
ts, metrics=metrics, **backtest_params
)
metric_value = trial.user_attrs[f"{target_metric.name}_{metric_aggregation}"]
return metric_value

if callback is not None:
callback(metrics_df=metrics_df, forecast_df=forecast_df, fold_info_df=fold_info_df)
else:
if initializer is not None:
initializer(pipeline=pipeline_trial_params)

trial.set_user_attr("pipeline", pipeline_trial_params.to_dict())
metrics_df, forecast_df, fold_info_df = pipeline_trial_params.backtest(
ts, metrics=metrics, **backtest_params
)

aggregated_metrics = aggregate_metrics_df(metrics_df)
for metric in aggregated_metrics:
trial.set_user_attr(metric, aggregated_metrics[metric])
if callback is not None:
callback(metrics_df=metrics_df, forecast_df=forecast_df, fold_info_df=fold_info_df)

return aggregated_metrics[f"{target_metric.name}_{metric_aggregation}"]
trial.set_user_attr("pipeline", pipeline_trial_params.to_dict())
trial.set_user_attr("hash", config_hash(pipeline_trial_params.to_dict()))

aggregated_metrics = aggregate_metrics_df(metrics_df)
for metric in aggregated_metrics:
trial.set_user_attr(metric, aggregated_metrics[metric])

return aggregated_metrics[f"{target_metric.name}_{metric_aggregation}"]

return _objective

Expand Down Expand Up @@ -825,6 +862,13 @@ def _summary(self, trials: List[FrozenTrial]) -> List[dict]:
def summary(self) -> pd.DataFrame:
"""Get trials summary.

There are columns:

- hash: hash of the pipeline;
- pipeline: pipeline object;
- metrics: columns with metrics' values;
- state: state of the trial.

Returns
-------
study_dataframe:
Expand Down
79 changes: 62 additions & 17 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,20 @@ We have prepared a set of tutorials for an easy introduction:
- Change points plot
- Interactive change points plot

#### 04. [Outliers](https://github.com/tinkoff-ai/etna/tree/master/examples/outliers.ipynb)
- Point outliers
- Median method
- Density method
- Prediction interval method
- Histogram method
- Sequence outliers
- Interactive visualization
- Outliers imputation
#### 04. [Regressors and exogenous data](https://github.com/tinkoff-ai/etna/tree/master/examples/exogenous_data.ipynb)
- What is regressor?
- What is exogenous data?
- Dataset
- Loading Dataset
- EDA
- Forecast with regressors

#### 05. [Clustering](https://github.com/tinkoff-ai/etna/tree/master/examples/clustering.ipynb)
- Clustering pipeline
- Custom Distance
- Visualisation
#### 05. [Custom model and transform](https://github.com/tinkoff-ai/etna/tree/master/examples/custom_transform_and_model.ipynb)
- What is Transform and how it works
- Custom Transform
- Per-segment Custom Transform
- Multi-segment Custom Transform
- Custom Model

#### 06. [Deep learning models](https://github.com/tinkoff-ai/etna/tree/master/examples/NN_examples.ipynb)
- Creating TSDataset
Expand All @@ -54,18 +54,63 @@ We have prepared a set of tutorials for an easy introduction:
- VotingEnsemble
- StackingEnsemble

#### 08. [Auto](https://github.com/tinkoff-ai/etna/tree/master/examples/auto.py)
#### 08. [Outliers](https://github.com/tinkoff-ai/etna/tree/master/examples/outliers.ipynb)
- Point outliers
- Median method
- Density method
- Prediction interval method
- Histogram method
- Sequence outliers
- Interactive visualization
- Outliers imputation

#### 09. [Forecasting strategies](https://github.com/tinkoff-ai/etna/tree/master/examples/forecasting_strategies.ipynb)
- Imports and constants
- Load dataset
- Recursive strategy
- AutoRegressivePipeline
- Direct strategy
- Pipeline
- DirectEnsemble
- assemble_pipelines + DirectEnsemble
- Summary

#### 10. [Forecast interpretation](https://github.com/tinkoff-ai/etna/tree/master/examples/forecast_interpretation.ipynb)
- Forecast decomposition
- CatBoost
- SARIMAX
- BATS
- In-sample and out-of-sample decomposition
- Accessing target components
- Regressors relevance
- Feature relevance
- Components relevance

#### 11. [Clustering](https://github.com/tinkoff-ai/etna/tree/master/examples/clustering.ipynb)
- Clustering pipeline
- Custom Distance
- Visualisation

#### 12. [AutoML script](https://github.com/tinkoff-ai/etna/tree/master/examples/auto.py)
- Auto pipeline search

#### 09. Hyperparameter search
#### 13. [AutoML notebook](https://github.com/tinkoff-ai/etna/tree/master/examples/automl.ipynb)
- Hyperparameters tuning
- How `Tune` works
- Example
- General AutoML
- How `Auto` works
- Example

#### 14. Hyperparameter search
- [Optuna](https://github.com/tinkoff-ai/etna/tree/master/examples/optuna)
- [WandB sweeps](https://github.com/tinkoff-ai/etna/tree/master/examples/wandb/sweeps) example based on [Hydra](https://hydra.cc/)

#### 10. [Inference: using saved pipeline on a new data](https://github.com/tinkoff-ai/etna/tree/master/examples/inference.ipynb)
#### 15. [Inference: using saved pipeline on new data](https://github.com/tinkoff-ai/etna/tree/master/examples/inference.ipynb)
- Fitting and saving pipeline
- Using saved pipeline on new data

#### 11. [Hierarchical time series](https://github.com/tinkoff-ai/etna/tree/master/examples/hierarchical_pipeline.ipynb)
#### 16. [Hierarchical time series](https://github.com/tinkoff-ai/etna/tree/master/examples/hierarchical_pipeline.ipynb)
- Hierarchical time series
- Hierarchical structure
- Reconciliation methods
Expand Down