Skip to content

Create notebook for Auto and Tune #1285

Merged
merged 8 commits into from
Jun 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add default `params_to_tune` for `TreeFeatureSelectionTransform`, `MRMRFeatureSelectionTransform` and `GaleShapleyFeatureSelectionTransform` ([#1250](https://github.com/tinkoff-ai/etna/pull/1250))
- Add tuning stage into `Auto.fit` ([#1272](https://github.com/tinkoff-ai/etna/pull/1272))
- Add `params_to_tune` into `Tune` init ([#1282](https://github.com/tinkoff-ai/etna/pull/1282))
- Skip duplicates during `Tune.fit`, skip duplicates in `top_k`, add AutoML notebook ([#1285](https://github.com/tinkoff-ai/etna/pull/1285))
### Fixed
-
- Fix `BaseReconciliator` to work on `pandas==1.1.5` ([#1229](https://github.com/tinkoff-ai/etna/pull/1229))
Expand Down
76 changes: 60 additions & 16 deletions etna/auto/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ def summary(self) -> pd.DataFrame:
def top_k(self, k: int = 5) -> List[BasePipeline]:
"""Get top k pipelines with the best metric value.

Only complete and non-duplicate studies are taken into account.

Parameters
----------
k:
Expand Down Expand Up @@ -149,7 +151,7 @@ def __init__(
runner:
Runner to use for distributed training. By default, :py:class:`~etna.auto.runner.local.LocalRunner` is used.
storage:
Optuna storage to use. By default, sqlite storage is used.
Optuna storage to use. By default, sqlite storage is used with name "etna-auto.db".
metrics:
List of metrics to compute.
By default, :py:class:`~etna.metrics.metrics.Sign`, :py:class:`~etna.metrics.metrics.SMAPE`,
Expand All @@ -174,7 +176,8 @@ def __init__(

def _top_k(self, summary: pd.DataFrame, k: int) -> List[BasePipeline]:
metric_name = f"{self.target_metric.name}_{self.metric_aggregation}"
df = summary[~summary[metric_name].isna()]
df = summary[summary["state"].apply(lambda x: x is optuna.structs.TrialState.COMPLETE)]
df = df.drop_duplicates(subset=["hash"])
df = df.sort_values(
by=metric_name,
ascending=(not self.target_metric.greater_is_better),
Expand All @@ -184,6 +187,8 @@ def _top_k(self, summary: pd.DataFrame, k: int) -> List[BasePipeline]:
def top_k(self, k: int = 5) -> List[BasePipeline]:
"""Get top k pipelines with the best metric value.

Only complete and non-duplicate studies are taken into account.

Parameters
----------
k:
Expand Down Expand Up @@ -558,6 +563,14 @@ def _make_tune_summary(self, trials: List[FrozenTrial], pipeline: BasePipeline)
def summary(self) -> pd.DataFrame:
"""Get Auto trials summary.

There are columns:

- hash: hash of the pipeline;
- pipeline: pipeline object;
- metrics: columns with metrics' values;
- state: state of the trial;
- study: name of the study in which trial was made.

Returns
-------
study_dataframe:
Expand All @@ -582,6 +595,8 @@ class Tune(AutoBase):
"""Automatic tuning of custom pipeline.

This class takes the given pipeline and tries to optimize its hyperparameters by using `params_to_tune`.

Trials with duplicate parameters are skipped and previously computed results are returned.
"""

def __init__(
Expand Down Expand Up @@ -618,7 +633,7 @@ def __init__(
runner:
Runner to use for distributed training. By default, :py:class:`~etna.auto.runner.local.LocalRunner` is used.
storage:
Optuna storage to use. By default, sqlite storage is used.
Optuna storage to use. By default, sqlite storage is used with name "etna-auto.db".
metrics:
List of metrics to compute.
By default, :py:class:`~etna.metrics.metrics.Sign`, :py:class:`~etna.metrics.metrics.SMAPE`,
Expand All @@ -642,7 +657,7 @@ def __init__(
)
self.pipeline = pipeline
if sampler is None:
self.sampler: BaseSampler = TPESampler()
self.sampler: BaseSampler = TPESampler(seed=0)
else:
self.sampler = sampler
if params_to_tune is None:
Expand Down Expand Up @@ -760,6 +775,18 @@ def objective(
CategoricalDistribution: lambda x: ("suggest_categorical", {"choices": x.choices}),
}

def _find_duplicate_trial(trial: Trial, pipeline: BasePipeline) -> Optional[FrozenTrial]:
    """Look through the study of ``trial`` for a completed trial with the same pipeline hash.

    The hash is computed from the pipeline's dict config; only trials in the
    COMPLETE state are considered. Returns the first matching trial, or ``None``
    if no duplicate exists.
    """
    target_hash = config_hash(pipeline.to_dict())

    # Lazily filter down to completed trials, then pick the first hash match.
    completed_trials = (
        candidate
        for candidate in trial.study.trials
        if candidate.state == optuna.structs.TrialState.COMPLETE
    )
    return next(
        (candidate for candidate in completed_trials if candidate.user_attrs.get("hash") == target_hash),
        None,
    )

def _objective(trial: Trial) -> float:
# using received optuna.distribution objects to call corresponding trial.suggest_xxx
params_suggested = {}
Expand All @@ -771,23 +798,33 @@ def _objective(trial: Trial) -> float:
# create pipeline instance with the parameters to try
pipeline_trial_params: BasePipeline = pipeline.set_params(**params_suggested)

if initializer is not None:
initializer(pipeline=pipeline_trial_params)
duplicate_trial = _find_duplicate_trial(trial, pipeline_trial_params)
if duplicate_trial is not None:
for param_name, param_value in duplicate_trial.user_attrs.items():
trial.set_user_attr(param_name, param_value)

metrics_df, forecast_df, fold_info_df = pipeline_trial_params.backtest(
ts, metrics=metrics, **backtest_params
)
metric_value = trial.user_attrs[f"{target_metric.name}_{metric_aggregation}"]
return metric_value

if callback is not None:
callback(metrics_df=metrics_df, forecast_df=forecast_df, fold_info_df=fold_info_df)
else:
if initializer is not None:
initializer(pipeline=pipeline_trial_params)

trial.set_user_attr("pipeline", pipeline_trial_params.to_dict())
metrics_df, forecast_df, fold_info_df = pipeline_trial_params.backtest(
ts, metrics=metrics, **backtest_params
)

aggregated_metrics = aggregate_metrics_df(metrics_df)
for metric in aggregated_metrics:
trial.set_user_attr(metric, aggregated_metrics[metric])
if callback is not None:
callback(metrics_df=metrics_df, forecast_df=forecast_df, fold_info_df=fold_info_df)

return aggregated_metrics[f"{target_metric.name}_{metric_aggregation}"]
trial.set_user_attr("pipeline", pipeline_trial_params.to_dict())
trial.set_user_attr("hash", config_hash(pipeline_trial_params.to_dict()))

aggregated_metrics = aggregate_metrics_df(metrics_df)
for metric in aggregated_metrics:
trial.set_user_attr(metric, aggregated_metrics[metric])

return aggregated_metrics[f"{target_metric.name}_{metric_aggregation}"]

return _objective

Expand Down Expand Up @@ -825,6 +862,13 @@ def _summary(self, trials: List[FrozenTrial]) -> List[dict]:
def summary(self) -> pd.DataFrame:
"""Get trials summary.

There are columns:

- hash: hash of the pipeline;
- pipeline: pipeline object;
- metrics: columns with metrics' values;
- state: state of the trial.

Returns
-------
study_dataframe:
Expand Down
79 changes: 62 additions & 17 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,20 @@ We have prepared a set of tutorials for an easy introduction:
- Change points plot
- Interactive change points plot

#### 04. [Outliers](https://github.com/tinkoff-ai/etna/tree/master/examples/outliers.ipynb)
- Point outliers
- Median method
- Density method
- Prediction interval method
- Histogram method
- Sequence outliers
- Interactive visualization
- Outliers imputation
#### 04. [Regressors and exogenous data](https://github.com/tinkoff-ai/etna/tree/master/examples/exogenous_data.ipynb)
- What is regressor?
- What is exogenous data?
- Dataset
- Loading Dataset
- EDA
- Forecast with regressors

#### 05. [Clustering](https://github.com/tinkoff-ai/etna/tree/master/examples/clustering.ipynb)
- Clustering pipeline
- Custom Distance
- Visualisation
#### 05. [Custom model and transform](https://github.com/tinkoff-ai/etna/tree/master/examples/custom_transform_and_model.ipynb)
- What is Transform and how it works
- Custom Transform
- Per-segment Custom Transform
- Multi-segment Custom Transform
- Custom Model

#### 06. [Deep learning models](https://github.com/tinkoff-ai/etna/tree/master/examples/NN_examples.ipynb)
- Creating TSDataset
Expand All @@ -54,18 +54,63 @@ We have prepared a set of tutorials for an easy introduction:
- VotingEnsemble
- StackingEnsemble

#### 08. [Auto](https://github.com/tinkoff-ai/etna/tree/master/examples/auto.py)
#### 08. [Outliers](https://github.com/tinkoff-ai/etna/tree/master/examples/outliers.ipynb)
- Point outliers
- Median method
- Density method
- Prediction interval method
- Histogram method
- Sequence outliers
- Interactive visualization
- Outliers imputation

#### 09. [Forecasting strategies](https://github.com/tinkoff-ai/etna/tree/master/examples/forecasting_strategies.ipynb)
- Imports and constants
- Load dataset
- Recursive strategy
- AutoRegressivePipeline
- Direct strategy
- Pipeline
- DirectEnsemble
- assemble_pipelines + DirectEnsemble
- Summary

#### 10. [Forecast interpretation](https://github.com/tinkoff-ai/etna/tree/master/examples/forecast_interpretation.ipynb)
- Forecast decomposition
- CatBoost
- SARIMAX
- BATS
- In-sample and out-of-sample decomposition
- Accessing target components
- Regressors relevance
- Feature relevance
- Components relevance

#### 11. [Clustering](https://github.com/tinkoff-ai/etna/tree/master/examples/clustering.ipynb)
- Clustering pipeline
- Custom Distance
- Visualisation

#### 12. [AutoML script](https://github.com/tinkoff-ai/etna/tree/master/examples/auto.py)
- Auto pipeline search

#### 09. Hyperparameter search
#### 13. [AutoML notebook](https://github.com/tinkoff-ai/etna/tree/master/examples/automl.ipynb)
- Hyperparameters tuning
- How `Tune` works
- Example
- General AutoML
- How `Auto` works
- Example

#### 14. Hyperparameter search
- [Optuna](https://github.com/tinkoff-ai/etna/tree/master/examples/optuna)
- [WandB sweeps](https://github.com/tinkoff-ai/etna/tree/master/examples/wandb/sweeps) example based on [Hydra](https://hydra.cc/)

#### 10. [Inference: using saved pipeline on a new data](https://github.com/tinkoff-ai/etna/tree/master/examples/inference.ipynb)
#### 15. [Inference: using saved pipeline on new data](https://github.com/tinkoff-ai/etna/tree/master/examples/inference.ipynb)
- Fitting and saving pipeline
- Using saved pipeline on new data

#### 11. [Hierarchical time series](https://github.com/tinkoff-ai/etna/tree/master/examples/hierarchical_pipeline.ipynb)
#### 16. [Hierarchical time series](https://github.com/tinkoff-ai/etna/tree/master/examples/hierarchical_pipeline.ipynb)
- Hierarchical time series
- Hierarchical structure
- Reconciliation methods
Expand Down