Merge pull request #238 from winedarksea/dev

0.6.11

winedarksea committed Apr 8, 2024
2 parents e0e8e9c + 9555557 commit a2a464c
Showing 58 changed files with 866 additions and 257 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -106,6 +106,7 @@ Also take a look at the [production_example.py](https://github.com/winedarksea/A
* `superfast` (simple naive models) and `fast` (more complex but still faster models, optimized for many series)
* `fast_parallel` (a combination of `fast` and `parallel`) or `parallel`, given many CPU cores are available
* `n_jobs='auto'` usually comes pretty close, but adjust as necessary for the environment
* `scalable` is the best model list for avoiding crashes when many series are present; there is also a `transformer_list = 'scalable'`
* see a dict of predefined lists (some defined for internal use) with `from autots.models.model_list import model_lists`
* Use the `subset` parameter when there are many similar series; `subset=100` will often generalize well for tens of thousands of similar series.
* if using `subset`, passing `weights` for series will weight subset selection towards higher priority series (see the sketch below)
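
A minimal sketch combining these options, assuming a wide-format DataFrame `wide_df` (values are illustrative, not recommendations):

```python
from autots import AutoTS

# 'scalable' model and transformer lists are the safer choice for very large
# numbers of series; subset=100 runs the model search on a sample of series
model = AutoTS(
    forecast_length=21,
    frequency='infer',
    model_list='scalable',
    transformer_list='scalable',
    n_jobs='auto',
    subset=100,
)
# weights bias subset selection toward higher-priority series
model = model.fit(wide_df, weights={'high_priority_series': 10})
prediction = model.predict()
```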
@@ -121,6 +122,7 @@ Also take a look at the [production_example.py](https://github.com/winedarksea/A
* this can be done by adjusting `frequency` and `aggfunc` but is probably best done before passing data into AutoTS.
* It will be faster if NaNs are already filled. If a search for the optimal NaN fill method is not required, fill any NaNs with a satisfactory method before passing data to the class.
* Set `runtime_weighting` in `metric_weighting` to a higher value (see the sketch after this list). This will guide the search towards faster models, although it may come at the expense of accuracy.
* Memory shortage is the most common cause of random process/kernel crashes. If issues occur, try testing a subset of the data and a different model list. Please report crashes that can be linked to a specific set of model parameters (the underlying forecasting model's params, not AutoTS parameters). Crashes also vary significantly by setup, such as the underlying LINPACK/BLAS, so differences between environments are to be expected.
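
A hedged sketch of the `runtime_weighting` tip (key names follow the documented `metric_weighting` format; values are illustrative):

```python
from autots import AutoTS

metric_weighting = {
    'smape_weighting': 5,
    'mae_weighting': 2,
    'rmse_weighting': 2,
    'runtime_weighting': 2,  # raised well above the small default to favor faster models
}
model = AutoTS(forecast_length=21, metric_weighting=metric_weighting)
```

Note that passing a partial dict replaces the full default weighting, so only the listed metrics influence model selection here.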

## How to Contribute:
* Give feedback on where you find the documentation confusing
10 changes: 6 additions & 4 deletions TODO.md
@@ -13,10 +13,12 @@
* Forecasts are desired for the future immediately following the most recent data.
* trimmed_mean to AverageValueNaive

# 0.6.10 🇺🇦 🇺🇦 🇺🇦
* assorted minor bug fixes
* bug in mosaic model selection fixed
* added crosshair_lite mosaic
# 0.6.11 🇺🇦 🇺🇦 🇺🇦
* bug fixes
* continually trying to keep up with the Pandas maintainers, who keep breaking things for no good reason
* updated RollingMeanTransformer and RegressionFilter; RegressionFilter should now be less memory intensive
* EIA data call added to load_live_daily
* horizontal_ensemble_validation arg for more complete validation on these ensembles (see the sketch below)
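
A hedged sketch of the new option (assumed here to be an `AutoTS` constructor argument, per this changelog entry; see the EIA example further below for `load_live_daily`):

```python
from autots import AutoTS

model = AutoTS(
    forecast_length=30,
    ensemble="horizontal-max",
    horizontal_ensemble_validation=True,  # more complete validation of horizontal ensembles
)
```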

### Unstable Upstream Packages (those that are frequently broken by maintainers)
* Pytorch-Forecasting
6 changes: 4 additions & 2 deletions autots/__init__.py
@@ -3,6 +3,7 @@
https://github.com/winedarksea/AutoTS
"""

from autots.datasets import (
load_hourly,
load_daily,
@@ -21,12 +22,12 @@
from autots.tools.transform import GeneralTransformer, RandomTransform
from autots.tools.shaping import long_to_wide, infer_frequency
from autots.tools.regressor import create_lagged_regressor, create_regressor
from autots.evaluator.auto_model import model_forecast
from autots.evaluator.auto_model import model_forecast, ModelPrediction
from autots.evaluator.anomaly_detector import AnomalyDetector, HolidayDetector
from autots.models.cassandra import Cassandra


__version__ = '0.6.10'
__version__ = '0.6.11'

TransformTS = GeneralTransformer

@@ -54,4 +55,5 @@
'HolidayDetector',
'Cassandra',
'infer_frequency',
'ModelPrediction',
]
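
With `ModelPrediction` now exported at the package root, a hedged sketch of direct use (keyword names are taken from the `auto_model.py` diff below; `fit`/`predict` are assumed to mirror other AutoTS model objects):

```python
from autots import ModelPrediction

model = ModelPrediction(
    forecast_length=14,
    model_str="ETS",
    parameter_dict={},       # an empty JSON string "" is now also tolerated
    transformation_dict={},  # likewise treated as an empty dict
    frequency="infer",
    prediction_interval=0.9,
)
model = model.fit(wide_df)  # wide_df: a wide-format DataFrame of series
prediction = model.predict()
```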
1 change: 1 addition & 0 deletions autots/datasets/__init__.py
@@ -1,6 +1,7 @@
"""
Tools for Importing Sample Data
"""

from autots.datasets._base import (
load_daily,
load_live_daily,
97 changes: 96 additions & 1 deletion autots/datasets/_base.py
@@ -1,8 +1,10 @@
"""Loading example datasets."""

from os.path import dirname, join
import time
import datetime
import io
import json
import numpy as np
import pandas as pd

@@ -236,7 +238,9 @@ def load_live_daily(
wikipedia_pages: list = ['Microsoft_Office', "List_of_highest-grossing_films"],
wiki_language: str = "en",
weather_event_types=["%28Z%29+Winter+Weather", "%28Z%29+Winter+Storm"],
caiso_query: str = "ENE_SLRS",
caiso_query: str = None,
eia_key: str = None,
eia_respondents: list = ["MISO", "PJM", "TVA", "US48"],
timeout: float = 300.05,
sleep_seconds: int = 2,
**kwargs,
@@ -498,6 +502,7 @@ def load_live_daily(
except Exception as e:
print(f"pytrends data failed: {repr(e)}")

# note: this data source was partially broken when last checked
if caiso_query is not None:
try:
n_chunks = (364 * weather_years) / 30
@@ -537,6 +542,96 @@ def load_live_daily(
except Exception as e:
print(f"caiso download failed with error: {repr(e)}")

if eia_key is not None and eia_respondents is not None:
api_url = 'https://api.eia.gov/v2/electricity/rto/daily-region-data/data/' # ?api_key={eia-key}
for respond in eia_respondents:
try:
params = {
"frequency": "daily",
"data": ["value"],
"facets": {
"type": ["D"],
"respondent": [respond],
"timezone": ["Eastern"],
},
"start": None, # "start": "2018-06-30",
"end": None, # "end": "2023-11-01",
"sort": [{"column": "period", "direction": "desc"}],
"offset": 0,
"length": 5000,
}
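                # the EIA v2 API reads the JSON-encoded query (facets, sort,
                # paging) from the X-Params header rather than the URL query string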

res = s.get(
api_url,
params={
"api_key": eia_key,
},
headers={"X-Params": json.dumps(params)},
)
eia_df = pd.json_normalize(res.json()['response']['data'])
eia_df['datetime'] = pd.to_datetime(eia_df['period'])
eia_df['value'] = eia_df['value'].astype('float')
eia_df['ID'] = (
eia_df['respondent']
+ "_"
+ eia_df['type']
+ "_"
+ eia_df['timezone']
)
temp = eia_df.pivot(columns='ID', index='datetime', values='value')
dataset_lists.append(temp)
time.sleep(sleep_seconds)
except Exception as e:
print(f"eia download failed with error {repr(e)}")
try:
api_url_mix = (
"https://api.eia.gov/v2/electricity/rto/daily-fuel-type-data/data/"
)
params = {
"frequency": "daily",
"data": ["value"],
"facets": {
"respondent": [respond],
"timezone": ["Eastern"],
"fueltype": [
"COL",
"NG",
"NUC",
"SUN",
"WAT",
"WND",
],
},
"start": None,
"end": None,
"sort": [{"column": "period", "direction": "desc"}],
"offset": 0,
"length": 5000,
}
res = s.get(
api_url_mix,
params={
"api_key": eia_key,
},
headers={"X-Params": json.dumps(params)},
)
eia_df = pd.json_normalize(res.json()['response']['data'])
eia_df['datetime'] = pd.to_datetime(eia_df['period'])
eia_df['value'] = eia_df['value'].astype('float')
eia_df['type-name'] = eia_df['type-name'].str.replace(" ", "_")
eia_df['ID'] = (
eia_df['respondent']
+ "_"
+ eia_df['type-name']
+ "_"
+ eia_df['timezone']
)
temp = eia_df.pivot(columns='ID', index='datetime', values='value')
dataset_lists.append(temp)
time.sleep(1)
except Exception as e:
print(f"eia download failed with error {repr(e)}")

### End of data download
if len(dataset_lists) < 1:
raise ValueError("No data successfully downloaded!")
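
A hedged usage sketch of the new parameters (the key is a placeholder; `caiso_query=None` is the new default and skips the CAISO pull):

```python
from autots.datasets import load_live_daily

df = load_live_daily(
    eia_key="YOUR_EIA_API_KEY",       # placeholder; obtain a key from api.eia.gov
    eia_respondents=["MISO", "PJM"],  # one demand and one fuel-mix pull per respondent
    caiso_query=None,
)
```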
1 change: 1 addition & 0 deletions autots/datasets/fred.py
@@ -4,6 +4,7 @@
requires API key from FRED
and pip install fredapi
"""

import time
import pandas as pd

6 changes: 3 additions & 3 deletions autots/evaluator/anomaly_detector.py
@@ -318,9 +318,9 @@ def detect(self, df):
splash_threshold=self.splash_threshold,
threshold=self.threshold,
actuals=df if self.output != "univariate" else None,
anomaly_scores=self.anomaly_model.scores
if self.output != "univariate"
else None,
anomaly_scores=(
self.anomaly_model.scores if self.output != "univariate" else None
),
use_dayofmonth_holidays=self.use_dayofmonth_holidays,
use_wkdom_holidays=self.use_wkdom_holidays,
use_wkdeom_holidays=self.use_wkdeom_holidays,
81 changes: 56 additions & 25 deletions autots/evaluator/auto_model.py
@@ -1,4 +1,5 @@
"""Mid-level helper functions for AutoTS."""

import sys
import gc
import traceback as tb
@@ -697,10 +698,12 @@ def ModelMonster(
n_jobs=n_jobs,
**parameters,
)
else:
elif model == "":
raise AttributeError(
("Model String '{}' not a recognized model type").format(model)
("Model name is empty. Likely this means AutoTS has not been fit.")
)
else:
raise AttributeError((f"Model String '{model}' not a recognized model type"))


class ModelPrediction(ModelObject):
@@ -768,11 +771,17 @@ def __init__(
self.force_gc = force_gc
# handle still in JSON form
if isinstance(transformation_dict, str):
self.transformation_dict = json.loads(transformation_dict)
if transformation_dict == "":
self.transformation_dict = {}
else:
self.transformation_dict = json.loads(transformation_dict)
else:
self.transformation_dict = transformation_dict
if isinstance(parameter_dict, str):
self.parameter_dict = json.loads(parameter_dict)
if parameter_dict == "":
self.parameter_dict = {}
else:
self.parameter_dict = json.loads(parameter_dict)
else:
self.parameter_dict = parameter_dict
if model_str == "PreprocessingRegression":
@@ -786,26 +795,31 @@ def __init__(
self.transformation_dict = {}
self.transformer_object = GeneralTransformer(
**self.transformation_dict,
n_jobs=n_jobs,
holiday_country=holiday_country,
n_jobs=self.n_jobs,
holiday_country=self.holiday_country,
verbose=self.verbose,
)
self.model = ModelMonster(
model_str,
parameters=self.parameter_dict,
frequency=frequency,
prediction_interval=prediction_interval,
holiday_country=holiday_country,
random_seed=random_seed,
verbose=verbose,
forecast_length=forecast_length,
n_jobs=n_jobs,
random_seed=self.random_seed,
)
self.name = "ModelPrediction"
self._fit_complete = False

def fit(self, df, future_regressor=None):
self.df = df
if self.frequency == "infer":
self.inferred_frequency = infer_frequency(df)
else:
self.inferred_frequency = self.frequency
self.model = ModelMonster(
self.model_str,
parameters=self.parameter_dict,
frequency=self.inferred_frequency,
prediction_interval=self.prediction_interval,
holiday_country=self.holiday_country,
random_seed=self.random_seed,
verbose=self.verbose,
forecast_length=self.forecast_length,
n_jobs=self.n_jobs,
)
transformationStartTime = datetime.datetime.now()
if self.current_model_file is not None:
try:
@@ -1266,9 +1280,15 @@ def model_forecast(
full_model_created = False  # make at least one full model, horizontal only
# handle JSON inputs of the dicts
if isinstance(model_param_dict, str):
model_param_dict = json.loads(model_param_dict)
if model_param_dict == "":
model_param_dict = {}
else:
model_param_dict = json.loads(model_param_dict)
if isinstance(model_transform_dict, str):
model_transform_dict = json.loads(model_transform_dict)
if model_transform_dict == "":
model_transform_dict = {}
else:
model_transform_dict = json.loads(model_transform_dict)
if frequency == "infer":
frequency = infer_frequency(df_train)
# handle "auto" n_jobs to an integer of local count
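
The empty-string guard now appears in both `ModelPrediction.__init__` and `model_forecast`; it could be factored into a small helper, sketched here (the helper name is hypothetical, not part of the codebase):

```python
import json

def load_json_dict(value):
    """Return a dict from JSON input, treating the empty string as {}."""
    if isinstance(value, str):
        return {} if value == "" else json.loads(value)
    return value
```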
@@ -1610,6 +1630,7 @@ def virtual_memory():
cumsum_A=cumsum_A,
diff_A=diff_A,
last_of_array=last_of_array,
column_names=df_train.columns,
)
if validation_round >= 1 and verbose > 0:
round_smape = model_error.avg_metrics['smape'].round(2)
@@ -1626,16 +1647,26 @@
print(validation_accuracy_print)
else:
print(validation_accuracy_print)
model_id = create_model_id(
df_forecast.model_name,
df_forecast.model_parameters,
df_forecast.transformation_parameters,
)
# for horizontal ensemble, use requested ID and params
if ensemble_input == 2:
model_id = create_model_id(
model_str, parameter_dict, transformation_dict
)
# it's already json
deposit_params = row['ModelParameters']
else:
# for non-horizontal ensembles, recreate the ID from what the model actually used (some models change their parameters)
model_id = create_model_id(
df_forecast.model_name,
df_forecast.model_parameters,
df_forecast.transformation_parameters,
)
deposit_params = json.dumps(df_forecast.model_parameters)
result = pd.DataFrame(
{
'ID': model_id,
'Model': df_forecast.model_name,
'ModelParameters': json.dumps(df_forecast.model_parameters),
'ModelParameters': deposit_params,
'TransformationParameters': json.dumps(
df_forecast.transformation_parameters
),