Merge pull request #244 from winedarksea/dev

0.6.13
winedarksea · May 14, 2024 · bd26a75 · bd26a75
2 parents 7a843d0 + 0c94d37
commit bd26a75
Show file tree

Hide file tree

Showing 41 changed files with 336 additions and 150 deletions.
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
 
 AutoTS is a time series package for Python designed for rapidly deploying high-accuracy forecasts at scale. 
 
-In 2023, AutoTS has won in the M6 forecasting competition, delivering the highest performance investment decisions across 12 months of stock market forecasting.
+In 2023, AutoTS won in the M6 forecasting competition, delivering the highest performance investment decisions across 12 months of stock market forecasting.
 
 There are dozens of forecasting models usable in the `sklearn` style of `.fit()` and `.predict()`. 
 These include naive, statistical, machine learning, and deep learning models. 

diff --git a/TODO.md b/TODO.md
@@ -13,14 +13,10 @@
 * Forecasts are desired for the future immediately following the most recent data.
 * trimmed_mean to AverageValueNaive
 
-# 0.6.12 🇺🇦 🇺🇦 🇺🇦
+# 0.6.13 🇺🇦 🇺🇦 🇺🇦
+* trend_phi directly into Prophet
+* subset arg to make KalmanStateSpace more scalable to memory
 * bug fixes
-* added DMD model
-* modified the `constraints` options so it now accepts of list of dictionaries of constraints with new last_window and slope options
-* 'dampening' as a constraint method to dampen all forecasts, fixed Cassandra trend_phi dampening
-* new med_diff anomaly method and 'laplace' added as distribution option
-* modified fourier_df to now work with sub daily data
-* some madness with wavelets attempting to use them like fourier series for seasonality
 
 ### Unstable Upstream Packages (those that are frequently broken by maintainers)
 * Pytorch-Forecasting

diff --git a/autots/__init__.py b/autots/__init__.py
@@ -27,7 +27,7 @@
 from autots.models.cassandra import Cassandra
 
 
-__version__ = '0.6.12'
+__version__ = '0.6.13'
 
 TransformTS = GeneralTransformer
 

diff --git a/autots/evaluator/auto_ts.py b/autots/evaluator/auto_ts.py
@@ -418,9 +418,9 @@ def __init__(
 
                 full_params['transformations'] = transformations
                 full_params['transformation_params'] = transformation_params
-                self.initial_template.loc[index, 'TransformationParameters'] = (
-                    json.dumps(full_params)
-                )
+                self.initial_template.loc[
+                    index, 'TransformationParameters'
+                ] = json.dumps(full_params)
 
         self.regressor_used = False
         self.grouping_ids = None
@@ -1827,10 +1827,10 @@ def _run_template(
             self.model_count = template_result.model_count
         # capture results from lower-level template run
         if "TotalRuntime" in template_result.model_results.columns:
-            template_result.model_results['TotalRuntime'] = (
-                template_result.model_results['TotalRuntime'].fillna(
-                    pd.Timedelta(seconds=60)
-                )
+            template_result.model_results[
+                'TotalRuntime'
+            ] = template_result.model_results['TotalRuntime'].fillna(
+                pd.Timedelta(seconds=60)
             )
         else:
             # trying to catch a rare and sneaky bug (perhaps some variety of beetle?)
@@ -1930,9 +1930,9 @@ def _run_validations(
                         frac=0.8, random_state=self.random_seed
                     ).reindex(idx)
                 nan_frac = val_df_train.shape[1] / num_validations
-                val_df_train.iloc[-2:, int(nan_frac * y) : int(nan_frac * (y + 1))] = (
-                    np.nan
-                )
+                val_df_train.iloc[
+                    -2:, int(nan_frac * y) : int(nan_frac * (y + 1))
+                ] = np.nan
 
             # run validation template on current slice
             result = self._run_template(
@@ -2195,7 +2195,7 @@ def export_template(
         max_per_model_class: int = None,
         include_results: bool = False,
         unpack_ensembles: bool = False,
-        min_metrics: list = ['smape', 'spl'],
+        min_metrics: list = ['smape', 'spl', 'wasserstein', 'mle', 'imle', 'ewmae'],
         max_metrics: list = None,
     ):
         """Export top results as a reusable template.
@@ -3851,9 +3851,9 @@ def diagnose_params(self, target='runtime', waterfall_plots=True):
                     )
                     y = pd.json_normalize(json.loads(row["ModelParameters"]))
                     y.index = [row['ID']]
-                    y['Model'] = (
-                        x  # might need to remove this and do analysis independently for each
-                    )
+                    y[
+                        'Model'
+                    ] = x  # might need to remove this and do analysis independently for each
                     res.append(
                         pd.DataFrame(
                             {

diff --git a/autots/models/base.py b/autots/models/base.py
@@ -690,18 +690,18 @@ def long_form_results(
             value_name=value_name,
             id_vars="datetime",
         ).set_index("datetime")
-        upload_upper[interval_name] = (
-            f"{round(100 - ((1- self.prediction_interval)/2) * 100, 0)}%"
-        )
+        upload_upper[
+            interval_name
+        ] = f"{round(100 - ((1- self.prediction_interval)/2) * 100, 0)}%"
         upload_lower = pd.melt(
             self.lower_forecast.rename_axis(index='datetime').reset_index(),
             var_name=id_name,
             value_name=value_name,
             id_vars="datetime",
         ).set_index("datetime")
-        upload_lower[interval_name] = (
-            f"{round(((1- self.prediction_interval)/2) * 100, 0)}%"
-        )
+        upload_lower[
+            interval_name
+        ] = f"{round(((1- self.prediction_interval)/2) * 100, 0)}%"
 
         upload = pd.concat([upload, upload_upper, upload_lower], axis=0)
         if datetime_column is not None:

diff --git a/autots/models/basics.py b/autots/models/basics.py
@@ -2057,7 +2057,7 @@ class KalmanStateSpace(ModelObject):
         name (str): String to identify class
         frequency (str): String alias of datetime index frequency or else 'infer'
         prediction_interval (float): Confidence interval for probabilistic forecast
-
+        subset (int): if not None, forecasts in chunks of this size. Reduces memory at the expense of compute time.
     """
 
     def __init__(
@@ -2075,6 +2075,7 @@ def __init__(
         em_iter: int = 10,
         model_name: str = "undefined",
         forecast_length: int = None,
+        subset=None,
         **kwargs,
     ):
         ModelObject.__init__(
@@ -2093,6 +2094,7 @@ def __init__(
         self.em_iter = em_iter
         self.model_name = model_name
         self.forecast_length = forecast_length
+        self.subset = subset
 
     def fit(self, df, future_regressor=None):
         """Train algorithm given data supplied.
@@ -2102,31 +2104,55 @@ def fit(self, df, future_regressor=None):
         """
         self.fit_data(df)
 
+        if self.subset is None:
+            self.kf = self._fit(df, future_regressor=None)
+        elif isinstance(self.subset, (float, int)):
+            if self.subset < 1 and self.subset > 0:
+                self.subset = self.subset * df.shape[1]
+            chunks = df.shape[1] // self.subset
+            if chunks > 1:
+                self.kf = {}
+                self.subset_columns = {}
+                if (df.shape[1] % self.subset) != 0:
+                    chunks += 1
+                for x in range(chunks):
+                    subset = df.iloc[:, self.subset * x : (self.subset * (x + 1))]
+                    self.subset_columns[str(x)] = subset.columns.tolist()
+                    self.kf[str(x)] = self._fit(subset, future_regressor=None)
+            else:
+                self.kf = self._fit(df, future_regressor=None)
+        else:
+            raise ValueError(f"subset arg {self.subset} not recognized")
+
+        self.fit_runtime = datetime.datetime.now() - self.startTime
+        return self
+
+    def _fit(self, df, future_regressor=None):
         if self.observation_noise == "auto":
             self.fit_noise = self.tune_observational_noise(df)[0]
         else:
             self.fit_noise = self.observation_noise
-        self.kf = KalmanFilter(
+        kf = KalmanFilter(
             state_transition=self.state_transition,  # matrix A
             process_noise=self.process_noise,  # Q
             observation_model=self.observation_model,  # H
             observation_noise=self.fit_noise,  # R
         )
         if self.em_iter is not None:
-            self.kf = self.kf.em(self.df_train, n_iter=self.em_iter)
+            kf = kf.em(df.to_numpy().T, n_iter=self.em_iter)
 
-        self.fit_runtime = datetime.datetime.now() - self.startTime
-        return self
+        return kf
 
     def fit_data(self, df, future_regressor=None):
         df = self.basic_profile(df)
-        self.df_train = df.to_numpy().T
+        self.df_train = df  # df.to_numpy().T
         self.train_index = df.index
         return self
 
     def cost_function(self, param, df):
         try:
             # evaluating on a single, most recent holdout only, for simplicity
+            local = df.to_numpy().T
             kf = KalmanFilter(
                 state_transition=self.state_transition,  # matrix A
                 process_noise=self.process_noise,  # Q
@@ -2135,16 +2161,12 @@ def cost_function(self, param, df):
                 # covariances=False,
             )
             if self.em_iter is not None:
-                kf = kf.em(
-                    self.df_train[:, : -self.forecast_length], n_iter=self.em_iter
-                )
-            result = kf.predict(
-                self.df_train[:, : -self.forecast_length], self.forecast_length
-            )
+                kf = kf.em(local[:, : -self.forecast_length], n_iter=self.em_iter)
+            result = kf.predict(local[:, : -self.forecast_length], self.forecast_length)
             df_smooth = pd.DataFrame(
                 result.observations.mean.T,
                 index=df.index[-self.forecast_length :],
-                columns=self.column_names,
+                columns=df.columns,
             )
             df_stdev = np.sqrt(result.observations.cov).T
             bound = df_stdev * norm.ppf(self.prediction_interval)
@@ -2219,20 +2241,48 @@ def predict(
                     "must provide forecast_length to KalmanStateSpace predict"
                 )
         predictStartTime = datetime.datetime.now()
-        result = self.kf.predict(self.df_train, forecast_length)
-        df = pd.DataFrame(
-            result.observations.mean.T,
-            index=self.create_forecast_index(forecast_length),
-            columns=self.column_names,
-        )
-
-        if just_point_forecast:
-            return df
+        future_index = self.create_forecast_index(forecast_length)
+        if isinstance(self.kf, dict):
+            forecasts = []
+            uppers = []
+            lowers = []
+            for x in self.subset_columns:
+                current_cols = self.subset_columns[x]
+                result = self.kf[x].predict(
+                    self.df_train.reindex(columns=current_cols).to_numpy().T,
+                    forecast_length,
+                )
+                df = pd.DataFrame(
+                    result.observations.mean.T,
+                    index=future_index,
+                    columns=current_cols,
+                )
+                forecasts.append(df)
+                df_stdev = np.sqrt(result.observations.cov).T
+                bound = df_stdev * norm.ppf(self.prediction_interval)
+                uppers.append(df + bound)
+                lowers.append(df - bound)
+            df = pd.concat(forecasts, axis=1).reindex(columns=self.column_names)
+            upper_forecast = pd.concat(uppers, axis=1).reindex(
+                columns=self.column_names
+            )
+            lower_forecast = pd.concat(lowers, axis=1).reindex(
+                columns=self.column_names
+            )
         else:
+            result = self.kf.predict(self.df_train.to_numpy().T, forecast_length)
+            df = pd.DataFrame(
+                result.observations.mean.T,
+                index=future_index,
+                columns=self.column_names,
+            )
             df_stdev = np.sqrt(result.observations.cov).T
             bound = df_stdev * norm.ppf(self.prediction_interval)
             upper_forecast = df + bound
             lower_forecast = df - bound
+        if just_point_forecast:
+            return df
+        else:
             predict_runtime = datetime.datetime.now() - predictStartTime
             prediction = PredictionObject(
                 model_name=self.name,
@@ -2251,7 +2301,14 @@ def predict(
 
     def get_new_params(self, method: str = "random"):
         # predefined, or random
-        return new_kalman_params(method=method)
+        new_params = new_kalman_params(method=method)
+        if method in ['deep']:
+            new_params['subset'] = random.choices([None, 200, 300], [0.3, 0.3, 0.3])[0]
+        else:
+            new_params['subset'] = random.choices([100, 200, 300], [0.3, 0.3, 0.3])[
+                0
+            ]  # probably no difference
+        return new_params
 
     def get_params(self):
         """Return dict of current parameters."""
@@ -2261,6 +2318,7 @@ def get_params(self):
             "process_noise": self.process_noise,
             "observation_model": self.observation_model,
             "observation_noise": self.observation_noise,
+            "subset": self.subset,
         }
 
 

diff --git a/autots/models/ensemble.py b/autots/models/ensemble.py
@@ -1838,15 +1838,15 @@ def MosaicEnsemble(
             f"Mosaic Ensemble failed on model {row[3]} series {row[2]} and period {row[1]} due to missing model: {e} "
             + mi
         )
-    melted['forecast'] = (
-        fore  # [forecasts[row[3]][row[2]].iloc[row[1]] for row in melted.itertuples()]
-    )
-    melted['upper_forecast'] = (
-        u_fore  # [upper_forecasts[row[3]][row[2]].iloc[row[1]] for row in melted.itertuples()]
-    )
-    melted['lower_forecast'] = (
-        l_fore  # [lower_forecasts[row[3]][row[2]].iloc[row[1]] for row in melted.itertuples()]
-    )
+    melted[
+        'forecast'
+    ] = fore  # [forecasts[row[3]][row[2]].iloc[row[1]] for row in melted.itertuples()]
+    melted[
+        'upper_forecast'
+    ] = u_fore  # [upper_forecasts[row[3]][row[2]].iloc[row[1]] for row in melted.itertuples()]
+    melted[
+        'lower_forecast'
+    ] = l_fore  # [lower_forecasts[row[3]][row[2]].iloc[row[1]] for row in melted.itertuples()]
 
     forecast_df = melted.pivot(
         values="forecast", columns="series_id", index="forecast_period"

diff --git a/autots/models/matrix_var.py b/autots/models/matrix_var.py
@@ -236,8 +236,8 @@ def predict(
     def get_new_params(self, method: str = 'random'):
         """Return dict of new parameters for parameter tuning."""
         return {
-            'method': random.choice(['als', 'dmd']),
-            'rank': random.choice([2, 4, 0.1, 0.2, 0.5]),
+            'method': random.choices(['als', 'dmd'], [0.7, 0.3])[0],
+            'rank': random.choice([2, 4, 6, 0.1, 0.2, 0.5]),
             'maxiter': 200,
         }