Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactorised tabularisation + Jupyter notebook w/ experiments. #1399

Merged
merged 34 commits into from
Jan 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
52bee74
Refactorised tabularisation + Jupyter notebook w/ experiments.
mabilton Nov 29, 2022
748caf5
Merge branch 'unit8co:master' into refactor/tabularization
mabilton Dec 4, 2022
83d78b1
Added 'moving window' method + refactored 'time intersection' method.
Dec 11, 2022
9356cd7
Merge branch 'master' into refactor/tabularization
Dec 11, 2022
85011fe
Refactoring/code simplification + bug fixes.
mabilton Dec 18, 2022
79b4fe7
Added `is_training` flag.
mabilton Dec 18, 2022
c64b330
Added tests + bug fixes.
mabilton Dec 18, 2022
ab48acf
Merge branch 'master' into refactor/tabularization
mabilton Dec 21, 2022
ededa0a
More tests + debugging.
mabilton Dec 22, 2022
9e954db
Fixed zero lag value not allowed bug + other debugging.
mabilton Dec 28, 2022
ab548c5
RegressionModel now calls `create_lagged_training_data` + passing tests.
mabilton Dec 28, 2022
fd3e5ce
ShapExplainer now uses `create_lagged_prediction_data` + minor test m…
mabilton Dec 28, 2022
8be63d1
Merge branch 'master' into refactor/tabularization
mabilton Dec 29, 2022
5b25175
Added further documentation, esp to tests.
mabilton Dec 29, 2022
9800cbd
Moved `_add_static_covariates` from `tabularization.py` to `regressio…
mabilton Dec 30, 2022
6298bb1
Merge branch 'master' into refactor/tabularization
mabilton Dec 30, 2022
9da4ade
Merge branch 'master' into refactor/tabularization
mabilton Jan 11, 2023
4c1313f
Static covariates refactorings.
mabilton Jan 13, 2023
01a6b14
Merge branch 'master' into refactor/tabularization
mabilton Jan 13, 2023
479ba8d
typo fix in test_regression_models.py
mabilton Jan 14, 2023
7fa3eea
Removed old `_create_lagged_data` and tests notebook.
mabilton Jan 15, 2023
2766c68
Clarification about `check_inputs` in docstring.
mabilton Jan 15, 2023
6bda371
Allow `lags_future_covariates` to be `> 0`, and enforce `lags_past_co…
mabilton Jan 16, 2023
722765d
Made `get_feature_times` private, now `_get_feature_times`.
mabilton Jan 19, 2023
9baa820
Placed `for` loop back inside `create_lagged_data`; more info in docs…
mabilton Jan 19, 2023
cadb51c
Fixed `bootstrap=True` in `test_regression_models.py`.
mabilton Jan 19, 2023
7002447
Added note about `np.split` in `regression_model.py`.
mabilton Jan 19, 2023
e38e240
Fixed repeated static covariates width calculation.
mabilton Jan 19, 2023
a4ee267
Fixed `shap_explainer` bug.
mabilton Jan 19, 2023
ca54b13
Merge branch 'master' into refactor/tabularization
mabilton Jan 19, 2023
8dab315
Merge branch 'master' into refactor/tabularization
hrzn Jan 20, 2023
7845c60
Amended static covariates test so that values of appended static cov …
mabilton Jan 21, 2023
4c7c163
Merge branch 'master' into refactor/tabularization
mabilton Jan 21, 2023
557d100
Updated docstring error.
mabilton Jan 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
22 changes: 12 additions & 10 deletions darts/explainability/shap_explainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
)
from darts.logging import get_logger, raise_if, raise_log
from darts.models.forecasting.regression_model import RegressionModel
from darts.utils.data.tabularization import _create_lagged_data
from darts.utils.data.tabularization import create_lagged_prediction_data
from darts.utils.utils import series2seq

logger = get_logger(__name__)
Expand Down Expand Up @@ -665,16 +665,18 @@ def _create_regression_model_shap_X(
lags_past_covariates_list = self.model.lags.get("past")
lags_future_covariates_list = self.model.lags.get("future")

X, _, indexes = _create_lagged_data(
target_series,
self.n,
past_covariates,
future_covariates,
lags_list,
lags_past_covariates_list,
lags_future_covariates_list,
is_training=False,
X, indexes = create_lagged_prediction_data(
target_series=target_series,
past_covariates=past_covariates,
future_covariates=future_covariates,
lags=lags_list,
lags_past_covariates=lags_past_covariates_list if past_covariates else None,
lags_future_covariates=lags_future_covariates_list
if future_covariates
else None,
)
# Remove sample axis:
X = X[:, :, 0]

if train:
X = pd.DataFrame(X)
Expand Down
130 changes: 111 additions & 19 deletions darts/models/forecasting/regression_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from darts.logging import get_logger, raise_if, raise_if_not, raise_log
from darts.models.forecasting.forecasting_model import GlobalForecastingModel
from darts.timeseries import TimeSeries
from darts.utils.data.tabularization import _add_static_covariates, _create_lagged_data
from darts.utils.data.tabularization import create_lagged_training_data
from darts.utils.multioutput import MultiOutputRegressor
from darts.utils.utils import _check_quantiles, seq2series, series2seq

Expand Down Expand Up @@ -324,7 +324,7 @@ def _create_lagged_data(
lags_past_covariates = self.lags.get("past")
lags_future_covariates = self.lags.get("future")

training_samples, training_labels, _ = _create_lagged_data(
features, labels, _ = create_lagged_training_data(
target_series=target_series,
output_chunk_length=self.output_chunk_length,
past_covariates=past_covariates,
Expand All @@ -334,20 +334,111 @@ def _create_lagged_data(
lags_future_covariates=lags_future_covariates,
max_samples_per_ts=max_samples_per_ts,
multi_models=self.multi_models,
check_inputs=False,
concatenate=False,
)

training_samples = _add_static_covariates(
self,
training_samples,
for i, (X_i, y_i) in enumerate(zip(features, labels)):
features[i] = X_i[:, :, 0]
labels[i] = y_i[:, :, 0]

features = self._add_static_covariates(
features,
target_series,
*self.extreme_lags,
past_covariates=past_covariates,
future_covariates=future_covariates,
max_samples_per_ts=max_samples_per_ts,
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't calling _add_static_covariates() be better done inside the preceding for loop? In my opinion, this would help simplify the _add_static_covariates() internal logic since, if it only takes a specific target (from the input sequence) and its specific features, then one can avoid looping twice over the series inside _add_static_covariates(), if I understood the implementation correctly. WDYT?
In the current _add_static_covariates(), my assumption was that the function would receive all the features and should recompute everything it needs in terms of the length and width of the features, since I was expecting a change in the _create_lagged_data() outputs; however, this might no longer be relevant with the changes you made.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would still be necessary though to go through all the series in the input sequence once, prior, to collect the static covariates information from all of them.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Definitely - in my opinion, the static covariates would ideally be added inside of create_lagged_data after each 'block' has been formed, but that would probably be a bit clumsy to implement at the moment since the process of computing the static covariates requires the n_features_in_ attribute of the RegressionModel object. Perhaps something to think about for a future PR?


training_samples = np.concatenate(features, axis=0)
training_labels = np.concatenate(labels, axis=0)

return training_samples, training_labels

def _add_static_covariates(
    self,
    features: Union[np.array, Sequence[np.array]],
    target_series: Union[TimeSeries, Sequence[TimeSeries]],
) -> Union[np.array, Sequence[np.array]]:
    """
    Add static covariates to the features' table for RegressionModels.

    Accounts for series with potentially different static covariates by padding with 0 to accommodate the
    maximum number of available static_covariates in any of the given series in the sequence.

    If no static covariates are provided for a given series, its corresponding features are padded with 0.
    Accounts for the case where the model is trained with series with static covariates and then used to predict
    on series without static covariates by padding with 0 the corresponding features of the series without
    static covariates.

    Parameters
    ----------
    features
        The features' numpy array(s) to which the static covariates will be added. Can either be a lone feature
        matrix or a `Sequence` of feature matrices; in the latter case, static covariates will be appended to
        each feature matrix in this `Sequence`.
    target_series
        The target series from which to read the static covariates.

    Returns
    -------
    features
        The features' array(s) with appended static covariates columns. If the `features` input was passed as a
        `Sequence` of `np.array`s, then a `Sequence` is also returned; if `features` was passed as an `np.array`,
        a `np.array` is returned.
    """
    input_not_list = not isinstance(features, Sequence)
    if input_not_list:
        features = [features]
    target_series = series2seq(target_series)
    # collect static covariates info
    scovs_map = {
        "covs_exist": False,
        "vals": [],  # Stores values of static cov arrays in each timeseries
        "sizes": {},  # Stores sizes of static cov arrays in each timeseries
    }
    for ts in target_series:
        if ts.has_static_covariates:
            scovs_map["covs_exist"] = True
            # Each static covariate either adds 1 extra column or
            # `n_component` extra columns:
            vals_i = {}
            for name, row in ts.static_covariates.items():
                vals_i[name] = row
                scovs_map["sizes"][name] = row.size
            scovs_map["vals"].append(vals_i)
        else:
            scovs_map["vals"].append(None)

    if (
        not scovs_map["covs_exist"]
        and hasattr(self.model, "n_features_in_")
        and (self.model.n_features_in_ is not None)
        and (self.model.n_features_in_ > features[0].shape[1])
    ):
        # for when series in prediction do not have static covariates but some of the training series did
        num_static_components = self.model.n_features_in_ - features[0].shape[1]
        for i, features_i in enumerate(features):
            padding = np.zeros((features_i.shape[0], num_static_components))
            features[i] = np.hstack([features_i, padding])
    elif scovs_map["covs_exist"]:
        scov_width = sum(scovs_map["sizes"].values())
        for i, features_i in enumerate(features):
            vals = scovs_map["vals"][i]
            if vals:
                # Series has static covariates: lay them out in a fixed
                # (name-ordered) layout shared by all series; missing names
                # are zero-filled so every row has width `scov_width`.
                scov_arrays = []
                for name, size in scovs_map["sizes"].items():
                    scov_arrays.append(
                        vals[name] if name in vals else np.zeros((size,))
                    )
                scov_array = np.concatenate(scov_arrays)
                scovs = np.broadcast_to(
                    scov_array, (features_i.shape[0], scov_width)
                )
            else:
                # Series has no static covariates: pad its features with 0.
                scovs = np.zeros((features_i.shape[0], scov_width))
            features[i] = np.hstack([features_i, scovs])
    if input_not_list:
        features = features[0]
    return features

def _fit_model(
self,
target_series,
Expand All @@ -362,7 +453,10 @@ def _fit_model(
"""

training_samples, training_labels = self._create_lagged_data(
target_series, past_covariates, future_covariates, max_samples_per_ts
target_series,
past_covariates,
future_covariates,
max_samples_per_ts,
)

# if training_labels is of shape (n_samples, 1) flatten it to shape (n_samples,)
Expand Down Expand Up @@ -681,15 +775,13 @@ def predict(

# concatenate retrieved lags
X = np.concatenate(np_X, axis=1)
X = _add_static_covariates(
self,
X,
series,
*self.extreme_lags,
past_covariates=past_covariates,
future_covariates=future_covariates,
max_samples_per_ts=1,
)
# Need to split up `X` into three equally-sized sub-blocks
# corresponding to each timeseries in `series`, so that
# static covariates can be added to each block; valid since
# each block contains same number of observations:
X_blocks = np.split(X, len(series), axis=0)
mabilton marked this conversation as resolved.
Show resolved Hide resolved
X_blocks = self._add_static_covariates(X_blocks, series)
X = np.concatenate(X_blocks, axis=0)

# X has shape (n_series * n_samples, n_regression_features)
prediction = self._predict_and_sample(X, num_samples, **kwargs)
Expand Down