Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/ts gaps #1265

Merged
merged 7 commits into from
Oct 13, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 0 additions & 6 deletions .ipynb_checkpoints/Untitled-checkpoint.ipynb

This file was deleted.

12 changes: 12 additions & 0 deletions darts/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -1035,13 +1035,18 @@ def test_gaps(self):
[1] * 10 + 1 * [np.nan] + [1] * 13 + 5 * [np.nan] + [1] * 18 + 9 * [np.nan],
index=times4,
)
pd_series7 = pd.Series(
[1] * 10 + 1 * [0] + [1] * 13 + 5 * [2] + [1] * 18 + 9 * [6],
index=times4,
)

series1 = TimeSeries.from_series(pd_series1)
series2 = TimeSeries.from_series(pd_series2)
series3 = TimeSeries.from_series(pd_series3)
series4 = TimeSeries.from_series(pd_series4)
series5 = TimeSeries.from_series(pd_series5)
series6 = TimeSeries.from_series(pd_series6)
series7 = TimeSeries.from_series(pd_series7)

gaps1 = series1.gaps()
self.assertTrue(
Expand Down Expand Up @@ -1111,13 +1116,20 @@ def test_gaps(self):
)
).all()
)
gaps7 = series7.gaps()
self.assertTrue(gaps7.empty)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cool, thanks for these tests!


# test gaps detection on integer-indexed series
values = np.array([1, 2, np.nan, np.nan, 3, 4, np.nan, 6])
times = pd.RangeIndex(8)
ts = TimeSeries.from_times_and_values(times, values)
np.testing.assert_equal(ts.gaps().values, np.array([[2, 3, 2], [6, 6, 1]]))

values = np.array([1, 2, 7, 8, 3, 4, 0, 6])
times = pd.RangeIndex(8)
ts = TimeSeries.from_times_and_values(times, values)
self.assertTrue(ts.gaps().empty)

def test_longest_contiguous_slice(self):
times = pd.date_range("20130101", "20130111")
pd_series1 = pd.Series(
Expand Down
45 changes: 45 additions & 0 deletions darts/tests/test_timeseries_multivariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,18 @@ class TimeSeriesMultivariateTestCase(DartsBaseTestClass):
},
index=times2,
)
dataframe4 = pd.DataFrame(
{
"0": [1, 1, np.nan, 1, 1, 1, 1, 1, 1, 1],
"1": [1, 1, np.nan, 1, 1, np.nan, np.nan, 1, 1, 1],
"2": [1, 1, np.nan, 1, 1, np.nan, np.nan, np.nan, np.nan, 1],
},
index=times2,
)
series1 = TimeSeries.from_dataframe(dataframe1)
series2 = TimeSeries.from_dataframe(dataframe2)
series3 = TimeSeries.from_dataframe(dataframe3)
series4 = TimeSeries.from_dataframe(dataframe4)

def test_creation(self):
series_test = TimeSeries.from_dataframe(self.dataframe1)
Expand Down Expand Up @@ -250,3 +259,39 @@ def test_drop_column(self):
seriesB = self.series1.drop_columns(["0", "1"])
self.assertIn("2", seriesB.columns.values)
self.assertEqual(len(seriesB.columns), 1)

def test_gaps(self):
    """Check ``TimeSeries.gaps()`` on multivariate series for both gap modes.

    ``series1`` is expected to yield no gaps whatever the mode. ``series4``
    has one row that is NaN in every column, followed later by a run of
    rows that are NaN in only some of the columns: ``mode="all"`` must
    report just the single one-step gap, while ``mode="any"`` must also
    report the later four-step gap.
    """
    # series1: no gap expected, regardless of how gaps are defined
    for gap_mode in ("all", "any"):
        self.assertTrue(self.series1.gaps(mode=gap_mode).empty)

    # series4, mode="all": only the row where every column is NaN counts
    all_mode_gaps = self.series4.gaps(mode="all")
    fully_nan_day = pd.DatetimeIndex([pd.Timestamp("20130208")])
    self.assertTrue((all_mode_gaps["gap_start"] == fully_nan_day).all())
    self.assertTrue((all_mode_gaps["gap_end"] == fully_nan_day).all())
    self.assertEqual(all_mode_gaps["gap_size"].values.tolist(), [1])

    # series4, mode="any": rows with a NaN in at least one column count too
    any_mode_gaps = self.series4.gaps(mode="any")
    expected_starts = pd.DatetimeIndex(
        [pd.Timestamp("20130208"), pd.Timestamp("20130211")]
    )
    expected_ends = pd.DatetimeIndex(
        [pd.Timestamp("20130208"), pd.Timestamp("20130214")]
    )
    self.assertTrue((any_mode_gaps["gap_start"] == expected_starts).all())
    self.assertTrue((any_mode_gaps["gap_end"] == expected_ends).all())
    self.assertEqual(any_mode_gaps["gap_size"].values.tolist(), [1, 4])
25 changes: 25 additions & 0 deletions darts/tests/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,28 @@ def test_extract_subseries(self):
for sub, start, end in zip(subseries, start_times, end_times):
self.assertEqual(sub.start_time(), pd.to_datetime(start))
self.assertEqual(sub.end_time(), pd.to_datetime(end))

# Multivariate time series
times = pd.date_range("20130206", "20130215")
dataframe = pd.DataFrame(
{
"0": [1, 1, np.nan, 1, 2, 1, 1, 1, 1, 1],
"1": [1, 1, np.nan, 1, 3, np.nan, np.nan, 1, 1, 1],
"2": [1, 1, np.nan, 1, 4, np.nan, np.nan, np.nan, np.nan, 1],
},
index=times,
)
series = TimeSeries.from_dataframe(dataframe)

# gaps are characterized by NaN in all the covariates
subseries_all = extract_subseries(series, mode="all")
self.assertEqual(len(subseries_all), 2)
self.assertEqual(subseries_all[0], series[:2])
self.assertEqual(subseries_all[1], series[3:])

# gaps are characterized by NaN in any of the covariates
subseries_any = extract_subseries(series, mode="any")
self.assertEqual(len(subseries_any), 3)
self.assertEqual(subseries_any[0], series[:2])
self.assertEqual(subseries_any[1], series[3:5])
self.assertEqual(subseries_any[2], series[-1])
74 changes: 53 additions & 21 deletions darts/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -1857,23 +1857,37 @@ def concatenate(
=============
"""

def gaps(self) -> pd.DataFrame:
def gaps(self, mode: str = "all") -> pd.DataFrame:
"""
A function to compute and return gaps in the TimeSeries.
Works only on deterministic time series.

A function to compute and return gaps in the TimeSeries. Works only on deterministic time series (1 sample).
Parameters
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not render well anymore; you need to be careful not to remove empty lines.
image

----------
mode
Only relevant for multivariate time series. The mode defines how gaps are defined. Set to
'any' if a NaN value in any column should be considered as a gap. 'all' will only
consider periods where all columns' values are NaN. Defaults to 'all'.
Returns
-------
-------/
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove the / for rendering.

pd.DataFrame
A dataframe containing a row for every gap (rows with all-NaN values in underlying DataFrame)
A pandas.DataFrame containing a row for every gap (rows with all-NaN values in underlying DataFrame)
in this time series. The DataFrame contains three columns that include the start and end time stamps
of the gap and the integer length of the gap (in `self.freq` units if the series is indexed
by a DatetimeIndex).
"""

df = self.pd_dataframe()

is_nan_series = df.isna().all(axis=1).astype(int)
if mode == "all":
is_nan_series = df.isna().all(axis=1).astype(int)
elif mode == "any":
is_nan_series = df.isna().any(axis=1).astype(int)
else:
raise_log(
ValueError(
f"Keyword mode accepts only 'any' or 'all'. Provided {mode}"
),
logger,
)
diff = pd.Series(np.diff(is_nan_series.values), index=is_nan_series.index[:-1])
gap_starts = diff[diff == 1].index + self._freq
gap_ends = diff[diff == -1].index
Expand All @@ -1883,21 +1897,25 @@ def gaps(self) -> pd.DataFrame:
if is_nan_series.iloc[-1] == 1:
gap_ends = gap_ends.insert(len(gap_ends), self.end_time())

gap_df = pd.DataFrame()
gap_df["gap_start"] = gap_starts
gap_df["gap_end"] = gap_ends
gap_df = pd.DataFrame(columns=["gap_start", "gap_end"])

def intvl(start, end):
if self._has_datetime_index:
return pd.date_range(start=start, end=end, freq=self._freq).size
else:
return int((end - start) / self._freq) + 1
if gap_starts.size == 0:
return gap_df
else:

gap_df["gap_size"] = gap_df.apply(
lambda row: intvl(start=row.gap_start, end=row.gap_end), axis=1
)
def intvl(start, end):
if self._has_datetime_index:
return pd.date_range(start=start, end=end, freq=self._freq).size
else:
return end - start + 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually we do need to keep the division by self.freq here. For integer-indexed series, self.freq represents the "gap" between any two consecutive timestamps. For instance it's possible for an integer indexed series to have timestamps 0, 2, 4, ..., in which case freq=2. If we intend for the length to represent the number of NaN time steps, then we should keep the division by the freq.


return gap_df
gap_df["gap_start"] = gap_starts
gap_df["gap_end"] = gap_ends
gap_df["gap_size"] = gap_df.apply(
lambda row: intvl(start=row.gap_start, end=row.gap_end), axis=1
)

return gap_df

def copy(self) -> "TimeSeries":
"""
Expand Down Expand Up @@ -2278,22 +2296,36 @@ def strip(self) -> "TimeSeries":
new_series, static_covariates=self.static_covariates
)

def longest_contiguous_slice(self, max_gap_size: int = 0) -> "TimeSeries":
def longest_contiguous_slice(
self, max_gap_size: int = 0, mode: str = "all"
) -> "TimeSeries":
"""
Return the largest TimeSeries slice of this deterministic series that contains no gaps
(contiguous all-NaN values) larger than `max_gap_size`.

This method is only applicable to deterministic series (i.e., having 1 sample).
Parameters
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please keep the empty lines above "Parameters"

----------
max_gap_size
Indicates the maximum gap size that the TimeSeries can contain
mode
Only relevant for multivariate time series. The mode defines how gaps are defined. Set to
'any' if a NaN value in any column should be considered as a gap. 'all' will only
consider periods where all columns' values are NaN. Defaults to 'all'.

Returns
-------
TimeSeries
a new series constituting the largest slice of the original with no or bounded gaps

See Also
--------
TimeSeries.gaps : return the gaps in the TimeSeries
"""
if not (np.isnan(self._xa)).any():
return self.copy()
stripped_series = self.strip()
gaps = stripped_series.gaps()
gaps = stripped_series.gaps(mode=mode)
relevant_gaps = gaps[gaps["gap_size"] > max_gap_size]

curr_slice_start = stripped_series.start_time()
Expand Down
24 changes: 14 additions & 10 deletions darts/utils/missing_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def fill_missing_values(


def extract_subseries(
series: TimeSeries, min_gap_size: Optional[int] = 1
series: TimeSeries, min_gap_size: Optional[int] = 1, mode: str = "all"
) -> List[TimeSeries]:
"""
Partitions the series into a sequence of sub-series by using significant gaps of missing values
Expand All @@ -97,15 +97,19 @@ def extract_subseries(
return [series]

# Get start/end times of sub-series without gaps of missing values
gaps_df = series.gaps().query(f"gap_size>={min_gap_size}")
start_times = [series.start_time()] + (gaps_df["gap_end"] + freq).to_list()
end_times = (gaps_df["gap_start"] - freq).to_list() + [series.end_time() + freq]

subseries = []
for start, end in zip(start_times, end_times):
subseries.append(series[start:end])

return subseries
gaps_df = series.gaps(mode=mode)
if gaps_df.empty:
return series
else:
gaps_df = gaps_df.query(f"gap_size>={min_gap_size}")
start_times = [series.start_time()] + (gaps_df["gap_end"] + freq).to_list()
end_times = (gaps_df["gap_start"] - freq).to_list() + [series.end_time() + freq]

subseries = []
for start, end in zip(start_times, end_times):
subseries.append(series[start:end])

return subseries


def _const_fill(series: TimeSeries, fill: float = 0) -> TimeSeries:
Expand Down