Skip to content

Commit

Permalink
Fix/ts gaps (#1265)
Browse files Browse the repository at this point in the history
* first version of TimeSeries.gaps fix

* correcting bug in gaps, propagated new argument to methods calling gaps

* added tests to cover the new argument of the gaps function

* corrected code and documentation according to reviewers comments

* fixed typos in documentation
  • Loading branch information
madtoinou committed Oct 13, 2022
1 parent 55796e3 commit cf6364a
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 36 deletions.
6 changes: 0 additions & 6 deletions .ipynb_checkpoints/Untitled-checkpoint.ipynb

This file was deleted.

12 changes: 12 additions & 0 deletions darts/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -1035,13 +1035,18 @@ def test_gaps(self):
[1] * 10 + 1 * [np.nan] + [1] * 13 + 5 * [np.nan] + [1] * 18 + 9 * [np.nan],
index=times4,
)
pd_series7 = pd.Series(
[1] * 10 + 1 * [0] + [1] * 13 + 5 * [2] + [1] * 18 + 9 * [6],
index=times4,
)

series1 = TimeSeries.from_series(pd_series1)
series2 = TimeSeries.from_series(pd_series2)
series3 = TimeSeries.from_series(pd_series3)
series4 = TimeSeries.from_series(pd_series4)
series5 = TimeSeries.from_series(pd_series5)
series6 = TimeSeries.from_series(pd_series6)
series7 = TimeSeries.from_series(pd_series7)

gaps1 = series1.gaps()
self.assertTrue(
Expand Down Expand Up @@ -1111,13 +1116,20 @@ def test_gaps(self):
)
).all()
)
gaps7 = series7.gaps()
self.assertTrue(gaps7.empty)

# test gaps detection on integer-indexed series
values = np.array([1, 2, np.nan, np.nan, 3, 4, np.nan, 6])
times = pd.RangeIndex(8)
ts = TimeSeries.from_times_and_values(times, values)
np.testing.assert_equal(ts.gaps().values, np.array([[2, 3, 2], [6, 6, 1]]))

values = np.array([1, 2, 7, 8, 3, 4, 0, 6])
times = pd.RangeIndex(8)
ts = TimeSeries.from_times_and_values(times, values)
self.assertTrue(ts.gaps().empty)

def test_longest_contiguous_slice(self):
times = pd.date_range("20130101", "20130111")
pd_series1 = pd.Series(
Expand Down
45 changes: 45 additions & 0 deletions darts/tests/test_timeseries_multivariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,18 @@ class TimeSeriesMultivariateTestCase(DartsBaseTestClass):
},
index=times2,
)
dataframe4 = pd.DataFrame(
{
"0": [1, 1, np.nan, 1, 1, 1, 1, 1, 1, 1],
"1": [1, 1, np.nan, 1, 1, np.nan, np.nan, 1, 1, 1],
"2": [1, 1, np.nan, 1, 1, np.nan, np.nan, np.nan, np.nan, 1],
},
index=times2,
)
series1 = TimeSeries.from_dataframe(dataframe1)
series2 = TimeSeries.from_dataframe(dataframe2)
series3 = TimeSeries.from_dataframe(dataframe3)
series4 = TimeSeries.from_dataframe(dataframe4)

def test_creation(self):
series_test = TimeSeries.from_dataframe(self.dataframe1)
Expand Down Expand Up @@ -250,3 +259,39 @@ def test_drop_column(self):
seriesB = self.series1.drop_columns(["0", "1"])
self.assertIn("2", seriesB.columns.values)
self.assertEqual(len(seriesB.columns), 1)

def test_gaps(self):
    """Check ``TimeSeries.gaps`` on multivariate series for both gap modes.

    ``series1`` is expected to report no gap at all, whatever the mode.
    ``series4`` (built from ``dataframe4``) has one row that is NaN in every
    column and a later run of four rows that are NaN in only some columns,
    so the 'all' and 'any' modes must report different gaps.
    """
    # No gap must be detected in series1, regardless of the mode.
    for gap_mode in ("all", "any"):
        self.assertTrue(self.series1.gaps(mode=gap_mode).empty)

    # (mode, expected gap starts, expected gap ends, expected gap sizes)
    expectations = (
        # Only the row that is NaN in every column counts as a gap.
        ("all", ["20130208"], ["20130208"], [1]),
        # Rows with a NaN in at least one column count as well, which adds
        # the four-step run where only some columns are missing.
        ("any", ["20130208", "20130211"], ["20130208", "20130214"], [1, 4]),
    )

    for gap_mode, start_days, end_days, sizes in expectations:
        found = self.series4.gaps(mode=gap_mode)

        expected_starts = pd.DatetimeIndex(
            [pd.Timestamp(day) for day in start_days]
        )
        expected_ends = pd.DatetimeIndex([pd.Timestamp(day) for day in end_days])

        self.assertTrue((found["gap_start"] == expected_starts).all())
        self.assertTrue((found["gap_end"] == expected_ends).all())
        self.assertEqual(found["gap_size"].values.tolist(), sizes)
25 changes: 25 additions & 0 deletions darts/tests/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,28 @@ def test_extract_subseries(self):
for sub, start, end in zip(subseries, start_times, end_times):
self.assertEqual(sub.start_time(), pd.to_datetime(start))
self.assertEqual(sub.end_time(), pd.to_datetime(end))

# Multivariate time series
times = pd.date_range("20130206", "20130215")
dataframe = pd.DataFrame(
{
"0": [1, 1, np.nan, 1, 2, 1, 1, 1, 1, 1],
"1": [1, 1, np.nan, 1, 3, np.nan, np.nan, 1, 1, 1],
"2": [1, 1, np.nan, 1, 4, np.nan, np.nan, np.nan, np.nan, 1],
},
index=times,
)
series = TimeSeries.from_dataframe(dataframe)

# a gap is a timestamp where all the columns are NaN
subseries_all = extract_subseries(series, mode="all")
self.assertEqual(len(subseries_all), 2)
self.assertEqual(subseries_all[0], series[:2])
self.assertEqual(subseries_all[1], series[3:])

# a gap is a timestamp where at least one column is NaN
subseries_any = extract_subseries(series, mode="any")
self.assertEqual(len(subseries_any), 3)
self.assertEqual(subseries_any[0], series[:2])
self.assertEqual(subseries_any[1], series[3:5])
self.assertEqual(subseries_any[2], series[-1])
75 changes: 55 additions & 20 deletions darts/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
import pickle
from collections import defaultdict
from inspect import signature
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
Expand Down Expand Up @@ -1857,23 +1857,39 @@ def concatenate(
=============
"""

def gaps(self) -> pd.DataFrame:
def gaps(self, mode: Literal["all", "any"] = "all") -> pd.DataFrame:
"""
A function to compute and return gaps in the TimeSeries.
Works only on deterministic time series.
A function to compute and return gaps in the TimeSeries. Works only on deterministic time series (1 sample).
Parameters
----------
mode
Only relevant for multivariate time series. The mode defines how gaps are defined. Set to
'any' if a NaN value in any column should be considered a gap. 'all' will only
consider periods where all columns' values are NaN. Defaults to 'all'.
Returns
-------
pd.DataFrame
A dataframe containing a row for every gap (rows with all-NaN values in underlying DataFrame)
A pandas.DataFrame containing a row for every gap (contiguous rows with NaN values in all or in any of the columns of the underlying DataFrame, depending on `mode`)
in this time series. The DataFrame contains three columns that include the start and end time stamps
of the gap and the integer length of the gap (in `self.freq` units if the series is indexed
by a DatetimeIndex).
"""

df = self.pd_dataframe()

is_nan_series = df.isna().all(axis=1).astype(int)
if mode == "all":
is_nan_series = df.isna().all(axis=1).astype(int)
elif mode == "any":
is_nan_series = df.isna().any(axis=1).astype(int)
else:
raise_log(
ValueError(
f"Keyword mode accepts only 'any' or 'all'. Provided {mode}"
),
logger,
)
diff = pd.Series(np.diff(is_nan_series.values), index=is_nan_series.index[:-1])
gap_starts = diff[diff == 1].index + self._freq
gap_ends = diff[diff == -1].index
Expand All @@ -1883,21 +1899,25 @@ def gaps(self) -> pd.DataFrame:
if is_nan_series.iloc[-1] == 1:
gap_ends = gap_ends.insert(len(gap_ends), self.end_time())

gap_df = pd.DataFrame()
gap_df["gap_start"] = gap_starts
gap_df["gap_end"] = gap_ends
gap_df = pd.DataFrame(columns=["gap_start", "gap_end"])

def intvl(start, end):
if self._has_datetime_index:
return pd.date_range(start=start, end=end, freq=self._freq).size
else:
return int((end - start) / self._freq) + 1
if gap_starts.size == 0:
return gap_df
else:

gap_df["gap_size"] = gap_df.apply(
lambda row: intvl(start=row.gap_start, end=row.gap_end), axis=1
)
def intvl(start, end):
if self._has_datetime_index:
return pd.date_range(start=start, end=end, freq=self._freq).size
else:
return int((end - start) / self._freq) + 1

gap_df["gap_start"] = gap_starts
gap_df["gap_end"] = gap_ends
gap_df["gap_size"] = gap_df.apply(
lambda row: intvl(start=row.gap_start, end=row.gap_end), axis=1
)

return gap_df
return gap_df

def copy(self) -> "TimeSeries":
"""
Expand Down Expand Up @@ -2278,22 +2298,37 @@ def strip(self) -> "TimeSeries":
new_series, static_covariates=self.static_covariates
)

def longest_contiguous_slice(self, max_gap_size: int = 0) -> "TimeSeries":
def longest_contiguous_slice(
self, max_gap_size: int = 0, mode: str = "all"
) -> "TimeSeries":
"""
Return the largest TimeSeries slice of this deterministic series that contains no gaps
(contiguous NaN values, as defined by `mode`) larger than `max_gap_size`.
This method is only applicable to deterministic series (i.e., having 1 sample).
Parameters
----------
max_gap_size
The maximum gap size that the returned TimeSeries may contain.
mode
Only relevant for multivariate time series. The mode defines how gaps are defined. Set to
'any' if a NaN value in any column should be considered a gap. 'all' will only
consider periods where all columns' values are NaN. Defaults to 'all'.
Returns
-------
TimeSeries
a new series constituting the largest slice of the original with no or bounded gaps
See Also
--------
TimeSeries.gaps : return the gaps in the TimeSeries
"""
if not (np.isnan(self._xa)).any():
return self.copy()
stripped_series = self.strip()
gaps = stripped_series.gaps()
gaps = stripped_series.gaps(mode=mode)
relevant_gaps = gaps[gaps["gap_size"] > max_gap_size]

curr_slice_start = stripped_series.start_time()
Expand Down
32 changes: 22 additions & 10 deletions darts/utils/missing_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def fill_missing_values(


def extract_subseries(
series: TimeSeries, min_gap_size: Optional[int] = 1
series: TimeSeries, min_gap_size: Optional[int] = 1, mode: str = "all"
) -> List[TimeSeries]:
"""
Partitions the series into a sequence of sub-series by using significant gaps of missing values
Expand All @@ -83,10 +83,18 @@ def extract_subseries(
min_gap_size
The minimum number of contiguous missing values to consider a gap as significant. Defaults to 1.
mode
Only for multivariate TimeSeries. The definition of a gap; presence of a NaN in any column ("any")
or NaNs in all the columns ("all") for a given timestamp. Defaults to "all".
Returns
-------
subseries
A list of TimeSeries, sub-series without significant gaps of missing values
See Also
--------
TimeSeries.gaps : return the gaps in the TimeSeries
"""

# Remove null values from the series extremes
Expand All @@ -97,15 +105,19 @@ def extract_subseries(
return [series]

# Get start/end times of sub-series without gaps of missing values
gaps_df = series.gaps().query(f"gap_size>={min_gap_size}")
start_times = [series.start_time()] + (gaps_df["gap_end"] + freq).to_list()
end_times = (gaps_df["gap_start"] - freq).to_list() + [series.end_time() + freq]

subseries = []
for start, end in zip(start_times, end_times):
subseries.append(series[start:end])

return subseries
gaps_df = series.gaps(mode=mode)
if gaps_df.empty:
return series
else:
gaps_df = gaps_df.query(f"gap_size>={min_gap_size}")
start_times = [series.start_time()] + (gaps_df["gap_end"] + freq).to_list()
end_times = (gaps_df["gap_start"] - freq).to_list() + [series.end_time() + freq]

subseries = []
for start, end in zip(start_times, end_times):
subseries.append(series[start:end])

return subseries


def _const_fill(series: TimeSeries, fill: float = 0) -> TimeSeries:
Expand Down

0 comments on commit cf6364a

Please sign in to comment.