In [2]:
# default_exp data.load 

In [2]:
# hide
import sys

sys.path.append("..")

In [62]:
# export
from fastseq.core import *
from fastseq.data.external import *
from fastcore.utils import *
from fastcore.imports import *
from fastai2.basics import *
from fastai2.tabular.core import *

In [63]:
import numpy as np

# Dataset

In [101]:
# export

from torch.utils.data import Dataset, DataLoader

class TimeSeriesDataset(Dataset):
    """Takes a list of time series and provides access to windowed subseries for
    training.
    Arguments:
        * time_series (list): List of time series ``pandas`` DataFrames.
        * lookback (int): Number of time steps used as input for forecasting.
        * horizon (int): Number of time steps to forecast.
        * step (int): Time step size between consecutive examples.
        * cat_names (list): list of catigorical column names.
        * cont_names (list): list of continues colomn names.
        * y_names (list): list of names of the columns to predict.
        * splits (list): DOES NOT WORK
        * static_covs (list): Static covariates for each item in ``time_series`` list.
        * thinning (float): Fraction of examples to include.
    
    Keywords for `fastai`'s `Tabular` are allowed (except `splits`).
    """
    def __init__(self, 
                 time_series,
                 lookback,
                 horizon,
                 step, 
                 static_covs=None,
                 thinning=1.0,
                 transform = None,
                 splits=None,
                **kwargs):
        if splits:
            raise Warning('`splits` is not used at this level.')
        self.time_series = [TabularPandas(ts.T, **kwargs) for ts in time_series]
        self.lookback = lookback
        self.horizon = horizon
        self.step = step
        self.transform = transform
        self.static_covs = static_covs

        # Slice each time series into examples, assigning IDs to each
        last_id = 0
        n_dropped = 0
        self.example_ids = {}
        for i, ts in enumerate(self.time_series):
            num_examples = (len(ts) - self.lookback - self.horizon + self.step) // self.step
            # Time series shorter than the forecast horizon need to be dropped.
            if len(ts) < self.horizon:
                n_dropped += 1
                continue
            # For short time series zero pad the input
            if len(ts) < self.lookback + self.horizon:
                num_examples = 1
            for j in range(num_examples):
                self.example_ids[last_id + j] = (i, j * self.step)
            last_id += num_examples

        # Inform user about time series that were too short
        if n_dropped > 0:
            print("Dropped {}/{} time series due to length.".format(
                    n_dropped, len(self.time_series)
                    )
                 )

        # Store the number of training examples
        self._len = int(self.example_ids.__len__() * thinning)

    def __len__(self):
        return self._len

    def __getitem__(self, idx):
        # Get time series
        ts_id, lookback_id = self.example_ids[idx]
        ts = self.time_series[ts_id]

        # Prepare input and target. Zero pad if necessary.
        if len(ts) < self.lookback + self.horizon:
            # If the time series is too short, we zero pad
            X = ts[:, :-self.horizon]
            X = np.pad(
                X, 
                pad_width=((0, 0), (self.lookback - X.shape[-1], 0)), 
                mode='constant', 
                constant_values=0
            )
            y = ts[:, -self.horizon:]
        else:
            X = ts[:, lookback_id:lookback_id + self.lookback]
            y = ts[:, lookback_id + self.lookback:lookback_id + self.lookback + self.horizon]

        # Create the input and output for the sample
        sample = {'X': X, 'y': y}
        if self.transform is not None:
            sample = self.transform(sample)

        # Static covariates can be attached
        if self.static_covs is not None:
            sample['X_stat'] = self.static_covs[ts_id]

        return sample
    

## Test

In [102]:
horizon = 14
lookback = 128

In [103]:
t = np.arange(1000)
lenghts = np.random.randint(10,1000,100)
data_train = [pd.DataFrame({'y':i+.5*np.sin(t[:lenghts[i]]),
                            'x':t[:lenghts[i]]*np.random.randn(),
                            'z':np.random.randn(lenghts[i])
                           }).T
              for i in range(100)]

In [104]:
print([d.shape for d in data_train[:10]])
data_train[0].head()

[(3, 814), (3, 402), (3, 101), (3, 821), (3, 335), (3, 708), (3, 985), (3, 744), (3, 753), (3, 406)]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,804,805,806,807,808,809,810,811,812,813
y,0.0,0.420735,0.454649,0.07056,-0.378401,-0.479462,-0.139708,0.328493,0.494679,0.206059,...,-0.122597,0.341653,0.491788,0.189776,-0.286716,-0.499602,-0.253157,0.22604,0.497416,0.311471
x,0.0,0.921934,1.843868,2.765802,3.687737,4.609671,5.531605,6.453539,7.375473,8.297407,...,741.235059,742.156994,743.078928,744.000862,744.922796,745.84473,746.766664,747.688598,748.610533,749.532467
z,-0.473028,1.02727,-0.126358,0.160514,-0.900765,0.487678,-1.606093,0.090843,-0.239143,0.303381,...,-2.506964,0.530389,-1.331837,-0.31845,0.717902,-0.758105,-0.509258,0.048217,0.576735,1.057404


In [105]:
data_train = TimeSeriesDataset(
    data_train,
    lookback,
    horizon,
    cont_names = ['x','z'],
    y_names = 'y',
    step=1,
)

In [107]:
print(len(data_train))
print(data_train[1])

36393


TypeError: '(slice(None, None, None), slice(1, 129, None))' is an invalid key

In [105]:
# Create mini-batch data loader
dataloader_train = DataLoader(
    data_train,
    batch_size=512,
    shuffle=True,
    pin_memory=True,
    num_workers=1
)

In [30]:
# hide
from nbdev.export import *

notebook2script()

Converted 00_core.ipynb.
Converted 01_data.external.ipynb.
This cell doesn't have an export destination and was ignored:
 
Converted 02_deep4cast_m4_example.ipynb.
Converted 03_data.load.ipynb.
Converted index.ipynb.
