In [None]:
# default_exp data.unwindowed

# Unwindowed datasets

> This functionality will allow you to create a dataset that applies sliding windows to the input data on the fly. This heavily reduces the size of the input data files, as only the original, unwindowed data needs to be stored.

In [None]:
#export
from tsai.imports import *
from tsai.utils import *
from tsai.data.validation import *
from tsai.data.core import *

In [None]:
#export
class TSUnwindowedDataset():
    """Dataset that builds sliding windows from an unwindowed array each time it is indexed.

    Indexing returns a tuple: `(xb,)` when `y` is None, otherwise `(xb, yb)`. `xb` is wrapped in
    `_types[0]` and laid out as (samples, vars, window_size); `yb` is wrapped in `_types[1]`.

    Args:
        X: array with 1 dimension (a single series) or 2 dimensions.
        y: optional array, indexed with the same window indices as `X`.
        y_func: optional callable applied to the windowed `y` values before they are wrapped.
        window_size: number of steps in every window.
        stride: distance between the starting positions of consecutive windows.
        drop_start: starting position of the first window (earlier steps are never covered).
        drop_end: number of steps at the end of the sequence that no window may cover.
        seq_first: for a 2d `X`, True if axis 0 is the sequence axis, False if the last axis is.
            A 1d `X` has a single axis, which is always used as the sequence axis.
        **kwargs: only `split` is read; when given, items are looked up through `split[idxs]`.
    """
    _types = TSTensor, TSLabelTensor

    def __init__(self, X, y=None, y_func=None, window_size=1, stride=1, drop_start=0, drop_end=0, seq_first=True, **kwargs):
        store_attr()  # must run first: keeps the arguments exactly as passed (self.X, self.y, ...)
        assert X.ndim in (1, 2)
        # A 1d array only has one axis, so it is the sequence axis whatever `seq_first` says.
        # (Previously a 1d X with seq_first=False was measured as a sequence of length 1.)
        if X.ndim == 1 or seq_first:
            seq_len = X.shape[0]
        else:
            seq_len = X.shape[-1]
        # Exclusive upper bound for window starting positions.
        max_time = seq_len - window_size + 1 - drop_end
        assert max_time > 0, 'you need to modify either window_size or drop_end as they are larger than seq_len'
        # Column vector of window starts + row vector of in-window offsets: adding them
        # broadcasts to a (n_windows, window_size) matrix of positions along the sequence axis.
        self.all_idxs = np.expand_dims(np.arange(drop_start, max_time, step=stride), 0).T
        self.window_idxs = np.expand_dims(np.arange(window_size), 0)
        self.split = kwargs.get('split')
        self.n_inp = 1
        if y is None:
            self.loss_func = MSELossFlat()
        else:
            # Probe the first targets to choose a default loss.
            # NOTE(review): yb comes back wrapped in self._types[1]; confirm its elements can
            # actually be `Integral` instances, otherwise the CrossEntropyLossFlat branch never runs.
            _, yb = self[:2]
            if (is_listy(yb[0]) and isinstance(yb[0][0], Integral)) or isinstance(yb[0], Integral):
                self.loss_func = CrossEntropyLossFlat()
            else:
                self.loss_func = MSELossFlat()

    def __len__(self):
        "Number of entries in `split` when one was given, otherwise the number of windows"
        if self.split is not None:
            return len(self.split)
        else:
            return len(self.all_idxs)

    def __getitem__(self, idxs):
        "Build the windows selected by `idxs` (routed through `split` when one was given)"
        if self.split is not None:
            idxs = self.split[idxs]
        widxs = self.all_idxs[idxs] + self.window_idxs
        if self.seq_first or self.X.ndim == 1:
            xb = self.X[widxs]
            # 2d X gives (samples, window, vars) -> move vars before the window axis;
            # 1d X gives (samples, window) -> insert a singleton vars axis.
            if xb.ndim == 3: xb = xb.transpose(0,2,1)
            else: xb = np.expand_dims(xb, 1)
        else:
            # (vars, samples, window) -> (samples, vars, window)
            xb = self.X[:, widxs].transpose(1,0,2)
        if self.y is None:
            return (self._types[0](xb),)
        else:
            yb = self.y[widxs]
            if self.y_func is not None:
                yb = self.y_func(yb)
            return (self._types[0](xb), self._types[1](yb))

    @property
    def vars(self):
        "Size of the second-to-last axis of the first item's input"
        s = self[0][0]  # build the first item once instead of once per use
        if isinstance(s, tuple): s = s[0]
        return s.shape[-2]

    @property
    def len(self):
        "Size of the last axis of the first item's input"
        s = self[0][0]  # build the first item once instead of once per use
        if isinstance(s, tuple): s = s[0]
        return s.shape[-1]


class TSUnwindowedDatasets(FilteredBase):
    """Pairs a dataset with a collection of `splits` and rebuilds the dataset restricted to one of them on request."""

    def __init__(self, dataset, splits):
        store_attr()

    def subset(self, i):
        "New dataset of the same type and settings as `self.dataset`, restricted to `self.splits[i]`"
        ds = self.dataset
        return type(ds)(
            ds.X,
            y=ds.y,
            y_func=ds.y_func,
            window_size=ds.window_size,
            stride=ds.stride,
            drop_start=ds.drop_start,
            drop_end=ds.drop_end,
            seq_first=ds.seq_first,
            split=self.splits[i],
        )

    @property
    def train(self):
        "Subset built from the first split"
        return self.subset(0)

    @property
    def valid(self):
        "Subset built from the second split"
        return self.subset(1)

    def __getitem__(self, i):
        "Indexing is shorthand for `subset(i)`"
        return self.subset(i)

In [None]:
def y_func(y):
    "Cast `y` to float and average it along axis 1"
    y_float = y.astype('float')
    return y_float.mean(axis=1)

This approach works with both univariate and multivariate data.

* Univariate: we'll use two simple arrays holding the same 20 values, one with the seq_len first (X0), the other with seq_len second (X1).
* Multivariate: we'll use 2 time series arrays, one with the seq_len first (X2), the other with seq_len second (X3). No sliding window has been applied to them yet. 

In [None]:
# Univariate: the same 20 steps as a flat series (X0) and as a single row (X1)
X0 = np.arange(20)
X1 = np.arange(20)[np.newaxis, :]
X0.shape, X0, X1.shape, X1

((20,),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 (1, 20),
 array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
         16, 17, 18, 19]]))

In [None]:
# Multivariate: 3 variables (the step index scaled by 1, 10 and 100),
# stored with seq_len first (X2) and with seq_len last (X3)
X2 = np.arange(20)[:, np.newaxis] * np.array([1, 10, 100])[np.newaxis, :]
X3 = np.arange(20)[np.newaxis, :] * np.array([1, 10, 100])[:, np.newaxis]
X2.shape, X3.shape, X2, X3

((20, 3),
 (3, 20),
 array([[   0,    0,    0],
        [   1,   10,  100],
        [   2,   20,  200],
        [   3,   30,  300],
        [   4,   40,  400],
        [   5,   50,  500],
        [   6,   60,  600],
        [   7,   70,  700],
        [   8,   80,  800],
        [   9,   90,  900],
        [  10,  100, 1000],
        [  11,  110, 1100],
        [  12,  120, 1200],
        [  13,  130, 1300],
        [  14,  140, 1400],
        [  15,  150, 1500],
        [  16,  160, 1600],
        [  17,  170, 1700],
        [  18,  180, 1800],
        [  19,  190, 1900]]),
 array([[   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
           11,   12,   13,   14,   15,   16,   17,   18,   19],
        [   0,   10,   20,   30,   40,   50,   60,   70,   80,   90,  100,
          110,  120,  130,  140,  150,  160,  170,  180,  190],
        [   0,  100,  200,  300,  400,  500,  600,  700,  800,  900, 1000,
         1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900]]))

Now, instead of applying SlidingWindow to create and save the time series that can be consumed by a time series model, we can use a dataset that creates the data on the fly. In this way we avoid the need to create and save large files. This approach is also useful when you want to test different sliding window sizes, as otherwise you would need to create files for every size you want to test. The dataset will create the samples correctly formatted and ready to be passed on to a time series architecture.

In [None]:
# No y is passed, so indexing yields a 1-tuple holding only the windowed inputs.
(wds0,) = TSUnwindowedDataset(X0, window_size=5, stride=2, seq_first=True)[:]
(wds1,) = TSUnwindowedDataset(X1, window_size=5, stride=2, seq_first=False)[:]
# Both layouts must produce the same windows.
test_eq(wds0, wds1)
wds0, wds0.data, wds1, wds1.data

(TSTensor(samples:8, vars:1, len:5),
 tensor([[[ 0,  1,  2,  3,  4]],
 
         [[ 2,  3,  4,  5,  6]],
 
         [[ 4,  5,  6,  7,  8]],
 
         [[ 6,  7,  8,  9, 10]],
 
         [[ 8,  9, 10, 11, 12]],
 
         [[10, 11, 12, 13, 14]],
 
         [[12, 13, 14, 15, 16]],
 
         [[14, 15, 16, 17, 18]]]),
 TSTensor(samples:8, vars:1, len:5),
 tensor([[[ 0,  1,  2,  3,  4]],
 
         [[ 2,  3,  4,  5,  6]],
 
         [[ 4,  5,  6,  7,  8]],
 
         [[ 6,  7,  8,  9, 10]],
 
         [[ 8,  9, 10, 11, 12]],
 
         [[10, 11, 12, 13, 14]],
 
         [[12, 13, 14, 15, 16]],
 
         [[14, 15, 16, 17, 18]]]))

In [None]:
# No y is passed, so indexing yields a 1-tuple holding only the windowed inputs.
(wds2,) = TSUnwindowedDataset(X2, window_size=5, stride=2, seq_first=True)[:]
(wds3,) = TSUnwindowedDataset(X3, window_size=5, stride=2, seq_first=False)[:]
# Both layouts must produce the same windows.
test_eq(wds2, wds3)
wds2, wds3, wds2.data, wds3.data

(TSTensor(samples:8, vars:3, len:5),
 TSTensor(samples:8, vars:3, len:5),
 tensor([[[   0,    1,    2,    3,    4],
          [   0,   10,   20,   30,   40],
          [   0,  100,  200,  300,  400]],
 
         [[   2,    3,    4,    5,    6],
          [  20,   30,   40,   50,   60],
          [ 200,  300,  400,  500,  600]],
 
         [[   4,    5,    6,    7,    8],
          [  40,   50,   60,   70,   80],
          [ 400,  500,  600,  700,  800]],
 
         [[   6,    7,    8,    9,   10],
          [  60,   70,   80,   90,  100],
          [ 600,  700,  800,  900, 1000]],
 
         [[   8,    9,   10,   11,   12],
          [  80,   90,  100,  110,  120],
          [ 800,  900, 1000, 1100, 1200]],
 
         [[  10,   11,   12,   13,   14],
          [ 100,  110,  120,  130,  140],
          [1000, 1100, 1200, 1300, 1400]],
 
         [[  12,   13,   14,   15,   16],
          [ 120,  130,  140,  150,  160],
          [1200, 1300, 1400, 1500, 1600]],
 
         [[  14,   15, 

In [None]:
#hide
# Run the notebook-to-script conversion, then hand its result to beep().
out = create_scripts()
beep(out)

<IPython.core.display.Javascript object>

Converted 000_utils.ipynb.
Converted 000b_data.validation.ipynb.
Converted 000c_data.preparation.ipynb.
Converted 001_data.external.ipynb.
Converted 002_data.core.ipynb.
Converted 002_data.unwindowed.ipynb.
Converted 003_data.preprocessing.ipynb.
Converted 003b_data.transforms.ipynb.
Converted 003c_data.mixed_augmentation.ipynb.
Converted 003d_data.image.ipynb.
Converted 003e_data.features.ipynb.
Converted 005_data.tabular.ipynb.
Converted 006_data.mixed.ipynb.
Converted 007_metrics.ipynb.
Converted 008_learner.ipynb.
Converted 008b_tslearner.ipynb.
Converted 009_optimizer.ipynb.
Converted 010_callback.core.ipynb.
Converted 011_callback.noisy_student.ipynb.
Converted 012_callback.gblend.ipynb.
Converted 013_callback.MVP.ipynb.
Converted 014_callback.PredictionDynamics.ipynb.
Converted 100_models.layers.ipynb.
Converted 100b_models.utils.ipynb.
Converted 100c_models.explainability.ipynb.
Converted 101_models.ResNet.ipynb.
Converted 101b_models.ResNetPlus.ipynb.
Converted 102_models.Ince