In [None]:
#default_exp tabular

In [None]:
#export
from timeseries_fastai.imports import *
from timeseries_fastai.core import *
from fastai2.basics import *
from fastai2.torch_core import *
from fastai2.vision.data import get_grid
from fastai2.tabular.core import TabularProc, _TabIloc

# Data
> DataBlock API to construct the DataLoaders

In [None]:
#hide
from nbdev.showdoc import show_doc

We will create a DataBlock to process our UCR datasets

In [None]:
ucr_path = untar_data(URLs.UCR)

In [None]:
df_train, df_test = load_df_ucr(ucr_path, 'StarLightCurves')

Loading files from: /home/tc256760/.fastai/data/Univariate2018_arff/StarLightCurves


In [None]:
df_train.head()

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att1016,att1017,att1018,att1019,att1020,att1021,att1022,att1023,att1024,target
0,0.537303,0.531103,0.528503,0.529403,0.533603,0.540903,0.551103,0.564003,0.579603,0.597603,...,0.546903,0.545903,0.543903,0.541003,0.537203,0.532303,0.526403,0.519503,0.511403,b'3'
1,0.588398,0.593898,0.599098,0.604098,0.608798,0.613397,0.617797,0.622097,0.626097,0.630097,...,0.237399,0.246499,0.256199,0.266499,0.277399,0.288799,0.300899,0.313599,0.326899,b'3'
2,-0.0499,-0.0415,-0.0334,-0.0256,-0.0181,-0.0108,-0.0038,0.003,0.0096,0.0159,...,-0.173801,-0.161601,-0.149201,-0.136401,-0.123201,-0.109701,-0.095901,-0.081701,-0.0671,b'1'
3,1.337005,1.319805,1.302905,1.286305,1.270005,1.254005,1.238304,1.223005,1.208104,1.193504,...,1.288905,1.298505,1.307705,1.316505,1.324905,1.332805,1.340205,1.347005,1.353205,b'3'
4,0.769801,0.775301,0.780401,0.785101,0.789401,0.793301,0.796801,0.799901,0.802601,0.805101,...,0.742401,0.744501,0.747301,0.750701,0.754801,0.759501,0.765001,0.771301,0.778401,b'3'


In [None]:
df_train['target'].astype(int)

0      3
1      3
2      1
3      3
4      3
      ..
995    2
996    3
997    1
998    3
999    3
Name: target, Length: 1000, dtype: int64

In [None]:
x_cols = df_train.columns[slice(0,-1)].to_list()
x_cols[0:5]

['att1', 'att2', 'att3', 'att4', 'att5']

In [None]:
#export
class TabularTS(CollBase, GetAttr, FilteredBase):
    "A `DataFrame` wrapper that knows which cols are x/y, and returns rows in `__getitem__`"
    # `_default` names the attribute (`procs`) handed to `GetAttr`; presumably unknown attribute
    # lookups are delegated to it -- confirm against fastcore's `GetAttr`.
    _default, with_cont='procs',True
    def __init__(self, df, procs=None, x_names=None, y_names=None, block_y=None, splits=None,
                 do_setup=True, device=None, inplace=False):
        # df:       the source frame (copied unless `inplace`)
        # procs:    transforms gathered into the `self.procs` pipeline
        # x_names / y_names: names of the input and target columns (each wrapped in `L`)
        # block_y:  block whose `type_tfms` are appended to `procs`; inferred from the y dtypes when None
        # splits:   optional index collections; rows are reordered so `splits[0]` comes first
        # do_setup: when True, `block_y.type_tfms` are added and `self.setup()` is run
        # device:   stored on the object and read by `ReadTSBatch.encodes`
        if inplace and splits is not None:
            warn("Using inplace with splits will trigger a pandas error. Set `pd.options.mode.chained_assignment=None` to avoid it.")
        if not inplace: df = df.copy()
        # Concatenate all split indices so that rows are laid out split after split
        if splits is not None: df = df.iloc[sum(splits, [])]
        # Wrap `dataloaders` with the signature of the loader type's `__init__` (presumably for kwarg discovery -- confirm)
        self.dataloaders = delegates(self._dl_type.__init__)(self.dataloaders)
        super().__init__(df)

        self.x_names,self.y_names,self.device = L(x_names),L(y_names),device
        if block_y is None and self.y_names:
            # Make ys categorical if they're not numeric
            ys = df[self.y_names]
            if len(ys.select_dtypes(include='number').columns)!=len(ys.columns): block_y = CategoryBlock()
            else: block_y = RegressionBlock()
        if block_y is not None and do_setup:
            if callable(block_y): block_y = block_y()
            procs = L(procs) + block_y.type_tfms
        self.procs = Pipeline(procs)
        # Boundary between the first subset and the rest; the whole frame when no splits are given
        self.split = len(df) if splits is None else len(splits[0])
        if do_setup: self.setup()

    # Build an object of the same type around `df`, reusing procs/x_names/y_names/device and skipping setup
    def new(self, df):
        return type(self)(df, do_setup=False, block_y=TransformBlock(),
                          **attrdict(self, 'procs','x_names','y_names', 'device'))
    
    # Subset 0 is rows `[0, split)`; any other index yields rows `[split, len(self))`
    def subset(self, i): return self.new(self.items[slice(0,self.split) if i==0 else slice(self.split,len(self))])
    # Replaces `self.items` with a copy and returns `self` (not a new wrapper)
    def copy(self): self.items = self.items.copy(); return self
    def decode(self): return self.procs.decode(self)
    # Decode a single row by wrapping it in a one-row frame
    def decode_row(self, row): return self.new(pd.DataFrame(row).T).decode().items.iloc[0]
    # Display the first `max_n` rows of the x/y columns after decoding; `kwargs` are accepted but unused
    def show(self, max_n=10, **kwargs): display_df(self.new(self.all_cols[:max_n]).decode().items)
    def setup(self): self.procs.setup(self)
    def process(self): self.procs(self)
    def loc(self): return self.items.loc
    def iloc(self): return _TabIloc(self)
    def targ(self): return self.items[self.y_names]
    # NOTE(review): `__init__` assigns an instance attribute `x_names`, which shadows this method on
    # instances, and it is not listed in the `properties(...)` call below -- looks like dead code; confirm.
    def x_names (self): return self.x_names
    def n_subsets(self): return 2
    # First target column only
    def y(self): return self[self.y_names[0]]
    def new_empty(self): return self.new(pd.DataFrame({}, columns=self.items.columns))
    def to_device(self, d=None):
        self.device = d
        return self
    
    # x names plus y names when every y column is present in the frame, otherwise the x names alone
    def all_col_names (self): 
        ys = [n for n in self.y_names if n in self.items.columns]
        return self.x_names + self.y_names if len(ys) == len(self.y_names) else self.x_names

# The listed methods are accessed as attributes elsewhere in this file (e.g. `to.iloc[...]`, `to.targ`),
# so `properties` presumably turns them into read-only properties -- confirm against fastcore.
properties(TabularTS,'loc','iloc','targ','all_col_names','n_subsets','y')

In [None]:
tts = TabularTS(df_train, x_names=x_cols, y_names='target')

In [None]:
tts.iloc[0:4]

       att1      att2      att3      att4      att5      att6      att7  \
0  0.537303  0.531103  0.528503  0.529403  0.533603  0.540903  0.551103   
1  0.588398  0.593898  0.599098  0.604098  0.608798  0.613397  0.617797   
2 -0.049900 -0.041500 -0.033400 -0.025600 -0.018100 -0.010800 -0.003800   
3  1.337005  1.319805  1.302905  1.286305  1.270005  1.254005  1.238304   

       att8      att9     att10  ...   att1016   att1017   att1018   att1019  \
0  0.564003  0.579603  0.597603  ...  0.546903  0.545903  0.543903  0.541003   
1  0.622097  0.626097  0.630097  ...  0.237399  0.246499  0.256199  0.266499   
2  0.003000  0.009600  0.015900  ... -0.173801 -0.161601 -0.149201 -0.136401   
3  1.223005  1.208104  1.193504  ...  1.288905  1.298505  1.307705  1.316505   

    att1020   att1021   att1022   att1023   att1024  target  
0  0.537203  0.532303  0.526403  0.519503  0.511403    b'3'  
1  0.277399  0.288799  0.300899  0.313599  0.326899    b'3'  
2 -0.123201 -0.109701 -0.095901 -0.08

In [None]:
#export
class TSPandas(TabularTS):
    "A `TabularTS` whose columns can be rewritten in bulk through pandas' `transform`."
    def transform(self, cols, f, all_col=True):
        "Apply `f` to `cols` and store the result; with `all_col=False`, names absent from the frame are skipped."
        targets = cols if all_col else [name for name in cols if name in self.items.columns]
        # Nothing to rewrite when no column is left
        if len(targets) == 0: return
        self[targets] = self[targets].transform(f)

In [None]:
#export
def _add_prop(cls, nm):
    @property
    def f(o): return o[list(getattr(o,nm+'_names'))]
    @f.setter
    def fset(o, v): o[getattr(o,nm+'_names')] = v
    setattr(cls, nm+'s', f)
    setattr(cls, nm+'s', fset)

# Give `TabularTS` the read/write properties `ys`, `xs` and `all_cols`, each indexing the
# wrapper with the matching `y_names`, `x_names` or `all_col_names`.
_add_prop(TabularTS, 'y')
_add_prop(TabularTS, 'x')
_add_prop(TabularTS, 'all_col')

In [None]:
to = TSPandas(df_train, x_names=x_cols, y_names='target')

In [None]:
to.iloc[0:5]

       att1      att2      att3      att4      att5      att6      att7  \
0  0.537303  0.531103  0.528503  0.529403  0.533603  0.540903  0.551103   
1  0.588398  0.593898  0.599098  0.604098  0.608798  0.613397  0.617797   
2 -0.049900 -0.041500 -0.033400 -0.025600 -0.018100 -0.010800 -0.003800   
3  1.337005  1.319805  1.302905  1.286305  1.270005  1.254005  1.238304   
4  0.769801  0.775301  0.780401  0.785101  0.789401  0.793301  0.796801   

       att8      att9     att10  ...   att1016   att1017   att1018   att1019  \
0  0.564003  0.579603  0.597603  ...  0.546903  0.545903  0.543903  0.541003   
1  0.622097  0.626097  0.630097  ...  0.237399  0.246499  0.256199  0.266499   
2  0.003000  0.009600  0.015900  ... -0.173801 -0.161601 -0.149201 -0.136401   
3  1.223005  1.208104  1.193504  ...  1.288905  1.298505  1.307705  1.316505   
4  0.799901  0.802601  0.805101  ...  0.742401  0.744501  0.747301  0.750701   

    att1020   att1021   att1022   att1023   att1024  target  
0  0.5

In [None]:
#export
def _apply_cats (voc, add, c):
    if not is_categorical_dtype(c):
        return pd.Categorical(c, categories=voc[c.name][add:]).codes+add
    return c.cat.codes+add #if is_categorical_dtype(c) else c.map(voc[c.name].o2i)
def _decode_cats(voc, c): return c.map(dict(enumerate(voc[c.name].items)))

In [None]:
# #export
# class Categorify(TabularProc):
#     "Transform the categorical variables to that type."
#     order = 1
#     def setups(self, to):
#         self.classes = {n:CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names)) for n in to.cat_names}

#     def encodes(self, to): to.transform(to.cat_names, partial(_apply_cats, self.classes, 1))
#     def decodes(self, to): to.transform(to.cat_names, partial(_decode_cats, self.classes))
#     def __getitem__(self,k): return self.classes[k]

In [None]:
#export
@Categorize
def setups(self, to:TabularTS):
    # Build the vocab from the first target column, preferring the `train` subset when `to` exposes one
    if len(to.y_names) > 0:
        fit_on = getattr(to, 'train', to)
        first_target = fit_on.iloc[:,to.y_names[0]]
        self.vocab = CategoryMap(first_target.items)
        self.c = len(self.vocab)
    return self(to)

@Categorize
def encodes(self, to:TabularTS):
    # Every target column shares the single vocab; relies on `to.transform`, which this file defines on `TSPandas`
    vocab_by_col = {name: self.vocab for name in to.y_names}
    encode_col = partial(_apply_cats, vocab_by_col, 0)
    to.transform(to.y_names, encode_col, all_col=False)
    return to

@Categorize
def decodes(self, to:TabularTS):
    # Inverse of `encodes`: turn the codes of each target column back into vocab entries
    vocab_by_col = {name: self.vocab for name in to.y_names}
    decode_col = partial(_decode_cats, vocab_by_col)
    to.transform(to.y_names, decode_col, all_col=False)
    return to

In [None]:
#export
class NormalizeTS(TabularProc):
    "Normalize the x variables."
    order = 2
    def setups(self, dsets):
        "Store the per-column mean and population std (`ddof=0`) of `dsets.xs`."
        # The small constant added to the std avoids dividing by zero in `encodes`
        self.means,self.stds = dsets.xs.mean(),dsets.xs.std(ddof=0)+1e-7
    def encodes(self, to):
        "Standardize the x columns of `to`."
        # Write through the `xs` setter: this file defines no `conts` property, so assigning
        # `to.conts` would only create an unrelated attribute and leave the data untouched.
        to.xs = (to.xs-self.means) / self.stds
    def decodes(self, to):
        "Undo `encodes`, bringing the x columns of `to` back to their original scale."
        to.xs = (to.xs*self.stds ) + self.means

In [None]:
#export
@Normalize
def setups(self, to:TabularTS):
    # Statistics come from the `train` subset when `to` exposes one, otherwise from `to` itself
    fit_xs = getattr(to, 'train', to).xs
    col_means, col_stds = fit_xs.mean(), fit_xs.std(ddof=0)+1e-7
    self.means,self.stds = col_means,col_stds
    return self(to)

@Normalize
def encodes(self, to:TabularTS):
    # Centre, then scale, the x columns
    centred = to.xs-self.means
    to.xs = centred / self.stds
    return to

@Normalize
def decodes(self, to:TabularTS):
    # Inverse of `encodes`: rescale, then shift back
    rescaled = to.xs*self.stds
    to.xs = rescaled + self.means
    return to

In [None]:
# Fit `Normalize` on a single column and compare its statistics and output with a direct numpy computation
norm = Normalize()
df = df_train.loc[:, [x_cols[0]]]
# Constructing the wrapper runs setup (`do_setup` defaults to True), which fits `norm`
to = TSPandas(df, norm, x_names=x_cols[0])
x = df.values.squeeze()
m,s = x.mean(),x.std()
test_eq(norm.means[x_cols[0]], m)
# `stds` includes the 1e-7 added in `setups`, hence the approximate comparison
test_close(norm.stds[x_cols[0]], s)
test_close(to[x_cols[0]].values, (x-m)/s)

In [None]:
#export
def _maybe_expand(o): return o[:,None] if o.ndim==1 else o

In [None]:
t = tensor([[2.,3.],[4.,5.]])

In [None]:
t.unsqueeze(1).shape

torch.Size([2, 1, 2])

In [None]:
#export
class ReadTSBatch(ItemTransform):
    "Turn a `TabularTS` slice into a tuple of tensors, and a tuple of tensors back into a `TabularTS`."
    def __init__(self, to): self.to = to

    def encodes(self, to):
        "Return `(xs,)` or `(xs, ys)`; `xs` is cast to float and gets an extra axis at dim 1."
        res = (tensor(to.xs).float().unsqueeze(1), )
        # Targets are only appended when every y column is present in the frame
        ys = [n for n in to.y_names if n in to.items.columns]
        if len(ys) == len(to.y_names): res = res + (tensor(to.targ),)
        if to.device is not None: res = to_device(res, to.device)
        return res

    def decodes(self, o):
        "Rebuild a `TabularTS` (via `self.to.new`) from a batch `o` shaped like the output of `encodes`."
        arrs = []
        for a in to_np(o):
            if a.size == 0: continue
            a = _maybe_expand(a)
            # `encodes` inserts an axis at dim 1 of the xs; fold all trailing axes back into columns
            # so every piece is 2-d and can be concatenated column-wise with the targets.
            if a.ndim > 2: a = a.reshape(a.shape[0], -1)
            arrs.append(a)
        vals = np.concatenate(arrs, axis=1)
        # Use x+y names when the column count matches, otherwise fall back to the x names only
        cols = self.to.all_col_names
        if vals.shape[1] != len(cols): cols = self.to.x_names
        return self.to.new(pd.DataFrame(vals, columns=cols))

In [None]:
to = TSPandas(df_train, None, x_names=x_cols, y_names='target')

In [None]:
to.procs

Pipeline: Categorize

In [None]:
#export
@typedispatch
def show_batch(x: TabularTS, y, its, max_n=10, ctxs=None):
    "Display at most `max_n` rows of the `TabularTS` batch `x`."
    # `y`, `its` and `ctxs` belong to the dispatch signature but are not used here.
    # `max_n` is forwarded so the caller's limit is honoured instead of `show`'s own default.
    x.show(max_n=max_n)

In [None]:
from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter,_DatasetKind
_loaders = (_MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter)

In [None]:
#export
@delegates()
class TabularTSDataloader(TfmdDL):
    "A `TfmdDL` whose batches are row selections of a `TabularTS`, converted to tensors by `ReadTSBatch`."
    # NOTE(review): `do_item = noops` presumably disables per-item fetching so that the raw batch
    # indices reach `create_batch` -- confirm against fastai's `DataLoader`.
    do_item = noops
    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        # Default batch pipeline: the batch transforms of a plain `TransformBlock`, then `ReadTSBatch(dataset)`
        if after_batch is None: after_batch = L(TransformBlock().batch_tfms)+ReadTSBatch(dataset)
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

    # `b` is handed straight to `dataset.iloc`, so a batch is selected with one indexing operation
    def create_batch(self, b): return self.dataset.iloc[b]

# Register the loader on `TSPandas`; `TabularTS.__init__` reads it as `self._dl_type`
TSPandas._dl_type = TabularTSDataloader

In [None]:
# NOTE(review): the split indices are drawn over `range_of(df_train)` but applied to `df_test` on the
# next line, so only positions that also exist in `df_train` can be selected -- confirm this is intended.
# NOTE(review): `RandomSplitter()` is given no seed here, so the split presumably changes between runs -- confirm.
splits = RandomSplitter()(range_of(df_train))
to = TSPandas(df_test, norm, x_names=x_cols, y_names='target', splits=splits)

In [None]:
test_dl = TabularTSDataloader(to)

In [None]:
def cycle_dl(dl):
    "Exhaust `dl` once, unpacking every batch into two parts and discarding them."
    for _inputs, _targets in dl:
        continue

In [None]:
%time cycle_dl(test_dl)

CPU times: user 128 ms, sys: 0 ns, total: 128 ms
Wall time: 128 ms


In [None]:
#export
def stack_train_valid(df_train, df_valid):
    "Concatenate `df_train` and `df_valid` under a fresh index; `valid_col` is False for train rows and True for valid rows"
    # `assign` returns new frames, so neither input is modified
    train_part = df_train.assign(valid_col=False)
    valid_part = df_valid.assign(valid_col=True)
    stacked = pd.concat([train_part, valid_part])
    return stacked.reset_index(drop=True)

In [None]:
from timeseries_fastai.models import create_inception

In [None]:
# Derive the splits from the `valid_col` flag that `stack_train_valid` adds, so the UCR train rows
# are used for training and the UCR test rows for validation. Reusing the random `splits` computed
# over `df_train` positions would ignore that flag and never select a `df_test` row.
df_stacked = stack_train_valid(df_train, df_test)
is_valid = df_stacked['valid_col'].values.astype(bool)
stacked_splits = [np.flatnonzero(~is_valid).tolist(), np.flatnonzero(is_valid).tolist()]
to = TSPandas(df_stacked, norm, x_names=x_cols, y_names='target', splits=stacked_splits)

In [None]:
dls = to.dataloaders(32, 128)

In [None]:
inception = create_inception(1, len(dls.vocab))

In [None]:
learn = Learner(dls, inception, metrics=[accuracy])

In [None]:
x,y = dls.train.one_batch()

In [None]:
x.shape

torch.Size([32, 1, 1024])

In [None]:
learn.fit_one_cycle(5)

epoch,train_loss,valid_loss,accuracy,time
0,0.4516,1.03576,0.33,00:00
1,0.343518,0.262926,0.925,00:00
2,0.280191,0.144124,0.965,00:00
3,0.226884,0.115465,0.965,00:00
4,0.179048,0.098044,0.98,00:00


# Export -

In [None]:
# hide
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted 01_data.ipynb.
Converted 02_models.ipynb.
Converted 03_tabular.ipynb.
Converted 99_index.ipynb.
