In [15]:
# default_exp data.procs

In [16]:
# hide
import sys

sys.path.append("..")
import pandas as pd

In [17]:
# hide
from nbdev.showdoc import *

# Data.Procs

### CatProc

In [18]:
# export
from fastseq.data.external import *
from fastseq.data.load import *
from fastseq.data.core import *
from fastseq.core import *
from fastcore.all import *
from fastcore.imports import *
from fastai2.basics import *
from fastai2.data.transforms import *
from fastai2.tabular.core import *
from typing import List
import orjson

In [19]:
# export
class CatProc():
    """Proc that re-encodes the categorical values of time-series JSON files in place.

    Every file passed to `__call__` is loaded, pushed through a `CatMultiTfm`,
    and written back to the same path with the `ts_cat` and `cat` values
    replaced by the transform's outputs (converted with `.item()`).

    Parameters
    ----------
    path : directory holding the datapoints; passed to `get_meta` and, when no
        vocabulary is supplied, to `make_vocab`.
    num_of_workers : forwarded to `multithread_f` when processing files.
    vocab, o2i : optional pre-built vocabulary pieces handed to `CatMultiTfm`.
        They are only built from `path` when *both* are `None`.
    """

    def __init__(self, path, num_of_workers = None, vocab = None, o2i = None):
        if vocab is None and o2i is None:
            vocab, o2i = make_vocab(path)
        # NOTE(review): meta and vocab are captured here, at construction time.
        # If another proc rewrites the meta file before this proc is called,
        # these look stale -- confirm against `make_vocab` / `get_meta`.
        self.meta = get_meta(path)
        self.f = CatMultiTfm(vocab = vocab, o2i = o2i)
        self.num_of_workers = num_of_workers

    def __call__(self, files:List[Path]):
        """Run `_setup` on every file via `multithread_f` and return its result."""
        return multithread_f(self._setup, files, self.num_of_workers)

    def _setup(self, f:Path):
        """Encode the categoricals of the single file `f`, rewrite it, and return `f`."""
        ts = get_ts_datapoint(f)
        tsm = json2TSMulti(ts, 0, self.meta['col_names']['ts_con_names'][0], ts['_length']-1, 1, self.meta)
        tsm = self.f(tsm)
        # tsm[2] holds one encoded sequence per `ts_cat` entry, in dict order.
        for i, cat in enumerate(ts['ts_cat']):
            test_eq(len(tsm[2][i]), len(ts['ts_cat'][cat]))
            ts['ts_cat'][cat] = [o.item() for o in tsm[2][i]]
        # tsm[3] holds one encoded scalar per `cat` entry, in dict order.
        for i, cat in enumerate(ts['cat']):
            ts['cat'][cat] = tsm[3][i].item()
        # Serialize *before* opening the file: opening with 'wb' truncates it, so a
        # serialization error would otherwise destroy the original datapoint.
        payload = orjson.dumps(dict(ts))
        with open(f, 'wb') as fh:
            fh.write(payload)
        return f
        

In [20]:
# %%time
# Smoke-run CatProc with a single worker on the test-data directory.
path = Path('../data/test_data')
# NOTE(review): horizon/lookback are assigned but not used in this cell.
horizon,lookback = 7, 14
# presumably (re)creates the test datapoints under `path` -- confirm against del_create
del_create([2000]*10, path = path)

# Only the .json files directly inside `path` (no sub-folders).
fs = get_files(path, extensions='.json', folders = False)

# CatProc rewrites each file in place; `r` is whatever multithread_f returns.
proc = CatProc(path, num_of_workers = 1)
r = proc(fs)

In [21]:
# After CatProc every categorical value on disk must be a plain Python int.
for f in fs:
    ts = get_ts_datapoint(f)
    # Same check for the per-timestep ('ts_cat') and the static ('cat') categoricals.
    for group in ('ts_cat', 'cat'):
        columns = list(ts[group].values())
        for cat in set(unpack_list(columns)):
            test_eq(type(cat), int)

In [22]:
%%time
# hide
# Same CatProc smoke-run as the single-worker cell, timed with 8 workers.
path = Path('../data/test_data')
# NOTE(review): horizon/lookback are assigned but not used in this cell.
horizon,lookback = 7, 14
# presumably (re)creates the test datapoints under `path` -- confirm against del_create
del_create([2000]*10, path = path)

# Only the .json files directly inside `path` (no sub-folders).
fs = get_files(path, extensions='.json', folders = False)

proc = CatProc(path, num_of_workers = 8)
r = proc(fs)

CPU times: user 1.01 s, sys: 249 ms, total: 1.26 s
Wall time: 1.02 s


### DateFeatures

In [23]:
# export
from fastai2.tabular import *

In [122]:
# export
class DateProc:
    """Proc that adds date-part features to time-series JSON files in place.

    For every file, the `ts_cat` columns are loaded into a DataFrame and
    `add_datepart` is applied to `field_name`. The resulting columns listed in
    `cat_cols` are stored (as strings) under `ts_cat`; the columns listed in
    `con_cols` are standardised per file and stored under `ts_con`. The meta
    file in `path` is rewritten from the columns of the first processed file.

    Parameters
    ----------
    path : directory holding the datapoints and their meta file.
    field_name : name of the `ts_cat` column containing the dates.
    num_of_workers : stored on the instance; files are currently processed
        serially, so it is not used.
    con_cols : date-part columns to add as continuous features.
    cat_cols : date-part columns to add as categorical features.
    """

    def __init__(
        self,
        path: Path,
        field_name: str,
        num_of_workers=None,
        con_cols=["Year", "Day", "Dayofweek", "Dayofyear", "Elapsed"],
        cat_cols=["Year", "Day", "Dayofweek",'Is_month_end',
                  'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start',],
    ):
        self.path = path
        self.meta = get_meta(path)
        self.num_of_workers = num_of_workers
        self.field_name = field_name
        # Copy so instances never share (or mutate) the default lists; this also
        # lets callers pass any iterable while `df[self.con_cols]` still gets a list.
        self.con_cols = list(con_cols)
        self.cat_cols = list(cat_cols)

    def __call__(self, files: List[Path]):
        """Process `files`, refresh the meta file, and return the processed paths.

        Raises `AssertionError` when a non-underscore entry of the first
        datapoint is not a mapping of column name -> values.
        """
        if len(files) == 0:
            return []

        # The first file is processed up front because its columns define the new meta.
        first_file, ts = self._setup(files[0], with_ts=True)
        col_types = {}
        for key, group in ts.items():
            # Entries such as '_length' are bookkeeping, not column groups.
            if key.startswith('_'):
                continue
            try:
                col_types.update({col: python_type(vals) for col, vals in group.items()})
            except Exception as e:
                raise AssertionError(
                    f"could not derive column types for entry {key!r} "
                    f"(value of type {type(group).__name__})"
                ) from e
        _, classes, col_names, _ = reconize_cols(col_types)
        make_meta_file(self.path, classes=classes, col_names = col_names)
        self.meta = get_meta(self.path)

        # files[0] is already rewritten above; processing it again would only redo the same work.
        return [first_file] + [self._setup(f) for f in files[1:]]

    def _setup(self, f: Path, with_ts=False):
        """Add the date features to the file `f` and rewrite it.

        Returns `f`, or `(f, ts)` with the updated datapoint when `with_ts` is true.
        """
        ts = get_ts_datapoint(f)
        df = pd.DataFrame(ts["ts_cat"])
        df = add_datepart(df, self.field_name)

        # Categorical features are taken before normalisation and stored as strings.
        ts["ts_cat"].update({k: list(v.astype(str)) for k, v in df.items() if k in self.cat_cols})
        # Standardise the continuous features; the epsilon avoids division by zero
        # for columns that are constant within this file.
        con = df[self.con_cols]
        df[self.con_cols] = (con - con.mean()) / (con.std() + 1e-7)
        ts["ts_con"].update({k: list(v) for k, v in df.items() if k in self.con_cols})
        # Serialize before opening: 'wb' truncates the file, so a serialization
        # error must not be allowed to wipe the original datapoint.
        payload = orjson.dumps(dict(ts))
        with open(f, "wb") as fh:
            fh.write(payload)
        if with_ts:
            return f, ts
        return f


In [111]:
# Source rows and the scratch directory the DateProc demo works on.
path = Path("../data/m5_tiny/rows")
new_path = Path('../data/m5_tiny/rows_date')
# Remove any previous scratch copy so the demo starts from clean files.
# NOTE(review): `Path.delete` is not a pathlib method -- presumably patched in by fastcore/fastai2.
new_path.delete()


In [112]:
# Copy a small subset of the datapoints (plus the meta file) into the scratch directory.
if not new_path.exists():new_path.mkdir()
for i,f in enumerate(path.glob('*.json')):
    f.copy(new_path / f.name)
    # Stops after index 50, i.e. at most 51 files are copied.
    if i == 50:
        break
# The procs read the meta file from the same directory as the datapoints.
(path / '.ts_meta').copy(new_path / '.ts_meta')

fs = get_files(new_path, extensions=".json", folders=False)
# NOTE(review): both arguments print the 'ts_cat' keys -- the second was possibly meant to be 'ts_con'.
print(get_ts_datapoint(fs[0])['ts_cat'].keys(), get_ts_datapoint(fs[0])['ts_cat'].keys() )

dict_keys(['weekday', 'd.1', 'date']) dict_keys(['weekday', 'd.1', 'date'])


In [113]:
# Add date features derived from the 'date' column; files are rewritten in place
# and `fs` is rebound to the list of paths returned by the proc.
proc = DateProc(new_path, "date", num_of_workers=1)
fs = proc(fs)

In [114]:
# Inspect the first processed file and the regenerated meta to confirm the new columns.
print('ts_cat',get_ts_datapoint(fs[0])['ts_cat'].keys())
print('ts_con',get_ts_datapoint(fs[0])['ts_con'].keys())
print('\nmeta')
print(get_meta(new_path))

ts_cat dict_keys(['weekday', 'd.1', 'date', 'Year', 'Day', 'Dayofweek', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'])
ts_con dict_keys(['sales', 'snap_CA', 'prices', 'snap_TX', 'year', 'snap_WI', 'month', 'wm_yr_wk', 'wday', 'Year', 'Day', 'Dayofweek', 'Dayofyear', 'Elapsed'])

meta
classes:	{'Day': (#31) ['30','27','25','24','3','12','20','29','4','21'...], 'Dayofweek': (#7) ['2','6','1','0','3','5','4'], 'Is_month_end': (#2) ['True','False'], 'Is_month_start': (#2) ['True','False'], 'Is_quarter_end': (#2) ['True','False'], 'Is_quarter_start': (#2) ['True','False'], 'Is_year_end': (#2) ['True','False'], 'Is_year_start': (#2) ['True','False'], 'Year': (#6) ['2011','2014','2015','2016','2012','2013'], 'cat_id': (#3) ['HOBBIES','FOODS','HOUSEHOLD'], 'd.1': (#1969) ['d_1013','d_1730','d_793','d_1602','d_628','d_1151','d_1682','d_239','d_1405','d_1523'...], 'date': (#1969) ['2016-06-07','2015-02-10','2014-02-13','2016-01-20','2014-

In [115]:
# The continuous 'Year' feature is stored standardised, not as the raw year.
print('ts_con',get_ts_datapoint(fs[0])['ts_con']['Year'][:10])

ts_con [-1.4482177350408436, -1.4482177350408436, -1.4482177350408436, -1.4482177350408436, -1.4482177350408436, -1.4482177350408436, -1.4482177350408436, -1.4482177350408436, -1.4482177350408436, -1.4482177350408436]


## M5 example

In [116]:
# List the scratch directory contents.
new_path.ls()

(#52) [Path('../data/m5_tiny/rows_date/FOODS_2_176_CA_1.json'),Path('../data/m5_tiny/rows_date/FOODS_2_164_TX_3.json'),Path('../data/m5_tiny/rows_date/FOODS_2_181_CA_4.json'),Path('../data/m5_tiny/rows_date/FOODS_2_145_CA_3.json'),Path('../data/m5_tiny/rows_date/FOODS_2_150_TX_3.json'),Path('../data/m5_tiny/rows_date/FOODS_2_141_CA_3.json'),Path('../data/m5_tiny/rows_date/HOUSEHOLD_2_193_WI_3.json'),Path('../data/m5_tiny/rows_date/FOODS_2_174_CA_1.json'),Path('../data/m5_tiny/rows_date/FOODS_2_124_CA_3.json'),Path('../data/m5_tiny/rows_date/FOODS_2_112_TX_3.json')...]

In [117]:
# Constructing CatProc builds its vocab and loads the meta from `new_path`.
# NOTE(review): `tmf` is not used by any later cell visible here.
tmf = CatProc(new_path)


In [118]:
# Reset the scratch directory again so the M5 example starts from unprocessed files.
path = Path("../data/m5_tiny/rows")
new_path = Path('../data/m5_tiny/rows_date')
# NOTE(review): `Path.delete` is not a pathlib method -- presumably patched in by fastcore/fastai2.
new_path.delete()


In [119]:
# Re-copy the subset of datapoints (plus the meta file); same steps as the DateProc demo above.
if not new_path.exists():new_path.mkdir()
for i,f in enumerate(path.glob('*.json')):
    f.copy(new_path / f.name)
    # Stops after index 50, i.e. at most 51 files are copied.
    if i == 50:
        break
(path / '.ts_meta').copy(new_path / '.ts_meta')

fs = get_files(new_path, extensions=".json", folders=False)
# NOTE(review): both arguments print the 'ts_cat' keys -- the second was possibly meant to be 'ts_con'.
print(get_ts_datapoint(fs[0])['ts_cat'].keys(), get_ts_datapoint(fs[0])['ts_cat'].keys() )

dict_keys(['weekday', 'd.1', 'date']) dict_keys(['weekday', 'd.1', 'date'])


In [120]:
# Build the M5 dataloaders with both procs applied.
# NOTE(review): the recorded output of this cell is `KeyError: 'Day'`. Both procs are
# constructed before either runs, so CatProc's vocab/meta presumably predate the
# columns DateProc adds -- confirm before relying on this cell.
horizon,lookback = 28, 28*2
dls = MTSDataLoaders.from_m5_path(new_path, 'sales',  horizon=horizon, lookback=lookback, steps = 14,
                                  procs = [DateProc(new_path, 'date'), CatProc(new_path)])
dls.show_batch()

{'Train': 89046, 'Val': 2907, 'Validation': 51, 'Evaluation': 51}


KeyError: 'Day'

In [None]:
# Grab the first training batch only.
for o in dls.train:
    break
# Plot the first item of each batch component in its own figure.
plt.plot(o[1][0].T.cpu(),) # ts_con
plt.figure()
plt.plot(o[2][0].T.cpu(),) # ts_cat
print(o[2].shape)
plt.figure()
plt.plot(o[3][0].T.cpu(),) # cat
plt.figure()
plt.plot(o[4][0].T.cpu(),) # con


In [123]:
# hide
from nbdev.export import *
# Export the `# export` cells of the project's notebooks to their Python modules.
notebook2script()

Converted 00_core.ipynb.
Converted 00_m5.ipynb.
Converted 01_data.external.ipynb.
Converted 02_data.load.ipynb.
Converted 03_data.core.ipynb.
Converted 04_data.procs.ipynb.
Converted 05_data.m5.ipynb.
Converted 10_model.base.ipynb.
Converted 11_metrics.ipynb.
Converted 11_model.conv.ipynb.
Converted 12_compare.ipynb.
Converted 12_model.rnn.ipynb.
Converted index.ipynb.
Converted tab.model.ipynb.
Converted test.ipynb.


In [118]:
# NOTE(review): side-effecting cell -- judging by the recorded output it converts this
# notebook and stages/commits the listed files; confirm against `git_add`.
git_add('04_data.procs.ipynb', commit_msg='CatProc')

Converted 04_data.procs.ipynb.


['/home/tako/dev/fastseq/fastseq/data/procs.py',
 '/home/tako/dev/fastseq/nbs/04_data.procs.ipynb']