In [None]:
from timeseries_fastai.imports import *
from timeseries_fastai.core import *

# Benchmark tests
> Which data-loading approach is faster?

In [None]:
ucr_path = get_ucr()

In [None]:
df_train, df_test = load_df_ucr(ucr_path, 'StarLightCurves')

Loading files from: /home/tc256760/.fastai/data/Univariate2018_arff/StarLightCurves


In [None]:
df_train.head()

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att1016,att1017,att1018,att1019,att1020,att1021,att1022,att1023,att1024,target
0,0.537303,0.531103,0.528503,0.529403,0.533603,0.540903,0.551103,0.564003,0.579603,0.597603,...,0.546903,0.545903,0.543903,0.541003,0.537203,0.532303,0.526403,0.519503,0.511403,b'3'
1,0.588398,0.593898,0.599098,0.604098,0.608798,0.613397,0.617797,0.622097,0.626097,0.630097,...,0.237399,0.246499,0.256199,0.266499,0.277399,0.288799,0.300899,0.313599,0.326899,b'3'
2,-0.0499,-0.0415,-0.0334,-0.0256,-0.0181,-0.0108,-0.0038,0.003,0.0096,0.0159,...,-0.173801,-0.161601,-0.149201,-0.136401,-0.123201,-0.109701,-0.095901,-0.081701,-0.0671,b'1'
3,1.337005,1.319805,1.302905,1.286305,1.270005,1.254005,1.238304,1.223005,1.208104,1.193504,...,1.288905,1.298505,1.307705,1.316505,1.324905,1.332805,1.340205,1.347005,1.353205,b'3'
4,0.769801,0.775301,0.780401,0.785101,0.789401,0.793301,0.796801,0.799901,0.802601,0.805101,...,0.742401,0.744501,0.747301,0.750701,0.754801,0.759501,0.765001,0.771301,0.778401,b'3'


In [None]:
# Feature columns: every column except the last one (`target`).
x_cols = df_train.columns[:-1].to_list()
x_cols[:5]

['att1', 'att2', 'att3', 'att4', 'att5']

## Tabular based DL

In [None]:
from timeseries_fastai.tabular import *

In [None]:
# Stack train and test into one frame; the rows after the train rows form the second split.
df_main = stack_train_valid(df_train, df_test)
n_train, n_total = len(df_train), len(df_main)
splits = [range_of(df_train), list(range(n_train, n_total))]
to = TSPandas(
    df_main,
    Normalize,
    x_names=x_cols,
    y_names='target',
    splits=splits,
)

In [None]:
# NOTE(review): positional args are presumably bs=32 and val_bs=128, like the other loaders in this notebook — confirm.
dls = to.dataloaders(32, 128)

A very basic performance test: time one full pass over a `DataLoader`.

In [None]:
def cycle_dl(dl):
    "Iterate once over every batch of `dl`, discarding each batch; returns `None`."
    # Batches are deliberately not unpacked, so loaders yielding any number of
    # elements per batch (not only `(x, y)` pairs) can be timed with this helper.
    for _batch in dl:
        pass

In [None]:
%time cycle_dl(dls.valid)

CPU times: user 741 ms, sys: 1.42 ms, total: 742 ms
Wall time: 188 ms


# Row by Row pandas DataLoader
> As defined in notebook 01

In [None]:
from timeseries_fastai.data import *

In [None]:
# Same data and batch sizes as above, built through the row-by-row loader.
tsdls = TSDataLoaders.from_dfs(
    df_train, df_test,
    x_cols=x_cols, label_col='target',
    bs=32, val_bs=128,
)

In [None]:
%time cycle_dl(tsdls.valid)

CPU times: user 115 ms, sys: 630 ms, total: 745 ms
Wall time: 8.27 s


# Naive approach

In [None]:
tfl = TfmdLists(df_test, [noop])
tfl[0]

att1      -0.569701
att2      -0.567901
att3      -0.565901
att4      -0.563701
att5      -0.561501
             ...   
att1021   -0.543001
att1022   -0.548501
att1023   -0.554501
att1024   -0.560801
target         b'2'
Name: 0, Length: 1025, dtype: object

In [None]:
def get_x(row):
    "Return the first 1024 values of `row` as a float32 array."
    features = row.values[:1024]
    return features.astype(np.float32)

In [None]:
get_x(df_test.iloc[0])

array([-0.56970096, -0.56790096, -0.565901  , ..., -0.5485009 ,
       -0.55450094, -0.56080097], dtype=float32)

In [None]:
tfl = TfmdLists(df_main, [get_x])
tfl[0]

array([0.5373029 , 0.53110296, 0.52850294, ..., 0.52640295, 0.51950294,
       0.51140296], dtype=float32)

In [None]:
def get_y(row):
    "Return the value stored at position 1024 of `row` (the `target` entry in these frames)."
    values = row.values
    return values[1024]

In [None]:
get_y(df_test.iloc[0])

b'2'

In [None]:
tfl_y = TfmdLists(df_test, [get_y])

In [None]:
tfl_y[0]

b'2'

SLOW: the time is spent in the `Categorize` setup:

In [None]:
%time tfl_y = TfmdLists(df_test, [get_y, Categorize])

CPU times: user 4 s, sys: 77.5 ms, total: 4.07 s
Wall time: 3.8 s


### Let's check the Pipeline

In [None]:
# Collect every label of the test frame first, then set up the Categorize vocab on them.
cat = Categorize()
labels = L(get_y(row) for _idx, row in df_test.iterrows())
cat.setup(labels)

In [None]:
pipe = Pipeline([get_y, cat])

In [None]:
%time [pipe(df_test.iloc[i]) for i in range_of(df_test)]

### Two `TfmdLists` stacked together: a `Datasets`

In [None]:
# One transform list per output: features from `get_x`, labels from `get_y` then `Categorize`.
x_tfms, y_tfms = [get_x], [get_y, Categorize]
ds = Datasets(df_test, [x_tfms, y_tfms])
ds[0]

(array([-0.56970096, -0.56790096, -0.565901  , ..., -0.5485009 ,
        -0.55450094, -0.56080097], dtype=float32),
 TensorCategory(1))

In [None]:
dl = DataLoader(ds, bs=128)

In [None]:
%time cycle_dl(dl)

CPU times: user 8.08 s, sys: 428 µs, total: 8.09 s
Wall time: 8.07 s


# DBlock
> Somewhat faster

In [None]:
# Row-wise getters plus a column-based splitter on `valid_col`
# (having a validation set is crucial for any task).
dblock = DataBlock(get_x=get_x,
                   get_y=(get_y, Categorize),
                   splitter=ColSplitter('valid_col'))

In [None]:
dbdls = dblock.dataloaders(df_main, bs=32, val_bs=128)

In [None]:
%time cycle_dl(dbdls.valid)

CPU times: user 128 ms, sys: 613 ms, total: 741 ms
Wall time: 2.23 s


Somewhat faster, but I don't know why!

In [None]:
ds = dblock.datasets(df_main)
ds

(#9236) [(array([0.5373029 , 0.53110296, 0.52850294, ..., 0.52640295, 0.51950294,
       0.51140296], dtype=float32), TensorCategory(2)),(array([0.5883976 , 0.5938976 , 0.59909755, ..., 0.30089882, 0.31359878,
       0.32689872], dtype=float32), TensorCategory(2)),(array([-0.04990044, -0.04150042, -0.03340039, ..., -0.09590058,
       -0.08170054, -0.0671005 ], dtype=float32), TensorCategory(0)),(array([1.3370049, 1.3198048, 1.3029048, ..., 1.340205 , 1.3470049,
       1.353205 ], dtype=float32), TensorCategory(2)),(array([0.76980066, 0.7753006 , 0.78040063, ..., 0.76500064, 0.7713006 ,
       0.77840066], dtype=float32), TensorCategory(2)),(array([-0.41120073, -0.41930076, -0.42610076, ..., -0.45580086,
       -0.46380088, -0.4728009 ], dtype=float32), TensorCategory(1)),(array([0.7147005 , 0.7171005 , 0.7189005 , ..., 0.74120045, 0.73590046,
       0.7304005 ], dtype=float32), TensorCategory(2)),(array([-0.6439008 , -0.64870083, -0.65320086, ..., -0.63650084,
       -0.6361008 , -0.6

Maybe `Categorize` is slow on this column?

In [None]:
%timeit vocab = CategoryMap(df_main['target'])

248 µs ± 1.48 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
%timeit vocab = CategoryMap([v for v in df_main['target']])

599 µs ± 1.46 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
# `%timeit` does not keep the names assigned by the timed statement,
# so build `vocab` explicitly before inspecting it.
vocab = CategoryMap(df_main['target'])
vocab.o2i

{b'1': 0, b'2': 1, b'3': 2}

# Categorize setup is slow over a big DataFrame.
- Why is the map slow later, then?