# Import libraries

In [1]:
stable = True # True: latest version, False: stable version

import sys
ISCOLAB = 'google.colab' in sys.modules
if ISCOLAB:
    if stable: 
        !pip install tsai
    else:
        !pip install git+https://github.com/timeseriesAI/tsai.git
    
import tsai
from tsai.all import *
print('tsai       :', tsai.__version__)
print('fastai     :', fastai.__version__)
print('fastcore   :', fastcore.__version__)
print('torch      :', torch.__version__)

tsai       : 0.2.8
fastai     : 2.1.6
fastcore   : 1.3.5
torch      : 1.7.0


# How to prepare the input to a neural network?

LSST state-of-the-art accuracy = 0.64; balanced accuracy: 0.458

Noisy chart!

# Experiments

## Scaling

In [87]:
# Scaling experiment: train the same InceptionTime model on LSST once per
# candidate batch transform and rank the runs by final validation accuracy.
dsid = 'LSST' 
X, y, splits = get_UCR_data(dsid, split_data=False)
# Each entry is passed as-is to `batch_tfms`: None (no preprocessing baseline),
# a single transform, or a list of transforms.
batch_tfm_list = [None, 
                  TSStandardize(), TSStandardize(by_sample=True), TSStandardize(by_var=True), TSStandardize(by_sample=True, by_var=True), 
                  TSNormalize(), TSNormalize(by_sample=True), TSNormalize(by_var=True), TSNormalize(by_sample=True, by_var=True), 
                  TSRobustScaler(), TSRobustScaler(by_sample=True), TSRobustScaler(by_var=True), TSRobustScaler(by_sample=True, by_var=True), 
                  TSClipOutliers(), TSClipOutliers(by_sample=True), TSClipOutliers(by_var=True), TSClipOutliers(by_sample=True, by_var=True), 
                  [TSRobustScaler(), TSClipOutliers()], [TSRobustScaler(by_sample=True), TSClipOutliers(by_sample=True)], 
                  [TSRobustScaler(by_var=True), TSClipOutliers(by_var=True)], 
                  [TSRobustScaler(by_sample=True, by_var=True), TSClipOutliers(by_sample=True, by_var=True)]]
result_cols = ['preprocessor', 'train loss', 'valid loss', 'accuracy', 'time']
# Defined up front so `results` exists even if the loop body never runs.
results = pd.DataFrame(columns=result_cols)
# One dict per finished run; the table is rebuilt from this list rather than
# grown and re-sorted in place, so numeric columns get numeric dtypes.
records = []
for i, bt in enumerate(batch_tfm_list): 
    # L() lets None, a single transform or a list of transforms be listed uniformly.
    bt_name = list(L(bt))
    print(f'{i} {bt_name}')
    tfms  = [None, Categorize()]
    dls = get_ts_dls(X, y, splits=splits, tfms=tfms, batch_tfms=bt)
    model = build_model(InceptionTime, dls=dls)
    learn = Learner(dls, model,  metrics=accuracy)
    # Time only the training call, not data loader / model construction.
    start = time.time()
    learn.fit_one_cycle(50, 1e-3)
    elapsed = time.time() - start
    # Values recorded for the last epoch; stored below as train loss,
    # valid loss and accuracy (in that order).
    vals = learn.recorder.values[-1]
    records.append(dict(zip(result_cols, [bt_name, vals[0], vals[1], vals[2], int(elapsed)])))
    results = pd.DataFrame(records, columns=result_cols).sort_values(by='accuracy', ascending=False, ignore_index=True)
    # Replace the training output with the up-to-date leaderboard.
    clear_output()
    display(results)
beep()

Unnamed: 0,preprocessor,train loss,valid loss,accuracy,time
0,"[TSStandardize(by_sample=True, by_var=False)]",0.001444,1.482326,0.690187,70
1,"[TSNormalize(by_sample=True, by_var=False)]",0.002735,1.41287,0.689376,70
2,"[TSClipOutliers(by_sample=False, by_var=False)]",0.008089,1.759094,0.628548,70
3,"[TSClipOutliers(by_sample=False, by_var=True)]",0.009819,1.648627,0.628143,69
4,"[TSNormalize(by_sample=True, by_var=True)]",0.002984,,0.620438,71
5,"[TSNormalize(by_sample=True, by_var=True)]",0.002503,,0.619627,70
6,"[TSRobustScaler(by_sample=False, by_var=True), TSClipOutliers(by_sample=False, by_var=True)]",0.00888,1.748324,0.611111,70
7,"[TSRobustScaler(by_sample=True, by_var=False), TSClipOutliers(by_sample=True, by_var=False)]",0.000916,1.817344,0.6103,81
8,"[TSRobustScaler(by_sample=False, by_var=False), TSClipOutliers(by_sample=False, by_var=False)]",0.009028,1.766534,0.60665,71
9,"[TSNormalize(by_sample=False, by_var=False)]",0.635609,1.250225,0.605028,71


## Stationarity

In [2]:
# Stationarity experiment: same training loop as for scaling, but comparing
# differencing on its own and combined (in both orders) with per-sample
# standardization.
dsid = 'LSST' 
X, y, splits = get_UCR_data(dsid, split_data=False)
batch_tfm_list = [TSDiff(), [TSDiff(), TSStandardize(by_sample=True)], [TSStandardize(by_sample=True), TSDiff()]]
result_cols = ['preprocessor', 'train loss', 'valid loss', 'accuracy', 'time']
# Defined up front so `results4` exists even if the loop body never runs.
results4 = pd.DataFrame(columns=result_cols)
# One dict per finished run; the table is rebuilt from this list rather than
# grown and re-sorted in place, so numeric columns get numeric dtypes.
records = []
for i, bt in enumerate(batch_tfm_list): 
    # L() lets a single transform or a list of transforms be listed uniformly.
    bt_name = list(L(bt))
    print(f'{i} {bt_name}')
    tfms  = [None, Categorize()]
    dls = get_ts_dls(X, y, splits=splits, tfms=tfms, batch_tfms=bt)
    model = build_model(InceptionTime, dls=dls)
    learn = Learner(dls, model,  metrics=accuracy)
    # Time only the training call, not data loader / model construction.
    start = time.time()
    learn.fit_one_cycle(50, 1e-3)
    elapsed = time.time() - start
    # Values recorded for the last epoch; stored below as train loss,
    # valid loss and accuracy (in that order).
    vals = learn.recorder.values[-1]
    records.append(dict(zip(result_cols, [bt_name, vals[0], vals[1], vals[2], int(elapsed)])))
    results4 = pd.DataFrame(records, columns=result_cols).sort_values(by='accuracy', ascending=False, ignore_index=True)
    # Replace the training output with the up-to-date leaderboard.
    clear_output()
    display(results4)
beep()

Unnamed: 0,preprocessor,train loss,valid loss,accuracy,time
0,"[TSStandardize(by_sample=True, by_var=False), TSDiff(lag=1, pad=True)]",0.000984,1.8223,0.628954,71
1,"[TSDiff(lag=1, pad=True), TSStandardize(by_sample=True, by_var=False)]",0.000687,1.968431,0.608272,71
2,"[TSDiff(lag=1, pad=True)]",0.321753,3.684876,0.340227,70


# Discussion:

Data preprocessing can have a dramatic impact on performance as we have just seen. There are 2 important aspects to take into account: 

1. **Scaling:**
    When you scale time series data, it's important to decide: what do you want to preserve?

    * the ratio between different samples   ---> by_sample=False
    * the ratio between different variables ---> by_var=False
    * both of them                          ---> by_sample=False, by_var=False
    * neither of them                       ---> by_sample=True, by_var=True
    
    My preferred option is to test `None` (as a baseline), `TSStandardize()`, `TSStandardize(by_sample=True)`, and `TSStandardize(by_var=True)`. If either of the last two improves performance, I may also test `TSStandardize(by_sample=True, by_var=True)`. 


2. **Stationarity:**
    Another aspect to consider is: should I make the time series stationary?
    In general, I have not seen much improvement from making a time series stationary. If you want to try it, though, there are 3 options: 
    
    * TSDiff: for differencing. It can be used with both positive and negative inputs.
    * TSLog: to apply a logarithm to the inputs. It can only be used with positive inputs.
    * TSLogReturn: to apply differencing to the logarithm of the inputs. It can only be used with positive inputs.
    

# Conclusions

Data preprocessing is an important requirement for achieving great performance in many cases. It's important to consider 2 things: 

* how you want to scale your time series data
* whether or not to make the data stationary