### Transformers will pass the answer-leaking test.
Let's say there exists a column in the dataset that is the target moved back one time step, so that it leaks the answer. Can the transformer-based model find and use this information? This is literal causation, not just a correlation.

In [None]:
from utils.tools import dotdict
from exp.exp_informer import Exp_Informer
import torch
from utils.ipynb_helpers import setting_from_args, read_data, write_df, handle_gpu
import os

In [None]:
# Experiment configuration for the answer-leak test. Everything is collected on
# a single `args` object that is later handed to the experiment class.
args = dotdict()

args.model = "informer"  # model of experiment, options: [informer, informerstack, informerlight(TBD)]

args.data = "custom"  # data
args.root_path = "./data/stock/"  # root path of data file


args.data_path = "close.csv"  # data file, relative to root_path
args.features = "MS"  # forecasting task, options:[M, S, MS]; M:multivariate predict multivariate, S:univariate predict univariate, MS:multivariate predict univariate
args.target = "XOM_close"  # target feature in S or MS task, written as "<ticker>_<field>"
args.freq = "t"  # freq for time features encoding, options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly], you can also use more detailed freq like 15min or 3h
args.checkpoints = "./checkpoints"  # location of model checkpoints

# Informer decoder input: concat[start token series(label_len), zero padding series(pred_len)]

args.c_out = 1  # output size
args.factor = 5  # probsparse attn factor
args.d_model = 512  # dimension of model
args.n_heads = 8  # num of heads
args.e_layers = 2  # num of encoder layers
args.d_layers = 1  # num of decoder layers
args.d_ff = 2048  # dimension of fcn in model
args.dropout = 0.05  # dropout
args.attn = "prob"  # attention used in encoder, options:[prob, full]
args.t_embed = "timeF"  # time features encoding, options:[timeF, fixed, learned]
args.activation = "gelu"  # activation
args.distil = True  # whether to use distilling in encoder
args.output_attention = False  # whether to output attention in encoder
args.mix = True  # NOTE(review): presumably the decoder "mix attention" switch of the Informer model - confirm
args.padding = 0  # NOTE(review): presumably the padding mode/value used for the decoder input - confirm

args.seq_len = 64  # input sequence length of Informer encoder
args.label_len = 32  # start token length of Informer decoder
args.pred_len = 16  # prediction sequence length

# Columns fed to the model. enc_in / dec_in are set to the number of listed
# columns (two here), so keep the three values in step when editing.
args.cols = [args.target, "WTI_close"]
args.enc_in = 2  # encoder input size
args.dec_in = 2  # decoder input size


# NOTE(review): presumably the start of the test split and the first date
# loaded from the data file - confirm against the data loader.
args.date_test = "2022-04-01"
args.date_start = "2021-01-01"

args.batch_size = 128
args.learning_rate = 0.00001
args.loss = "mse"
args.lradj = "type1"
args.use_amp = False  # whether to use automatic mixed precision training

args.num_workers = 0
args.itr = 3  # number of runs
args.max_epochs = 10
args.patience = 4
args.des = "assumption_leak"

args.scale = True
args.inverse = True  # False by default, but @Zac thinks it should be True

# Project helper from utils.ipynb_helpers; presumably fills in the GPU-related
# fields on args - confirm in the helper.
handle_gpu(args, None)

# Keep the full frequency string in detail_freq, then reduce freq to its last
# character (so a detailed value such as "15min" would become "n").
# NOTE(review): presumably the time-feature encoding expects a one-letter code - confirm.
args.detail_freq = args.freq
args.freq = args.freq[-1:]

# Alias for the experiment class instantiated by the training loop.
Exp = Exp_Informer

In [None]:
# Load the table the experiment reads from.
path = os.path.join(args.root_path, args.data_path)
df = read_data(path)

# The target is written "<ticker>_<field>" (e.g. "XOM_close"); split on the
# first underscore only. A target without an underscore now raises ValueError
# here instead of being silently mis-split into a truncated ticker.
tick, dat = args.target.split("_", 1)

# Build the leaking column: every row receives the target value of the NEXT
# row. The final row has no successor, so it is filled with its own value.
# `.iloc[-1]` is used because plain `temp[-1]` is a label lookup; its
# positional fallback is deprecated in pandas and fails on an integer index.
temp = df[tick, dat]
temp = temp.shift(-1, fill_value=temp.iloc[-1])

# Store the shifted series under (ticker, "<field>shift") and re-sort the
# column labels after the insertion.
new_col_name = f"{dat}shift"
df[tick, new_col_name] = temp
df.sort_index(axis=1, inplace=True)

# Flat "<ticker>_<field>shift" name, in the same format as args.target, for
# the cell that registers the new column on args.
new_col_name = f"{tick}_{new_col_name}"

df.tail()

In [None]:
# Run this cell exactly once: it appends to args.cols and increments the input
# sizes in place, so a second execution would apply those changes twice.

# Write the augmented frame out; write_df hands back the path of the new file.
new_path = write_df(df, path, append="shift")

# Drop the leading root_path characters so data_path is relative to the root again.
root_len = len(args.root_path)
args.data_path = new_path[root_len:]
print(args.data_path)

# The shifted column adds one input channel to both encoder and decoder.
args.enc_in = args.enc_in + 1
args.dec_in = args.dec_in + 1

# Register the new column when an explicit column list is in use.
if args.cols is not None:
    args.cols.append(new_col_name)

In [None]:
# Most recent experiment and its setting tag; defined up front so both names
# exist even if the loop body never runs.
exp, setting = None, None

for run_idx in range(args.itr):
    # Tag for this repetition, derived from the args and the run index.
    setting = setting_from_args(args, run_idx)

    # Log the configuration in use, then build a fresh experiment from it.
    print(args)
    exp = Exp(args)

    # Fit the model.
    print(f">>>>>>>start training : {setting}>>>>>>>>>>>>>>>>>>>>>>>>>>")
    exp.train(setting)

    # Evaluate the model.
    print(f">>>>>>>testing : {setting}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
    exp.test(setting)

    # Release cached CUDA memory before the next repetition.
    torch.cuda.empty_cache()