In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

from utils.data import *
from utils.metrics import MAPE, WAPE, MAE
from utils.dl import *

In [2]:
# Root folder of the stock-market dataset. The trailing "/" is relied upon:
# later cells build sub-paths by plain f-string concatenation
# (e.g. f"{dataset_path}synthetic/QuantGAN/").
dataset_path = "data/huge_stock_market_dataset/"

In [3]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# --- Windowing / data-loader settings (forwarded to create_ts_dl) ---
lags = 32          # also used as the model's seq_len
horizon = 8        # also used as the model's output_size
stride = 1
batch_size = 256
val_size = 0.15
test_size = 0.0
drop_last = False

# --- Model / training settings ---
features = 1       # becomes the model's input_size
# Author's original note on keep_hs: "stride == lags".
# NOTE(review): presumably keeping the hidden state only makes sense when
# windows do not overlap — confirm against RNNModel.train.
keep_hs = False
epochs = 200
verbose = False    # passed to model.train as print_info

model_params = {
    'input_size': features,
    'hidden_size': 256,
    'num_layers': 2,
    'dropout': 0.1,
    'output_size': horizon,
    'seq_len': lags,
}

cuda:0


In [4]:
# Baseline: fit one LSTM per selected series on real data only and record
# the mean train / validation loss of each fitted model.
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")

results = []
for time_series in tqdm(ts_iterator):
    # The "Close" column is both the input feature and the prediction target.
    train_dl, val_dl, test_dl, X_scaler, y_scaler = create_ts_dl(
        time_series[["Close"]],
        time_series["Close"],
        lags=lags,
        horizon=horizon,
        stride=stride,
        batch_size=batch_size,
        device=device,
        data_preprocess=("log_returns", "normalize"),
        val_size=val_size,
        test_size=test_size,
        drop_last=drop_last,
    )

    # A fresh, identically seeded model for every series.
    model = RNNModel(seed=0, device=device)
    model.set_model(SimpleLSTM, **model_params)
    optim_params = dict(params=model.model.parameters(), lr=4e-4)
    model.set_optim(torch.optim.AdamW, **optim_params)
    model.set_criterion(MAE)

    model.train(train_dl, epochs=epochs, print_info=verbose, keep_hs=keep_hs, agg_loss="mean")

    train_loss = model.eval(train_dl, agg_loss="mean")
    val_loss = model.eval(val_dl, agg_loss="mean")
    results.append({"train": train_loss, "val": val_loss})

    # Drop the references before the next series so cached GPU memory can be released.
    del model, train_dl, val_dl, test_dl
    torch.cuda.empty_cache()

0it [00:00, ?it/s]Global seed set to 0
1it [00:13, 13.86s/it]Global seed set to 0
2it [00:27, 13.47s/it]Global seed set to 0
3it [00:39, 13.20s/it]Global seed set to 0
4it [00:51, 12.51s/it]Global seed set to 0
5it [01:00, 11.10s/it]Global seed set to 0
6it [01:26, 16.41s/it]Global seed set to 0
7it [01:46, 17.49s/it]Global seed set to 0
8it [02:08, 18.83s/it]Global seed set to 0
9it [02:36, 21.67s/it]Global seed set to 0
10it [02:58, 21.95s/it]Global seed set to 0
11it [03:40, 28.00s/it]Global seed set to 0
12it [04:23, 32.57s/it]Global seed set to 0
13it [05:05, 35.42s/it]Global seed set to 0
14it [05:42, 35.84s/it]Global seed set to 0
15it [06:28, 38.92s/it]Global seed set to 0
16it [07:16, 41.81s/it]Global seed set to 0
17it [08:12, 46.00s/it]Global seed set to 0
18it [09:05, 48.04s/it]Global seed set to 0
19it [10:03, 51.17s/it]Global seed set to 0
20it [10:59, 52.54s/it]Global seed set to 0
21it [14:17, 96.17s/it]Global seed set to 0
22it [17:32, 125.98s/it]Global seed set to 0
2

In [5]:
# Show the per-series {"train", "val"} losses collected by the baseline loop.
results

[{'train': 0.2093927264213562, 'val': 0.4945301413536072},
 {'train': 0.2912781933943431, 'val': 0.7903018593788147},
 {'train': 0.1181288609902064, 'val': 0.6460710167884827},
 {'train': 0.1759635110696157, 'val': 0.9245571494102478},
 {'train': 0.15671958774328232, 'val': 0.7232301235198975},
 {'train': 0.19305574397246042, 'val': 0.7023540735244751},
 {'train': 0.3734701365232468, 'val': 1.0341641902923584},
 {'train': 0.16760402917861938, 'val': 0.611506998538971},
 {'train': 0.10949812456965446, 'val': 0.7229129374027252},
 {'train': 0.1449963256716728, 'val': 0.8546842336654663},
 {'train': 0.11426695850160387, 'val': 0.6326352059841156},
 {'train': 0.14383798506524828, 'val': 0.43182967603206635},
 {'train': 0.1573903171552552, 'val': 0.6959636509418488},
 {'train': 0.20255407691001892, 'val': 0.5029991865158081},
 {'train': 0.2740155816078186, 'val': 0.45995838940143585},
 {'train': 0.15424613505601883, 'val': 0.4852403551340103},
 {'train': 0.200125743042339, 'val': 0.53888058

In [6]:
# Persist the baseline losses. A forward slash is used because "results\\..."
# is only a directory separator on Windows; elsewhere it would create a file
# literally named "results\pure_LSTM.csv" in the working directory.
pd.DataFrame(results).to_csv("results/pure_LSTM.csv", index=False)

# Augmentation with QuantGAN synthetic data

In [6]:
class CombinedDataLoader:
    """Present several loaders as one iterable.

    Items are produced loader by loader, in the order the loaders were passed
    in; nothing is interleaved or shuffled across loaders.
    """

    def __init__(self, *dls):
        # Stored as the tuple of loaders exactly as received.
        self.dls = dls

    def __len__(self):
        """Return the sum of ``len()`` over all wrapped loaders."""
        return sum(len(loader) for loader in self.dls)

    def __iter__(self):
        """Yield every item of the first loader, then of the second, and so on."""
        for loader in self.dls:
            yield from loader

def train_synth(synthetic_path, epochs=2, include_real=False):
    """Train one LSTM per selected series on synthetic data and score it on real data.

    For every real series yielded by ``get_hsm_dataset`` the matching array
    ``selected{index}.npy`` is loaded from ``synthetic_path`` and transformed
    with a freshly fitted ``DimUniversalStandardScaler``. Each entry along the
    array's first axis becomes its own training loader. The same fitted scaler
    is passed to ``create_ts_dl`` when the real series is split into train and
    validation loaders. A fresh model (seed 0) is trained on the synthetic
    loaders and then evaluated on the real train and validation loaders.

    Parameters
    ----------
    synthetic_path : str
        Directory that contains the ``selected{index}.npy`` files. A trailing
        separator is optional.
    epochs : int, default 2
        Number of training epochs. This function deliberately does not use the
        notebook-level ``epochs`` setting; the default reproduces the value
        that used to be hard-coded here.
    include_real : bool, default False
        When true, the real training loader is iterated before the synthetic
        loaders (real + synthetic augmentation). When false, the model is
        trained on synthetic data only.

    Returns
    -------
    list of dict
        One ``{"train": ..., "val": ...}`` entry per series, holding the
        values returned by ``model.eval`` on the real train / validation loaders.
    """
    import os  # local import: only needed for the path join below

    ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")

    results = []
    # enumerate(tqdm(...)) is the form recommended by tqdm for enumerated iterables.
    for ts_index, time_series in enumerate(tqdm(ts_iterator)):
        # os.path.join works whether or not synthetic_path ends with a separator.
        synth_time_series = np.load(os.path.join(synthetic_path, f"selected{ts_index}.npy"))
        scaler = DimUniversalStandardScaler()
        synth_time_series = scaler.fit_transform(synth_time_series)

        # One training loader per synthetic sample. Validation/test splits are
        # disabled and no further preprocessing is requested, so only the first
        # returned element (the training loader) is kept.
        synth_dls = []
        for i in range(synth_time_series.shape[0]):
            synth_dl, *_ = create_ts_dl(
                synth_time_series[i].reshape(-1, 1),
                synth_time_series[i].flatten(),
                lags=lags,
                horizon=horizon,
                stride=stride,
                batch_size=batch_size,
                device=device,
                data_preprocess=(None,),
                val_size=0,
                test_size=0,
                drop_last=drop_last,
            )
            synth_dls.append(synth_dl)

        # Real data: same windowing as the baseline, but with the scaler that
        # was fitted on the synthetic array above.
        train_dl, val_dl, *_ = create_ts_dl(
            time_series[["Close"]],
            time_series["Close"],
            lags=lags,
            horizon=horizon,
            stride=stride,
            data_preprocess=("log_returns", "normalize"),
            device=device,
            val_size=val_size,
            test_size=test_size,
            batch_size=batch_size,
            drop_last=drop_last,
            scaler=scaler,
        )

        model = RNNModel(seed=0, device=device)
        model.set_model(SimpleLSTM, **model_params)
        optim_params = {'params': model.model.parameters(), 'lr': 4e-4}
        model.set_optim(torch.optim.AdamW, **optim_params)
        model.set_criterion(MAE)

        if include_real:
            cdl = CombinedDataLoader(train_dl, *synth_dls)
        else:
            # Default: synthetic data only.
            cdl = CombinedDataLoader(*synth_dls)
        model.train(cdl, epochs=epochs, print_info=verbose, keep_hs=keep_hs, agg_loss="mean")
        results.append({"train": model.eval(train_dl, agg_loss="mean"), "val": model.eval(val_dl, agg_loss="mean")})

        # Drop the references before the next series so cached GPU memory can be released.
        del model, train_dl, val_dl, cdl, synth_time_series, synth_dls
        torch.cuda.empty_cache()
    return results

In [7]:
# Train on the QuantGAN synthetic series and display the per-series losses.
results = train_synth(synthetic_path=f"{dataset_path}synthetic/QuantGAN/")
results

0it [00:00, ?it/s]Global seed set to 0
1it [00:29, 29.12s/it]Global seed set to 0
2it [01:03, 32.16s/it]Global seed set to 0
3it [01:36, 32.78s/it]Global seed set to 0
4it [02:10, 33.22s/it]Global seed set to 0
5it [02:44, 33.28s/it]Global seed set to 0
6it [03:17, 33.23s/it]Global seed set to 0
7it [03:51, 33.59s/it]Global seed set to 0
8it [04:26, 34.11s/it]Global seed set to 0
9it [05:02, 34.55s/it]Global seed set to 0
10it [05:39, 35.24s/it]Global seed set to 0
11it [06:09, 33.84s/it]Global seed set to 0
12it [06:47, 34.94s/it]Global seed set to 0
13it [07:25, 36.01s/it]Global seed set to 0
14it [08:04, 36.97s/it]Global seed set to 0
15it [08:42, 37.19s/it]Global seed set to 0
16it [09:29, 40.21s/it]Global seed set to 0
17it [10:12, 40.81s/it]Global seed set to 0
18it [11:07, 45.21s/it]Global seed set to 0
19it [11:54, 45.74s/it]Global seed set to 0
20it [12:34, 43.92s/it]Global seed set to 0
21it [13:12, 42.11s/it]Global seed set to 0
22it [13:51, 41.44s/it]Global seed set to 0
23

[{'train': 0.529029980301857, 'val': 0.4295864403247833},
 {'train': 0.7502809166908264, 'val': 0.9942901730537415},
 {'train': 0.7397843599319458, 'val': 0.5390610098838806},
 {'train': 0.677491823832194, 'val': 0.7245516777038574},
 {'train': 0.6246592700481415, 'val': 0.4809975028038025},
 {'train': 0.688608705997467, 'val': 0.6462737321853638},
 {'train': 0.9054366827011109, 'val': 1.0677287578582764},
 {'train': 0.6930517792701721, 'val': 0.6022903323173523},
 {'train': 0.7729416688283285, 'val': 0.5567872226238251},
 {'train': 0.74241042137146, 'val': 0.6309840083122253},
 {'train': 0.599002523554696, 'val': 0.4397227466106415},
 {'train': 0.7041731609238518, 'val': 0.3771653026342392},
 {'train': 0.6540345549583435, 'val': 0.5777110457420349},
 {'train': 0.7347254678606987, 'val': 0.3960066884756088},
 {'train': 0.6713309168815613, 'val': 0.36762768030166626},
 {'train': 0.628097978234291, 'val': 0.38759779930114746},
 {'train': 0.672879311171445, 'val': 0.4712396562099457},
 {'

In [8]:
# Persist the QuantGAN run. A forward slash is used because "results\\..." is
# only a directory separator on Windows; elsewhere it would create a file
# literally named "results\QuantGAN_synth_LSTM.csv" in the working directory.
pd.DataFrame(results).to_csv("results/QuantGAN_synth_LSTM.csv", index=False)

# Augmentation with FourierFlow synthetic data

In [9]:
# Train on the FourierFlow synthetic series and display the per-series losses.
results = train_synth(synthetic_path=f"{dataset_path}synthetic/FourierFlow/")
results

0it [00:00, ?it/s]Global seed set to 0
1it [00:41, 41.66s/it]Global seed set to 0
2it [01:21, 40.74s/it]Global seed set to 0
3it [02:01, 40.16s/it]Global seed set to 0
4it [02:42, 40.57s/it]Global seed set to 0
5it [03:21, 40.04s/it]Global seed set to 0
6it [04:03, 40.76s/it]Global seed set to 0
7it [04:47, 41.73s/it]Global seed set to 0
8it [05:29, 41.87s/it]Global seed set to 0
9it [06:10, 41.58s/it]Global seed set to 0
10it [06:52, 41.83s/it]Global seed set to 0
11it [07:35, 42.09s/it]Global seed set to 0
12it [08:17, 42.02s/it]Global seed set to 0
13it [08:59, 42.14s/it]Global seed set to 0
14it [09:42, 42.15s/it]Global seed set to 0
15it [10:25, 42.53s/it]Global seed set to 0
16it [11:08, 42.66s/it]Global seed set to 0
17it [11:51, 42.86s/it]Global seed set to 0
18it [12:35, 43.02s/it]Global seed set to 0
19it [13:17, 42.77s/it]Global seed set to 0
20it [13:59, 42.61s/it]Global seed set to 0
21it [14:42, 42.78s/it]Global seed set to 0
22it [15:26, 42.98s/it]Global seed set to 0
23

[{'train': 0.1695864275097847, 'val': 0.5021591186523438},
 {'train': 0.3073609173297882, 'val': 0.848341166973114},
 {'train': 0.42884289224942523, 'val': 0.4216979742050171},
 {'train': 0.27960634231567383, 'val': 0.7193499207496643},
 {'train': 0.2444256991147995, 'val': 0.5797590017318726},
 {'train': 0.48192890485127765, 'val': 0.6066265106201172},
 {'train': 0.6226737797260284, 'val': 0.952849805355072},
 {'train': 0.3543916642665863, 'val': 0.5353056192398071},
 {'train': 0.5868866840998331, 'val': 0.44092100858688354},
 {'train': 0.4939054548740387, 'val': 0.5580365657806396},
 {'train': 0.4318624238173167, 'val': 0.5588903576135635},
 {'train': 0.4312654733657837, 'val': 0.33545951545238495},
 {'train': 0.4315921233759986, 'val': 0.5692367106676102},
 {'train': 0.5169948525726795, 'val': 0.3575713187456131},
 {'train': 0.5155164271593093, 'val': 0.34426142275333405},
 {'train': 0.43204981088638306, 'val': 0.38741376996040344},
 {'train': 0.5291735096411272, 'val': 0.4150277227

In [10]:
# Persist the FourierFlow run. A forward slash is used because "results\\..." is
# only a directory separator on Windows; elsewhere it would create a file
# literally named "results\FourierFlow_synth_LSTM.csv" in the working directory.
pd.DataFrame(results).to_csv("results/FourierFlow_synth_LSTM.csv", index=False)

# Augmentation with RealNVP synthetic data

In [11]:
# Train on the RealNVP synthetic series and display the per-series losses.
results = train_synth(synthetic_path=f"{dataset_path}synthetic/RealNVP/")
results

0it [00:00, ?it/s]Global seed set to 0
1it [00:44, 44.94s/it]Global seed set to 0
2it [01:25, 42.61s/it]Global seed set to 0
3it [02:05, 41.42s/it]Global seed set to 0
4it [02:51, 42.89s/it]Global seed set to 0
5it [03:32, 42.25s/it]Global seed set to 0
6it [04:16, 42.82s/it]Global seed set to 0
7it [05:00, 43.34s/it]Global seed set to 0
8it [05:44, 43.49s/it]Global seed set to 0
9it [06:26, 42.93s/it]Global seed set to 0
10it [07:07, 42.41s/it]Global seed set to 0
11it [07:49, 42.37s/it]Global seed set to 0
12it [08:32, 42.55s/it]Global seed set to 0
13it [09:15, 42.67s/it]Global seed set to 0
14it [09:57, 42.57s/it]Global seed set to 0
15it [10:41, 42.80s/it]Global seed set to 0
16it [11:23, 42.66s/it]Global seed set to 0
17it [12:06, 42.88s/it]Global seed set to 0
18it [12:50, 43.06s/it]Global seed set to 0
19it [13:32, 42.86s/it]Global seed set to 0
20it [14:14, 42.64s/it]Global seed set to 0
21it [14:58, 43.03s/it]Global seed set to 0
22it [15:41, 43.00s/it]Global seed set to 0
23

[{'train': 0.06016901321709156, 'val': 0.562834620475769},
 {'train': 0.10256207485993703, 'val': 0.8074387311935425},
 {'train': 0.08573681116104126, 'val': 0.47008198499679565},
 {'train': 0.032542445386449494, 'val': 0.17352734506130219},
 {'train': 0.09536992758512497, 'val': 0.5945321321487427},
 {'train': 0.15892924865086874, 'val': 0.7092251777648926},
 {'train': 0.5627781212329864, 'val': 1.1119112968444824},
 {'train': 0.1066171333193779, 'val': 0.6048071384429932},
 {'train': 0.14888939758141836, 'val': 0.6328030824661255},
 {'train': 0.14658241271972655, 'val': 0.7578617334365845},
 {'train': 0.19719011419349247, 'val': 0.5948812663555145},
 {'train': 0.2471605572435591, 'val': 0.39896664023399353},
 {'train': 0.2831754783789317, 'val': 0.6197924911975861},
 {'train': 0.26914674043655396, 'val': 0.45424945652484894},
 {'train': 0.395182329416275, 'val': 0.39152607321739197},
 {'train': 0.2853864967823029, 'val': 0.4072388857603073},
 {'train': 0.4600316042249853, 'val': 0.45

In [12]:
# Persist the RealNVP run. A forward slash is used because "results\\..." is
# only a directory separator on Windows; elsewhere it would create a file
# literally named "results\RealNVP_synth_LSTM.csv" in the working directory.
pd.DataFrame(results).to_csv("results/RealNVP_synth_LSTM.csv", index=False)

# Augmentation with TTS GAN synthetic data

In [13]:
# Train on the TTS GAN synthetic series and display the per-series losses.
results = train_synth(synthetic_path=f"{dataset_path}synthetic/TTS_GAN_standard/")
results

0it [00:00, ?it/s]Global seed set to 0
1it [00:42, 42.16s/it]Global seed set to 0
2it [01:22, 40.99s/it]Global seed set to 0
3it [02:02, 40.54s/it]Global seed set to 0
4it [02:41, 40.19s/it]Global seed set to 0
5it [03:22, 40.28s/it]Global seed set to 0
6it [04:00, 39.61s/it]Global seed set to 0
7it [04:39, 39.22s/it]Global seed set to 0
8it [05:18, 39.15s/it]Global seed set to 0
9it [06:00, 40.23s/it]Global seed set to 0
10it [06:41, 40.29s/it]Global seed set to 0
11it [07:23, 40.99s/it]Global seed set to 0
12it [08:03, 40.65s/it]Global seed set to 0
13it [08:43, 40.47s/it]Global seed set to 0
14it [09:23, 40.32s/it]Global seed set to 0
15it [10:02, 39.92s/it]Global seed set to 0
16it [10:40, 39.28s/it]Global seed set to 0
17it [11:19, 39.18s/it]Global seed set to 0
18it [11:59, 39.51s/it]Global seed set to 0
19it [12:38, 39.33s/it]Global seed set to 0
20it [13:19, 39.93s/it]Global seed set to 0
21it [14:00, 40.01s/it]Global seed set to 0
22it [14:39, 39.95s/it]Global seed set to 0
23

[{'train': 0.4881722182035446, 'val': 0.4607003331184387},
 {'train': 0.5572996139526367, 'val': 0.713964581489563},
 {'train': 0.18673823277155557, 'val': 0.14936329424381256},
 {'train': 0.1340605616569519, 'val': 0.14297246932983398},
 {'train': 0.25738203525543213, 'val': 0.25669431686401367},
 {'train': 0.8873999913533529, 'val': 0.8323074579238892},
 {'train': 0.7527846217155456, 'val': 0.8888095021247864},
 {'train': 0.9714203715324402, 'val': 0.8423780798912048},
 {'train': 0.3020841255784035, 'val': 0.21439193934202194},
 {'train': 0.36037432551383974, 'val': 0.3117588460445404},
 {'train': 1.0998954508039687, 'val': 0.8056656122207642},
 {'train': 0.8524727953804864, 'val': 0.4568272829055786},
 {'train': 0.7585266431172689, 'val': 0.6735197305679321},
 {'train': 0.7814048007130623, 'val': 0.4345851391553879},
 {'train': 0.8811773717403412, 'val': 0.48122061789035797},
 {'train': 0.7701827824115753, 'val': 0.4704471677541733},
 {'train': 0.8866960243745283, 'val': 0.626573204

In [14]:
# Persist the TTS GAN run. A forward slash is used because "results\\..." is
# only a directory separator on Windows; elsewhere it would create a file
# literally named "results\TTS_GAN_synth_LSTM.csv" in the working directory.
pd.DataFrame(results).to_csv("results/TTS_GAN_synth_LSTM.csv", index=False)