In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm import tqdm

from utils.data import *
from utils.metrics import MAPE, WAPE, MAE
from utils.dl import *

In [2]:
# Root directory of the stock-market dataset. The trailing slash is significant:
# later cells build sub-paths by plain string concatenation (f"{dataset_path}synthetic/...").
dataset_path = "data/huge_stock_market_dataset/"

In [3]:
# Run on the first CUDA device when one is visible, otherwise on the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# --- windowing of each series (passed to create_ts_dl) ---
lags = 32
horizon = 8
stride = 1

# --- loader / split settings (passed to create_ts_dl) ---
batch_size = 256
val_size = 0.15
test_size = 0.0
drop_last = False

# --- model input width; used as 'input_size' below ---
features = 1

# --- training settings (passed to model.train) ---
# NOTE(review): the original inline note on keep_hs reads "stride == lags";
# presumably keep_hs is only meant to be enabled in that case -- confirm.
keep_hs = False
epochs = 200
verbose = False

# Keyword arguments forwarded to the model constructor via model.set_model(...).
model_params = dict(
    input_size=features,
    hidden_size=256,
    num_layers=2,
    dropout=0.1,
    output_size=horizon,
    seq_len=lags,
)

cuda:0


In [4]:
# Baseline run: for every series yielded by get_hsm_dataset, fit a fresh model on
# that series' own training loader and record the train / validation losses.
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")

results = []
for time_series in tqdm(ts_iterator):
    # The "Close" column is passed both as the feature frame and as the target series.
    train_dl, val_dl, test_dl, X_scaler, y_scaler = create_ts_dl(
        time_series[["Close"]],
        time_series["Close"],
        lags=lags,
        horizon=horizon,
        stride=stride,
        batch_size=batch_size,
        device=device,
        data_preprocess="log_returns",
        val_size=val_size,
        test_size=test_size,
        drop_last=drop_last,
    )

    # A new model, optimizer and criterion are created for every series.
    model = RNNModel(seed=0, device=device)
    model.set_model(SimpleLSTM, **model_params)
    optim_params = dict(params=model.model.parameters(), lr=4e-4)
    model.set_optim(torch.optim.AdamW, **optim_params)
    model.set_criterion(MAE)

    model.train(train_dl, epochs=epochs, print_info=verbose, keep_hs=keep_hs, agg_loss="mean")

    # Evaluate on the training loader first, then on the validation loader.
    train_loss = model.eval(train_dl, agg_loss="mean")
    val_loss = model.eval(val_dl, agg_loss="mean")
    results.append({"train": train_loss, "val": val_loss})

    # Drop references from this iteration before asking CUDA to release cached blocks.
    del model, train_dl, val_dl, test_dl
    torch.cuda.empty_cache()

0it [00:00, ?it/s]Global seed set to 0
1it [00:09,  9.65s/it]Global seed set to 0
2it [00:23, 11.92s/it]Global seed set to 0
3it [00:36, 12.53s/it]Global seed set to 0
4it [00:48, 12.25s/it]Global seed set to 0
5it [00:57, 11.01s/it]Global seed set to 0
6it [01:23, 16.30s/it]Global seed set to 0
7it [01:43, 17.62s/it]Global seed set to 0
8it [02:06, 19.08s/it]Global seed set to 0
9it [02:34, 21.97s/it]Global seed set to 0
10it [02:57, 22.24s/it]Global seed set to 0
11it [03:39, 28.33s/it]Global seed set to 0
12it [04:22, 32.90s/it]Global seed set to 0
13it [05:05, 35.73s/it]Global seed set to 0
14it [05:42, 36.13s/it]Global seed set to 0
15it [06:29, 39.44s/it]Global seed set to 0
16it [07:18, 42.35s/it]Global seed set to 0
17it [08:10, 45.33s/it]Global seed set to 0
18it [09:02, 47.32s/it]Global seed set to 0
19it [09:54, 48.75s/it]Global seed set to 0
20it [10:47, 49.84s/it]Global seed set to 0
21it [13:52, 90.65s/it]Global seed set to 0
22it [17:18, 125.18s/it]Global seed set to 0
2

In [6]:
# Display the collected losses: one {"train": ..., "val": ...} dict per processed series.
results

[{'train': 0.09587069973349571, 'val': 0.09166862070560455},
 {'train': 0.03804870073994001, 'val': 0.052777111530303955},
 {'train': 0.007281795144081116, 'val': 0.005239419639110565},
 {'train': 0.009841733146458864, 'val': 0.010587431490421295},
 {'train': 0.016299376729875803, 'val': 0.012811913155019283},
 {'train': 0.0273998969544967, 'val': 0.027212876826524734},
 {'train': 0.0268426064401865, 'val': 0.03662586584687233},
 {'train': 0.01703375708311796, 'val': 0.015111961401998997},
 {'train': 0.008660931993896762, 'val': 0.006156163290143013},
 {'train': 0.00804837578907609, 'val': 0.00693837646394968},
 {'train': 0.052466646250751287, 'val': 0.0384663101285696},
 {'train': 0.03620564006268978, 'val': 0.020883575081825256},
 {'train': 0.03437461662623617, 'val': 0.0320797311142087},
 {'train': 0.011673742556013167, 'val': 0.006538719404488802},
 {'train': 0.018509079050272704, 'val': 0.010300817899405956},
 {'train': 0.0391908086836338, 'val': 0.027567469514906406},
 {'train': 

In [7]:
# Persist the baseline losses as CSV (one row per series, columns "train" and "val").
# The path is built with pathlib so the file lands inside results/ on every OS: the
# previous "results\\..." literal is only a directory path on Windows and, elsewhere,
# produces a file whose name contains a backslash. The directory is created on demand
# so the export does not fail on a fresh checkout.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)
pd.DataFrame(results).to_csv(results_dir / "pure_LSTM.csv", index=False)

# Augmentation with QuantGAN synthetic data

In [4]:
# Directory holding the QuantGAN-generated series. The training cell loads
# f"{synthetic_path}selected{ts_index}.npy", so the trailing slash is required.
synthetic_path = f"{dataset_path}synthetic/QuantGAN/"

In [4]:
class CombinedDataLoader:
    """Present several data loaders as a single iterable.

    Items are produced loader by loader, in the order the loaders were passed
    to the constructor; nothing is shuffled or interleaved across loaders.
    """

    def __init__(self, *dls):
        """Keep the given loaders (as the tuple of positional arguments)."""
        self.dls = dls

    def __len__(self):
        """Return the sum of ``len()`` over all wrapped loaders."""
        return sum(len(loader) for loader in self.dls)

    def __iter__(self):
        """Yield every item of each wrapped loader, one loader after another."""
        for loader in self.dls:
            yield from loader

In [6]:
# QuantGAN experiment: for every real series, train a fresh model on the matching
# synthetic samples only, then evaluate it on the real train / validation loaders.
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")
# NOTE: this rebinds the notebook-wide `epochs` (200 in the config cell); any cell
# executed after this one sees the new value.
epochs = 2

results = []
for ts_index, time_series in tqdm(enumerate(ts_iterator)):
    # Loaders built from the real "Close" column; in this cell they are used for evaluation only.
    train_dl, val_dl, test_dl, X_scaler, y_scaler = create_ts_dl(
        time_series[["Close"]], time_series["Close"],
        lags=lags, horizon=horizon, stride=stride,
        batch_size=batch_size, device=device, data_preprocess="log_returns",
        val_size=val_size, test_size=test_size, drop_last=drop_last,
    )

    # The synthetic file is selected by the series' position in the iterator.
    synth_time_series = np.load(f"{synthetic_path}selected{ts_index}.npy")
    synth_dls = []
    for i in range(synth_time_series.shape[0]):
        # reshape(-1, 1) lets numpy infer the series length. The previous
        # reshape(127, 1) raised for any sample that was not exactly 127 values
        # long and disagreed with the FourierFlow / RealNVP cells.
        synth_dl, *_ = create_ts_dl(
            synth_time_series[i].reshape(-1, 1), synth_time_series[i],
            lags=lags, horizon=horizon, stride=stride,
            batch_size=batch_size, device=device, data_preprocess=None,
            val_size=0, test_size=0, drop_last=drop_last,
        )
        synth_dls.append(synth_dl)

    model = RNNModel(seed=0, device=device)
    model.set_model(SimpleLSTM, **model_params)
    optim_params = {'params': model.model.parameters(), 'lr': 4e-4}
    model.set_optim(torch.optim.AdamW, **optim_params)
    model.set_criterion(MAE)

    # Train on synthetic data only. To mix in the real training data as well,
    # build the loader as CombinedDataLoader(train_dl, *synth_dls) instead.
    cdl = CombinedDataLoader(*synth_dls)
    model.train(cdl, epochs=epochs, print_info=verbose, keep_hs=keep_hs, agg_loss="mean")
    results.append({"train": model.eval(train_dl, agg_loss="mean"), "val": model.eval(val_dl, agg_loss="mean")})

    # Drop references from this iteration before asking CUDA to release cached blocks.
    del model, train_dl, val_dl, test_dl
    torch.cuda.empty_cache()

0it [00:00, ?it/s]Global seed set to 0
1it [00:37, 37.08s/it]Global seed set to 0
2it [01:12, 36.04s/it]Global seed set to 0
3it [01:45, 34.79s/it]Global seed set to 0
4it [02:19, 34.33s/it]Global seed set to 0
5it [02:52, 33.77s/it]Global seed set to 0
6it [03:25, 33.58s/it]Global seed set to 0
7it [03:59, 33.85s/it]Global seed set to 0
8it [04:35, 34.52s/it]Global seed set to 0
9it [05:12, 35.09s/it]Global seed set to 0
10it [05:48, 35.37s/it]Global seed set to 0
11it [06:17, 33.50s/it]Global seed set to 0
12it [06:57, 35.42s/it]Global seed set to 0
13it [07:36, 36.66s/it]Global seed set to 0
14it [08:13, 36.72s/it]Global seed set to 0
15it [08:50, 36.75s/it]Global seed set to 0
16it [09:26, 36.73s/it]Global seed set to 0
17it [10:02, 36.50s/it]Global seed set to 0
18it [10:40, 36.79s/it]Global seed set to 0
19it [11:16, 36.63s/it]Global seed set to 0
20it [11:52, 36.46s/it]Global seed set to 0
21it [12:30, 36.87s/it]Global seed set to 0
22it [13:08, 37.28s/it]Global seed set to 0
23

In [7]:
# Display the collected losses: one {"train": ..., "val": ...} dict per processed series.
results

[{'train': 0.10761301219463348, 'val': 0.08768519759178162},
 {'train': 0.03998527179161707, 'val': 0.05282701551914215},
 {'train': 0.007818675444771847, 'val': 0.005705437157303095},
 {'train': 0.010074544697999954, 'val': 0.01072484441101551},
 {'train': 0.01676370482891798, 'val': 0.012739374302327633},
 {'train': 0.029049171445270378, 'val': 0.027288392186164856},
 {'train': 0.029441210627555846, 'val': 0.03475208207964897},
 {'train': 0.01741277687251568, 'val': 0.014956658706068993},
 {'train': 0.008807544053221742, 'val': 0.006362587679177523},
 {'train': 0.008097720332443714, 'val': 0.006888795644044876},
 {'train': 0.0534266730149587, 'val': 0.03936200402677059},
 {'train': 0.038627251154846616, 'val': 0.020633424632251263},
 {'train': 0.035616915259096354, 'val': 0.03144158702343702},
 {'train': 0.01203392876777798, 'val': 0.006497248541563749},
 {'train': 0.018806928768754005, 'val': 0.010355071164667606},
 {'train': 0.043070480972528455, 'val': 0.026674081571400166},
 {'tr

In [8]:
# Persist the QuantGAN-run losses as CSV (one row per series, columns "train" and "val").
# The path is built with pathlib so the file lands inside results/ on every OS: the
# previous "results\\..." literal is only a directory path on Windows and, elsewhere,
# produces a file whose name contains a backslash. The directory is created on demand
# so the export does not fail on a fresh checkout.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)
pd.DataFrame(results).to_csv(results_dir / "QuantGAN_synth_LSTM.csv", index=False)

# Augmentation with FourierFlow synthetic data

In [6]:
# Directory holding the FourierFlow-generated series. The training cell loads
# f"{synthetic_path}selected{ts_index}.npy", so the trailing slash is required.
synthetic_path = f"{dataset_path}synthetic/FourierFlow/"

In [7]:
# FourierFlow experiment: per real series, fit a fresh model on the matching
# synthetic samples only and score it on the real train / validation loaders.
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")
epochs = 2

# Loader settings shared by every synthetic sample: no preprocessing and no
# validation / test split.
synth_loader_kwargs = dict(
    lags=lags,
    horizon=horizon,
    stride=stride,
    batch_size=batch_size,
    device=device,
    data_preprocess=None,
    val_size=0,
    test_size=0,
    drop_last=drop_last,
)

results = []
for ts_index, time_series in tqdm(enumerate(ts_iterator)):
    # Loaders built from the real "Close" column; used here for evaluation only.
    train_dl, val_dl, test_dl, X_scaler, y_scaler = create_ts_dl(
        time_series[["Close"]],
        time_series["Close"],
        lags=lags,
        horizon=horizon,
        stride=stride,
        batch_size=batch_size,
        device=device,
        data_preprocess="log_returns",
        val_size=val_size,
        test_size=test_size,
        drop_last=drop_last,
    )

    # The synthetic file is selected by the series' position in the iterator.
    synth_time_series = np.load(f"{synthetic_path}selected{ts_index}.npy")
    synth_dls = []
    for i in range(synth_time_series.shape[0]):
        sample = synth_time_series[i]
        # Only the first returned item (the training loader) is kept.
        synth_dl, *_ = create_ts_dl(sample.reshape(-1, 1), sample, **synth_loader_kwargs)
        synth_dls.append(synth_dl)

    model = RNNModel(seed=0, device=device)
    model.set_model(SimpleLSTM, **model_params)
    optim_params = dict(params=model.model.parameters(), lr=4e-4)
    model.set_optim(torch.optim.AdamW, **optim_params)
    model.set_criterion(MAE)

    # Training sees synthetic data only; the real loaders are used just for scoring.
    cdl = CombinedDataLoader(*synth_dls)
    model.train(cdl, epochs=epochs, print_info=verbose, keep_hs=keep_hs, agg_loss="mean")

    train_loss = model.eval(train_dl, agg_loss="mean")
    val_loss = model.eval(val_dl, agg_loss="mean")
    results.append({"train": train_loss, "val": val_loss})

    # Drop references from this iteration before asking CUDA to release cached blocks.
    del model, train_dl, val_dl, test_dl
    torch.cuda.empty_cache()

0it [00:00, ?it/s]Global seed set to 0
1it [00:45, 45.91s/it]Global seed set to 0
2it [01:26, 42.60s/it]Global seed set to 0
3it [02:03, 40.27s/it]Global seed set to 0
4it [02:43, 40.07s/it]Global seed set to 0
5it [03:24, 40.53s/it]Global seed set to 0
6it [04:05, 40.63s/it]Global seed set to 0
7it [04:47, 40.96s/it]Global seed set to 0
8it [05:28, 40.91s/it]Global seed set to 0
9it [06:08, 40.68s/it]Global seed set to 0
10it [06:48, 40.56s/it]Global seed set to 0
11it [07:29, 40.65s/it]Global seed set to 0
12it [08:11, 41.00s/it]Global seed set to 0
13it [08:52, 41.22s/it]Global seed set to 0
14it [09:33, 41.00s/it]Global seed set to 0
15it [10:14, 41.13s/it]Global seed set to 0
16it [10:55, 40.88s/it]Global seed set to 0
17it [11:36, 41.06s/it]Global seed set to 0
18it [12:18, 41.16s/it]Global seed set to 0
19it [13:00, 41.58s/it]Global seed set to 0
20it [13:41, 41.43s/it]Global seed set to 0
21it [14:23, 41.52s/it]Global seed set to 0
22it [15:05, 41.60s/it]Global seed set to 0
23

In [8]:
# Display the collected losses: one {"train": ..., "val": ...} dict per processed series.
results

[{'train': 0.05333846993744373, 'val': 0.10541445016860962},
 {'train': 0.037882319341103234, 'val': 0.059613317251205444},
 {'train': 0.007252447151889403, 'val': 0.005394076928496361},
 {'train': 0.01003556822737058, 'val': 0.010693907737731934},
 {'train': 0.016704676672816277, 'val': 0.014040783047676086},
 {'train': 0.028331669978797436, 'val': 0.028229130432009697},
 {'train': 0.029098593071103095, 'val': 0.03820819780230522},
 {'train': 0.017204161174595355, 'val': 0.015449265949428082},
 {'train': 0.008985073616107305, 'val': 0.0061752330511808395},
 {'train': 0.008392451517283916, 'val': 0.007161007262766361},
 {'train': 0.054158644957674876, 'val': 0.039467787370085716},
 {'train': 0.03755103134446674, 'val': 0.021011995151638985},
 {'train': 0.03527219076123503, 'val': 0.03197217732667923},
 {'train': 0.011848160065710545, 'val': 0.0066019208170473576},
 {'train': 0.018985438998788594, 'val': 0.010308427270501852},
 {'train': 0.042418853379786015, 'val': 0.027426238171756268

In [9]:
# Persist the FourierFlow-run losses as CSV (one row per series, columns "train" and "val").
# The path is built with pathlib so the file lands inside results/ on every OS: the
# previous "results\\..." literal is only a directory path on Windows and, elsewhere,
# produces a file whose name contains a backslash. The directory is created on demand
# so the export does not fail on a fresh checkout.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)
pd.DataFrame(results).to_csv(results_dir / "FourierFlow_synth_LSTM.csv", index=False)

# Augmentation with RealNVP synthetic data

In [5]:
# RealNVP experiment: per real series, fit a fresh model on the matching
# synthetic samples only and score it on the real train / validation loaders.
synthetic_path = f"{dataset_path}synthetic/RealNVP/"
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")
epochs = 2

# Settings for the loaders built from the real series.
real_loader_kwargs = dict(
    lags=lags, horizon=horizon, stride=stride,
    batch_size=batch_size, device=device, data_preprocess="log_returns",
    val_size=val_size, test_size=test_size, drop_last=drop_last,
)
# Settings for the loaders built from synthetic samples: no preprocessing and
# no validation / test split.
synth_loader_kwargs = dict(
    lags=lags, horizon=horizon, stride=stride,
    batch_size=batch_size, device=device, data_preprocess=None,
    val_size=0, test_size=0, drop_last=drop_last,
)

results = []
for ts_index, time_series in tqdm(enumerate(ts_iterator)):
    close_frame = time_series[["Close"]]
    close_series = time_series["Close"]
    train_dl, val_dl, test_dl, X_scaler, y_scaler = create_ts_dl(close_frame, close_series, **real_loader_kwargs)

    # The synthetic file is selected by the series' position in the iterator.
    synth_time_series = np.load(f"{synthetic_path}selected{ts_index}.npy")
    synth_dls = []
    for i in range(synth_time_series.shape[0]):
        # Each sample is passed once as a single-column 2-D array and once unchanged;
        # only the first returned item (the training loader) is kept.
        synth_dl, *_ = create_ts_dl(
            synth_time_series[i].reshape(-1, 1),
            synth_time_series[i],
            **synth_loader_kwargs,
        )
        synth_dls.append(synth_dl)

    model = RNNModel(seed=0, device=device)
    model.set_model(SimpleLSTM, **model_params)
    optim_params = dict(params=model.model.parameters(), lr=4e-4)
    model.set_optim(torch.optim.AdamW, **optim_params)
    model.set_criterion(MAE)

    # Training sees synthetic data only; the real loaders are used just for scoring.
    cdl = CombinedDataLoader(*synth_dls)
    model.train(cdl, epochs=epochs, print_info=verbose, keep_hs=keep_hs, agg_loss="mean")

    # Dict insertion order keeps the original evaluation order: train first, then val.
    eval_loaders = {"train": train_dl, "val": val_dl}
    results.append({split: model.eval(loader, agg_loss="mean") for split, loader in eval_loaders.items()})

    # Drop references from this iteration before asking CUDA to release cached blocks.
    # eval_loaders / loader also point at the real loaders, so they are dropped too.
    del model, train_dl, val_dl, test_dl, eval_loaders
    torch.cuda.empty_cache()

0it [00:00, ?it/s]Global seed set to 0
1it [00:47, 47.49s/it]Global seed set to 0
2it [01:27, 42.86s/it]Global seed set to 0
3it [02:04, 40.57s/it]Global seed set to 0
4it [02:44, 40.07s/it]Global seed set to 0
5it [03:22, 39.27s/it]Global seed set to 0
6it [04:02, 39.82s/it]Global seed set to 0
7it [04:46, 41.16s/it]Global seed set to 0
8it [05:31, 42.39s/it]Global seed set to 0
9it [06:15, 42.82s/it]Global seed set to 0
10it [07:00, 43.29s/it]Global seed set to 0
11it [07:43, 43.49s/it]Global seed set to 0
12it [08:27, 43.55s/it]Global seed set to 0
13it [09:12, 43.83s/it]Global seed set to 0
14it [09:56, 44.06s/it]Global seed set to 0
15it [10:41, 44.36s/it]Global seed set to 0
16it [11:25, 44.09s/it]Global seed set to 0
17it [12:09, 44.14s/it]Global seed set to 0
18it [12:54, 44.35s/it]Global seed set to 0
19it [13:38, 44.32s/it]Global seed set to 0
20it [14:23, 44.37s/it]Global seed set to 0
21it [15:07, 44.37s/it]Global seed set to 0
22it [15:52, 44.51s/it]Global seed set to 0
23

In [6]:
# Display the collected losses: one {"train": ..., "val": ...} dict per processed series.
results

[{'train': 0.020134223625063896, 'val': 0.11028391122817993},
 {'train': 0.0347943144539992, 'val': 0.05796701833605766},
 {'train': 0.007007423633088668, 'val': 0.00516876857727766},
 {'train': 0.00960927518705527, 'val': 0.010699539445340633},
 {'train': 0.015775732696056366, 'val': 0.013453226536512375},
 {'train': 0.027417889796197414, 'val': 0.02723686397075653},
 {'train': 0.027957553416490553, 'val': 0.0374186672270298},
 {'train': 0.01665698438882828, 'val': 0.015259279869496822},
 {'train': 0.008629022554183999, 'val': 0.006125194253399968},
 {'train': 0.008007310424000024, 'val': 0.006995756179094315},
 {'train': 0.05247532203793526, 'val': 0.038463592529296875},
 {'train': 0.03685981076624659, 'val': 0.02092783711850643},
 {'train': 0.034750730006231204, 'val': 0.03208211809396744},
 {'train': 0.011692417087033391, 'val': 0.006526415701955557},
 {'train': 0.018525396659970283, 'val': 0.010269255377352238},
 {'train': 0.04204504042863846, 'val': 0.027379087172448635},
 {'trai

In [9]:
# Persist the RealNVP-run losses as CSV (one row per series, columns "train" and "val").
# The path is built with pathlib so the file lands inside results/ on every OS: the
# previous "results\\..." literal is only a directory path on Windows and, elsewhere,
# produces a file whose name contains a backslash. The directory is created on demand
# so the export does not fail on a fresh checkout.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)
pd.DataFrame(results).to_csv(results_dir / "RealNVP_synth_LSTM.csv", index=False)