In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

from utils.data import *

from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error as MAE, mean_absolute_percentage_error as MAPE

In [2]:
# Root folder of the dataset. The trailing slash matters: other cells build
# sub-paths by plain string formatting (e.g. f"{dataset_path}synthetic/QuantGAN/").
dataset_path = "data/huge_stock_market_dataset/"

In [3]:
# Windowing settings handed to create_ts as `lags`, `horizon` and `stride`.
lags, horizon, stride = 32, 8, 1

# Split fractions handed to create_ts as `val_size` and `test_size`.
val_size, test_size = 0.0, 0.3

# Not referenced by any other cell visible in this notebook.
features = 1

# Keyword arguments unpacked into every CatBoostRegressor(...) call.
model_params = dict(
    silent=True,
    random_seed=13,
    loss_function="MultiRMSE",
    eval_metric="MultiRMSE",
    iterations=100,
)

In [4]:
# Baseline: one CatBoost model per selected series, trained on the real data only.
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")

results = []
for time_series in tqdm(ts_iterator):
    # Build train/test windows from the "Close" column. The middle element of
    # the returned tuple (presumably the validation split; val_size is 0.0
    # here) is ignored.
    (X_train, y_train), _, (X_test, y_test), X_scaler, y_scaler = create_ts(
        time_series[["Close"]],
        time_series["Close"],
        lags=lags,
        horizon=horizon,
        stride=stride,
        data_preprocess=("log_returns", "normalize"),
        val_size=val_size,
        test_size=test_size,
    )
    # Reshape each feature array to its first two dimensions only
    # (assumes any trailing axes have size 1 -- TODO confirm).
    X_train, X_test = (arr.reshape(arr.shape[:2]) for arr in (X_train, X_test))
    train_dl = Pool(X_train, label=y_train)
    val_dl = Pool(X_test, label=y_test)

    model = CatBoostRegressor(**model_params)
    # The eval-set / early-stopping variant is left disabled, so val_dl is
    # currently built but not passed to fit():
    # model.fit(train_dl, eval_set=val_dl, early_stopping_rounds=5, use_best_model=True)
    model.fit(train_dl)
    results.append(
        {
            "train": MAE(y_train, model.predict(X_train)),
            "test": MAE(y_test, model.predict(X_test)),
        }
    )
results

24it [01:28,  3.69s/it]


[{'train': 0.4551696792367407, 'val': 0.460470602310324},
 {'train': 0.5409159536326564, 'val': 0.9238519972039024},
 {'train': 0.6099430898749105, 'val': 0.5668185352340493},
 {'train': 0.6302642027246494, 'val': 0.6036037557162195},
 {'train': 0.550001437018047, 'val': 0.46924459526660023},
 {'train': 0.6109781667128318, 'val': 0.6904009339140769},
 {'train': 0.5856537384808643, 'val': 0.8353120593180834},
 {'train': 0.5508834626020176, 'val': 0.48106388387647303},
 {'train': 0.69311364731957, 'val': 0.629294993384431},
 {'train': 0.6861136063149611, 'val': 0.7566039204390955},
 {'train': 0.6121696371455706, 'val': 0.6331751517173319},
 {'train': 0.5661686622992493, 'val': 0.42644868716920525},
 {'train': 0.5439640073507215, 'val': 0.565654813393387},
 {'train': 0.6468779767267319, 'val': 0.45658979849977355},
 {'train': 0.6395846526366398, 'val': 0.4664283248267977},
 {'train': 0.5569917316854089, 'val': 0.39352750212932663},
 {'train': 0.5989970473851234, 'val': 0.703203902479455},

In [5]:
# Use "/" instead of "\\": a backslash only separates directories on Windows;
# on other systems it becomes part of the file name and the CSV lands outside
# the results folder. The `results` directory must already exist.
pd.DataFrame(results).to_csv(f"results/pure_cbr_h{horizon}.csv", index=False)

# Augmentation with QuantGAN synthetic data

In [6]:
def train_synth(synthetic_path):
    """Fit CatBoost on synthetic series only and score it on the real data.

    For every series yielded by ``get_hsm_dataset`` the matching synthetic
    samples are loaded from ``{synthetic_path}selected{index}.npy``, transformed
    with a ``DimUniversalStandardScaler`` fitted on them, and each sample is
    passed through ``create_ts`` (no extra preprocessing, no val/test split).
    A ``CatBoostRegressor`` is trained on the stacked synthetic windows alone;
    the real train/test windows, built by ``create_ts`` with that same scaler
    instance, are used for evaluation only.

    Relies on the notebook-level names ``dataset_path``, ``lags``, ``horizon``,
    ``stride``, ``val_size``, ``test_size`` and ``model_params``.

    Args:
        synthetic_path: Prefix of the synthetic ``.npy`` files. It is joined to
            the file name by plain string formatting, so a directory must be
            given with its trailing path separator.

    Returns:
        list of dict: one ``{"train": ..., "test": ...}`` entry per series, in
        iteration order; both values are the MAE of the model on the real
        train and test windows respectively.
    """
    ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")

    results = []
    for ts_index, time_series in tqdm(enumerate(ts_iterator)):
        synth_time_series = np.load(f"{synthetic_path}selected{ts_index}.npy")
        scaler = DimUniversalStandardScaler()
        synth_time_series = scaler.fit_transform(synth_time_series)

        # Window every synthetic sample separately; only the first split
        # returned by create_ts is kept (val_size and test_size are 0).
        X_synth, y_synth = [], []
        for i in range(synth_time_series.shape[0]):
            (X, y), *_ = create_ts(
                synth_time_series[i].reshape(-1, 1),
                synth_time_series[i].flatten(),
                lags=lags,
                horizon=horizon,
                stride=stride,
                data_preprocess=(None,),
                val_size=0,
                test_size=0,
            )
            X_synth.append(X)
            y_synth.append(y)

        # Only synthetic windows are used for fitting; the real training
        # windows are not mixed in. np.vstack is used because np.row_stack is
        # a deprecated alias of it.
        X_synth = np.vstack(X_synth)
        y_synth = np.vstack(y_synth)

        # Real-data windows, built with the scaler that was fitted on the
        # synthetic samples above.
        (X_train, y_train), _, (X_test, y_test), *_ = create_ts(
            time_series[["Close"]],
            time_series["Close"],
            lags=lags,
            horizon=horizon,
            stride=stride,
            data_preprocess=("log_returns", "normalize"),
            val_size=val_size,
            test_size=test_size,
            scaler=scaler,
        )
        # Reshape each feature array to its first two dimensions only.
        X_train, X_test, X_synth = map(lambda x: x.reshape(x.shape[:2]), (X_train, X_test, X_synth))

        model = CatBoostRegressor(**model_params)
        model.fit(X_synth, y_synth)
        results.append({"train": MAE(y_train, model.predict(X_train)), "test": MAE(y_test, model.predict(X_test))})
    return results

In [7]:
# Run the synthetic-only experiment with the QuantGAN samples; the bare
# `results` expression displays the per-series MAE dicts.
results = train_synth(f"{dataset_path}synthetic/QuantGAN/")
results

24it [05:45, 14.41s/it]


[{'train': 0.5705976005529405, 'val': 0.495936914983925},
 {'train': 0.7906638254047553, 'val': 1.2479540067632215},
 {'train': 0.6865303624407124, 'val': 0.5589616515091492},
 {'train': 0.7183084484047211, 'val': 0.5920065674247412},
 {'train': 0.64538927090309, 'val': 0.4586054053018275},
 {'train': 0.7170679586561128, 'val': 0.7526422049527296},
 {'train': 0.7233984971455265, 'val': 0.9544777322826088},
 {'train': 0.6587851940367666, 'val': 0.531267858974961},
 {'train': 0.7518214702351398, 'val': 0.6440745649911239},
 {'train': 0.7416300429983089, 'val': 0.7444720302010077},
 {'train': 0.554201114247848, 'val': 0.5453133001746481},
 {'train': 0.6477365802668988, 'val': 0.4620950986109732},
 {'train': 0.6566237313754806, 'val': 0.650461006612295},
 {'train': 0.6681783980748504, 'val': 0.4489827860531509},
 {'train': 0.6808225869697382, 'val': 0.4732200046602029},
 {'train': 0.602612144084566, 'val': 0.4076271493397299},
 {'train': 0.6549016919492303, 'val': 0.7335707730095424},
 {'t

In [8]:
# Use "/" instead of "\\": a backslash only separates directories on Windows;
# on other systems it becomes part of the file name and the CSV lands outside
# the results folder. The `results` directory must already exist.
pd.DataFrame(results).to_csv(f"results/QuantGAN_synth_cbr_h{horizon}.csv", index=False)

# Augmentation with FourierFlow synthetic data

In [9]:
# Run the synthetic-only experiment with the FourierFlow samples; the bare
# `results` expression displays the per-series MAE dicts.
results = train_synth(f"{dataset_path}synthetic/FourierFlow/")
results

24it [06:18, 15.77s/it]


[{'train': 0.39266954399588516, 'val': 0.3725091337596058},
 {'train': 0.5327169270857809, 'val': 0.9044217976217865},
 {'train': 0.5762583844881217, 'val': 0.5119876248562301},
 {'train': 0.588566339601027, 'val': 0.5350551802569065},
 {'train': 0.5314240151607013, 'val': 0.40696223602121684},
 {'train': 0.5733070050802087, 'val': 0.6320379544444451},
 {'train': 0.5414358874210758, 'val': 0.771274521519554},
 {'train': 0.5308209750931543, 'val': 0.4514637872366499},
 {'train': 0.6510222534537909, 'val': 0.5836922349174817},
 {'train': 0.6157591029340191, 'val': 0.6641309797338112},
 {'train': 0.588341715310258, 'val': 0.6044140241837402},
 {'train': 0.5411490473749854, 'val': 0.41200672511994774},
 {'train': 0.5292334388927556, 'val': 0.5530587776994584},
 {'train': 0.5804922537621942, 'val': 0.4188585753794068},
 {'train': 0.5761994349605177, 'val': 0.4273958347888962},
 {'train': 0.5371483760567111, 'val': 0.3801038815022171},
 {'train': 0.5511079375414307, 'val': 0.629519997618488}

In [10]:
# Use "/" instead of "\\": a backslash only separates directories on Windows;
# on other systems it becomes part of the file name and the CSV lands outside
# the results folder. The `results` directory must already exist.
pd.DataFrame(results).to_csv(f"results/FourierFlow_synth_cbr_h{horizon}.csv", index=False)

# Augmentation with RealNVP synthetic data

In [11]:
# Run the synthetic-only experiment with the RealNVP samples; the bare
# `results` expression displays the per-series MAE dicts.
results = train_synth(f"{dataset_path}synthetic/RealNVP/")
results

24it [06:00, 15.03s/it]


[{'train': 0.4192691074775543, 'val': 0.4776051485443943},
 {'train': 0.5512715651021589, 'val': 0.9670692734367201},
 {'train': 0.5704226292157994, 'val': 0.5723821946670228},
 {'train': 0.4943670801709165, 'val': 0.5520670835215182},
 {'train': 0.406910775068723, 'val': 0.38628757516012135},
 {'train': 0.5946378997370156, 'val': 0.693233331854394},
 {'train': 0.5538422217283312, 'val': 0.8387524940423599},
 {'train': 0.5129756636136815, 'val': 0.47453442577413446},
 {'train': 0.6535407473381898, 'val': 0.6111740742505485},
 {'train': 0.5916844033456929, 'val': 0.6728116200384897},
 {'train': 0.5931179222430558, 'val': 0.6246772825282134},
 {'train': 0.5484949065926874, 'val': 0.4282977877636452},
 {'train': 0.5292970746249208, 'val': 0.5647545084844513},
 {'train': 0.6123188359375664, 'val': 0.44318650636804713},
 {'train': 0.6064087030009728, 'val': 0.45929960628791167},
 {'train': 0.5407297484561965, 'val': 0.3919202876481251},
 {'train': 0.5793005348610855, 'val': 0.68421428066478

In [12]:
# Use "/" instead of "\\": a backslash only separates directories on Windows;
# on other systems it becomes part of the file name and the CSV lands outside
# the results folder. The `results` directory must already exist.
pd.DataFrame(results).to_csv(f"results/RealNVP_synth_cbr_h{horizon}.csv", index=False)

# Augmentation with TTS GAN synthetic data

In [13]:
# Run the synthetic-only experiment with the TTS GAN samples; the bare
# `results` expression displays the per-series MAE dicts.
results = train_synth(f"{dataset_path}synthetic/TTS_GAN_standard/")
results

24it [05:51, 14.63s/it]


[{'train': 0.4033296433245175, 'val': 0.346253142740815},
 {'train': 0.44160962335612863, 'val': 0.6887987213064899},
 {'train': 0.15270834613223117, 'val': 0.15947363643142703},
 {'train': 0.1059298914431156, 'val': 0.08673123220468097},
 {'train': 0.2390187227632121, 'val': 0.23635228475615067},
 {'train': 0.7979688906555498, 'val': 0.8366779598930102},
 {'train': 0.5139242271435676, 'val': 0.6852156243025804},
 {'train': 0.7488439303803196, 'val': 0.5943480423695353},
 {'train': 0.2270093262426393, 'val': 0.19510159244237815},
 {'train': 0.22265766543676924, 'val': 0.2211849857563951},
 {'train': 1.1001916217256178, 'val': 1.0868589308255403},
 {'train': 0.9531175056031297, 'val': 0.7248833661241335},
 {'train': 0.7524001173862301, 'val': 0.7381950580076649},
 {'train': 0.5459293388254447, 'val': 0.3631415996951013},
 {'train': 0.8412013886954569, 'val': 0.5821116144416746},
 {'train': 3.7141621044628903, 'val': 3.7188648079103057},
 {'train': 0.7994655366449971, 'val': 0.8985482603

In [14]:
# Use "/" instead of "\\": a backslash only separates directories on Windows;
# on other systems it becomes part of the file name and the CSV lands outside
# the results folder. The `results` directory must already exist.
pd.DataFrame(results).to_csv(f"results/TTS_GAN_synth_cbr_h{horizon}.csv", index=False)