In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

from utils.data import *

from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error as MAE, mean_absolute_percentage_error as MAPE

In [2]:
# Root folder of the dataset. The trailing slash is relied upon: other cells
# build sub-paths by plain string concatenation, e.g.
# f"{dataset_path}synthetic/QuantGAN/".
dataset_path = "data/huge_stock_market_dataset/"

In [3]:
# Settings forwarded as keyword arguments to create_ts by the training cells.
lags = 32        # presumably the number of past steps per input window -- confirm in utils.data
horizon = 8      # presumably the number of future steps predicted per sample -- confirm in utils.data
stride = 1
val_size = 0.15
test_size = 0.0  # the test split returned by create_ts is discarded by the training cells
features = 1     # not referenced by any other cell visible in this notebook

# Keyword arguments used for every CatBoostRegressor built in this notebook.
# MultiRMSE is CatBoost's multi-target regression objective/metric.
model_params = dict(
    silent=True,
    random_seed=13,
    loss_function="MultiRMSE",
    eval_metric="MultiRMSE",
    iterations=100,
)

In [6]:
# Baseline: fit one CatBoost model per real time series and record its MAE
# on the train and validation splits.
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")

results = []
for time_series in tqdm(ts_iterator):
    # create_ts yields (train, val, test, X_scaler, y_scaler); the third item
    # (test split) is discarded here.
    (X_train, y_train), (X_val, y_val), _, X_scaler, y_scaler = create_ts(
        time_series[["Close"]],
        time_series["Close"],
        lags=lags,
        horizon=horizon,
        stride=stride,
        data_preprocess="log_returns",
        val_size=val_size,
        test_size=test_size,
    )
    train_dl = Pool(X_train, label=y_train)
    val_dl = Pool(X_val, label=y_val)

    model = CatBoostRegressor(**model_params)
    # Early stopping against the validation pool is currently disabled:
    # model.fit(train_dl, eval_set=val_dl, early_stopping_rounds=5, use_best_model=True)
    model.fit(train_dl)

    train_mae = MAE(y_train, model.predict(X_train))
    val_mae = MAE(y_val, model.predict(X_val))
    results.append({"train": train_mae, "val": val_mae})

24it [01:21,  3.39s/it]


In [8]:
# Display the per-series train/validation MAE collected by the baseline loop.
results

[{'train': 0.0988395057316753, 'val': 0.08789940327982101},
 {'train': 0.036776620788563044, 'val': 0.05224397927529138},
 {'train': 0.006398688442784712, 'val': 0.005172470906840296},
 {'train': 0.009092196664038262, 'val': 0.010550395835513177},
 {'train': 0.014213012797644711, 'val': 0.01296672307139793},
 {'train': 0.026789991070434185, 'val': 0.027064470333851093},
 {'train': 0.021529949709914814, 'val': 0.03407264577985521},
 {'train': 0.01658921208340948, 'val': 0.015108352877514712},
 {'train': 0.008285412407542502, 'val': 0.006250136940208484},
 {'train': 0.007517652381158116, 'val': 0.006931711833014095},
 {'train': 0.050254284523359116, 'val': 0.04007008330234592},
 {'train': 0.03686151096930589, 'val': 0.020557156263218287},
 {'train': 0.033912738350219254, 'val': 0.03420939772485955},
 {'train': 0.01133497802904989, 'val': 0.006658268647246142},
 {'train': 0.0178769694550136, 'val': 0.010439005923165557},
 {'train': 0.04145062412181437, 'val': 0.026834370596076018},
 {'tra

In [9]:
# Persist the baseline metrics, one row per series with "train" and "val" columns.
# A forward slash is used so the file lands inside results/ on every OS (a
# backslash is only a separator on Windows). The results/ directory must exist.
pd.DataFrame(results).to_csv("results/pure_cbr.csv", index=False)

# Augmentation with QuantGAN synthetic data

In [6]:
def train_synth(synthetic_path, use_real_data=False):
    """Fit one CatBoost model per series on synthetic windows and score it on real data.

    For the i-th series yielded by ``get_hsm_dataset`` the array stored in
    ``selected{i}.npy`` under ``synthetic_path`` is loaded. Every entry along
    its first axis is passed through ``create_ts`` (no preprocessing, no
    val/test split) and the resulting samples are stacked into one training
    set. The fitted model is then evaluated on the train and validation splits
    of the real series.

    Reads the notebook-level settings ``dataset_path``, ``lags``, ``horizon``,
    ``stride``, ``val_size``, ``test_size`` and ``model_params``.

    Parameters
    ----------
    synthetic_path : str or path-like
        Directory containing the ``selected{i}.npy`` files. A trailing
        separator is accepted but no longer required.
    use_real_data : bool, default False
        If False, the model is fit on synthetic samples only (the original
        behaviour). If True, the real training samples are stacked in front of
        the synthetic ones before fitting. Metrics are computed on the real
        splits in both modes.

    Returns
    -------
    list of dict
        One ``{"train": mae, "val": mae}`` entry per series, in iteration order.

    Raises
    ------
    ValueError
        If a synthetic file holds no samples along its first axis.
    """
    import os  # local import so this cell does not depend on the import cell being edited

    ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")

    results = []
    for ts_index, time_series in tqdm(enumerate(ts_iterator)):
        # Only the train and validation splits are needed; the remaining
        # return values of create_ts (test split, scalers) are ignored.
        (X_train, y_train), (X_val, y_val), *_ = create_ts(
            time_series[["Close"]],
            time_series["Close"],
            lags=lags,
            horizon=horizon,
            stride=stride,
            data_preprocess="log_returns",
            val_size=val_size,
            test_size=test_size,
        )

        synth_file = os.path.join(synthetic_path, f"selected{ts_index}.npy")
        synth_time_series = np.load(synth_file)
        if synth_time_series.shape[0] == 0:
            raise ValueError(f"no synthetic samples found in {synth_file}")

        X_parts, y_parts = [], []
        for sample in synth_time_series:
            # NOTE(review): data_preprocess=None presumably means the synthetic
            # series are already in the same domain as the preprocessed real
            # data -- confirm against the generators.
            (X, y), *_ = create_ts(
                sample.reshape(-1, 1),
                sample.flatten(),
                lags=lags,
                horizon=horizon,
                stride=stride,
                data_preprocess=None,
                val_size=0,
                test_size=0,
            )
            X_parts.append(X)
            y_parts.append(y)

        if use_real_data:
            # Augmentation mode: real training samples first, synthetic after.
            X_parts.insert(0, X_train)
            y_parts.insert(0, y_train)

        # np.vstack replaces np.row_stack, a deprecated alias of the same function.
        X_fit = np.vstack(X_parts)
        y_fit = np.vstack(y_parts)

        model = CatBoostRegressor(**model_params)
        model.fit(X_fit, y_fit)
        results.append({
            "train": MAE(y_train, model.predict(X_train)),
            "val": MAE(y_val, model.predict(X_val)),
        })
    return results

In [6]:
# Fit on QuantGAN synthetic samples, score on the real splits, then display
# the per-series metrics.
results = train_synth(
    synthetic_path=f"{dataset_path}synthetic/QuantGAN/",
)
results

[{'train': 0.11364971720278666, 'val': 0.08693882978348716},
 {'train': 0.03939550428315655, 'val': 0.052459493981945375},
 {'train': 0.00732770814826462, 'val': 0.005236142415456854},
 {'train': 0.01059379613775083, 'val': 0.010737764634326963},
 {'train': 0.01697277804361385, 'val': 0.013095494929244653},
 {'train': 0.028265612522638605, 'val': 0.027197195484945588},
 {'train': 0.023163520709057094, 'val': 0.03428551024400679},
 {'train': 0.01781297319001693, 'val': 0.015009823889852038},
 {'train': 0.008815946612340957, 'val': 0.006289377438780666},
 {'train': 0.008114468512206324, 'val': 0.006912829730923024},
 {'train': 0.0523987422383527, 'val': 0.04006941343734814},
 {'train': 0.0387281236950867, 'val': 0.020793728892764277},
 {'train': 0.035364503305783196, 'val': 0.034271758451891676},
 {'train': 0.01209907797697047, 'val': 0.006735574436137125},
 {'train': 0.018854615035617854, 'val': 0.010742046181860518},
 {'train': 0.04310588466219803, 'val': 0.026617328995026508},
 {'trai

In [7]:
# Persist the QuantGAN metrics. A forward slash is used so the file lands
# inside results/ on every OS (a backslash is only a separator on Windows).
# The results/ directory must exist.
pd.DataFrame(results).to_csv("results/QuantGAN_synth_cbr.csv", index=False)

# Augmentation with FourierFlow synthetic data

In [9]:
# Fit on FourierFlow synthetic samples, score on the real splits, then display
# the per-series metrics.
results = train_synth(
    synthetic_path=f"{dataset_path}synthetic/FourierFlow/",
)
results

24it [06:17, 15.74s/it]


In [11]:
# Persist the FourierFlow metrics. A forward slash is used so the file lands
# inside results/ on every OS (a backslash is only a separator on Windows).
# The results/ directory must exist.
pd.DataFrame(results).to_csv("results/FourierFlow_synth_cbr.csv", index=False)

# Augmentation with RealNVP synthetic data

In [4]:
# Fit on RealNVP synthetic samples, score on the real splits, then display
# the per-series metrics.
results = train_synth(
    synthetic_path=f"{dataset_path}synthetic/RealNVP/",
)
results

24it [05:40, 14.17s/it]


In [6]:
# Persist the RealNVP metrics. A forward slash is used so the file lands
# inside results/ on every OS (a backslash is only a separator on Windows).
# The results/ directory must exist.
pd.DataFrame(results).to_csv("results/RealNVP_synth_cbr.csv", index=False)

# Augmentation with TTS GAN synthetic data

In [7]:
# Fit on TTS GAN synthetic samples, score on the real splits, then display
# the per-series metrics.
results = train_synth(
    synthetic_path=f"{dataset_path}synthetic/TTS_GAN/",
)
results

24it [05:25, 13.56s/it]


[{'train': 0.1321132544513609, 'val': 0.11785090803976703},
 {'train': 0.03951295701599329, 'val': 0.051195012872470635},
 {'train': 0.008066338858360254, 'val': 0.006309197257936066},
 {'train': 0.010482014251813134, 'val': 0.010534201773454361},
 {'train': 0.04955996457270981, 'val': 0.04925475804128197},
 {'train': 0.02823935233862508, 'val': 0.02711774121769691},
 {'train': 0.02298810528802903, 'val': 0.034035209327458904},
 {'train': 0.017808602609102827, 'val': 0.015091073857310412},
 {'train': 0.009019808714696334, 'val': 0.006619373871603162},
 {'train': 0.008053715526012873, 'val': 0.006975536161433829},
 {'train': 0.05218791753987803, 'val': 0.03996934767208509},
 {'train': 0.038537626453756135, 'val': 0.020567184848492508},
 {'train': 0.03536113242225513, 'val': 0.034205325598034346},
 {'train': 0.011880427205143457, 'val': 0.006721438251270983},
 {'train': 0.018594573946467116, 'val': 0.01034649243142939},
 {'train': 0.04315380475538547, 'val': 0.0266543420544008},
 {'train

In [8]:
# Persist the TTS GAN metrics. A forward slash is used so the file lands
# inside results/ on every OS (a backslash is only a separator on Windows).
# The results/ directory must exist.
pd.DataFrame(results).to_csv("results/TTS_GAN_synth_cbr.csv", index=False)