In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

from utils.data import *

from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error as MAE, mean_absolute_percentage_error as MAPE

In [2]:
# Root folder of the stock-market dataset. Keep the trailing "/": other cells build
# paths from it by plain string concatenation (e.g. f"{dataset_path}synthetic/...").
dataset_path = "data/huge_stock_market_dataset/"

In [3]:
# Windowing parameters forwarded to create_ts in the training cells.
lags, horizon, stride = 32, 8, 1

# Split fractions forwarded to create_ts for the real series (no test split is held out).
val_size, test_size = 0.15, 0.0

# NOTE(review): `features` is not referenced by any cell visible in this notebook.
features = 1

# Keyword arguments used for every CatBoostRegressor built below.
model_params = dict(
    silent=True,
    random_seed=13,
    loss_function="MultiRMSE",
    eval_metric="MultiRMSE",
    iterations=100,
)

In [6]:
# Baseline: fit one CatBoost multi-output regressor per real time series and record
# its MAE on the train and validation windows.
# dataset_path already ends with "/", so the file name is appended directly
# (the previous f"{dataset_path}/selected.csv" produced a double slash).
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}selected.csv")

results = []
for time_series in tqdm(ts_iterator):
    # create_ts is unpacked as (train pair, val pair, <ignored>, X_scaler, y_scaler).
    # The scalers are not applied below, so the errors are on whatever scale create_ts
    # emits for data_preprocess="log_returns".
    (X_train, y_train), (X_val, y_val), _, X_scaler, y_scaler = create_ts(
        time_series[["Close"]],
        time_series["Close"],
        lags=lags,
        horizon=horizon,
        stride=stride,
        data_preprocess="log_returns",
        val_size=val_size,
        test_size=test_size,
    )
    train_dl = Pool(X_train, label=y_train)

    model = CatBoostRegressor(**model_params)
    # Trained for the fixed number of iterations in model_params; the validation
    # split is used for scoring only (no eval_set / early stopping), so no
    # validation Pool is built.
    model.fit(train_dl)
    results.append({
        "train": MAE(y_train, model.predict(X_train)),
        "val": MAE(y_val, model.predict(X_val)),
    })

24it [01:21,  3.39s/it]


In [8]:
# Baseline train/val MAE: one dict per time series, in iteration order.
results

[{'train': 0.0988395057316753, 'val': 0.08789940327982101},
 {'train': 0.036776620788563044, 'val': 0.05224397927529138},
 {'train': 0.006398688442784712, 'val': 0.005172470906840296},
 {'train': 0.009092196664038262, 'val': 0.010550395835513177},
 {'train': 0.014213012797644711, 'val': 0.01296672307139793},
 {'train': 0.026789991070434185, 'val': 0.027064470333851093},
 {'train': 0.021529949709914814, 'val': 0.03407264577985521},
 {'train': 0.01658921208340948, 'val': 0.015108352877514712},
 {'train': 0.008285412407542502, 'val': 0.006250136940208484},
 {'train': 0.007517652381158116, 'val': 0.006931711833014095},
 {'train': 0.050254284523359116, 'val': 0.04007008330234592},
 {'train': 0.03686151096930589, 'val': 0.020557156263218287},
 {'train': 0.033912738350219254, 'val': 0.03420939772485955},
 {'train': 0.01133497802904989, 'val': 0.006658268647246142},
 {'train': 0.0178769694550136, 'val': 0.010439005923165557},
 {'train': 0.04145062412181437, 'val': 0.026834370596076018},
 {'tra

In [9]:
# Persist the baseline metrics (columns "train" and "val", one row per series).
# A forward slash is used so the file lands inside results/ on every OS; the former
# "results\\pure_cbr.csv" only addressed that folder on Windows.
# NOTE(review): to_csv does not create directories — results/ must already exist.
pd.DataFrame(results).to_csv("results/pure_cbr.csv", index=False)

# Augmentation with QuantGAN synthetic data

In [4]:
# Folder with the QuantGAN-generated series; the training cell loads
# selected{ts_index}.npy from it, so the trailing "/" is required.
synthetic_path = f"{dataset_path}synthetic/QuantGAN/"

In [5]:
# Train one CatBoost model per series on its QuantGAN synthetic windows only, then
# score it on the real train and validation windows of that series.
# dataset_path already ends with "/", so the file name is appended directly.
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}selected.csv")

results = []
for ts_index, time_series in tqdm(enumerate(ts_iterator)):
    # Real data: used here for evaluation only.
    (X_train, y_train), (X_val, y_val), _, X_scaler, y_scaler = create_ts(
        time_series[["Close"]],
        time_series["Close"],
        lags=lags,
        horizon=horizon,
        stride=stride,
        data_preprocess="log_returns",
        val_size=val_size,
        test_size=test_size,
    )

    # One .npy file per real series; each entry along axis 0 is windowed separately.
    synth_time_series = np.load(f"{synthetic_path}selected{ts_index}.npy")
    X_synth, y_synth = [], []
    for synth_series in synth_time_series:
        # reshape(-1, 1) instead of a hard-coded length of 127, so series of any
        # length are accepted (same result for length-127 inputs).
        # data_preprocess=None: presumably the synthetic series are already log
        # returns — TODO confirm against the generator.
        (X, y), *_ = create_ts(
            synth_series.reshape(-1, 1),
            synth_series,
            lags=lags,
            horizon=horizon,
            stride=stride,
            data_preprocess=None,
            val_size=0,
            test_size=0,
        )
        X_synth.append(X)
        y_synth.append(y)

    # Only synthetic windows are used for fitting. To train on real + synthetic
    # instead, stack X_train / y_train together with these lists before fitting.
    # np.vstack replaces np.row_stack, a deprecated alias of the same function.
    X_synth = np.vstack(X_synth)
    y_synth = np.vstack(y_synth)

    model = CatBoostRegressor(**model_params)
    model.fit(X_synth, y_synth)
    results.append({
        "train": MAE(y_train, model.predict(X_train)),
        "val": MAE(y_val, model.predict(X_val)),
    })

24it [05:34, 13.93s/it]


In [6]:
# Train/val MAE on real data of the models fitted on QuantGAN synthetic windows.
results

[{'train': 0.11364971720278666, 'val': 0.08693882978348716},
 {'train': 0.03939550428315655, 'val': 0.052459493981945375},
 {'train': 0.00732770814826462, 'val': 0.005236142415456854},
 {'train': 0.01059379613775083, 'val': 0.010737764634326963},
 {'train': 0.01697277804361385, 'val': 0.013095494929244653},
 {'train': 0.028265612522638605, 'val': 0.027197195484945588},
 {'train': 0.023163520709057094, 'val': 0.03428551024400679},
 {'train': 0.01781297319001693, 'val': 0.015009823889852038},
 {'train': 0.008815946612340957, 'val': 0.006289377438780666},
 {'train': 0.008114468512206324, 'val': 0.006912829730923024},
 {'train': 0.0523987422383527, 'val': 0.04006941343734814},
 {'train': 0.0387281236950867, 'val': 0.020793728892764277},
 {'train': 0.035364503305783196, 'val': 0.034271758451891676},
 {'train': 0.01209907797697047, 'val': 0.006735574436137125},
 {'train': 0.018854615035617854, 'val': 0.010742046181860518},
 {'train': 0.04310588466219803, 'val': 0.026617328995026508},
 {'trai

In [7]:
# Persist the QuantGAN metrics (columns "train" and "val", one row per series).
# A forward slash is used so the file lands inside results/ on every OS; the former
# "results\\QuantGAN_synth_cbr.csv" only addressed that folder on Windows.
# NOTE(review): to_csv does not create directories — results/ must already exist.
pd.DataFrame(results).to_csv("results/QuantGAN_synth_cbr.csv", index=False)

# Augmentation with FourierFlow synthetic data

In [8]:
# Folder with the FourierFlow-generated series; the training cell loads
# selected{ts_index}.npy from it, so the trailing "/" is required.
synthetic_path = f"{dataset_path}synthetic/FourierFlow/"

In [9]:
# Train one CatBoost model per series on its FourierFlow synthetic windows only, then
# score it on the real train and validation windows of that series.
# dataset_path already ends with "/", so the file name is appended directly.
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}selected.csv")

results = []
for ts_index, time_series in tqdm(enumerate(ts_iterator)):
    # Real data: used here for evaluation only.
    (X_train, y_train), (X_val, y_val), _, X_scaler, y_scaler = create_ts(
        time_series[["Close"]],
        time_series["Close"],
        lags=lags,
        horizon=horizon,
        stride=stride,
        data_preprocess="log_returns",
        val_size=val_size,
        test_size=test_size,
    )

    # One .npy file per real series; each entry along axis 0 is windowed separately.
    synth_time_series = np.load(f"{synthetic_path}selected{ts_index}.npy")
    X_synth, y_synth = [], []
    for synth_series in synth_time_series:
        # data_preprocess=None: presumably the synthetic series are already log
        # returns — TODO confirm against the generator.
        (X, y), *_ = create_ts(
            synth_series.reshape(-1, 1),
            synth_series,
            lags=lags,
            horizon=horizon,
            stride=stride,
            data_preprocess=None,
            val_size=0,
            test_size=0,
        )
        X_synth.append(X)
        y_synth.append(y)

    # Only synthetic windows are used for fitting.
    # np.vstack replaces np.row_stack, a deprecated alias of the same function.
    X_synth = np.vstack(X_synth)
    y_synth = np.vstack(y_synth)

    model = CatBoostRegressor(**model_params)
    model.fit(X_synth, y_synth)
    results.append({
        "train": MAE(y_train, model.predict(X_train)),
        "val": MAE(y_val, model.predict(X_val)),
    })

24it [06:17, 15.74s/it]


In [10]:
# Train/val MAE on real data of the models fitted on FourierFlow synthetic windows.
results

[{'train': 0.09882969821406513, 'val': 0.0869540823318531},
 {'train': 0.036082272080638675, 'val': 0.05232444090073629},
 {'train': 0.0069615580240817935, 'val': 0.005415019845985192},
 {'train': 0.009966313767670282, 'val': 0.010744250155907506},
 {'train': 0.015545456368400639, 'val': 0.013214770001767046},
 {'train': 0.027761418615884405, 'val': 0.027815877333321228},
 {'train': 0.02216573265505383, 'val': 0.03488480704845248},
 {'train': 0.017073057638110734, 'val': 0.015265211147737644},
 {'train': 0.00856632657447557, 'val': 0.006267763016168115},
 {'train': 0.007972571744596967, 'val': 0.007025755524765736},
 {'train': 0.051125795879236505, 'val': 0.04024711798893335},
 {'train': 0.03652465204079635, 'val': 0.02057526138288386},
 {'train': 0.033960769710549746, 'val': 0.0342125575489541},
 {'train': 0.011202240334118891, 'val': 0.006649479377551289},
 {'train': 0.017745643086933702, 'val': 0.010436071183911745},
 {'train': 0.04135179630390444, 'val': 0.0267602816160294},
 {'tra

In [11]:
# Persist the FourierFlow metrics (columns "train" and "val", one row per series).
# A forward slash is used so the file lands inside results/ on every OS; the former
# "results\\FourierFlow_synth_cbr.csv" only addressed that folder on Windows.
# NOTE(review): to_csv does not create directories — results/ must already exist.
pd.DataFrame(results).to_csv("results/FourierFlow_synth_cbr.csv", index=False)

# Augmentation with RealNVP synthetic data

In [4]:
# Train one CatBoost model per series on its RealNVP synthetic windows only, then
# score it on the real train and validation windows of that series.
# Folder with the RealNVP-generated series; the trailing "/" is required because
# file names are appended by string concatenation.
synthetic_path = f"{dataset_path}synthetic/RealNVP/"
# dataset_path already ends with "/", so the file name is appended directly.
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}selected.csv")

results = []
for ts_index, time_series in tqdm(enumerate(ts_iterator)):
    # Real data: used here for evaluation only.
    (X_train, y_train), (X_val, y_val), _, X_scaler, y_scaler = create_ts(
        time_series[["Close"]],
        time_series["Close"],
        lags=lags,
        horizon=horizon,
        stride=stride,
        data_preprocess="log_returns",
        val_size=val_size,
        test_size=test_size,
    )

    # One .npy file per real series; each entry along axis 0 is windowed separately.
    synth_time_series = np.load(f"{synthetic_path}selected{ts_index}.npy")
    X_synth, y_synth = [], []
    for synth_series in synth_time_series:
        # data_preprocess=None: presumably the synthetic series are already log
        # returns — TODO confirm against the generator.
        (X, y), *_ = create_ts(
            synth_series.reshape(-1, 1),
            synth_series,
            lags=lags,
            horizon=horizon,
            stride=stride,
            data_preprocess=None,
            val_size=0,
            test_size=0,
        )
        X_synth.append(X)
        y_synth.append(y)

    # Only synthetic windows are used for fitting.
    # np.vstack replaces np.row_stack, a deprecated alias of the same function.
    X_synth = np.vstack(X_synth)
    y_synth = np.vstack(y_synth)

    model = CatBoostRegressor(**model_params)
    model.fit(X_synth, y_synth)
    results.append({
        "train": MAE(y_train, model.predict(X_train)),
        "val": MAE(y_val, model.predict(X_val)),
    })

24it [05:40, 14.17s/it]


In [5]:
# Train/val MAE on real data of the models fitted on RealNVP synthetic windows.
results

[{'train': 0.09004008455095977, 'val': 0.08946741592775331},
 {'train': 0.03521337150260906, 'val': 0.05330768129155191},
 {'train': 0.006063771878002708, 'val': 0.0051601933672822675},
 {'train': 0.008296476321771322, 'val': 0.010697864670940528},
 {'train': 0.012744381585464456, 'val': 0.013140320179621655},
 {'train': 0.02625484031529226, 'val': 0.027048461794527377},
 {'train': 0.020962111402496104, 'val': 0.034314910791500786},
 {'train': 0.016035150535266298, 'val': 0.015192270819212832},
 {'train': 0.00811985983992624, 'val': 0.006248309402716151},
 {'train': 0.007371306526956318, 'val': 0.006922752602291816},
 {'train': 0.049608531531847286, 'val': 0.04005521935692396},
 {'train': 0.03606473508858668, 'val': 0.020853320754902857},
 {'train': 0.03370531754944335, 'val': 0.03444359708007573},
 {'train': 0.011071202660555775, 'val': 0.00666058267394762},
 {'train': 0.017422976831430216, 'val': 0.010470713870071853},
 {'train': 0.040662411472381005, 'val': 0.026817191450552245},
 {

In [6]:
# Persist the RealNVP metrics (columns "train" and "val", one row per series).
# A forward slash is used so the file lands inside results/ on every OS; the former
# "results\\RealNVP_synth_cbr.csv" only addressed that folder on Windows.
# NOTE(review): to_csv does not create directories — results/ must already exist.
pd.DataFrame(results).to_csv("results/RealNVP_synth_cbr.csv", index=False)