In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm import tqdm

from utils.data import *

from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error as MAE, mean_absolute_percentage_error as MAPE

In [2]:
# Root folder of the stock-market dataset. The trailing slash matters: other
# cells build paths by plain string concatenation, e.g.
# f"{dataset_path}synthetic/QuantGAN/".
dataset_path = "data/huge_stock_market_dataset/"

In [3]:
# Windowing settings, forwarded to create_ts as lags / horizon / stride.
lags, horizon, stride = 32, 8, 1

# Split fractions, forwarded to create_ts as val_size / test_size.
val_size, test_size = 0.15, 0.0

# Not referenced by any other cell visible in this notebook.
features = 1

# Keyword arguments for every CatBoostRegressor built in this notebook.
# MultiRMSE is used for both the loss and the eval metric because the target
# has `horizon` columns (multi-output regression).
model_params = dict(
    silent=True,
    random_seed=13,
    loss_function="MultiRMSE",
    eval_metric="MultiRMSE",
    iterations=100,
)

In [9]:
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")

# Baseline: one CatBoost model per selected series, fit on the real data only.
results = []
for time_series in tqdm(ts_iterator):
    (X_train, y_train), (X_val, y_val), _, X_scaler, y_scaler = create_ts(
        time_series[["Close"]],
        time_series["Close"],
        lags=lags,
        horizon=horizon,
        stride=stride,
        data_preprocess=("log_returns", "normalize"),
        val_size=val_size,
        test_size=test_size,
    )

    # Keep only the first two axes of the feature arrays (any further axes
    # must have a combined size of 1 for this reshape to succeed).
    X_train = X_train.reshape(X_train.shape[:2])
    X_val = X_val.reshape(X_val.shape[:2])

    train_dl = Pool(X_train, label=y_train)
    val_dl = Pool(X_val, label=y_val)

    model = CatBoostRegressor(**model_params)
    # Early-stopping variant, currently disabled (it is the only consumer of val_dl):
    # model.fit(train_dl, eval_set=val_dl, early_stopping_rounds=5, use_best_model=True)
    model.fit(train_dl)

    # MAE over all horizon steps, on the training and validation samples.
    results.append(
        {
            "train": MAE(y_train, model.predict(X_train)),
            "val": MAE(y_val, model.predict(X_val)),
        }
    )

24it [01:30,  3.79s/it]


In [10]:
# Show the per-series train / validation MAE dicts collected in `results`.
results

[{'train': 0.4721138800789576, 'val': 0.4198577080365975},
 {'train': 0.4927910753616652, 'val': 0.7000470996421194},
 {'train': 0.611703930759554, 'val': 0.4944795815417712},
 {'train': 0.6220227366896259, 'val': 0.7217822417316845},
 {'train': 0.5644592305281493, 'val': 0.5149637611398229},
 {'train': 0.5818978325923387, 'val': 0.5878597190703397},
 {'train': 0.5740474813297128, 'val': 0.9084701431820057},
 {'train': 0.5628409667745751, 'val': 0.5125981812442175},
 {'train': 0.7076598653906154, 'val': 0.5338263021484075},
 {'train': 0.7018691782001159, 'val': 0.6471641199813174},
 {'train': 0.6299630144565079, 'val': 0.5022988728025262},
 {'train': 0.5808765471192069, 'val': 0.3239468387010491},
 {'train': 0.5581741874890538, 'val': 0.5630569428483101},
 {'train': 0.6544736919978776, 'val': 0.3844437681462015},
 {'train': 0.6498089439971121, 'val': 0.3794468314500749},
 {'train': 0.5537645440474475, 'val': 0.3584969663275588},
 {'train': 0.5651671014464634, 'val': 0.4362512548537032}

In [11]:
# Persist the baseline metrics, one row per series with "train" / "val" columns.
# The path is built with pathlib because a literal "\\" is only a directory
# separator on Windows; elsewhere it would create a file literally named
# "results\pure_cbr.csv". The output folder is created if it is missing.
out_path = Path("results") / "pure_cbr.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(results).to_csv(out_path, index=False)

# Augmentation with QuantGAN synthetic data (model is trained on the synthetic series only and evaluated on the real series)

In [20]:
def train_synth(synthetic_path):
    """Fit one CatBoost model per selected series on synthetic data and score it on real data.

    For every series yielded by ``get_hsm_dataset`` the synthetic array
    ``selected{index}.npy`` is loaded from ``synthetic_path``, passed through a
    fresh ``DimUniversalStandardScaler`` and turned into (X, y) samples with
    ``create_ts``. A ``CatBoostRegressor`` (configured by the module-level
    ``model_params``) is fit on those synthetic samples only; the real
    train / validation samples are used purely for evaluation.

    Relies on the notebook-level settings ``dataset_path``, ``lags``,
    ``horizon``, ``stride``, ``val_size`` and ``test_size``.

    Args:
        synthetic_path: Directory prefix of the synthetic ``.npy`` files. It is
            concatenated directly with the file name, so it must end with a
            path separator.

    Returns:
        list[dict]: One ``{"train": mae, "val": mae}`` entry per series, in
        iteration order, where both MAE values are measured on the real data.
    """
    ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")

    results = []
    # enumerate(tqdm(...)) instead of tqdm(enumerate(...)): enumerate hides the
    # iterable's length from tqdm, so the bar could never show a total.
    for ts_index, time_series in enumerate(tqdm(ts_iterator)):
        synth_time_series = np.load(f"{synthetic_path}selected{ts_index}.npy")
        scaler = DimUniversalStandardScaler()
        synth_time_series = scaler.fit_transform(synth_time_series)

        # Build samples from each synthetic series independently (first axis
        # indexes the generated series).
        X_synth, y_synth = [], []
        for synth_sample in synth_time_series:
            (X, y), *_ = create_ts(
                synth_sample.reshape(-1, 1),
                synth_sample.flatten(),
                lags=lags,
                horizon=horizon,
                stride=stride,
                data_preprocess=(None,),
                val_size=0,
                test_size=0,
            )
            X_synth.append(X)
            y_synth.append(y)

        # Only the synthetic samples are used for fitting. Mixing in the real
        # training samples would have to happen after they are created below.
        # np.vstack replaces np.row_stack, a deprecated alias of the same function.
        X_synth = np.vstack(X_synth)
        y_synth = np.vstack(y_synth)

        # The scaler fitted on the synthetic data is handed to create_ts for the
        # real series.
        # NOTE(review): presumably so both share one normalisation; confirm how
        # create_ts uses the `scaler` argument.
        (X_train, y_train), (X_val, y_val), *_ = create_ts(
            time_series[["Close"]],
            time_series["Close"],
            lags=lags,
            horizon=horizon,
            stride=stride,
            data_preprocess=("log_returns", "normalize"),
            val_size=val_size,
            test_size=test_size,
            scaler=scaler,
        )
        # Keep only the first two axes of every feature array.
        X_train, X_val, X_synth = (
            features_arr.reshape(features_arr.shape[:2])
            for features_arr in (X_train, X_val, X_synth)
        )

        model = CatBoostRegressor(**model_params)
        model.fit(X_synth, y_synth)
        results.append(
            {
                "train": MAE(y_train, model.predict(X_train)),
                "val": MAE(y_val, model.predict(X_val)),
            }
        )
    return results

In [21]:
# Fit on the QuantGAN-generated data via train_synth and display the per-series
# MAE on the real series. This rebinds `results` from the previous experiment.
results = train_synth(f"{dataset_path}synthetic/QuantGAN/")
results

24it [06:09, 15.40s/it]


[{'train': 0.5581620788603382, 'val': 0.42703556650480246},
 {'train': 0.7406758124535515, 'val': 0.9862884430025866},
 {'train': 0.7078966500998958, 'val': 0.5058399773633997},
 {'train': 0.7131427472027942, 'val': 0.722834310710203},
 {'train': 0.6287411470430859, 'val': 0.4850682318406551},
 {'train': 0.6676337978540728, 'val': 0.6423952676044954},
 {'train': 0.7229046373865335, 'val': 1.0700081309955045},
 {'train': 0.7087365816147391, 'val': 0.5972057723694857},
 {'train': 0.7726383208810653, 'val': 0.5512092895441913},
 {'train': 0.7429271469871261, 'val': 0.6329099797144754},
 {'train': 0.5912808347980893, 'val': 0.45215353034844796},
 {'train': 0.7076944643460387, 'val': 0.379975456041009},
 {'train': 0.6497924468330752, 'val': 0.6296742028964557},
 {'train': 0.7363951925757601, 'val': 0.40995225530310053},
 {'train': 0.6736557874469855, 'val': 0.38380213233104726},
 {'train': 0.6291133154739842, 'val': 0.3884693754649044},
 {'train': 0.6402245644869348, 'val': 0.47968147876775

In [22]:
# Persist the QuantGAN metrics. The path is built with pathlib because a
# literal "\\" is only a directory separator on Windows. The output folder is
# created if it is missing.
out_path = Path("results") / "QuantGAN_synth_cbr.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(results).to_csv(out_path, index=False)

# Augmentation with FourierFlow synthetic data (model is trained on the synthetic series only and evaluated on the real series)

In [23]:
# Fit on the FourierFlow-generated data via train_synth and display the
# per-series MAE on the real series. This rebinds `results`.
results = train_synth(f"{dataset_path}synthetic/FourierFlow/")
results

24it [06:52, 17.19s/it]


[{'train': 0.472414447407477, 'val': 0.4156479829277657},
 {'train': 0.4673385304328088, 'val': 0.6777075263474912},
 {'train': 0.4991501964867769, 'val': 0.3884854344428356},
 {'train': 0.5576655861371456, 'val': 0.6011950560212238},
 {'train': 0.5258843738212344, 'val': 0.44704001474100896},
 {'train': 0.5428999737247002, 'val': 0.5439649640159456},
 {'train': 0.5497361874432269, 'val': 0.8651841819236795},
 {'train': 0.5355060595106209, 'val': 0.478801936376283},
 {'train': 0.6126725190400838, 'val': 0.4482768803656117},
 {'train': 0.5810299152064772, 'val': 0.512053773543315},
 {'train': 0.6146213646426646, 'val': 0.48384065519324926},
 {'train': 0.5564821867522125, 'val': 0.313480507316827},
 {'train': 0.5462842853676411, 'val': 0.5503344804341775},
 {'train': 0.5794021248828838, 'val': 0.34392428935947855},
 {'train': 0.5807505140736255, 'val': 0.3415349530957658},
 {'train': 0.544800046985519, 'val': 0.3525574584093807},
 {'train': 0.528634207581134, 'val': 0.40704937354552917},

In [24]:
# Persist the FourierFlow metrics. The path is built with pathlib because a
# literal "\\" is only a directory separator on Windows. The output folder is
# created if it is missing.
out_path = Path("results") / "FourierFlow_synth_cbr.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(results).to_csv(out_path, index=False)

# Augmentation with RealNVP synthetic data (model is trained on the synthetic series only and evaluated on the real series)

In [25]:
# Fit on the RealNVP-generated data via train_synth and display the per-series
# MAE on the real series. This rebinds `results`.
results = train_synth(f"{dataset_path}synthetic/RealNVP/")
results

24it [06:07, 15.31s/it]


[{'train': 0.44821009358717867, 'val': 0.44186669788799765},
 {'train': 0.4791537119518723, 'val': 0.7253706113054383},
 {'train': 0.45298236021498317, 'val': 0.3828760573758874},
 {'train': 0.11602518596493146, 'val': 0.14960830230047859},
 {'train': 0.4568607497279644, 'val': 0.4710543614091952},
 {'train': 0.5661806736305175, 'val': 0.5826485279899543},
 {'train': 0.5957985196085549, 'val': 0.9783628604221273},
 {'train': 0.5253719510363148, 'val': 0.4972970116294527},
 {'train': 0.6707809047941136, 'val': 0.5161722885382267},
 {'train': 0.630413630217284, 'val': 0.5920519216387193},
 {'train': 0.6169952582650898, 'val': 0.49867989128442475},
 {'train': 0.5659989214348362, 'val': 0.32666520775471275},
 {'train': 0.545203580839261, 'val': 0.5571362491156171},
 {'train': 0.61626152554795, 'val': 0.37075112503965685},
 {'train': 0.6228740414677943, 'val': 0.37432959479934114},
 {'train': 0.5477765874814933, 'val': 0.3607418138494773},
 {'train': 0.5451584024037488, 'val': 0.42850970437

In [26]:
# Persist the RealNVP metrics. The path is built with pathlib because a
# literal "\\" is only a directory separator on Windows. The output folder is
# created if it is missing.
out_path = Path("results") / "RealNVP_synth_cbr.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(results).to_csv(out_path, index=False)

# Augmentation with TTS GAN synthetic data (model is trained on the synthetic series only and evaluated on the real series)

In [27]:
# Fit on the TTS GAN-generated data via train_synth and display the per-series
# MAE on the real series. This rebinds `results`.
results = train_synth(f"{dataset_path}synthetic/TTS_GAN_standard/")
results

24it [06:15, 15.64s/it]


[{'train': 0.5318373800073919, 'val': 0.47435486321322845},
 {'train': 0.552310526820815, 'val': 0.7156018267812723},
 {'train': 0.18082850684926596, 'val': 0.14143748897065495},
 {'train': 0.14219920368833194, 'val': 0.14290718060431443},
 {'train': 0.2496834139188196, 'val': 0.24814578140820384},
 {'train': 0.8650025073040402, 'val': 0.8306463221688533},
 {'train': 0.5960645401342772, 'val': 0.8825077658619218},
 {'train': 0.99311020198162, 'val': 0.841576871679518},
 {'train': 0.3040346841236944, 'val': 0.22312216351950925},
 {'train': 0.3584225118921459, 'val': 0.3104392233254107},
 {'train': 1.0887808582860332, 'val': 0.833868502798798},
 {'train': 0.8541232137071266, 'val': 0.4558378820328849},
 {'train': 0.7509940133435706, 'val': 0.7264380723573328},
 {'train': 0.7870810101250203, 'val': 0.4452968160722733},
 {'train': 0.8774430891577936, 'val': 0.4882315839081644},
 {'train': 0.7709750559275947, 'val': 0.476199793057011},
 {'train': 0.8416773935757502, 'val': 0.631657212517380

In [28]:
# Persist the TTS GAN metrics. The path is built with pathlib because a
# literal "\\" is only a directory separator on Windows. The output folder is
# created if it is missing.
out_path = Path("results") / "TTS_GAN_synth_cbr.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(results).to_csv(out_path, index=False)