In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

import torch
from torch.utils.tensorboard import SummaryWriter
from pytorch_lightning import seed_everything

from utils.data import get_hsm_dataset, split_data, log_returns
from utils.metrics import MAPE, WAPE, MAE
from utils.TTS_GAN import TTS_GAN_Generator, TTS_GAN_Discriminator, weights_init, train_TTS_GAN

In [2]:
dataset_path = "data/huge_stock_market_dataset/"
synthetic_path = f"{dataset_path}synthetic/TTS_GAN_standard/"
models_dir = "models/"

In [3]:
device = gpu = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

val_size = 0.0
test_size = 0.3

lr = 2e-4
wd = 0
ctrl_lr = 3.5e-4
beta1 = 0.0
beta2 = 0.9
max_epoch = 200
latent_dim = 128
batch_size = gen_batch_size = dis_batch_size = 64
ema = 0.995
ema_kimg = 500
ema_warmup = 0
world_size = 0
rank = - 1
print_freq = 50
n_critic = 1
phi = 1
accumulated_times = g_accumulated_times = 1
loss = "standard"
seq_len = 150

n_samples = 1600 * 127  # number of samples generated by QuantGAN

cuda:0


In [4]:
ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")
seed_everything(0)

start_point = 0
for _ in range(start_point): next(ts_iterator)

for ts_index, time_series in enumerate(ts_iterator, start=start_point):
    print(f"Time Series #{ts_index}")
    
    (train_ts, *_), *_ = split_data(time_series, val_size=val_size, test_size=test_size)
    train_ts = log_returns(train_ts)
    
    # using all series to train model
    # train_ts = train_ts[:len(train_ts) // 15 * 15]
    # train_dl = torch.utils.data.DataLoader([torch.from_numpy(train_ts.values.reshape(1, 1, - 1)).to(device)], batch_size=batch_size, shuffle=True)
    # seq_len = len(train_ts)

    # using sequences of seq_len to train model
    train_ts = np.array([train_ts[i: i + seq_len] for i in range(len(train_ts) - seq_len)])
    train_dl = torch.utils.data.DataLoader(torch.from_numpy(train_ts.reshape(- 1, 1, 1, seq_len)).to(device), batch_size=batch_size, shuffle=True)

    TTS_GAN_gen = TTS_GAN_Generator(seq_len=seq_len, channels=1, latent_dim=latent_dim, ).to(device)
    TTS_GAN_dis = TTS_GAN_Discriminator(seq_length=seq_len, in_channels=1).to(device)

    gen_optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, TTS_GAN_gen.parameters()), lr)
    dis_optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, TTS_GAN_dis.parameters()), lr)
    
    for epoch in range(max_epoch):
        losses = train_TTS_GAN(globals(), TTS_GAN_gen, TTS_GAN_dis, gen_optimizer, dis_optimizer, train_dl, epoch)
    tqdm.write(f"generator loss: {losses[0]: 0.4f} discriminator loss: {losses[1]: 0.4f}")
    del dis_optimizer, gen_optimizer, TTS_GAN_dis, train_dl
    torch.cuda.empty_cache()

    samples_to_gen = n_samples // seq_len
    synth_data = []
    with torch.no_grad():
        for _ in range(samples_to_gen):
            z = torch.cuda.FloatTensor(np.random.normal(0, 1, (1, latent_dim))).cuda(device, non_blocking=True)
            synth_data.append(TTS_GAN_gen(z).cpu().numpy())
            del z
            torch.cuda.empty_cache()
    np.save(synthetic_path + f"selected{ts_index}.npy", np.row_stack(synth_data))

    del TTS_GAN_gen, synth_data
    torch.cuda.empty_cache()

Global seed set to 0


Time Series #0
generator loss: -0.5939 discriminator loss:  1.4007
Time Series #1
generator loss: -0.4423 discriminator loss:  1.1343
Time Series #2
generator loss: -0.5203 discriminator loss:  1.2282
Time Series #3
generator loss: -0.3505 discriminator loss:  0.7936
Time Series #4
generator loss: -0.6492 discriminator loss:  1.6032
Time Series #5
generator loss: -0.6025 discriminator loss:  1.4370
Time Series #6
generator loss: -0.5981 discriminator loss:  1.4023
Time Series #7
generator loss: -0.5905 discriminator loss:  1.3788
Time Series #8
generator loss: -0.3726 discriminator loss:  0.8513
Time Series #9
generator loss: -0.3367 discriminator loss:  0.9390
Time Series #10
generator loss: -0.6200 discriminator loss:  1.4492
Time Series #11
generator loss: -0.5958 discriminator loss:  1.3959
Time Series #12
generator loss: -0.6129 discriminator loss:  1.4360
Time Series #13
generator loss: -0.6062 discriminator loss:  1.3992
Time Series #14
generator loss: -0.6182 discriminator loss

0.7 train data

standard loss time: 165 min

0.85 train data

standard loss:
first 11 ts time: 38 min
another 13 ts time: 176 min

hinge loss:
first 11 ts time: 33 min
first 17 ts time: 83 min
another 7 ts time: 145 min
#18 gen 0.8737 dis 0.1096

# Similarity

In [1]:
from tqdm import tqdm
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.special import kl_div

from utils.data import get_hsm_dataset, split_data, log_returns

In [5]:
dataset_path = Path("data/huge_stock_market_dataset/")
results_dir = Path("results")

val_size = 0.0
test_size = 0.3
seq_len = 150

sj_div = lambda x, y: (kl_div(x, (x + y) / 2) + kl_div(y, (x + y) / 2)) / 2
min_max_norm = lambda x: (x - x.min()) / (x.max() - x.min())

In [7]:
for model in ("TTS_GAN",):
    synthetic_path = dataset_path / f"synthetic/TTS_GAN_standard/"
    results = {"kl_div": [], "sj_div": []}
    ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected.csv")

    for ts_index, time_series in tqdm(enumerate(ts_iterator)):
        (train_ts, *_), *_ = split_data(time_series, val_size=val_size, test_size=test_size)
        train_ts = log_returns(train_ts).values.flatten()
        train_ts = min_max_norm(train_ts)
        train_tss = [train_ts[i: i + seq_len] for i in range(len(train_ts) - seq_len + 1)]
        
        synth_tss = np.load(synthetic_path / f"selected{ts_index}.npy")
        kl_div_res = sj_div_res = 0
        for synth_ts in tqdm(synth_tss, leave=False):
            synth_ts = min_max_norm(synth_ts)
            for train_ts in train_tss:
                res = kl_div(synth_ts, train_ts)
                kl_div_res += np.where(np.isinf(res), 0, res).mean()
                sj_div_res += sj_div(synth_ts, train_ts).mean()
        results["kl_div"].append(kl_div_res / len(synth_tss) / len(train_tss))
        results["sj_div"].append(sj_div_res / len(synth_tss) / len(train_tss))
    
    pd.DataFrame(results).to_csv(results_dir / f"synth_sim_{model}.csv", index=False)

24it [59:58, 149.94s/it]
