In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

import torch
# from torch.utils.tensorboard import SummaryWriter
from pytorch_lightning import seed_everything

from utils.data import get_hsm_dataset, get_solar_energy_dataset, get_fuel_prices_dataset, get_passengers_dataset, split_data, log_returns
from utils.metrics import MAPE, WAPE, MAE
from utils.TTS_GAN import TTS_GAN_Generator, TTS_GAN_Discriminator, weights_init, train_TTS_GAN

In [2]:
hsm_dataset_path = "data/huge_stock_market_dataset/"
solar_energy_dataset_path = "data/solar_energy/"
fuel_prices_dataset_path = "data/fuel_prices/"
passengers_dataset_path = "data/air_passengers/"
models_dir = "models/"

In [3]:
device = gpu = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

lr = 2e-4
wd = 0
ctrl_lr = 3.5e-4
beta1 = 0.0
beta2 = 0.9
max_epoch = 20
latent_dim = 128
batch_size = gen_batch_size = dis_batch_size = 64
ema = 0.995
ema_kimg = 500
ema_warmup = 0
world_size = 0
rank = - 1
print_freq = 50
n_critic = 1
phi = 1
accumulated_times = g_accumulated_times = 1
loss = "standard"
seq_len = 150

n_samples = 800 * 127  # number of samples generated by QuantGAN

cuda:0


In [4]:
ts_iterator = get_hsm_dataset(hsm_dataset_path, selected_files=f"{hsm_dataset_path}/selected100.csv")
synthetic_path = f"{hsm_dataset_path}synthetic/TTS_GAN/"
seed_everything(0)

start_point = 0
for _ in range(start_point): next(ts_iterator)

for ts_index, time_series in enumerate(ts_iterator, start=start_point):
    print(f"Time Series #{ts_index}")
    
    train_ts = log_returns(time_series)

    # using sequences of seq_len to train model
    train_ts = np.array([train_ts[i: i + seq_len] for i in range(len(train_ts) - seq_len)])
    train_dl = torch.utils.data.DataLoader(torch.from_numpy(train_ts.reshape(- 1, 1, 1, seq_len)).to(device), batch_size=batch_size, shuffle=True)

    TTS_GAN_gen = TTS_GAN_Generator(seq_len=seq_len, channels=1, latent_dim=latent_dim, ).to(device)
    TTS_GAN_dis = TTS_GAN_Discriminator(seq_length=seq_len, in_channels=1).to(device)

    gen_optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, TTS_GAN_gen.parameters()), lr)
    dis_optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, TTS_GAN_dis.parameters()), lr)
    
    for epoch in range(max_epoch):
        losses = train_TTS_GAN(globals(), TTS_GAN_gen, TTS_GAN_dis, gen_optimizer, dis_optimizer, train_dl, epoch)
    tqdm.write(f"generator loss: {losses[0]: 0.4f} discriminator loss: {losses[1]: 0.4f}")
    del dis_optimizer, gen_optimizer, TTS_GAN_dis, train_dl
    torch.cuda.empty_cache()

    samples_to_gen = n_samples // seq_len
    synth_data = []
    with torch.no_grad():
        for _ in range(samples_to_gen):
            z = torch.cuda.FloatTensor(np.random.normal(0, 1, (1, latent_dim))).cuda(device, non_blocking=True)
            synth_data.append(TTS_GAN_gen(z).cpu().numpy())
            del z
            torch.cuda.empty_cache()
    np.save(synthetic_path + f"selected{ts_index}.npy", np.row_stack(synth_data))

    del TTS_GAN_gen, synth_data
    torch.cuda.empty_cache()

Global seed set to 0


Time Series #0
generator loss: -0.6032 discriminator loss:  1.4732
Time Series #1
generator loss: -0.5496 discriminator loss:  1.3828
Time Series #2
generator loss: -0.5574 discriminator loss:  1.1996
Time Series #3
generator loss: -0.6008 discriminator loss:  1.4814
Time Series #4
generator loss: -0.3154 discriminator loss:  0.8688
Time Series #5
generator loss: -0.5753 discriminator loss:  1.3119
Time Series #6
generator loss: -0.5630 discriminator loss:  1.2776
Time Series #7
generator loss: -0.5748 discriminator loss:  1.3138
Time Series #8
generator loss: -0.5272 discriminator loss:  1.1397
Time Series #9
generator loss: -0.5831 discriminator loss:  1.3381
Time Series #10
generator loss: -0.5642 discriminator loss:  1.3654
Time Series #11
generator loss: -0.7545 discriminator loss:  1.9704
Time Series #12
generator loss: -0.5646 discriminator loss:  1.2984
Time Series #13
generator loss: -0.5375 discriminator loss:  1.1957
Time Series #14
generator loss: -0.5787 discriminator loss

Time: ~53 min

In [4]:
ts_iterator = get_solar_energy_dataset(solar_energy_dataset_path, max_results=10)
synthetic_path = f"{solar_energy_dataset_path}synthetic/TTS_GAN/"
seed_everything(0)
max_epoch = 4

start_point = 0
for _ in range(start_point): next(ts_iterator)

for ts_index, time_series in enumerate(ts_iterator, start=start_point):
    print(f"Time Series #{ts_index}")
    
    train_ts = log_returns(time_series + 1e-9)

    # using sequences of seq_len to train model
    train_ts = np.array([train_ts[i: i + seq_len] for i in range(len(train_ts) - seq_len)])
    train_dl = torch.utils.data.DataLoader(torch.from_numpy(train_ts.reshape(- 1, 1, 1, seq_len)).to(device), batch_size=batch_size, shuffle=True)

    TTS_GAN_gen = TTS_GAN_Generator(seq_len=seq_len, channels=1, latent_dim=latent_dim, ).to(device)
    TTS_GAN_dis = TTS_GAN_Discriminator(seq_length=seq_len, in_channels=1).to(device)

    gen_optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, TTS_GAN_gen.parameters()), lr)
    dis_optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, TTS_GAN_dis.parameters()), lr)
    
    for epoch in range(max_epoch):
        losses = train_TTS_GAN(globals(), TTS_GAN_gen, TTS_GAN_dis, gen_optimizer, dis_optimizer, train_dl, epoch)
    tqdm.write(f"generator loss: {losses[0]: 0.4f} discriminator loss: {losses[1]: 0.4f}")
    del dis_optimizer, gen_optimizer, TTS_GAN_dis, train_dl
    torch.cuda.empty_cache()

    samples_to_gen = n_samples // seq_len
    synth_data = []
    with torch.no_grad():
        for _ in range(samples_to_gen):
            z = torch.cuda.FloatTensor(np.random.normal(0, 1, (1, latent_dim))).cuda(device, non_blocking=True)
            synth_data.append(TTS_GAN_gen(z).cpu().numpy())
            del z
            torch.cuda.empty_cache()
    np.save(synthetic_path + f"selected{ts_index}.npy", np.row_stack(synth_data))

    del TTS_GAN_gen, synth_data
    torch.cuda.empty_cache()

Global seed set to 0


Time Series #0
generator loss: -0.3176 discriminator loss:  0.7872
Time Series #1
generator loss: -0.3169 discriminator loss:  0.7768
Time Series #2
generator loss: -0.3166 discriminator loss:  0.7941
Time Series #3
generator loss: -0.3146 discriminator loss:  0.7864
Time Series #4
generator loss: -0.3151 discriminator loss:  0.7826
Time Series #5
generator loss: -0.3160 discriminator loss:  0.7852
Time Series #6
generator loss: -0.3162 discriminator loss:  0.7807
Time Series #7
generator loss: -0.3144 discriminator loss:  0.7878
Time Series #8
generator loss: -0.3152 discriminator loss:  0.7822
Time Series #9
generator loss: -0.3441 discriminator loss:  0.8143


In [4]:
ts_iterator = get_fuel_prices_dataset(fuel_prices_dataset_path)
synthetic_path = f"{fuel_prices_dataset_path}synthetic/TTS_GAN/"
seed_everything(0)
max_epoch = 10

start_point = 0
for _ in range(start_point): next(ts_iterator)

for ts_index, time_series in enumerate(ts_iterator, start=start_point):
    print(f"Time Series #{ts_index}")
    
    train_ts = log_returns(time_series + 1e-9)

    # using sequences of seq_len to train model
    train_ts = np.array([train_ts[i: i + seq_len] for i in range(len(train_ts) - seq_len)])
    train_dl = torch.utils.data.DataLoader(torch.from_numpy(train_ts.reshape(- 1, 1, 1, seq_len)).to(device), batch_size=batch_size, shuffle=True)

    TTS_GAN_gen = TTS_GAN_Generator(seq_len=seq_len, channels=1, latent_dim=latent_dim, ).to(device)
    TTS_GAN_dis = TTS_GAN_Discriminator(seq_length=seq_len, in_channels=1).to(device)

    gen_optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, TTS_GAN_gen.parameters()), lr)
    dis_optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, TTS_GAN_dis.parameters()), lr)
    
    for epoch in range(max_epoch):
        losses = train_TTS_GAN(globals(), TTS_GAN_gen, TTS_GAN_dis, gen_optimizer, dis_optimizer, train_dl, epoch)
    tqdm.write(f"generator loss: {losses[0]: 0.4f} discriminator loss: {losses[1]: 0.4f}")
    del dis_optimizer, gen_optimizer, TTS_GAN_dis, train_dl
    torch.cuda.empty_cache()

    samples_to_gen = n_samples // seq_len
    synth_data = []
    with torch.no_grad():
        for _ in range(samples_to_gen):
            z = torch.cuda.FloatTensor(np.random.normal(0, 1, (1, latent_dim))).cuda(device, non_blocking=True)
            synth_data.append(TTS_GAN_gen(z).cpu().numpy())
            del z
            torch.cuda.empty_cache()
    np.save(synthetic_path + f"selected{ts_index}.npy", np.row_stack(synth_data))

    del TTS_GAN_gen, synth_data
    torch.cuda.empty_cache()

Global seed set to 0


Time Series #0
generator loss: -0.5822 discriminator loss:  1.3599
Time Series #1
generator loss: -0.6760 discriminator loss:  1.6666
Time Series #2
generator loss: -0.7566 discriminator loss:  2.0300
Time Series #3
generator loss: -0.4859 discriminator loss:  1.1037
Time Series #4
generator loss: -0.5499 discriminator loss:  1.2034
Time Series #5
generator loss: -0.7811 discriminator loss:  1.9983
Time Series #6
generator loss: -0.6740 discriminator loss:  1.6647
Time Series #7
generator loss: -0.7084 discriminator loss:  1.8137


In [5]:
ts_iterator = get_passengers_dataset(passengers_dataset_path, max_results=99)
synthetic_path = f"{passengers_dataset_path}synthetic/TTS_GAN/"
seed_everything(0)
max_epoch = 10

start_point = 0
for _ in range(start_point): next(ts_iterator)

for ts_index, time_series in enumerate(ts_iterator, start=start_point):
    print(f"Time Series #{ts_index}")
    
    train_ts = log_returns(time_series + 1e-9)

    # using sequences of seq_len to train model
    train_ts = np.array([train_ts[i: i + seq_len] for i in range(len(train_ts) - seq_len)])
    train_dl = torch.utils.data.DataLoader(torch.from_numpy(train_ts.reshape(- 1, 1, 1, seq_len)).to(device), batch_size=batch_size, shuffle=True)

    TTS_GAN_gen = TTS_GAN_Generator(seq_len=seq_len, channels=1, latent_dim=latent_dim, ).to(device)
    TTS_GAN_dis = TTS_GAN_Discriminator(seq_length=seq_len, in_channels=1).to(device)

    gen_optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, TTS_GAN_gen.parameters()), lr)
    dis_optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, TTS_GAN_dis.parameters()), lr)
    
    for epoch in range(max_epoch):
        losses = train_TTS_GAN(globals(), TTS_GAN_gen, TTS_GAN_dis, gen_optimizer, dis_optimizer, train_dl, epoch)
    tqdm.write(f"generator loss: {losses[0]: 0.4f} discriminator loss: {losses[1]: 0.4f}")
    del dis_optimizer, gen_optimizer, TTS_GAN_dis, train_dl
    torch.cuda.empty_cache()

    samples_to_gen = n_samples // seq_len
    synth_data = []
    with torch.no_grad():
        for _ in range(samples_to_gen):
            z = torch.cuda.FloatTensor(np.random.normal(0, 1, (1, latent_dim))).cuda(device, non_blocking=True)
            synth_data.append(TTS_GAN_gen(z).cpu().numpy())
            del z
            torch.cuda.empty_cache()
    np.save(synthetic_path + f"selected{ts_index}.npy", np.row_stack(synth_data))

    del TTS_GAN_gen, synth_data
    torch.cuda.empty_cache()

Global seed set to 0


Time Series #0
generator loss: -0.5794 discriminator loss:  1.4573
Time Series #1
generator loss: -0.5417 discriminator loss:  1.4018
Time Series #2
generator loss: -0.5690 discriminator loss:  1.4125
Time Series #3
generator loss: -0.5147 discriminator loss:  1.4139
Time Series #4
generator loss: -0.5443 discriminator loss:  1.5028
Time Series #5
generator loss: -0.5339 discriminator loss:  1.2892
Time Series #6
generator loss: -0.5660 discriminator loss:  1.4827
Time Series #7
generator loss: -0.5801 discriminator loss:  1.4706
Time Series #8
generator loss: -0.5934 discriminator loss:  1.3918
Time Series #9
generator loss: -0.6092 discriminator loss:  1.3920
Time Series #10
generator loss: -0.4855 discriminator loss:  1.1645
Time Series #11
generator loss: -0.5292 discriminator loss:  1.3872
Time Series #12
generator loss: -0.4867 discriminator loss:  1.1863
Time Series #13
generator loss: -0.5843 discriminator loss:  1.3863
Time Series #14
generator loss: -0.5553 discriminator loss

# Similarity

In [6]:
from tqdm import tqdm
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.special import kl_div

from utils.data import get_hsm_dataset, get_solar_energy_dataset, get_fuel_prices_dataset, get_passengers_dataset, split_data, log_returns

In [7]:
hsm_dataset_path = "data/huge_stock_market_dataset/"
solar_energy_dataset_path = "data/solar_energy/"
fuel_prices_dataset_path = "data/fuel_prices/"
passengers_dataset_path = "data/air_passengers/"
results_dir = Path("results")

seq_len = 150

sj_div = lambda x, y: (kl_div(x, (x + y) / 2) + kl_div(y, (x + y) / 2)) / 2
min_max_norm = lambda x: (x - x.min()) / (x.max() - x.min())

In [9]:
start_dataset = 2
start_ts = 0

for ds_ind, (dataset_path, dataset_name) in enumerate(((Path("data/huge_stock_market_dataset/"), "hsm"),\
     (Path("data/solar_energy"), "se"), (Path("data/fuel_prices/"), "fp"),\
        (Path("data/air_passengers/"), "ap"))):
    if ds_ind < start_dataset: continue
    print(f"processing {dataset_name} dataset")

    for model in ("TTS_GAN",):
        synthetic_path = dataset_path / f"synthetic/TTS_GAN/"
        if dataset_name == "hsm":
            ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected100.csv")
        elif dataset_name == "se":
            ts_iterator = get_solar_energy_dataset(dataset_path, max_results=10)
        elif dataset_name == "fp":
            ts_iterator = get_fuel_prices_dataset(dataset_path)
        else:
            ts_iterator = get_passengers_dataset(dataset_path, max_results=99)
        for _ in range(start_ts): next(ts_iterator)
        results = {"kl_div": [], "sj_div": []}

        for ts_index, time_series in tqdm(enumerate(ts_iterator)):
            train_ts = log_returns(time_series if dataset_name == "hsm" else time_series + 1e-9).values.flatten()
            train_ts = min_max_norm(train_ts)
            train_tss = [train_ts[i: i + seq_len] for i in range(0, len(train_ts), seq_len) if i < len(train_ts) - seq_len + 1]
            
            synth_tss = np.load(synthetic_path / f"selected{ts_index}.npy")
            kl_div_res = sj_div_res = 0
            for synth_ts in tqdm(synth_tss, leave=False):
                synth_ts = min_max_norm(synth_ts)
                for train_ts in train_tss:
                    res = kl_div(synth_ts, train_ts)
                    kl_div_res += np.where(np.isinf(res), 0, res).mean()
                    sj_div_res += sj_div(synth_ts, train_ts).mean()
            results["kl_div"].append(kl_div_res / len(synth_tss) / len(train_tss))
            results["sj_div"].append(sj_div_res / len(synth_tss) / len(train_tss))
        
        pd.DataFrame(results).to_csv(results_dir / f"synth_{dataset_name}_sim_{model}.csv", index=False)

processing fp dataset


8it [00:01,  4.55it/s]


processing ap dataset


99it [00:55,  1.78it/s]
