In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from tqdm import tqdm

import torch

from utils.data import get_hsm_dataset, get_solar_energy_dataset, split_data
from utils.metrics import MAPE, WAPE, MAE
from utils.dl import QuantGAN_Discriminator, QuantGAN_Generator
from utils.QuantGAN_gaussianize import Gaussianize

In [2]:
hsm_dataset_path = "data/huge_stock_market_dataset/"
solar_energy_dataset_path = "data/solar_energy/"
models_dir = "models/"

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

batches_to_gen = 10

num_epochs = 10
nz = 3
batch_size = 80
seq_len = 127
clip = 0.01
lr = 0.0002
receptive_field_size = 127

cuda:0


In [4]:
class Loader32(torch.utils.data.Dataset):
    def __init__(self, data, length):
        assert len(data) >= length
        self.data = data
        self.length = length
    
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx:idx+self.length]).reshape(- 1, self.length).to(torch.float32)
        
    def __len__(self):
        return max(len(self.data)-self.length, 0)

In [5]:
def model_routine(time_series, ts_index, synthetic_path):
    global val_size, test_size, batches_to_gen, num_epochs, nz, batch_size, seq_len, clip, lr, receptive_field_size
    
    # preprocessing steps according to the QuanGAN paper
    df = time_series
    returns = df.shift(1) / df - 1
    log_returns = np.log(df / df.shift(1))[1:].to_numpy().reshape(- 1, 1)
    standardScaler1 = StandardScaler()
    standardScaler2 = StandardScaler()
    gaussianize = Gaussianize()
    log_returns_preprocessed = standardScaler2.fit_transform(gaussianize.fit_transform(standardScaler1.fit_transform(log_returns)))
    data_size = log_returns.shape[0]

    # defining models and optimizers
    generator = QuantGAN_Generator().to(device)
    discriminator = QuantGAN_Discriminator(seq_len).to(device)
    disc_optimizer = torch.optim.RMSprop(discriminator.parameters(), lr=lr)
    gen_optimizer = torch.optim.RMSprop(generator.parameters(), lr=lr)

    # data preparing
    dataset = Loader32(log_returns_preprocessed, 127)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    
    t = tqdm(range(num_epochs))
    for epoch in t:
        for idx, data in enumerate(dataloader, 0):

            discriminator.zero_grad()
            real = data.to(device)
            batch_size, seq_len = real.size(0), real.size(2)
            noise = torch.randn(batch_size, nz, seq_len, device=device)
            fake = generator(noise).detach()
            disc_loss = - torch.mean(discriminator(real)) + torch.mean(discriminator(fake))
            disc_loss.backward()
            disc_optimizer.step()

            for dp in discriminator.parameters():
                dp.data.clamp_(-clip, clip)
    
            if idx % 5 == 0:
                generator.zero_grad()
                gen_loss = - torch.mean(discriminator(generator(noise)))
                gen_loss.backward()
                gen_optimizer.step()
        t.set_description('Discriminator Loss: %.8f Generator Loss: %.8f' % (disc_loss.item(), gen_loss.item()))
    # saving model
    torch.save(generator, models_dir +  f'QuantGAN_generator_selected{ts_index}.pth')

    # generation synthetic time series
    generator.eval()
    ys = []
    for _ in range(batches_to_gen):
        with torch.no_grad():
            noise = torch.randn(80, 3, 127).to(device)
            y = generator(noise).cpu().detach().squeeze()

        y = (y - y.mean(axis=0)) / y.std(axis=0)
        y = standardScaler2.inverse_transform(y)
        y = np.array([gaussianize.inverse_transform(np.expand_dims(x, 1)) for x in y]).squeeze()
        y = standardScaler1.inverse_transform(y)

        # some basic filtering to redue the tendency of GAN to produce extreme returns
        y = y[(y.max(axis=1) <= 2 * log_returns.max()) & (y.min(axis=1) >= 2 * log_returns.min())]
        y -= y.mean()
        ys.append(y)

    np.save(synthetic_path + f"selected{ts_index}.npy", np.row_stack(ys))

    del discriminator, generator, disc_loss, gen_loss, dataloader, dataset, y
    torch.cuda.empty_cache()

In [6]:
ts_iterator = get_hsm_dataset(hsm_dataset_path, selected_files=f"{hsm_dataset_path}/selected100.csv")
synthetic_path = f"{hsm_dataset_path}synthetic/QuantGAN/"
start_point = 100
for _ in range(start_point): next(ts_iterator)

for ts_index, time_series in enumerate(ts_iterator, start_point):
    print(f"Time Series #{ts_index}")
    model_routine(time_series, ts_index, synthetic_path)

In [7]:
num_epochs = 4
ts_iterator = get_solar_energy_dataset(solar_energy_dataset_path, max_results=10)
synthetic_path = f"{solar_energy_dataset_path}synthetic/QuantGAN/"
start_point = 10
for _ in range(start_point): next(ts_iterator)

for ts_index, time_series in enumerate(ts_iterator, start_point):
    print(f"Time Series #{ts_index}")
    model_routine(time_series + 1e-9, ts_index, synthetic_path)

# Similarity

In [3]:
from tqdm import tqdm
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.special import kl_div

from utils.data import get_hsm_dataset, get_solar_energy_dataset, split_data, log_returns

In [4]:
results_dir = Path("results")

seq_len = 127

sj_div = lambda x, y: (kl_div(x, (x + y) / 2) + kl_div(y, (x + y) / 2)) / 2
min_max_norm = lambda x: (x - x.min()) / (x.max() - x.min())

In [9]:
start_dataset = 0
start_ts = 0

for dataset_path, dataset_name in ((Path("data/huge_stock_market_dataset/"), "hsm"),\
     (Path("data/solar_energy"), "se")):
    if dataset_name == "hsm" and start_dataset == 1: continue
    print(f"processing {dataset_name} dataset")
    for model in ("QuantGAN",):
        synthetic_path = dataset_path / f"synthetic/{model}/"
        if (results_dir / f"synth_{dataset_name}_sim_{model}.csv").exists():
            results = pd.read_csv(results_dir / f"synth_{dataset_name}_sim_{model}.csv").to_dict()
        else:
            results = {"kl_div": {}, "sj_div": {}}
        if dataset_name == "hsm":
            ts_iterator = get_hsm_dataset(dataset_path, selected_files=f"{dataset_path}/selected100.csv")
        else:
            ts_iterator = get_solar_energy_dataset(dataset_path, max_results=10)
        for _ in range(start_ts): next(ts_iterator)

        for ts_index, time_series in tqdm(enumerate(ts_iterator, start=start_ts)):
            train_ts = log_returns(time_series + 1e-9).values.flatten()
            train_ts = min_max_norm(train_ts)
            train_tss = [train_ts[i: i + seq_len] for i in range(0, len(train_ts), seq_len) if i < len(train_ts) - seq_len + 1]
            
            synth_tss = np.load(synthetic_path / f"selected{ts_index}.npy")
            kl_div_res = sj_div_res = 0
            for synth_ts in tqdm(synth_tss):
                synth_ts = min_max_norm(synth_ts)
                # synth_ts = np.histogram(synth_ts, bins=np.arange(start=0, stop=1, step=1/100))[0]
                # train_ts = np.histogram(train_ts, bins=np.arange(start=0, stop=1, step=1/100))[0]
                for train_ts in train_tss:
                    res = kl_div(synth_ts, train_ts)
                    kl_div_res += np.where(np.isinf(res), 0, res).mean()
                    sj_div_res += sj_div(synth_ts, train_ts).mean()
            results["kl_div"][ts_index] = kl_div_res / len(synth_tss) / len(train_tss)
            results["sj_div"][ts_index] = sj_div_res / len(synth_tss) / len(train_tss)
        
            pd.DataFrame(results).to_csv(results_dir / f"synth_{dataset_name}_sim_{model}.csv", index=False)

processing hsm dataset


100%|██████████| 554/554 [00:00<00:00, 2986.49it/s]
100%|██████████| 799/799 [00:00<00:00, 4087.64it/s]
100%|██████████| 758/758 [00:00<00:00, 4293.91it/s]
100%|██████████| 770/770 [00:00<00:00, 2766.93it/s]
100%|██████████| 692/692 [00:00<00:00, 1210.18it/s]
100%|██████████| 532/532 [00:00<00:00, 10878.89it/s]
100%|██████████| 800/800 [00:00<00:00, 3730.90it/s]
100%|██████████| 668/668 [00:00<00:00, 6089.05it/s]
100%|██████████| 783/783 [00:00<00:00, 1321.71it/s]
100%|██████████| 794/794 [00:00<00:00, 11706.95it/s]
100%|██████████| 729/729 [00:00<00:00, 6194.59it/s]
100%|██████████| 513/513 [00:00<00:00, 4050.22it/s]
100%|██████████| 794/794 [00:00<00:00, 2671.25it/s]
100%|██████████| 760/760 [00:00<00:00, 5219.39it/s]
100%|██████████| 798/798 [00:00<00:00, 12117.61it/s]
100%|██████████| 741/741 [00:00<00:00, 5122.87it/s]
100%|██████████| 697/697 [00:00<00:00, 1877.06it/s]
100%|██████████| 800/800 [00:00<00:00, 1857.71it/s]
100%|██████████| 797/797 [00:00<00:00, 1793.99it/s]
100%|████

processing se dataset


100%|██████████| 789/789 [00:23<00:00, 33.55it/s]
100%|██████████| 779/779 [00:22<00:00, 34.49it/s]
100%|██████████| 619/619 [00:17<00:00, 35.17it/s]
100%|██████████| 781/781 [00:23<00:00, 33.61it/s]
100%|██████████| 308/308 [00:09<00:00, 33.49it/s]
100%|██████████| 768/768 [00:23<00:00, 32.75it/s]
100%|██████████| 782/782 [00:22<00:00, 34.01it/s]
100%|██████████| 465/465 [00:13<00:00, 34.34it/s]
100%|██████████| 793/793 [00:22<00:00, 34.91it/s]
100%|██████████| 783/783 [00:22<00:00, 34.37it/s]
10it [03:22, 20.21s/it]
