In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from tqdm import tqdm
from pathlib import Path

import torch

from utils.data import get_hsm_dataset, get_solar_energy_dataset, get_fuel_prices_dataset, get_passengers_dataset, split_data
from utils.metrics import MAPE, WAPE, MAE
from utils.dl import QuantGAN_Discriminator, QuantGAN_Generator
from utils.QuantGAN_gaussianize import Gaussianize

In [2]:
# Every dataset lives in its own sub-directory of a common data root.
_data_root = Path("data")

hsm_dataset_path = _data_root / "huge_stock_market_dataset/"
solar_energy_dataset_path = _data_root / "solar_energy/"
fuel_prices_dataset_path = _data_root / "fuel_prices/"
passengers_dataset_path = _data_root / "air_passengers/"

# Directory the trained generators are written to.
models_dir = Path("models/")

In [3]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Number of generated batches saved per time series.
batches_to_gen = 1

# Training configuration.
num_epochs, batch_size = 10, 80
lr = 0.0002                 # learning rate of both RMSprop optimizers
clip = 0.01                 # bound used when clamping discriminator weights
nz = 3                      # number of noise channels fed to the generator
seq_len = receptive_field_size = 127   # window length of the training samples

cuda:0


In [4]:
class Loader32(torch.utils.data.Dataset):
    """Map-style dataset of overlapping fixed-length windows, returned as float32.

    Item ``idx`` is ``data[idx:idx + length]`` converted to a tensor and
    reshaped to ``(-1, length)``.

    Args:
        data: indexable sequence supporting ``len()`` and slicing that
            ``torch.tensor`` can convert (e.g. a NumPy array).
        length: number of consecutive observations in each window.
    """

    def __init__(self, data, length):
        assert len(data) >= length
        self.data = data
        self.length = length

    def __getitem__(self, idx):
        # Raising IndexError for out-of-range positions lets plain iteration
        # over the dataset terminate instead of failing inside ``reshape``.
        if not 0 <= idx < len(self):
            raise IndexError(f"index {idx} is out of range for {len(self)} windows")
        window = self.data[idx:idx + self.length]
        return torch.tensor(window).reshape(-1, self.length).to(torch.float32)

    def __len__(self):
        # A series of N points contains N - length + 1 full windows; the "+ 1"
        # keeps the last window (start index N - length), which was previously
        # dropped and made a series with exactly ``length`` points look empty.
        return max(len(self.data) - self.length + 1, 0)

In [5]:
def model_routine(time_series, ts_index, synthetic_path):
    """Train a QuantGAN on one time series and save synthetic samples for it.

    Steps:
      1. Preprocess the values: standardize -> Gaussianize -> standardize.
      2. Train the generator/discriminator pair for ``num_epochs`` epochs
         (discriminator weights are clamped to ``[-clip, clip]`` after each
         discriminator step; the generator is updated every 5th batch).
      3. Save the generator to
         ``models_dir / f'QuantGAN_generator_selected{ts_index}.pth'``.
      4. Generate ``batches_to_gen`` batches, undo the preprocessing and save
         them to ``synthetic_path / f"selected{ts_index}.npy"``.

    Configuration is read from module-level globals (``batches_to_gen``,
    ``num_epochs``, ``nz``, ``batch_size``, ``seq_len``, ``clip``, ``lr``,
    ``receptive_field_size``, ``device``, ``models_dir``). None of them is
    modified by this function.

    Args:
        time_series: object exposing ``.values`` that can be reshaped to a
            single column (presumably a pandas Series - confirm with the
            dataset loaders).
        ts_index: index of the series; used in the output file names.
        synthetic_path: directory the generated samples are written to.
    """
    # Create both output locations up front so that a long training run
    # cannot fail at the very end because a directory is missing.
    synthetic_path = Path(synthetic_path)
    synthetic_path.mkdir(parents=True, exist_ok=True)
    Path(models_dir).mkdir(parents=True, exist_ok=True)

    # preprocessing steps according to the QuantGAN paper
    # (the raw values are used directly; no return / log-return transform is applied)
    values = time_series.values.reshape(-1, 1)
    scaler_in = StandardScaler()
    scaler_out = StandardScaler()
    gaussianize = Gaussianize()
    preprocessed = scaler_out.fit_transform(
        gaussianize.fit_transform(scaler_in.fit_transform(values))
    )

    # defining models and optimizers
    generator = QuantGAN_Generator().to(device)
    discriminator = QuantGAN_Discriminator(seq_len).to(device)
    disc_optimizer = torch.optim.RMSprop(discriminator.parameters(), lr=lr)
    gen_optimizer = torch.optim.RMSprop(generator.parameters(), lr=lr)

    # data preparing
    dataset = Loader32(preprocessed, receptive_field_size)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    # Stay None when no batch is produced, so reporting and cleanup below
    # cannot hit an unbound name.
    disc_loss = gen_loss = None

    progress = tqdm(range(num_epochs))
    for _epoch in progress:
        for idx, data in enumerate(dataloader, 0):

            discriminator.zero_grad()
            real = data.to(device)
            # Local names on purpose: rebinding the global ``batch_size`` here
            # would make the next call build its DataLoader with the size of
            # this run's last (usually partial) batch.
            cur_batch, cur_len = real.size(0), real.size(2)
            noise = torch.randn(cur_batch, nz, cur_len, device=device)
            fake = generator(noise).detach()
            disc_loss = - torch.mean(discriminator(real)) + torch.mean(discriminator(fake))
            disc_loss.backward()
            disc_optimizer.step()

            # weight clipping keeps the discriminator parameters bounded
            for dp in discriminator.parameters():
                dp.data.clamp_(-clip, clip)

            # the generator is updated once per 5 discriminator updates
            if idx % 5 == 0:
                generator.zero_grad()
                gen_loss = - torch.mean(discriminator(generator(noise)))
                gen_loss.backward()
                gen_optimizer.step()
        if disc_loss is not None and gen_loss is not None:
            progress.set_description('Discriminator Loss: %.8f Generator Loss: %.8f' % (disc_loss.item(), gen_loss.item()))

    # saving model
    # NOTE(review): this pickles the whole module (not a state_dict), and the
    # file name depends only on ts_index, so runs on different datasets with
    # the same index overwrite each other. Kept as-is for compatibility.
    torch.save(generator, models_dir /  f'QuantGAN_generator_selected{ts_index}.pth')

    # generating synthetic time series
    generator.eval()
    ys = []
    for _ in range(batches_to_gen):
        with torch.no_grad():
            # same noise layout as in training, taken from the configuration
            noise = torch.randn(batch_size, nz, receptive_field_size, device=device)
            y = generator(noise).cpu().detach().squeeze()

        # re-standardize the generated batch, then undo the preprocessing
        # in reverse order
        y = (y - y.mean(axis=0)) / y.std(axis=0)
        y = scaler_out.inverse_transform(y)
        y = np.array([gaussianize.inverse_transform(np.expand_dims(x, 1)) for x in y]).squeeze()
        y = scaler_in.inverse_transform(y)

        # some basic filtering to reduce the tendency of GAN to produce extreme returns
        # (currently disabled)
        # y = y[(y.max(axis=1) <= 2 * values.max()) & (y.min(axis=1) >= 2 * values.min())]
        # y -= y.mean()
        ys.append(y)

    # np.vstack is the non-deprecated spelling of np.row_stack
    np.save(synthetic_path / f"selected{ts_index}.npy", np.vstack(ys))

    # Drop the references to GPU-resident objects before asking the caching
    # allocator to release its unused memory.
    del discriminator, generator, disc_optimizer, gen_optimizer
    del disc_loss, gen_loss, dataloader, dataset
    torch.cuda.empty_cache()

In [6]:
# Train a QuantGAN per series of the Huge Stock Market dataset
# (uses the current global training configuration).
ts_iterator = get_hsm_dataset(hsm_dataset_path, selected_files=hsm_dataset_path / "selected100.csv")
synthetic_path = hsm_dataset_path / "synthetic/QuantGAN/"

# Index of the first series to process; earlier series are skipped
# (presumably because they were handled in a previous run).
start_point = 100
# Skip with a default value: if start_point exceeds the number of series the
# loop below simply has nothing to do instead of the cell dying with StopIteration.
for _ in range(start_point):
    next(ts_iterator, None)

for ts_index, time_series in enumerate(ts_iterator, start_point):
    print(f"Time Series #{ts_index}")
    model_routine(time_series, ts_index, synthetic_path)

In [6]:
# Train a QuantGAN per series of the solar energy dataset.
# NOTE: this rebinds the global num_epochs read by model_routine.
num_epochs = 4
ts_iterator = get_solar_energy_dataset(solar_energy_dataset_path)
synthetic_path = solar_energy_dataset_path / "synthetic/QuantGAN/"

# Index of the first series to process; earlier series are skipped.
start_point = 0
# Skip with a default value: if start_point exceeds the number of series the
# loop below simply has nothing to do instead of the cell dying with StopIteration.
for _ in range(start_point):
    next(ts_iterator, None)

for ts_index, time_series in enumerate(ts_iterator, start_point):
    print(f"Time Series #{ts_index}")
    model_routine(time_series, ts_index, synthetic_path)

Time Series #0


Discriminator Loss: 0.00000340 Generator Loss: -0.49755979: 100%|██████████| 4/4 [00:09<00:00,  2.49s/it] 


Time Series #1


Discriminator Loss: -0.00000247 Generator Loss: -0.49791011: 100%|██████████| 4/4 [00:03<00:00,  1.27it/s]


Time Series #2


Discriminator Loss: -0.00002259 Generator Loss: -0.49741653: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]


Time Series #3


Discriminator Loss: -0.00003412 Generator Loss: -0.49763224: 100%|██████████| 4/4 [00:04<00:00,  1.22s/it]


Time Series #4


Discriminator Loss: 0.00001574 Generator Loss: -0.50241703: 100%|██████████| 4/4 [00:10<00:00,  2.55s/it] 


Time Series #5


Discriminator Loss: 0.00008953 Generator Loss: -0.50079763: 100%|██████████| 4/4 [00:55<00:00, 13.87s/it]


Time Series #6


Discriminator Loss: 0.00010329 Generator Loss: -0.49743944: 100%|██████████| 4/4 [00:44<00:00, 11.22s/it] 


Time Series #7


Discriminator Loss: -0.00026551 Generator Loss: -0.49971384: 100%|██████████| 4/4 [00:45<00:00, 11.43s/it]


Time Series #8


Discriminator Loss: 0.00007766 Generator Loss: -0.50215000: 100%|██████████| 4/4 [00:45<00:00, 11.47s/it] 


Time Series #9


Discriminator Loss: 0.00013888 Generator Loss: -0.49666637: 100%|██████████| 4/4 [00:46<00:00, 11.56s/it]


Time Series #10


Discriminator Loss: -0.00019023 Generator Loss: -0.49651274: 100%|██████████| 4/4 [00:47<00:00, 11.98s/it]


Time Series #11


Discriminator Loss: 0.00009704 Generator Loss: -0.49957937: 100%|██████████| 4/4 [00:45<00:00, 11.44s/it] 


Time Series #12


Discriminator Loss: -0.00009269 Generator Loss: -0.50041807: 100%|██████████| 4/4 [00:45<00:00, 11.39s/it]


Time Series #13


Discriminator Loss: -0.00000060 Generator Loss: -0.49774498: 100%|██████████| 4/4 [00:45<00:00, 11.40s/it]


Time Series #14


Discriminator Loss: 0.00048095 Generator Loss: -0.50217146: 100%|██████████| 4/4 [00:48<00:00, 12.03s/it] 


Time Series #15


Discriminator Loss: 0.00029090 Generator Loss: -0.49911612: 100%|██████████| 4/4 [00:55<00:00, 13.79s/it] 


Time Series #16


Discriminator Loss: 0.00006634 Generator Loss: -0.49806985: 100%|██████████| 4/4 [00:48<00:00, 12.06s/it] 


Time Series #17


Discriminator Loss: 0.00052065 Generator Loss: -0.50203645: 100%|██████████| 4/4 [00:49<00:00, 12.40s/it] 


Time Series #18


Discriminator Loss: 0.00005841 Generator Loss: -0.49957877: 100%|██████████| 4/4 [00:47<00:00, 11.94s/it] 


Time Series #19


Discriminator Loss: -0.00009289 Generator Loss: -0.49923712: 100%|██████████| 4/4 [00:46<00:00, 11.52s/it]


Time Series #20


Discriminator Loss: -0.00031963 Generator Loss: -0.49999103: 100%|██████████| 4/4 [00:46<00:00, 11.53s/it]


Time Series #21


Discriminator Loss: -0.00095630 Generator Loss: -0.49877414: 100%|██████████| 4/4 [00:46<00:00, 11.58s/it]


Time Series #22


Discriminator Loss: -0.00109467 Generator Loss: -0.49757618: 100%|██████████| 4/4 [00:45<00:00, 11.49s/it]


Time Series #23


Discriminator Loss: -0.00135541 Generator Loss: -0.49720734: 100%|██████████| 4/4 [00:46<00:00, 11.73s/it]


Time Series #24


Discriminator Loss: -0.00045007 Generator Loss: -0.49698269: 100%|██████████| 4/4 [00:46<00:00, 11.61s/it]


Time Series #25


Discriminator Loss: 0.00017413 Generator Loss: -0.49993762: 100%|██████████| 4/4 [00:45<00:00, 11.49s/it] 


Time Series #26


Discriminator Loss: 0.00021067 Generator Loss: -0.49998456: 100%|██████████| 4/4 [00:46<00:00, 11.57s/it] 


Time Series #27


Discriminator Loss: 0.00055170 Generator Loss: -0.50052398: 100%|██████████| 4/4 [00:46<00:00, 11.58s/it]


Time Series #28


Discriminator Loss: 0.00018197 Generator Loss: -0.49560308: 100%|██████████| 4/4 [00:46<00:00, 11.54s/it] 


Time Series #29


Discriminator Loss: 0.00001577 Generator Loss: -0.49671698: 100%|██████████| 4/4 [00:46<00:00, 11.66s/it] 


Time Series #30


Discriminator Loss: 0.00014225 Generator Loss: -0.50016987: 100%|██████████| 4/4 [00:46<00:00, 11.60s/it]


Time Series #31


Discriminator Loss: 0.00022626 Generator Loss: -0.50237024: 100%|██████████| 4/4 [00:46<00:00, 11.51s/it] 


Time Series #32


Discriminator Loss: 0.00004840 Generator Loss: -0.49929780: 100%|██████████| 4/4 [00:46<00:00, 11.51s/it] 


Time Series #33


Discriminator Loss: 0.00019330 Generator Loss: -0.50210810: 100%|██████████| 4/4 [00:46<00:00, 11.57s/it] 


Time Series #34


Discriminator Loss: 0.00030315 Generator Loss: -0.50123662: 100%|██████████| 4/4 [00:46<00:00, 11.58s/it] 


Time Series #35


Discriminator Loss: 0.00005788 Generator Loss: -0.49786600: 100%|██████████| 4/4 [00:45<00:00, 11.47s/it] 


Time Series #36


Discriminator Loss: 0.00008249 Generator Loss: -0.50003004: 100%|██████████| 4/4 [00:46<00:00, 11.51s/it]


Time Series #37


Discriminator Loss: 0.00020587 Generator Loss: -0.50175810: 100%|██████████| 4/4 [00:46<00:00, 11.52s/it] 


Time Series #38


Discriminator Loss: -0.00029317 Generator Loss: -0.49961543: 100%|██████████| 4/4 [00:46<00:00, 11.55s/it]


Time Series #39


Discriminator Loss: -0.00005576 Generator Loss: -0.49910176: 100%|██████████| 4/4 [00:46<00:00, 11.59s/it]


Time Series #40


Discriminator Loss: -0.00018802 Generator Loss: -0.49747986: 100%|██████████| 4/4 [00:47<00:00, 11.79s/it]


Time Series #41


Discriminator Loss: 0.00011683 Generator Loss: -0.50045413: 100%|██████████| 4/4 [00:46<00:00, 11.65s/it] 


Time Series #42


Discriminator Loss: 0.00047982 Generator Loss: -0.50229728: 100%|██████████| 4/4 [00:48<00:00, 12.18s/it] 


Time Series #43


Discriminator Loss: 0.00025398 Generator Loss: -0.50036353: 100%|██████████| 4/4 [00:48<00:00, 12.07s/it] 


Time Series #44


Discriminator Loss: 0.00045741 Generator Loss: -0.50209105: 100%|██████████| 4/4 [00:48<00:00, 12.11s/it] 


Time Series #45


Discriminator Loss: 0.00024328 Generator Loss: -0.49937677: 100%|██████████| 4/4 [00:47<00:00, 11.98s/it] 


Time Series #46


Discriminator Loss: 0.00001580 Generator Loss: -0.49796569: 100%|██████████| 4/4 [00:50<00:00, 12.60s/it]


Time Series #47


Discriminator Loss: -0.00004968 Generator Loss: -0.49977753: 100%|██████████| 4/4 [00:48<00:00, 12.05s/it]


Time Series #48


Discriminator Loss: 0.00009620 Generator Loss: -0.50028276: 100%|██████████| 4/4 [00:52<00:00, 13.15s/it] 


Time Series #49


Discriminator Loss: -0.00003392 Generator Loss: -0.49791306: 100%|██████████| 4/4 [00:48<00:00, 12.08s/it]


Timing: roughly 45 s per time series of ~3k points.

Timing: roughly 6 min per time series of ~10k points.

In [8]:
# Train a QuantGAN per series of the fuel prices dataset.
# NOTE: this rebinds the global num_epochs read by model_routine.
num_epochs = 10
ts_iterator = get_fuel_prices_dataset(fuel_prices_dataset_path)
synthetic_path = fuel_prices_dataset_path / "synthetic/QuantGAN/"

# Index of the first series to process; earlier series are skipped
# (presumably because they were handled in a previous run).
start_point = 8
# Skip with a default value: if start_point exceeds the number of series the
# loop below simply has nothing to do instead of the cell dying with StopIteration.
for _ in range(start_point):
    next(ts_iterator, None)

for ts_index, time_series in enumerate(ts_iterator, start_point):
    print(f"Time Series #{ts_index}")
    model_routine(time_series, ts_index, synthetic_path)

In [9]:
# Train a QuantGAN per series of the air passengers dataset.
# NOTE: this rebinds the global num_epochs read by model_routine.
num_epochs = 4
ts_iterator = get_passengers_dataset(passengers_dataset_path, max_results=99)
synthetic_path = passengers_dataset_path / "synthetic/QuantGAN/"

# Index of the first series to process; earlier series are skipped
# (presumably because they were handled in a previous run).
start_point = 99
# Skip with a default value: if start_point exceeds the number of series the
# loop below simply has nothing to do instead of the cell dying with StopIteration.
for _ in range(start_point):
    next(ts_iterator, None)

for ts_index, time_series in enumerate(ts_iterator, start_point):
    print(f"Time Series #{ts_index}")
    model_routine(time_series, ts_index, synthetic_path)

# Similarity

In [10]:
from tqdm import tqdm
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.special import kl_div

from utils.data import get_hsm_dataset, get_solar_energy_dataset, get_fuel_prices_dataset, get_passengers_dataset, split_data, log_returns
from utils.synth_eval import eval_sim

In [11]:
# Directory handed to eval_sim as results_dir.
results_dir = Path("results")
seq_len = 127

# Normalise the dataset locations to Path objects
# (leaves them unchanged in value when they already are Paths).
hsm_dataset_path = Path(hsm_dataset_path)
solar_energy_dataset_path = Path(solar_energy_dataset_path)
fuel_prices_dataset_path = Path(fuel_prices_dataset_path)
passengers_dataset_path = Path(passengers_dataset_path)

In [12]:
# Short dataset labels and their locations, in matching order.
dataset_names = ("hsm", "se", "fp", "ap")
dataset_paths = (
    hsm_dataset_path,
    solar_energy_dataset_path,
    fuel_prices_dataset_path,
    passengers_dataset_path,
)

# Kept as the last expression so the notebook displays the returned metrics.
eval_sim(dataset_names, dataset_paths, "QuantGAN", save=True, results_dir=results_dir)

processing hsm dataset


100it [00:14,  6.91it/s]


processing se dataset


  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
10it [00:09,  1.11it/s]


processing fp dataset


8it [00:00, 13.43it/s]


processing ap dataset


  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
99it [00:45,  2.16it/s]


defaultdict(dict,
            {'hsm': {'kl_div': inf, 'kstest_pval': 3.703076513218439e-31},
             'se': {'kl_div': nan, 'kstest_pval': 1.3039956366633642e-20},
             'fp': {'kl_div': inf, 'kstest_pval': 1.3813869469290808e-75},
             'ap': {'kl_div': nan, 'kstest_pval': 8.354717247922456e-35}})