In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from tqdm import tqdm
from pathlib import Path

from utils.data import get_dataset_iterator

import torch
from torch import nn

from utils.dl import QuantGAN_TemporalBlock

# Root folders: raw datasets live under `data_path`, checkpoints under `models_dir`.
data_path = Path("data")
models_dir = Path("models")

# One sub-folder of `data_path` per dataset.
hsm_dataset_path = data_path.joinpath("huge_stock_market_dataset")
solar_energy_dataset_path = data_path.joinpath("solar_energy")
fuel_prices_dataset_path = data_path.joinpath("fuel_prices")
passengers_dataset_path = data_path.joinpath("air_passengers")

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [2]:
# 3rd version (5, 256, 32) + high freq preprocessing + new gen
def is_high_freq(time_series, threshold=0.85, rolling_parts=200):
    """Tell whether smoothing removes most of the series' standard deviation.

    The series is smoothed with a trailing moving average whose window is
    ``len(time_series) // rolling_parts`` samples. The series is reported as
    high-frequency when the relative change of the standard deviation,
    ``abs(ma_std - orig_std) / orig_std``, exceeds ``threshold``.

    Parameters
    ----------
    time_series : pandas.DataFrame or pandas.Series
        The series to inspect. For a DataFrame only the first column is used.
    threshold : float
        Relative change of the standard deviation above which the series is
        considered high-frequency.
    rolling_parts : int
        The moving-average window is the series length divided by this value.

    Returns
    -------
    bool
        ``True`` when the series is high-frequency, ``False`` otherwise.
        Series too short for a window of at least two samples, and series with
        zero (or undefined) standard deviation, are never high-frequency.
    """
    # A Series has a scalar .std(); wrap it so the DataFrame code path below applies.
    if isinstance(time_series, pd.Series):
        time_series = time_series.to_frame()

    window = len(time_series) // rolling_parts
    # A window of 0 yields an all-NaN average and a window of 1 reproduces the
    # input, so neither can ever exceed the threshold: answer directly.
    if window < 2:
        return False

    orig_std = time_series.std().values[0]
    # Constant (std == 0) or single-sample (std is NaN) input: avoid the 0/0
    # division and its RuntimeWarning; the comparison would be False anyway.
    if not orig_std > 0:
        return False

    ma_std = time_series.rolling(window).mean().std().values[0]
    return bool(abs(ma_std - orig_std) / orig_std > threshold)

def ma(time_series, rolling_parts=200, window=None):
    """Smooth a series with the mean of a backward and a forward moving average.

    Parameters
    ----------
    time_series : pandas object supporting ``rolling`` and ``[::-1]`` slicing
        The series to smooth.
    rolling_parts : int
        Used only when ``window`` is None: the window becomes
        ``max(len(time_series) // rolling_parts, 2)``.
    window : int, optional
        Explicit window size in samples.

    Returns
    -------
    Same pandas type as the input
        Element-wise mean of the two averages. Where one of them is undefined
        (at the edges) the value of the other one is used for both.
    """
    span = window if window is not None else max(len(time_series) // rolling_parts, 2)

    # Backward average: the `span` samples strictly before each position
    # (closed="left" leaves the current sample out).
    backward = time_series.rolling(span, closed="left").mean()

    # Forward average: roll over the reversed series, then restore the order, so
    # each position averages itself and the following `span - 1` samples.
    reversed_series = time_series[::-1]
    forward = reversed_series.rolling(span).mean()[::-1]

    # Patch the edge gaps of each average with the other one.
    backward = backward.mask(backward.isna(), forward)
    forward = forward.mask(forward.isna(), backward)

    return (backward + forward) / 2


class TimeDIffusion(nn.Module):
    """Stack of ``QuantGAN_TemporalBlock`` layers followed by a 1x1 convolution.

    The stack is one 1 -> 128 channel block with kernel size 1, then fourteen
    128 -> 128 channel blocks with kernel size 2 whose dilation (and padding)
    doubles from 1 up to 2 ** 13. A final ``Conv1d`` maps 128 channels back to 1.
    """

    def __init__(self):
        super().__init__()
        channels = 128
        dilations = [2 ** power for power in range(14)]

        # Input block first, then the dilated blocks in increasing dilation order.
        blocks = [
            QuantGAN_TemporalBlock(1, channels, kernel_size=1, stride=1,
                                   dilation=1, padding=0, dropout=0.25)
        ]
        for dilation in dilations:
            blocks.append(
                QuantGAN_TemporalBlock(channels, channels, kernel_size=2, stride=1,
                                       dilation=dilation, padding=dilation, dropout=0.0)
            )

        self.tcn = nn.ModuleList(blocks)
        self.last = nn.Conv1d(channels, 1, kernel_size=1, stride=1, dilation=1)

    def forward(self, x):
        """Run ``x`` through every block and project the result to one channel.

        Each block is unpacked as a ``(skip, output)`` pair: the output feeds
        the next block, and all skip values are summed and added to the final
        output before the last convolution.
        NOTE(review): input is presumably (batch, 1, length) -- confirm against
        QuantGAN_TemporalBlock.
        """
        skip_total = 0
        for block in self.tcn:
            skip, x = block(x)
            skip_total = skip_total + skip
        return self.last(x + skip_total)


# Training / sampling configuration.
epochs = 5                        # outer loops; each one restarts from the clean series with fresh noise
steps_per_epoch = 256             # optimisation steps per epoch
samples_to_gen = 20               # synthetic series saved per input series (split across snapshots)
steps_to_gen = {16, 32, 64, 128}  # denoising steps at which generated samples are stored
# noise_gen_threshold = 0.001
batch_size = 32

# Checkpoints are written at the end of every series; create the folder up front so
# a missing directory cannot fail the run after minutes of training.
models_dir.mkdir(parents=True, exist_ok=True)

for dataset_name, dataset_path in (
    # ("hsm", hsm_dataset_path),
    ("se", solar_energy_dataset_path),
    # ("fp", fuel_prices_dataset_path),
    # ("ap", passengers_dataset_path)
):
    ts_iterator = get_dataset_iterator(dataset_name, dataset_path)
    out_dataset_dir = dataset_path / "synthetic/TimeDiffusion"
    # The target is two levels below the dataset folder, so intermediate folders
    # must be created as well; exist_ok makes re-runs harmless.
    out_dataset_dir.mkdir(parents=True, exist_ok=True)

    # Skip the first `start_point` series (useful to resume an interrupted run).
    start_point = 0
    for _ in range(start_point):
        next(ts_iterator)

    for ts_index, time_series in enumerate(tqdm(ts_iterator), start=start_point):
        # high freq check: smooth the series before training when it is flagged
        if is_high_freq(time_series):
            time_series = ma(time_series)

        # Standardise the series; the statistics are reused to rescale the samples.
        # NOTE(review): a constant series gives tstd == 0 and therefore NaN/inf here.
        train = time_series.values.flatten()
        tmean = train.mean()
        tstd = train.std()
        train = (train - tmean) / tstd
        train_tensor = torch.from_numpy(train).float().to(device)

        # A fresh, identically seeded model is trained for every series.
        torch.random.manual_seed(0)
        model = TimeDIffusion().to(device)
        optim = torch.optim.AdamW(model.parameters(), lr=4e-4)
        losses = []
#         kl_divs = []
#         val_noise = torch.rand(20, 1, len(train)).to(device)

        for epoch in range(1, epochs + 1):
#         for epoch in tqdm(range(1, epochs + 1)):
            model.train()
            # Batch of identical copies of the series, shape (batch_size, 1, len(train)).
            X = train_tensor.repeat(batch_size, 1).unsqueeze(1)
            # Uniform noise drawn one sample at a time (kept as is so the seeded
            # random stream is unchanged), then scaled by a per-element random level.
            noise = torch.row_stack([torch.rand(1, *X.shape[1:]) for _ in range(X.shape[0])]).to(device)
            noise_level = torch.rand(X.shape).to(device)
            noise *= noise_level

            for step in range(steps_per_epoch):
                optim.zero_grad()
                y_hat = model(X + noise)
                # L1 loss between the prediction and the current noise.
                loss = (y_hat - noise).abs().mean()
                loss.backward()
                optim.step()
                # Subtract the prediction from both the batch and the noise before
                # the next step (no gradient is tracked for these updates).
                with torch.no_grad():
                    X -= y_hat
                    noise -= y_hat
                losses.append(loss.item())

        model.eval()
        result = []
        with torch.no_grad():
            generated = torch.rand(samples_to_gen // len(steps_to_gen), 1, len(train)).to(device)
            # Nothing produced after the largest requested snapshot step is stored,
            # so stop there instead of always running steps_per_epoch steps.
            last_step = min(max(steps_to_gen), steps_per_epoch)
            for step in range(1, last_step + 1):
                pred_noise = model(generated)
                generated -= pred_noise
                if step in steps_to_gen:
                    # Snapshot in the original scale of the series.
                    result.append(generated.detach().cpu().numpy().squeeze() * tstd + tmean)
        # np.vstack is the non-deprecated spelling of np.row_stack (same behaviour).
        result = np.vstack(result)
        np.save(out_dataset_dir / f"selected{ts_index}.npy", result)
        torch.save(model.state_dict(), models_dir / f"TimeDiffusion_{dataset_name}_{ts_index}.pt")
        # Release the model and its tensors before the next series.
        del model, optim, generated
             
#             steps = steps_per_epoch * 2
#             with torch.no_grad():
#                 model.eval()
#                 generated = val_noise
#                 for step in range(1, steps + 1):
#                     pred_noise = model(generated)
#                     generated -= pred_noise
#                 generated = generated.detach().cpu().numpy().squeeze()
#             kl_divs.append(np.mean([np.mean([x for x in kl_div(generated[i], train) if not np.isnan(x) and not np.isinf(x)])
#                                 for i in range(len(generated))]))
 
#         plt.plot(losses)
#         plt.show()
#         plt.plot(kl_divs)
#         print(kl_divs)
        
#         break
#     break

50it [4:08:13, 297.87s/it]


Runtime: about 5 min per time series of ~1k time steps

noise thresholds (steps): 1.4 (25/256), 0.9 (41/256), 0.68 (15/256), 0.04 (98/256), 0.55 (65/256)

In [3]:
# steps = steps_per_epoch // 2
# plot_rate = steps // 5
# with torch.no_grad():
#     model.eval()
#     noise = torch.rand(1, 1, len(train)).to(device)
#     generated = noise
#     kl_divs = []
#     pred_noises = []
#     for step in range(1, steps + 1):
#         pred_noise = model(generated)
#         generated -= pred_noise
        
#         pred_noises.append(pred_noise.sum().item())
#         kl_divs.append(kl_div(generated.detach().cpu().squeeze().numpy(), train).mean())
        
#         if step % plot_rate == 0:

#             result = generated.detach().cpu().squeeze().numpy()
#             plt.plot(train)
#             plt.plot(result)
#             plt.legend(["ground truth", "synthetic"])
#             plt.title(f"Step #{step} pred_noise: {pred_noises[- 1]:0.4f} kl_div: {kl_divs[- 1]: 0.4f}")
#             plt.show()
    
#     plt.plot(range(len(kl_divs)), kl_divs)
#     ind = np.argmin(kl_divs)
#     plt.title("kl_div " + str(ind))
#     plt.show()
#     plt.plot(range(len(pred_noises)), pred_noises)
#     plt.title("pred noises")
#     plt.show()
#     print(kl_divs[ind], pred_noises[ind])
# # plt.plot(noise.cpu().numpy().squeeze())
# # plt.plot(pred_noise.cpu().numpy().squeeze())
# # plt.legend(["noise", "pred_noise"])