In [70]:
# import os

# from google.colab import drive

# drive.mount('/content/gdrive', force_remount=True)

# %cd "/content/gdrive/My Drive/Colab Notebooks"

!pip install -r requirement.txt


In [1]:
import os
import torch
import argparse
import itertools
import matplotlib.pyplot as plt
import numpy as np
from typing import Optional

from evaluate import evaluate_generator
from lib.networks import get_generator, get_discriminator
from lib.utils import to_numpy, load_obj
from lib.augmentations import parse_augmentations
from lib.test_metrics import get_standard_test_metrics
from lib.trainers.sig_wgan import compute_expected_signature

from lib.trainers.wgan import WGANTrainer
from lib.trainers.sig_wgan import SigWGANTrainer
# from utils.trainer import SigWGANTrainer
from utils.plot import plot_signature, plot_test_metrics
from utils.datasets import get_dataset, train_test_split
from utils.utils import set_seed, save_obj, get_experiment_dir, get_sigwgan_experiment_dir, get_config_path, \
    plot_individual_data

os.environ['PYTHONHASHSEED'] = "0"


In [None]:
%cd "./FinanceSigWGANBase"

In [4]:
def main(
        data_config: dict,
        dataset: str,
        experiment_dir: str,
        gan_algo: str,
        gan_config: dict,
        generator_config: dict,
        device: str = 'cpu',
        discriminator_config: Optional[dict] = None,
        seed: Optional[int] = 0
):
    """
    Full training procedure.

    Loads the dataset, builds the generator / discriminator / trainer, fits the
    GAN and stores data snapshots, loss plots and generated samples in
    ``experiment_dir``. Finally ``evaluate_generator`` is run on that directory.

    Args:
        data_config: Dataset configuration; must contain the key ``"n_lags"``.
        dataset: Dataset name forwarded to ``get_dataset``.
        experiment_dir: Directory that receives all artefacts (created if missing).
        gan_algo: Either ``'SigWGAN'`` or ``'WGAN'``.
        gan_config: Keyword arguments forwarded to the trainer constructor.
        generator_config: Keyword arguments for ``get_generator``; ``output_dim``
            is filled in here from the data.
        device: Device string used for the data, the generator and training.
        discriminator_config: Keyword arguments for ``get_discriminator``;
            ``input_dim`` is filled in here from the data. Required.
        seed: Seed passed to ``set_seed`` before the split, before building the
            generator and before training.

    Raises:
        NotImplementedError: If ``gan_algo`` is not one of the supported names.
        ValueError: If ``discriminator_config`` is ``None``.
    """
    # Fail fast on bad arguments, before the dataset is loaded.
    if gan_algo not in ('SigWGAN', 'WGAN'):
        raise NotImplementedError('Unknown gan_algo: %r' % (gan_algo,))
    if discriminator_config is None:
        raise ValueError('discriminator_config is required for gan_algo=%r' % (gan_algo,))

    n_lags = data_config["n_lags"]

    # Get / prepare dataset
    x_real_rolled = get_dataset(dataset, data_config)
    x_real_rolled = x_real_rolled.to(device)
    set_seed(seed)
    print('Total data: ', list(x_real_rolled.shape))
    x_real_train, x_real_test = train_test_split(x_real_rolled, train_test_ratio=0.8)
    x_real_dim: int = x_real_rolled.shape[2]

    # Compute test metrics for train and test set
    test_metrics_train = get_standard_test_metrics(x_real_train)
    test_metrics_test = get_standard_test_metrics(x_real_test)

    # Get generator. Work on copies of the configs so the caller's dicts are
    # not modified behind its back.
    set_seed(seed)
    generator_config = dict(generator_config, output_dim=x_real_dim)
    G = get_generator(**generator_config).to(device)

    # Get GAN
    discriminator_config = dict(discriminator_config, input_dim=x_real_dim * n_lags)
    D = get_discriminator(**discriminator_config)
    if gan_algo == 'SigWGAN':
        print("get_discriminator done")
        trainer = SigWGANTrainer(D, G,
                                 x_real_rolled=x_real_rolled,
                                 test_metrics_train=test_metrics_train,
                                 test_metrics_test=test_metrics_test,
                                 **gan_config
                                 )
    else:  # 'WGAN' (validated above)
        trainer = WGANTrainer(D, G,
                              x_real=x_real_rolled,
                              test_metrics_train=test_metrics_train,
                              test_metrics_test=test_metrics_test,
                              **gan_config
                              )

    # Start training
    set_seed(seed)
    trainer.fit(device=device)

    # Store relevant training results
    os.makedirs(experiment_dir, exist_ok=True)
    save_obj(to_numpy(x_real_rolled), os.path.join(experiment_dir, 'x_real_rolled.pkl'))
    save_obj(to_numpy(x_real_test), os.path.join(experiment_dir, 'x_real_test.pkl'))
    save_obj(to_numpy(x_real_train), os.path.join(experiment_dir, 'x_real_train.pkl'))
    save_obj(trainer.losses_history, os.path.join(experiment_dir, 'losses_history.pkl'))  # dev of losses / metrics
    save_obj(trainer.G.state_dict(), os.path.join(experiment_dir, 'generator_state_dict.pt'))
    save_obj(generator_config, os.path.join(experiment_dir, 'generator_config.pkl'))

    loss_history = os.path.join(experiment_dir, 'LossHistory')
    os.makedirs(loss_history, exist_ok=True)

    if gan_algo == 'SigWGAN':
        plt.plot(trainer.losses_history['D_loss'])
        plt.plot(trainer.losses_history['G_loss'])
        plt.yscale('log')
        plt.savefig(os.path.join(loss_history, 'sigwgan_loss.png'))
        plt.close()
    else:
        d_loss_fake = trainer.losses_history['D_loss_fake']
        d_loss_real = trainer.losses_history['D_loss_real']
        plt.plot(d_loss_fake)
        plt.plot(d_loss_real)
        # Third curve: sum of the real and fake discriminator losses.
        plt.plot(np.array(d_loss_real) + np.array(d_loss_fake))
        plt.savefig(os.path.join(loss_history, 'wgan_loss.png'))
        plt.close()

    plot_test_metrics(trainer.test_metrics_train, trainer.losses_history, 'train', locate_dir=loss_history)
    # The 'test' plot previously received the train metrics as well; pass the
    # metrics that were built from the test split.
    plot_test_metrics(test_metrics_test, trainer.losses_history, 'test', locate_dir=loss_history)

    # Sample from the trained generator without tracking gradients.
    with torch.no_grad():
        x_fake = G(1024, n_lags, device)
        save_obj(x_fake, os.path.join(experiment_dir, 'x_fake.pkl'))

    # Overlay the first 250 generated paths, one colour per feature.
    for i in range(x_real_dim):
        plt.plot(to_numpy(x_fake[:250, :, i]).T, 'C%s' % i, alpha=0.1)
    plt.savefig(os.path.join(experiment_dir, 'x_fake.png'))
    plt.close()

    # Same picture for 250 randomly drawn real paths per feature.
    for i in range(x_real_dim):
        random_indices = torch.randint(0, x_real_rolled.shape[0], (250,))
        plt.plot(to_numpy(x_real_rolled[random_indices, :, i]).T, 'C%s' % i, alpha=0.1)
    plt.savefig(os.path.join(experiment_dir, 'x_real.png'))
    plt.close()

    evaluate_generator(experiment_dir, batch_size=5000,)

    if gan_algo == 'WGAN':
        save_obj(trainer.D.state_dict(), os.path.join(experiment_dir, 'discriminator_state_dict.pt'))
        # Previously the *generator* config was written under this name.
        save_obj(discriminator_config, os.path.join(experiment_dir, 'discriminator_config.pkl'))
    else:
        plot_signature(trainer.sig_w1_metric.expected_signature_mu)
        plt.savefig(os.path.join(experiment_dir, 'sig_real.png'))
        plt.close()

        # Real expected signature overlaid with the one computed from x_fake.
        plot_signature(trainer.sig_w1_metric.expected_signature_mu)
        plot_signature(compute_expected_signature(x_fake, trainer.sig_w1_metric.depth, trainer.sig_w1_metric.augmentations))
        plt.savefig(os.path.join(experiment_dir, 'sig_real_fake.png'))
        plt.close()

def benchmark_sigwgan(
        datasets=('BINANCE', 'STABLECOIN'),
        generators=('LogSigRNN', 'LSTM'),
        discriminators=('ResFNN',),
        n_seeds={"start": 0,"end": 2,"step": 1},
        device='cuda:0',
):
    """
    Benchmark for SigWGAN.

    Runs ``main`` with ``gan_algo='SigWGAN'`` once for every combination of
    dataset, discriminator, generator and seed.

    Args:
        datasets: Dataset names; each is resolved with ``get_config_path('', name)``.
        generators: Generator names; the config ``'gen_' + name`` is loaded for each.
        discriminators: Discriminator config names.
        n_seeds: Dict with keys ``"start"``, ``"end"`` and ``"step"`` that are
            passed to ``range`` to build the seeds. Only read, never modified.
        device: Device string forwarded to ``main``.
    """
    seeds = list(range(n_seeds["start"], n_seeds["end"], n_seeds["step"]))

    grid = itertools.product(datasets, discriminators, generators, seeds)

    for dataset, discriminator, generator, seed in grid:
        print(f"data:{dataset}, G:{generator}, D:{discriminator}, seed:{seed}")

        data_config = load_obj(get_config_path('', dataset))
        discriminator_config = load_obj(get_config_path('Discriminator', discriminator))
        gan_config = load_obj(get_config_path('Trainer', 'trainer_SigWGAN'))
        generator_config = load_obj(get_config_path('Generator', 'gen_' + generator))

        # NOTE(review): the discriminator name is not part of this directory, so
        # runs that differ only in the discriminator share one output folder.
        experiment_dir = get_sigwgan_experiment_dir(dataset, generator, 'SigWGAN', seed)

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(experiment_dir, exist_ok=True)

        # JSON snapshots are written before the augmentation specs are replaced
        # by the objects returned from parse_augmentations.
        # NOTE(review): the trainer config is stored as 'gen_config.*' here while
        # benchmark_wgan uses 'gan_config.pkl' - confirm which name readers expect.
        save_obj(data_config, os.path.join(experiment_dir, 'data_config.json'))
        save_obj(gan_config, os.path.join(experiment_dir, 'gen_config.json'))
        save_obj(generator_config, os.path.join(experiment_dir, 'generator_config.json'))

        if gan_config.get('augmentations') is not None:
            gan_config['augmentations'] = parse_augmentations(gan_config.get('augmentations'))

        if generator_config.get('augmentations') is not None:
            generator_config['augmentations'] = parse_augmentations(generator_config.get('augmentations'))

        # The LogSigRNN generator config additionally gets the window length.
        if generator_config['generator_type'] == 'LogSigRNN':
            generator_config['n_lags'] = data_config['n_lags']

        save_obj(data_config, os.path.join(experiment_dir, 'data_config.pkl'))
        save_obj(discriminator_config, os.path.join(experiment_dir, 'discriminator_config.pkl'))
        save_obj(gan_config, os.path.join(experiment_dir, 'gen_config.pkl'))
        save_obj(generator_config, os.path.join(experiment_dir, 'generator_config.pkl'))

        # Show the last two path components regardless of the separator in use.
        print('Training: %s' % os.path.normpath(experiment_dir).split(os.sep)[-2:])

        main(
            dataset=dataset,
            data_config=data_config,
            device=device,
            experiment_dir=experiment_dir,
            gan_algo='SigWGAN',
            seed=seed,
            discriminator_config=discriminator_config,
            gan_config=gan_config,
            generator_config=generator_config,
        )

def benchmark_wgan(
    datasets=('BINANCE', 'STABLECOIN'),
    generators=('NSDE', 'LSTM'),
    discriminators=('ResFNN',),
    n_seeds={"start": 0,"end": 2,"step": 1},
    device='cuda:0',
):
    """
    Benchmark for WGAN.

    Runs ``main`` with ``gan_algo='WGAN'`` once for every combination of
    dataset, discriminator, generator and seed.

    Args:
        datasets: Dataset names; each is resolved with ``get_config_path('', name)``.
        generators: Generator names; the config ``'gen_' + name`` is loaded for each.
        discriminators: Discriminator config names.
        n_seeds: Dict with keys ``"start"``, ``"end"`` and ``"step"`` that are
            passed to ``range`` to build the seeds. Only read, never modified.
        device: Device string forwarded to ``main``.
    """
    seeds = list(range(n_seeds["start"], n_seeds["end"], n_seeds["step"]))

    grid = itertools.product(datasets, discriminators, generators, seeds)

    for dataset, discriminator, generator, seed in grid:
        print(f"data:{dataset}, G:{generator}, D:{discriminator}, seed:{seed}")

        data_config = load_obj(get_config_path('', dataset))
        discriminator_config = load_obj(get_config_path('Discriminator', discriminator))
        gan_config = load_obj(get_config_path('Trainer', 'trainer_WGAN'))
        generator_config = load_obj(get_config_path('Generator', 'gen_' + generator))

        experiment_dir = get_experiment_dir(dataset, generator, discriminator, 'WGAN', seed)

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(experiment_dir, exist_ok=True)

        # Replace the augmentation specs by the objects parse_augmentations returns.
        if gan_config.get('augmentations') is not None:
            gan_config['augmentations'] = parse_augmentations(gan_config.get('augmentations'))

        if generator_config.get('augmentations') is not None:
            generator_config['augmentations'] = parse_augmentations(generator_config.get('augmentations'))

        # The LogSigRNN generator config additionally gets the window length.
        if generator_config['generator_type'] == 'LogSigRNN':
            generator_config['n_lags'] = data_config['n_lags']

        save_obj(data_config, os.path.join(experiment_dir, 'data_config.pkl'))
        save_obj(discriminator_config, os.path.join(experiment_dir, 'discriminator_config.pkl'))
        save_obj(gan_config, os.path.join(experiment_dir, 'gan_config.pkl'))
        save_obj(generator_config, os.path.join(experiment_dir, 'generator_config.pkl'))

        # Show the last two path components regardless of the separator in use.
        print('Training: %s' % os.path.normpath(experiment_dir).split(os.sep)[-2:])

        main(
            dataset=dataset,
            data_config=data_config,
            device=device,
            experiment_dir=experiment_dir,
            gan_algo='WGAN',
            seed=seed,
            discriminator_config=discriminator_config,
            gan_config=gan_config,
            generator_config=generator_config
        )

In [5]:
# Train on the first GPU when CUDA is available, otherwise fall back to the CPU.
compute_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Experiment grid for this run.
# Datasets tried in earlier runs: 'BINANCE', 'WrappedBitcoin', 'STABLECOIN', 'Uniswap'
# (or everything returned by os.listdir('./datasets')).
target_dataset = ('MyBinance', )
# Other generator choices: ('LSTM',) or ('LogSigRNN', 'LSTM').
training_generators = ('LogSigRNN',)
# Other discriminator choice: ('LSTM',).
training_discriminators = ('ResFNN',)

# Seeds are built as range(start, end, step); these values give the single seed 0.
n_seeds = dict(start=0, end=1, step=5)

# SigWGAN variant of the same run (currently disabled):
# benchmark_sigwgan(datasets=target_dataset,
#                   generators=training_generators,
#                   n_seeds=n_seeds,
#                   device=compute_device)

benchmark_wgan(
    datasets=target_dataset,
    generators=training_generators,
    discriminators=training_discriminators,
    n_seeds=n_seeds,
    device=compute_device,
)


data:MyBinance, G:LogSigRNN, D:ResFNN, seed:0
Training: ['MyBinance', 'WGAN_LogSigRNN_ResFNN_0']
Use data: 
	datasets\MyBinance\BNBUSDT_1h.csv
	datasets\MyBinance\BTCUSDT_1h.csv
	datasets\MyBinance\ETHUSDT_1h.csv
	Rolled data for training, shape [22422, 72, 3]
	Example : tensor([[0.0000, 0.0000, 0.0000],
        [0.0082, 0.0018, 0.0029],
        [0.0109, 0.0040, 0.0149]])
Total data:  [22422, 72, 3]


  0%|          | 0/500 [00:01<?, ?it/s]


RuntimeError: grad can be implicitly created only for scalar outputs

In [10]:
import pandas as pd

# Forward slashes work on Windows and POSIX alike. The previous path used "\d",
# "\S" and "\B" inside a normal (non-raw) string, which are invalid escape
# sequences and only worked on Windows.
df_raw = pd.read_csv("./datasets/STABLECOIN/BUSD_USD_decimals_8_45000000.csv")

# Keep only the last row for each distinct value of the first column; the raw
# frame stays available as df_raw so the cell can be re-run safely.
df = df_raw.drop_duplicates(subset=[df_raw.columns[0]], keep='last')

df

Unnamed: 0,lastUpdate,answers
20,1670965483,99982000
48,1671034952,99990140
74,1671121381,99997000
99,1671207800,99999837
121,1671294207,99994850
...,...,...
1558,1676479575,99986000
1590,1676566002,99983000
1622,1676652417,100000000
1655,1676738845,99988895


`apply_augmentations()` works fine at lines 63 and 89 of sig_wgan.py.


110         t = get_time_vector(x.shape[0], x.shape[1]).to(x.device)
111         return torch.cat([t, x], dim=-1)

IndexError: tuple index out of range

Something goes wrong around line 91 (`self.D_trainstep()`) in sig_wgan.py.

# Read .pkl of real & fake data

In [None]:
import numpy as np
import pickle

In [89]:
def get_stats(path):
    """
    Compute per-sample covariance and correlation matrices from a pickled batch.

    The pickle at ``path`` must hold a 3-D array or ``torch.Tensor``; each entry
    along the first axis is treated as one sample whose rows are observations
    and whose columns are variables (``rowvar=False``).

    Args:
        path: Path of the pickle file to read.

    Returns:
        Tuple ``(covars, correlations)`` of numpy arrays, each stacking one
        matrix per sample along axis 0.

    Raises:
        ValueError: If the loaded data is not 3-dimensional or has no samples.
    """
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file - only use this on trusted experiment artefacts.
    with open(path, "rb") as f:
        data = pickle.load(f)

    if isinstance(data, torch.Tensor):
        # detach() first: numpy() refuses tensors that still require grad.
        data = data.detach().cpu().numpy()
    data = np.asarray(data)

    if data.ndim != 3 or data.shape[0] == 0:
        raise ValueError(
            "expected a non-empty 3-D array of samples, got shape %s" % (data.shape,)
        )

    covars = [np.cov(sample, rowvar=False) for sample in data]
    correlations = [np.corrcoef(sample, rowvar=False) for sample in data]

    return np.stack(covars, axis=0), np.stack(correlations, axis=0)

In [90]:
# Build the paths with os.path.join instead of hard-coded "\\" separators so the
# cell also runs outside Windows, and name the run directory only once.
results_dir = os.path.join("numerical_results", "MyBinance", "SigWGAN_LogSigRNN_0")

fake_covars, fake_correlations = get_stats(os.path.join(results_dir, "x_fake.pkl"))
train_covars, train_correlations = get_stats(os.path.join(results_dir, "x_real_train.pkl"))
test_covars, test_correlations = get_stats(os.path.join(results_dir, "x_real_test.pkl"))

In [93]:
# Sanity check: one matrix per sample in each split (3x3 in the recorded output).
test_covars.shape, train_covars.shape, fake_covars.shape

((4485, 3, 3), (17937, 3, 3), (1024, 3, 3))

In [107]:
# Average the per-sample matrices over the sample axis (axis 0) for each split.
fake_m_covars = fake_covars.mean(axis=0)
train_m_covars = train_covars.mean(axis=0)
test_m_covars = test_covars.mean(axis=0)

fake_m_correlations = fake_correlations.mean(axis=0)
train_m_correlations = train_correlations.mean(axis=0)
test_m_correlations = test_correlations.mean(axis=0)

In [108]:
# In the recorded output the generated paths have far larger variances
# (diagonal ~0.04-0.09) than the real train/test data (~0.0007-0.0017), and some
# of their off-diagonal covariances are negative while the real ones are positive.
fake_m_covars, train_m_covars, test_m_covars

(array([[ 0.08632594, -0.02460503, -0.00660056],
        [-0.02460503,  0.07985577,  0.00586548],
        [-0.00660056,  0.00586548,  0.04321004]]),
 array([[0.00158084, 0.00067485, 0.00087596],
        [0.00067485, 0.00067959, 0.00072099],
        [0.00087596, 0.00072099, 0.00112274]]),
 array([[0.00171021, 0.0006745 , 0.00088972],
        [0.0006745 , 0.00067827, 0.00072271],
        [0.00088972, 0.00072271, 0.00112435]]))

In [109]:
# In the recorded output the real series are strongly positively correlated
# (~0.74-0.85 in both splits), whereas the generated paths only show weak
# correlations (about -0.25 to 0.10) - the generator did not reproduce them.
fake_m_correlations, train_m_correlations, test_m_correlations

(array([[ 1.        , -0.24942438, -0.11977426],
        [-0.24942438,  1.        ,  0.09607904],
        [-0.11977426,  0.09607904,  1.        ]]),
 array([[1.        , 0.74356298, 0.76654193],
        [0.74356298, 1.        , 0.84921931],
        [0.76654193, 0.84921931, 1.        ]]),
 array([[1.        , 0.73736343, 0.75998161],
        [0.73736343, 1.        , 0.84859643],
        [0.75998161, 0.84859643, 1.        ]]))