# train.py, the main function of this project

In [2]:
import itertools
import os
from os import path as pt

import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.model_selection import train_test_split

from hyperparameters import SIGCWGAN_CONFIGS
from lib import ALGOS
from lib.algos.base import BaseConfig
from lib.data import get_data
from lib.plot import savefig, create_summary
from lib.utils import pickle_it

from train import *

from torch import nn
from typing import Tuple

Below is the core of `train.py`. Comments are made between the code to explain what each section does.

The main parts are:

1. `get_data()`
2. `algo.fit()`
3. Plot & Summarize

In [4]:
def _save_and_plot_paths(paths, name, p, pickle_dir, figure_dir, num_paths=250):
    """ Pickle `paths` and save a figure of randomly selected path segments.

    `paths` is written to `<pickle_dir>/<name>.torch`. Then `num_paths` indices along the
    first axis are drawn (with replacement) through `torch.randint`; for every index of the
    third axis the entries from position `p` onwards are plotted, and the figure is saved
    to `<figure_dir>/<name>.png` before the current figure is cleared.
    """
    pickle_it(paths, pt.join(pickle_dir, '%s.torch' % name))
    random_indices = torch.randint(0, paths.shape[0], (num_paths,))
    for asset_i in range(paths.shape[2]):
        segments = torch.transpose(paths[random_indices, p:, asset_i], 0, 1)
        # matplotlib cannot convert CUDA tensors, so hand it a host-side NumPy array.
        plt.plot(segments.detach().cpu().numpy(), 'C%s' % asset_i, alpha=0.1)
    plt.ylim((-0.2, 0.2))
    plt.savefig(os.path.join(figure_dir, '%s.png' % name))
    plt.clf()


def run(algo_id, base_config, base_dir, dataset, spec, data_params=None):
    """ Create the experiment directory, calibrate algorithm, store relevant parameters.

    Args:
        algo_id: identifier forwarded to `get_algo`; also the last component of the
            experiment directory.
        base_config: configuration object; `seed`, `p`, `q` and `device` are read here.
        base_dir: root directory under which results are written.
        dataset: dataset name forwarded to `get_data` / `get_algo` / `create_summary`.
        spec: label of the dataset configuration, used as a directory component.
        data_params: extra keyword arguments for `get_data` (defaults to no extras).
    """
    # A fresh dict per call: a `{}` default would be one object shared by all invocations.
    data_params = {} if data_params is None else data_params

    print('Executing: %s, %s, %s' % (algo_id, dataset, spec))
    experiment_directory = pt.join(base_dir, dataset, spec, 'seed={}'.format(base_config.seed), algo_id)
    # >>>> Create the experiment directory if it does not exist yet <<<<
    os.makedirs(experiment_directory, exist_ok=True)

    # >>>> Set seed for exact reproducibility of the experiments <<<<
    set_seed(base_config.seed)

    # >>>> Initialise dataset and algo <<<<
    x_real = get_data(dataset, base_config.p, base_config.q, **data_params)
    x_real = x_real.to(base_config.device)

    # Train/test split (80% / 20% of a random permutation of the samples).
    # The test set is compared with data produced by the generator trained on the training set.
    size_train = int(x_real.shape[0] * 0.8)
    indices = np.random.permutation(x_real.shape[0])
    train_idx, test_idx = indices[:size_train], indices[size_train:]
    x_real_train, x_real_test = x_real[train_idx], x_real[test_idx]

    algo = get_algo(algo_id, base_config, dataset, data_params, x_real_train)

    # >>>> Train the algorithm <<<<
    algo.fit()

    # >>>> Training ends here <<<<

    # >>>> Create summary <<<<
    create_summary(dataset, base_config.device, algo.G, base_config.p, base_config.q, x_real_test, experiment_directory)
    savefig('summary.png', experiment_directory)

    # >>>> Save generator weights, real paths and hyperparameters. <<<<
    # >>>> Also, graph the paths to see how different they are. <<<<
    # The path tensors are pickled one level above the algorithm directory, while the
    # figures go into the algorithm directory itself. The order (full, test, train) is
    # kept fixed so the sequence of random draws is stable.
    shared_directory = pt.dirname(experiment_directory)
    path_sets = (
        ('x_real', x_real),
        ('x_real_test', x_real_test),
        ('x_real_train', x_real_train),
    )
    for name, paths in path_sets:
        _save_and_plot_paths(paths, name, base_config.p, shared_directory, experiment_directory)

    pickle_it(algo.training_loss, pt.join(experiment_directory, 'training_loss.pkl'))
    pickle_it(algo.G.to('cpu').state_dict(), pt.join(experiment_directory, 'G_weights.torch'))

    # >>>> Log some results <<<<
    algo.plot_losses()
    savefig('losses', experiment_directory)


def main(args):
    """ Run every (dataset, algorithm, seed) combination requested in `args`.

    For each combination a `BaseConfig` is built from `args`, the seed is set, and `run`
    is called once per `(spec, data_params)` pair yielded by
    `get_dataset_configuration(dataset)`. A `./data` directory is created first if missing.
    """
    data_dir = './data'
    if not pt.exists(data_dir):
        os.mkdir(data_dir)

    print('Start of training. CUDA: %s' % args.use_cuda)

    seeds = range(args.initial_seed, args.initial_seed + args.num_seeds)
    # product() iterates datasets outermost and seeds innermost.
    for dataset, algo_id, seed in itertools.product(args.datasets, args.algos, seeds):
        print(f"dataset={dataset} / algo={algo_id} / seed={seed}")

        # Fall back to the CPU when CUDA is not requested or not available.
        if args.use_cuda and torch.cuda.is_available():
            device = 'cuda:{}'.format(args.device)
        else:
            device = 'cpu'

        base_config = BaseConfig(
            device=device,
            seed=seed,
            batch_size=args.batch_size,
            hidden_dims=args.hidden_dims,
            p=args.p,
            q=args.q,
            total_steps=args.total_steps,
            mc_samples=1000,
        )
        set_seed(seed)

        for spec, data_params in get_dataset_configuration(dataset):
            run(
                algo_id=algo_id,
                base_config=base_config,
                data_params=data_params,
                dataset=dataset,
                base_dir=args.base_dir,
                spec=spec,
            )

**To start training, run the block below**

In [None]:
import argparse
class Args(argparse.Namespace):
    """ Notebook stand-in for the command-line arguments consumed by `main`. """
    # Root directory for all experiment outputs.
    base_dir     = './numerical_results'
    # Boolean switch. It used to hold the string 'store_true' (the name of an argparse
    # action), which only worked because any non-empty string is truthy.
    use_cuda     = True
    # CUDA device index, formatted into 'cuda:<device>'.
    device       = 0
    # Seeds initial_seed, ..., initial_seed + num_seeds - 1 are run.
    num_seeds    = 1
    initial_seed = 0
    datasets     = ['BINANCE', ]
    algos        = ['CWGAN','SigCWGAN',]
    batch_size   = 200
    # p: length of the past window conditioned on; q: length of the generated window.
    p            = 24
    q            = 6
    hidden_dims  = 3 * (50,)
    total_steps  = 100

# Instantiate the notebook configuration and launch the full training sweep.
args = Args()
main(args)


# Notations

* $N$ is the total number of closing prices for each asset.
* $d$ is the number of total assets.
* $p$ is the length of past data that we are conditioning on.
* $q$ is the length of generated data.

# [1] `get_data()`

First, call `get_binance_dataset()` which reads the csv of each Binance asset and concatenates all of their closing prices into a 3D-tensor whose size is $(1,N,d)$. Here, $N$ is the total number of closing prices for each asset and $d$ is the number of total assets.

Then, the $(1,N,d)$ tensor is thrown into `zero_based_rolling_window()`. The output is a $( \, N-(p+q) \, , \, p+q \, , \, d \, )$ 3D-tensor, where $p$ is the length of past data that we are conditioning on and $q$ is the length of generated data. This is done by following this procedure:

1. Call each entry of the second dimension of the $(1,N,d)$ tensor $x_t$, so $t=0,1,\ldots,N-1$.
2. For $t=0$ to $N-(p+q)-1$ (the start of each window):
    1. First take the next $p+q$ $x_t$'s, which are $x_t,\ldots,x_{t+(p+q-1)}$.
    2. Compute $y_{s} := \dfrac{ (x_s-x_t) }{ x_t }$ for $s=t,t+1,\ldots,t+(p+q-1)$, which is the relative change of price to the price at the start of the window.
    3. Collect $y_t,y_{t+1},\ldots,y_{t+(p+q-1)}$ to form a $(1,p+q,d)$ tensor.
3. Collect all $(1,p+q,d)$ tensors to form a $(N-(p+q),p+q,d)$ tensor. Return this tensor.

# [2] `algo.fit()`

The `fit()` function is defined by the `BaseAlgo` class. When the `fit()` function is called, we enter a training loop whose number of iterations is determined by `base_config.total_steps`. This value decides how many times the generator is trained before stopping.

An iteration is called a `step()`, which is defined by `gans.py`, `gmmn.py`, `sigcwgan.py` depending on the algorithm. I will explain how `step()` works under different algorithms.

## `gans.py`

The class `GAN` whose base class is `BaseAlgo` is equipped with:
* a `ResFNN` discriminator ( `D` ) from `./lib/arfnn.py`
* a `SimpleGenerator` generator ( `G` ) from `./lib/algos/base.py`.

The pseudocode for `step()` is as follows:

1. Loop `self.D_steps_per_G_step` times (Train the discriminator this many times.)
    1. Randomly sample some real paths of length $p+q$. The number of real paths is decided by `self.batch_size`.
    2. Regard the generator as given; for each real path, use the first $p$ entries to generate the next $q$ entries (a fake path of length $q$, conditional on historical data of length $p$). This is done by calling `G.sample()`.
    3. Concatenate the real part and fake part to form length-$(p+q)$ paths.
    4. Train the discriminator by comparing how different [the entirely-real paths] and [the paths with fake parts] are. This is done by calling `D_trainstep()`.
    5. Record the loss from the discriminator.
2. Randomly sample some length-$p$ real paths and generate length-$q$ future paths.
3. Concatenate them into length-$(p+q)$ paths.
4. Regard the discriminator as given, train the generator by using the discriminator to compare how different [the entirely-real paths] and [the paths with fake parts] are. This is done by calling `G_trainstep()`.
5. Record the loss from the generator.

Notice that I mention "comparison between real and fake data" in the pseudocode. The pseudocode is as follows:
1. Provide the discriminator with entirely-real data.
2. Measure the loss between the real data and $1$.
3. Provide the discriminator with data with fake parts.
4. Measure the loss between the fake data and $0$.

The `ResFNN` discriminator has inputs:
* `input_dim` $= (p+q) \times d$,
* `hidden_dims` $= (50,50,50)$,
* `output_dim` $= 1$ which is a $[0,1]$ output. If the value is close to $1$, the discriminator thinks the input is real. If it's close to $0$, the input is considered fake/generated.

There are four GANs to try with:
1. Recurrent Conditional GAN (`RCGAN`)
2. Time-Series GAN (`TimeGAN`)

The loss function of 1. & 2. is `torch.nn.functional.binary_cross_entropy_with_logits()`

3. Recurrent Conditional Wasserstein GAN (`RCWGAN`)
4. ConditionalWGAN (`CWGAN`)

The loss function of 3. & 4. is
$$ (2 \times \text{target} - 1) \times \text{discriminator}_{\text{out}}$$

The `SimpleGenerator` has an `ArFNN` (autoregressive feedforward neural network) architecture:

$$ (x,z) \in \mathbb{R}^{p \times d} \times \mathbb{R}^{1 \times d} = \mathbb{R}^{(p+1) \times d} \overset{A_1}\longrightarrow \mathbb{R}^{50} \overset{\phi_\alpha}\longrightarrow \mathbb{R}^{50} \overset{R_2}\longrightarrow \mathbb{R}^{50} \overset{R_3}\longrightarrow \mathbb{R}^{50} \overset{A_4}\longrightarrow \mathbb{R}^{d} $$

In `gans.py`, the generator `G` calls the `sample()` function, which iteratively generates the future path according to Algorithm 1 on Page 15. 

In short, Algorithm 1 uses the past $p$ values to generate one new value, then uses the most recent $p-1$ past values plus the newly generated value (so there are still $p$ values in total) to generate the next one. This procedure is repeated until a path of length $q$ has been generated and collected.

See the comments in code to understand how input data is transformed.

In [None]:
class ResFNN(nn.Module):
    # Placeholder only: the body of the residual feed-forward network is left out of this cell.
    pass  # omitted

class ArFNN(nn.Module):
    """ Autoregressive wrapper that rolls a `ResFNN` forward one time step at a time. """

    def __init__(self, input_dim: int, output_dim: int, hidden_dims: Tuple[int]):
        super().__init__()
        self.network = ResFNN(input_dim, output_dim, hidden_dims)

    def forward(self, z, x_past):
        """ Generate `z.shape[1]` new steps, feeding each output back into the history.

        Args:
            z: noise tensor; axis 0 is the batch and axis 1 the generation step.
            x_past: history tensor with the same batch size; axis 1 is time.

        Returns:
            The per-step network outputs concatenated along axis 1.
        """
        batch_size = x_past.shape[0]
        outputs = []
        for t in range(z.shape[1]):
            # Noise for this step, keeping the step axis: (batch, 1, latent).
            noise_t = z.narrow(1, t, 1)
            # Flatten the whole history window into one vector per sample: (batch, 1, -1).
            history_flat = x_past.reshape(batch_size, 1, -1)
            # (x, z) network input; e.g. with p=24 and d=2 assets (and latent size d)
            # the last axis has d*(p+1)=50 entries -- example values from the notebook.
            net_input = torch.cat([noise_t, history_flat], dim=-1)

            # >>> ResFNN generator: output of the final linear layer (A_4) <<<
            x_next = self.network(net_input)

            # Slide the window: drop the oldest step and append the new one, so the
            # history keeps its length along axis 1.
            x_past = torch.cat([x_past[:, 1:], x_next], dim=1)
            outputs.append(x_next)
        return torch.cat(outputs, dim=1)

class SimpleGenerator(ArFNN):
    """ `ArFNN` whose noise input is drawn internally from a standard normal. """

    def __init__(self, input_dim: int, output_dim: int, hidden_dims: Tuple[int], latent_dim: int):
        # The underlying network receives the history features plus one noise vector.
        super().__init__(input_dim + latent_dim, output_dim, hidden_dims)
        self.latent_dim = latent_dim

    def sample(self, steps, x_past):
        '''
        [Usage] generator.sample( q, x_past ) where x_past has length p.

        Draws noise of shape (batch, steps, latent_dim), moves it to the device of
        `x_past`, and returns `forward(noise, x_past)`.
        '''
        # NOTE(review): the notebook states latent_dim equals d (number of assets) -- confirm at the call site.
        batch_size = x_past.size(0)
        noise = torch.randn(batch_size, steps, self.latent_dim)
        noise = noise.to(x_past.device)
        return self.forward(noise, x_past)

Note that up until `network()` is called, we have a $(x,z) \in \mathbb{R}^{(p+1) \times d}$ tensor.

When `network()` is called, the network of `ResFNN` is instantiated, which takes the $\mathbb{R}^{(p+1) \times d}$ tensor as input. We set `hidden_dims` to be $(50,50,50)$, so, as a result of the for loop below, there will be three `ResidualBlock`s ($\phi_\alpha \circ A_1: \mathbb{R}^{(p+1) \times d} \rightarrow \mathbb{R}^{50}$, $R_2: \mathbb{R}^{50} \rightarrow \mathbb{R}^{50}$, and $R_3: \mathbb{R}^{50} \rightarrow \mathbb{R}^{50}$). Finally, one more `Linear` layer ($A_4$) is appended, which maps $\mathbb{R}^{50}$ to $\mathbb{R}^d$.

In [2]:
# Example dimensions used for the default arguments below.
p = 24  # length of the past window that is conditioned on
d = 2   # number of assets


class ResidualBlock(nn.Module):
    """ Linear layer followed by PReLU, with a skip connection when the sizes allow it. """

    def __init__(self, input_dim: int, output_dim: int) -> None:
        # Subclassing nn.Module is what registers the layers' parameters.
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.activation = nn.PReLU()
        # A skip connection is only well-defined when input and output sizes agree.
        self.create_residual_connection = input_dim == output_dim

    def forward(self, x):
        """ Return `activation(linear(x))`, plus `x` when the skip connection is enabled. """
        y = self.activation(self.linear(x))
        if self.create_residual_connection:
            y = x + y
        return y


class ResFNN(nn.Module):
    """ Stack of `ResidualBlock`s followed by one final linear layer. """

    def __init__(self, input_dim=(p+1)*d, output_dim=d, hidden_dims=(50,50,50), flatten: bool = False):
        """
        Args:
            input_dim: size of the last input axis; initially R^{ (p+1) * d }.
            output_dim: size of the last output axis.
            hidden_dims: width of each residual block, in order.
            flatten: if True, `forward` reshapes the input to (batch, -1) first.
        """
        super().__init__()
        self.input_dim = input_dim
        self.flatten = flatten

        blocks = []
        input_dim_block = input_dim  # initially R^{ (p+1) * d }
        for hidden_dim in hidden_dims:
            blocks.append(ResidualBlock(input_dim_block, hidden_dim))  # layers A_1, R_2, and R_3
            input_dim_block = hidden_dim  # becomes R^{ 50 }
        blocks.append(nn.Linear(input_dim_block, output_dim))  # layer A_4
        # Keep the layers: without this the blocks built above would simply be discarded.
        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        """ Apply the stacked blocks to `x`, optionally flattening all non-batch axes. """
        if self.flatten:
            x = x.reshape(x.shape[0], -1)
        return self.network(x)

# [3] Summarizing & Plotting the results

1. Call `create_summary()` from `./lib/plot.py`
    1. Take the length-$(p+q)$ real paths from the **test set**. (Call the last $q$ entries `x_real_future`)
    2. Use their first $p$ entries to generate the next $q$ fake entries. (Call the $q$ entries `x_fake_future`)
    3. Call `plot_summary()` to compare `x_real_future` and `x_fake_future` by visualizing their distributions (histogram) and autocorrelation graph.
2. Plot and pickle the paths.
3. Plot the loss progressions. See `get_standard_test_metrics()` in `./lib/algos/base.py` for all the losses.

# My Result

* Input BTC & ETH 1 hour Binance close.
* $p=24$, $q=6$
* Algorithm: Conditional Wasserstein GAN (CWGAN)

![alt text](./numerical_results/BINANCE/BTC_ETH/seed=0/CWGAN/summary.png "Title")

$x_{ \text{real future, test} }$

![alt text](./numerical_results/BINANCE/BTC_ETH/seed=0/CWGAN/x_real_test.png "Title")

$x_{ \text{fake future} }$

![alt text](./numerical_results/BINANCE/BTC_ETH/seed=0/CWGAN/x_fake_future.png "Title")