In [1]:
import numba
from numba import cuda
import nemo
nemo.core.NeuralModuleFactory()

<nemo.core.neural_factory.NeuralModuleFactory at 0x7fea843f11d0>

In [1]:
from gquant.dataframe_flow import TaskGraph
taskgraph = TaskGraph.load_taskgraph('./test.gq.yaml')
# taskgraph = TaskGraph()
taskgraph.draw()

GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'parameters'), ('type', 'ParaNode'), ('conf', {'seed': Non…

In [2]:
import torch
from torch.autograd import grad
'''
z = (xy)^2
x = 3, y =2
first order deriv [24 36]
x = 4, y =5
first order deriv [200, 160]
'''
inputs = torch.tensor([[3.0,2.0], [4.0, 5.0]], requires_grad=True)
#z = (inputs[0]*inputs[1])**2
#first_order_grad = grad(z, inputs, create_graph=True)
#print(first_order_grad)
z = (inputs.prod(axis=1)**2).sum()
first_order_grad = grad(z, inputs, create_graph=True)
print(first_order_grad)

(tensor([[ 24.,  36.],
        [200., 160.]], grad_fn=<DivBackward0>),)


In [3]:
from gquant.dataframe_flow import TaskGraph
taskgraph = TaskGraph()
taskgraph.draw()

GQuantWidget(sub=HBox())

In [2]:
from gquant.dataframe_flow import TaskGraph
taskgraph = TaskGraph.load_taskgraph('../taskgraphs/option_price_example/option_price_nemo.gq.yaml')
# taskgraph = TaskGraph()
taskgraph.draw()

GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'parameters'), ('type', 'ParaNode'), ('conf', {'seed': Non…

In [7]:
taskgraph.run()

[NeMo W 2020-10-11 17:47:17 callbacks:415] No checkpoints will be saved because step_freq and epoch_freq are both -1.


[NeMo I 2020-10-11 17:47:17 callbacks:534] Found 1 modules with weights:
[NeMo I 2020-10-11 17:47:17 callbacks:536] NetLayer
[NeMo I 2020-10-11 17:47:17 callbacks:537] Total model parameters: 1055233
[NeMo I 2020-10-11 17:47:17 callbacks:473] Found checkpoint folder nemo_log. Will attempt to restore checkpoints from it.


[NeMo W 2020-10-11 17:47:17 callbacks:499] For module NetLayer, no file matches  in nemo_log
[NeMo W 2020-10-11 17:47:17 callbacks:501] Checkpoint folder nemo_log was present but nothing was restored. Continuing training from random initialization.


[NeMo I 2020-10-11 17:47:25 callbacks:232] loss: 1990.8569
[NeMo I 2020-10-11 17:48:41 callbacks:232] loss: 1390.6603


KeyboardInterrupt: 

## Monte Carlo Option Pricing and Deep Learning model in gQuant


The Black–Scholes model can efficiently be used for pricing “plain vanilla” options with the European exercise rule. Options like the Barrier option and Basket option have a complicated structure with no simple analytical solution. The Monte Carlo simulation is an effective way to price them. Traditionally, Monte Carlo pricing is done in the C/C++ CUDA code.  In this [developer blog](https://developer.nvidia.com/blog/accelerating-python-for-exotic-option-pricing/), I explored how to use Python GPU libraries to achieve the state-of-the-art performance in the domain of exotic option pricing. The Monte Carlo simulation method I used has one limitations that it cannot handle the continuous maturity time, which is critical to calculate the Greek Theta.  

Recently, Huge and Savine introduced a novel regularization for training fast, accurate pricing in a [paper](https://arxiv.org/pdf/2005.02347.pdf). Inspired by this method, in this notebook we are going to:

    1. Handle the continuous maturity time T.     
    2. Implement the differential regularization for the example Asian Barrier option
    3. Show how we do HPC (Monte Carlo simulation) and deep learning in gQuant way. 
    
Without loss of generality, we use the Asian Barrier Option as an example. The Asian Barrier Option is a mixture of the Asian Option and the Barrier Option. The derivative price depends on the average of underlying Asset Price S, the Strike Price K, and the Barrier Price B.  Use the Down-and-Out Call Discretized Asian Barrier Option as an example. 

    The option is void if the average price of the underlying asset goes below the barrier. 
    The asset Spot Price S is usually modeled as Geometric Brownian motion, which has three free parameters: Spot Price, Percent Volatility, and Percent Drift. 
    The price of the option is the expected profit at the maturity discount to the current value. 
    The path-dependent nature of the option makes an analytic solution of the option price impossible. 

This is a good sample option for pricing using the Monte Carlo simulation. 

As a reresher, let's first run Monte Carlo simulation for Option pricing using the method introduced in the [developer blog](https://developer.nvidia.com/blog/accelerating-python-for-exotic-option-pricing/). We choose to price the example Asian Barrier Option:

    Maturity (T): 1.1 year
    Spot (S) : 120
    Strike (K): 110
    Volatility (sigma): 35.0 %
    Risk Free Rate (r): 5.0 %
    Stock Drift Rate (mu): 10.0 %
    Barrier (B): 100

To handle continuous maturity time, we fix the number of steps per year and do the fractional step for the last step. Import the libraries and define the option parameters:

In [3]:
import cupy
import numpy as np
import math
import time
import numba
from numba import cuda
from numba import njit
from numba import prange
import cudf
cupy.cuda.set_allocator(None)
#110.0, 100.0, 120.0, 0.35, 0.1, 0.05
N_PATHS = 8192000
Y_STEPS = 252 # constant, number of steps per year
T = 1.1 # time, unit 1 year
K = 110.0 # Strike price
B = 100.0 # barrier price
S0 = 120.0 # initial stock price 
sigma = 0.35 # stock annual volatility 
mu = 0.1 # stock annual return
r = 0.05 # stock annual interest rate
N_STEPS = int(np.ceil(T * Y_STEPS))
print('steps', N_STEPS)

steps 278


allocate GPU arrays for random numbers and outputs.

In [4]:
randoms_gpu = cupy.random.normal(0, 1, N_PATHS * N_STEPS, dtype=cupy.float32)
output =  np.zeros(N_PATHS, dtype=np.float32)
doutput =  np.zeros(N_PATHS*6, dtype=np.float32)

The following is the Numba kernel that we use to run simulation for each of the path. Note, the last step is handled specially to account for the continuous maturity time.

In [5]:
@cuda.jit
def numba_gpu_barrier_option(d_s, T, K, B, S0, sigma, mu, r, d_normals, N_STEPS, N_PATHS):
    # ii - overall thread index
    ii = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    stride = cuda.gridDim.x * cuda.blockDim.x
    tmp1 = mu/Y_STEPS
    tmp2 = math.exp(-r*T)
    tmp3 = math.sqrt(1.0/Y_STEPS)
    running_average = 0.0
    for i in range(ii, N_PATHS, stride):
        s_curr = S0
        for n in range(N_STEPS):
            if n == N_STEPS - 1:
                delta_t = T - n/Y_STEPS
                tmp1 = delta_t * mu
                tmp3 = math.sqrt(delta_t)                
            s_curr += tmp1 * s_curr + sigma*s_curr*tmp3*d_normals[i + n * N_PATHS]
            running_average += (s_curr - running_average) / (n + 1.0)

            if running_average <= B:
                break
        payoff = running_average - K if running_average>K else 0
        d_s[i] = tmp2 * payoff

Run the simulation and benchmark the computation time:

In [6]:
number_of_threads = 256
number_of_blocks = (N_PATHS-1) // number_of_threads + 1
output = cupy.zeros(N_PATHS, dtype=cupy.float32)
numba_gpu_barrier_option[(number_of_blocks,), (number_of_threads,)](output, np.float32(T), np.float32(K), 
                    np.float32(B), np.float32(S0), 
                    np.float32(sigma), np.float32(mu), 
                    np.float32(r), randoms_gpu, N_STEPS, N_PATHS)
s = time.time()
numba_gpu_barrier_option[(number_of_blocks,), (number_of_threads,)](output, np.float32(T), np.float32(K), 
                    np.float32(B), np.float32(S0), 
                    np.float32(sigma), np.float32(mu), 
                    np.float32(r), randoms_gpu, N_STEPS, N_PATHS)
v = output.mean()
cuda.synchronize()
e = time.time()
print('time', e-s, 'v', v)

time 0.3919990062713623 v 19.4496


Automatic adjoint differentiation(AAD) can be applied in the Monte Carlo simulation to calculate the Greeks accurately and efficiently according to 
the [paper](https://arxiv.org/pdf/2005.02347.pdf). We need to do some derivation to find the formular of pathwise differentials.

The option parameters are $T$, $K$, $S_0$, $\sigma$, $\mu$, $r$. For simplicity, we define $\theta=(T, K, S_0, \sigma, \mu, r)$.

The option price is computed by $$ p = E(f_i(\theta)) = \frac{1}{N}\sum_i f_i$$, where $f_i$ is the option value at the exercise time for the $i^{th}$ path. The Greeks are the first-order differentiation with respect to $\theta$: 

$$\nabla_{\theta} p = \frac{1}{N}\sum_i \nabla_{\theta} f_i $$

Let's focus on the calculation of gradient of $f_i(\theta)$. $f_i$ is calculated by Monte Carlo simulation method. Break it down into individual time steps. Without loss of generality, we drop the index $i$ here.

$$    \nabla_{\theta} f = 
\begin{cases}
    \nabla_{\theta} (a_n(\theta) - K)  & \text{if } a_n\geq K\\
    (0,0,0,0,0,0)              & \text{otherwise}
\end{cases}
$$

where the moving average $a_n$ at step $n$ is

$$a_n = g(a_{n-1}, s_{n}) = a_{n-1} + \frac{s_{n} - a_{n-1}}{n + 1.0}$$

The gradient of $a_n$:
$$ \nabla_{\theta} a_n = \frac{\partial g} {\partial a_{n-1}} \nabla_{\theta} a_{n-1} + \frac{\partial g} {\partial s_{n}} \nabla_{\theta} s_{n} = \frac{n}{n+1} \nabla_{\theta} a_{n-1} + \frac{1}{n+1} \nabla_{\theta} s_{n} $$

The stock price $s_n$ at step $n$ is:
$$s_n = s(s_{n-1}, \theta) = s_{n-1} + \frac{\mu}{Y} s_{n-1} + \sigma  \sqrt{\frac{1}{Y}} n_n s_{n-1}$$

At the last time step $s_n$ is:
$$s_n = s(s_{n-1}, \theta) = s_{n-1} + \frac{\mu (T Y - n) }{Y} s_{n-1} + \sigma  \sqrt{\frac{TY-n}{Y}} n_n s_{n-1}$$
where the $n_n$ is the normal random number at step $n$ and we can treat them as constant.

The gradient of $s_n$:
$$\nabla_{\theta} s_n = \nabla_{\theta} s(s_{n-1}, \theta) = (1 + \frac{\mu}{Y} + \sigma \sqrt{\frac{1}{Y}} n_n) \nabla_{\theta} s_{n-1} + \nabla_{\theta} (1 + \frac{\mu}{Y} + \sigma \sqrt{\frac{1}{Y}} n_n) s_{n-1} $$
where the gradient in the second term is:
$$\nabla_{\theta} (1 + \frac{\mu}{Y} + \sigma \sqrt{\frac{1}{Y}} v_n) = (0, 0, 0, 1/Y, \sqrt{1/Y} n_n ,0) $$

The gradient of $s_n$ for the last step:
$$\nabla_{\theta} s_n = \nabla_{\theta} s(s_{n-1}, \theta) = (\frac{1+ \mu (T Y - n) }{Y} + \sigma  \sqrt{\frac{TY-n}{Y}} n_n ) \nabla_{\theta} s_{n-1} + \nabla_{\theta} (\frac{1+ \mu (T Y - n) }{Y} + \sigma  \sqrt{\frac{TY-n}{Y}} n_n )  s_{n-1} $$
where the gradient in the second term is:
$$\nabla_{\theta} (\frac{1+ \mu (T Y - n) }{Y} + \sigma  \sqrt{\frac{TY-n}{Y}} n_n ) = (\mu + \frac{1}{2}\sigma n_n (T-n/Y)^{-\frac{1}{2}}, 0, 0, (T-n/Y), \sqrt{T-n/Y} n_n ,0) $$

The initial contition $$\nabla_{\theta} S_0 = (0,0,1,0,0,0)$$

Let's convert these equations into code in the Numba kernel:

In [7]:
@cuda.jit
def numba_gpu_barrier_option(d_s, doutput, T, K, B, S0, sigma, mu, r, d_normals, N_STEPS, N_PATHS):
    # ii - overall thread index
    
    ii = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    stride = cuda.gridDim.x * cuda.blockDim.x
    tmp1 = mu/Y_STEPS
    tmp2 = math.exp(-r*T)
    tmp3 = math.sqrt(1.0/Y_STEPS)
    running_average = 0.0
    d_theta = numba.cuda.local.array(6, numba.float64)
    d_a = numba.cuda.local.array(6, numba.float64)
    for i in range(ii, N_PATHS, stride):
        d_theta[0] = 0 # T
        d_theta[1] = 0 # K
        d_theta[2] = 1 # S_0
        d_theta[3] = 0 # mu
        d_theta[4] = 0 # sigma
        d_theta[5] = 0 # r
        for k in range(6):
            d_a[k] = 0
        s_curr = S0
        for n in range(N_STEPS):
            if n == N_STEPS - 1:
                delta_t = T - n/Y_STEPS
                tmp1 = delta_t * mu
                tmp3 = math.sqrt(delta_t)  
            
            ## start to compute the gradient
            factor = (1.0+tmp1+sigma*tmp3*d_normals[i + n * N_PATHS])
            for k in range(6):
                 d_theta[k] *= factor
            if n == N_STEPS - 1:
                d_theta[0] += (mu + 0.5 * sigma * d_normals[i + n * N_PATHS] / tmp3) * s_curr
                d_theta[3] += (T - n/Y_STEPS) * s_curr
                d_theta[4] += tmp3 * d_normals[i + n * N_PATHS] * s_curr
            else:
                d_theta[3] += 1.0/Y_STEPS * s_curr
                d_theta[4] += tmp3 * d_normals[i + n * N_PATHS] * s_curr
            for k in range(6):
                d_a[k] = d_a[k]*n/(n+1.0) + d_theta[k]/(n+1.0)
            ## start to compute current stock price and moving average
              
            s_curr += tmp1 * s_curr + sigma*s_curr*tmp3*d_normals[i + n * N_PATHS]
            running_average += (s_curr - running_average) / (n + 1.0)
            # print(running_average, n, tmp1 * s_curr, sigma,s_curr, tmp3,d_normals[i + n * N_PATHS])
            if running_average <= B:
                break
        payoff = running_average - K if running_average>K else 0
        d_s[i] = tmp2 * payoff
        # gradient for strik 
        if running_average > K:
            d_a[1] = -1
            # adjust gradient for discount factor
            for k in range(6):
                d_a[k] *= tmp2
            d_a[0] += payoff * tmp2* -r 
            d_a[5] += payoff * tmp2* -T
        else:
            for k in range(6):
                d_a[k] = 0
        for k in range(6):
            doutput[k*N_PATHS+i] = d_a[k]

Run the simulation and benchmark the computation time:

In [8]:
number_of_threads = 256
number_of_blocks = (N_PATHS-1) // number_of_threads + 1
output = cupy.zeros(N_PATHS, dtype=cupy.float32)
numba_gpu_barrier_option[(number_of_blocks,), (number_of_threads,)](output, doutput, np.float32(T), np.float32(K), 
                    np.float32(B), np.float32(S0), 
                    np.float32(sigma), np.float32(mu), 
                    np.float32(r), randoms_gpu, N_STEPS, N_PATHS)
s = time.time()
numba_gpu_barrier_option[(number_of_blocks,), (number_of_threads,)](output, doutput, np.float32(T), np.float32(K), 
                    np.float32(B), np.float32(S0), 
                    np.float32(sigma), np.float32(mu), 
                    np.float32(r), randoms_gpu, N_STEPS, N_PATHS)
v = output.mean()
cuda.synchronize()
e = time.time()
print('time', e-s, 'v', v)
greeks = doutput.reshape(6, N_PATHS).mean(axis=1)
print('greeks', greeks)

time 1.3806703090667725 v 19.4496
greeks [ -0.9357169  -0.6614186   0.7683792  52.935646   21.628197  -21.394554 ]


As we shown in the [developer blog](https://developer.nvidia.com/blog/accelerating-python-for-exotic-option-pricing/), Cupy implementation is faster as it compiles the native CUDA code. The following is the same GPU kernel that is implemented in Cupy and handles batches of simulations. 

In [12]:
import cupy
cupy_batched_barrier_option = cupy.RawKernel(r'''
extern "C" __global__ void batched_barrier_option(
    float *d_s,
    float *d_d,
    const float * T,
    const float * K,
    const float * B,
    const float * S0,
    const float * sigma,
    const float * mu,
    const float * r,
    const float * d_normals,
    const long *N_STEPS,
    const long Y_STEPS,
    const long N_PATHS,
    const long N_BATCH)
{
  unsigned idx =  threadIdx.x + blockIdx.x * blockDim.x;
  unsigned stride = blockDim.x * gridDim.x;
  unsigned tid = threadIdx.x;
  double d_theta[6];
  double d_a[6];

  for (unsigned i = idx; i<N_PATHS * N_BATCH; i+=stride)
  {
    d_theta[0] = 0; // T
    d_theta[1] = 0; // K
    d_theta[2] = 1.0; // S_0
    d_theta[3] = 0; // mu
    d_theta[4] = 0; // sigma
    d_theta[5] = 0; // r
    for (unsigned k = 0; k < 6; k++){
      d_a[k] = 0.0;
    }
    
    int batch_id = i / N_PATHS;
    int path_id = i % N_PATHS;
    float s_curr = S0[batch_id];
    float tmp1 = mu[batch_id]/Y_STEPS;
    float tmp2 = exp(-r[batch_id]*T[batch_id]);
    float tmp3 = sqrt(1.0/Y_STEPS);
    unsigned n=0;
    double running_average = 0.0;
    for(unsigned n = 0; n < N_STEPS[batch_id]; n++){
        if (n == N_STEPS[batch_id] - 1) {
            float delta_t = T[batch_id] - n/Y_STEPS;
            tmp1 = delta_t * mu[batch_id];
            tmp3 = sqrt(abs(delta_t));
        }
        float normal = d_normals[path_id + batch_id * N_PATHS + n * N_PATHS * N_BATCH];
        
            
        // start to compute the gradient
        float factor = (1.0+tmp1+sigma[batch_id]*tmp3*normal);
        for (unsigned k=0; k < 6; k++) {
            d_theta[k] *= factor;
        }
        
        if (n == N_STEPS[batch_id] - 1){
                d_theta[0] += (mu[batch_id] + 0.5 * sigma[batch_id] * normal / tmp3) * s_curr;
                d_theta[3] += (T[batch_id] - n/Y_STEPS) * s_curr;
                d_theta[4] += tmp3 * normal * s_curr;
        }
        else {
                d_theta[3] += 1.0/Y_STEPS * s_curr;
                d_theta[4] += tmp3 * normal * s_curr;
        }
        for (unsigned k = 0; k < 6; k++) {
                d_a[k] = d_a[k]*n/(n+1.0) + d_theta[k]/(n+1.0); 
        }
        
        
        // start to compute current stock price and moving average       
       
       s_curr += tmp1 * s_curr + sigma[batch_id]*s_curr*tmp3*normal;
       running_average += (s_curr - running_average) / (n + 1.0);
       if (running_average <= B[batch_id]){
           break;
       }
    }

    float payoff = (running_average>K[batch_id] ? running_average-K[batch_id] : 0.f); 
    d_s[i] = tmp2 * payoff;
    //printf("%d, %d, %f, %f, %f, %d\n", i, idx, d_s[i], payoff, K[batch_id], batch_id);
    
    // gradient for strik 
    if (running_average > K[batch_id]){
       d_a[1] = -1.0;
       // adjust gradient for discount factor
       for (unsigned k = 0; k < 6; k++) {
            d_a[k] *= tmp2;
        }
        d_a[0] += payoff * tmp2* -r[batch_id];
        d_a[5] += payoff * tmp2* -T[batch_id];
        
    }
    else {
        for (unsigned k = 0; k < 6; k++) {
           d_a[k] = 0.0;
        }

    }
    
    for (unsigned k = 0; k < 6; k++) {
       d_d[k*N_PATHS*N_BATCH+i] = d_a[k];
    }
  }
}

''', 'batched_barrier_option')

Wrap the driver function into a function to call this Cupy GPU kernel:

In [13]:
import time

Y_STEPS = 252
N_BATCH = 2
N_PATHS = 102400
K = cupy.array([110.0, 120.0], dtype=cupy.float32)
B = cupy.array([100.0, 90.0], dtype=cupy.float32)
S0 = cupy.array([120.0, 100.0], dtype=cupy.float32)
sigma = cupy.array([0.35, 0.2], dtype=cupy.float32)
mu = cupy.array([0.1, 0.1], dtype=cupy.float32)
r =cupy.array([0.05, 0.05], dtype=cupy.float32)
T =cupy.array([1.1, 1.2], dtype=cupy.float32)
N_STEPS = cupy.ceil(T * Y_STEPS).astype(cupy.int64)


def batch_run():
    number_of_threads = 256
    number_of_blocks = (N_PATHS * N_BATCH - 1) // number_of_threads + 1
    random_elements = (N_STEPS.max()*N_PATHS*N_BATCH).item()
    randoms_gpu = cupy.random.normal(0, 1, random_elements, dtype=cupy.float32)
    output = cupy.zeros(N_BATCH*N_PATHS, dtype=cupy.float32)
    d_output = cupy.zeros(N_BATCH*N_PATHS*6, dtype=cupy.float32)
    cupy.cuda.stream.get_current_stream().synchronize()
    s = time.time() 
    cupy_batched_barrier_option((number_of_blocks,), (number_of_threads,),
                       (output, d_output, T, K, B, S0, sigma, mu, r,
                        randoms_gpu, N_STEPS, Y_STEPS, N_PATHS, N_BATCH))
    v = output.reshape(N_BATCH, N_PATHS).mean(axis=1)
    b = d_output.reshape(6, N_BATCH, N_PATHS).mean(axis=2)
    cupy.cuda.stream.get_current_stream().synchronize()
    e = time.time()
    print('time', e-s, 'v',v)
    print(b.shape)
    print('gradient', b)
    return output
o = batch_run()

time 0.05196547508239746 v [19.33871    1.2993258]
(6, 2)
gradient [[ -0.9299918   -0.05759778]
 [ -0.66196716  -0.14673688]
 [  0.7679586    0.1890775 ]
 [ 52.929882    12.13784   ]
 [ 21.215418    17.727806  ]
 [-21.272583    -1.5591911 ]]


After the simulation, it generates both option price and Greeks. gQuant organizes all the computation steps into weakly coupled computation nodes. We create one node that is used to generate random Option parameters.

In [21]:
from gquant.dataframe_flow import TaskGraph
taskgraph = TaskGraph.load_taskgraph('../taskgraphs/option_price_example/option_parameter.gq.yaml')
taskgraph.draw()

GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'parameters'), ('type', 'ParaNode'), ('conf', {'seed': Non…

The paramter node returns an iteratable object that emits one set of parameters at a time. Let's evaluate it and get 3 parameters samples

In [22]:
para_iter = taskgraph.run()[0]
print(next(para_iter))
print(next(para_iter))
print(next(para_iter))

[[2.1617178e+01 1.6578143e+00 1.9070444e+02 1.7638628e+02 1.1962897e-01
  8.3251335e-02 5.5134021e-02]]
[[8.6211075e+01 1.5687722e+00 1.9577818e+02 1.8485504e+02 1.7730063e-01
  3.0913249e-01 1.1180643e-01]]
[[2.9695154e+01 1.9450008e+00 1.9962321e+02 1.7963535e+02 7.1146175e-02
  3.2251015e-01 4.4907290e-02]]


The parameter node feed the parameter iterator to simulaiton node that computes the option price and greeks

In [25]:
from gquant.dataframe_flow import TaskGraph
taskgraph = TaskGraph.load_taskgraph('../taskgraphs/option_price_example/option_simulation.gq.yaml')
taskgraph.draw()

GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'parameters'), ('type', 'ParaNode'), ('conf', {'seed': Non…

The simulation node returns an iteratable object that emits the parameters alongside with the corresponding option price and Greeks. Let's evaluate it and run 3 simulations:

In [26]:
sim_iter = taskgraph.run()[0]
print(next(sim_iter))
print(next(sim_iter))
print(next(sim_iter))

(array([[2.2658905e+01, 4.5627517e-01, 1.1528189e+02, 1.5669768e+02,
        1.8900518e-01, 1.1131494e-01, 1.6266526e-01]], dtype=float32), array([[ 45.0945    ,  -7.074818  ,  -0.92846686,   0.97085005,
         36.136776  ,   0.08403954, -20.575502  ]], dtype=float32))
(array([[3.4293114e+01, 3.7975675e-01, 1.2478013e+02, 2.4558773e+01,
        1.2021179e-01, 9.7593300e-02, 1.2359453e-02]], dtype=float32), array([[0., 0., 0., 0., 0., 0., 0.]], dtype=float32))
(array([[2.2846821e+01, 1.8677117e+00, 2.0562315e+02, 7.5052475e+01,
        1.5142280e-01, 1.1507863e-01, 1.0253887e-01]], dtype=float32), array([[0., 0., 0., 0., 0., 0., 0.]], dtype=float32))


The simulation computes both the option price and Greeks. Greeks can be used to add differential regularization to the cost function. The simulation is very costly even with GPUs. The Option price accuracy depends on the number of paths because the standard deviation scale with the $n$ number of paths as $\frac{1}{\sqrt{n}}$. To speed up the option pricing and Greek compuation, we can use a neural network to approximate the option pricing. 

We have seen the simulation can generate any numbers of data to train the neural network model in Cupy GPU arrays. It is easy to convert Cupy GPU array into Pytorch tensors via DLpack library. Here is an example:

In [27]:
import torch
from torch.utils.dlpack import from_dlpack
X, Y = next(sim_iter)
X_t, Y_t = (from_dlpack(X[0].toDlpack()), from_dlpack(Y[0].toDlpack()))
print(X_t)
print(Y_t)

tensor([4.9171e+01, 1.6421e+00, 1.7739e+02, 1.9516e+02, 1.5127e-01, 1.8999e-01,
        7.6484e-03], device='cuda:0')
tensor([ 44.2958,  -0.2524,  -0.9181,   1.0615, 178.5863,  16.7144, -72.7383],
       device='cuda:0')


Now we are ready to move from HPC to Deep Learning world. We create a gQuant node that takes an iterator of simulation results and convert it into NeMo DataLayer. We use the basic feed foward neural network to approximate the option prices. But we choose to use `Elu` activation function as we need high order differenations. 

In [22]:
class ParameterIter(object):
    
    def __init__(self, batch, K=200.0, S0=200.0, sigma=0.4, mu=0.2, r=0.2, T=1.9, minT=0.1, seed=None):
        self.N_BATCH = batch
        self.K = K
        self.S0 = S0
        self.sigma = sigma
        self.mu = mu
        self.r = r
        self.T = T
        self.minT = minT
        if seed is not None:
            cupy.random.seed(seed)
        
    def __iter__(self):
        return self
        
    def __next__(self):
        """
        Parameters order (B, T, K, S0, mu, sigma, r)
        """
        X = cupy.random.rand(self.N_BATCH, 7, dtype=cupy.float32)
        # scale the [0, 1) random numbers to the correct range for each of the option parameters
        X = X * cupy.array([ 0.99, self.T, self.K, self.S0, self.mu, self.sigma, self.r], dtype=cupy.float32)
        # make sure the Barrier is smaller than the Strike price
        X[:, 0] = X[:, 0] * X[:, 2]
        X[:, 1] += self.minT 
        
        X[:, 0] += 10.0
        X[:, 2] += 10.0        
        X[:, 3] += 10.0  
        
        X[:, 4] += 0.0001
        X[:, 5] += 0.0001
        X[:, 6] += 0.0001
        return X

In [33]:
class SimulationIter(object):
    
    def __init__(self, para_iter, N_PATHS=102400, Y_STEPS=252):
        self.para_iter = para_iter
        self.N_PATHS = N_PATHS
        self.Y_STEPS = Y_STEPS
        self.N_BATCH = para_iter.N_BATCH
        self.block_threads = 256
    
    def __iter__(self):
        return self
    
    def __next__(self):
        # Parameters order (B, T, K, S0, mu, sigma, r)
        para = next(self.para_iter)
        B = cupy.ascontiguousarray(para[:, 0])
        T = cupy.ascontiguousarray(para[:, 1])
        K = cupy.ascontiguousarray(para[:, 2])
        S0 = cupy.ascontiguousarray(para[:, 3])
        mu = cupy.ascontiguousarray(para[:, 4])
        sigma = cupy.ascontiguousarray(para[:, 5])        
        r = cupy.ascontiguousarray(para[:, 6])

        N_STEPS = cupy.ceil(T * self.Y_STEPS).astype(cupy.int64)
        number_of_threads = self.block_threads
        number_of_blocks = (self.N_PATHS * self.N_BATCH - 1) // number_of_threads + 1
        random_elements = (N_STEPS.max()*self.N_PATHS*self.N_BATCH).item()
        randoms_gpu = cupy.random.normal(0, 1, random_elements, dtype=cupy.float32)
        output = cupy.zeros(self.N_BATCH * self.N_PATHS, dtype=cupy.float32)
        d_output = cupy.zeros(self.N_BATCH*self.N_PATHS*6, dtype=cupy.float32)
        cupy_batched_barrier_option((number_of_blocks,), (number_of_threads,),
                                    (output, d_output, T, K, B, S0, sigma, mu, r,
                                    randoms_gpu, N_STEPS, self.Y_STEPS, self.N_PATHS, self.N_BATCH))
        v = output.reshape(self.N_BATCH, self.N_PATHS).mean(axis=1)[:,None]
        b = d_output.reshape(6, self.N_BATCH, self.N_PATHS).mean(axis=2).T
        y = cupy.concatenate([v, b], axis=1)
        return para, y

In [34]:
import torch
from torch.utils.dlpack import from_dlpack
p_iter = ParameterIter(1, seed=5)
sim_iter = SimulationIter(p_iter)
next(sim_iter)
X, Y = next(sim_iter)
X_t, Y_t = (from_dlpack(X[0].toDlpack()), from_dlpack(Y[0].toDlpack()))

In [35]:
from gquant.dataframe_flow import Node
from gquant.dataframe_flow import NodePorts, PortsSpecSchema, TaskSpecSchema
from gquant.dataframe_flow import ConfSchema
from gquant.dataframe_flow import TaskGraph

In [36]:
class ParaNode(Node):

    def ports_setup(self):
        input_ports = {}
        output_ports = {
            'para_out': {
                PortsSpecSchema.port_type: ParameterIter
            },
        }
        return NodePorts(inports=input_ports, outports=output_ports)

    def conf_schema(self):
        json = {
            "title": "Source node configure",
            "type": "object",
            "properties": {
                "batch": {
                    "type": "integer",
                    "title": "batch size",
                    "description": "the batch size for Asian Barrier Option parameters",
                    "default": 16
                },
                "seed": {
                    "type": ["integer", "null"],
                    "title": "seed number",
                    "description": "seed number for random numbers",
                    "default": None                   
                }
            }
        }
        ui = {}
        return ConfSchema(json=json, ui=ui)

    def init(self):
        self.required = {}

    def columns_setup(self):
        columns_out = {
            'para_out': {
                'B': 'float32',
                'K': 'float32',
                "S0": "float32",
                "sigma": "float32",
                "mu": "float32",
                "r": "float32",
                "T": "float32"
            },
        }
        return columns_out

    def process(self, inputs):
        output = {}
        it = ParameterIter(self.conf.get('batch', 16), seed=self.conf.get('seed', None))
        output.update({'para_out': it})
        return output

In [37]:
class SimNode(Node):

    def ports_setup(self):
        input_ports = {
             'para_in': {
                PortsSpecSchema.port_type: ParameterIter
            },           
        }
        output_ports = {
            'sim_out': {
                PortsSpecSchema.port_type: SimulationIter
            },
        }
        return NodePorts(inports=input_ports, outports=output_ports)

    def conf_schema(self):
        json = {
            "title": "Monte Carlo Sim node configure",
            "type": "object",
            "properties": {
                "N_PATHS": {
                    "type": "integer",
                    "title": "Number Paths",
                    "description": "Number of paths in the simulation",
                    "default": 102400
                },
                "Y_STEPS": {
                    "type": "integer",
                    "title": "Steps per year",
                    "description": "Number of steps for one year",
                    "default": 252
                },
            }
        }
        ui = {}
        return ConfSchema(json=json, ui=ui)

    def init(self):
        self.required = {}

    def columns_setup(self):
        self.requried = {
            "para_in": {
                'B': 'float32',
                'K': 'float32',
                "S0": "float32",
                "sigma": "float32",
                "mu": "float32",
                "r": "float32",
                "T": "float32"
            },
        }
        columns_out = {
            'sim_out': {
                'X': 'parameters',
                'Y': 'values'
            },
        }
        return columns_out

    def process(self, inputs):
        it = inputs['para_in']
        sit = SimulationIter(it, N_PATHS=self.conf.get('N_PATHS', 102400), Y_STEPS=self.conf.get('Y_STEPS', 252))
        output = {}
        output.update({'sim_out': sit})
        return output


In [27]:
module_name = 'option_pricing'
TaskGraph.register_lab_node(module_name, ParaNode)
TaskGraph.register_lab_node(module_name, SimNode)

GQuantWidget(sub=HBox())

In [28]:
taskGraph = TaskGraph.load_taskgraph('./test.gq.yaml')
taskGraph.draw()

GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'zf1pzfpb2c'), ('type', 'ParaNode'), ('conf', {}), ('input…

In [26]:
next(taskGraph.run()[0])

/home/quant/gQuant/notebooks/gquantrc


(array([[1.11068596e+02, 1.24624074e+00, 1.48081741e+02, 7.29064178e+01,
         1.01525221e+01, 5.21019362e-02, 1.37375116e-01],
        [1.43054260e+02, 8.44110131e-01, 1.35821121e+02, 1.33756393e+02,
         1.01800976e+01, 2.00890988e-01, 1.45757884e-01],
        [4.30453186e+01, 1.53463662e+00, 1.50896591e+02, 8.55318604e+01,
         1.01782160e+01, 5.58364652e-02, 2.69974992e-02],
        [1.91962605e+01, 1.39168453e+00, 1.41576263e+02, 9.06375351e+01,
         1.01137362e+01, 2.44380590e-02, 4.17722575e-02],
        [1.52635803e+01, 7.08370388e-01, 6.07707548e+00, 1.51260345e+02,
         1.01147919e+01, 3.12184632e-01, 1.01845488e-01],
        [7.11844635e+01, 1.08279133e+00, 1.78315414e+02, 2.59601440e+01,
         1.01427555e+01, 1.25180438e-01, 1.78539768e-01],
        [1.12303629e+01, 4.84660774e-01, 1.13028290e+02, 1.62559418e+02,
         1.00545931e+01, 2.81207204e-01, 1.99183822e-01],
        [5.60447311e+01, 2.05380350e-01, 5.19982262e+01, 1.07294296e+02,
         1

In [38]:
import torch
import torch as t
import torch.nn as nn
import torch.utils.data as t_utils
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import grad
import nemo
from nemo.core.neural_types import NeuralType, ChannelType, LossType
from nemo.utils.decorators import add_port_docs

import torch
from torch.utils.dlpack import from_dlpack
# class OptionDataSet(torch.utils.data.IterableDataset):
#     
#     def __init__(self, seed=2,  N_PATHS=102400, Y_STEPS=252, max_len=10):
#         p_iter = ParameterIter(1, seed=seed)
#         self.sim_iter = SimulationIter(p_iter, N_PATHS=N_PATHS, Y_STEPS=Y_STEPS)
#         self.num = 0
#         self.max_length = max_len
#         
#     def __len__(self):
#         return self.max_length
#         
#     def __iter__(self):
#         self.num = 0
#         return self
#     
#     def __next__(self):
#         if self.num > self.max_length:
#             raise StopIteration
#         X, Y = next(self.sim_iter)
#         return (from_dlpack(X[0].toDlpack()), from_dlpack(Y[0].toDlpack()))
# 
class OptionDataSet(torch.utils.data.Dataset):
    def __init__(self,  seed=2,  N_PATHS=102400, Y_STEPS=252, max_len=10):
        p_iter = ParameterIter(1, seed=seed)
        self.sim_iter = SimulationIter(p_iter, N_PATHS=N_PATHS, Y_STEPS=Y_STEPS)
        self.num = 0
        self.max_length = max_len

    def __getitem__(self, index):
        X, Y = next(self.sim_iter)
        return (from_dlpack(X[0].toDlpack()), from_dlpack(Y[0].toDlpack()))        

    def __len__(self):
        return self.max_length


class OptionDataLayer(nemo.backends.pytorch.nm.DataLayerNM):
    
    @property
    @add_port_docs()
    def output_ports(self):
        """Returns definitions of module output ports
        """
        return {
            "x": NeuralType(('B', 'D'), ChannelType()),
            "y": NeuralType(('B', 'D'), LabelsType()),
    }
    

    def __init__(self, seed=2,  N_PATHS=102400, Y_STEPS=252, max_len=10, batch_size=8, name=None):
        super().__init__(name=name)
        self._batch_size = batch_size
        self._dataset = OptionDataSet(seed, N_PATHS, Y_STEPS, max_len)
        self._data_iterator = t_utils.DataLoader(self._dataset, batch_size=self._batch_size,)

    def __len__(self):
        return len(self._dataset)

    @property
    def dataset(self):
        return None

    @property
    def data_iterator(self):
        return self._data_iterator
    


In [39]:
class MSELoss(nemo.backends.pytorch.nm.LossNM):
    @property
    @add_port_docs()
    def input_ports(self):
        return {
            "predictions": NeuralType(('B', 'D'), ChannelType()),
            "target": NeuralType(('B', 'D'), LabelsType()),
        }

    @property
    @add_port_docs()
    def output_ports(self):
        """Returns definitions of module output ports.
        """
        return {"loss": NeuralType(elements_type=LossType())}

    def __init__(self, name=None):
        super().__init__(name=name)
        self._criterion = nn.MSELoss()

    def _loss_function(self, **kwargs):
        return self._criterion(*(kwargs.values()))

In [40]:
class NetLayer(nemo.backends.pytorch.nm.TrainableNM):


    @property
    @add_port_docs()
    def input_ports(self):
        """Returns definitions of module input ports.
        Returns:
          A (dict) of module's input ports names to NeuralTypes mapping
        """
        return {"x": NeuralType(('B', 'D'), ChannelType())}

    @property
    @add_port_docs()
    def output_ports(self):
        """Returns definitions of module output ports.
        Returns:
          A (dict) of module's output ports names to NeuralTypes mapping
        """
        return {"y_pred": NeuralType(('B', 'D'), ChannelType())}


    def __init__(self, hidden=512, name=None):
        super().__init__(name=name)
        self.fc1 = nn.Linear(7, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, hidden)
        self.fc4 = nn.Linear(hidden, hidden)
        self.fc5 = nn.Linear(hidden, hidden)
        self.fc6 = nn.Linear(hidden, 1)
        self.register_buffer('norm',
                             torch.tensor([198.0,
                                           2.0,
                                           200.0,
                                           200.0,
                                           0.2,
                                           0.4,
                                           0.2]))

    def _forward(self, x):
        x = x / self.norm
        x = F.elu(self.fc1(x))
        x = F.elu(self.fc2(x))
        x = F.elu(self.fc3(x))
        x = F.elu(self.fc4(x))
        x = F.elu(self.fc5(x))
        y = self.fc6(x)
        return y
        
    def forward(self, x):
        """
        Parameters order (B, T, K, S0, mu, sigma, r)
        """
        y = self._forward(x)
        inputs = x.clone().detach()
        # inputs = x
        inputs.requires_grad = True
        # instead of using loss.backward(), use torch.autograd.grad() to compute gradients
        # https://pytorch.org/docs/stable/autograd.html#torch.autograd.grad
        loss_grads = grad(self._forward(inputs).sum(), inputs, create_graph=True)
        #torch.cat((y, loss_grads[0][1:]))
        #return torch.cat((y, loss_grads[0][1:]))
        return torch.cat((y, loss_grads[0][:, 1:]), axis=1)
        # return y, loss_grads[0]

In [41]:
_dataset = OptionDataSet(1)
print(len(_dataset))
_data_iterator = t_utils.DataLoader(_dataset, batch_size=2)

10


In [41]:
 x_data = t.tensor(np.random.uniform(low=0, high=1, size=1000)).unsqueeze(-1)
y_data = x_data**2
test_dataset = t_utils.TensorDataset(x_data.float(), y_data.float())
test_data_iterator = t_utils.DataLoader(test_dataset, batch_size=200)
for i in test_data_iterator:
    print(i[0].shape, i[1].shape)

torch.Size([200, 1]) torch.Size([200, 1])
torch.Size([200, 1]) torch.Size([200, 1])
torch.Size([200, 1]) torch.Size([200, 1])
torch.Size([200, 1]) torch.Size([200, 1])
torch.Size([200, 1]) torch.Size([200, 1])


In [42]:
for i in _data_iterator:
    print(i[0], i[1][:, 0])

tensor([[1.4303e+02, 2.4326e-01, 2.0914e+02, 2.8978e+01, 8.3191e-02, 4.5027e-02,
         1.3974e-01],
        [2.9654e+01, 1.9897e+00, 7.4952e+01, 7.2937e+01, 7.5282e-02, 1.5301e-03,
         1.0949e-01]], device='cuda:0') tensor([0.0000, 3.0253], device='cuda:0')
tensor([[6.4603e+01, 8.4230e-01, 1.0023e+02, 1.1882e+01, 3.9869e-02, 3.5644e-01,
         1.3856e-01],
        [1.1644e+02, 1.3120e+00, 1.5874e+02, 1.4445e+02, 9.0268e-02, 2.9101e-01,
         1.5337e-01]], device='cuda:0') tensor([0.0000, 8.0263], device='cuda:0')
tensor([[2.8875e+01, 8.0980e-01, 7.4806e+01, 7.6230e+01, 1.5153e-01, 1.7848e-01,
         4.9608e-02],
        [1.3245e+02, 3.6263e-01, 1.4677e+02, 1.1591e+02, 7.5371e-02, 1.0872e-01,
         1.2603e-01]], device='cuda:0') tensor([6.8931, 0.0000], device='cuda:0')
tensor([[1.3054e+02, 1.2497e+00, 1.3889e+02, 1.7546e+02, 1.8222e-01, 2.7569e-01,
         3.3436e-02],
        [4.0025e+01, 1.9401e+00, 1.7678e+02, 3.7794e+01, 3.1838e-02, 2.8716e-03,
         5.2197e-0

In [40]:
i[1][:, 0]

tensor([5.7957e+08, 7.9516e+05], device='cuda:0')

In [32]:
from gquant.dataframe_flow.task import load_modules
#load_modules('../modules/nemo_gquant_modules/')
from nemo_gquant_modules.nemoBaseNode import NeMoBase
from gquant.dataframe_flow import Node
import nemo


class OptionDataLayerNode(NeMoBase, Node):
    def init(self):
        NeMoBase.init(self, OptionDataLayer)



class OptionPriceNode(NeMoBase, Node):
    def init(self):
        NeMoBase.init(self, NetLayer)



class OptionMSELossNode(NeMoBase, Node):
    def init(self):
        NeMoBase.init(self, MSELoss)



ModuleNotFoundError: No module named 'nemo_gquant_modules'

In [112]:
TaskGraph.register_lab_node(module_name, OptionDataLayerNode)
TaskGraph.register_lab_node(module_name, OptionPriceNode)
TaskGraph.register_lab_node(module_name, OptionMSELossNode)

NotImplementedError: object proxy must define __reduce_ex__()

In [34]:
n = NetLayer().cuda()

In [37]:
X_t

tensor([ 44.5728,   1.9478,  75.9254, 103.3741,  10.0697,   0.2939,   0.1639],
       device='cuda:0')

In [42]:
t = n.forward(i[0])

In [50]:
t

(tensor([[0.0522],
         [0.0472]], device='cuda:0', grad_fn=<AddmmBackward>),
 tensor([[-2.2395e-06, -1.5866e-03,  1.9912e-05, -1.8659e-05,  1.0162e-02,
           2.1831e-02,  9.4899e-03],
         [-2.1606e-06, -1.4889e-03,  1.7424e-05, -1.7588e-05,  1.0280e-02,
           2.0810e-02,  9.7746e-03]], device='cuda:0', grad_fn=<DivBackward0>))

In [44]:
t[0].shape

torch.Size([2, 1])

In [46]:
t[1][:, 1:].shape

torch.Size([2, 6])

In [51]:
torch.cat((t[0], t[1][:, 1:]), axis=1)

tensor([[ 5.2201e-02, -1.5866e-03,  1.9912e-05, -1.8659e-05,  1.0162e-02,
          2.1831e-02,  9.4899e-03],
        [ 4.7198e-02, -1.4889e-03,  1.7424e-05, -1.7588e-05,  1.0280e-02,
          2.0810e-02,  9.7746e-03]], device='cuda:0', grad_fn=<CatBackward>)

In [95]:
t[0], t[1][0]

(tensor([-0.0024], device='cuda:0', grad_fn=<AddBackward0>),
 tensor([-1.9490e-06, -1.4939e-03, -1.2966e-05,  1.6835e-05,  1.0066e-02,
         -6.5063e-03, -4.0723e-02], device='cuda:0', grad_fn=<DivBackward0>))

In [96]:
Y_t

tensor([ 1.1552, -0.3395, -0.0534,  0.0166,  0.0518, -0.1811, -0.1894],
       device='cuda:0')

In [41]:
#inputs = torch.tensor(X_t, requires_grad=True)
first_order_grad = grad(t, X_t, torch.ones(7), create_graph=True)

RuntimeError: One of the differentiated Tensors does not require grad

In [39]:
t.shape

torch.Size([7])

In [88]:
nf = nemo.Ne

In [None]:
nemo.core.neural_types.

In [50]:
from gquant.dataframe_flow.task import load_modules
load_modules('../modules/nemo_gquant_modules/')
from nemo_gquant_modules.nemoBaseNode import NeMoBase


Load(path='/home/quant/gQuant/notebooks/../modules', mod=<module 'nemo_gquant_modules' from '../modules/nemo_gquant_modules/__init__.py'>)

In [41]:
import nemo_gquant_modules