In [305]:
import numba
from numba import cuda

In [307]:
import cupy
import numpy as np
import math
import time
import numba
from numba import cuda
from numba import njit
from numba import prange
import cudf
cupy.cuda.set_allocator(None)
#110.0, 100.0, 120.0, 0.35, 0.1, 0.05
N_PATHS = 8192000
Y_STEPS = 365 # constant, number of steps per year
T = 1.0 # time, unit 1 year
K = 110.0 # Strike price
B = 100.0 # barrier price
S0 = 120.0 # initial stock price 
sigma = 0.35 # stock annual volatility 
mu = 0.1 # stock annual return
r = 0.05 # stock annual interest rate
# calculate total step size
N_STEPS = int(np.ceil(T * Y_STEPS))
print('steps', N_STEPS)

steps 365


In [308]:
cupy.random.seed(11)

In [309]:
randoms_gpu = cupy.random.normal(0, 1, N_PATHS * N_STEPS, dtype=cupy.float32)
# randoms_cpu = np_randoms = cupy.asnumpy(randoms_gpu)
output =  np.zeros(N_PATHS, dtype=np.float32)

doutput =  np.zeros(N_PATHS*6, dtype=np.float32)

In [310]:
@cuda.jit
def numba_gpu_barrier_option(d_s, T, K, B, S0, sigma, mu, r, d_normals, N_STEPS, N_PATHS):
    # ii - overall thread index
    
    ii = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    stride = cuda.gridDim.x * cuda.blockDim.x
    tmp1 = mu/Y_STEPS
    tmp2 = math.exp(-r*T)
    tmp3 = math.sqrt(1.0/Y_STEPS)
    running_average = 0.0
    for i in range(ii, N_PATHS, stride):
        s_curr = S0
        for n in range(N_STEPS):
            if n == N_STEPS - 1:
                delta_t = T - n/Y_STEPS
                tmp1 = delta_t * mu
                tmp3 = math.sqrt(delta_t)                
            s_curr += tmp1 * s_curr + sigma*s_curr*tmp3*d_normals[i + n * N_PATHS]
            running_average += (s_curr - running_average) / (n + 1.0)
            # print(running_average, n, tmp1 * s_curr, sigma,s_curr, tmp3,d_normals[i + n * N_PATHS])
            if running_average <= B:
                break
        payoff = running_average - K if running_average>K else 0
        d_s[i] = tmp2 * payoff

In [311]:
number_of_threads = 256
number_of_blocks = (N_PATHS-1) // number_of_threads + 1
output = cupy.zeros(N_PATHS, dtype=cupy.float32)
numba_gpu_barrier_option[(number_of_blocks,), (number_of_threads,)](output, np.float32(T), np.float32(K), 
                    np.float32(B), np.float32(S0), 
                    np.float32(sigma), np.float32(mu), 
                    np.float32(r), randoms_gpu, N_STEPS, N_PATHS)
s = time.time()
numba_gpu_barrier_option[(number_of_blocks,), (number_of_threads,)](output, np.float32(T), np.float32(K), 
                    np.float32(B), np.float32(S0), 
                    np.float32(sigma), np.float32(mu), 
                    np.float32(r), randoms_gpu, N_STEPS, N_PATHS)
v = output.mean()
cuda.synchronize()
e = time.time()
print('time', e-s, 'v', v)

time 0.44753336906433105 v 18.708305


In [312]:
(19.43269 - 19.432878)/2e-4





-0.9399999999892827

In [313]:
1/365

0.0027397260273972603

In [314]:
output

array([59.287468  ,  0.9394655 ,  4.4536147 , ...,  0.67556167,
       10.877099  ,  0.52545613], dtype=float32)

Calculate the differentiation by ADD method

The parameters are $T$, $K$, $S_0$, $\sigma$, $\mu$, $r$. The option price is computed by 
$$ p = E(f_i(\theta)) = \frac{1}{N}\sum_i f_i$$
where $f_i$ is the option value at the exercise time for the $i^{th}$ path.
$$\nabla_{\theta} p = \frac{1}{N}\sum_i \nabla_{\theta} f_i $$

Let's focus on the calculation of gradient of $f_i(\theta)$. $f_i$ is calculated by Monte Carlo simulation method. Break it down into steps. Without loss of generality, we drop the index $i$ here.

$$    \nabla_{\theta} f = 
\begin{cases}
    \nabla_{\theta} (a_n(\theta) - K)  & \text{if } a_n\geq K\\
    (0,0,0,0,0,0)              & \text{otherwise}
\end{cases}
$$

where the moving average $a_n$ at step $n$ is

$$a_n = g(a_{n-1}, s_{n}) = a_{n-1} + \frac{s_{n} - a_{n-1}}{n + 1.0}$$

The gradient of $a_n$:
$$ \nabla_{\theta} a_n = \frac{\partial g} {\partial a_{n-1}} \nabla_{\theta} a_{n-1} + \frac{\partial g} {\partial s_{n}} \nabla_{\theta} s_{n} = \frac{n}{n+1} \nabla_{\theta} a_{n-1} + \frac{1}{n+1} \nabla_{\theta} s_{n} $$

The stock price $s_n$ at step $n$ is:
$$s_n = s(s_{n-1}, \theta) = s_{n-1} + \frac{\mu}{Y} s_{n-1} + \sigma  \sqrt{\frac{1}{Y}} n_n s_{n-1}$$
At the last time step $s_n$ is:
$$s_n = s(s_{n-1}, \theta) = s_{n-1} + \frac{\mu (T Y - n) }{Y} s_{n-1} + \sigma  \sqrt{\frac{TY-n}{Y}} n_n s_{n-1}$$
The gradient of $s_n$:
$$\nabla_{\theta} s_n = \nabla_{\theta} s(s_{n-1}, \theta) = (1 + \frac{\mu}{Y} + \sigma \sqrt{\frac{1}{Y}} n_n) \nabla_{\theta} s_{n-1} + \nabla_{\theta} (1 + \frac{\mu}{Y} + \sigma \sqrt{\frac{1}{Y}} n_n) s_{n-1} $$

The initial contition $$\nabla_{\theta} S_0 = (0,0,1,0,0,0)$$

$$\nabla_{\theta} (1 + \frac{\mu}{Y} + \sigma \sqrt{\frac{1}{Y}} v_n) = (0, 0, 0, 1/Y, \sqrt{1/Y} n_n ,0) $$

$$\nabla_{\theta} (\frac{1+ \mu (T Y - n) }{Y} + \sigma  \sqrt{\frac{TY-n}{Y}} n_n ) = (\mu + \frac{1}{2}\sigma n_n (T-n/Y)^{-\frac{1}{2}}, 0, 0, (T-n/Y), \sqrt{T-n/Y} n_n ,0) $$

In [315]:
@cuda.jit
def numba_gpu_barrier_option(d_s, doutput, T, K, B, S0, sigma, mu, r, d_normals, N_STEPS, N_PATHS):
    # ii - overall thread index
    
    ii = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    stride = cuda.gridDim.x * cuda.blockDim.x
    tmp1 = mu/Y_STEPS
    tmp2 = math.exp(-r*T)
    tmp3 = math.sqrt(1.0/Y_STEPS)
    running_average = 0.0
    d_theta = numba.cuda.local.array(6, numba.float64)
    d_a = numba.cuda.local.array(6, numba.float64)
    for i in range(ii, N_PATHS, stride):
        d_theta[0] = 0 # T
        d_theta[1] = 0 # K
        d_theta[2] = 1 # S_0
        d_theta[3] = 0 # mu
        d_theta[4] = 0 # sigma
        d_theta[5] = 0 # r
        for k in range(7):
            d_a[k] = 0
        s_curr = S0
        for n in range(N_STEPS):
            if n == N_STEPS - 1:
                delta_t = T - n/Y_STEPS
                tmp1 = delta_t * mu
                tmp3 = math.sqrt(delta_t)  
            
            ## start to compute the gradient
            factor = (1.0+tmp1+sigma*tmp3*d_normals[i + n * N_PATHS])
            for k in range(6):
                 d_theta[k] *= factor
            if n == N_STEPS - 1:
                d_theta[0] += (mu + 0.5 * sigma * d_normals[i + n * N_PATHS] / tmp3) * s_curr
                d_theta[3] += (T - n/Y_STEPS) * s_curr
                d_theta[4] += tmp3 * d_normals[i + n * N_PATHS] * s_curr
            else:
                d_theta[3] += 1.0/Y_STEPS * s_curr
                d_theta[4] += tmp3 * d_normals[i + n * N_PATHS] * s_curr
            for k in range(6):
                d_a[k] = d_a[k]*n/(n+1.0) + d_theta[k]/(n+1.0)
            ## start to compute current stock price and moving average
              
            s_curr += tmp1 * s_curr + sigma*s_curr*tmp3*d_normals[i + n * N_PATHS]
            running_average += (s_curr - running_average) / (n + 1.0)
            # print(running_average, n, tmp1 * s_curr, sigma,s_curr, tmp3,d_normals[i + n * N_PATHS])
            if running_average <= B:
                break
        payoff = running_average - K if running_average>K else 0
        d_s[i] = tmp2 * payoff
        # gradient for strik 
        if running_average > K:
            d_a[1] = -1
            # adjust gradient for discount factor
            for k in range(6):
                d_a[k] *= tmp2
            d_a[0] += payoff * tmp2* -r 
            d_a[5] += payoff * tmp2* -T
        else:
            for k in range(6):
                d_a[k] = 0
        for k in range(6):
            doutput[k*N_PATHS+i] = d_a[k]

In [316]:
number_of_threads = 256
number_of_blocks = (N_PATHS-1) // number_of_threads + 1
output = cupy.zeros(N_PATHS, dtype=cupy.float32)
numba_gpu_barrier_option[(number_of_blocks,), (number_of_threads,)](output, doutput, np.float32(T), np.float32(K), 
                    np.float32(B), np.float32(S0), 
                    np.float32(sigma), np.float32(mu), 
                    np.float32(r), randoms_gpu, N_STEPS, N_PATHS)
s = time.time()
numba_gpu_barrier_option[(number_of_blocks,), (number_of_threads,)](output, doutput, np.float32(T), np.float32(K), 
                    np.float32(B), np.float32(S0), 
                    np.float32(sigma), np.float32(mu), 
                    np.float32(r), randoms_gpu, N_STEPS, N_PATHS)
v = output.mean()
cuda.synchronize()
e = time.time()
print('time', e-s, 'v', v)

time 1.7911946773529053 v 18.708305


In [317]:
print(output)

[59.287468    0.9394655   4.4536147  ...  0.67556167 10.877099
  0.52545613]


In [318]:
doutput.reshape(6, N_PATHS).shape

(6, 8192000)

In [320]:
doutput.reshape(6, N_PATHS).mean(axis=1)

array([ -0.9074474,  -0.6714072,   0.7713591,  48.02249  ,  20.488997 ,
       -18.708313 ], dtype=float32)

In [226]:

18.714218

18.714218

In [116]:
18.725582

18.725582

In [122]:
18.754488

18.754488

In [134]:
(18.760712 - 18.71775)/0.002

21.481000000001416