In [None]:
%pylab inline
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import clear_output

import numba
import cupy as cp
from numba import cuda, prange
from numba.cuda import random
from numba import jit, njit, vectorize
import GPUtil

In [None]:
# Functions
def ReLU(input):
    """Rectified linear unit: keep positive values, map everything else to zero.

    The value is multiplied by its own ``> 0`` mask, so the same expression
    works for plain scalars and element-wise for NumPy arrays.
    """
    is_positive = input > 0
    return input * is_positive


# Drift-Diffusion Model

## 1) CPU

In [None]:
%%time
# DDM: plain-Python reference simulation (one trial at a time on the CPU)

# Variables
coh = 100          # stimulus coherence
sp = 0             # starting point 1
dg = 10e-6         # drift gain
sigma = 10e-3      # diffusion variability
dr = coh*dg        # drift rate
ndt = 100          # Non-decision time (msec)
bound = 1          # Decision-bound
stop_time = 10000  # Maximum allowed time for simulation
nTrials = 1000     # Number of trials to simulate

# One slot per trial. NaN marks a trial that never reached the bound, so the
# result arrays always have exactly nTrials entries.
decision = np.full(nTrials, np.nan)   # +1 / -1 choice
rt = np.full(nTrials, np.nan)         # step index of the bound crossing (msec)

diffusion_step = np.random.normal(loc=dr, scale=sigma, size=(nTrials, stop_time))    # change in accumulator 1
for tr in range(nTrials):
    # Pre-allocated trajectory: dv[0] is the starting point, dv[t + 1] the value after step t.
    dv = np.empty(stop_time + 1)
    dv[0] = sp
    for t in range(stop_time):
        dv[t + 1] = dv[t] + diffusion_step[tr, t]    # update decision variable
        # Test the value that was just produced (same convention as the GPU
        # kernels), so a crossing on the very last step is still detected.
        if np.abs(dv[t + 1]) > bound:
            decision[tr] = np.sign(dv[t + 1])
            rt[tr] = t
            dv = dv[:t + 2]    # keep only the part of the trajectory that was simulated
            break

rt = rt+ndt

# plt.plot(rt)
plt.hist(rt[~np.isnan(rt)])    # undecided (NaN) trials are left out of the histogram
plt.show()


## 2) CPU Parallelization with Numba

Function for CPU parallelization

In [None]:
@njit(parallel=True,fastmath=True)
def DDM_cpu_parallel(coh,nTrials,pars,stop_time):
    """Simulate drift-diffusion trials on the CPU, with trials spread over threads.

    Parameters
    ----------
    coh : scalar
        Stimulus coherence; the drift rate is ``coh * pars[1]``.
    nTrials : int
        Number of trials to simulate.
    pars : 1-D array
        ``[starting point, drift gain, diffusion sigma, non-decision time, bound]``.
    stop_time : int
        Maximum number of steps per trial.

    Returns
    -------
    decision : array of length nTrials
        +1 / -1 for the bound that was crossed, NaN if no bound was reached.
    rt : array of length nTrials
        Step index of the crossing plus the non-decision time, NaN if undecided.
    """
    sp = pars[0]             # starting point 1
    dg = pars[1]             # drift gain
    sigma = pars[2]          # diffusion variability
    ndt = pars[3]            # Non-decision time (msec)
    bound = pars[4]          # decision bound
    dr = coh*dg              # drift rate

    # NaN-filled outputs: trials that never hit the bound keep NaN.
    # NOTE(review): fastmath=True lets the compiler assume NaN-free arithmetic while
    # NaN is used here as the "no decision" marker; confirm undecided trials still
    # come back as NaN on the target platform.
    rt = np.full(nTrials, np.nan)
    decision = np.full(nTrials, np.nan)

    diffusion_step = np.random.normal(loc=dr, scale=sigma, size=(nTrials, stop_time))    # change in accumulator 1
    for tr in prange(nTrials):
        # A scalar accumulator is enough: only the current value is ever inspected.
        dv = sp
        for t in range(stop_time):
            dv = dv + diffusion_step[tr,t]    # update decision variable
            if abs(dv) > bound:               # test the freshly updated value
                decision[tr] = np.sign(dv)
                rt[tr] = t
                break
    rt = rt+ndt
    return decision, rt

# Warm-up call: run the jitted function once on a tiny problem so the JIT
# compilation happens here.
coh, sp = 100, 0                   # coherence, starting point 1
dg, sigma = 10e-6, 10e-3           # drift gain, diffusion variability
dr = coh * dg                      # drift rate
t = 0                              # time -> msec
ndt, bound = 100, 1                # non-decision time (msec), decision bound
stop_time, nTrials = 1000, 10      # max simulated time, number of trials (kept small)
pars = np.array((sp, dg, sigma, ndt, bound))
DDM_cpu_parallel(coh, nTrials, pars, stop_time)

Running above model with random parameters and stimulus input

In [None]:
# Variables
coh = 100
sp = 0             # starting point 1
dg = 10e-6         # drift gain
sigma = 10e-3      # diffusion variability
dr = coh*dg        # drift rate
t = 0              # time -> msec
ndt = 100          # Non-decision time (msec)
bound = 1          # Decision-bound
stop_time = 10000  # Maximum allowed time for simulation
nTrials = 1000     # Number of trials to simulate
pars = np.array([sp,dg,sigma,ndt,bound])
%time decision, rt = DDM_cpu_parallel(coh,nTrials,pars,stop_time)

# plt.plot(rt)
plt.hist(rt)
plt.show

## 3) GPU Parallelization with Numba and CuPy

#### DDM Kernel for Numba

In [None]:
# @jit
def get_diffusion_matrix(coherence, drift_gain, sigma, nTrials=50000, stop_time=10000):
    """Draw the per-step evidence increments for every trial on the GPU.

    Parameters
    ----------
    coherence : scalar or array
        A scalar is expanded to a constant ``(nTrials, stop_time)`` matrix; an
        array is moved to the GPU and used as given.
    drift_gain : scalar or array
        Multiplies the coherence to give the mean of each step.
    sigma : scalar
        Standard deviation of each step.
    nTrials, stop_time : int
        Matrix shape used only when ``coherence`` is a scalar.

    Returns
    -------
    cupy.ndarray (float32)
        Normally distributed steps with mean ``coherence * drift_gain``.
    """
    # Release the result of a previous call that is still bound to the notebook-level
    # name `diffusion_matrix`, so its VRAM can be reused for the new matrix. The name
    # is removed through globals(): a plain `del diffusion_matrix` would refer to a
    # (not yet assigned) local variable and raise UnboundLocalError.
    if globals().pop('diffusion_matrix', None) is not None:
        cp.get_default_memory_pool().free_all_blocks()

    if np.isscalar(coherence):
        # Build the constant matrix directly on the device instead of allocating
        # it on the host first and copying it over.
        mean_step = cp.full((nTrials, stop_time), coherence, dtype=cp.float32)
    else:
        mean_step = cp.asarray(coherence)
    mean_step = cp.multiply(mean_step, cp.asarray(drift_gain))    # Mean of diffusion to be Coh x drift gain
    return cp.random.normal(mean_step, sigma, dtype='float32')    # Generating normal random steps

# For Dynamic Coherence
@cuda.jit
def DDM_kernel(sp, bound, ndt, diffusion_step, decision, rt):
    """One thread per trial: accumulate pre-drawn steps until a bound is crossed.

    ``diffusion_step[tr, t]`` is the increment of trial ``tr`` at step ``t`` and
    ``bound[t]`` the (symmetric) bound at that step. On a crossing the thread
    writes +1 / -1 to ``decision[tr]`` and ``t + ndt`` to ``rt[tr]``; entries of
    trials that never cross are left untouched.
    """
    tr = cuda.grid(1)
    # Threads beyond the number of trials have nothing to do.
    if tr < 0 or tr >= diffusion_step.shape[0]:
        return

    dv = sp
    for t in range(diffusion_step.shape[1]):
        dv += diffusion_step[tr, t]      # update decision variable
        if dv > bound[t] or dv < -bound[t]:
            # sign of dv without np.sign
            if dv > 0:
                decision[tr] = 1
            else:
                decision[tr] = -1
            rt[tr] = t + ndt
            break
# Initializing DDM_Coherence_kernel with small variables
sp=0
ndt=0
bound=cp.array([1,1])
diffusion_step = cp.array([[2,3],[3,5]])
decision = cp.empty(2)*cp.NaN
rt = cp.empty(2)*cp.NaN
blockdim = (16)
griddim = (diffusion_step.shape[0] // blockdim) + 1
cuda.synchronize()
%time DDM_kernel[griddim, blockdim](sp, bound, ndt, diffusion_step, decision, rt)
cuda.synchronize()


In [None]:
%%time
cp.random.seed()
coh = 100
sp = 0             # starting point 1
dg = 10e-6         # drift gain
sigma = 10e-3      # diffusion variability
ndt = 100          # Non-decision time (msec)
bound = 1
dg = np.array([dg],dtype=float32)

coherence = np.ones((50000, 10000), dtype=float32)*coh
bound = cp.ones(10000, dtype=float32)*bound

diffusion_matrix = get_diffusion_matrix(coh, dg, sigma)
decision = cp.empty(diffusion_matrix.shape[0])*cp.NaN
rt = cp.empty(diffusion_matrix.shape[0])*cp.NaN
blockdim = (16)
griddim = (diffusion_matrix.shape[0] // blockdim) + 1
cuda.synchronize()
%time DDM_kernel[griddim, blockdim](sp, bound, ndt, diffusion_matrix, decision, rt)
cuda.synchronize()

decision = cp.asnumpy(decision)
rt = cp.asnumpy(rt)
plt.hist(rt)


### DDM Cuda kernel with inbuilt diffusion step

1. Defining Kernels for parallelization and initializing with small variables for compilation

In [None]:

# For Dynamic Coherence
@cuda.jit
def DDM_kernel_for_delay(coherence, sp, dg, sigma, bound, ndt, decision, rt, rng_states):
    """One thread per trial: draw each step inside the kernel and accumulate it.

    At step ``t`` the increment is ``coherence[tr, t] * dg`` plus a normal sample
    from ``rng_states[tr]`` scaled by ``sigma``. When the accumulator leaves
    ``[-bound[t], bound[t]]`` the thread writes +1 / -1 to ``decision[tr]`` and
    ``t + ndt`` to ``rt[tr]``; entries of trials that never cross are left untouched.
    """
    tr = cuda.grid(1)
    # Threads beyond the number of trials have nothing to do.
    if tr < 0 or tr >= coherence.shape[0]:
        return

    dv = sp
    for t in range(coherence.shape[1]):
        drift = coherence[tr, t] * dg
        noise = random.xoroshiro128p_normal_float32(rng_states, tr) * sigma
        dv += drift + noise      # update decision variable
        if dv > bound[t] or dv < -bound[t]:
            # sign of dv without np.sign
            if dv > 0:
                decision[tr] = 1
            else:
                decision[tr] = -1
            rt[tr] = t + ndt
            break
                
# Initializing DDM_Coherence_kernel with small variables
sp=0
dg = 10e-6
sigma = 10e-3
ndt=0
bound=cp.array([1,1])
coherence = cp.array([[100,100],[100,100]])
decision = cp.empty(coherence.shape[0])*cp.NaN
rt = cp.empty(coherence.shape[0])*cp.NaN
blockdim = (16)
griddim = (coherence.shape[0] // blockdim) + 1
rng_states = random.create_xoroshiro128p_states(griddim * blockdim, seed=1)
cuda.synchronize()
%time DDM_kernel_for_delay[griddim, blockdim](coherence, sp, dg, sigma, bound, ndt, decision, rt, rng_states)
cuda.synchronize()

2. Defining model parameters and experimental condition (in this case coherence)

In [None]:
cp.random.seed()
coh = 100
sp = 0             # starting point 1
dg = 10e-6         # drift gain
sigma = 10e-3      # diffusion variability
ndt = 100          # Non-decision time (msec)
bound = 1
# dg = np.array([dg],dtype=float32)

# Drop the previous coherence matrix first so its device memory can be reused.
if 'coherence' in globals():
    del coherence
# cp.full fills the array in one allocation; `cp.ones(...) * coh` would briefly
# hold two full-size matrices on the device.
coherence = cp.full((50000, 10000), coh, dtype=cp.float32)    # trials x time steps
bound = cp.full(10000, bound, dtype=cp.float32)               # one bound value per time step


3. Running the code with GPU parallelization

In [None]:
%%timeit 

# NaN-filled outputs: trials that never reach a bound stay NaN.
decision = cp.full(coherence.shape[0], cp.nan)
rt = cp.full(coherence.shape[0], cp.nan)
blockdim = int(128)
griddim = (coherence.shape[0] // blockdim) + 1
seed = np.random.randint(0, np.iinfo(np.int32).max)
rng_states = random.create_xoroshiro128p_states(griddim * blockdim, seed=seed)    # one RNG state per launched thread
cuda.synchronize()
DDM_kernel_for_delay[griddim, blockdim](coherence, sp, dg, sigma, bound, ndt, decision, rt, rng_states);
cuda.synchronize()

decision_cpu = cp.asnumpy(decision)
rt_cpu = cp.asnumpy(rt)
plt.hist(rt_cpu)
# GPUtil.showUtilization()

4. Batch processing: simulating a larger dataset in a single launch can exhaust GPU memory, so the trials are processed in batches.

In [None]:

def batch_simulation(model, coherence, sp, dg, sigma, bound, ndt, seed=None, batch_size=10000, plot=True):
    """Run a DDM-style CUDA kernel over ``coherence`` in fixed-size batches of trials.

    NOTE: a second ``batch_simulation`` with a different signature is defined
    further down for the LCA model; once that cell has run it replaces this one.

    Parameters
    ----------
    model : CUDA kernel
        Launched as ``model[griddim, blockdim](coherence_batch, sp, dg, sigma,
        bound, ndt, decision, rt, rng_states)``.
    coherence : 2-D device array
        One row per trial; rows are sliced into batches.
    sp, dg, sigma, bound, ndt
        Passed through to the kernel unchanged.
    seed : int, optional
        Seed for the per-thread RNG states; drawn at random when omitted.
    batch_size : int, optional
        Number of trials per kernel launch (default 10000).
    plot : bool, optional
        Draw a histogram of the response times before returning (default True).

    Returns
    -------
    decision_np, rt_np : numpy.ndarray
        Per-trial decisions and response times; NaN where no bound was reached.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    blockdim = int(128)
    griddim = (batch_size // blockdim) + 1
    if seed is None:
        seed = np.random.randint(0, np.iinfo(np.int32).max)
    # The RNG states are created once and reused by every batch.
    rng_states = random.create_xoroshiro128p_states(griddim * blockdim, seed=seed)

    decision_batches = []
    rt_batches = []
    # Stepping by batch_size visits every trial exactly once and never produces
    # an empty trailing batch when the trial count is a multiple of batch_size.
    for start in range(0, coherence.shape[0], batch_size):
        coherence_batch = coherence[start:start + batch_size]
        decision_cp = cp.full(coherence_batch.shape[0], cp.nan)    # NaN = undecided
        rt_cp = cp.full(coherence_batch.shape[0], cp.nan)
        cuda.synchronize()
        model[griddim, blockdim](coherence_batch, sp, dg, sigma, bound, ndt, decision_cp, rt_cp, rng_states);
        cuda.synchronize()
        decision_batches.append(cp.asnumpy(decision_cp))
        rt_batches.append(cp.asnumpy(rt_cp))

    # Join once at the end instead of re-copying the growing result per batch.
    decision_np = np.concatenate(decision_batches) if decision_batches else np.empty(0)
    rt_np = np.concatenate(rt_batches) if rt_batches else np.empty(0)
    if plot:
        plt.hist(rt_np)
    return decision_np, rt_np

# Time the batched DDM simulation over the full coherence matrix.
# NOTE(review): %timeit presumably runs the statement in its own scope, so
# `decision` / `rt` may not be available afterwards (confirm before plotting them).
%timeit decision, rt =  batch_simulation(DDM_kernel_for_delay, coherence, sp, dg, sigma, bound, ndt)
# plt.hist(rt)

<center><h1>Drift Diffusion Model - Simulation Results</h1></center>

| Model | Number of trials | Simulation time |
| --- | --- | --- |
|CPU Simulation | 10K | 6 secs | 
| CPU Simulation with Parallelization | 10K | 250 msec|

<br><h5><center>Further improvement in the model to accommodate dynamic input</center></h5><br>

| Model | Number of trials | Simulation time |
| --- | --- | --- |
| GPU Simulation with external diffusion process | 50K | 200 ms |
| GPU Simulation with internal diffusion process | 50K | 48.6 ms ± 168 µs |
| GPU Simulation with internal diffusion process with batch processing | 50K | 28.9 ms ± 121 µs |

# Leaky-Competing Accumulator Model

<h3>1. CPU implementation of LCA</h3>

In [None]:
%%time
# LCA: plain-Python reference simulation of two leaky, mutually inhibiting accumulators

# Variables
coh = 51

sp1 = 0             # starting point 1
sp2 = 0             # starting point 2
dg1 = 10e-6       # drift gain 1
dg2 = 2*10e-6       # drift gain 2
sigma = 10e-3         # diffusion variability
I0 = 0.001         # baseline input
dr1 = ReLU(coh)*dg1       # first variable (input)
dr2 = ReLU(-coh)*dg2       # second variable (input)
leak = 0               # leak
lateral_inhibition = 0               # lateral inhibition
ndt = 100           # Non-decision time (msec)
bound = 1           # Decision-bound (defined here so the cell does not rely on earlier cells)
stop_time = 10000

nTrials = 10000
# Fresh result arrays for this model, one slot per trial; NaN marks a trial in
# which neither accumulator reached the bound.
decision = np.full(nTrials, np.nan)   # +1: accumulator 1 won, -1: accumulator 2 won
rt = np.full(nTrials, np.nan)         # step index of the bound crossing (msec)

diffusion_step1 = np.random.normal(loc=dr1, scale=sigma, size=(nTrials, stop_time))
diffusion_step2 = np.random.normal(loc=dr2, scale=sigma, size=(nTrials, stop_time))
for tr in range(nTrials):
    # Pre-allocated trajectories: index 0 is the starting point, index t + 1 the value after step t.
    dv1 = np.empty(stop_time + 1)       # Accumulator 1
    dv2 = np.empty(stop_time + 1)       # Accumulator 2
    dv1[0] = sp1
    dv2[0] = sp2
    for t in range(stop_time):
        del_dv1 = diffusion_step1[tr,t] - leak*dv1[t] - lateral_inhibition*dv2[t] + I0    # change in accumulator 1
        del_dv2 = diffusion_step2[tr,t] - leak*dv2[t] - lateral_inhibition*dv1[t] + I0    # change in accumulator 2
        dv1[t + 1] = ReLU(dv1[t]+del_dv1)    # update decision variable 1
        dv2[t + 1] = ReLU(dv2[t]+del_dv2)    # update decision variable 2
        # Test the freshly updated values; both are non-negative after ReLU.
        if dv1[t + 1] > bound:
            winner = 1
        elif dv2[t + 1] > bound:
            winner = -1
        else:
            continue
        decision[tr] = winner
        rt[tr] = t
        dv1, dv2 = dv1[:t + 2], dv2[:t + 2]    # keep only the simulated part
        break

rt =  rt+ndt

# plt.plot(dv1,'b')
# plt.plot(dv2,'g')
# plt.hist(rt)
plt.show()


<h3>2. GPU Implementation of LCA</h3>

LCA kernels and initialization

In [None]:

# Declared as a device function: it returns a value and is called from inside
# kernels rather than launched from the host.
@cuda.jit(device=True)
def ReLU_cuda(input):
    """Rectifier for use inside CUDA kernels: ``input`` if positive, otherwise zero."""
    return input * (input > 0)

# For Dynamic Coherence
@cuda.jit
def LCA_kernel_for_delay(coherence, sp1, sp2, dg1, dg2, sigma, bound, ndt, leak, lateral_inhibition, decision, rt, rng_states):
    """One thread per trial: simulate two rectified, leaky, mutually inhibiting accumulators.

    Accumulator 1 is driven by ``ReLU(coherence * dg1)``, accumulator 2 by
    ``ReLU(-coherence * dg2)``; each receives its own normal sample from
    ``rng_states[tr]`` scaled by ``sigma`` at every step. The first accumulator to
    exceed ``bound[t]`` sets ``decision[tr]`` (+1 or -1) and ``rt[tr] = t + ndt``;
    entries of trials in which neither does are left untouched.

    NOTE(review): unlike the CPU LCA cell, no baseline input (I0) is added to the
    increments here; confirm this difference is intended.
    """
    tr = cuda.grid(1)
    # Only threads that map onto an existing trial row do any work.
    tr_in_bounds = (tr >= 0) and (tr <= (coherence.shape[0] - 1))
    if tr_in_bounds:
        dv1 = sp1   # First accumulator
        dv2 = sp2   # Second accumulator
        for t in range(coherence.shape[1]):
            # Two consecutive draws from this thread's RNG state: one per accumulator.
            diffusion_step1 = ReLU_cuda(coherence[tr,t]*dg1)  + (random.xoroshiro128p_normal_float32(rng_states, tr)*sigma)
            diffusion_step2 = ReLU_cuda(-coherence[tr,t]*dg2) + (random.xoroshiro128p_normal_float32(rng_states, tr)*sigma)
            # Both changes are computed from the previous values before either accumulator is updated.
            del_dv1 = diffusion_step1 - leak*dv1 - lateral_inhibition*dv2    # change in accumulator 1
            del_dv2 = diffusion_step2 - leak*dv2 - lateral_inhibition*dv1    # change in accumulator 2

            dv1 = ReLU_cuda(dv1 + del_dv1)           # update decision variable 1 (kept non-negative)
            dv2 = ReLU_cuda(dv2 + del_dv2)           # update decision variable 2 (kept non-negative)

            # Accumulator 1 is checked first, so it wins if both cross on the same step.
            if dv1 > bound[t]:
                decision[tr] =  1
                rt[tr] = t+ndt
                break
            if dv2 > bound[t]:
                decision[tr] =  -1
                rt[tr] = t+ndt
                break
                                
# Warm-up launch for LCA_kernel_for_delay on tiny inputs.
# Numba compiles a separate specialization for each argument-type signature, so
# the dummy arrays use the same dtypes as the real run (float32 coherence and
# bounds, float64 outputs); integer dummies would compile a kernel that is never reused.
sp1=0
sp2=0
dg1 = 10e-6
dg2 = 10e-6
sigma = 10e-3         # diffusion variability
I0 = 0.001            # baseline input
leak = 0              # leak
lateral_inhibition = 0     # lateral inhibition
ndt=0
coherence = cp.full((2, 2), 100, dtype=cp.float32)
bound = cp.ones(2, dtype=cp.float32)
decision = cp.full(coherence.shape[0], cp.nan)    # NaN = "no decision yet"
rt = cp.full(coherence.shape[0], cp.nan)
blockdim = (16)
griddim = (coherence.shape[0] // blockdim) + 1
rng_states = random.create_xoroshiro128p_states(griddim * blockdim, seed=1)    # one RNG state per launched thread
cuda.synchronize()
LCA_kernel_for_delay[griddim, blockdim](coherence, sp1, sp2, dg1, dg2, sigma, bound, ndt, leak, lateral_inhibition, decision, rt, rng_states)
cuda.synchronize()

Model Parameters and input

In [None]:
# Full-size LCA parameters and stimulus input for the timed GPU runs
coh = 100
sp1=0
sp2=0
dg1 = 10e-6
dg2 = 10e-6
sigma = 10e-3         # diffusion variability
I0 = 0.001            # baseline input
leak = 0              # leak
lateral_inhibition = 0     # lateral inhibition
ndt=0
bound=1

# Drop the previous coherence matrix first so its device memory can be reused.
if 'coherence' in globals():
    del coherence
# cp.full fills the array in one allocation; `cp.ones(...) * coh` would briefly
# hold two full-size matrices on the device.
coherence = cp.full((50000, 10000), coh, dtype=cp.float32)    # trials x time steps

bound = cp.full(10000, bound, dtype=cp.float32)               # one bound value per time step

Full Execution

In [None]:
%%timeit

# NaN-filled outputs: trials in which neither accumulator reaches the bound stay NaN.
decision = cp.full(coherence.shape[0], cp.nan)
rt = cp.full(coherence.shape[0], cp.nan)
blockdim = (16)
griddim = (coherence.shape[0] // blockdim) + 1
rng_states = random.create_xoroshiro128p_states(griddim * blockdim, seed=1)    # one RNG state per launched thread

cuda.synchronize()
LCA_kernel_for_delay[griddim, blockdim](coherence, sp1, sp2, dg1, dg2, sigma, bound, ndt, leak, lateral_inhibition, decision, rt, rng_states)
cuda.synchronize()

Batch Execution

In [None]:
def batch_simulation(model, coherence, sp1, sp2, dg1, dg2, sigma, bound, ndt, leak, lateral_inhibition, seed=None, batch_size=10000):
    """Run an LCA-style CUDA kernel over ``coherence`` in fixed-size batches of trials.

    NOTE: this reuses the name of the DDM ``batch_simulation`` defined earlier in
    the notebook and replaces it once this cell has run.

    Parameters
    ----------
    model : CUDA kernel
        Launched as ``model[griddim, blockdim](coherence_batch, sp1, sp2, dg1, dg2,
        sigma, bound, ndt, leak, lateral_inhibition, decision, rt, rng_states)``.
    coherence : 2-D device array
        One row per trial; rows are sliced into batches.
    sp1, sp2, dg1, dg2, sigma, bound, ndt, leak, lateral_inhibition
        Passed through to the kernel unchanged.
    seed : int, optional
        Seed for the per-thread RNG states; drawn at random when omitted.
    batch_size : int, optional
        Number of trials per kernel launch (default 10000).

    Returns
    -------
    decision_np, rt_np : numpy.ndarray
        Per-trial decisions and response times; NaN where no bound was reached.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    blockdim = int(128)
    griddim = (batch_size // blockdim) + 1
    if seed is None:
        seed = np.random.randint(0, np.iinfo(np.int32).max)
    # The RNG states are created once and reused by every batch.
    rng_states = random.create_xoroshiro128p_states(griddim * blockdim, seed=seed)

    decision_batches = []
    rt_batches = []
    # Stepping by batch_size visits every trial exactly once and never produces
    # an empty trailing batch when the trial count is a multiple of batch_size.
    for start in range(0, coherence.shape[0], batch_size):
        coherence_batch = coherence[start:start + batch_size]
        decision_cp = cp.full(coherence_batch.shape[0], cp.nan)    # NaN = undecided
        rt_cp = cp.full(coherence_batch.shape[0], cp.nan)
        cuda.synchronize()
        model[griddim, blockdim](coherence_batch, sp1, sp2, dg1, dg2, sigma, bound, ndt, leak, lateral_inhibition, decision_cp, rt_cp, rng_states);
        cuda.synchronize()
        decision_batches.append(cp.asnumpy(decision_cp))
        rt_batches.append(cp.asnumpy(rt_cp))

    # Join once at the end instead of re-copying the growing result per batch.
    decision_np = np.concatenate(decision_batches) if decision_batches else np.empty(0)
    rt_np = np.concatenate(rt_batches) if rt_batches else np.empty(0)
#     plt.hist(rt_np)
    return decision_np, rt_np

# Time the batched LCA simulation over the full coherence matrix.
# NOTE(review): %timeit presumably runs the statement in its own scope, so
# `decision` / `rt` may not be available afterwards (confirm before using them).
%timeit decision, rt =  batch_simulation(LCA_kernel_for_delay, coherence, sp1, sp2, dg1, dg2, sigma, bound, ndt, leak, lateral_inhibition)
    

<center><h3>Leaky-Competing Accumulator Model - Simulation Results</h3></center>

| Model | Number of trials | Simulation time |
| --- | --- | --- |
| CPU Simulation | 10K | 2.5 secs |
| GPU Simulation with internal diffusion process | 50K | 47.4 ms ± 388 µs |
| GPU Simulation with internal diffusion process with batch processing | 50K | 41.4 ms ± 654 µs |

# Race Model

In [None]:
# Race Model
''' 
Considering latent observation of the stimulus.
Mainly driven by diffusion variability (if considering 0 input in opposite direction)
Good estimate in case of motion energy in random dots which also have some opposite direction component.
'''

# Variables
coh = 52

sp1 = 0                   # starting point 1
sp2 = 0                   # starting point 2
dg1 = 10e-6               # drift gain 1
dg2 = 2*10e-6             # drift gain 2
sigma = 10e-3             # diffusion variability
I0 = 0.0002               # baseline input (defined but not used by this model)
dr1 = ReLU(coh)*dg1       # drift rate 1
dr2 = ReLU(-coh)*dg2      # drift rate 2
ndt = 100                 # Non-decision time (msec)
bound = 1                 # Decision-bound (defined here so the cell does not rely on earlier cells)
stop_time = 10000

nTrials = 1000
# Fresh result arrays for this model, one slot per trial; NaN marks a trial in
# which neither accumulator reached the bound.
decision = np.full(nTrials, np.nan)   # +1: accumulator 1 won, -1: accumulator 2 won
rt = np.full(nTrials, np.nan)         # step index of the bound crossing (msec)

diffusion_step1 = np.random.normal(loc=dr1, scale=sigma, size=(nTrials, stop_time))    # change in accumulator 1
diffusion_step2 = np.random.normal(loc=dr2, scale=sigma, size=(nTrials, stop_time))    # change in accumulator 2
for tr in range(nTrials):
    # Pre-allocated trajectories: index 0 is the starting point, index t + 1 the value after step t.
    dv1 = np.empty(stop_time + 1)       # Accumulator 1
    dv2 = np.empty(stop_time + 1)       # Accumulator 2
    dv1[0] = sp1
    dv2[0] = sp2
    for t in range(stop_time):
        dv1[t + 1] = ReLU(dv1[t]+diffusion_step1[tr,t])    # update decision variable 1
        dv2[t + 1] = ReLU(dv2[t]+diffusion_step2[tr,t])    # update decision variable 2
        # Test the freshly updated values; both are non-negative after ReLU.
        if dv1[t + 1] > bound:
            winner = 1
        elif dv2[t + 1] > bound:
            winner = -1
        else:
            continue
        decision[tr] = winner
        rt[tr] = t
        dv1, dv2 = dv1[:t + 2], dv2[:t + 2]    # keep only the simulated part
        break

rt =  rt+ndt

# plt.plot(dv1)
# plt.plot(dv2)
plt.hist(rt[~np.isnan(rt)])    # undecided (NaN) trials are left out of the histogram
plt.show()

# Latent Drift-Diffusion Model

In [None]:
# Latent DDM
''' 
Considering latent observation of the stimulus.
Mainly driven by diffusion variability (if considering 0 input in opposite direction)
Good estimate in case of motion energy in random dots also have some opposite direction motion.

Mathematically equivalent to running DDM with diffusion of sqrt(2)*sigma instead of sigma

'''

# Variables
coh = 52

sp1 = 0                   # starting point 1
sp2 = 0                   # starting point 2
dg1 = 10e-6               # drift gain 1
dg2 = 2*10e-6             # drift gain 2
sigma = 10e-3             # diffusion variability
I0 = 0.0002               # baseline input (defined but not used by this model)
dr1 = ReLU(coh)*dg1       # drift rate 1
dr2 = ReLU(-coh)*dg2      # drift rate 2
ndt = 100                 # Non-decision time (msec)
bound = 1                 # Decision-bound (defined here so the cell does not rely on earlier cells)
stop_time = 10000

nTrials = 1000
# Fresh result arrays for this model, one slot per trial; NaN marks a trial in
# which the difference of the accumulators never reached the bound.
decision = np.full(nTrials, np.nan)   # sign of (x1 - x2) at the crossing
rt = np.full(nTrials, np.nan)         # step index of the bound crossing (msec)

diffusion_step1 = np.random.normal(loc=dr1, scale=sigma, size=(nTrials, stop_time))    # change in accumulator 1
diffusion_step2 = np.random.normal(loc=dr2, scale=sigma, size=(nTrials, stop_time))    # change in accumulator 2
for tr in range(nTrials):
    # Pre-allocated trajectories: index 0 is the starting point, index t + 1 the value after step t.
    x1 = np.empty(stop_time + 1)       # Accumulator 1
    x2 = np.empty(stop_time + 1)       # Accumulator 2
    x1[0] = sp1
    x2[0] = sp2
    for t in range(stop_time):
        x1[t + 1] = x1[t]+diffusion_step1[tr,t]    # update decision variable 1
        x2[t + 1] = x2[t]+diffusion_step2[tr,t]    # update decision variable 2
        evidence = x1[t + 1] - x2[t + 1]           # decision variable: difference of the two accumulators
        if np.abs(evidence) > bound:    # test the freshly updated difference
            decision[tr] = np.sign(evidence)
            rt[tr] = t
            x1, x2 = x1[:t + 2], x2[:t + 2]    # keep only the simulated part
            break

rt =  rt+ndt

# plt.plot(x1,'b')
# plt.plot(x2,'g')
# plt.plot(dv,'r')
plt.hist(rt[~np.isnan(rt)])    # undecided (NaN) trials are left out of the histogram
plt.show()

# Working unused functions

In [None]:
                
# Multiply a LARGE matrix by a scalar on the GPU, in place
@cuda.jit
def matmult_cuda(matrix, scalar):
    """Scale ``matrix`` in place: each thread of the 2-D grid handles one element."""
    row, col = cuda.grid(2)
    # Threads that fall outside the matrix have nothing to do.
    if row < 0 or row >= matrix.shape[0]:
        return
    if col < 0 or col >= matrix.shape[1]:
        return
    matrix[row, col] = scalar * matrix[row, col]
# Initializing matmult_cuda function
test_mat = cp.array([[2,3,5],[4,5,5]],dtype=float32)
test_scalar = np.float32(3)
blockdim = (16, 16)
griddim = (test_mat.shape[0] // blockdim[0]) + 1, (test_mat.shape[1] // blockdim[1]) + 1
%time matmult_cuda[griddim, blockdim](test_mat, test_scalar)
        
                
        
# Memory allotment trick
# NOTE(review): this looks like an unfinished template. No @cuda.jit decorator is
# applied, none of the parameters are used, and the loop refers to `x`, `y` and
# `out`, which are not defined in this function; calling it as written will fail
# unless those names exist globally.
def ddm_kernel(coh,nTrials,pars,stop_time,decision, rt):
    tx = cuda.threadIdx.x # this is the unique thread ID within a 1D block
    ty = cuda.blockIdx.x  # Similarly, this is the unique block ID within the 1D grid

    block_size = cuda.blockDim.x  # number of threads per block
    grid_size = cuda.gridDim.x    # number of blocks in the grid
    
    start = tx + ty * block_size    # global index of this thread
    stride = block_size * grid_size # total number of threads in the grid

    # assuming x and y inputs are same length
    # Each thread starts at its own index and advances by the total thread count,
    # so the grid covers arrays longer than the number of threads.
    for i in range(start, x.shape[0], stride):
        out[i] = x[i] + y[i]


# For single Coherence
# NOTE: this reuses the name of the dynamic-bound `DDM_kernel` defined earlier in
# the notebook; running this cell replaces that kernel with this signature.
@cuda.jit
def DDM_kernel(nTrials,pars,stop_time, diffusion_step, decision, rt):
    """One thread per trial: accumulate pre-drawn steps against a fixed bound.

    ``pars`` holds ``[starting point, drift gain, sigma, non-decision time, bound]``;
    only the starting point, non-decision time and bound are read here, because
    the increments arrive ready-made in ``diffusion_step[tr, t]``. On a crossing
    the thread writes +1 / -1 to ``decision[tr]`` and ``t + ndt`` to ``rt[tr]``;
    entries of trials that never cross are left untouched.
    """
    sp = pars[0]             # starting point 1
    ndt = pars[3]            # Non-decision time (msec)
    bound = pars[4]          # decision bound

    tr = cuda.grid(1)
    tr_in_bounds = (tr >= 0) and (tr <= (nTrials - 1))

    if tr_in_bounds:
        dv = sp
        for t in range(stop_time):
            dv = dv + diffusion_step[tr,t]      # update decision variable
            if dv > bound or dv < -bound:       # checking if decision bound is reached
                decision[tr] =  2*(dv>0) - 1    # np.sign(dv) alternative for making decision based on where evidence is maximum
                rt[tr] = t+ndt
                break