In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload
# Reload all modules once now; use "%autoreload 2" instead to reload automatically before every cell so that changes in src get picked up
%autoreload

# %%capture --no-display
%matplotlib inline
%pylab inline

import numpy as np
from matplotlib import pyplot as plt
from IPython.display import clear_output

import numba
import cupy as cp
from numba import cuda, prange
from numba.cuda import random as cuda_random
from numba import jit, njit, vectorize
from numba.core.errors import NumbaPerformanceWarning
import warnings
warnings.simplefilter('ignore', category=NumbaPerformanceWarning)
import GPUtil


from src.DDM import *
from src.IAM import *
from src.dynamic_parameters import *



In [32]:
# Functions

def get_gpu_info():
    '''Report basic properties of the current CUDA device using Numba.

    Prints the compute capability, the number of Streaming Multiprocessors
    (SMs), the number of cores per SM and the total core count, followed by
    rule-of-thumb advice on choosing a kernel execution configuration.

    Returns:
    num_sm: Number of total Streaming Multiprocessors on GPU
    num_cores_per_sm: Number of cores per SM, or None when the device's
        compute capability is not present in the lookup table below
    '''
    from numba import cuda

    # Cores per SM keyed by (major, minor) compute capability.
    cc_cores_per_SM_dict = {
        (2,0) : 32,  (2,1) : 48,
        (3,0) : 192, (3,2) : 192, (3,5) : 192, (3,7) : 192,
        (5,0) : 128, (5,2) : 128, (5,3) : 128,
        (6,0) : 64,  (6,1) : 128, (6,2) : 128,
        (7,0) : 64,  (7,2) : 64,  (7,5) : 64,
        (8,0) : 64,  (8,6) : 128, (8,7) : 128, (8,9) : 128,
        (9,0) : 128
        }

    device = cuda.get_current_device()
    num_sm = device.MULTIPROCESSOR_COUNT
    # Normalise to a tuple so the dictionary lookup also works if the
    # capability is handed back as another sequence type.
    my_cc = tuple(device.compute_capability)
    num_cores_per_sm = cc_cores_per_SM_dict.get(my_cc)

    print("GPU compute capability: " , my_cc)
    print("GPU total number of Streaming Multiprocessors (SM): " , num_sm)
    if num_cores_per_sm is None:
        # Unknown architecture: report it instead of failing on None * int.
        print("GPU total number of cores per SMs: " , "unknown (compute capability not in lookup table)")
        print("total cores: " , "unknown")
    else:
        total_cores = num_cores_per_sm*num_sm
        print("GPU total number of cores per SMs: " , num_cores_per_sm)
        print("total cores: " , total_cores)
    print('''\n Deciding which execution configuration to use is not easy, and the choice should be driven by performance analysis. However, here are some basic rules to get started:
    - The number of blocks in the grid should be larger than the number of Streaming Multiprocessors on the GPU, typically 2 to 4 times larger.
    - The number of threads per block should be a multiple of 32, typically between 128 and 512. ''')

    return num_sm, num_cores_per_sm

num_sm, num_cores_per_sm = get_gpu_info()

# Print twice the per-SM core count rounded down to a multiple of 32 (the
# thread-count granularity recommended in the message printed above);
# presumably a starting point for threads per block -- TODO confirm.
# Guard against a missing core count instead of failing on `None // 32`.
if num_cores_per_sm is None:
    print("cores per SM unknown; skipping the derived multiple-of-32 value")
else:
    print(2*(num_cores_per_sm//32)*32)

GPU compute capability:  (6, 1)
GPU total number of Streaming Multiprocessors (SM):  20
GPU total number of cores per SMs:  128
total cores:  2560

 Deciding which execution configuration to use is not easy, and the choice should be driven by performance analysis. However, here are some basic rules to get started:
    - The number of blocks in the grid should be larger than the number of Streaming Multiprocessors on the GPU, typically 2 to 4 times larger.
    - The number of threads per block should be a multiple of 32, typically between 128 and 512. 
256


In [9]:
%reload_ext autoreload

num_choices = 2
num_trials = 3
num_samples = 2000

coherence = np.ones((num_trials,num_samples))*50     # 100
coherence[0,0:300] = -50
coherence[0,1000:1300] = 150
coherence[1,10:350] = -50
coherence[1,800:1350] = -150
coherence[2,10:350] = -100
coherence[2,350:700] = 120

starting_point = np.zeros(num_choices, dtype=np.float32)             
drift_offset = np.zeros(num_choices, dtype=np.float32)
drift_gain = np.float32(10e-5)             # drift gain
drift_variability = np.float32(0)      # diffusion variability
nondecision_time = np.float32(100)         # Non-decision time (msec)
decision_bound = 1
bound_rate = 0
bound_delay = 0
lateral_inhibition = 0.005
leak = 0.01
neural_ddm = 0.2
urgency_signal = False
# Dynamic time-dependent variables
stimulus = get_unsigned_coherence_matrix(coherence)
stimulus_cp= cp.asarray(stimulus)
decision_bound_cp = get_time_dependent_bound(decision_bound, bound_rate, bound_delay, stop_time=num_samples)
decision_bound = cp.asnumpy(decision_bound_cp)
drift_variability_cp = get_time_dependent_variability(drift_variability, time_coefficient=0, stop_time=num_samples)
drift_variability = cp.asnumpy(drift_variability_cp)
                
decision1, reaction_time1 = IAM_cpu_sim(stimulus, starting_point, drift_gain, drift_variability, drift_offset, decision_bound, nondecision_time, lateral_inhibition, leak, neural_ddm, urgency_signal)
decision2, reaction_time2 = IAM_gpu_sim(stimulus_cp, starting_point, drift_gain, drift_variability_cp, drift_offset, decision_bound_cp, nondecision_time, lateral_inhibition, leak, neural_ddm, urgency_signal)

print("\n", decision1, "\n", decision2)
print("\n", reaction_time1, "\n", reaction_time2)


 [0. 1. 0.] 
 [0. 1. 0.]

 [1169. 1016.  654.] 
 [1169. 1016.  654.]


In [7]:
%reload_ext autoreload

num_choices = 2
num_trials = 3
num_samples = 2000

coherence = np.ones((num_trials,num_samples))*50     # 100
coherence[0,0:300] = -50
coherence[0,1000:1300] = 150
coherence[1,10:350] = -50
coherence[1,800:1350] = -150
coherence[2,10:350] = -100
coherence[2,350:700] = 120

starting_point = 0 #np.array(np.zeros(1), dtype=float32)             
drift_offset = 0 #np.array(np.zeros(1), dtype=float32)
drift_gain = np.float32(5e-5)             # drift gain
drift_variability = np.float32(0)#10e-3)      # diffusion variability
nondecision_time = np.float32(100)         # Non-decision time (msec)
decision_bound = 1
bound_rate = 0
bound_delay = 0
lateral_inhibition = 0
leak = 0
neural_ddm = 0
urgency_signal = False
# Dynamic time-dependent variables
stimulus = get_unsigned_coherence_matrix(coherence)
stimulus_cp= cp.asarray(stimulus)
decision_bound_cp = get_time_dependent_bound(decision_bound, bound_rate, bound_delay, stop_time=num_samples)
decision_bound = cp.asnumpy(decision_bound_cp)
drift_variability_cp = get_time_dependent_variability(drift_variability, time_coefficient=0, stop_time=num_samples)
drift_variability = cp.asnumpy(drift_variability_cp)


decision1, reaction_time1 = DDM_cpu_sim(stimulus, starting_point, drift_gain, drift_variability, drift_offset, decision_bound, nondecision_time)
decision2, reaction_time2 = DDM_gpu_sim(stimulus_cp, starting_point, drift_gain, drift_variability_cp, drift_offset, decision_bound_cp, nondecision_time, urgency_signal)
    
print(decision1, decision2)
print(reaction_time1, reaction_time2)

[ 1. -1. -1.] [ 1. -1. -1.]
[1100. 1073.  315.] [1100. 1073.  315.]


True