# PyCUDA basics.

In [1]:
# to install, execute in a cell: %install_ext https://raw.github.com/minrk/ipython_extensions/master/nbtoc.py
%load_ext nbtoc
%nbtoc

## Versions

In [4]:
#%load_ext version_information
#%version_information numpy, scipy, matplotlib, sympy, pycuda

In [5]:
import pycuda
import numpy as np

In [6]:
pycuda.VERSION

(2016, 1, 2)

In [7]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2016 NVIDIA Corporation
Built on Tue_Jan_10_13:22:03_CST_2017
Cuda compilation tools, release 8.0, V8.0.61


## Exploring your GPU device.

Listing devices:

In [8]:
from pycuda import autoinit
from pycuda.tools import DeviceData

In [9]:
specs = DeviceData()

In [10]:
print 'Max threads pero block = ',specs.max_threads

Max threads pero block =  1024


In [11]:
print 'Warp size            =',specs.warp_size
print 'Warps per MP         =', specs.warps_per_mp
print 'Thread Blocks per MP =', specs.thread_blocks_per_mp
print 'Registers            =', specs.registers
print 'Shared memory        =', specs.shared_memory
print 'Granularity ??       =', specs.smem_granularity

Warp size            = 32
Warps per MP         = 48
Thread Blocks per MP = 8
Registers            = 65536
Shared memory        = 49152
Granularity ??       = 32


Other way

In [12]:
import pycuda.driver as drv

In [13]:
drv.init()

In [14]:
drv.get_version()

(8, 0, 0)

In [15]:
devn = drv.Device.count()
print 'Localized GPUs =',devn

Localized GPUs = 4


In [16]:
devices = []
for i in range(devn):
    devices.append(drv.Device(i))

All you want to know about your GPU, but you're afraid to ask!

In [17]:
for sp in devices:
    print 'Name = ',sp.name()
    print 'PCI Bus = ',sp.pci_bus_id()
    print 'Compute Capability = ',sp.compute_capability()
    print 'Total Memory = ',sp.total_memory()/(2.**20) , 'MBytes'
    attr = sp.get_attributes()
    for j in range(len(attr.items())):
        print attr.items()[j]#,'Bytes (when apply)'
    print '------------------'
    print '------------------'

Name =  GeForce GTX 1080
PCI Bus =  0000:4B:00.0
Compute Capability =  (6, 1)
Total Memory =  8114.0 MBytes
(pycuda._driver.device_attribute.MAX_THREADS_PER_BLOCK, 1024)
(pycuda._driver.device_attribute.MAX_BLOCK_DIM_X, 1024)
(pycuda._driver.device_attribute.MAX_BLOCK_DIM_Y, 1024)
(pycuda._driver.device_attribute.MAX_BLOCK_DIM_Z, 64)
(pycuda._driver.device_attribute.MAX_GRID_DIM_X, 2147483647)
(pycuda._driver.device_attribute.MAX_GRID_DIM_Y, 65535)
(pycuda._driver.device_attribute.MAX_GRID_DIM_Z, 65535)
(pycuda._driver.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK, 49152)
(pycuda._driver.device_attribute.TOTAL_CONSTANT_MEMORY, 65536)
(pycuda._driver.device_attribute.WARP_SIZE, 32)
(pycuda._driver.device_attribute.MAX_PITCH, 2147483647)
(pycuda._driver.device_attribute.MAX_REGISTERS_PER_BLOCK, 65536)
(pycuda._driver.device_attribute.CLOCK_RATE, 1847500)
(pycuda._driver.device_attribute.TEXTURE_ALIGNMENT, 512)
(pycuda._driver.device_attribute.GPU_OVERLAP, 1)
(pycuda._driver.device_attribu

MAX_THREADS_PER_BLOCK, 1024

For example for a 3D mesh (less optimal), we only have avaiable $$8\times 8\times 8 = 512 \,simetric$$ 
 $$8\times 8\times 16 = 1024 \,cilindrical$$
block size per dimension = 8 or 16.
In 2D case the optimal value is:
$$32\times32 = 1024$$
In last case $$1024$$


MAX_THREADS_PER_MULTIPROCESSOR, $1536 = 3*2^9$

If we can take this literally, we can process in one processor about 3 meshes of $8\times8\times8$, or three blocks of 3D meshes. With this result, we can evaluate the eficience comparing cilindrical and simetric performance


### Now your device has ..

In [18]:
drv.mem_get_info()[0]/(2.**20),'MB of Free Memory',drv.mem_get_info()[1]/(2.**20),'MB Total Memory'

(8001.0, 'MB of Free Memory', 8114.0, 'MB Total Memory')

Let's think in array sizes. For example a float of 4 bytes length:

In [17]:
print 'Linear max:', drv.mem_get_info()[0]/(4*8)
print '2D max:', np.sqrt(drv.mem_get_info()[0]/(4*8))
print '3D max:', np.power(drv.mem_get_info()[0]/(4*8),1./3.)

Linear max: 32349952
2D max: 5687.7018206
3D max: 318.633338212


## GPU-ARRAY STRUCTURE

In [18]:
import pycuda.gpuarray as gpuarray

In [19]:
a = np.random.rand(10,20,30)

In [20]:
a.shape

(10, 20, 30)

In [21]:
a_gpu = gpuarray.to_gpu(a)

In [22]:
a_gpu.gpudata

<pycuda._driver.DeviceAllocation at 0x4c25280>

In [23]:
psi1_h = np.zeros( (1,10,100), dtype=np.complex64)

In [24]:
psi1_h

array([[[ 0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
          0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  

In [25]:
b=psi1_h.real

In [26]:
b

array([[[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.