# GPU Computing for Data Scientists
#### Using CUDA, Jupyter, PyCUDA, ArrayFire and Thrust


https://github.com/QuantScientist/Data-Science-ArrayFire-GPU

In [1]:
%reset -f
import pycuda
from pycuda import compiler
import pycuda.driver as drv
import pycuda.driver as cuda

# Make sure we have CUDA

In [2]:
# Initialise the CUDA driver API and list every visible device, so the
# notebook fails early if no GPU is available.
drv.init()
device_count = drv.Device.count()  # query once and reuse below
print("%d device(s) found." % device_count)
for ordinal in range(device_count):
    dev = drv.Device(ordinal)
    # Function-call form of print works on both Python 2 and Python 3.
    print("Device #%d: %s" % (ordinal, dev.name()))

# Bare expression: displays which pycuda.driver module is loaded.
drv

1 device(s) found.
Device #0: GeForce GTX 1080


<module 'pycuda.driver' from '/usr/local/lib/python2.7/dist-packages/pycuda/driver.pyc'>

## Simple element-wise multiplication on the GPU: compilation

In [3]:
import pycuda.autoinit
import numpy

from pycuda.compiler import SourceModule

# Number of float32 elements in each host array used by the demo.
ARR_SIZE=100000000

# CUDA C source for an element-wise multiply: dest[i] = a[i] * b[i].
#
# The element count is baked into the source as N_ELEMENTS so the kernel keeps
# its three-pointer signature while still being able to bounds-check.
# Each thread starts at its global linear index (flattened from the block and
# grid coordinates) and then advances by the total number of launched threads,
# so every element is processed exactly once regardless of the block/grid
# shape chosen at launch time, and no thread ever indexes past the arrays.
srcGPU = """
#define N_ELEMENTS %(n)d

__global__ void multGPU(float *dest, float *a, float *b)
{
  // Flatten the 3-D block and grid coordinates into linear ids.
  const size_t threadsPerBlock = (size_t)blockDim.x * blockDim.y * blockDim.z;
  const size_t threadInBlock =
      ((size_t)threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
  const size_t blockInGrid =
      ((size_t)blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;

  // Total number of threads in the launch = distance between the elements
  // handled by one thread.
  const size_t stride = threadsPerBlock * gridDim.x * gridDim.y * gridDim.z;

  for (size_t i = blockInGrid * threadsPerBlock + threadInBlock;
       i < N_ELEMENTS;
       i += stride)
  {
    dest[i] = a[i] * b[i];
  }
}
""" % {"n": ARR_SIZE}

# Compile the CUDA source; kernels are later looked up by name with
# srcGPUModule.get_function(...).
srcGPUModule = SourceModule(srcGPU)

print(srcGPUModule)

<pycuda.compiler.SourceModule object at 0x7f6bd40f0a10>


## Simple element-wise multiplication on the GPU: host memory allocation

In [4]:
# Host-side operands: two independent vectors of ARR_SIZE standard-normal
# samples, converted to float32 to match the `float *` kernel parameters.
a, b = (numpy.random.randn(ARR_SIZE).astype(numpy.float32) for _ in range(2))

# Zero-initialised host buffer with the same shape and dtype as `a`; it
# receives the result copied back from the device.
dest = numpy.zeros(a.shape, dtype=a.dtype)

## Simple element-wise multiplication on the GPU: execution

In [5]:
# Look up the compiled kernel by the name it has in the CUDA source.
multGPUFunc = srcGPUModule.get_function("multGPU")

print(multGPUFunc)

# Launch the kernel once.
# drv.In(x) copies the host array to the device before the call and
# drv.Out(x) copies the device result back into the host array afterwards,
# so no explicit device allocation is needed here.
# NOTE(review): this launch supplies 32*32 threads per block on a 32x32 grid
# (1,048,576 threads), far fewer than ARR_SIZE elements; whether the whole
# array is processed depends on how the kernel maps threads to elements.
multGPUFunc(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(32,32,1), grid=(32,32))

<pycuda._driver.Function object at 0x7f6bd4118dd0>


In [6]:
import timeit
n_iter = ARR_SIZE
rounds = 3  # number of repetitions timeit runs; the reported time is the total

# CPU baseline. It must time the same operation as the GPU kernel, which is an
# element-wise multiply (dest[i] = a[i] * b[i]), so use numpy.multiply.
numpy_seconds = timeit.timeit(lambda: numpy.multiply(a, b), number=rounds)

# Single formatted argument keeps the output identical on Python 2 and 3.
print("numpy %s" % numpy_seconds)

numpy 0.175907850266


In [7]:
# Time `rounds` GPU launches. Each call goes through drv.In / drv.Out, so the
# measurement includes the host-to-device and device-to-host copies of the
# full arrays, not just the kernel execution.
pycuda_seconds = timeit.timeit(
    lambda: multGPUFunc(drv.Out(dest), drv.In(a), drv.In(b),
                        block=(64, 16, 1), grid=(128, 128)),
    number=rounds)

# Single formatted argument keeps the output identical on Python 2 and 3.
print("pycuda %s" % pycuda_seconds)

pycuda 0.376569986343


In [8]:
# Explicit memory management example: allocate device memory by hand, copy a
# small matrix over, double it in place on the GPU, and copy it back.
a = numpy.random.randn(4,4)
a = a.astype(numpy.float32)
n_rows, n_cols = a.shape

# Raw device buffer large enough for the matrix, then host -> device copy.
a_gpu = cuda.mem_alloc(a.nbytes)

cuda.memcpy_htod(a_gpu, a)

mod = SourceModule("""
  __global__ void doublify(float *a)
  {
    // Row-major index of this thread's element. blockDim.x is the row width,
    // so the kernel follows the block shape chosen at launch time.
    int idx = threadIdx.x + threadIdx.y * blockDim.x;
    a[idx] *= 2;
  }
  """)

func = mod.get_function("doublify")
# One thread per matrix element: x spans the columns, y spans the rows.
func(a_gpu, block=(n_cols, n_rows, 1))

# Device -> host copy into a fresh array of the same shape and dtype.
a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)

# Sanity check: every element must be twice the original value.
numpy.testing.assert_allclose(a_doubled, 2 * a)
print(a_doubled)

[[-1.48978746 -0.04078278  2.82737327  0.75455797]
 [-3.2101264  -0.87261534  1.81067336 -2.88350916]
 [-3.06599188  2.20522308 -1.13825965  0.31716722]
 [-1.76300967  0.97310859 -2.0407064  -2.09832382]]
