# GPU Computing for Data Scientists
## 06
#### Using CUDA, Jupyter, PyCUDA, ArrayFire and Thrust


https://github.com/QuantScientist/Data-Science-ArrayFire-GPU

In [38]:
# !pip install pycuda
%reset -f
import numpy
import numpy as np

# imports
import numpy as np                     # numeric python lib
import matplotlib.image as mpimg       # reading images to numpy arrays

%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = (6, 6)      # setting default size of plots

import tensorflow as tf 
print("tensorflow:" + tf.__version__)
!set "KERAS_BACKEND=tensorflow"

import numpy as np
import matplotlib.pyplot as plt

import cv2
import numpy as np

import numpy as np
import matplotlib.pyplot as plot   # Library to plot
import matplotlib.cm as colormap   # Library to plot
import caffe
from caffe import layers as L
from caffe import params as P
caffe.set_device(0)
caffe.set_mode_gpu()
# http://christopher5106.github.io/deep/learning/2015/09/04/Deep-learning-tutorial-on-Caffe-Technology.html
import time

import multiprocessing
import threading
import sys
import os
import time
import gc
from multiprocessing import Process, Pipe, sharedctypes, Lock
from threading import Thread

MAXCPU=0
try:
    MAXCPU = multiprocessing.cpu_count()
except:
    MAXCPU = 0
print 'MAXCPU:' + str(MAXCPU)

In [39]:
from pycuda.compiler import SourceModule
import pycuda
from pycuda import compiler
import pycuda.driver as cuda
import pycuda.autoinit             # PyCuda autoinit
import pycuda.driver as cuda       # PyCuda In, Out helpers
import matplotlib.pyplot as plot   # Library to plot
import matplotlib.cm as colormap   # Library to plot
import numpy                       # Fast math library
import time
import pycuda.driver as drv
drv.init()
MAX_THREADS_PER_BLOCK=512 # backward compatible
print("%d device(s) found." % cuda.Device.count())           

# All GPU attributes
for ordinal in range(drv.Device.count()):
    dev = drv.Device(ordinal)    
    print "Device #%d: %s" % (ordinal, dev.name())   
    
    atts = [(str(att), value) 
            for att, value in dev.get_attributes().iteritems()]    
    atts.sort    
    print type(atts)

# MAX_THREADS_PER_BLOCK=filter(lambda s: s == 'MAX_THREADS_PER_BLOCK' , atts) 
# print 'MAX_THREADS_PER_BLOCK:' + str(MAX_THREADS_PER_BLOCK)
    
print cuda

# Get the max possible therads per block limit
# MAX_THREADS_PER_BLOCK

1 device(s) found.
Device #0: GeForce GTX 1080
<type 'list'>
MAX_THREADS_PER_BLOCK:[]
<module 'pycuda.driver' from '/usr/local/lib/python2.7/dist-packages/pycuda/driver.pyc'>


# Blocks  &  Threads

  MAX_BLOCK_DIM_X: 1024
  MAX_BLOCK_DIM_Y: 1024
  MAX_BLOCK_DIM_Z: 64
  MAX_GRID_DIM_X: 2147483647
  MAX_GRID_DIM_Y: 65535
  MAX_GRID_DIM_Z: 65535
  MAX_PITCH: 2147483647
  MAX_REGISTERS_PER_BLOCK: 65536
  MAX_REGISTERS_PER_MULTIPROCESSOR: 65536
  MAX_SHARED_MEMORY_PER_BLOCK: 49152
  MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: 98304
  MAX_THREADS_PER_BLOCK: 1024
  MAX_THREADS_PER_MULTIPROCESSOR: 2048
  MEMORY_CLOCK_RATE: 5005000
  MULTIPROCESSOR_COUNT: 20
  MULTI_GPU_BOARD: 0
  MULTI_GPU_BOARD_GROUP_ID: 0
  PCI_BUS_ID: 1
  PCI_DEVICE_ID: 0
  PCI_DOMAIN_ID: 0
  STREAM_PRIORITIES_SUPPORTED: 1
  SURFACE_ALIGNMENT: 512
  TCC_DRIVER: 0
  TEXTURE_ALIGNMENT: 512
  TEXTURE_PITCH_ALIGNMENT: 32
  TOTAL_CONSTANT_MEMORY: 65536
  UNIFIED_ADDRESSING: 1
  WARP_SIZE: 32
  
-  Maximum thread size for GPU is dependent on GPU, but normally 512.
-  Threads per block should be a multiple of 32.
-  Block and Grid Size is dependent on the image.
-  This example uses a 256x256 pixel image. A 2D block (16x16) and a 1D grid (256,1) is used

In [40]:
#Kernel text
kernel = """
 
    __global__ void bw( float *inIm, int check ){
 
        int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ;
 
        if(idx *3 < check*3)
        {
        int val = 0.21 *inIm[idx*3] + 0.71*inIm[idx*3+1] + 0.07 * inIm[idx*3+2];        
        //int val = (inIm[idx*3] + inIm[idx*3+1] + inIm[idx*3+2])/3.0;
 
        inIm[idx*3]= val;
        inIm[idx*3+1]= val;
        inIm[idx*3+2]= val;
        }
    }         
    """
 
#Compile and get kernel function
mod = SourceModule(kernel)
print mod
func = mod.get_function("bw")
print func

<pycuda.compiler.SourceModule object at 0x7fefb972e5d0>
<pycuda._driver.Function object at 0x7fefb99c2950>


In [41]:
import PIL
from PIL import Image as PILImage

def filterImageOnGPU(inPath, filterFunc):              
    im = PILImage.open(inPath)            
    px = numpy.array(im).astype(numpy.float32)
        
    print 'Size:' + str(im.size)
    print 'Pixels:' + str (im.size[0]*im.size[1])
    
    d_px = cuda.mem_alloc(px.nbytes)
    cuda.memcpy_htod(d_px, px)
      
    BLOCK_SIZE = 1024
    block = (BLOCK_SIZE,1,1)
    print ('Block:' + str (block))
    totalPixels = numpy.int32(im.size[0]*im.size[1])
    print ('TotalPixels:' + str (totalPixels))
    gridRounded=int(im.size[0]*im.size[1]/BLOCK_SIZE)+1
    print ('GridRounded:' + str (gridRounded))
    print ('BLOCK_SIZE * GridRounded:' + str (gridRounded*BLOCK_SIZE))
    grid = (gridRounded,1,1)
    print ('Grid:' + str (grid))
   
    filterFunc(d_px,totalPixels, block=block,grid = grid)
   
    bwPx = numpy.empty_like(px)
    cuda.memcpy_dtoh(bwPx, d_px)
    # On monochrome images, Pixels are uint8 [0,255].    
#     numpy.clip(bwPx, 0, 255, out=bwPx)
#     bwPx = bwPx.astype('uint8')
    bwPx = (numpy.uint8(bwPx))       
    pil_im = PILImage.fromarray(bwPx,mode ="RGB")           
    return pil_im

In [42]:
figure = plot.figure()
figure.set_size_inches(10, 10)
filterImageOnGPU('images/nematode.jpg',func)

Size:(10700, 7600)
Pixels:81320000
Block:(1024, 1, 1)
TotalPixels:81320000
GridRounded:79415
BLOCK_SIZE * GridRounded:81320960
Grid:(79415, 1, 1)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


<matplotlib.figure.Figure at 0x7fefb972e2d0>