<a href="https://colab.research.google.com/github/vladimiralencar/DeepLearning_LANA/blob/master/pyCUDA/pyCUDA_Jupyter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyCUDA - Nvidia CUDA em GPUs com python

In [1]:
!pip install pycuda

Collecting pycuda
[?25l  Downloading https://files.pythonhosted.org/packages/58/33/cced4891eddd1a3ac561ff99081019fddc7838a07cace272c941e3c2f915/pycuda-2018.1.1.tar.gz (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 16.2MB/s 
[?25hCollecting pytools>=2011.2 (from pycuda)
[?25l  Downloading https://files.pythonhosted.org/packages/90/6a/7b706e4730db0ee5724c677cceafcac1bc9710c61612442a689e7b0aa5c4/pytools-2018.5.2.tar.gz (58kB)
[K    100% |████████████████████████████████| 61kB 22.9MB/s 
Collecting appdirs>=1.4.0 (from pycuda)
  Downloading https://files.pythonhosted.org/packages/56/eb/810e700ed1349edde4cbdc1b2a21e28cdf115f9faf263f6bbf8447c1abf3/appdirs-1.4.3-py2.py3-none-any.whl
Collecting mako (from pycuda)
[?25l  Downloading https://files.pythonhosted.org/packages/eb/f3/67579bb486517c0d49547f9697e36582cd19dafb5df9e687ed8e22de57fa/Mako-1.0.7.tar.gz (564kB)
[K    100% |████████████████████████████████| 573kB 27.2MB/s 
Building wheels for collected packages: pycuda, pyto

Status da GPU

In [2]:
!nvidia-smi

Mon Jan 14 23:25:49 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.44                 Driver Version: 396.44                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [3]:
!ls

 cuda_files.zip   exemplo3.cu	 exemplo4_out   exemplo6.cu	   exemplo7.cu
 exemplo1.cu	  exemplo3_out	 exemplo5.cu    exemplo6_out	   exemplo7_out
 exemplo2.cu	  exemplo4.cu	 exemplo5_out  'exemplo7 (1).cu'   sample_data


## Para importar arquivos

In [4]:
from google.colab import files
uploaded = files.upload()

Saving 01-check-env.py to 01-check-env.py
Saving 02-PyCudaWorkflow.py to 02-PyCudaWorkflow.py
Saving 03-PyCudaMatrixManipulation.py to 03-PyCudaMatrixManipulation.py
Saving 04-PyCudaGPUArray.py to 04-PyCudaGPUArray.py
Saving 05-PyCudaElementWise.py to 05-PyCudaElementWise.py
Saving 06-PyCudaReductionKernel.py to 06-PyCudaReductionKernel.py
Saving Duvida-Pycuda-01.txt to Duvida-Pycuda-01.txt
Saving t2est-02-PyCudaWorkflow-test.py to t2est-02-PyCudaWorkflow-test.py
Saving test-02-PyCudaWorkflow-test.py to test-02-PyCudaWorkflow-test.py


## Checando o ambiente GPU

In [12]:
# Informações da GPU
import pycuda.driver as drv 

drv.init() 

print ("%d Dispositivo(s) encontrados." % drv.Device.count())

for ordinal in range(drv.Device.count()): 
       dev = drv.Device(ordinal) 
       print ("Dispositivo #%d: %s" % (ordinal, dev.name())) 
       print (" Compute Capability: %d.%d" % dev.compute_capability())     
       print (" Total Memory: %s KB" % (dev.total_memory()//(1024))) 

1 Dispositivo(s) encontrados.
Dispositivo #0: Tesla K80
 Compute Capability: 3.7
 Total Memory: 11715776 KB


In [8]:
!ls *.py

01-check-env.py			05-PyCudaElementWise.py
02-PyCudaWorkflow.py		06-PyCudaReductionKernel.py
03-PyCudaMatrixManipulation.py	t2est-02-PyCudaWorkflow-test.py
04-PyCudaGPUArray.py		test-02-PyCudaWorkflow-test.py


In [13]:
# Multiplicação de Matrizes Usando a GPU

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy

a = numpy.random.randn(5,5)
a = a.astype(numpy.float32)

a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

mod = SourceModule("""
  __global__ void doubles_matrix(float *a)
  {
    int idx = threadIdx.x + threadIdx.y*4;
    a[idx] *= 2;
  }
  """)

func = mod.get_function("doubles_matrix")

func(a_gpu, block=(5,5,1))

a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)

print ("Matriz Original")
print (a)
print ("Matriz Multiplicada Por 2 Usando PyCUDA")
print (a_doubled)


Matriz Original
[[-0.91079015 -0.8891651  -1.0901822  -1.1925074  -0.11522503]
 [-0.24382235 -0.83982086 -0.6174249  -0.82852226 -1.3399014 ]
 [-0.13524824 -2.486508    0.5964713   1.7961265  -0.1433859 ]
 [ 0.84577096  0.17217064  0.26081115 -1.7617484  -0.3193688 ]
 [-0.10955688  0.33049542  0.08368162  0.5058635   1.8535402 ]]
Matriz Multiplicada Por 2 Usando PyCUDA
[[-1.8215803  -1.7783302  -2.1803644  -2.3850148  -0.23045006]
 [-0.4876447  -1.6796417  -1.2348498  -1.6570445  -2.679803  ]
 [-0.2704965  -4.973016    1.1929426   3.592253   -0.2867718 ]
 [ 1.6915419   0.34434128  0.5216223  -3.5234969  -0.6387376 ]
 [-0.21911375  0.33049542  0.08368162  0.5058635   1.8535402 ]]


In [17]:
# Gerenciamento de Memória na GPU através da muliplicação de 2 matrizes

# Pacotes
import numpy as np
from pycuda import driver, compiler, gpuarray, tools

# Inicializando o device
import pycuda.autoinit

# Kernel
kernel_code_template = """
__global__ void MatrixMulKernel(float *a, float *b, float *c)
{
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    float Pvalue = 0;
    for (int k = 0; k < %(MATRIX_SIZE)s; ++k) {
        float Aelement = a[ty * %(MATRIX_SIZE)s + k];
        float Belement = b[k * %(MATRIX_SIZE)s + tx];
        Pvalue += Aelement * Belement;
    }

    c[ty * %(MATRIX_SIZE)s + tx] = Pvalue;
}
"""

# Define o tamanho da Matriz
MATRIX_SIZE = 5

# Variáveis para armazenar as matrizes na memória do host
a_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
c_cpu = np.dot(a_cpu, b_cpu)

# Variáveis para armazenar as matrizes na memória do device
a_gpu = gpuarray.to_gpu(a_cpu) 
b_gpu = gpuarray.to_gpu(b_cpu)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

# Define o kernel
kernel_code = kernel_code_template % {
    'MATRIX_SIZE': MATRIX_SIZE 
    }

# Compila o kernel
mod = compiler.SourceModule(kernel_code)

# Obtém o kernel
matrixmul = mod.get_function("MatrixMulKernel")

# Executa o kernel
matrixmul(
    a_gpu, b_gpu, 
    c_gpu, 
    block = (MATRIX_SIZE, MATRIX_SIZE, 1),
    )

# Imprime os resultados
print ("-" * 80)
print ("Matriz A (GPU):")
print (a_gpu.get())

print ("-" * 80)
print ("Matriz B (GPU):")
print (b_gpu.get())

print ("-" * 80)
print ("Matriz C (GPU):")
print (c_gpu.get())

print ("-" * 80)
print ("Diferença CPU-GPU:")
print (c_cpu - c_gpu.get())

np.allclose(c_cpu, c_gpu.get())


--------------------------------------------------------------------------------
Matriz A (GPU):
[[ 0.3604919   1.1128123  -0.9120744  -0.2843168  -0.55064857]
 [ 1.6783663   0.06910207  0.30600306  0.35123047  0.5714042 ]
 [-1.8354139  -0.13443327 -0.11898965  1.3090595   0.5153561 ]
 [-2.0327392  -0.47590256  0.06097374 -2.3049285  -1.6842628 ]
 [ 1.3557538  -1.0107814   0.36772087  1.38075    -0.15153128]]
--------------------------------------------------------------------------------
Matriz B (GPU):
[[ 0.18019308 -0.23353235 -0.6655699   0.2204949   0.79046947]
 [ 0.9329438  -1.0420737   0.8548512  -0.62169874  0.68219674]
 [ 0.15263858 -0.625713   -1.7614852  -0.834755    1.8488528 ]
 [-0.31875882 -1.4452021  -0.8125798  -0.778752    0.34576753]
 [ 0.18867064 -0.58592045 -0.2399493   1.092761    0.67266935]]
--------------------------------------------------------------------------------
Matriz C (GPU):
[[ 0.9506691   0.06040929  2.6811197  -0.23130372 -1.1108884 ]
 [ 0.4094556  

True

In [27]:
# GPU Array
# Funciona de forma similar ao np.ndarray do Numpy
# GPU Array suporta diversas operações aritméticas e pode ser usado em conjunto com pycuda.cumath e pycuda.curandom

import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy

# Definindo a matriz em tempo de execução - gpuarray -> operações na GPU
a_gpu = gpuarray.to_gpu(numpy.random.randn(5,5).astype(numpy.float32))

# Multiplicando a matriz na GPU
a_doubled = (2 * a_gpu).get()

# Imprimindo so resultados
print ("Matriz Original")
print (a_gpu.get())
print ("Matriz multiplicada por 2 após a execução com GPUARRAY")
print (a_doubled)

Matriz Original
[[-0.68389815 -0.9540187   0.62444663  2.2092094   0.6055998 ]
 [ 2.1918674  -2.0914237  -1.5909268   1.1299127  -0.14859967]
 [-0.18609966  0.793407    0.90570015 -0.4340446   1.9414983 ]
 [-1.7000875   1.1402414   0.8981155   1.5661582  -1.6731738 ]
 [-0.98103285 -0.86361957 -1.1050426  -0.82644194  1.7413609 ]]
Matriz multiplicada por 2 após a execução com GPUARRAY
[[-1.3677963  -1.9080374   1.2488933   4.418419    1.2111996 ]
 [ 4.3837347  -4.1828475  -3.1818535   2.2598255  -0.29719934]
 [-0.37219933  1.586814    1.8114003  -0.8680892   3.8829966 ]
 [-3.400175    2.2804828   1.796231    3.1323164  -3.3463476 ]
 [-1.9620657  -1.7272391  -2.2100852  -1.6528839   3.4827218 ]]


In [28]:
# Avaliando Expressões Element-wise
# Avaliando a combinação linear entre 2 vetores

import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.curandom import rand as curand
from pycuda.elementwise import ElementwiseKernel
import numpy.linalg as la

# Matrizes
input_vector_a = curand((50,))
input_vector_b = curand((50,))

# Coeficientes
mult_coefficient_a = 2
mult_coefficient_b = 5

# Kernel
# Combinação Linear = 2a + 5b
linear_combination = ElementwiseKernel(
        "float a, float *x, float b, float *y, float *c",
        "c[i] = a*x[i] + b*y[i]",
        "linear_combination")

# Variável para receber o resultado da operação
linear_combination_result = gpuarray.empty_like(input_vector_a)

# Execução do kernel
linear_combination(mult_coefficient_a, input_vector_a, mult_coefficient_b, input_vector_b, linear_combination_result)

# Imprime os resultados
print ("Vetor A =")
print (input_vector_a)

print ("Vetor B = ")
print (input_vector_b)

print ("Vetor Resultante C = ")
print (linear_combination_result)

print ("Verificando o resultado, checando a diferença entre o vetor C e a combinação linear de A e B")
print ("C - (%sA + %sB) = "%(mult_coefficient_a,mult_coefficient_b))
print (linear_combination_result - (mult_coefficient_a*input_vector_a + mult_coefficient_b*input_vector_b))
assert la.norm((linear_combination_result - (mult_coefficient_a*input_vector_a + mult_coefficient_b*input_vector_b)).get()) < 1e-5

Vetor A =
[0.9299976  0.57159555 0.6747105  0.35152113 0.42484605 0.2478653
 0.28940392 0.5270712  0.41606745 0.48860577 0.32973623 0.60531074
 0.37160522 0.49076557 0.2574233  0.75812644 0.14752856 0.85217303
 0.81258714 0.03948139 0.8952462  0.98310745 0.37579942 0.04873587
 0.37918943 0.84297776 0.86008066 0.17635828 0.54952157 0.8074612
 0.8917616  0.01910386 0.32837296 0.0960414  0.23743396 0.94870305
 0.5522556  0.5548405  0.33031994 0.83856946 0.09172963 0.7755765
 0.0759598  0.7508158  0.12523359 0.2862492  0.0158942  0.83963037
 0.0863947  0.25151792]
Vetor B = 
[0.96609336 0.48443323 0.71880895 0.28619775 0.5227978  0.82229006
 0.97751075 0.5487958  0.4755591  0.9365577  0.29246634 0.3210364
 0.95645505 0.19344777 0.9341336  0.03502631 0.6869635  0.3317518
 0.18612684 0.13323092 0.8401832  0.31316194 0.90463746 0.25292233
 0.58972275 0.32619864 0.9309445  0.9536835  0.3777082  0.36830986
 0.24655311 0.97965175 0.72053516 0.4027994  0.36097062 0.7576021
 0.45439163 0.26408187 

In [29]:
# Operações de MapReduce em Paralelo na GPU

# Pacotes
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.reduction import ReductionKernel

# Comprimento do vetor
vector_length = 400

# Vetores A e B
input_vector_a = gpuarray.arange(vector_length, dtype = numpy.int)
input_vector_b = gpuarray.arange(vector_length, dtype = numpy.int)

# Operação de redução em paralelo
dot_product = ReductionKernel(numpy.int,
                       arguments = "int *x, int *y",
                       map_expr = "x[i]*y[i]",
                       reduce_expr = "a+b", 
                       neutral = "0")

# Execução do kernel
dot_product = dot_product(input_vector_a, input_vector_b).get()

# Imprime os resultados
print("Matriz A")
print(input_vector_a)

print("Matriz B")
print(input_vector_b)

print("Resultado do Produto A * B")
print(dot_product)


Matriz A
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 2

In [25]:
!ls *.py

01-check-env.py			05-PyCudaElementWise.py
02-PyCudaWorkflow.py		06-PyCudaReductionKernel.py
03-PyCudaMatrixManipulation.py	t2est-02-PyCudaWorkflow-test.py
04-PyCudaGPUArray.py		test-02-PyCudaWorkflow-test.py


In [31]:
%%bash
rm -f pycuda_files.zip
zip -r pycuda_files.zip . -i *.py

  adding: t2est-02-PyCudaWorkflow-test.py (deflated 52%)
  adding: 04-PyCudaGPUArray.py (deflated 44%)
  adding: 02-PyCudaWorkflow.py (deflated 44%)
  adding: 01-check-env.py (deflated 43%)
  adding: test-02-PyCudaWorkflow-test.py (deflated 44%)
  adding: 05-PyCudaElementWise.py (deflated 63%)
  adding: 03-PyCudaMatrixManipulation.py (deflated 60%)
  adding: 06-PyCudaReductionKernel.py (deflated 57%)


## Download a file

In [0]:
from google.colab import files
files.download('pycuda_files.zip') 

In [33]:
!ls

 01-check-env.py		  exemplo4_out
 02-PyCudaWorkflow.py		  exemplo5.cu
 03-PyCudaMatrixManipulation.py   exemplo5_out
 04-PyCudaGPUArray.py		  exemplo6.cu
 05-PyCudaElementWise.py	  exemplo6_out
 06-PyCudaReductionKernel.py	 'exemplo7 (1).cu'
 cuda_files.zip			  exemplo7.cu
 Duvida-Pycuda-01.txt		  exemplo7_out
 exemplo1.cu			  pycuda_files.zip
 exemplo2.cu			  sample_data
 exemplo3.cu			  t2est-02-PyCudaWorkflow-test.py
 exemplo3_out			  test-02-PyCudaWorkflow-test.py
 exemplo4.cu


** Depois de ** executar a célula acima, você pode baixar o arquivo zip [here](cuda_files.zip)