<a href="https://colab.research.google.com/github/vellamike/311/blob/master/pycuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install pycuda # install PyCUDA, the Python bindings for CUDA (not CUDA itself)
import pycuda.driver as cuda
# Imported only for its side effect: nothing below references `pycuda.autoinit` by name.
# NOTE(review): per the PyCUDA docs this sets up the CUDA context the driver calls need.
import pycuda.autoinit
from pycuda.compiler import SourceModule



In [0]:
import numpy

# Host-side input: a 4x4 matrix of samples from the standard normal distribution.
a = numpy.random.standard_normal(size=(4, 4))

In [0]:
# Convert to single precision: the kernels in this notebook take `float *` arguments.
a = a.astype(dtype=numpy.float32)

In [0]:
# Allocate a device buffer with exactly as many bytes as the host array `a` occupies.
a_gpu = cuda.mem_alloc(a.nbytes)

In [0]:
# Copy the contents of the host array `a` into the device buffer `a_gpu`
# (htod = host to device).
cuda.memcpy_htod(a_gpu, a)


In [0]:
# Compile a kernel that doubles every element of a float array in place.
# Each thread handles exactly one element. The flat, row-major index is built
# from the thread's (x, y) position with blockDim.x as the row stride, so the
# indexing follows the block shape used at launch instead of assuming 4 columns.
# Limitations: blockIdx is not used, so only a single-block launch is supported,
# and there is no bounds check, so the block must not contain more threads than
# the array has elements.
mod = SourceModule("""
  __global__ void doublify(float *a)
  {
    // Row-major offset; blockDim.x is the number of threads per row.
    int idx = threadIdx.x + threadIdx.y * blockDim.x;
    a[idx] *= 2;
  }
  """)

In [0]:
# Look up the compiled `doublify` kernel and run it on the device buffer.
# A single block of 4x4 threads gives one thread per matrix element; the grid
# is spelled out as one block, which is what PyCUDA uses when it is omitted.
# The kernel modifies the buffer in place, so re-running this cell doubles again.
func = mod.get_function("doublify")
func(a_gpu, block=(4, 4, 1), grid=(1, 1))

In [0]:
# Fetch the result from the device into a new host array with the same shape and
# dtype as `a`, then print it followed by the unchanged host original.
a_doubled = numpy.empty(a.shape, dtype=a.dtype)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print(a_doubled, a, sep="\n")

[[ 3.429391    0.37923738  2.2865975  -3.8205628 ]
 [-0.12789133 -3.5567102   0.63542295  2.184241  ]
 [-0.57918423 -4.6991134   0.7864673  -3.087758  ]
 [-0.5750209   1.9147984   1.6053301   1.6458256 ]]
[[ 1.7146955   0.18961869  1.1432987  -1.9102814 ]
 [-0.06394567 -1.7783551   0.31771147  1.0921205 ]
 [-0.28959212 -2.3495567   0.39323366 -1.543879  ]
 [-0.28751045  0.9573992   0.80266505  0.8229128 ]]


In [0]:
# Two independent 4x4 standard-normal matrices, stored in single precision to
# match the `float *` parameters of the kernel they are passed to.
b = numpy.random.standard_normal(size=(4, 4)).astype(numpy.float32)
c = numpy.random.standard_normal(size=(4, 4)).astype(numpy.float32)

In [0]:
# Compile a kernel that adds array `b` into array `a` element by element
# (a[i] += b[i]); `b` is only read.
# Each thread handles exactly one element. The flat, row-major index uses
# blockDim.x as the row stride, so the indexing follows the block shape used at
# launch instead of assuming 4 columns.
# Limitations: blockIdx is not used, so only a single-block launch is supported,
# and there is no bounds check, so the block must not contain more threads than
# the arrays have elements.
mod2 = SourceModule("""
  __global__ void add2(float *a, float *b)
  {
    // Row-major offset; blockDim.x is the number of threads per row.
    int idx = threadIdx.x + threadIdx.y * blockDim.x;
    a[idx] += b[idx];
  }
  """)

In [0]:
# For each host matrix: allocate a device buffer of matching size, then copy
# the host data into it.
b_gpu = cuda.mem_alloc(b.nbytes)
cuda.memcpy_htod(b_gpu, b)

c_gpu = cuda.mem_alloc(c.nbytes)
cuda.memcpy_htod(c_gpu, c)


In [0]:
# Look up the compiled `add2` kernel and launch it with one 4x4 block of threads
# (one thread per element); the single-block grid is written out explicitly.
# The kernel accumulates into its first argument, so `b_gpu` ends up holding
# b + c while `c_gpu` is only read. Re-running this cell adds c again.
func = mod2.get_function("add2")
func(b_gpu, c_gpu, block=(4, 4, 1), grid=(1, 1))

In [0]:
# Fetch the summed matrix from the device into a new host array shaped like `b`,
# then print the sum followed by the two host-side operands.
added = numpy.empty(b.shape, dtype=b.dtype)
cuda.memcpy_dtoh(added, b_gpu)
print(added, b, c, sep="\n")

[[ 1.2008601  -2.0707388  -1.638124   -0.8436059 ]
 [-0.27846655  1.9869852   0.4241285   1.6201097 ]
 [-0.81453604  1.3625945   0.29871333 -0.45156989]
 [ 0.23972344  1.8589917   0.45649573  1.2006731 ]]
[[ 0.27993715 -0.6367077  -1.3909823   0.86861134]
 [-0.5009937   1.2655689  -0.00545789  2.012207  ]
 [-0.3106417   0.3997737   0.8599566  -0.46498844]
 [ 1.0907929   0.7727295   0.8979871   1.1682794 ]]
[[ 0.920923   -1.434031   -0.24714178 -1.7122172 ]
 [ 0.22252718  0.7214164   0.4295864  -0.39209738]
 [-0.5038943   0.96282077 -0.5612433   0.01341857]
 [-0.85106945  1.0862622  -0.4414914   0.03239373]]


# Exercises

1. Write a CUDA kernel to compute the elementwise square of a matrix.
2. Write a CUDA kernel to compute the matrix which, when added to a given matrix, makes every element equal to zero.
3. Write a CUDA kernel to multiply two matrices. How does it scale with matrix size?

In [0]:
1. 