<a href="https://colab.research.google.com/github/tonystz/gitpod/blob/main/test_pycuda_workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to CUDA and PyCUDA

In [None]:
!pip install pycuda # install cuda
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

In [2]:
!nvidia-smi

Mon Mar 13 08:27:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P0    30W /  70W |    103MiB / 15360MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import pycuda
import pycuda.driver as drv
drv.init()

print('CUDA device query (PyCUDA version) \n')

print('Detected {} CUDA Capable device(s) \n'.format(drv.Device.count()))

for i in range(drv.Device.count()):
    
    gpu_device = drv.Device(i)
    print('Device {}: {}'.format( i, gpu_device.name() )) 
    compute_capability = float( '%d.%d' % gpu_device.compute_capability() )
    print('\t Compute Capability: {}'.format(compute_capability))
    print('\t Total Memory: {} megabytes'.format(gpu_device.total_memory()//(1024**2)))


    # The following will give us all remaining device attributes as seen 
    # in the original deviceQuery.
    # We set up a dictionary as such so that we can easily index
    # the values using a string descriptor.
    
    device_attributes_tuples = iter(gpu_device.get_attributes().items()) 
    device_attributes = {}

        
    for k, v in device_attributes_tuples:
        device_attributes[str(k)] = v
        # print(f'{k}->{v}')
    # continue
    num_mp = device_attributes['MULTIPROCESSOR_COUNT']
    
    # Cores per multiprocessor is not reported by the GPU!  
    # We must use a lookup table based on compute capability.
    # See the following:
    # http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
    
    cuda_cores_per_mp = { 5.0 : 128, 5.1 : 128, 5.2 : 128, 6.0 : 64, 6.1 : 128, 7.5 : 128}[compute_capability]
    
    print('\t ({}) Multiprocessors, ({}) CUDA Cores / Multiprocessor: {} CUDA Cores'.format(num_mp, cuda_cores_per_mp, num_mp*cuda_cores_per_mp))
    
    device_attributes.pop('MULTIPROCESSOR_COUNT')
    
    for k in list(device_attributes.keys()):
        print('\t {}: {}'.format(k, device_attributes[k]))


In [None]:
import numpy

a = numpy.array([[1,1,1,1],[1,1,1,1],[1,1,1,1],[1,1,1,1]], dtype=numpy.float32)

In [None]:
a_gpu = cuda.mem_alloc(a.nbytes)

In [None]:
cuda.memcpy_htod(a_gpu, a)


In [None]:
mod = SourceModule("""
  __global__ void doublify(float *a)
  {
    int idx = threadIdx.x + threadIdx.y*4;
    a[idx] *= 2;
    printf("loca is %zu, %f\\n", idx,a[idx]);
  }
  """)

##pycuda index test

In [62]:
%%writefile a.py
#!python 
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np

mod = SourceModule("""
    #include <stdio.h>

    __global__ void say_hi(int *g_pcnt)
    { 
      int l_count = 0;
      __shared__ int g_count;
      l_count +=1;

      __syncthreads();
      g_count +=1;

      *g_pcnt +=22;

      printf("I am threadIdx:[%d[%d]  of block:[%d][%d] block size:[%d][%d][%d]  grid size:[%d][%d]:\\n", threadIdx.x, threadIdx.y, blockIdx.x,blockIdx.y,blockDim.x,blockDim.y,blockDim.z, gridDim.x,gridDim.y);
      
      int idx = threadIdx.x + blockIdx.x * blockDim.x;
      int idy = threadIdx.y + blockIdx.y * blockDim.y;
      printf("thread id: [%d][%d]   -> l_count=%d ,g_count=%d\\n",idx,idy,l_count,g_count);
    }
    """)

func = mod.get_function("say_hi")
acnt= np.array([0],dtype=np.int32)
print('original pointer array:',acnt)

acnt_gpu=gpuarray.to_gpu(acnt)
func(acnt_gpu,block=(3,2,1),grid=(2,1,1))
print('modify pointer array:',acnt_gpu.get())


Overwriting a.py


In [63]:
!python a.py

original pointer array: [0]
I am threadIdx:[0[0]  of block:[1][0] block size:[3][2][1]  grid size:[2][1]:
I am threadIdx:[1[0]  of block:[1][0] block size:[3][2][1]  grid size:[2][1]:
I am threadIdx:[2[0]  of block:[1][0] block size:[3][2][1]  grid size:[2][1]:
I am threadIdx:[0[1]  of block:[1][0] block size:[3][2][1]  grid size:[2][1]:
I am threadIdx:[1[1]  of block:[1][0] block size:[3][2][1]  grid size:[2][1]:
I am threadIdx:[2[1]  of block:[1][0] block size:[3][2][1]  grid size:[2][1]:
I am threadIdx:[0[0]  of block:[0][0] block size:[3][2][1]  grid size:[2][1]:
I am threadIdx:[1[0]  of block:[0][0] block size:[3][2][1]  grid size:[2][1]:
I am threadIdx:[2[0]  of block:[0][0] block size:[3][2][1]  grid size:[2][1]:
I am threadIdx:[0[1]  of block:[0][0] block size:[3][2][1]  grid size:[2][1]:
I am threadIdx:[1[1]  of block:[0][0] block size:[3][2][1]  grid size:[2][1]:
I am threadIdx:[2[1]  of block:[0][0] block size:[3][2][1]  grid size:[2][1]:
thread id: [3][0]   -> l_count=1 ,g_

In [None]:
%%cu
#include <stdio.h>

void helloCPU()
{
  printf("Hello from the CPU.\n");
}

/*
 * The addition of `__global__` signifies that this function
 * should be launced on the GPU.
 */

__global__ void helloGPU()
{
  printf("Hello from the GPU.\n");
}

int main()
{
   helloCPU();


  /*
   * Add an execution configuration with the <<<...>>> syntax
   * will launch this function as a kernel on the GPU.
   */

  helloGPU<<<1, 1>>>();

  /*
   * `cudaDeviceSynchronize` will block the CPU stream until
   * all GPU kernels have completed.
   */

  cudaDeviceSynchronize();
}

UsageError: Cell magic `%%cu` not found.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
func = mod.get_function("doublify")
print('host:',a.shape)
from pycuda import gpuarray
import pandas as pd
print(pd.DataFrame(a))
a_on_gpu= gpuarray.to_gpu(a)
func(a_gpu, block=(4,4,1))
print(pd.DataFrame(a_on_gpu.get()))

host: (4, 4)
     0    1    2    3
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0
     0    1    2    3
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0


  globals().clear()


In [None]:
a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print(a_doubled.shape)
print(a_doubled)
print(a)

(4, 4)
[[-1.3746789  -1.6419895   3.3047218  -1.1505861 ]
 [ 2.1979356   1.8518921  -1.9868276  -1.7164422 ]
 [ 0.14977352  1.058711    0.2419031  -0.44884723]
 [-3.113357    0.11188176  0.32294306 -4.2692833 ]]
[[-0.6873394  -0.82099473  1.6523609  -0.57529306]
 [ 1.0989678   0.92594606 -0.9934138  -0.8582211 ]
 [ 0.07488676  0.5293555   0.12095155 -0.22442362]
 [-1.5566785   0.05594088  0.16147153 -2.1346416 ]]


In [None]:
b = numpy.random.randn(4,4)
b = b.astype(numpy.float32)
c = numpy.random.randn(4,4)
c = c.astype(numpy.float32)

In [None]:
mod2 = SourceModule("""
  __global__ void add2(float *a, float *b)
  {
    int idx = threadIdx.x + threadIdx.y*4;
    a[idx] += b[idx];
  }
  """)

In [None]:
b_gpu = cuda.mem_alloc(b.nbytes)
c_gpu = cuda.mem_alloc(c.nbytes)

cuda.memcpy_htod(b_gpu, b)
cuda.memcpy_htod(c_gpu, c)


In [None]:
func = mod2.get_function("add2")
func(b_gpu,c_gpu, block=(4,4,1))

In [None]:
added = numpy.empty_like(b)
cuda.memcpy_dtoh(added, b_gpu)
print(added)
print(b)
print(c)

# Exercises

1. Write a cuda kernel to find the elementwise square of a matrix
2. Write a cuda kernel to find a matrix, which when added to the given matrix results in every element being equal to zero
3. Write a cuda kernel to multiply two matrices:
    1. Assume square matrices, with dimensions < 1024
    2. Assume square matrices, with dimensions > 1024
    3. Assume non-square matrices, with dimensions > 1024

In [None]:
1. 