<a href="https://colab.research.google.com/github/tonystz/cuda/blob/main/test_pycuda_workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to CUDA and PyCUDA

In [4]:
# Show host RAM usage in human-readable units, then list the CPU entries
# reported in /proc/cpuinfo (one "processor" line per logical CPU).
!free -h && grep processor /proc/cpuinfo

              total        used        free      shared  buff/cache   available
Mem:           12Gi       809Mi       8.7Gi       1.0Mi       3.2Gi        11Gi
Swap:            0B          0B          0B
processor	: 0
processor	: 1


In [None]:
!pip install pycuda # install cuda
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

## query info

In [None]:
# Print the driver's view of the attached GPU(s): driver/CUDA version,
# device name, memory usage and utilisation.
!nvidia-smi

Tue Mar 28 09:36:00 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    26W /  70W |    103MiB / 15360MiB |      3%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import pycuda
import pycuda.driver as drv
drv.init()

# CUDA cores per multiprocessor are not reported by the GPU, so we use a
# lookup table keyed by (major, minor) compute capability.
# See the following:
# http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
# nGpuArchCoresPerSM: https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_cuda.h#L652
# Integer tuples are used as keys instead of floats so the lookup never
# depends on floating-point formatting.
CUDA_CORES_PER_MP = {
    (3, 0): 192, (3, 2): 192, (3, 5): 192, (3, 7): 192,
    (5, 0): 128, (5, 1): 128, (5, 2): 128, (5, 3): 128,
    (6, 0): 64, (6, 1): 128, (6, 2): 128,
    (7, 0): 64, (7, 2): 64, (7, 5): 64,
    (8, 0): 64, (8, 6): 128, (8, 7): 128, (8, 9): 128,
    (9, 0): 128,
}

print('CUDA device query (PyCUDA version) \n')

print('Detected {} CUDA Capable device(s) \n'.format(drv.Device.count()))

for i in range(drv.Device.count()):

    gpu_device = drv.Device(i)
    print('Device {}: {}'.format( i, gpu_device.name() ))
    cc_key = tuple(gpu_device.compute_capability())
    compute_capability = float( '%d.%d' % cc_key )
    print('\t Compute Capability: {}'.format(compute_capability))
    print('\t Total Memory: {} megabytes'.format(gpu_device.total_memory()//(1024**2)))

    # Collect all remaining device attributes, as seen in the original
    # deviceQuery, in a dictionary indexed by a string descriptor.
    device_attributes = {str(k): v for k, v in gpu_device.get_attributes().items()}

    # Printed on its own line below, so take it out of the generic listing.
    num_mp = device_attributes.pop('MULTIPROCESSOR_COUNT')

    cuda_cores_per_mp = CUDA_CORES_PER_MP.get(cc_key)
    if cuda_cores_per_mp is None:
        # Unknown architecture: report what we can instead of raising KeyError.
        print('\t ({}) Multiprocessors, unknown CUDA Cores / Multiprocessor for compute capability {}'.format(num_mp, compute_capability))
    else:
        print('\t ({}) Multiprocessors, ({}) CUDA Cores / Multiprocessor: {} CUDA Cores'.format(num_mp, cuda_cores_per_mp, num_mp*cuda_cores_per_mp))

    for k, v in device_attributes.items():
        print('\t {}: {}'.format(k, v))


CUDA device query (PyCUDA version) 

Detected 1 CUDA Capable device(s) 

Device 0: Tesla T4
	 Compute Capability: 7.5
	 Total Memory: 15101 megabytes
	 (40) Multiprocessors, (64) CUDA Cores / Multiprocessor: 2560 CUDA Cores
	 ASYNC_ENGINE_COUNT: 3
	 CAN_MAP_HOST_MEMORY: 1
	 CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: 1
	 CLOCK_RATE: 1590000
	 COMPUTE_CAPABILITY_MAJOR: 7
	 COMPUTE_CAPABILITY_MINOR: 5
	 COMPUTE_MODE: DEFAULT
	 COMPUTE_PREEMPTION_SUPPORTED: 1
	 CONCURRENT_KERNELS: 1
	 CONCURRENT_MANAGED_ACCESS: 1
	 DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: 0
	 ECC_ENABLED: 1
	 GENERIC_COMPRESSION_SUPPORTED: 0
	 GLOBAL_L1_CACHE_SUPPORTED: 1
	 GLOBAL_MEMORY_BUS_WIDTH: 256
	 GPU_OVERLAP: 1
	 HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: 1
	 HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: 0
	 HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: 0
	 HOST_NATIVE_ATOMIC_SUPPORTED: 0
	 INTEGRATED: 0
	 KERNEL_EXEC_TIMEOUT: 0
	 L2_CACHE_SIZE: 4194304
	 LOCAL_L1_CACHE_SUPPORTED: 1
	 MANAGED_MEMORY: 1
	 MAXIMUM_SURFACE1D_LAYERED_LAYE

## PyCUDA index test

In [None]:
%%writefile a.py
#!python 
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np

mod = SourceModule("""
    #include <stdio.h>

    __global__ void say_hi(int *out_gpu)
    { 
      int idx = threadIdx.x + blockIdx.x * blockDim.x;
      int idy = threadIdx.y + blockIdx.y * blockDim.y;
      __shared__ double out_buf[1024];
      int l_count = 0;
      __shared__ int g_count;
      l_count +=1;

      printf("I am threadIdx:[%d[%d]  of block:[%d][%d] block size:[%d][%d][%d]  grid size:[%d][%d]:\\n", threadIdx.x, threadIdx.y, blockIdx.x,blockIdx.y,blockDim.x,blockDim.y,blockDim.z, gridDim.x,gridDim.y);
      

      out_buf[idx]=idy;

      __syncthreads();
      g_count +=1;
      //how to pass out the modified array data
      __syncthreads();
      out_gpu[idx]=2;
      __syncthreads();
      printf("thread id: [%d][%d]   -> l_count=%d ,g_count=%d *out_gpu=%d\\n",idx,idy,l_count,g_count,*out_gpu);
    }
    """)

func = mod.get_function("say_hi")


data=np.zeros(64, dtype=np.int32)
print('shape:',data.shape)
data_gpu = gpuarray.to_gpu(data)
out_gpu = gpuarray.empty_like(data_gpu)
func(out_gpu,block=(3,1,1),grid=(1,1,1))
print('modify pointer array:',out_gpu.get())


Writing a.py


In [None]:
!python a.py



  mod = SourceModule("""
shape: (64,)
I am threadIdx:[0[0]  of block:[0][0] block size:[3][1][1]  grid size:[1][1]:
I am threadIdx:[1[0]  of block:[0][0] block size:[3][1][1]  grid size:[1][1]:
I am threadIdx:[2[0]  of block:[0][0] block size:[3][1][1]  grid size:[1][1]:
thread id: [0][0]   -> l_count=1 ,g_count=0 *out_gpu=2
thread id: [1][0]   -> l_count=1 ,g_count=0 *out_gpu=2
thread id: [2][0]   -> l_count=1 ,g_count=0 *out_gpu=2
modify pointer array: [2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## test 1d array index: one row, colSize threads

In [None]:
%%writefile a1colThead.py
#!python 
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np

mod = SourceModule("""
    #include <stdio.h>

    __global__ void say_hi(int *out_gpu)
    { 
      int idx = threadIdx.x + blockIdx.x * blockDim.x;
      int idy = threadIdx.y + blockIdx.y * blockDim.y;
      const int colSize=6;

      printf("I am threadIdx:[%d][%d]  of block:[%d][%d] block size:[%d][%d][%d]  grid size:[%d][%d]  idx=%d, idy=%d\\n", \
      threadIdx.x, threadIdx.y, blockIdx.x,blockIdx.y,blockDim.x,blockDim.y,blockDim.z, gridDim.x,gridDim.y,idx,idy);
      printf("perThadloop[%d]: %d\\n",idx,out_gpu[idx]);
      for(int i=0;i<colSize;i++){
          printf("one row/col thread:idx=%d, %d\\n",idx,out_gpu[idx*colSize+i]);
          //printf("%d\\n",out_gpu[idx]);
      }
      //__syncthreads();
  
    }
    """)

func = mod.get_function("say_hi")


data=np.array([21,2,3,38,9,10], dtype=np.int32)
#data=data.reshape(2,3)
print('shape:',data.shape,data)
data_gpu = gpuarray.to_gpu(data)
func(data_gpu,block=(6,1,1),grid=(1,1,1))
#print('modify pointer array:',data_gpu.get())

Overwriting a1colThead.py


In [None]:
!python a1colThead.py

shape: (6,) [21  2  3 38  9 10]
I am threadIdx:[0][0]  of block:[0][0] block size:[6][1][1]  grid size:[1][1]  idx=0, idy=0
I am threadIdx:[1][0]  of block:[0][0] block size:[6][1][1]  grid size:[1][1]  idx=1, idy=0
I am threadIdx:[2][0]  of block:[0][0] block size:[6][1][1]  grid size:[1][1]  idx=2, idy=0
I am threadIdx:[3][0]  of block:[0][0] block size:[6][1][1]  grid size:[1][1]  idx=3, idy=0
I am threadIdx:[4][0]  of block:[0][0] block size:[6][1][1]  grid size:[1][1]  idx=4, idy=0
I am threadIdx:[5][0]  of block:[0][0] block size:[6][1][1]  grid size:[1][1]  idx=5, idy=0
perThadloop[0]: 21
perThadloop[1]: 2
perThadloop[2]: 3
perThadloop[3]: 38
perThadloop[4]: 9
perThadloop[5]: 10
one row/col thread:idx=0, 21
one row/col thread:idx=1, 0
one row/col thread:idx=2, 0
one row/col thread:idx=3, 0
one row/col thread:idx=4, 0
one row/col thread:idx=5, 0
one row/col thread:idx=0, 2
one row/col thread:idx=1, 0
one row/col thread:idx=2, 0
one row/col thread:idx=3, 0
one row/col thread:idx=4

## test 1d array index: one row, one thread

In [None]:
%%writefile a1.py
#!python 
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np

mod = SourceModule("""
    #include <stdio.h>

    __global__ void say_hi(int *out_gpu)
    { 
      int idx = threadIdx.x + blockIdx.x * blockDim.x;
      int idy = threadIdx.y + blockIdx.y * blockDim.y;
      const int colSize=6;

      printf("I am threadIdx:[%d][%d]  of block:[%d][%d] block size:[%d][%d][%d]  grid size:[%d][%d]  idx=%d, idy=%d\\n", \
      threadIdx.x, threadIdx.y, blockIdx.x,blockIdx.y,blockDim.x,blockDim.y,blockDim.z, gridDim.x,gridDim.y,idx,idy);
      for(int i=0;i<colSize;i++){
          printf("one row/one thread:idx=%d, %d\\n",idx,out_gpu[idx*colSize+i]);
          //printf("%d\\n",out_gpu[idx]);
      }
      //__syncthreads();
  
    }
    """)

func = mod.get_function("say_hi")


data=np.array([21,2,3,38,9,10], dtype=np.int32)
#data=data.reshape(2,3)
print('shape:',data.shape,data)
data_gpu = gpuarray.to_gpu(data)
func(data_gpu,block=(1,1,1),grid=(1,1,1))
#print('modify pointer array:',data_gpu.get())

Overwriting a1.py


In [None]:
!python a1.py

shape: (6,) [21  2  3 38  9 10]
I am threadIdx:[0][0]  of block:[0][0] block size:[1][1][1]  grid size:[1][1]  idx=0, idy=0
one row/one thread:idx=0, 21
one row/one thread:idx=0, 2
one row/one thread:idx=0, 3
one row/one thread:idx=0, 38
one row/one thread:idx=0, 9
one row/one thread:idx=0, 10


## [grid] test 1d array index: one row, one thread

In [None]:
%%writefile ga1.py
#!python 
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np

mod = SourceModule("""
    #include <stdio.h>

    __global__ void say_hi(int *out_gpu)
    { 
      int idx = threadIdx.x + blockIdx.x * blockDim.x;
      int idy = threadIdx.y + blockIdx.y * blockDim.y;
      const int colSize=6;

      printf("I am threadIdx:[%d][%d]  of block:[%d][%d] block size:[%d][%d][%d]  grid size:[%d][%d]  idx=%d, idy=%d\\n", \
      threadIdx.x, threadIdx.y, blockIdx.x,blockIdx.y,blockDim.x,blockDim.y,blockDim.z, gridDim.x,gridDim.y,idx,idy);
      for(int i=0;i<colSize;i++){
          printf("one row/one thread:idx=%d, %d\\n",idx,out_gpu[idx*colSize+i]);
          //printf("%d\\n",out_gpu[idx]);
      }
      //__syncthreads();
  
    }
    """)

func = mod.get_function("say_hi")


data=np.array([21,2,3,38,9,10], dtype=np.int32)
#data=data.reshape(2,3)
print('shape:',data.shape,data)
data_gpu = gpuarray.to_gpu(data)
func(data_gpu,block=(1,1,1),grid=(2,1,1))
#print('modify pointer array:',data_gpu.get())

Overwriting ga1.py


In [None]:
!python ga1.py

shape: (6,) [21  2  3 38  9 10]
I am threadIdx:[0][0]  of block:[0][0] block size:[1][1][1]  grid size:[2][1]  idx=0, idy=0
I am threadIdx:[0][0]  of block:[1][0] block size:[1][1][1]  grid size:[2][1]  idx=1, idy=0
one row/one thread:idx=0, 21
one row/one thread:idx=1, 0
one row/one thread:idx=0, 2
one row/one thread:idx=1, 0
one row/one thread:idx=0, 3
one row/one thread:idx=1, 0
one row/one thread:idx=0, 38
one row/one thread:idx=1, 0
one row/one thread:idx=0, 9
one row/one thread:idx=1, 0
one row/one thread:idx=0, 10
one row/one thread:idx=1, 0


## [grid][Maxblock=1024] test 1d array index: one row, one thread

In [None]:
%%writefile mga1.py
#!python 
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np

mod = SourceModule("""
    #include <stdio.h>

    __global__ void say_hi(int *out_gpu)
    { 
      int idx = threadIdx.x + blockIdx.x * blockDim.x;
      int idy = threadIdx.y + blockIdx.y * blockDim.y;
      const int colSize=6;
      //__shared__ int ot=[2][1024];
      printf("I am threadIdx:[%d][%d]  of block:[%d][%d] block size:[%d][%d][%d]  grid size:[%d][%d]  idx=%d, idy=%d\\n", \
      threadIdx.x, threadIdx.y, blockIdx.x,blockIdx.y,blockDim.x,blockDim.y,blockDim.z, gridDim.x,gridDim.y,idx,idy);
      /*for(int i=0;i<colSize;i++){
          printf("one row/one thread:idx=%d, %d\\n",idx,out_gpu[idx*colSize+i]);
          //printf("%d\\n",out_gpu[idx]);
      }*/
      //__syncthreads();
  
    }
    """)

func = mod.get_function("say_hi")


data=np.array([21,2,3,38,9,10], dtype=np.int32)
#data=data.reshape(2,3)
print('shape:',data.shape,data)
data_gpu = gpuarray.to_gpu(data)
func(data_gpu,block=(1024,1,1),grid=(2,1,1))
#print('modify pointer array:',data_gpu.get())

Overwriting mga1.py


In [None]:
!python mga1.py

shape: (6,) [21  2  3 38  9 10]
I am threadIdx:[928][0]  of block:[1][0] block size:[1024][1][1]  grid size:[2][1]  idx=1952, idy=0
I am threadIdx:[929][0]  of block:[1][0] block size:[1024][1][1]  grid size:[2][1]  idx=1953, idy=0
I am threadIdx:[930][0]  of block:[1][0] block size:[1024][1][1]  grid size:[2][1]  idx=1954, idy=0
I am threadIdx:[931][0]  of block:[1][0] block size:[1024][1][1]  grid size:[2][1]  idx=1955, idy=0
I am threadIdx:[932][0]  of block:[1][0] block size:[1024][1][1]  grid size:[2][1]  idx=1956, idy=0
I am threadIdx:[933][0]  of block:[1][0] block size:[1024][1][1]  grid size:[2][1]  idx=1957, idy=0
I am threadIdx:[934][0]  of block:[1][0] block size:[1024][1][1]  grid size:[2][1]  idx=1958, idy=0
I am threadIdx:[935][0]  of block:[1][0] block size:[1024][1][1]  grid size:[2][1]  idx=1959, idy=0
I am threadIdx:[936][0]  of block:[1][0] block size:[1024][1][1]  grid size:[2][1]  idx=1960, idy=0
I am threadIdx:[937][0]  of block:[1][0] block size:[1024][1][1]  gr

## test 2d array index: one row, colSize threads

In [None]:
%%writefile a2colSizeThread.py
#!python 
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np

mod = SourceModule("""
    #include <stdio.h>

    __global__ void say_hi(int *out_gpu)
    { 
      int idx = threadIdx.x + blockIdx.x * blockDim.x;
      int idy = threadIdx.y + blockIdx.y * blockDim.y;

      printf("I am threadIdx:[%d][%d]  of block:[%d][%d] block size:[%d][%d][%d]  grid size:[%d][%d]  idx=%d, idy=%d\\n", \
      threadIdx.x, threadIdx.y, blockIdx.x,blockIdx.y,blockDim.x,blockDim.y,blockDim.z, gridDim.x,gridDim.y,idx,idy);
      
      printf("perThadloop[%d][%d]: %d\\n",idx,idy,out_gpu[idx*3+idy]);

      for(int i=0;i<3;i++){
          printf("idx=%d, %d\\n",idx,out_gpu[idx*3+i]);
          //printf("%d\\n",out_gpu[idx]);
      }
      //__syncthreads();
  
    }
    """)

func = mod.get_function("say_hi")


data=np.array([21,2,3,38,9,10], dtype=np.int32)
data=data.reshape(2,3)
print('shape:',data.shape,data)
data_gpu = gpuarray.to_gpu(data)
func(data_gpu,block=(2,3,1),grid=(1,1,1))
#print('modify pointer array:',data_gpu.get())

Overwriting a2colSizeThread.py


In [None]:
!python a2colSizeThread.py

shape: (2, 3) [[21  2  3]
 [38  9 10]]
I am threadIdx:[0][0]  of block:[0][0] block size:[2][3][1]  grid size:[1][1]  idx=0, idy=0
I am threadIdx:[1][0]  of block:[0][0] block size:[2][3][1]  grid size:[1][1]  idx=1, idy=0
I am threadIdx:[0][1]  of block:[0][0] block size:[2][3][1]  grid size:[1][1]  idx=0, idy=1
I am threadIdx:[1][1]  of block:[0][0] block size:[2][3][1]  grid size:[1][1]  idx=1, idy=1
I am threadIdx:[0][2]  of block:[0][0] block size:[2][3][1]  grid size:[1][1]  idx=0, idy=2
I am threadIdx:[1][2]  of block:[0][0] block size:[2][3][1]  grid size:[1][1]  idx=1, idy=2
perThadloop[0][0]: 21
perThadloop[1][0]: 38
perThadloop[0][1]: 2
perThadloop[1][1]: 9
perThadloop[0][2]: 3
perThadloop[1][2]: 10
idx=0, 21
idx=1, 38
idx=0, 21
idx=1, 38
idx=0, 21
idx=1, 38
idx=0, 2
idx=1, 9
idx=0, 2
idx=1, 9
idx=0, 2
idx=1, 9
idx=0, 3
idx=1, 10
idx=0, 3
idx=1, 10
idx=0, 3
idx=1, 10


## test 2d array index: one row, one thread

In [None]:
%%writefile a2.py
#!python 
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np

mod = SourceModule("""
    #include <stdio.h>

    __global__ void say_hi(int *out_gpu)
    { 
      int idx = threadIdx.x + blockIdx.x * blockDim.x;
      int idy = threadIdx.y + blockIdx.y * blockDim.y;

      printf("I am threadIdx:[%d][%d]  of block:[%d][%d] block size:[%d][%d][%d]  grid size:[%d][%d]  idx=%d, idy=%d\\n", \
      threadIdx.x, threadIdx.y, blockIdx.x,blockIdx.y,blockDim.x,blockDim.y,blockDim.z, gridDim.x,gridDim.y,idx,idy);
      for(int i=0;i<3;i++){
          printf("idx=%d, %d\\n",idx,out_gpu[idx*3+i]);
          //printf("%d\\n",out_gpu[idx]);
      }
      //__syncthreads();
  
    }
    """)

func = mod.get_function("say_hi")


data=np.array([21,2,3,38,9,10], dtype=np.int32)
data=data.reshape(2,3)
print('shape:',data.shape,data)
data_gpu = gpuarray.to_gpu(data)
func(data_gpu,block=(2,1,1),grid=(1,1,1))
#print('modify pointer array:',data_gpu.get())

Overwriting a2.py


In [None]:
!python a2.py

shape: (2, 3) [[21  2  3]
 [38  9 10]]
I am threadIdx:[0][0]  of block:[0][0] block size:[2][1][1]  grid size:[1][1]  idx=0, idy=0
I am threadIdx:[1][0]  of block:[0][0] block size:[2][1][1]  grid size:[1][1]  idx=1, idy=0
idx=0, 21
idx=1, 38
idx=0, 2
idx=1, 9
idx=0, 3
idx=1, 10


In [None]:
import numpy as np

# Six int32 values shown first as a 2x3 row-major view, then as the flat
# vector they are stored as.
data = np.array([21, 2, 3, 38, 9, 10], dtype=np.int32)
for view in (data.reshape(2, 3), data):
    print(view)

[[21  2  3]
 [38  9 10]]
[21  2  3 38  9 10]


## Run naive_prefix.py

In [None]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda import gpuarray
from pycuda.compiler import SourceModule
from time import time
# This is a naive parallel prefix-sum (inclusive scan) kernel that uses shared
# memory. It handles a single block of at most 1024 elements: thread `tid`
# ends up with vec[0] + ... + vec[tid].
naive_ker = SourceModule("""
__global__ void naive_prefix(double *vec, double *out)
{
     __shared__ double sum_buf[1024];
     int tid = threadIdx.x;
     sum_buf[tid] = vec[tid];
     __syncthreads();   // every element is loaded before the scan starts

     // Parallel prefix sum: the reach doubles each step (1, 2, 4, ...), so a
     // block of 1024 threads needs 10 steps. The bound is taken from
     // blockDim.x so smaller blocks work too; it is uniform across the
     // block, which keeps the __syncthreads() calls inside the loop safe.
     for (int iter = 1; iter < blockDim.x; iter *= 2)
     {
         // Read the partner value first ...
         double addend = 0.0;
         if (tid >= iter)
         {
             addend = sum_buf[tid - iter];
         }
         // ... and wait until every thread has read before anyone writes.
         // Without this barrier a thread may overwrite sum_buf[tid] while a
         // thread in another warp is still reading it (read/write race).
         __syncthreads();

         if (tid >= iter)
         {
             sum_buf[tid] += addend;
         }
         __syncthreads();   // all writes land before the next step's reads
     }

    out[tid] = sum_buf[tid];
}
""")
naive_gpu = naive_ker.get_function("naive_prefix")



if __name__ == '__main__':

    testvec = np.random.randn(1024).astype(np.float64)
    testvec_gpu = gpuarray.to_gpu(testvec)

    outvec_gpu = gpuarray.empty_like(testvec_gpu)

    naive_gpu( testvec_gpu , outvec_gpu, block=(1024,1,1), grid=(1,1,1))

    total_sum = sum( testvec)
    total_sum_gpu = outvec_gpu[-1].get()

    # Check every prefix, not just the grand total: the last element alone
    # can be right even when intermediate prefixes are wrong.
    prefix_ok = np.allclose(outvec_gpu.get(), np.cumsum(testvec))

    print('outvec_gpu:',outvec_gpu)
    print("Does our kernel work correctly? : {}".format(prefix_ok and np.allclose(total_sum_gpu , total_sum) ))


outvec_gpu: [ -1.20500879  -1.6132585   -2.71519557 ... -57.63849801 -58.57298875
 -59.34142221]
Does our kernel work correctly? : True


### hello world
https://documen.tician.de/pycuda/tutorial.html

[[1 0 0]
 [1 1 0]
 [0 0 1]]


### print input

In [None]:
%%writefile i.py
#!python 
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np

mod = SourceModule("""
    #include <stdio.h>

    __global__ void say_hi(int *out_gpu)
    { 
      int idx = threadIdx.x + blockIdx.x * blockDim.x;
     

      printf("index: threadIdx.x =%d blockIdx.x =%d blockDim.x %d  \\n", threadIdx.x , blockIdx.x , blockDim.x);
      printf("thread id: [%d]  intput out_gpu=%d \\n",idx,out_gpu[idx]);
      
      out_gpu[idx] *= 2;
      printf("thread id 2: [%d]  out_gpu=%d \\n",idx,out_gpu[idx]);

    }
    """)

func = mod.get_function("say_hi")


t= np.array([1,2,3,4], dtype=np.int32)
print('shape:',t.shape,t.size)
t_gpu = gpuarray.to_gpu(t)
func(t_gpu,block=(t.size,1,1),grid=(1,1,1))
print(t_gpu.get())

Overwriting i.py


In [None]:
!python i.py

shape: (4,) 4
index: threadIdx.x =0 blockIdx.x =0 blockDim.x 4  
index: threadIdx.x =1 blockIdx.x =0 blockDim.x 4  
index: threadIdx.x =2 blockIdx.x =0 blockDim.x 4  
index: threadIdx.x =3 blockIdx.x =0 blockDim.x 4  
thread id: [0]  intput out_gpu=1 
thread id: [1]  intput out_gpu=2 
thread id: [2]  intput out_gpu=3 
thread id: [3]  intput out_gpu=4 
thread id 2: [0]  out_gpu=2 
thread id 2: [1]  out_gpu=4 
thread id 2: [2]  out_gpu=6 
thread id 2: [3]  out_gpu=8 
[2 4 6 8]


## test tutorial 2: https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial02/

In [None]:
# Download the vector_add_thread.cu solution from the CUDA tutorial site into
# the current directory (-O keeps the remote file name), then list the
# directory to confirm the file arrived.
!curl -O https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial02/solutions/vector_add_thread.cu
!ls -l

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  1575  100  1575    0     0  47727      0 --:--:-- --:--:-- --:--:-- 47727
total 8
drwxr-xr-x 1 root root 4096 Mar 15 13:40 sample_data
-rw-r--r-- 1 root root 1575 Mar 17 01:21 vector_add_thread.cu


In [None]:
# Compile the downloaded CUDA source with nvcc, then run the binary under
# nvprof to get per-kernel and per-API timings.
# NOTE(review): the binary is named vector_add_grid although it is built from
# vector_add_thread.cu — confirm the name is intended.
!nvcc vector_add_thread.cu -o vector_add_grid
!nvprof ./vector_add_grid

==2277== NVPROF is profiling process 2277, command: ./vector_add_grid
PASSED
==2277== Profiling application: ./vector_add_grid
==2277== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   40.01%  26.437ms         1  26.437ms  26.437ms  26.437ms  [CUDA memcpy DtoH]
                   34.70%  22.929ms         1  22.929ms  22.929ms  22.929ms  vector_add(float*, float*, float*, int)
                   25.29%  16.712ms         2  8.3562ms  8.0535ms  8.6589ms  [CUDA memcpy HtoD]
      API calls:   82.24%  328.28ms         3  109.43ms  115.35us  328.02ms  cudaMalloc
                   16.88%  67.366ms         3  22.455ms  8.2867ms  50.200ms  cudaMemcpy
                    0.61%  2.4278ms         3  809.27us  240.07us  1.1046ms  cudaFree
                    0.23%  907.08us         1  907.08us  907.08us  907.08us  cuDeviceGetPCIBusId
                    0.03%  129.28us       101  1.2790us     130ns  55.520us  cuDeviceGetAttribut