<a href="https://colab.research.google.com/github/tonystz/cuda/blob/main/cudaTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pycuda # install PyCUDA, the Python package imported as `pycuda` by the scripts below

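## CPU vs GPU: Monte Carlo estimation of π

Random points are drawn in the unit square; the fraction that lands inside the quarter unit circle (x² + y² ≤ 1) approximates π/4, so π ≈ 4 · hits / total.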

In [None]:
%%writefile monte_carlo_pi.py
import time
import numpy as np

total = 10000000
#total = 100000
data=np.random.rand(total,2).astype(np.float64)

t=time.time()
def cal_pi_cpu(points=None):
  """Count the sampled points that lie inside the unit quarter circle.

  Args:
    points: iterable of (x, y) pairs, e.g. an (N, 2) array. When omitted,
      the module-level ``data`` array is used, so the original
      zero-argument call keeps working.

  Returns:
    The number of pairs with ``x**2 + y**2 <= 1`` as a Python int.
    ``4 * hits / N`` is the resulting estimate of pi.
  """
  if points is None:
    points = data
  hits = 0
  # Plain per-row Python loop: this is the work reported as "CPU time".
  for x, y in points:
    if (x ** 2 + y ** 2) <= 1:
      hits += 1
  return hits
# Run the CPU count, then report the wall-clock time elapsed since `t` was
# taken together with the estimate pi ~= 4 * hits / total.
hits=cal_pi_cpu()
print(f'CPU time:{time.time()-t} hits={hits} total={total} pi={hits * 4.0 / total}')

Overwriting monte_carlo_pi.py


In [None]:
!python -m profile -s cumtime  monte_carlo_pi.py 

In [20]:
%%writefile pi.py
import time
import numpy as np
#GPU
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.compiler import SourceModule


total = 10000000
#total = 100000
data=np.random.rand(total,2).astype(np.float64)

t=time.time()
def cal_pi_cpu(points=None):
  """Count the sampled points that lie inside the unit quarter circle.

  Args:
    points: iterable of (x, y) pairs, e.g. an (N, 2) array. When omitted,
      the module-level ``data`` array is used, so the original
      zero-argument call keeps working.

  Returns:
    The number of pairs with ``x**2 + y**2 <= 1`` as a Python int.
    ``4 * hits / N`` is the resulting estimate of pi.
  """
  if points is None:
    points = data
  hits = 0
  # Plain per-row Python loop: this is the work reported as "CPU time".
  for x, y in points:
    if (x ** 2 + y ** 2) <= 1:
      hits += 1
  return hits
# Run the CPU count, then report the wall-clock time elapsed since `t` was
# taken together with the estimate pi ~= 4 * hits / total.
hits=cal_pi_cpu()
print(f'CPU time:{time.time()-t} hits={hits} total={total} pi={hits * 4.0 / total}')


# GPU estimate over the same `data` points.
# NOTE: the timer starts before SourceModule(), so the printed "GPU time"
# covers kernel compilation, the host->device copy, the kernel launch, the
# device->host copy and the final reduction on the host.
t=time.time()
mod = SourceModule("""
    __global__ void cal_pi_gpu(const double *in, unsigned long long *out,
                               unsigned long long n)
    {
      // One thread per (x, y) row. Widen before multiplying so the index
      // is computed in 64-bit arithmetic.
      unsigned long long idx =
          (unsigned long long)blockIdx.x * blockDim.x + threadIdx.x;
      // The grid is rounded up to whole blocks, so threads past the last
      // point must not touch memory.
      if (idx >= n)
        return;
      const int colSize=2;
      double x = in[idx*colSize];
      double y = in[idx*colSize+1];
      if ((x*x + y*y) <= 1.0)
        out[idx]=1;
    }
    """)

cal_pi_gpu = mod.get_function("cal_pi_gpu")
threads_per_block = 128
# Ceiling division: int(total/128) would drop the trailing points whenever
# total is not a multiple of the block size (e.g. total = 100000).
num_blocks = (total + threads_per_block - 1) // threads_per_block
data_gpu = gpuarray.to_gpu(data)
# Zero-initialised so that only threads that record a hit write a 1.
out_gpu = gpuarray.zeros((total,1),dtype=np.uint64)
cal_pi_gpu(data_gpu,out_gpu,np.uint64(total),
           block=(threads_per_block,1,1),grid=(num_blocks,1,1))
# Vectorised reduction; the builtin sum() would iterate over every
# one-element row in Python.
hits=int(out_gpu.get().sum())
print(f'GPU time:{time.time()-t} hits={hits} total={total} pi={hits * 4.0 / total}')

Overwriting pi.py


In [23]:
# Run the script written by the previous cell; it prints the CPU and GPU timings and pi estimates.
!python pi.py

CPU time:19.792373657226562 hits=7856074 total=10000000 pi=3.1424296
GPU time:5.321574687957764 hits=7856074 total=10000000 pi=3.1424296


In [41]:
%%writefile matrix_index.py
#!python 
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np

# Demo: show how a 2-D thread index maps onto a row-major matrix stored in a
# flat device buffer. Each thread prints the element it addresses.
mod = SourceModule("""
    #include <stdio.h>

    __global__ void matrix_index(const int *matrix, int row_size, int col_size)
    { 
      int idx = threadIdx.x + blockIdx.x * blockDim.x;  // row index
      int idy = threadIdx.y + blockIdx.y * blockDim.y;  // column index

      // Ignore threads that fall outside the matrix if the launch
      // configuration is larger than row_size x col_size.
      if (idx >= row_size || idy >= col_size)
        return;

      // Row-major layout: element (row, col) lives at row * col_size + col.
      printf("thread[%d][%d]: %d\\n",idx,idy,matrix[idx*col_size+idy]);
    }
    """)

matrix_index = mod.get_function("matrix_index")

ROW,COL=2,3
data=np.array([21,2,3,38,9,10], dtype=np.int32).reshape(ROW,COL)
print(f'shape:{data.shape},data=\n{data}')
data_gpu = gpuarray.to_gpu(data)
# One block with one thread per matrix element: x spans rows, y spans columns.
matrix_index(data_gpu,np.int32(ROW),np.int32(COL),block=(ROW,COL,1),grid=(1,1,1))
# Kernel launches are asynchronous; wait for completion so the device-side
# printf output is flushed before the script exits.
cuda.Context.synchronize()

Overwriting matrix_index.py


In [42]:
# Run the script written by the previous cell; each GPU thread prints the matrix element it indexes.
!python matrix_index.py

shape:(2, 3),data=
[[21  2  3]
 [38  9 10]]
thread[0][0]: 21
thread[1][0]: 38
thread[0][1]: 2
thread[1][1]: 9
thread[0][2]: 3
thread[1][2]: 10
