In [1]:
import cv2
import numpy as np
from numba import cuda
import time
import math

# CPU and GPU

In [2]:
#gpu
@cuda.jit
def process_gpu(img,channels):
    """CUDA kernel: apply ``value*2+30`` to each channel of one pixel, clamped to [0, 255].

    Every thread handles the pixel at its 2-D grid position and overwrites
    ``img`` in place.

    Args:
        img: 3-D device array indexed as ``img[row, col, channel]``.
        channels: number of channels to process per pixel.
    """
    # cuda.grid(2) yields this thread's absolute (x, y) position in the launch grid.
    tx, ty = cuda.grid(2)
    # The caller rounds the grid up to whole blocks, so threads past the image
    # edge exist; they must return before touching memory.
    if tx >= img.shape[0] or ty >= img.shape[1]:
        return
    for k in range(channels):
        color = img[tx,ty,k]*2+30
        if color >255:
            img[tx,ty,k]=255
        elif color<0:
            img[tx,ty,k]=0
        else:
            img[tx,ty,k]=color

#cpu   
def process_cpu(img):
    """Apply ``value*2+30`` to every element of ``img`` in place, clamped to [0, 255].

    Deliberately a plain Python triple loop: it is the CPU baseline that the
    GPU kernel is timed against.

    Args:
        img: 3-D NumPy array indexed as ``img[row, col, channel]``; it is
            modified in place.

    Returns:
        The same ``img`` object that was passed in.
    """
    rows,cols,channels=img.shape
    for i in range(rows):
        for j in range(cols):
            for k in range(channels):
                # .item() converts the NumPy scalar to a Python number first.
                # Under NumPy 2 promotion rules a uint8 scalar combined with
                # Python ints stays uint8, so the arithmetic would wrap around
                # before the clamp below could ever see a value above 255.
                color=img[i,j,k].item()*2+30
                if color>255:
                    img[i,j,k]=255
                elif color<0:
                    img[i,j,k]=0
                else:
                    img[i,j,k]=color
    return img
           
    
if __name__ == "__main__":
    image_path = 'D:/Desktop/20210817203252.jpg'
    # image_path = 'D:/Desktop/ii.jfif'
    img=cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising when the file cannot be read.
        raise FileNotFoundError('could not read image: ' + image_path)
    rows,cols,channels = img.shape

    # process_cpu overwrites its argument, so give it a private copy (made
    # outside the timed region). Otherwise the GPU pass below would start from
    # the already-processed image and the two outputs would not be comparable.
    cpu_input = img.copy()
    start_cpu = time.time()
    result_cpu = process_cpu(cpu_input)
    end_cpu = time.time()
    print('cpu time:',end_cpu-start_cpu)

    #GPU function
    dImg = cuda.to_device(img)
    threadsperblock = (16,16)
    # Round up so the grid also covers images whose sides are not multiples of 16.
    blockspergrid_x = int(math.ceil(rows/threadsperblock[0]))
    blockspergrid_y = int(math.ceil(cols/threadsperblock[1]))
    blockspergrid = (blockspergrid_x,blockspergrid_y)
    # Finish the host-to-device copy before the timer starts.
    cuda.synchronize()

    # NOTE(review): this is the first launch of process_gpu in this cell, so the
    # measured time presumably includes Numba's JIT compilation - confirm.
    start_gpu = time.time()
    process_gpu[blockspergrid,threadsperblock](dImg,channels)
    # Kernel launches are asynchronous; wait for completion before stopping the timer.
    cuda.synchronize()
    end_gpu = time.time()
    dst_gpu=dImg.copy_to_host()
    print('gpu time:',end_gpu-start_gpu)

    #save
    cv2.imwrite('D:/Desktop/cpu.jpg',result_cpu)
    cv2.imwrite('D:/Desktop/gpu.jpg',dst_gpu)
    print('Done')
    

cpu time: 391.8570840358734
gpu time: 0.287822961807251
Done


In [5]:
# Show the channel count; `channels` is the third element of img.shape,
# assigned by whichever image-loading cell was executed last.
print(channels)

3


# cpu 

In [4]:
  
def process_cpu(img):
    """Apply ``value*2+30`` to every element of ``img`` in place, clamped to [0, 255].

    Plain Python triple loop used as the CPU baseline.

    Args:
        img: 3-D NumPy array indexed as ``img[row, col, channel]``; it is
            modified in place.

    Returns:
        The same ``img`` object that was passed in.
    """
    rows,cols,channels=img.shape
    for i in range(rows):
        for j in range(cols):
            for k in range(channels):
                # Convert the NumPy scalar to a Python number before doing
                # arithmetic: with NumPy 2 promotion a uint8 scalar combined
                # with Python ints stays uint8 and would wrap around, so the
                # clamp to 255 would never trigger.
                color=img[i,j,k].item()*2+30
                if color>255:
                    img[i,j,k]=255
                elif color<0:
                    img[i,j,k]=0
                else:
                    img[i,j,k]=color
    return img
    
if __name__ == "__main__":
    image_path = 'D:/Desktop/20210817203252.jpg'
    # image_path = 'C:/Users/student/Desktop/2017111008465240.jpg'
    img=cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising when the file cannot be read.
        raise FileNotFoundError('could not read image: ' + image_path)
    rows,cols,channels = img.shape
    print(img.shape)
    # The CPU benchmark itself is left disabled in this cell:
#     start_cpu = time.time()
#     result_cpu = process_cpu(img)
#     end_cpu = time.time()
#     cv2.imwrite('D:/Desktop/cpu.jpg',result_cpu)
#     print('cpu time:',end_cpu-start_cpu)

(5945, 9054, 3)


# GPU

In [5]:
@cuda.jit
def process_gpu(img,channels):
    """CUDA kernel: apply ``value*2+30`` to each channel of one pixel, clamped to [0, 255].

    Each thread handles the pixel at its 2-D grid position and overwrites
    ``img`` in place.

    Args:
        img: 3-D device array indexed as ``img[row, col, channel]``.
        channels: number of channels to process per pixel.
    """
    tx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    ty = cuda.blockIdx.y * cuda.blockDim.y  +cuda.threadIdx.y

    # The launch grid is rounded up to whole blocks, so threads past the image
    # edge exist; they must return before touching memory.
    if tx >= img.shape[0] or ty >= img.shape[1]:
        return

    for i in range(channels):
        color = img[tx,ty,i]*2+30
        if color >255:
            img[tx,ty,i]=255
        elif color<0:
            img[tx,ty,i]=0
        else:
            img[tx,ty,i]=color
    
if __name__ == "__main__":
    image_path = 'D:/Desktop/20210817203252.jpg'
    # image_path = 'C:/Users/student/Desktop/aa.jpg'
    img=cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising when the file cannot be read.
        raise FileNotFoundError('could not read image: ' + image_path)
    rows,cols,channels = img.shape
    #GPU function
    dImg = cuda.to_device(img)
    threadsperblock = (16,16)
    # Round up so the grid also covers images whose sides are not multiples of 16.
    blockspergrid_x = int(math.ceil(rows/threadsperblock[0]))
    blockspergrid_y = int(math.ceil(cols/threadsperblock[1]))
    blockspergrid = (blockspergrid_x,blockspergrid_y)
    # Finish the host-to-device copy before the timer starts.
    cuda.synchronize()

    start_gpu = time.time()
    process_gpu[blockspergrid,threadsperblock](dImg,channels)
    # Kernel launches are asynchronous; wait for completion before stopping the timer.
    cuda.synchronize()

    end_gpu = time.time()
    dst_gpu=dImg.copy_to_host()
#     cv2.imwrite('D:/Desktop/gpu.jpg',dst_gpu)
    print('gpu time:',end_gpu-start_gpu)

gpu time: 0.39519309997558594


In [19]:
# Dimensions of the image currently bound to `img`; which file that is
# depends on which loading cell was executed last.
print(img.shape)

(300, 400, 3)


# stride GPU

In [23]:
@cuda.jit
def process_gpu(img,rows,cols,channels):
    """CUDA kernel: grid-stride version of the per-pixel ``value*20+30`` clamp.

    Each thread starts at its own 2-D grid position and then advances by the
    total grid size along each axis, so a grid smaller than the image still
    visits every pixel. ``img`` is overwritten in place.

    Args:
        img: 3-D device array indexed as ``img[row, col, channel]``.
        rows: number of rows to process.
        cols: number of columns to process.
        channels: number of channels to process per pixel.
    """
    tx,ty=cuda.grid(2)
    # Total number of threads along each axis = distance between two pixels
    # handled by the same thread.
    stride_x = cuda.blockDim.x * cuda.gridDim.x
    stride_y = cuda.blockDim.y * cuda.gridDim.y

    # The range() bounds double as the out-of-image guard: a thread whose
    # start position is past the edge simply iterates zero times.
    for row in range(tx,rows,stride_x):
        for col in range(ty,cols,stride_y):
            for ch in range(channels):
                # NOTE(review): the other kernels in this notebook scale by 2,
                # this one by 20 - confirm the factor is intended.
                color = img[row,col,ch]*20+30
                if color >255:
                    img[row,col,ch]=255
                elif color<0:
                    img[row,col,ch]=0
                else:
                    img[row,col,ch]=color
              
    
if __name__ == "__main__":
    image_path = 'C:/Users/student/Desktop/2017110917545743.jpg'
    # image_path = 'C:/Users/student/Desktop/462577165016652039.jpg'
    # image_path = 'C:/Users/student/Desktop/2017111008465240.jpg'
    img=cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising when the file cannot be read.
        raise FileNotFoundError('could not read image: ' + image_path)
    rows,cols,channels = img.shape
    #GPU function
    dImg = cuda.to_device(img)
    threadsperblock = (16,16)
    # Round up so the grid also covers images whose sides are not multiples of 16.
    blockspergrid_x = int(math.ceil(rows/threadsperblock[0]))
    blockspergrid_y = int(math.ceil(cols/threadsperblock[1]))

    blockspergrid = (blockspergrid_x,blockspergrid_y)
    # Finish the host-to-device copy before the timer starts.
    cuda.synchronize()

    start_gpu = time.time()
    process_gpu[blockspergrid,threadsperblock](dImg,rows,cols,channels)
    # Kernel launches are asynchronous; wait for completion before stopping the timer.
    cuda.synchronize()
    end_gpu = time.time()
    dst_gpu=dImg.copy_to_host()
    cv2.imwrite('C:/Users/student/Desktop/gpu.jpg',dst_gpu)
    print('gpu time:',end_gpu-start_gpu)

gpu time: 0.3008725643157959


In [20]:
# Dimensions of the image currently bound to `img`; which file that is
# depends on which loading cell was executed last.
print(img.shape)

(5945, 9054, 3)


# shared GPU

In [7]:
from numba import cuda,float32
import numba
import numpy as np
import math
import time

# Threads per block along each axis; the shared-memory kernel also uses it
# as the edge length of its TPB x TPB tiles.
TPB = 16

@numba.jit(nopython = True)
def matmul_cpu(A,B,C):
    """Compute the matrix product of ``A`` and ``B`` into ``C`` (compiled CPU reference).

    Args:
        A: 2-D array; its second dimension is the summation length.
        B: 2-D array indexed as ``B[k, column]``.
        C: 2-D output array; every ``C[row, column]`` is overwritten.
    """
    n_rows = A.shape[0]
    n_inner = A.shape[1]
    n_cols = B.shape[1]
    # Columns are the outer loop, rows the inner one.
    for col in range(n_cols):
        for row in range(n_rows):
            acc = 0
            for idx in range(n_inner):
                acc = acc + A[row,idx]*B[idx,col]
            C[row,col] = acc

@cuda.jit
def matmul_gpu(A,B,C):
    """CUDA kernel: one thread computes one element of ``C = A @ B``.

    Args:
        A: 2-D device array; its second dimension is the summation length.
        B: 2-D device array indexed as ``B[k, column]``.
        C: 2-D device output array.
    """
    row,col = cuda.grid(2)
    # Threads whose position falls outside C have nothing to compute.
    if row >= C.shape[0] or col >= C.shape[1]:
        return
    acc = 0
    inner = A.shape[1]
    for idx in range(inner):
        acc += A[row,idx]*B[idx,col]
    C[row,col] = acc
    
@cuda.jit
def matmul_shared_men(A,B,C):
    """CUDA kernel: tiled ``C = A @ B`` that stages TPB x TPB tiles in shared memory.

    Each block walks along the summation dimension one tile at a time. Every
    thread loads one element of the current A tile and one of the current B
    tile, the block synchronises, and each thread accumulates the partial dot
    product for its own output element.

    Must be launched with a block shape of (TPB, TPB).

    Args:
        A: 2-D device array; its second dimension is the summation length.
        B: 2-D device array indexed as ``B[k, column]``.
        C: 2-D device output array.
    """
    # Tiles are stored as float32, so float64 inputs are rounded when staged.
    sA = cuda.shared.array(shape=(TPB,TPB),dtype=float32)
    sB = cuda.shared.array(shape=(TPB,TPB),dtype=float32)
    x,y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    # No early return here: every thread of the block has to reach both
    # cuda.syncthreads() calls, even if its own (x, y) lies outside C.
    tmp = 0.0
    # Ceiling division so a summation length that is not a multiple of TPB
    # still has its last, partial tile processed.
    n_tiles = (A.shape[1] + TPB - 1) // TPB
    for i in range(n_tiles):
        a_col = ty + i*TPB
        b_row = tx + i*TPB
        # Out-of-range positions are filled with zero so they add nothing.
        if x < A.shape[0] and a_col < A.shape[1]:
            sA[tx,ty] = A[x,a_col]
        else:
            sA[tx,ty] = 0.0
        if b_row < B.shape[0] and y < B.shape[1]:
            sB[tx,ty] = B[b_row,y]
        else:
            sB[tx,ty] = 0.0
        # Wait until both tiles are completely loaded.
        cuda.syncthreads()
        for j in range(TPB):
            tmp += sA[tx,j]*sB[j,ty]
        # Wait until every thread is done with the tiles before overwriting them.
        cuda.syncthreads()
    if x < C.shape[0] and y < C.shape[1]:
        C[x,y] = tmp
    
# Square test matrices whose side is a multiple of TPB.
# np.float64 is used because the old np.float alias no longer exists in
# current NumPy releases (it was simply Python's float, i.e. float64).
A = np.full((TPB*500,TPB*500),3,np.float64)
B = np.full((TPB*500,TPB*500),4,np.float64)
C_cpu = np.full((A.shape[0],B.shape[1]),0,np.float64)

print('start cpu')
start_cpu = time.time()
# The CPU reference run is disabled, so the time printed below covers no work.
# matmul_cpu(A,B,C_cpu)
print('cpu time:',time.time()-start_cpu)

##########################start GPU###################################
A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)

# Separate output buffers so the two kernels' results can be compared afterwards.
C_global_mem = cuda.device_array((A.shape[0],B.shape[1]))
C_shared_mem = cuda.device_array((A.shape[0],B.shape[1]))

threadsperblock = (TPB,TPB)
# Round up so the grid covers the whole output matrix.
blockspergrid_x = int(math.ceil(A.shape[0]/threadsperblock[0]))
blockspergrid_y = int(math.ceil(B.shape[1]/threadsperblock[1]))
blockspergrid = (blockspergrid_x,blockspergrid_y)

# NOTE(review): each kernel is launched exactly once, so both timings
# presumably include Numba's JIT compilation of that kernel - confirm.
print('start global GPU')
start_global_gpu = time.time()
matmul_gpu[blockspergrid,threadsperblock](A_global_mem,B_global_mem,C_global_mem)
# Kernel launches are asynchronous; wait for completion before reading the clock.
cuda.synchronize()
print('global GPU time:',time.time()-start_global_gpu)
C_global_gpu = C_global_mem.copy_to_host()

print('start shared GPU')
start_shared_gpu = time.time()
matmul_shared_men[blockspergrid,threadsperblock](A_global_mem,B_global_mem,C_shared_mem)
cuda.synchronize()
# Label corrected: this is the shared-memory kernel's time, not the global one.
print('shared GPU time:',time.time()-start_shared_gpu)
C_shared_gpu = C_shared_mem.copy_to_host()


start cpu
cpu time: 0.0
start global GPU
global GPU time: 26.339080333709717
start shared GPU
global shared time: 31.999357223510742


In [11]:
# Sanity check on the matmul_gpu result: its shape and the first three entries of row 0.
print(C_global_gpu.shape)
print(C_global_gpu[0,0:3])

(8000, 8000)
[96000. 96000. 96000.]


In [12]:
# Sanity check on the matmul_shared_men result: its shape and the first three entries of row 0.
print(C_shared_gpu.shape)
print(C_shared_gpu[0,0:3])

(8000, 8000)
[96000. 96000. 96000.]


In [13]:
# Largest element-wise discrepancy between the two GPU results. A signed mean
# could report 0 when errors of opposite sign cancel, so compare magnitudes.
print(np.abs(C_shared_gpu-C_global_gpu).max())

0.0
