In [None]:
! pip install numba
import numba
from numba import cuda
from numba.cuda.cudadrv import enums
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import time
from numba import vectorize
import math

# Colab-only: mount Google Drive at /content/drive/ so files below that path
# can be read and written by the rest of the notebook.
from google.colab import drive
drive.mount("/content/drive/")

# Matplotlib rc overrides, written as themed groups and merged (in this order)
# into the single `params` mapping.
_figure_axes_rc = {
    'figure.figsize': [15, 10],  # instead of 4.5, 4.5
    'axes.titlesize': 15,
    'axes.labelsize': 10,
    'axes.linewidth': 0.5,
}

_text_legend_rc = {
    'font.size': 20,
    'font.family': 'monospace',
    # disabled: 'font.monospace': 'Alma Mono',
    'legend.fontsize': 15,
    'legend.loc': 'upper right',
    'legend.labelspacing': 0.25,
    # disabled: 'xtick.labelsize': 20,
    # disabled: 'ytick.labelsize': 20,
    'lines.linewidth': 3,
    'text.usetex': False,
    # disabled: 'figure.autolayout': True,
}

_tick_rc = {
    'ytick.right': False,
    'xtick.top': False,

    # tick lengths and label sizes
    'xtick.major.size': 5,
    'ytick.major.size': 5,
    'xtick.minor.size': 5,
    'ytick.minor.size': 5,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,

    # tick stroke widths
    'xtick.major.width': 2,
    'ytick.major.width': 2,
    'xtick.minor.width': 1,
    'ytick.minor.width': 1,

    # distance between tick and label
    'xtick.major.pad': 2,
    'ytick.major.pad': 2,
    # disabled: 'xtick.minor.pad': 14,
    # disabled: 'ytick.minor.pad': 14,

    'xtick.direction': 'inout',
    'ytick.direction': 'inout',
}

_grid_rc = {
    'grid.linestyle': '-',   # solid
    'grid.linewidth': 1.5,   # in points
    'grid.alpha': 1,         # transparency, between 0.0 and 1.0
}

params = {**_figure_axes_rc, **_text_legend_rc, **_tick_rc, **_grid_rc}
# plt.style.use('fivethirtyeight')
# Apply the overrides to matplotlib's global rc settings (plt.rcParams is the
# same mapping object as matplotlib.rcParams).
plt.rcParams.update(params)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
from numba.parfors.parfor import maximize_fusion_inner
#LABWORK7 - REDUCE NORMAL

# Folder holding the input images, and the two test images inside it.
Data_path = "/content/drive/MyDrive/HPC/"
Image_path = f"{Data_path}Test_org.jpg"
Image_path_2 = f"{Data_path}Test_org_2.jpg"
# blockDim.x,y,z gives the number of threads in a block, in the particular direction
# gridDim.x,y,z gives the number of blocks in a grid, in the particular direction
# blockDim.x * gridDim.x gives the number of threads in a grid


# Grayscale
@cuda.jit
def grayscale(src, dst):
  """CUDA kernel: per-pixel mean of the first three channels of ``src``.

  Each thread handles the element at index (tidx, tidy), derived from its
  2-D thread/block position, and stores the mean of ``src[tidx, tidy, 0..2]``
  (converted with ``np.uint16``) into channels 0, 1 and 2 of ``dst``.

  Threads whose index lies outside either array do nothing, so the kernel is
  safe to launch with a grid that over-covers the image.
  """
  tidx = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
  tidy = cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y

  # Bounds guard: without it an over-sized grid reads/writes out of range.
  in_src = tidx < src.shape[0] and tidy < src.shape[1]
  in_dst = tidx < dst.shape[0] and tidy < dst.shape[1]
  if in_src and in_dst:
    g = np.uint16((src[tidx, tidy, 0] + src[tidx, tidy, 1] + src[tidx, tidy, 2]) / 3)
    dst[tidx, tidy, 0] = g
    dst[tidx, tidy, 1] = g
    dst[tidx, tidy, 2] = g


@cuda.jit
def main(src,dst,max,min):
  """CUDA kernel: linearly rescale channel 0 of ``src`` from [min, max] to [0, 255].

  Each thread handles the element at index (tidx, tidy) and writes
  ``(src[tidx, tidy, 0] - min) / (max - min) * 255`` (converted with
  ``np.uint16``) into channels 0, 1 and 2 of ``dst``.

  Args:
    src: 3-D array; only channel 0 is read.
    dst: 3-D array; channels 0..2 are written.
    max: upper end of the input range.
    min: lower end of the input range.

  Threads whose index lies outside either array do nothing. When
  ``max == min`` the output is 0 rather than the result of a division by zero.
  """
  tidx = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
  tidy = cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y

  # Bounds guard: without it an over-sized grid reads/writes out of range.
  in_src = tidx < src.shape[0] and tidy < src.shape[1]
  in_dst = tidx < dst.shape[0] and tidy < dst.shape[1]
  if in_src and in_dst:
    span = max - min
    if span == 0:
      # Uniform input: there is no range to stretch.
      g = np.uint16(0)
    else:
      g = np.uint16((src[tidx, tidy, 0] - min) / span * 255)
    dst[tidx, tidy, 0] = g
    dst[tidx, tidy, 1] = g
    dst[tidx, tidy, 2] = g


# Load the input picture from disk.
img_data = plt.imread(Image_path)

# Image dimensions (the third axis is unpacked but not used), the total pixel
# count, and an independent copy of the pixel data.
imageHeight, imageWidth, _ = img_data.shape
pixelCount = imageHeight * imageWidth
out_img = np.copy(img_data)


def compare(blockSize, option):
  """Run grayscale + min/max contrast stretch on the GPU and time it.

  Uses the module-level ``img_data``, ``imageHeight``, ``imageWidth`` and the
  ``grayscale`` / ``main`` kernels.

  Args:
    blockSize: edge length of the square thread block
      (``blockSize x blockSize`` threads per block).
    option: when truthy, the images are returned together with the time.

  Returns:
    ``run_time`` in seconds, or ``(run_time, grayDst, hostDst)`` when
    ``option`` is truthy.

  Raises:
    ValueError: if ``blockSize`` is larger than the image in either direction,
      which would give an empty launch grid.

  Notes:
    The grid size is rounded down, so no thread ever indexes past the image.
    Pixels in the partial tiles at the bottom/right edge are therefore not
    processed; they are excluded from the min/max search and set to 0 in the
    returned images (the device buffers are allocated uninitialised).
  """
  # Grid size -> number of whole blockSize-sized chunks in each direction.
  grid_1 = int(imageHeight/blockSize)
  grid_2 = int(imageWidth/blockSize)
  if grid_1 < 1 or grid_2 < 1:
    raise ValueError("blockSize is larger than the image; launch grid would be empty")
  print(grid_1,grid_2)
  gridSize = (grid_1, grid_2)
  threadsPerBlock = (blockSize, blockSize)

  # Extent of the image actually covered by the launch grid.
  covered_h = int(grid_1 * blockSize)
  covered_w = int(grid_2 * blockSize)

  # Start timing
  start_time = time.time()

  # Copy image to the device from host (CPU)
  devSrc = cuda.to_device(img_data)

  # Allocate (uninitialised) output memory on the device (GPU)
  devDst_gray = cuda.device_array((imageHeight,imageWidth,3), np.uint16)

  # Grayscale conversion
  grayscale[gridSize,threadsPerBlock](devSrc, devDst_gray)

  # Copy from device to host and blank the border no thread wrote to.
  midDst = devDst_gray.copy_to_host()
  midDst[covered_h:, :, :] = 0
  midDst[:, covered_w:, :] = 0
  grayDst = midDst

  # Find max, min over the processed region only.
  processed = midDst[:covered_h, :covered_w, 2]
  max_in = np.amax(processed)
  print(max_in)
  min_in = np.amin(processed)
  print(min_in)

  # Convert: the gray image is still on the device, so reuse it directly
  # instead of uploading the host copy again.
  devDst = cuda.device_array((imageHeight,imageWidth,3), np.uint16)
  main[gridSize,threadsPerBlock](devDst_gray,devDst,max_in,min_in)
  hostDst = devDst.copy_to_host()
  hostDst[covered_h:, :, :] = 0
  hostDst[:, covered_w:, :] = 0

  # Stop timing
  end_time = time.time()

  # Get the running time
  run_time = end_time-start_time

  if option:
    return run_time, grayDst, hostDst
  return run_time

In [None]:
from numba.parfors.parfor import maximize_fusion_inner
#LABWORK7 - REDUCE NO SHARE MEMORY

# Folder holding the input images, and the two test images inside it.
Data_path = "/content/drive/MyDrive/HPC/"
Image_path = f"{Data_path}Test_org.jpg"
Image_path_2 = f"{Data_path}Test_org_2.jpg"
# blockDim.x,y,z gives the number of threads in a block, in the particular direction
# gridDim.x,y,z gives the number of blocks in a grid, in the particular direction
# blockDim.x * gridDim.x gives the number of threads in a grid


# Grayscale
@cuda.jit
def grayscale(src, dst):
  """CUDA kernel: per-pixel mean of the first three channels of ``src``.

  Each thread handles the element at index (tidx, tidy), derived from its
  2-D thread/block position, and stores the mean of ``src[tidx, tidy, 0..2]``
  (converted with ``np.uint16``) into channels 0, 1 and 2 of ``dst``.

  Threads whose index lies outside either array do nothing, so the kernel is
  safe to launch with a grid that over-covers the image.
  """
  tidx = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
  tidy = cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y

  # Bounds guard: without it an over-sized grid reads/writes out of range.
  in_src = tidx < src.shape[0] and tidy < src.shape[1]
  in_dst = tidx < dst.shape[0] and tidy < dst.shape[1]
  if in_src and in_dst:
    g = np.uint16((src[tidx, tidy, 0] + src[tidx, tidy, 1] + src[tidx, tidy, 2]) / 3)
    dst[tidx, tidy, 0] = g
    dst[tidx, tidy, 1] = g
    dst[tidx, tidy, 2] = g

@cuda.jit
def sort_min(src,dst):
  """CUDA kernel: per-block minimum of a 1-D array by pairwise (tree) reduction.

  Within each block, step ``i`` (1, 2, 4, ...) lets every thread whose local
  index is a multiple of ``2*i`` replace ``src[tid]`` with the smaller of
  ``src[tid]`` and ``src[tid + i]``. After the last step the first element of
  each block's slice of ``src`` holds that block's minimum.

  Args:
    src: 1-D array; it is overwritten in place during the reduction.
    dst: 1-D array receiving ``src[tid]`` for every in-range thread, so
      ``dst[k * blockDim.x]`` is the minimum of block ``k``. Assumed to be at
      least as long as ``src``.
  """
  localtid = cuda.threadIdx.x
  tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
  n = src.shape[0]
  i = 1
  while i < cuda.blockDim.x:
    # The partner must stay inside this block and inside the array.
    if localtid % (i * 2) == 0 and localtid + i < cuda.blockDim.x and tid + i < n:
      if src[tid] > src[tid + i]:
        src[tid] = src[tid + i]
    # Every thread reaches the barrier and advances the stride each pass;
    # advancing only inside the branch above would leave the other threads
    # spinning forever.
    cuda.syncthreads()
    i = i * 2
  if tid < n:
    dst[tid] = src[tid]

@cuda.jit
def sort_max(src,dst):
  """CUDA kernel: per-block maximum of a 1-D array by pairwise (tree) reduction.

  Within each block, step ``i`` (1, 2, 4, ...) lets every thread whose local
  index is a multiple of ``2*i`` replace ``src[tid]`` with the larger of
  ``src[tid]`` and ``src[tid + i]``. After the last step the first element of
  each block's slice of ``src`` holds that block's maximum.

  Args:
    src: 1-D array; it is overwritten in place during the reduction.
    dst: 1-D array receiving ``src[tid]`` for every in-range thread, so
      ``dst[k * blockDim.x]`` is the maximum of block ``k``. Assumed to be at
      least as long as ``src``.
  """
  localtid = cuda.threadIdx.x
  tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
  n = src.shape[0]
  i = 1
  while i < cuda.blockDim.x:
    # The partner must stay inside this block and inside the array.
    if localtid % (i * 2) == 0 and localtid + i < cuda.blockDim.x and tid + i < n:
      # Keep the larger of the pair.
      if src[tid] < src[tid + i]:
        src[tid] = src[tid + i]
    # Every thread reaches the barrier and advances the stride each pass;
    # advancing only inside the branch above would leave the other threads
    # spinning forever.
    cuda.syncthreads()
    i = i * 2
  if tid < n:
    dst[tid] = src[tid]


@cuda.jit
def main(src,dst,max,min):
  """CUDA kernel: linearly rescale channel 0 of ``src`` from [min, max] to [0, 255].

  Each thread handles the element at index (tidx, tidy) and writes
  ``(src[tidx, tidy, 0] - min) / (max - min) * 255`` (converted with
  ``np.uint16``) into channels 0, 1 and 2 of ``dst``.

  Args:
    src: 3-D array; only channel 0 is read.
    dst: 3-D array; channels 0..2 are written.
    max: upper end of the input range.
    min: lower end of the input range.

  Threads whose index lies outside either array do nothing. When
  ``max == min`` the output is 0 rather than the result of a division by zero.
  """
  tidx = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
  tidy = cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y

  # Bounds guard: without it an over-sized grid reads/writes out of range.
  in_src = tidx < src.shape[0] and tidy < src.shape[1]
  in_dst = tidx < dst.shape[0] and tidy < dst.shape[1]
  if in_src and in_dst:
    span = max - min
    if span == 0:
      # Uniform input: there is no range to stretch.
      g = np.uint16(0)
    else:
      g = np.uint16((src[tidx, tidy, 0] - min) / span * 255)
    dst[tidx, tidy, 0] = g
    dst[tidx, tidy, 1] = g
    dst[tidx, tidy, 2] = g


# Load the input picture from disk.
img_data = plt.imread(Image_path)

# Image dimensions (the third axis is unpacked but not used), the total pixel
# count, and an independent copy of the pixel data.
imageHeight, imageWidth, _ = img_data.shape
pixelCount = imageHeight * imageWidth
out_img = np.copy(img_data)


def compare(blockSize_unit, option):
  """Grayscale + contrast stretch on the GPU, with min/max found by GPU reduction.

  Uses the module-level ``img_data``, ``imageHeight``, ``imageWidth`` and the
  ``grayscale``, ``sort_max``, ``sort_min`` and ``main`` kernels.

  Args:
    blockSize_unit: edge length of the square 2-D thread block; also the
      number of threads per block of the 1-D reduction launches.
    option: when truthy, the images are returned together with the time.

  Returns:
    ``run_time`` in seconds, or ``(run_time, grayDst, hostDst)`` when
    ``option`` is truthy.

  Raises:
    ValueError: if ``blockSize_unit`` is larger than the image in either
      direction, which would give an empty launch grid.

  Notes:
    The 2-D grid size is rounded down, so no thread ever indexes past the
    image. Pixels in the partial tiles at the bottom/right edge are therefore
    not processed; they are excluded from the min/max search and set to 0 in
    the returned images (the device buffers are allocated uninitialised).
  """
  # Grid size -> number of whole blockSize_unit-sized chunks in each direction.
  grid_1 = int(imageHeight/blockSize_unit)
  grid_2 = int(imageWidth/blockSize_unit)
  if grid_1 < 1 or grid_2 < 1:
    raise ValueError("blockSize_unit is larger than the image; launch grid would be empty")
  print(grid_1,grid_2)
  gridSize = (grid_1, grid_2)
  blockSize = (blockSize_unit, blockSize_unit)

  # Extent of the image actually covered by the launch grid.
  covered_h = int(grid_1 * blockSize_unit)
  covered_w = int(grid_2 * blockSize_unit)

  # Start timing
  start_time = time.time()

  # Copy image to the device from host (CPU)
  devSrc = cuda.to_device(img_data)

  # Allocate (uninitialised) output memory on the device (GPU)
  devDst_gray = cuda.device_array((imageHeight,imageWidth,3), np.uint16)

  # Grayscale conversion
  grayscale[gridSize,blockSize](devSrc, devDst_gray)

  # Copy from device to host and blank the border no thread wrote to.
  midDst_gray = devDst_gray.copy_to_host()
  midDst_gray[covered_h:, :, :] = 0
  midDst_gray[:, covered_w:, :] = 0
  grayDst = midDst_gray

  # Flatten channel 0 of the processed region. Its length is
  # covered_h * covered_w, a whole multiple of blockSize_unit, so the 1-D
  # grid below covers it exactly.
  midDst_flatten = midDst_gray[:covered_h, :covered_w, 0].flatten()
  (length,) = midDst_flatten.shape
  gridSize_small = int(length/blockSize_unit)
  print(midDst_flatten)

  # Per-block maximum on the GPU. Each block leaves its result at the first
  # element of its slice of the output, so the global maximum is the largest
  # of those per-block values.
  midSrc = cuda.to_device(midDst_flatten)
  midDst = cuda.device_array(midDst_flatten.shape, midDst_flatten.dtype)
  sort_max[gridSize_small,blockSize_unit](midSrc,midDst)
  midDst_max = midDst.copy_to_host()
  max_in = int(midDst_max[::blockSize_unit].max())

  # Per-block minimum. The reduction overwrites its input, so upload a fresh
  # copy of the data first.
  midSrc = cuda.to_device(midDst_flatten)
  midDst = cuda.device_array(midDst_flatten.shape, midDst_flatten.dtype)
  sort_min[gridSize_small,blockSize_unit](midSrc,midDst)
  midDst_min = midDst.copy_to_host()
  min_in = int(midDst_min[::blockSize_unit].min())

  print(max_in)

  # Convert: stretch the 3-D gray image that is still on the device (not the
  # 1-D reduction buffer).
  devDst = cuda.device_array((imageHeight,imageWidth,3), np.uint16)
  main[gridSize,blockSize](devDst_gray,devDst,max_in,min_in)
  hostDst = devDst.copy_to_host()
  hostDst[covered_h:, :, :] = 0
  hostDst[:, covered_w:, :] = 0

  # Stop timing
  end_time = time.time()

  # Get the running time
  run_time = end_time-start_time

  if option:
    return run_time, grayDst, hostDst
  return run_time

In [None]:
#Output run_time and hostDst

# Left panel: the untouched input image.
plt.subplot(1, 3, 1)
plt.title("Original image")
plt.imshow(img_data)

# Run the pipeline once with 32x32-thread blocks and keep both images.
blockSize = 32
run_time, midDst, hostDst = compare(blockSize, option=True)

# Middle and right panels: plain grayscale, then the final result.
for panel_index, (panel_title, panel_image) in enumerate(
    [("Normal gray image", midDst), ("Result image", hostDst)], start=2):
  plt.subplot(1, 3, panel_index)
  plt.title(panel_title)
  plt.imshow(panel_image)

# Write the three-panel figure to disk and report the elapsed time.
plt.savefig(Data_path + "Test_result_LW7.jpg")
print("The run time is", run_time, "s")

37 60
[17 15 15 ...  0  0  0]


  self.size = functools.reduce(operator.mul, self.shape, 1)
  self.size = int(functools.reduce(operator.mul, self.shape, 1))


In [None]:
# Re-derive the image dimensions and the pixel count from the loaded image.
imageHeight, imageWidth, _ = img_data.shape
pixelCount = imageHeight * imageWidth

# First channel of the image; this is a 2-D slice (nothing is flattened here
# despite the variable name), as the printed shape shows.
midDst_flatten = img_data[:, :, 0]
print(midDst_flatten.shape)
#res_image = np.reshape(midDst_flatten,(imageHeight, imageWidth))
#print(res_image)

(1200, 1920)
