In [2]:
!pip install numba



In [3]:
import numpy as np

# Mảng ví dụ
matrix = np.random.randint(10, 1000, size=(10000, 10000))

np.savetxt('imat1.gz', matrix, fmt='%i')


In [4]:
# Tuần tự
%%writefile sequential1.py
import time
import numpy as np

# Hàm tính sum row
def calculate_row_sums(matrix):
    row_sums = np.zeros(matrix.shape[0], dtype=np.int32)
    for i in range(matrix.shape[0]):
        row_sums[i] = np.sum(matrix[i])
    return row_sums

matrix = np.loadtxt('imat1.gz', dtype=int)

# Start
start = time.time()

row_sums = calculate_row_sums(matrix)

end = time.time()

print(f'Processing time: {end - start} s')

np.savetxt('omat_s1.gz', row_sums, fmt='%i')


Writing sequential1.py


In [5]:
!python sequential1.py

Processing time: 0.12828278541564941 s


In [6]:
# Song song
%%writefile parallel1.py
import time
import math
import numpy as np
from numba import jit, cuda

@cuda.jit

# Hàm tính sum dùng gpu
def calculate_row_sums_gpu(matrix, row_sums):
    tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    if tid < matrix.shape[0]:
        row_sum = 0
        for i in range(matrix.shape[1]):
            row_sum += matrix[tid, i]
        row_sums[tid] = row_sum


matrix = np.loadtxt('imat1.gz', dtype=int)

matrix_gpu = cuda.to_device(matrix)
row_sums_gpu = cuda.device_array(matrix.shape[0])

threads_per_block = 32
blocks_per_grid = (matrix.shape[0] - 1) // threads_per_block + 1

# Start
start = time.time()
calculate_row_sums_gpu[blocks_per_grid, threads_per_block](matrix_gpu, row_sums_gpu)
end = time.time()

row_sums = row_sums_gpu.copy_to_host()
print(f'Processing time: {end - start} s')

np.savetxt('omat_p1.gz', row_sums, fmt='%i')

Writing parallel1.py


In [7]:
!python parallel1.py # Lần chạy 1

Processing time: 0.8741281032562256 s


In [8]:
!python parallel1.py # Lần chạy 2

Processing time: 0.23253798484802246 s


In [9]:
C_s1 = np.loadtxt('omat_s1.gz', dtype=int)
C_p1 = np.loadtxt('omat_p1.gz', dtype=int)
np.mean(np.abs(C_p1 - C_s1))

0.0

In [10]:
# Song song dùng shared memory
%%writefile parallel2.py
import time
import math
import numpy as np
from numba import cuda, float32
from numba import jit, cuda

@cuda.jit

def calculate_row_sums_gpu_smem(matrix, row_sums):
    block_row = cuda.blockIdx.x
    tid = cuda.threadIdx.x

    # Cấp phát shared memory cho block
    shared_mem = cuda.shared.array(shape=32, dtype=float32)  # Specify the correct type here

    # Khởi tạo shared memory với mảng 0
    shared_mem[tid] = 0.0
    cuda.syncthreads()

    for col in range(matrix.shape[1]):
        if tid < matrix.shape[0]:
            shared_mem[tid] += matrix[tid, col]

    cuda.syncthreads()

    s = 1
    while s < cuda.blockDim.x:
        idx = 2 * s * tid
        if idx < cuda.blockDim.x:
            shared_mem[idx] += shared_mem[idx + s]
        s *= 2
        cuda.syncthreads()

    if tid == 0:
        row_sums[block_row] = shared_mem[0]

matrix = np.loadtxt('imat1.gz', dtype=int)

matrix_gpu = cuda.to_device(matrix)
row_sums_gpu = cuda.device_array(matrix.shape[0])

threads_per_block = 32
blocks_per_grid = (matrix.shape[0] - 1) // threads_per_block + 1

# Start
start = time.time()
calculate_row_sums_gpu_smem[blocks_per_grid, threads_per_block](matrix_gpu, row_sums_gpu)
end = time.time()

row_sums = row_sums_gpu.copy_to_host()
print(f'Processing time: {end - start} s')

np.savetxt('omat_p2.gz', row_sums, fmt='%i')

Writing parallel2.py


In [11]:
!python parallel2.py # Lần chạy 1

Processing time: 0.3311479091644287 s


In [14]:
!python parallel2.py # Lần chạy 2

Processing time: 0.28098416328430176 s


In [13]:
C_s1 = np.loadtxt('omat_s1.gz', dtype=int)
C_p2 = np.loadtxt('omat_p2.gz', dtype=int)
np.mean(np.abs(C_p1 - C_s1))

0.0