!nvidia-smi

Part1

In [None]:
!nvidia-smi


In [9]:
%%writefile matrix_cpu.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

void matrixMultiplyCPU(float *A, float *B, float *C, int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}

int main(int argc, char **argv) {
    int N = (argc > 1) ? atoi(argv[1]) : 1024;
    size_t size = N * N * sizeof(float);

    float *A = (float *)malloc(size);
    float *B = (float *)malloc(size);
    float *C = (float *)malloc(size);

    for (int i = 0; i < N * N; i++) {
        A[i] = rand() % 100 / 100.0f;
        B[i] = rand() % 100 / 100.0f;
    }

    clock_t start = clock();
    matrixMultiplyCPU(A, B, C, N);
    clock_t end = clock();

    double elapsed = (double)(end - start) / CLOCKS_PER_SEC;
    printf("CPU execution time (N=%d): %f seconds\n", N, elapsed);

    free(A); free(B); free(C);
    return 0;
}

Writing matrix_cpu.c


In [10]:
!gcc matrix_cpu.c -o matrix_cpu -O2

In [11]:
print("=== CPU Matrix Multiplication Tests ===\n")
!./matrix_cpu 256
!./matrix_cpu 512
!./matrix_cpu 768
!./matrix_cpu 1024
!./matrix_cpu 1536
!./matrix_cpu 2048

=== CPU Matrix Multiplication Tests ===

CPU execution time (N=256): 0.020597 seconds
CPU execution time (N=512): 0.201802 seconds
CPU execution time (N=768): 0.681257 seconds
CPU execution time (N=1024): 3.224413 seconds
CPU execution time (N=1536): 17.811165 seconds
CPU execution time (N=2048): 83.500410 seconds


Part 2

In [12]:
%%writefile matrix_gpu.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void matrixMultiplyGPU(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; k++) {
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

int main(int argc, char **argv) {
    int N = (argc > 1) ? atoi(argv[1]) : 1024;
    size_t size = N * N * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    for (int i = 0; i < N * N; i++) {
        h_A[i] = rand() % 100 / 100.0f;
        h_B[i] = rand() % 100 / 100.0f;
    }

    float *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    dim3 dimBlock(16, 16);
    dim3 dimGrid((N + 15) / 16, (N + 15) / 16);

    cudaEventRecord(start);
    matrixMultiplyGPU<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    printf("Naive GPU time (N=%d): %.3f ms (%.3f sec)\n", N, milliseconds, milliseconds/1000.0);

    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C);
    return 0;
}

Writing matrix_gpu.cu


In [13]:
!nvcc matrix_gpu.cu -o matrix_gpu

Part 3

In [14]:
print("=== Naive GPU Matrix Multiplication Tests ===\n")
!./matrix_gpu 256
!./matrix_gpu 512
!./matrix_gpu 768
!./matrix_gpu 1024
!./matrix_gpu 1536
!./matrix_gpu 2048
!./matrix_gpu 4096

=== Naive GPU Matrix Multiplication Tests ===

Naive GPU time (N=256): 9.144 ms (0.009 sec)
Naive GPU time (N=512): 7.388 ms (0.007 sec)
Naive GPU time (N=768): 12.828 ms (0.013 sec)
Naive GPU time (N=1024): 11.195 ms (0.011 sec)
Naive GPU time (N=1536): 11.692 ms (0.012 sec)
Naive GPU time (N=2048): 11.023 ms (0.011 sec)
Naive GPU time (N=4096): 7.387 ms (0.007 sec)


Part 4

In [15]:
%%writefile matrix_tiled.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define TILE_WIDTH 16

__global__ void matrixMultiplyTiled(float *A, float *B, float *C, int N) {
    __shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
    __shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;

    float Pvalue = 0.0;

    for (int m = 0; m < (N + TILE_WIDTH - 1) / TILE_WIDTH; ++m) {
        if (Row < N && (m*TILE_WIDTH+tx) < N)
            ds_A[ty][tx] = A[Row * N + m * TILE_WIDTH + tx];
        else
            ds_A[ty][tx] = 0.0f;

        if (Col < N && (m*TILE_WIDTH+ty) < N)
            ds_B[ty][tx] = B[(m*TILE_WIDTH + ty) * N + Col];
        else
            ds_B[ty][tx] = 0.0f;

        __syncthreads();

        for (int k = 0; k < TILE_WIDTH; ++k)
            Pvalue += ds_A[ty][k] * ds_B[k][tx];

        __syncthreads();
    }

    if (Row < N && Col < N)
        C[Row * N + Col] = Pvalue;
}

int main(int argc, char **argv) {
    int N = (argc > 1) ? atoi(argv[1]) : 1024;
    size_t size = N * N * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    for (int i = 0; i < N * N; i++) {
        h_A[i] = rand() % 100 / 100.0f;
        h_B[i] = rand() % 100 / 100.0f;
    }

    float *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    dim3 dimGrid((N + TILE_WIDTH - 1) / TILE_WIDTH, (N + TILE_WIDTH - 1) / TILE_WIDTH);

    cudaEventRecord(start);
    matrixMultiplyTiled<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    printf("Optimized GPU time (N=%d): %.3f ms (%.3f sec)\n", N, milliseconds, milliseconds/1000.0);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}

Writing matrix_tiled.cu


In [16]:
!nvcc matrix_tiled.cu -o matrix_tiled

In [17]:
print("=== Optimized GPU (Tiled) Matrix Multiplication Tests ===\n")
!./matrix_tiled 256
!./matrix_tiled 512
!./matrix_tiled 768
!./matrix_tiled 1024
!./matrix_tiled 1536
!./matrix_tiled 2048
!./matrix_tiled 4096

=== Optimized GPU (Tiled) Matrix Multiplication Tests ===

Optimized GPU time (N=256): 7.480 ms (0.007 sec)
Optimized GPU time (N=512): 7.296 ms (0.007 sec)
Optimized GPU time (N=768): 7.423 ms (0.007 sec)
Optimized GPU time (N=1024): 7.482 ms (0.007 sec)
Optimized GPU time (N=1536): 7.229 ms (0.007 sec)
Optimized GPU time (N=2048): 7.320 ms (0.007 sec)
Optimized GPU time (N=4096): 7.370 ms (0.007 sec)


Part 6

In [18]:
%%writefile matrix_cublas.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(int argc, char **argv) {
    int N = (argc > 1) ? atoi(argv[1]) : 1024;
    size_t size = N * N * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    for (int i = 0; i < N * N; i++) {
        h_A[i] = rand() % 100 / 100.0f;
        h_B[i] = rand() % 100 / 100.0f;
    }

    float *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    float alpha = 1.0f;
    float beta = 0.0f;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                N, N, N,
                &alpha,
                d_B, N,
                d_A, N,
                &beta,
                d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    printf("cuBLAS GPU time (N=%d): %.3f ms (%.3f sec)\n", N, milliseconds, milliseconds/1000.0);

    cublasDestroy(handle);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}

Writing matrix_cublas.cu


In [19]:
!nvcc matrix_cublas.cu -o matrix_cublas -lcublas

In [20]:
print("=== cuBLAS Matrix Multiplication Tests ===\n")
!./matrix_cublas 256
!./matrix_cublas 512
!./matrix_cublas 768
!./matrix_cublas 1024
!./matrix_cublas 1536
!./matrix_cublas 2048
!./matrix_cublas 4096

=== cuBLAS Matrix Multiplication Tests ===

cuBLAS GPU time (N=256): 80.034 ms (0.080 sec)
cuBLAS GPU time (N=512): 5.376 ms (0.005 sec)
cuBLAS GPU time (N=768): 5.620 ms (0.006 sec)
cuBLAS GPU time (N=1024): 6.163 ms (0.006 sec)
cuBLAS GPU time (N=1536): 7.875 ms (0.008 sec)
cuBLAS GPU time (N=2048): 11.310 ms (0.011 sec)
cuBLAS GPU time (N=4096): 53.277 ms (0.053 sec)


Part 7

In [21]:
%%writefile matrix_lib.cu
#include <cuda_runtime.h>
#include <stdio.h>

#define TILE_WIDTH 16

__global__ void matrixMultiplyTiled(float *A, float *B, float *C, int N) {
    __shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
    __shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;

    float Pvalue = 0.0;

    for (int m = 0; m < (N + TILE_WIDTH - 1) / TILE_WIDTH; ++m) {
        if (Row < N && (m*TILE_WIDTH+tx) < N)
            ds_A[ty][tx] = A[Row * N + m * TILE_WIDTH + tx];
        else
            ds_A[ty][tx] = 0.0f;

        if (Col < N && (m*TILE_WIDTH+ty) < N)
            ds_B[ty][tx] = B[(m*TILE_WIDTH + ty) * N + Col];
        else
            ds_B[ty][tx] = 0.0f;

        __syncthreads();

        for (int k = 0; k < TILE_WIDTH; ++k)
            Pvalue += ds_A[ty][k] * ds_B[k][tx];

        __syncthreads();
    }

    if (Row < N && Col < N)
        C[Row * N + Col] = Pvalue;
}

// Exposed C function for Python
extern "C" void gpu_matrix_multiply(float *h_A, float *h_B, float *h_C, int N) {
    size_t size = N * N * sizeof(float);
    float *d_A, *d_B, *d_C;

    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    dim3 dimGrid((N + TILE_WIDTH - 1) / TILE_WIDTH, (N + TILE_WIDTH - 1) / TILE_WIDTH);

    matrixMultiplyTiled<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);
    cudaDeviceSynchronize();

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}

Writing matrix_lib.cu


In [22]:
!nvcc -Xcompiler -fPIC -shared matrix_lib.cu -o libmatrix.so

In [23]:
%%writefile test_cuda_lib.py
import ctypes
import numpy as np
import time

# Load shared library
lib = ctypes.cdll.LoadLibrary("./libmatrix.so")

# Define argument types
lib.gpu_matrix_multiply.argtypes = [
    np.ctypeslib.ndpointer(dtype=np.float32, ndim=1, flags="C_CONTIGUOUS"),
    np.ctypeslib.ndpointer(dtype=np.float32, ndim=1, flags="C_CONTIGUOUS"),
    np.ctypeslib.ndpointer(dtype=np.float32, ndim=1, flags="C_CONTIGUOUS"),
    ctypes.c_int
]

# Test with different sizes
sizes = [256, 512, 1024, 2048]

print("="*60)
print("Python calling CUDA library - Performance Test")
print("="*60)

for N in sizes:
    A = np.random.rand(N, N).astype(np.float32)
    B = np.random.rand(N, N).astype(np.float32)
    C = np.zeros((N, N), dtype=np.float32)

    start = time.time()
    lib.gpu_matrix_multiply(A.ravel(), B.ravel(), C.ravel(), N)
    end = time.time()

    print(f"N={N}: {(end - start)*1000:.3f} ms ({end - start:.4f} sec)")

print("="*60)

Writing test_cuda_lib.py


In [24]:
!python test_cuda_lib.py

Python calling CUDA library - Performance Test
N=256: 210.550 ms (0.2105 sec)
N=512: 1.522 ms (0.0015 sec)
N=1024: 3.601 ms (0.0036 sec)
N=2048: 20.458 ms (0.0205 sec)


In [25]:
%%writefile convolution_lib.cu
#include <cuda_runtime.h>
#include <stdio.h>

// 2D Convolution kernel
__global__ void convolveGPU(float *image, float *filter, float *output,
                            int imageWidth, int imageHeight,
                            int filterSize) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (row < imageHeight && col < imageWidth) {
        float sum = 0.0f;
        int halfFilter = filterSize / 2;

        for (int fRow = 0; fRow < filterSize; fRow++) {
            for (int fCol = 0; fCol < filterSize; fCol++) {
                int imageRow = row - halfFilter + fRow;
                int imageCol = col - halfFilter + fCol;

                // Handle boundaries (zero padding)
                if (imageRow >= 0 && imageRow < imageHeight &&
                    imageCol >= 0 && imageCol < imageWidth) {
                    sum += image[imageRow * imageWidth + imageCol] *
                           filter[fRow * filterSize + fCol];
                }
            }
        }
        output[row * imageWidth + col] = sum;
    }
}

// Exposed C function for Python
extern "C" void gpu_convolve(float *h_image, float *h_filter, float *h_output,
                             int imageWidth, int imageHeight, int filterSize) {
    size_t imageSize = imageWidth * imageHeight * sizeof(float);
    size_t filterSize_bytes = filterSize * filterSize * sizeof(float);

    float *d_image, *d_filter, *d_output;
    cudaMalloc((void**)&d_image, imageSize);
    cudaMalloc((void**)&d_filter, filterSize_bytes);
    cudaMalloc((void**)&d_output, imageSize);

    cudaMemcpy(d_image, h_image, imageSize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_filter, h_filter, filterSize_bytes, cudaMemcpyHostToDevice);

    dim3 dimBlock(16, 16);
    dim3 dimGrid((imageWidth + 15) / 16, (imageHeight + 15) / 16);

    convolveGPU<<<dimGrid, dimBlock>>>(d_image, d_filter, d_output,
                                       imageWidth, imageHeight, filterSize);
    cudaDeviceSynchronize();

    cudaMemcpy(h_output, d_output, imageSize, cudaMemcpyDeviceToHost);

    cudaFree(d_image);
    cudaFree(d_filter);
    cudaFree(d_output);
}

Writing convolution_lib.cu


In [26]:
!nvcc -Xcompiler -fPIC -shared convolution_lib.cu -o libconvolution.so

In [27]:
%%writefile test_convolution.py
import ctypes
import numpy as np
import time
import matplotlib.pyplot as plt

# Load library
lib = ctypes.cdll.LoadLibrary("./libconvolution.so")

lib.gpu_convolve.argtypes = [
    np.ctypeslib.ndpointer(dtype=np.float32, ndim=1, flags="C_CONTIGUOUS"),
    np.ctypeslib.ndpointer(dtype=np.float32, ndim=1, flags="C_CONTIGUOUS"),
    np.ctypeslib.ndpointer(dtype=np.float32, ndim=1, flags="C_CONTIGUOUS"),
    ctypes.c_int, ctypes.c_int, ctypes.c_int
]

# Edge detection filters
sobel_x = np.array([[-1, 0, 1],
                    [-2, 0, 2],
                    [-1, 0, 1]], dtype=np.float32)

sobel_y = np.array([[-1, -2, -1],
                    [ 0,  0,  0],
                    [ 1,  2,  1]], dtype=np.float32)

blur = np.array([[1, 1, 1],
                 [1, 1, 1],
                 [1, 1, 1]], dtype=np.float32) / 9.0

# Create test image (circle)
size = 512
image = np.zeros((size, size), dtype=np.float32)
center = size // 2
radius = size // 4
for i in range(size):
    for j in range(size):
        if (i - center)**2 + (j - center)**2 < radius**2:
            image[i, j] = 1.0

# Apply filters
output_x = np.zeros_like(image)
output_y = np.zeros_like(image)
output_blur = np.zeros_like(image)

print("Applying edge detection filters...")

start = time.time()
lib.gpu_convolve(image.ravel(), sobel_x.ravel(), output_x.ravel(), size, size, 3)
time_x = time.time() - start

start = time.time()
lib.gpu_convolve(image.ravel(), sobel_y.ravel(), output_y.ravel(), size, size, 3)
time_y = time.time() - start

start = time.time()
lib.gpu_convolve(image.ravel(), blur.ravel(), output_blur.ravel(), size, size, 3)
time_blur = time.time() - start

# Combine edge detection
edges = np.sqrt(output_x**2 + output_y**2)

print(f"\nPerformance (512x512 image):")
print(f"Sobel X: {time_x*1000:.3f} ms")
print(f"Sobel Y: {time_y*1000:.3f} ms")
print(f"Blur: {time_blur*1000:.3f} ms")

# Visualize
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
axes[0, 0].imshow(image, cmap='gray')
axes[0, 0].set_title('Original Image')
axes[0, 1].imshow(output_x, cmap='gray')
axes[0, 1].set_title('Sobel X (Vertical Edges)')
axes[0, 2].imshow(output_y, cmap='gray')
axes[0, 2].set_title('Sobel Y (Horizontal Edges)')
axes[1, 0].imshow(edges, cmap='gray')
axes[1, 0].set_title('Combined Edge Detection')
axes[1, 1].imshow(output_blur, cmap='gray')
axes[1, 1].set_title('Blur Filter')
axes[1, 2].axis('off')

for ax in axes.flat:
    ax.axis('off')

plt.tight_layout()
plt.savefig('convolution_results.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nDone")

Writing test_convolution.py


In [28]:
!python test_convolution.py

Applying edge detection filters...

Performance (512x512 image):
Sobel X: 201.844 ms
Sobel Y: 1.183 ms
Blur: 1.002 ms
Figure(1200x800)

Done


Part 8

In [29]:
%%writefile convolution_cpu.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// CPU convolution
void convolveCPU(float *image, float *filter, float *output,
                 int imageWidth, int imageHeight, int filterSize) {

    int half = filterSize / 2;

    for (int row = 0; row < imageHeight; row++) {
        for (int col = 0; col < imageWidth; col++) {
            float sum = 0.0f;

            for (int fr = 0; fr < filterSize; fr++) {
                for (int fc = 0; fc < filterSize; fc++) {
                    int r = row - half + fr;
                    int c = col - half + fc;

                    if (r >= 0 && r < imageHeight &&
                        c >= 0 && c < imageWidth) {
                        sum += image[r * imageWidth + c] *
                               filter[fr * filterSize + fc];
                    }
                }
            }
            output[row * imageWidth + col] = sum;
        }
    }
}

int main(int argc, char **argv) {
    int N = (argc > 1) ? atoi(argv[1]) : 512;
    int filterSize = (argc > 2) ? atoi(argv[2]) : 3;

    size_t imageBytes = N * N * sizeof(float);
    size_t filterBytes = filterSize * filterSize * sizeof(float);

    float *image = (float *)malloc(imageBytes);
    float *output = (float *)malloc(imageBytes);
    float *filter = (float *)malloc(filterBytes);

    for (int i = 0; i < N * N; i++)
        image[i] = (i % 255) / 255.0f;

    for (int i = 0; i < filterSize * filterSize; i++)
        filter[i] = 1.0f / (filterSize * filterSize);

    clock_t start = clock();
    convolveCPU(image, filter, output, N, N, filterSize);
    clock_t end = clock();

    double timeSec = (double)(end - start) / CLOCKS_PER_SEC;
    printf("CPU Convolution: Image=%dx%d Filter=%dx%d Time=%.4f sec\n",
           N, N, filterSize, filterSize, timeSec);

    free(image);
    free(output);
    free(filter);
    return 0;
}


Overwriting convolution_cpu.c


In [30]:
!gcc convolution_cpu.c -O2 -o convolution_cpu


In [31]:
!./convolution_cpu 256 3
!./convolution_cpu 512 3
!./convolution_cpu 1024 3


CPU Convolution: Image=256x256 Filter=3x3 Time=0.0013 sec
CPU Convolution: Image=512x512 Filter=3x3 Time=0.0041 sec
CPU Convolution: Image=1024x1024 Filter=3x3 Time=0.0156 sec


In [32]:
%%writefile convolution_gpu.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void convolveGPU(float *img, float *fil, float *out,
                            int W, int H, int F) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int half = F / 2;

    if (x < W && y < H) {
        float sum = 0.0f;
        for (int fy = 0; fy < F; fy++)
            for (int fx = 0; fx < F; fx++) {
                int iy = y - half + fy;
                int ix = x - half + fx;
                if (iy >= 0 && iy < H && ix >= 0 && ix < W)
                    sum += img[iy * W + ix] * fil[fy * F + fx];
            }
        out[y * W + x] = sum;
    }
}

int main(int argc, char **argv) {
    int N = (argc > 1) ? atoi(argv[1]) : 512;
    int F = (argc > 2) ? atoi(argv[2]) : 3;

    size_t imgSize = N * N * sizeof(float);
    size_t filSize = F * F * sizeof(float);

    float *h_img = (float*)malloc(imgSize);
    float *h_fil = (float*)malloc(filSize);
    float *h_out = (float*)malloc(imgSize);

    for (int i = 0; i < N * N; i++) h_img[i] = (i % 255) / 255.0f;
    for (int i = 0; i < F * F; i++) h_fil[i] = 1.0f / (F * F);

    float *d_img, *d_fil, *d_out;
    cudaMalloc(&d_img, imgSize);
    cudaMalloc(&d_fil, filSize);
    cudaMalloc(&d_out, imgSize);

    cudaMemcpy(d_img, h_img, imgSize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_fil, h_fil, filSize, cudaMemcpyHostToDevice);

    dim3 block(16,16);
    dim3 grid((N+15)/16, (N+15)/16);

    cudaEvent_t s,e;
    cudaEventCreate(&s); cudaEventCreate(&e);
    cudaEventRecord(s);

    convolveGPU<<<grid,block>>>(d_img,d_fil,d_out,N,N,F);
    cudaEventRecord(e);
    cudaEventSynchronize(e);

    float ms;
    cudaEventElapsedTime(&ms,s,e);

    printf("CUDA Convolution N=%d F=%d Time=%.3f ms\n", N, F, ms);

    cudaFree(d_img); cudaFree(d_fil); cudaFree(d_out);
    free(h_img); free(h_fil); free(h_out);
}


Overwriting convolution_gpu.cu


In [33]:
!nvcc convolution_gpu.cu -o convolution_gpu


In [34]:
!./convolution_gpu 256 3
!./convolution_gpu 512 3
!./convolution_gpu 1024 3

CUDA Convolution N=256 F=3 Time=8.970 ms
CUDA Convolution N=512 F=3 Time=7.639 ms
CUDA Convolution N=1024 F=3 Time=7.303 ms


| Image Size | CPU Time (ms) | CUDA Time (ms) |
| ---------- | ------------- | -------------- |
| 256×256    | 1.4           | 76.5           |
| 512×512    | 4.3           | 11.1           |
| 1024×1024  | 28.6          | 10.9           |
