In [None]:
# Colab setup: check GPU and nvcc; try installing nvcc if it's missing.
# Run in a code cell with `!` commands (Colab/python cell).
# Copy-paste this into the first cell and run.

# Show GPU details
!nvidia-smi

# Check nvcc
!which nvcc || echo "nvcc not found"

# If nvcc missing, install a lightweight package that provides nvcc
# (this works in Colab's Ubuntu environment most of the time)
import os,sys,subprocess,shlex
rc = subprocess.call("which nvcc", shell=True)
if rc != 0:
    print("nvcc not found â€” attempting to install nvidia-cuda-toolkit (may take a minute)...")
    # install package that usually contains nvcc
    !apt-get update -qq
    !DEBIAN_FRONTEND=noninteractive apt-get install -y -qq nvidia-cuda-toolkit
    print("install attempted. Re-checking nvcc:")
    !which nvcc || echo "nvcc still not found. If it's missing, try restarting the runtime and re-run this cell."
else:
    print("nvcc is available")
!nvcc --version || true


Sun Oct 26 16:13:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   56C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile device_props.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0)
    {
        printf("There is no device supporting CUDA\n");
    }
    int dev;
    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        if (dev == 0)
        {
            if (deviceProp.major < 1)
            {
                printf("There is no device supporting CUDA.\n");
            }
            else if (deviceCount == 1)
            {
                printf("There is 1 device supporting CUDA\n");
            }
            else
            {
                printf("There are %d devices supporting CUDA\n", deviceCount);
            }
        }
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
        printf("  Major revision number:                         %d\n", deviceProp.major);
        printf("  Minor revision number:                         %d\n", deviceProp.minor);
        printf("  Total amount of global memory:                 %zu bytes\n", (size_t)deviceProp.totalGlobalMem);
        printf("  Total amount of constant memory:               %zu bytes\n", (size_t)deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %zu bytes\n", (size_t)deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Multiprocessor count:                          %d\n",deviceProp.multiProcessorCount );
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],  deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %zu bytes\n", (size_t)deviceProp.memPitch);
        printf("  Texture alignment:                             %zu bytes\n", (size_t)deviceProp.textureAlignment);
        printf("  Clock rate:                                    %d kilohertz\n", deviceProp.clockRate);
    }
    return 0;
}

!nvcc -arch=sm_50 -o device_props device_props.cu
!./device_props


Overwriting device_props.cu


In [None]:
!rm device_props.cu


In [None]:
%%writefile device_props.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0)
    {
        printf("There is no device supporting CUDA\n");
    }
    int dev;
    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        if (dev == 0)
        {
            if (deviceProp.major < 1)
            {
                printf("There is no device supporting CUDA.\n");
            }
            else if (deviceCount == 1)
            {
                printf("There is 1 device supporting CUDA\n");
            }
            else
            {
                printf("There are %d devices supporting CUDA\n", deviceCount);
            }
        }
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
        printf("  Major revision number:                         %d\n", deviceProp.major);
        printf("  Minor revision number:                         %d\n", deviceProp.minor);
        printf("  Total amount of global memory:                 %zu bytes\n", (size_t)deviceProp.totalGlobalMem);
        printf("  Total amount of constant memory:               %zu bytes\n", (size_t)deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %zu bytes\n", (size_t)deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Multiprocessor count:                          %d\n",deviceProp.multiProcessorCount );
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],  deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %zu bytes\n", (size_t)deviceProp.memPitch);
        printf("  Texture alignment:                             %zu bytes\n", (size_t)deviceProp.textureAlignment);
        printf("  Clock rate:                                    %d kilohertz\n", deviceProp.clockRate);
    }
    return 0;
}


Writing device_props.cu


In [None]:
!nvcc -arch=sm_50 -o device_props device_props.cu
!./device_props


There is 1 device supporting CUDA

Device 0: "Tesla T4"
  Major revision number:                         7
  Minor revision number:                         5
  Total amount of global memory:                 15828320256 bytes
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Multiprocessor count:                          40
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     2147483647 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Clock rate:                                    1590000 kilohertz


In [None]:
%%writefile hello_block1.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel function executed by GPU threads
__global__ void hello_kernel() {
    int tid = threadIdx.x;  // thread ID within the block
    printf("Hello World from thread %d (in block %d)\\n", tid, blockIdx.x);
}

// Main function runs on CPU
int main() {
    int threadsPerBlock = 8; // you can change to 16, 32, etc.
    printf("Launching kernel with 1 block and %d threads...\\n", threadsPerBlock);

    // Launch kernel <<<number_of_blocks, threads_per_block>>>
    hello_kernel<<<1, threadsPerBlock>>>();

    // Wait for all threads to finish before exiting
    cudaDeviceSynchronize();
    return 0;
}


Writing hello_block1.cu


In [None]:
!nvcc -arch=sm_50 -o hello_block1 hello_block1.cu
!./hello_block1


Launching kernel with 1 block and 8 threads...\n

In [None]:
%%writefile hello_blocks_threads.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel function executed by GPU threads
__global__ void hello_kernel() {
    int threadId = threadIdx.x;   // thread index inside the block
    int blockId = blockIdx.x;     // block index
    int globalId = blockId * blockDim.x + threadId; // overall thread ID

    printf("Hello World from global thread %d (block %d, thread %d)\\n",
           globalId, blockId, threadId);
}

// Main function runs on CPU
int main() {
    int threadsPerBlock = 4;   // threads per block
    int numBlocks = 3;         // number of blocks
    printf("Launching kernel with %d blocks and %d threads per block...\\n",
           numBlocks, threadsPerBlock);

    // Launch kernel
    hello_kernel<<<numBlocks, threadsPerBlock>>>();

    // Wait for all threads to finish
    cudaDeviceSynchronize();
    return 0;
}

Writing hello_blocks_threads.cu


In [None]:
!nvcc -arch=sm_50 -o hello_blocks_threads hello_blocks_threads.cu
!./hello_blocks_threads


Launching kernel with 3 blocks and 4 threads per block...\n

In [None]:
%%writefile hello_2d.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel function executed by GPU threads
__global__ void hello2D_kernel() {
    int bx = blockIdx.x;  // Block index (x-dimension)
    int by = blockIdx.y;  // Block index (y-dimension)
    int tx = threadIdx.x; // Thread index within the block (x-dimension)
    int ty = threadIdx.y; // Thread index within the block (y-dimension)

    // Compute unique global coordinates for thread
    printf("Hello World from block(%d,%d), thread(%d,%d)\\n", bx, by, tx, ty);
}

// Main function runs on CPU
int main() {
    dim3 threadsPerBlock(4, 2);  // Each block has 4x2 = 8 threads
    dim3 numBlocks(2, 2);        // Grid has 2x2 = 4 blocks
    printf("Launching kernel with grid(%d,%d) blocks and block(%d,%d) threads...\\n",
           numBlocks.x, numBlocks.y, threadsPerBlock.x, threadsPerBlock.y);

    // Launch the kernel with 2D configuration
    hello2D_kernel<<<numBlocks, threadsPerBlock>>>();

    // Wait for all threads to finish
    cudaDeviceSynchronize();
    return 0;
}

Writing hello_2d.cu


In [None]:
!nvcc -arch=sm_50 -o hello_2d hello_2d.cu
!./hello_2d

Launching kernel with grid(2,2) blocks and block(4,2) threads...\n

In [None]:
%%writefile vec_add.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <chrono>  // for CPU timing

// =========================
// CUDA kernel for vector addition
// =========================
__global__ void vecAddKernel(const float *A, const float *B, float *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

// Main program
// =========================
int main(int argc, char **argv) {
    int N = 1000000; // default size = 10^6
    if (argc > 1)
        N = atoi(argv[1]);
    printf("Vector Size: %d\n", N);

    // Allocate host memory
    float *h_A = (float *)malloc(N * sizeof(float));
    float *h_B = (float *)malloc(N * sizeof(float));
    float *h_C = (float *)malloc(N * sizeof(float));      // GPU result
    float *h_C_ref = (float *)malloc(N * sizeof(float));  // CPU result

    // Initialize input vectors with random values
    for (int i = 0; i < N; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // =========================
    // CPU (Serial) Computation
    // =========================
    auto cpu_start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; i++) {
        h_C_ref[i] = h_A[i] + h_B[i];
    }
    auto cpu_end = std::chrono::high_resolution_clock::now();
    double cpu_time = std::chrono::duration<double>(cpu_end - cpu_start).count();
    printf("CPU time: %f seconds\n", cpu_time);

    // =========================
    // GPU (Parallel) Computation
    // =========================
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, N * sizeof(float));
    cudaMalloc((void **)&d_B, N * sizeof(float));
    cudaMalloc((void **)&d_C, N * sizeof(float));

    cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, N * sizeof(float), cudaMemcpyHostToDevice);

    int threads = 256;
    int blocks = (N + threads - 1) / threads;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    vecAddKernel<<<blocks, threads>>>(d_A, d_B, d_C, N);
    cudaEventRecord(stop);

    cudaMemcpy(h_C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaEventSynchronize(stop);

    float gpu_ms = 0;
    cudaEventElapsedTime(&gpu_ms, start, stop); // milliseconds
    double gpu_time = gpu_ms / 1000.0;

    printf("GPU kernel time: %f ms (%f seconds)\n", gpu_ms, gpu_time);

    // =========================
    // Verification
    // =========================
    bool correct = true;
    for (int i = 0; i < 10; i++) {
        if (fabs(h_C[i] - h_C_ref[i]) > 1e-5) {
            correct = false;
            break;
        }
    }
    printf("Verification: %s\n", correct ? "PASS" : "FAIL");

    // =========================
    // Speedup
    // =========================
    if (gpu_time > 0)
        printf("Speedup (CPU/GPU): %f\n", cpu_time / gpu_time);

    // Cleanup
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}

Writing vec_add.cu


In [None]:
# Compile
!nvcc -arch=sm_50 -O2 -o vec_add vec_add.cu

# Run for different N values (10^5, 10^6, 10^7)
!./vec_add 100000
!./vec_add 1000000
!./vec_add 5000000

Vector Size: 100000
CPU time: 0.000229 seconds
GPU kernel time: 7.349440 ms (0.007349 seconds)
Verification: FAIL
Speedup (CPU/GPU): 0.031094
Vector Size: 1000000
CPU time: 0.002322 seconds
GPU kernel time: 7.301312 ms (0.007301 seconds)
Verification: FAIL
Speedup (CPU/GPU): 0.318015
Vector Size: 5000000
CPU time: 0.011441 seconds
GPU kernel time: 7.097408 ms (0.007097 seconds)
Verification: FAIL
Speedup (CPU/GPU): 1.611938


In [None]:
%%writefile matrix_add.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <chrono> // for CPU timing

// =============================
// CUDA kernel for matrix addition
// =============================
__global__ void matrixAddKernel(float *A, float *B, float *C, int M, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y; // row index
    int col = blockIdx.x * blockDim.x + threadIdx.x; // column index
    int index = row * N + col; // linear index

    if (row < M && col < N) {
        C[index] = A[index] + B[index];
    }
}

// =============================
// Main program
// =============================
int main(int argc, char **argv) {
    int M = 1000, N = 1000; // Default matrix size
    if (argc == 3) {
        M = atoi(argv[1]);
        N = atoi(argv[2]);
    }
    printf("Matrix size: %d x %d\n", M, N);

    int size = M * N * sizeof(float);

    // Host memory allocation
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);      // GPU result
    float *h_C_ref = (float *)malloc(size);  // CPU result

    // Initialize input matrices with random values
    for (int i = 0; i < M * N; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // =============================
    // CPU (Serial) computation
    // =============================
    auto cpu_start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            h_C_ref[i * N + j] = h_A[i * N + j] + h_B[i * N + j];
        }
    }
    auto cpu_end = std::chrono::high_resolution_clock::now();
    double cpu_time = std::chrono::duration<double>(cpu_end - cpu_start).count();
    printf("CPU time: %f seconds\n", cpu_time);

    // =============================
    // GPU (Parallel) computation
    // =============================
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    matrixAddKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, M, N);
    cudaEventRecord(stop);

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    cudaEventSynchronize(stop);

    float gpu_ms = 0;
    cudaEventElapsedTime(&gpu_ms, start, stop);
    double gpu_time = gpu_ms / 1000.0;

    printf("GPU time: %f ms (%f seconds)\n", gpu_ms, gpu_time);

    // =============================
    // Verification
    // =============================
    bool correct = true;
    for (int i = 0; i < 10; i++) {
        if (fabs(h_C[i] - h_C_ref[i]) > 1e-5) {
            correct = false;
            break;
        }
    }
    printf("Verification: %s\n", correct ? "PASS" : "FAIL");

    // =============================
    // Speedup
    // =============================
    if (gpu_time > 0)
        printf("Speedup (CPU/GPU): %f\n", cpu_time / gpu_time);

    // Free memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}


Writing matrix_add.cu


In [None]:
# Compile
!nvcc -arch=sm_50 -O2 -o matrix_add matrix_add.cu

# Run for different matrix sizes
!./matrix_add 100 100
!./matrix_add 500 500
!./matrix_add 1000 1000

Matrix size: 100 x 100
CPU time: 0.000030 seconds
GPU time: 7.523808 ms (0.007524 seconds)
Verification: FAIL
Speedup (CPU/GPU): 0.004035
Matrix size: 500 x 500
CPU time: 0.000583 seconds
GPU time: 7.118336 ms (0.007118 seconds)
Verification: FAIL
Speedup (CPU/GPU): 0.081934
Matrix size: 1000 x 1000
CPU time: 0.003076 seconds
GPU time: 10.753696 ms (0.010754 seconds)
Verification: FAIL
Speedup (CPU/GPU): 0.286059


In [None]:
%%writefile vector_dot.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <chrono>
#include <math.h>

// =============================
// CUDA kernel for dot product
// =============================
__global__ void dotProductKernel(float *A, float *B, float *partial_sum, int N) {
    __shared__ float cache[256]; // Shared memory for reduction
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    float temp = 0;
    while (tid < N) {
        temp += A[tid] * B[tid];
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = temp;
    __syncthreads();

    // Reduction in shared memory
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
        i /= 2;
    }

    if (cacheIndex == 0)
        partial_sum[blockIdx.x] = cache[0];
}

// =============================
// Main program
// =============================
int main(int argc, char **argv) {
    int N = 1000000; // Default vector size
    if (argc == 2)
        N = atoi(argv[1]);
    printf("Vector size: %d\n", N);

    int size = N * sizeof(float);

    // Host memory allocation
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_partial = (float *)malloc(256 * sizeof(float)); // for GPU partial sums

    // Initialize vectors
    for (int i = 0; i < N; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // =============================
    // CPU (Serial) computation
    // =============================
    auto cpu_start = std::chrono::high_resolution_clock::now();
    float cpu_result = 0;
    for (int i = 0; i < N; i++)
        cpu_result += h_A[i] * h_B[i];
    auto cpu_end = std::chrono::high_resolution_clock::now();
    double cpu_time = std::chrono::duration<double>(cpu_end - cpu_start).count();
    printf("CPU Dot Product: %f\n", cpu_result);
    printf("CPU time: %f seconds\n", cpu_time);

    // =============================
    // GPU (Parallel) computation
    // =============================
    float *d_A, *d_B, *d_partial;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    int blocks = 256;
    cudaMalloc(&d_partial, blocks * sizeof(float));

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    dotProductKernel<<<blocks, 256>>>(d_A, d_B, d_partial, N);
    cudaEventRecord(stop);

    cudaMemcpy(h_partial, d_partial, blocks * sizeof(float), cudaMemcpyDeviceToHost);
    cudaEventSynchronize(stop);

    float gpu_result = 0;
    for (int i = 0; i < blocks; i++)
        gpu_result += h_partial[i];

    float gpu_ms = 0;
    cudaEventElapsedTime(&gpu_ms, start, stop);
    double gpu_time = gpu_ms / 1000.0;

    printf("GPU Dot Product: %f\n", gpu_result);
    printf("GPU time: %f ms (%f seconds)\n", gpu_ms, gpu_time);

    // =============================
    // Verification
    // =============================
    bool correct = fabs(cpu_result - gpu_result) < 1e-5;
    printf("Verification: %s\n", correct ? "PASS" : "FAIL");

    // =============================
    // Speedup
    // =============================
    if (gpu_time > 0)
        printf("Speedup (CPU/GPU): %f\n", cpu_time / gpu_time);

    // Free memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_partial);
    free(h_A);
    free(h_B);
    free(h_partial);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}

Writing vector_dot.cu


In [None]:
# Compile CUDA program
!nvcc -arch=sm_50 -O2 -o vector_dot vector_dot.cu

# Run for different vector sizes
!./vector_dot 100000      # 1e5
!./vector_dot 1000000     # 1e6
!./vector_dot 10000000    # 1e7


Vector size: 100000
CPU Dot Product: 25001.640625
CPU time: 0.000130 seconds
GPU Dot Product: 0.000000
GPU time: 7.519104 ms (0.007519 seconds)
Verification: FAIL
Speedup (CPU/GPU): 0.017308
Vector size: 1000000
CPU Dot Product: 250005.875000
CPU time: 0.001337 seconds
GPU Dot Product: 0.000000
GPU time: 7.243040 ms (0.007243 seconds)
Verification: FAIL
Speedup (CPU/GPU): 0.184546
Vector size: 10000000
CPU Dot Product: 2471362.250000
CPU time: 0.013352 seconds
GPU Dot Product: 0.000000
GPU time: 7.881728 ms (0.007882 seconds)
Verification: FAIL
Speedup (CPU/GPU): 1.694043


In [None]:
%%writefile matrix_mul.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <chrono>
#include <math.h>

// =============================
// CUDA kernel for matrix multiplication
// =============================
__global__ void matrixMulKernel(float *A, float *B, float *C, int M, int N, int P) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < P) {
        float sum = 0;
        for (int k = 0; k < N; k++) {
            sum += A[row * N + k] * B[k * P + col];
        }
        C[row * P + col] = sum;
    }
}

// =============================
// Main program
// =============================
int main(int argc, char **argv) {
    int M = 100, N = 100, P = 100; // Default sizes
    if (argc == 4) {
        M = atoi(argv[1]);
        N = atoi(argv[2]);
        P = atoi(argv[3]);
    }
    printf("Matrix sizes: A(%dx%d), B(%dx%d)\n", M, N, N, P);

    int size_A = M * N * sizeof(float);
    int size_B = N * P * sizeof(float);
    int size_C = M * P * sizeof(float);

    // Host memory allocation
    float *h_A = (float*)malloc(size_A);
    float *h_B = (float*)malloc(size_B);
    float *h_C = (float*)malloc(size_C);      // GPU result
    float *h_C_ref = (float*)malloc(size_C);  // CPU result

    // Initialize matrices with random values
    for (int i = 0; i < M * N; i++) h_A[i] = rand() / (float)RAND_MAX;
    for (int i = 0; i < N * P; i++) h_B[i] = rand() / (float)RAND_MAX;

    // =============================
    // CPU (Serial) computation
    // =============================
    auto cpu_start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < P; j++) {
            float sum = 0;
            for (int k = 0; k < N; k++) {
                sum += h_A[i * N + k] * h_B[k * P + j];
            }
            h_C_ref[i * P + j] = sum;
        }
    }
    auto cpu_end = std::chrono::high_resolution_clock::now();
    double cpu_time = std::chrono::duration<double>(cpu_end - cpu_start).count();
    printf("CPU time: %f seconds\n", cpu_time);

    // =============================
    // GPU (Parallel) computation
    // =============================
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size_A);
    cudaMalloc(&d_B, size_B);
    cudaMalloc(&d_C, size_C);

    cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((P + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    matrixMulKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, M, N, P);
    cudaEventRecord(stop);

    cudaMemcpy(h_C, d_C, size_C, cudaMemcpyDeviceToHost);
    cudaEventSynchronize(stop);

    float gpu_ms = 0;
    cudaEventElapsedTime(&gpu_ms, start, stop);
    double gpu_time = gpu_ms / 1000.0;
    printf("GPU time: %f ms (%f seconds)\n", gpu_ms, gpu_time);

    // =============================
    // Verification
    // =============================
    bool correct = true;
    for (int i = 0; i < M * P; i++) {
        if (fabs(h_C[i] - h_C_ref[i]) > 1e-4) {
            correct = false;
            break;
        }
    }
    printf("Verification: %s\n", correct ? "PASS" : "FAIL");

    // =============================
    // Speedup
    // =============================
    if (gpu_time > 0)
        printf("Speedup (CPU/GPU): %f\n", cpu_time / gpu_time);

    // Free memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Writing matrix_mul.cu


In [None]:
# Compile
!nvcc -arch=sm_50 -O2 -o matrix_mul matrix_mul.cu

# Run for different sizes
!./matrix_mul 100 100 100
!./matrix_mul 500 500 500
!./matrix_mul 1000 1000 1000


Matrix sizes: A(100x100), B(100x100)
CPU time: 0.001036 seconds
GPU time: 7.718336 ms (0.007718 seconds)
Verification: FAIL
Speedup (CPU/GPU): 0.134186
Matrix sizes: A(500x500), B(500x500)
CPU time: 0.159607 seconds
GPU time: 7.337952 ms (0.007338 seconds)
Verification: FAIL
Speedup (CPU/GPU): 21.750851
Matrix sizes: A(1000x1000), B(1000x1000)
CPU time: 1.307361 seconds
GPU time: 7.192192 ms (0.007192 seconds)
Verification: FAIL
Speedup (CPU/GPU): 181.774990
