<a href="https://colab.research.google.com/github/EugenHotaj/cML/blob/main/cuda_vec_addition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
# Goofy way to use free GPUs from Google. Write all our CUDA code as a Python string,
# write that to a file, compile with NVCC, then run.

code = r"""
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define N 10000000
#define MAX_ERR 1e-6

// Element-wise vector addition: out[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: one-dimensional grid of one-dimensional blocks (only the .x
// components of blockIdx/blockDim/gridDim are used). Any grid size is valid:
// each thread starts at its global index and advances by the total number of
// launched threads, so a grid smaller than n still covers every element.
//
// out, a and b must be device-accessible pointers to at least n floats.
// a and b are only read. n <= 0 makes the kernel a no-op.
__global__ void vector_add(float *out, const float *a, const float *b, int n) {
    // 64-bit arithmetic: blockIdx.x * blockDim.x is a 32-bit unsigned product
    // that can wrap (and turn negative when narrowed to int) for n near INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += stride) {
        out[i] = a[i] + b[i];
    }
}

// Abort with a readable message when a CUDA runtime call fails.
// `what` names the operation so the failing step is obvious in the output.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two N-element vectors on the GPU and verifies the result on the host.
//
// Returns EXIT_SUCCESS after printing out[0] and "PASSED" when every element
// matches a_h[i] + b_h[i] within MAX_ERR. Exits with EXIT_FAILURE on host
// allocation failure, on any CUDA runtime error, or on a verification mismatch.
int main(){
    float *a_h, *b_h, *out_h;
    float *a_d, *b_d, *out_d;
    // size_t rather than int: a byte count must not overflow for large N.
    const size_t size_bytes = sizeof(float) * N;

    // Integer ceil-division so the tail elements get a (partially used) block.
    const int threads_per_block = 256;
    const int num_blocks = (N + threads_per_block - 1) / threads_per_block;

    // Allocate host memory.
    a_h   = (float*)malloc(size_bytes);
    b_h   = (float*)malloc(size_bytes);
    out_h = (float*)malloc(size_bytes);
    if (a_h == NULL || b_h == NULL || out_h == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size_bytes);
        free(a_h);
        free(b_h);
        free(out_h);
        return EXIT_FAILURE;
    }

    // Initialize host arrays.
    for(int i = 0; i < N; i++){
        a_h[i] = 1.0f;
        b_h[i] = 2.0f;
    }

    // Allocate device memory.
    check_cuda(cudaMalloc((void**)&a_d, size_bytes), "cudaMalloc(a_d)");
    check_cuda(cudaMalloc((void**)&b_d, size_bytes), "cudaMalloc(b_d)");
    check_cuda(cudaMalloc((void**)&out_d, size_bytes), "cudaMalloc(out_d)");

    // Transfer data from host to device memory.
    check_cuda(cudaMemcpy(a_d, a_h, size_bytes, cudaMemcpyHostToDevice), "copy a to device");
    check_cuda(cudaMemcpy(b_d, b_h, size_bytes, cudaMemcpyHostToDevice), "copy b to device");

    // Execute kernel. A launch returns no status itself; configuration errors
    // are only visible through cudaGetLastError().
    vector_add<<<num_blocks, threads_per_block>>>(out_d, a_d, b_d, N);
    check_cuda(cudaGetLastError(), "vector_add launch");

    // Transfer data back to host memory. This blocking copy waits for the
    // kernel, so errors raised while it executed are reported here.
    check_cuda(cudaMemcpy(out_h, out_d, size_bytes, cudaMemcpyDeviceToHost), "copy out to host");

    // Verification. Done with an explicit check instead of assert() so it is
    // not compiled away under NDEBUG; the negated '<' also rejects NaN.
    int status = EXIT_SUCCESS;
    for(int i = 0; i < N; i++){
        if (!(fabs(out_h[i] - a_h[i] - b_h[i]) < MAX_ERR)) {
            fprintf(stderr, "FAILED: out[%d] = %f, expected %f\n",
                    i, out_h[i], a_h[i] + b_h[i]);
            status = EXIT_FAILURE;
            break;
        }
    }
    if (status == EXIT_SUCCESS) {
        printf("out[0] = %f\n", out_h[0]);
        printf("PASSED\n");
    }

    // Deallocate device memory.
    check_cuda(cudaFree(a_d), "cudaFree(a_d)");
    check_cuda(cudaFree(b_d), "cudaFree(b_d)");
    check_cuda(cudaFree(out_d), "cudaFree(out_d)");

    // Deallocate host memory.
    free(a_h);
    free(b_h);
    free(out_h);

    return status;
}

"""

In [55]:
# Write the CUDA source held in the `code` string to code.cu in the current
# working directory (the file is overwritten if it already exists).
with open("code.cu", "w") as file_:
    file_.write(code)
# IPython shell escape: compile code.cu with nvcc and, only if compilation
# succeeds (&&), run the resulting ./a.out binary.
!nvcc code.cu && ./a.out

out[0] = 3.000000
PASSED
