<a href="https://colab.research.google.com/github/wangchengdng/CUDA_Learning/blob/main/cuda_hello_world.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+git://github.com/depctg/nvcc4jupyter.git
%load_ext nvcc_plugin

!nvcc --version

In [None]:
%%cu
#include <stdio.h>
// Greeting kernel: every launched thread prints one line containing its
// block index and its thread index within that block (x dimension only).
__global__ void cuda_hello() {
    printf("Hello World from block %d and thread %d!\n",
           static_cast<int>(blockIdx.x),
           static_cast<int>(threadIdx.x));
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches cuda_hello on 2 blocks of 3 threads and waits for it to finish.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    cuda_hello<<<2,3>>>();
    // A kernel launch has no return value; a bad launch configuration is
    // only visible through cudaGetLastError().
    if (!cuda_ok(cudaGetLastError(), "cuda_hello launch")) {
        return 1;
    }
    // Errors raised while the kernel runs are reported by the next
    // synchronizing call, so the result of the sync must be checked too.
    if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
        return 1;
    }
    return 0;
}

In [None]:
%%cu
#include<iostream>

#define VECTOR_LENGTH 100

// Element-wise sum: out[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks, any size. The loop below starts at the
// thread's global index and advances by the total number of launched threads,
// so <<<1,1>>> walks the elements serially while larger launches split them
// across threads. No shared memory is used.
//
// out, a and b must be device-accessible and hold at least n floats. When
// more than one thread is launched, out must not partially overlap a or b.
__global__ void vector_add_cuda(float *out, float *a, float *b, int n) {
    // 64-bit arithmetic keeps the index math from overflowing for large
    // grids or n close to INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += stride) {
        out[i] = a[i] + b[i];
    }
}

// Host-side element-wise sum: writes a[i] + b[i] into out[i] for the first
// n elements, in ascending index order. Does nothing when n <= 0.
void vector_add(float *out, float *a, float *b, int n) {
    float *dst = out;
    const float *lhs = a;
    const float *rhs = b;
    for (int remaining = n; remaining > 0; --remaining) {
        *dst = *lhs + *rhs;
        ++dst;
        ++lhs;
        ++rhs;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true on cudaSuccess so several calls can be chained with &&.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two VECTOR_LENGTH-element vectors on the GPU, prints every element of
// the GPU result, and compares it with the CPU reference from vector_add().
// Returns 0 on success, 1 if an allocation, a CUDA call, or the comparison
// failed. All buffers are released on every path.
int main() {
    const size_t bytes = sizeof(float) * VECTOR_LENGTH;

    // Host buffers: the two inputs, the GPU result, and the CPU reference.
    float *a = (float*)malloc(bytes);
    float *b = (float*)malloc(bytes);
    float *out = (float*)malloc(bytes);
    float *cpu_out = (float*)malloc(bytes);
    // Device buffers start as null so the cleanup at the end is safe even
    // when an earlier step failed.
    float *cuda_a = nullptr;
    float *cuda_b = nullptr;
    float *cuda_out = nullptr;

    bool ok = a && b && out && cpu_out;
    if (!ok) {
        std::cerr << "host malloc failed" << std::endl;
    }

    if (ok) {
        for (int i = 0; i < VECTOR_LENGTH; i++) {
            a[i] = 3.0f;
            b[i] = 0.14f;
        }

        // Allocate device memory and upload the inputs; && stops at the
        // first failing call.
        ok = cuda_ok(cudaMalloc((void**)&cuda_a, bytes), "cudaMalloc(cuda_a)")
          && cuda_ok(cudaMalloc((void**)&cuda_b, bytes), "cudaMalloc(cuda_b)")
          && cuda_ok(cudaMalloc((void**)&cuda_out, bytes), "cudaMalloc(cuda_out)")
          && cuda_ok(cudaMemcpy(cuda_a, a, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(a -> cuda_a)")
          && cuda_ok(cudaMemcpy(cuda_b, b, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(b -> cuda_b)");
    }

    if (ok) {
        vector_add_cuda<<<1,1>>>(cuda_out, cuda_a, cuda_b, VECTOR_LENGTH);
        // The launch itself returns nothing: configuration errors come from
        // cudaGetLastError(), execution errors from the following sync.
        ok = cuda_ok(cudaGetLastError(), "vector_add_cuda launch")
          && cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize")
          && cuda_ok(cudaMemcpy(out, cuda_out, bytes, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(cuda_out -> out)");
    }

    if (ok) {
        // CPU reference used to validate the GPU result.
        vector_add(cpu_out, a, b, VECTOR_LENGTH);

        const float tolerance = 1e-5f;
        int mismatches = 0;
        for (int i = 0; i < VECTOR_LENGTH; i++) {
            std::cout << out[i] << std::endl;
            float diff = out[i] - cpu_out[i];
            if (diff < 0.0f) {
                diff = -diff;
            }
            // Negated comparison so that a NaN result also counts as a mismatch.
            if (!(diff <= tolerance)) {
                mismatches++;
            }
        }
        if (mismatches != 0) {
            std::cerr << mismatches << " element(s) differ from the CPU reference"
                      << std::endl;
            ok = false;
        }
    }

    // Release everything; cudaFree and free both accept null pointers.
    cudaFree(cuda_a);
    cudaFree(cuda_b);
    cudaFree(cuda_out);
    free(a);
    free(b);
    free(out);
    free(cpu_out);
    return ok ? 0 : 1;
}

In [None]:
%%cu
#include <stdio.h>

#define VECTOR_LENGTH 1000

// Dot product of the first n elements of a and b, stored in *out.
// The kernel does not use thread or block indices: every launched thread
// accumulates the full sum in ascending index order and writes it to the
// same location. For n <= 0 the stored value is 0.
__global__ void dot_product(float *a, float *b, float *out, int n) {
    float acc = 0;
    int idx = 0;
    while (idx < n) {
        acc = a[idx] * b[idx] + acc;
        ++idx;
    }
    out[0] = acc;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true on cudaSuccess so several calls can be chained with &&.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Computes the dot product of two VECTOR_LENGTH-element vectors on the GPU
// and prints the result. Returns 0 on success, 1 if an allocation or a CUDA
// call failed (nothing is printed to stdout in that case). All buffers are
// released on every path.
int main() {
    const size_t vec_bytes = sizeof(float) * VECTOR_LENGTH;

    // Host buffers: the two inputs and a single float for the result.
    float *a = (float*)malloc(vec_bytes);
    float *b = (float*)malloc(vec_bytes);
    float *out = (float*)malloc(sizeof(float));
    // Device buffers start as null so the cleanup at the end is safe even
    // when an earlier step failed.
    float *cuda_a = nullptr;
    float *cuda_b = nullptr;
    float *cuda_out = nullptr;

    bool ok = a && b && out;
    if (!ok) {
        fprintf(stderr, "host malloc failed\n");
    }

    if (ok) {
        for (int i = 0; i < VECTOR_LENGTH; i++) {
            a[i] = 3.0f;
            b[i] = 0.4f;
        }

        // Allocate device memory and upload the inputs; && stops at the
        // first failing call.
        ok = cuda_ok(cudaMalloc((void**)&cuda_a, vec_bytes), "cudaMalloc(cuda_a)")
          && cuda_ok(cudaMalloc((void**)&cuda_b, vec_bytes), "cudaMalloc(cuda_b)")
          && cuda_ok(cudaMalloc((void**)&cuda_out, sizeof(float)), "cudaMalloc(cuda_out)")
          && cuda_ok(cudaMemcpy(cuda_a, a, vec_bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(a -> cuda_a)")
          && cuda_ok(cudaMemcpy(cuda_b, b, vec_bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(b -> cuda_b)");
    }

    if (ok) {
        dot_product<<<1,1>>>(cuda_a, cuda_b, cuda_out, VECTOR_LENGTH);
        // Launch-configuration errors are only reported by cudaGetLastError().
        // The cudaMemcpy that follows blocks until the kernel has finished,
        // so it also surfaces any error raised while the kernel ran.
        ok = cuda_ok(cudaGetLastError(), "dot_product launch")
          && cuda_ok(cudaMemcpy(out, cuda_out, sizeof(float), cudaMemcpyDeviceToHost),
                     "cudaMemcpy(cuda_out -> out)");
    }

    if (ok) {
        // Only print once the value is known to have come from the device.
        printf("dot product result=%f", *out);
    }

    // Release everything; free and cudaFree both accept null pointers.
    free(a);
    free(b);
    free(out);
    cudaFree(cuda_a);
    cudaFree(cuda_b);
    cudaFree(cuda_out);
    return ok ? 0 : 1;
}