In [None]:
!nvidia-smi

In [None]:
!nvcc --version

In [None]:
%%writefile vector_addition.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>


// Element-wise integer addition kernel: hc[i] = ha[i] + hb[i] for i in [0, size).
//
// Each thread handles at most one element, chosen by its flat 1D global index
// (block index * block size + thread index along x). Threads whose index is at
// or beyond `size` exit without touching memory, so the grid may be rounded up
// past `size`.
//
// NOTE: despite the "h" prefix, all three pointers are dereferenced in device
// code; the host caller in this file passes cudaMalloc'd buffers.
__global__ void gpu_vector_add(int *ha, int *hb, int *hc, int size) {
  const int tid = threadIdx.x + blockDim.x * blockIdx.x;

  // Guard the tail of the grid.
  if (tid >= size) {
    return;
  }

  const int lhs = ha[tid];
  const int rhs = hb[tid];
  hc[tid] = lhs + rhs;
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise; `what` names the
// operation in the diagnostic message.
static bool cuda_ok(cudaError_t status, const char *what) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "vector_add: %s failed: %s\n", what,
          cudaGetErrorString(status));
  return false;
}

// Adds two integer vectors of `size` elements on the GPU and prints one
// "a + b = c" line per element to stdout.
//
// Both inputs are filled with the element index, so each printed sum is 2 * i.
//
// Parameters:
//   size - number of elements; must be positive.
//
// Error handling: the function returns void, so failures (invalid size, host
// or device allocation failure, copy failure, launch failure) are reported on
// stderr, the result printout is skipped, and every buffer that was allocated
// is still released before returning.
void vector_add(int size) {
  // A non-positive size would produce a zero-block grid (an invalid launch)
  // or, if negative, a huge byte count once converted to size_t.
  if (size <= 0) {
    fprintf(stderr, "vector_add: size must be positive (got %d)\n", size);
    return;
  }

  const size_t bytes = (size_t)size * sizeof(int);

  // Host buffers: two inputs and one output.
  int *ha = (int*)malloc(bytes);
  int *hb = (int*)malloc(bytes);
  int *hc = (int*)malloc(bytes);

  // Device buffers start out null so the cleanup at the end is valid on every
  // path, including early failures.
  int *d_ha = NULL;
  int *d_hb = NULL;
  int *d_hc = NULL;

  bool ok = (ha != NULL) && (hb != NULL) && (hc != NULL);
  if (!ok) {
    fprintf(stderr, "vector_add: host allocation of %zu bytes failed\n", bytes);
  }

  if (ok) {
    // Initialise the inputs with arbitrary (index-valued) data.
    for (int i = 0; i < size; i++) {
      ha[i] = i;
      hb[i] = i;
    }
  }

  // Allocate GPU memory. The && chain stops at the first failure.
  ok = ok
      && cuda_ok(cudaMalloc((void**)&d_ha, bytes), "cudaMalloc(d_ha)")
      && cuda_ok(cudaMalloc((void**)&d_hb, bytes), "cudaMalloc(d_hb)")
      && cuda_ok(cudaMalloc((void**)&d_hc, bytes), "cudaMalloc(d_hc)");

  // Copy the inputs into GPU memory.
  ok = ok
      && cuda_ok(cudaMemcpy(d_ha, ha, bytes, cudaMemcpyHostToDevice),
                 "cudaMemcpy(ha -> d_ha)")
      && cuda_ok(cudaMemcpy(d_hb, hb, bytes, cudaMemcpyHostToDevice),
                 "cudaMemcpy(hb -> d_hb)");

  if (ok) {
    const int threadblock = 256;
    // Ceil-division written so it cannot overflow int for large sizes
    // (size > 0 is guaranteed by the check at the top).
    const int blockgrid = (size - 1) / threadblock + 1;

    // Launch the kernel; launches do not return a status, so query it.
    gpu_vector_add<<<blockgrid, threadblock>>>(d_ha, d_hb, d_hc, size);
    ok = cuda_ok(cudaGetLastError(), "gpu_vector_add launch");
  }

  // Copy the result back to the CPU. This blocking copy also surfaces any
  // error raised while the kernel was executing.
  ok = ok
      && cuda_ok(cudaMemcpy(hc, d_hc, bytes, cudaMemcpyDeviceToHost),
                 "cudaMemcpy(d_hc -> hc)");

  if (ok) {
    // Print the results.
    for (int i = 0; i < size; i++) {
      printf("%d + %d = %d\n", ha[i], hb[i], hc[i]);
    }
  }

  // Release memory. Both cudaFree and free accept null pointers.
  cudaFree(d_ha);
  cudaFree(d_hb);
  cudaFree(d_hc);
  free(ha);
  free(hb);
  free(hc);

}


// Program entry point: runs the vector-addition demo once for each problem
// size in the table below, printing a "size = N" header before each run.
int main() {
  const int size[] = {1024, 2048, 4096};
  // Derive the trip count from the table so the two cannot drift apart.
  const int count = (int)(sizeof(size) / sizeof(size[0]));

  for (int k = 0; k < count; ++k) {
    const int n = size[k];
    printf("size = %d\n", n);
    vector_add(n);
  }

  return 0;
}




In [None]:
%%bash
nvcc -o out vector_addition.cu -arch=sm_75
./out