In [44]:
%%writefile vector_add.cu

#include<stdio.h>
#include <stdlib.h>

// This function is executed on the GPU;
// all pointer arguments must refer to device memory.

// Element-wise vector addition: out[i] = a[i] + b[i] for every 0 <= i < n.
//
// a, b   : input vectors, each holding at least n floats.
// out    : output vector with room for at least n floats.
// n      : number of elements to process; n <= 0 does nothing.
//
// Valid for any 1-D launch configuration. Each thread starts at its global
// index and advances by the total number of launched threads, so the work is
// shared instead of being repeated by every thread. Launched as <<<1,1>>> the
// stride is 1 and a single thread walks the whole array, exactly as before.
__global__ void add(float *a, float *b, float *out, int n) {
    // 64-bit arithmetic keeps the index from wrapping when i + stride exceeds INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long first = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for (long long i = first; i < n; i += stride) {
        out[i] = a[i] + b[i];
    }
}

// Stops the program with a readable message when a CUDA runtime call fails.
// `what` names the operation so the failing step can be identified.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host driver: fills two vectors (a[i] = i, b[i] = 3*i), adds them with the
// `add` kernel using a single block of a single thread, and prints the result
// ten values per line. Returns 0 on success, EXIT_FAILURE on host allocation
// failure; CUDA failures terminate the process through checkCuda.
int main() {
  const int N = 100;
  const size_t bytes = N * sizeof(float);

  float *a = (float*) malloc(bytes);
  float *b = (float*) malloc(bytes);
  float *out = (float*) malloc(bytes);
  if (a == NULL || b == NULL || out == NULL) {
    fprintf(stderr, "Host memory allocation failed\n");
    free(a);  // free(NULL) is a no-op, so partial failures are handled too
    free(b);
    free(out);
    return EXIT_FAILURE;
  }
  for (int i=0; i < N; i++) {
    a[i] = i;
    b[i] = 3*i;
  }

  // Allocate device buffers and upload the inputs.
  float *d_a = NULL, *d_b = NULL, *d_out = NULL;
  checkCuda(cudaMalloc((void**) &d_a, bytes), "cudaMalloc(d_a)");
  checkCuda(cudaMalloc((void**) &d_b, bytes), "cudaMalloc(d_b)");
  checkCuda(cudaMalloc((void**) &d_out, bytes), "cudaMalloc(d_out)");

  checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
  checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

  // Execute the function on the GPU.
  add<<<1,1>>>(d_a, d_b, d_out, N);
  // A launch returns no status itself; configuration errors show up here.
  checkCuda(cudaGetLastError(), "add kernel launch");

  // Copy the result back to the CPU. This blocking copy also waits for the
  // kernel and reports any error raised while it was running.
  checkCuda(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "copy out to host");

  for (int i=0; i < N; i++) {
    if (i%10 == 0) {
      printf("\n");
    }
    printf("%3.2f ", out[i]);
  }

  // Release device and host memory.
  checkCuda(cudaFree(d_a), "cudaFree(d_a)");
  checkCuda(cudaFree(d_b), "cudaFree(d_b)");
  checkCuda(cudaFree(d_out), "cudaFree(d_out)");
  free(a);
  free(b);
  free(out);
  return 0;
}

Overwriting vector_add.cu


In [45]:
!nvcc vector_add.cu -o vector_add_cu
!./vector_add_cu


0.00 4.00 8.00 12.00 16.00 20.00 24.00 28.00 32.00 36.00 
40.00 44.00 48.00 52.00 56.00 60.00 64.00 68.00 72.00 76.00 
80.00 84.00 88.00 92.00 96.00 100.00 104.00 108.00 112.00 116.00 
120.00 124.00 128.00 132.00 136.00 140.00 144.00 148.00 152.00 156.00 
160.00 164.00 168.00 172.00 176.00 180.00 184.00 188.00 192.00 196.00 
200.00 204.00 208.00 212.00 216.00 220.00 224.00 228.00 232.00 236.00 
240.00 244.00 248.00 252.00 256.00 260.00 264.00 268.00 272.00 276.00 
280.00 284.00 288.00 292.00 296.00 300.00 304.00 308.00 312.00 316.00 
320.00 324.00 328.00 332.00 336.00 340.00 344.00 348.00 352.00 356.00 
360.00 364.00 368.00 372.00 376.00 380.00 384.00 388.00 392.00 396.00 

### Execute with 1 block and 5 threads

In [47]:
%%writefile vector_add.cu

#include<stdio.h>
#include <stdlib.h>

// This function is executed on the GPU;
// all pointer arguments must refer to device memory.

// Element-wise vector addition split into contiguous per-thread chunks:
// out[i] = a[i] + b[i] for every 0 <= i < n.
//
// a, b    : input vectors, each holding at least n floats.
// out     : output vector with room for at least n floats.
// n       : number of elements to process.
// threads : number of threads sharing the work; expected to equal the block
//           size used at launch (single block).
//
// Thread t handles the index range [t*size, min((t+1)*size, n)), where size is
// n/threads rounded UP. Rounding up (instead of truncating) ensures the last
// n % threads elements are still processed when n is not a multiple of
// `threads`; clamping the end to n keeps the final chunk inside the arrays.
__global__ void add(float *a, float *b, float *out, int n, int threads) {
    // Nothing to do, and avoids a division by zero below.
    if (threads <= 0 || n <= 0) {
        return;
    }

    // Ceiling division written without n + threads - 1 so it cannot overflow.
    const int size = n / threads + (n % threads != 0 ? 1 : 0);

    // 64-bit bounds: threadIdx.x * size may exceed INT_MAX for extreme inputs.
    const long long start = (long long)threadIdx.x * size;
    long long end = start + size;
    if (end > n) {
        end = n;  // last chunk may be shorter; surplus threads get an empty range
    }

    for (long long i = start; i < end; i++) {
        out[i] = a[i] + b[i];
    }
}

// Stops the program with a readable message when a CUDA runtime call fails.
// `what` names the operation so the failing step can be identified.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host driver: fills two vectors (a[i] = i, b[i] = 3*i), adds them with the
// `add` kernel using one block of `threads` threads, and prints the result
// ten values per line. Returns 0 on success, EXIT_FAILURE on host allocation
// failure; CUDA failures terminate the process through checkCuda.
int main() {
  const int N = 100;
  const int threads = 5;
  const size_t bytes = N * sizeof(float);

  float *a = (float*) malloc(bytes);
  float *b = (float*) malloc(bytes);
  float *out = (float*) malloc(bytes);
  if (a == NULL || b == NULL || out == NULL) {
    fprintf(stderr, "Host memory allocation failed\n");
    free(a);  // free(NULL) is a no-op, so partial failures are handled too
    free(b);
    free(out);
    return EXIT_FAILURE;
  }
  for (int i=0; i < N; i++) {
    a[i] = i;
    b[i] = 3*i;
  }

  // Allocate device buffers and upload the inputs.
  float *d_a = NULL, *d_b = NULL, *d_out = NULL;
  checkCuda(cudaMalloc((void**) &d_a, bytes), "cudaMalloc(d_a)");
  checkCuda(cudaMalloc((void**) &d_b, bytes), "cudaMalloc(d_b)");
  checkCuda(cudaMalloc((void**) &d_out, bytes), "cudaMalloc(d_out)");

  checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
  checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

  // Execute the function on the GPU.
  add<<<1,threads>>>(d_a, d_b, d_out, N, threads);
  // A launch returns no status itself; configuration errors show up here.
  checkCuda(cudaGetLastError(), "add kernel launch");

  // Copy the result back to the CPU. This blocking copy also waits for the
  // kernel and reports any error raised while it was running.
  checkCuda(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "copy out to host");

  for (int i=0; i < N; i++) {
    if (i%10 == 0) {
      printf("\n");
    }
    printf("%.2f ", out[i]);
  }

  // Release device and host memory.
  checkCuda(cudaFree(d_a), "cudaFree(d_a)");
  checkCuda(cudaFree(d_b), "cudaFree(d_b)");
  checkCuda(cudaFree(d_out), "cudaFree(d_out)");
  free(a);
  free(b);
  free(out);
  return 0;
}

Overwriting vector_add.cu


In [48]:
!nvcc vector_add.cu -o vector_add_cu_1block_5threads
!./vector_add_cu_1block_5threads


0.00 4.00 8.00 12.00 16.00 20.00 24.00 28.00 32.00 36.00 
40.00 44.00 48.00 52.00 56.00 60.00 64.00 68.00 72.00 76.00 
80.00 84.00 88.00 92.00 96.00 100.00 104.00 108.00 112.00 116.00 
120.00 124.00 128.00 132.00 136.00 140.00 144.00 148.00 152.00 156.00 
160.00 164.00 168.00 172.00 176.00 180.00 184.00 188.00 192.00 196.00 
200.00 204.00 208.00 212.00 216.00 220.00 224.00 228.00 232.00 236.00 
240.00 244.00 248.00 252.00 256.00 260.00 264.00 268.00 272.00 276.00 
280.00 284.00 288.00 292.00 296.00 300.00 304.00 308.00 312.00 316.00 
320.00 324.00 328.00 332.00 336.00 340.00 344.00 348.00 352.00 356.00 
360.00 364.00 368.00 372.00 376.00 380.00 384.00 388.00 392.00 396.00 

#### Vector addition with multiple blocks and threads

In [89]:
%%writefile vector_add.cu

#include<stdio.h>
#include <stdlib.h>

// This function is executed on the GPU;
// all pointer arguments must refer to device memory.

// Element-wise vector addition split into contiguous chunks across all
// threads of all blocks: out[i] = a[i] + b[i] for every 0 <= i < n.
//
// a, b    : input vectors, each holding at least n floats.
// out     : output vector with room for at least n floats.
// n       : number of elements to process.
// threads : threads per block; expected to equal the launch block size.
// blocks  : number of blocks; expected to equal the launch grid size.
//
// Worker w = blockIdx.x * threads + threadIdx.x handles the index range
// [w*size, min((w+1)*size, n)), where size is n/(blocks*threads) rounded UP.
// Rounding up fixes two problems of truncating division: the tail elements
// are no longer dropped when n is not a multiple of blocks*threads, and the
// chunk size is no longer 0 (i.e. no work at all) when n < blocks*threads.
__global__ void add(float *a, float *b, float *out, int n, int threads, int blocks) {
    // Nothing to do, and avoids a division by zero below.
    if (threads <= 0 || blocks <= 0 || n <= 0) {
        return;
    }

    // 64-bit arithmetic: blocks * threads and worker * size can exceed INT_MAX.
    const long long workers = (long long)blocks * threads;
    const long long size = n / workers + (n % workers != 0 ? 1 : 0);

    const long long worker = (long long)blockIdx.x * threads + threadIdx.x;
    const long long start = worker * size;
    long long end = start + size;
    if (end > n) {
        end = n;  // last chunk may be shorter; surplus workers get an empty range
    }

    for (long long i = start; i < end; i++) {
        out[i] = a[i] + b[i];
    }
}

// Stops the program with a readable message when a CUDA runtime call fails.
// `what` names the operation so the failing step can be identified.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host driver: fills two vectors (a[i] = i, b[i] = 3*i), adds them with the
// `add` kernel using `blocks` blocks of `threads` threads each, and prints the
// result ten values per line. Returns 0 on success, EXIT_FAILURE on host
// allocation failure; CUDA failures terminate the process through checkCuda.
int main() {
  const int N = 1000;
  const int threads = 50;
  const int blocks = 20;
  const size_t bytes = N * sizeof(float);

  float *a = (float*) malloc(bytes);
  float *b = (float*) malloc(bytes);
  float *out = (float*) malloc(bytes);
  if (a == NULL || b == NULL || out == NULL) {
    fprintf(stderr, "Host memory allocation failed\n");
    free(a);  // free(NULL) is a no-op, so partial failures are handled too
    free(b);
    free(out);
    return EXIT_FAILURE;
  }
  for (int i=0; i < N; i++) {
    a[i] = i;
    b[i] = 3*i;
  }

  // Allocate device buffers and upload the inputs.
  float *d_a = NULL, *d_b = NULL, *d_out = NULL;
  checkCuda(cudaMalloc((void**) &d_a, bytes), "cudaMalloc(d_a)");
  checkCuda(cudaMalloc((void**) &d_b, bytes), "cudaMalloc(d_b)");
  checkCuda(cudaMalloc((void**) &d_out, bytes), "cudaMalloc(d_out)");

  checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
  checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

  // Execute the function on the GPU.
  add<<<blocks,threads>>>(d_a, d_b, d_out, N, threads, blocks);
  // A launch returns no status itself; configuration errors show up here.
  checkCuda(cudaGetLastError(), "add kernel launch");

  // Copy the result back to the CPU. This blocking copy also waits for the
  // kernel and reports any error raised while it was running.
  checkCuda(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "copy out to host");

  for (int i=0; i < N; i++) {
    if (i%10 == 0) {
      printf("\n");
    }
    printf("%.2f ", out[i]);
  }

  // Release device and host memory.
  checkCuda(cudaFree(d_a), "cudaFree(d_a)");
  checkCuda(cudaFree(d_b), "cudaFree(d_b)");
  checkCuda(cudaFree(d_out), "cudaFree(d_out)");
  free(a);
  free(b);
  free(out);
  return 0;
}

Overwriting vector_add.cu


In [91]:
!nvcc vector_add.cu -o vector_add_cu_20block_50threads
!./vector_add_cu_20block_50threads


0.00 4.00 8.00 12.00 16.00 20.00 24.00 28.00 32.00 36.00 
40.00 44.00 48.00 52.00 56.00 60.00 64.00 68.00 72.00 76.00 
80.00 84.00 88.00 92.00 96.00 100.00 104.00 108.00 112.00 116.00 
120.00 124.00 128.00 132.00 136.00 140.00 144.00 148.00 152.00 156.00 
160.00 164.00 168.00 172.00 176.00 180.00 184.00 188.00 192.00 196.00 
200.00 204.00 208.00 212.00 216.00 220.00 224.00 228.00 232.00 236.00 
240.00 244.00 248.00 252.00 256.00 260.00 264.00 268.00 272.00 276.00 
280.00 284.00 288.00 292.00 296.00 300.00 304.00 308.00 312.00 316.00 
320.00 324.00 328.00 332.00 336.00 340.00 344.00 348.00 352.00 356.00 
360.00 364.00 368.00 372.00 376.00 380.00 384.00 388.00 392.00 396.00 
400.00 404.00 408.00 412.00 416.00 420.00 424.00 428.00 432.00 436.00 
440.00 444.00 448.00 452.00 456.00 460.00 464.00 468.00 472.00 476.00 
480.00 484.00 488.00 492.00 496.00 500.00 504.00 508.00 512.00 516.00 
520.00 524.00 528.00 532.00 536.00 540.00 544.00 548.00 552.00 556.00 
560.00 564.00 568.00 572.00 576.0

In [92]:
!nvprof ./vector_add_cu_20block_50threads

==18808== NVPROF is profiling process 18808, command: ./vector_add_cu_20block_50threads

0.00 4.00 8.00 12.00 16.00 20.00 24.00 28.00 32.00 36.00 
40.00 44.00 48.00 52.00 56.00 60.00 64.00 68.00 72.00 76.00 
80.00 84.00 88.00 92.00 96.00 100.00 104.00 108.00 112.00 116.00 
120.00 124.00 128.00 132.00 136.00 140.00 144.00 148.00 152.00 156.00 
160.00 164.00 168.00 172.00 176.00 180.00 184.00 188.00 192.00 196.00 
200.00 204.00 208.00 212.00 216.00 220.00 224.00 228.00 232.00 236.00 
240.00 244.00 248.00 252.00 256.00 260.00 264.00 268.00 272.00 276.00 
280.00 284.00 288.00 292.00 296.00 300.00 304.00 308.00 312.00 316.00 
320.00 324.00 328.00 332.00 336.00 340.00 344.00 348.00 352.00 356.00 
360.00 364.00 368.00 372.00 376.00 380.00 384.00 388.00 392.00 396.00 
400.00 404.00 408.00 412.00 416.00 420.00 424.00 428.00 432.00 436.00 
440.00 444.00 448.00 452.00 456.00 460.00 464.00 468.00 472.00 476.00 
480.00 484.00 488.00 492.00 496.00 500.00 504.00 508.00 512.00 516.00 
520.00 524.00 52