In [16]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [17]:
!pip install git+https://github.com/afnan47/cuda.git

Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-x4d003i5
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-x4d003i5
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done


In [18]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [19]:
%%cu
#include "stdio.h"
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>

// Number of elements in each vector. Used both for the host array sizes
// and as the upper bound checked inside the kernel.
#define N 5

// Element-wise vector addition kernel: d_c[i] = d_a[i] + d_b[i] for i in [0, N).
//
// Launch layout: any 1D grid of 1D blocks that supplies at least N threads in
// total. Each thread handles at most one element; threads whose global index
// is >= N do nothing. No shared memory is used.
//
// d_a, d_b : input vectors, each holding at least N ints, readable from device code
// d_c      : output vector with room for at least N ints, writable from device code
__global__ void gpuAdd(int *d_a, int *d_b, int *d_c)
{
    // Flat global thread index. With one thread per block (blockDim.x == 1,
    // threadIdx.x == 0) this reduces to blockIdx.x, so a <<<N, 1>>> launch
    // behaves exactly as before, while multi-thread blocks now also map
    // every thread to a distinct element.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the tail: the grid may contain more threads than elements.
    if (tid < N)
        d_c[tid] = d_a[tid] + d_b[tid];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true only when `status` is cudaSuccess.
static bool cudaCallOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Prints `heading` on its own line, then the N values space-separated on the next line.
static void printVector(const char *heading, const int *values)
{
    printf("%s\n", heading);
    for (int i = 0; i < N; i++) {
        printf("%d ", values[i]);
    }
    printf("\n");
}

// Main program: fills two host vectors, adds them on the GPU with gpuAdd and
// prints the inputs and the result. Returns 0 on success and 1 if any CUDA
// runtime call or the kernel launch fails.
int main(void)
{
    // Host arrays
    int h_a[N], h_b[N], h_c[N];

    // Device pointers start out null so the cleanup below is safe even when
    // an allocation fails part-way through (cudaFree of a null pointer is a no-op).
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;

    const size_t bytes = N * sizeof(int);

    // Initialize the input vectors
    for (int i = 0; i < N; i++) {
        h_a[i] = 2*i*i;
        h_b[i] = i;
    }

    // Allocate device memory and copy the inputs over. The && chain stops at
    // the first failure, so later calls never run on invalid pointers.
    bool ok = cudaCallOk(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)")
           && cudaCallOk(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)")
           && cudaCallOk(cudaMalloc((void**)&d_c, bytes), "cudaMalloc(d_c)")
           && cudaCallOk(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a <- h_a)")
           && cudaCallOk(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b <- h_b)");

    if (ok) {
        // Launch with N blocks of one thread each, passing device pointers.
        gpuAdd<<<N, 1>>>(d_a, d_b, d_c);

        // A launch does not return an error itself: bad launch configurations
        // show up via cudaGetLastError(), and faults during execution surface
        // at the next synchronizing call, which here is the blocking copy back.
        ok = cudaCallOk(cudaGetLastError(), "gpuAdd launch")
          && cudaCallOk(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c <- d_c)");
    }

    printVector("Vector 1: ", h_a);
    printVector("Vector 2: ", h_b);

    // Only print the result when it was actually computed and copied back;
    // otherwise h_c is uninitialized.
    if (ok)
        printVector("Vector addition on GPU ", h_c);

    // Free device memory on every path
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return ok ? 0 : 1;
}

Vector 1: 
0 2 8 18 32 
Vector 2: 
0 1 2 3 4 
Vector addition on GPU 
0 3 10 21 36 



In [19]:
%%cu
#include "stdio.h"
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Element-wise addition kernel: c[i] = a[i] + b[i], where i is the thread's
// flat global index in a 1D launch.
//
// There is no bounds check, so the launch must not supply more threads in
// total than the arrays have elements.
//
// a, b : input arrays readable from device code
// c    : output array writable from device code
__global__ void add(int* a, int* b, int* c) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    c[i] = a[i] + b[i];
}

// Managed (unified-memory) arrays: main() writes and reads them directly on
// the host and also passes them to the add kernel.
__managed__ int a[5], b[5], c[5];

// Fills the managed arrays a and b, adds them on the GPU with the add kernel
// and prints each element of c on its own line. Returns 0 on success and 1 if
// the kernel launch or its execution fails.
int main()
{
    const int arraySize = 5;

    // Assign values to the managed arrays directly from the host:
    // a = {1, 2, 3, 4, 5}, b = {10, 20, 30, 40, 50}.
    for (int i = 0; i < arraySize; i++) {
        a[i] = i + 1;
        b[i] = 10 * (i + 1);
    }

    // One block with one thread per element; the kernel has no bounds check,
    // so the thread count must match the array size exactly.
    add<<<1, arraySize>>>(a, b, c);

    // Launch-configuration errors are reported by cudaGetLastError(); faults
    // during execution are reported by the synchronize. The synchronize is
    // also what makes it safe for the host to read c below.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    for (int i = 0; i < arraySize; i++) {
        printf("%d\n", c[i]);
    }
    return 0;
}