## Install NVCC

In [1]:
# Print the CUDA compiler version to confirm nvcc is available in this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
# Pinned to the commit this notebook was originally run against (see the
# "Resolved ... to commit" line in the output). An unpinned git install tracks
# the repository's default branch, so a later upstream change could break the
# `%load_ext nvcc_plugin` cell on a fresh runtime.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install git+https://github.com/andreinechaev/nvcc4jupyter.git@aac710a35f52bb78ab34d2e52517237941399eff

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-d_2yeilr
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-d_2yeilr
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4287 sha256=3809b673cf701d0ebcdfa0d1dcb97c780354ee47cded917835900a620a385beb
  Stored in directory: /tmp/pip-ephem-wheel-cache-3cs8gzur/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collecte

In [3]:
# Load the IPython extension provided by the NVCCPlugin package installed above.
# NOTE(review): the cells visible in this notebook write .cu files with
# %%writefile and compile them with !nvcc directly; none of them appears to use
# a magic from this plugin, so this step may be optional — confirm before removing.
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


## Vector Addition

In [4]:
%%writefile vectoradd.cu
#include <iostream>
#include <cuda_runtime.h>
using namespace std;
// Element-wise sum of two integer vectors: C[idx] = A[idx] + B[idx].
// Every thread derives a single global index from its block and thread
// position and handles only that element; threads whose index is n or
// greater exit without touching memory.
__global__ void addVectors(int* A, int* B, int* C, int n) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= n) {
    return;  // surplus thread: nothing to compute
  }
  C[idx] = B[idx] + A[idx];
}


// Prompts on stdout and fills two integer vectors from stdin:
// first n values are stored into A, then the next n values into B.
// Nothing is read when n is zero or negative.
void takeinput(int *A,int *B,int n)
{
  cout << "Enter " << n << " elements for vector A :";
  int *dst = A;
  for (int remaining = n; remaining > 0; --remaining)
  {
    cin >> *dst++;
  }

  cout << "\nEnter " << n << " elements for vector B :";
  dst = B;
  for (int remaining = n; remaining > 0; --remaining)
  {
    cin >> *dst++;
  }
}


// Reports a failed CUDA runtime call on stderr, labelled with `what`.
// Returns true when `status` is cudaSuccess and false otherwise, so calls can
// be chained with && to stop at the first failure.
static bool cudaOk(cudaError_t status, const char* what) {
  if (status == cudaSuccess) {
    return true;
  }
  cerr << what << " failed: " << cudaGetErrorString(status) << endl;
  return false;
}

// Reads two n-element integer vectors from stdin, adds them on the GPU with
// addVectors, and prints the result.
// Returns 0 on success and 1 if any CUDA call fails or the input is malformed.
int main() {
  const int n = 10;
  const size_t size = n * sizeof(int);

  // Start from null so the cleanup at the bottom is safe on every path.
  int *A = nullptr, *B = nullptr, *C = nullptr;
  int *dev_A = nullptr, *dev_B = nullptr, *dev_C = nullptr;

  // Allocate every buffer before prompting, so the user is not asked for
  // input when the GPU cannot be used.
  bool ok = cudaOk(cudaMallocHost(&A, size), "cudaMallocHost(A)")
         && cudaOk(cudaMallocHost(&B, size), "cudaMallocHost(B)")
         && cudaOk(cudaMallocHost(&C, size), "cudaMallocHost(C)")
         && cudaOk(cudaMalloc(&dev_A, size), "cudaMalloc(dev_A)")
         && cudaOk(cudaMalloc(&dev_B, size), "cudaMalloc(dev_B)")
         && cudaOk(cudaMalloc(&dev_C, size), "cudaMalloc(dev_C)");

  // Initialize the vectors
  if (ok) {
    takeinput(A, B, n);
    if (!cin) {
      // A failed extraction leaves the remaining elements unset; do not
      // compute on them.
      cerr << "\nInvalid or incomplete input." << endl;
      ok = false;
    }
  }

  // Copy data from host to device
  ok = ok
    && cudaOk(cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice), "cudaMemcpy(A -> dev_A)")
    && cudaOk(cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice), "cudaMemcpy(B -> dev_B)");

  // Launch the kernel
  if (ok) {
    // Use fixed-size blocks and round the block count up, instead of one block
    // of n threads: a single block is capped by the device's per-block thread
    // limit. addVectors' bounds check discards the surplus threads.
    const int blockSize = 256;
    const int numBlocks = (n + blockSize - 1) / blockSize;
    addVectors<<<numBlocks, blockSize>>>(dev_A, dev_B, dev_C, n);
    // A kernel launch returns nothing; configuration errors are only visible
    // through cudaGetLastError().
    ok = cudaOk(cudaGetLastError(), "addVectors launch");
  }

  // Copy data from device to host (this blocking copy also reports errors
  // raised while the kernel was executing).
  ok = ok && cudaOk(cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_C -> C)");

  // Print the results
  if (ok) {
    cout<<"\nVector Addition Result  :";
    for (int i = 0; i < n; i++) {
      cout << C[i] << " ";
    }
    cout << endl;
  }

  // Free memory (cudaFree on a null pointer performs no operation).
  cudaFree(dev_A);
  cudaFree(dev_B);
  cudaFree(dev_C);
  if (A) cudaFreeHost(A);
  if (B) cudaFreeHost(B);
  if (C) cudaFreeHost(C);
  return ok ? 0 : 1;
}


Writing vectoradd.cu


In [5]:
# Compile vectoradd.cu (written by the %%writefile cell above) into ./vectoradd.
!nvcc vectoradd.cu -o vectoradd

In [6]:
# Run the compiled program; it prompts for both input vectors on stdin.
!./vectoradd

Enter 10 elements for vector A :1 2 3 4 5 6 7 8 9 0

Enter 10 elements for vector B :0 9 8 7 6 5 4 3 2 1

Vector Addition Result  :1 11 11 11 11 11 11 11 11 1 


## Matrix Multiplication

In [7]:
%%writefile "matrixmult.cu"
#include <cuda_runtime.h>
#include <iostream>
using namespace std;

// Integer matrix product C = A * B for N x N matrices stored as flat arrays
// indexed [row * N + col].
// Each thread computes one output element, chosen from its 2-D block/thread
// position (y -> row, x -> column); threads that land outside the N x N
// range return without doing any work.
__global__ void matmul(int* A, int* B, int* C, int N) {
  const int r = blockIdx.y * blockDim.y + threadIdx.y;
  const int c = blockIdx.x * blockDim.x + threadIdx.x;
  if (r >= N || c >= N) {
    return;
  }

  // Dot product of row r of A with column c of B.
  const int rowStart = r * N;
  int acc = 0;
  for (int k = 0; k < N; ++k) {
    acc += A[rowStart + k] * B[k * N + c];
  }
  C[rowStart + c] = acc;
}

// Prompts on stdout and reads two N x N integer matrices from stdin as flat
// sequences: the first N*N values go into A, the next N*N values into B.
void take_input(int *A,int *B,int N)
{
  const int total = N * N;

  cout << "Enter " << total << " elements in matrix A :";
  int *dst = A;
  for (int remaining = total; remaining > 0; --remaining)
  {
    cin >> *dst++;
  }

  cout << "Enter " << total << " elements in matrix B :";
  dst = B;
  for (int remaining = total; remaining > 0; --remaining)
  {
    cin >> *dst++;
  }
}

// Reports a failed CUDA runtime call on stderr, labelled with `what`.
// Returns true when `status` is cudaSuccess and false otherwise, so calls can
// be chained with && to stop at the first failure.
static bool cudaOk(cudaError_t status, const char* what) {
  if (status == cudaSuccess) {
    return true;
  }
  cerr << what << " failed: " << cudaGetErrorString(status) << endl;
  return false;
}

// Reads two N x N integer matrices from stdin, multiplies them on the GPU with
// matmul, and prints the product one row per line.
// Returns 0 on success and 1 if any CUDA call fails or the input is malformed.
int main() 
{
  const int N = 2;
  const size_t size = static_cast<size_t>(N) * N * sizeof(int);

  // Start from null so the cleanup at the bottom is safe on every path.
  int *A = nullptr, *B = nullptr, *C = nullptr;
  int *dev_A = nullptr, *dev_B = nullptr, *dev_C = nullptr;

  bool ok = cudaOk(cudaMallocHost(&A, size), "cudaMallocHost(A)")
         && cudaOk(cudaMallocHost(&B, size), "cudaMallocHost(B)")
         && cudaOk(cudaMallocHost(&C, size), "cudaMallocHost(C)")
         && cudaOk(cudaMalloc(&dev_A, size), "cudaMalloc(dev_A)")
         && cudaOk(cudaMalloc(&dev_B, size), "cudaMalloc(dev_B)")
         && cudaOk(cudaMalloc(&dev_C, size), "cudaMalloc(dev_C)");

  // Initialize matrices A and B
  if (ok) {
    take_input(A, B, N);
    if (!cin) {
      // A failed extraction leaves the remaining elements unset; do not
      // compute on them.
      cerr << "\nInvalid or incomplete input." << endl;
      ok = false;
    }
  }

  ok = ok
    && cudaOk(cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice), "cudaMemcpy(A -> dev_A)")
    && cudaOk(cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice), "cudaMemcpy(B -> dev_B)");

  if (ok) {
    dim3 dimBlock(2,2);
    // Round the grid up rather than truncating: plain N / dimBlock would give a
    // zero-sized grid when N is smaller than the block, and would leave the
    // last partial tile uncomputed when N is not a multiple of it. matmul's
    // bounds check discards the surplus threads.
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);

    matmul<<<dimGrid,dimBlock>>>(dev_A, dev_B, dev_C, N);
    // A kernel launch returns nothing; configuration errors are only visible
    // through cudaGetLastError().
    ok = cudaOk(cudaGetLastError(), "matmul launch");
  }

  // This blocking copy also reports errors raised while the kernel was executing.
  ok = ok && cudaOk(cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_C -> C)");

  // Print the result
  if (ok) {
    cout<<"Result of matrix multiplication :\n";

    for (int i = 0; i < N; i++)
    {
      for (int j = 0; j < N; j++)
      {
        cout << C[i*N+j] << " ";
      }
      cout << "\n";
    }
  }

  // Free memory (cudaFree on a null pointer performs no operation).
  cudaFree(dev_A);
  cudaFree(dev_B);
  cudaFree(dev_C);
  if (A) cudaFreeHost(A);
  if (B) cudaFreeHost(B);
  if (C) cudaFreeHost(C);
  return ok ? 0 : 1;
}


Writing matrixmult.cu


In [8]:
# Compile matrixmult.cu (written by the %%writefile cell above) into ./matrixmult.
!nvcc matrixmult.cu -o matrixmult

In [9]:
# Run the compiled program; it prompts for both input matrices on stdin.
!./matrixmult

Enter 4 elements in matrix A :1 2 3 4
Enter 4 elements in matrix B :4 3 2 1
Result of matrix multiplication :
8 5 
20 13 
