<a href="https://colab.research.google.com/github/svmner/Parallel-Programming/blob/main/CUDA_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [8]:
%%writefile practice1.cu

// 2D Matrix Multiplication

#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#define TILE_WIDTH 2
#define WIDTH 4

// Multiplies two WIDTH x WIDTH row-major int matrices: d_C = d_A * d_B.
// Each thread of the 2D launch produces a single element of d_C; threads whose
// (row, column) falls outside the matrix leave memory untouched.
__global__ void perform2DMatrixMult(int *d_A,int *d_B,int *d_C){
  const int outRow = blockIdx.y * blockDim.y + threadIdx.y;
  const int outCol = blockIdx.x * blockDim.x + threadIdx.x;

  // Nothing to do for threads that land past the matrix edge.
  if(outRow >= WIDTH || outCol >= WIDTH){
    return;
  }

  // Dot product of row `outRow` of A with column `outCol` of B.
  const int *rowOfA = d_A + outRow * WIDTH;
  int dot = 0;
  int k = 0;
  while(k < WIDTH){
    dot += rowOfA[k] * d_B[k*WIDTH + outCol];
    ++k;
  }

  d_C[outRow * WIDTH + outCol] = dot;
}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err,const char *what){
  if(err == cudaSuccess){
    return true;
  }
  fprintf(stderr,"CUDA error during %s: %s\n",what,cudaGetErrorString(err));
  return false;
}

// Reads WIDTH*WIDTH integers from stdin into `dst` (row-major order).
// Returns false as soon as one value cannot be parsed.
static bool readMatrix(int *dst){
  for(int i = 0; i < WIDTH*WIDTH; i++){
    if(scanf("%d",&dst[i]) != 1){
      return false;
    }
  }
  return true;
}

// Reads matrices A and B from stdin, multiplies them on the GPU and prints the
// product. Returns 0 on success and 1 on any allocation, input or CUDA failure.
int main(){
  const size_t bytes = sizeof(int)*WIDTH*WIDTH;
  int *d_A = NULL,*d_B = NULL,*d_C = NULL;

  int *A = (int*)malloc(bytes);
  int *B = (int*)malloc(bytes);
  int *C = (int*)malloc(bytes);

  bool ok = (A != NULL && B != NULL && C != NULL);
  if(!ok){
    fprintf(stderr,"Host allocation failed\n");
  }

  if(ok){
    printf("Enter 4x4 matrix A: \n");
    ok = readMatrix(A);
    if(ok){
      printf("Enter 4x4 matrix B: \n");
      ok = readMatrix(B);
    }
    if(!ok){
      fprintf(stderr,"Invalid matrix input\n");
    }
  }

  if(ok){
    ok = cudaOk(cudaMalloc((void**)&d_A,bytes),"cudaMalloc d_A")
      && cudaOk(cudaMalloc((void**)&d_B,bytes),"cudaMalloc d_B")
      && cudaOk(cudaMalloc((void**)&d_C,bytes),"cudaMalloc d_C")
      && cudaOk(cudaMemcpy(d_A,A,bytes,cudaMemcpyHostToDevice),"copy A to device")
      && cudaOk(cudaMemcpy(d_B,B,bytes,cudaMemcpyHostToDevice),"copy B to device");
  }

  if(ok){
    // Ceil-division keeps the whole matrix covered even if WIDTH is not a
    // multiple of TILE_WIDTH; the kernel discards the surplus threads.
    const unsigned int tiles = (WIDTH + TILE_WIDTH - 1)/TILE_WIDTH;
    dim3 dimGrid(tiles,tiles);
    dim3 dimBlock(TILE_WIDTH,TILE_WIDTH);

    perform2DMatrixMult<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);

    // A launch never returns an error itself: query it explicitly. The blocking
    // copy that follows reports any fault raised while the kernel was running.
    ok = cudaOk(cudaGetLastError(),"kernel launch")
      && cudaOk(cudaMemcpy(C,d_C,bytes,cudaMemcpyDeviceToHost),"copy C to host");
  }

  if(ok){
    printf("This is the resultant matrix: \n");
    for(int i =0;i<WIDTH;i++){
      for(int j = 0;j<WIDTH;j++){
        printf("%d\t",C[i*WIDTH + j]);
      }
      printf("\n");
    }
  }

  // Release everything on every path; freeing a null pointer is a no-op.
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  free(A);
  free(B);
  free(C);

  return ok ? 0 : 1;
}

Writing practice1.cu


In [None]:
!nvcc practice1.cu -o practice1


In [None]:
!./practice1

Enter 4x4 matrix A: 
1 2 3 4 
1 2 3 4 
1 2 3 4 
1 2 3 4 
Enter 4x4 matrix B: 
1 2 3 4 
1 2 3 4 
1 2 3 4
1 2 3 4 
This is the resultant matrix: 
10	20	30	40	
10	20	30	40	
10	20	30	40	
10	20	30	40	


In [None]:
%%writefile practice2.cu

// MERGE SORT

#include <stdio.h>
#include <cuda_runtime.h>

// CUDA kernel performing one bottom-up merge pass.
//
// Thread `idx` owns the span [idx*width, idx*width + width) of deviceArray,
// clamped to `size`. It merges the span's two halves [start, mid) and
// [mid, end) -- each expected to be sorted already -- into the same positions
// of tempArray. Threads whose span begins at or beyond `size` do nothing.
//
// deviceArray: input buffer of `size` ints.
// tempArray:   output buffer of `size` ints (must not alias deviceArray).
// size:        number of elements.
// width:       length of the merged span for this pass.
__global__ void mergeKernel(int *deviceArray, int *tempArray, int size, int width) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Compute the span start in 64 bits: the launch uses roughly `size` threads
    // and width grows towards `size`, so width * idx overflows a 32-bit int for
    // arrays of only a few tens of thousands of elements. A wrapped value could
    // land back inside the array and corrupt another thread's output.
    long long spanStart = (long long)width * idx;
    if (spanStart >= size) {
        return;
    }

    int start = (int)spanStart;
    int mid = (int)min(spanStart + width / 2, (long long)size);
    int end = (int)min(spanStart + width, (long long)size);

    int a = start, b = mid, i = start;

    // Standard two-way merge of the left and right halves.
    while (a < mid && b < end) {
        tempArray[i++] = (deviceArray[a] < deviceArray[b]) ? deviceArray[a++] : deviceArray[b++];
    }
    // Drain whichever half still has elements.
    while (a < mid) {
        tempArray[i++] = deviceArray[a++];
    }
    while (b < end) {
        tempArray[i++] = deviceArray[b++];
    }
}

// Sorts `array` (length `size`) in ascending order on the GPU using bottom-up
// merge passes of mergeKernel, then copies the result back into `array`.
//
// Arrays with fewer than two elements are returned untouched. On a CUDA
// failure a message is written to stderr and `array` is left unmodified or
// partially defined; no error code is returned because the function is void.
// Supports at most 2^30 elements (the merge width is carried in an int).
void mergeSort(int *array, int size) {
    // Nothing to sort; this also avoids launching a grid of zero blocks.
    if (array == NULL || size < 2) {
        return;
    }
    if (size > (1 << 30)) {
        fprintf(stderr, "mergeSort: array too large (max 2^30 elements)\n");
        return;
    }

    const size_t bytes = (size_t)size * sizeof(int);
    int *deviceArray = NULL, *tempArray = NULL;

    // Allocate both device buffers and upload the input.
    cudaError_t err = cudaMalloc(&deviceArray, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&tempArray, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(deviceArray, array, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        dim3 threadsPerBlock(1024);
        dim3 blocksPerGrid((size + threadsPerBlock.x - 1) / threadsPerBlock.x);

        // Ping-pong between the two buffers instead of copying the whole array
        // back after every pass; `src` always holds the most recent result.
        int *src = deviceArray, *dst = tempArray;

        // Keep doubling until a single span covers the whole array. Stopping at
        // `width <= size` would skip the final merge whenever size is not a
        // power of two, leaving two sorted runs that were never combined.
        for (int width = 2; ; width *= 2) {
            mergeKernel<<<blocksPerGrid, threadsPerBlock>>>(src, dst, size, width);

            int *swapTmp = src;
            src = dst;
            dst = swapTmp;

            if (width >= size) {
                break;
            }
        }

        // Launch-configuration errors only surface through cudaGetLastError().
        err = cudaGetLastError();

        // Blocking copy: waits for the kernels and reports any execution fault.
        if (err == cudaSuccess) err = cudaMemcpy(array, src, bytes, cudaMemcpyDeviceToHost);
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "mergeSort: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Free device memory (safe on null pointers).
    cudaFree(deviceArray);
    cudaFree(tempArray);
}

// Reads an element count and that many integers from stdin, sorts them with
// mergeSort and prints the result. Returns 1 on invalid input or allocation
// failure, 0 otherwise.
int main() {
    int *array;
    int size = 0;

    printf("Enter the number of elements: ");
    // Without this check a failed parse would leave `size` uninitialised, and a
    // negative count would turn into a huge allocation request.
    if (scanf("%d", &size) != 1 || size < 0) {
        fprintf(stderr, "Invalid number of elements\n");
        return 1;
    }

    array = (int*)malloc((size_t)size * sizeof(int));
    // malloc(0) may legitimately return NULL, so only treat NULL as failure
    // when elements were actually requested.
    if (array == NULL && size > 0) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    printf("Enter the elements of the array:\n");
    for (int i = 0; i < size; i++) {
        if (scanf("%d", &array[i]) != 1) {
            fprintf(stderr, "Invalid array element\n");
            free(array);
            return 1;
        }
    }

    mergeSort(array, size);

    // Print the sorted array
    printf("Sorted array:\n");
    for (int i = 0; i < size; i++) {
        printf("%d ", array[i]);
    }
    printf("\n");

    free(array);

    return 0;
}


Overwriting practice2.cu


In [None]:
!nvcc practice2.cu -o practice2

In [None]:
!./practice2

Enter the number of elements: 10
Enter the elements of the array:
1 4 2 3 7 6 9 4 18 12
Sorted array:
1 2 3 4 4 6 7 9 12 18 


In [9]:
%%writefile practice3.cu

// 2D Convolution

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>

#define TILE_WIDTH 2
#define WIDTH 4
#define MASK_WIDTH 3

// 2D convolution of a WIDTH x WIDTH row-major int image with a
// MASK_WIDTH x MASK_WIDTH mask. Each thread of the 2D launch computes one
// output element; input positions outside the image contribute zero.
//
// input:  WIDTH*WIDTH ints.
// mask:   MASK_WIDTH*MASK_WIDTH ints.
// output: WIDTH*WIDTH ints, written once per in-range thread.
__global__ void convolution(int* input, int* mask, int* output) {

    int row = threadIdx.y + blockDim.y * blockIdx.y;
    int col = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard the output write: a grid that overshoots the image (for example
    // when WIDTH is not a multiple of the block size) must not write past it.
    if (row >= WIDTH || col >= WIDTH) {
        return;
    }

    int sum = 0;

    for (int i = 0; i < MASK_WIDTH; i++) {
        for (int j = 0; j < MASK_WIDTH; j++) {
            // Centre the mask on (row, col).
            int inputRow = row + i - MASK_WIDTH / 2;
            int inputCol = col + j - MASK_WIDTH / 2;

            // Skip taps that fall outside the image (implicit zero padding).
            if (inputRow >= 0 && inputRow < WIDTH && inputCol >= 0 && inputCol < WIDTH) {
                sum += input[inputRow * WIDTH + inputCol] * mask[i * MASK_WIDTH + j];
            }
        }
    }
    output[row * WIDTH + col] = sum;
}

// Reads a WIDTH x WIDTH matrix and a MASK_WIDTH x MASK_WIDTH mask from stdin,
// runs the convolution kernel on the GPU and prints the resulting matrix.
int main() {

    const size_t imageBytes = sizeof(int) * WIDTH * WIDTH;
    const size_t maskBytes = sizeof(int) * MASK_WIDTH * MASK_WIDTH;

    // Host-side buffers.
    int *input = (int*)malloc(imageBytes);
    int *output = (int*)malloc(imageBytes);
    int *mask = (int*)malloc(maskBytes);

    // Values arrive in row-major order, so a flat index is sufficient.
    printf("Enter the (4x4) input matrix:\n");
    for (int idx = 0; idx < WIDTH * WIDTH; ++idx) {
        scanf("%d", input + idx);
    }

    printf("Enter the (3x3) mask matrix:\n");
    for (int idx = 0; idx < MASK_WIDTH * MASK_WIDTH; ++idx) {
        scanf("%d", mask + idx);
    }

    // Device-side buffers and upload of the two inputs.
    int *d_input, *d_mask, *d_output;
    cudaMalloc((void**)&d_input, imageBytes);
    cudaMalloc((void**)&d_mask, maskBytes);
    cudaMalloc((void**)&d_output, imageBytes);

    cudaMemcpy(d_input, input, imageBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_mask, mask, maskBytes, cudaMemcpyHostToDevice);

    // One TILE_WIDTH x TILE_WIDTH block per tile of the image.
    dim3 block_conf(TILE_WIDTH, TILE_WIDTH);
    dim3 grid_conf(WIDTH / TILE_WIDTH, WIDTH / TILE_WIDTH);

    convolution<<<grid_conf, block_conf>>>(d_input, d_mask, d_output);

    cudaMemcpy(output, d_output, imageBytes, cudaMemcpyDeviceToHost);

    printf("After performing convolution:\n");
    for (int r = 0; r < WIDTH; ++r) {
        const int *line = output + r * WIDTH;
        for (int c = 0; c < WIDTH; ++c) {
            printf("%d ", line[c]);
        }
        printf("\n");
    }

    // Release device memory first, then the host buffers.
    cudaFree(d_input);
    cudaFree(d_mask);
    cudaFree(d_output);
    free(input);
    free(output);
    free(mask);
}

Writing practice3.cu


In [10]:
!nvcc practice3.cu -o practice3

In [11]:
!./practice3

Enter the (4x4) input matrix:
1 2 3 4 
1 2 3 4 
1 2 3 4 
1 2 3 4 
Enter the (3x3) mask matrix:
3 3 3 
5 5 5 
7 7 7 
After performing convolution:
36 72 108 84 
45 90 135 105 
45 90 135 105 
24 48 72 56 


In [12]:
%%writefile practice4.cu

// 1D Tiled Convolution

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>

// 1D convolution kernel: P[t] = sum over i of N[t - mask_width/2 + i] * M[i],
// where input positions outside [0, width) contribute nothing.
//
// Expects a 1D grid of 1D blocks supplying at least `width` threads; surplus
// threads take part in the barriers but write nothing. Uses one int of static
// shared memory per block to stage the current mask coefficient.
__global__ void convolution(int* N,int* M,int* P,int width,int mask_width){
    __shared__ int value;

    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    int start = tid - (mask_width/2);

    // Out-of-range threads must not return early: every thread of the block
    // has to reach the __syncthreads() calls below.
    bool active = tid < width;

    // Accumulate in a register and store once, instead of read-modify-writing
    // global memory on every mask tap.
    int acc = 0;

    for(int i = 0;i < mask_width;i++){
        // Shared memory is per block, so the first thread of *each* block must
        // load the coefficient (testing the global id would leave every block
        // but the first reading an uninitialised value).
        if(threadIdx.x == 0)
            value = M[i];
        __syncthreads();

        int src = start + i;
        if(active && src >= 0 && src < width)
            acc += N[src]*value;

        // Do not let thread 0 overwrite `value` before everyone has used it.
        __syncthreads();
    }

    if(active)
        P[tid] = acc;
}

// Host wrapper: convolves N (length `width`) with mask M (length `mask_width`)
// on the GPU and stores the `width` results in P.
//
// A non-positive `width` is a no-op; a non-positive `mask_width` yields an
// all-zero P. On a CUDA failure a message is printed to stderr and the
// contents of P are unspecified.
void performConvolution(int* N,int* M,int* P,int width,int mask_width){
    if(width <= 0)
        return;

    if(mask_width <= 0){
        // An empty mask sums nothing.
        for(int i = 0;i < width;i++)
            P[i] = 0;
        return;
    }

    int* d_N = NULL,*d_M = NULL,*d_P = NULL;
    size_t size = (size_t)width*sizeof(int), mask_size = (size_t)mask_width*sizeof(int);

    cudaError_t err = cudaMalloc((void**)&d_N,size);
    if(err == cudaSuccess) err = cudaMalloc((void**)&d_M,mask_size);
    if(err == cudaSuccess) err = cudaMalloc((void**)&d_P,size);

    if(err == cudaSuccess) err = cudaMemcpy(d_N,N,size,cudaMemcpyHostToDevice);
    if(err == cudaSuccess) err = cudaMemcpy(d_M,M,mask_size,cudaMemcpyHostToDevice);

    if(err == cudaSuccess){
        // A single block of `width` threads stops working once width exceeds
        // the per-block thread limit, so split the work over several blocks.
        const int threads = 256;
        const int blocks = (width + threads - 1)/threads;

        convolution<<<blocks,threads>>>(d_N,d_M,d_P,width,mask_width);

        // Launch errors are only visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    // Blocking copy: also reports faults raised while the kernel was running.
    if(err == cudaSuccess) err = cudaMemcpy(P,d_P,size,cudaMemcpyDeviceToHost);

    if(err != cudaSuccess)
        fprintf(stderr,"performConvolution: CUDA error: %s\n",cudaGetErrorString(err));

    cudaFree(d_P);
    cudaFree(d_M);
    cudaFree(d_N);
}

// Reads the array width, the mask width, the array and the mask from stdin,
// runs performConvolution and prints the result. Returns 1 on invalid input or
// allocation failure, 0 otherwise.
int main(){

    int* N,*M,*P,width = 0,mask_width = 0;

    printf("Enter the width:\n");
    // A failed parse would otherwise leave the sizes uninitialised; negative
    // sizes would become enormous allocation requests.
    if(scanf("%d",&width) != 1 || width < 0){
        fprintf(stderr,"Invalid width\n");
        return 1;
    }

    printf("Enter the mask width:\n");
    if(scanf("%d",&mask_width) != 1 || mask_width < 0){
        fprintf(stderr,"Invalid mask width\n");
        return 1;
    }

    size_t size = (size_t)width*sizeof(int), mask_size = (size_t)mask_width*sizeof(int);

    N = (int*)malloc(size);
    M = (int*)malloc(mask_size);
    P = (int*)malloc(size);

    // malloc(0) may return NULL, so NULL is only an error for non-empty buffers.
    bool ok = (width == 0 || (N != NULL && P != NULL)) && (mask_width == 0 || M != NULL);
    if(!ok)
        fprintf(stderr,"Host allocation failed\n");

    if(ok){
        printf("Enter array elements:\n");
        for(int i = 0;ok && i < width;i++)
            ok = (scanf("%d",&N[i]) == 1);
    }

    if(ok){
        printf("Enter the mask elements:\n");
        for(int i = 0;ok && i < mask_width;i++)
            ok = (scanf("%d",&M[i]) == 1);
        if(!ok)
            fprintf(stderr,"Invalid input value\n");
    } else if(N != NULL || width == 0){
        // Reached only when reading the array failed (allocation was fine).
        if((width == 0 || P != NULL) && (mask_width == 0 || M != NULL))
            fprintf(stderr,"Invalid input value\n");
    }

    if(ok){
        performConvolution(N,M,P,width,mask_width);

        printf("Result:\n");
        for(int i = 0;i < width;i++)
            printf("%d ",P[i]);
    }

    free(P);
    free(M);
    free(N);
    return ok ? 0 : 1;
}

Writing practice4.cu


In [13]:
!nvcc practice4.cu -o practice4

In [14]:
!./practice4

Enter the width:
6
Enter the mask width:
3
Enter array elements:
1 3 5 6 7 8 
Enter the mask elements:
5 7 9
Result:
34 71 104 130 151 91 

In [15]:
%%writefile practice5.cu

// Sparse Matrix - Vector Multiplication using CSR

#include<cuda_runtime.h>
#include<stdio.h>
#include<stdlib.h>

// Sparse matrix-vector product y = A * x with A stored in CSR form.
// One thread handles one matrix row: the non-zeros of row r live at positions
// row_ptr[r] .. row_ptr[r+1]-1 of `data`, with their column numbers in
// `col_index`. Threads beyond num_rows do nothing.
__global__ void csr(int num_rows,int* data,int* col_index,int* row_ptr,int* x,int* y){
    const int r = blockDim.x*blockIdx.x + threadIdx.x;

    // Surplus threads have no row to process.
    if(r >= num_rows){
        return;
    }

    int dot = 0;
    int nz = row_ptr[r];
    const int stop = row_ptr[r+1];

    // Walk the stored entries of this row, gathering the matching x values.
    while(nz < stop){
        dot += data[nz]*x[col_index[nz]];
        ++nz;
    }

    y[r] = dot;
}

// Host wrapper for the CSR sparse matrix-vector product y = A * x.
//
// n, m:            number of rows and columns of A.
// non_zero_count:  number of stored entries in `data` / `col_index`.
// data, col_index: CSR values and their column numbers.
// row_ptr:         n + 1 row offsets into `data`.
// x:               input vector of m entries.
// y:               output vector of n entries (one per matrix row).
//
// Uploads the operands, runs the kernel, copies y back and prints it. On a
// CUDA failure a message goes to stderr and nothing is printed as a result.
void csr(int n,int m,int non_zero_count,int* data,int* col_index,int* row_ptr,int* x, int* y){
    int* d_data = NULL,*d_col_index = NULL,*d_row_ptr = NULL,*d_x = NULL,*d_y = NULL;
    cudaError_t err = cudaSuccess;

    if(n > 0){
        err = cudaMalloc((void**)&d_data,sizeof(int)*non_zero_count);
        if(err == cudaSuccess) err = cudaMalloc((void**)&d_x,sizeof(int)*m);
        if(err == cudaSuccess) err = cudaMalloc((void**)&d_y,sizeof(int)*n);
        if(err == cudaSuccess) err = cudaMalloc((void**)&d_row_ptr,sizeof(int)*(n+1));
        if(err == cudaSuccess) err = cudaMalloc((void**)&d_col_index,sizeof(int)*non_zero_count);

        if(err == cudaSuccess) err = cudaMemcpy(d_data,data,sizeof(int)*non_zero_count,cudaMemcpyHostToDevice);
        if(err == cudaSuccess) err = cudaMemcpy(d_x,x,sizeof(int)*m,cudaMemcpyHostToDevice);
        if(err == cudaSuccess) err = cudaMemcpy(d_col_index,col_index,sizeof(int)*non_zero_count,cudaMemcpyHostToDevice);
        if(err == cudaSuccess) err = cudaMemcpy(d_row_ptr,row_ptr,sizeof(int)*(n + 1),cudaMemcpyHostToDevice);

        if(err == cudaSuccess){
            // One thread per row, spread over several blocks so that matrices
            // with more rows than the per-block thread limit still launch.
            const int threads = 256;
            const int blocks = (n + threads - 1)/threads;

            csr<<<blocks,threads>>>(n,d_data,d_col_index,d_row_ptr,d_x,d_y);

            // Launch errors are only visible through cudaGetLastError().
            err = cudaGetLastError();
        }

        // y has one entry per ROW, so n (not m) values come back. Copying m
        // values overruns both buffers whenever the matrix has more columns
        // than rows.
        if(err == cudaSuccess) err = cudaMemcpy(y,d_y,sizeof(int)*n,cudaMemcpyDeviceToHost);
    }

    if(err == cudaSuccess){
        printf("Result:\n");
        for(int i = 0; i < n;i++)
          printf("%d ",y[i]);
        printf("\n");
    } else {
        fprintf(stderr,"csr: CUDA error: %s\n",cudaGetErrorString(err));
    }

    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_data);
    cudaFree(d_row_ptr);
    cudaFree(d_col_index);
}

// Reads `count` integers from stdin into `dst`.
// Returns 1 when every value was parsed, 0 on the first failure.
static int readInts(int* dst,int count){
    for(int i = 0;i < count;i++){
        if(scanf("%d",&dst[i]) != 1)
            return 0;
    }
    return 1;
}

// Reads an n x m dense matrix and an m-entry vector from stdin, converts the
// matrix to CSR form and hands everything to csr(), which computes and prints
// the product. Returns 1 on invalid input or allocation failure, 0 otherwise.
int main(){
    int m = 0,n = 0,non_zero_count=0;

    printf("Enter the dimensions:\n");
    if(scanf("%d %d", &n,&m) != 2 || n <= 0 || m <= 0){
        fprintf(stderr,"Invalid dimensions\n");
        return 1;
    }

    int* mat = (int*)malloc(sizeof(int)*n*m);
    int* x = (int*)malloc(sizeof(int)*m);
    int* y = (int*)malloc(sizeof(int)*n);
    // CSR needs one offset per ROW plus a terminator: n + 1 entries, not m + 1.
    int* row_ptr = (int*)calloc((n+1),sizeof(int));
    int* col_index = NULL;
    int* data = NULL;

    int ok = (mat != NULL && x != NULL && y != NULL && row_ptr != NULL);
    if(!ok)
        fprintf(stderr,"Host allocation failed\n");

    if(ok){
        printf("Enter a sparse matrix:\n");
        ok = readInts(mat,n*m);
        if(!ok)
            fprintf(stderr,"Invalid matrix input\n");
    }

    if(ok){
        for(int k = 0;k < n*m;k++){
            if(mat[k] != 0)
                non_zero_count += 1;
        }

        // Allocate at least one element so that an all-zero matrix does not
        // depend on what malloc(0) returns.
        size_t slots = non_zero_count > 0 ? (size_t)non_zero_count : 1;
        col_index = (int*)malloc(sizeof(int)*slots);
        data = (int*)malloc(sizeof(int)*slots);
        ok = (col_index != NULL && data != NULL);
        if(!ok)
            fprintf(stderr,"Host allocation failed\n");
    }

    if(ok){
        printf("Enter the column vector:\n");
        ok = readInts(x,m);
        if(!ok)
            fprintf(stderr,"Invalid vector input\n");
    }

    if(ok){
        int id = 0;

        for(int i = 0;i < n;i++){
            for(int j = 0;j < m;j++){
                // Row-major layout: each row holds m values, so the stride is
                // m. Using n here is only right for square matrices and reads
                // out of bounds when n > m.
                int k = i*m + j;

                if(mat[k] != 0){
                    data[id] = mat[k];
                    col_index[id] = j;
                    id += 1;
                }
            }
            // Offset one past the last stored entry of row i.
            row_ptr[i + 1] = id;
        }

        csr(n,m,non_zero_count,data,col_index,row_ptr,x,y);
    }

    free(mat);
    free(x);
    free(y);
    free(row_ptr);
    free(col_index);
    free(data);
    return ok ? 0 : 1;
}

Writing practice5.cu


In [16]:
!nvcc practice5.cu -o practice5

In [17]:
!./practice5

Enter the dimensions:
3 3 
Enter a sparse matrix:
1 2 3 
4 5 6 
0 0 0
Enter the column vector:
1 2 3 
Result:
14 32 0 
