<a href="https://colab.research.google.com/github/svmner/Parallel-Programming/blob/main/cudaadditional.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [None]:
%%writefile repeatCharacters.cu

#include <stdio.h>
#include <cuda_runtime.h>

// Expands each input character into a run inside `output`.
//
// One thread handles one element: thread t writes A[t] into
// output[offsets[t]] .. output[offsets[t] + B[t] - 1]. Threads whose global
// index is >= `elements` do nothing, so the grid may be rounded up freely.
//
//   A        - input characters, `elements` entries
//   B        - repeat count for each character, `elements` entries
//   output   - destination buffer written by the kernel
//   elements - number of entries in A, B and offsets
//   offsets  - start position in `output` for each element's run
__global__ void repeatCharacters(const char *A, const int *B, char *output, int elements, int *offsets) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (tid >= elements) {
        return;
    }

    int remaining = B[tid];
    const char symbol = A[tid];
    const int base = offsets[tid];

    // Fill this element's run front to back; a non-positive count writes nothing.
    int cursor = base;
    while (remaining > 0) {
        output[cursor] = symbol;
        ++cursor;
        --remaining;
    }
}

// Prints a diagnostic to stderr when a CUDA runtime call fails.
// Returns true if `err` signals a failure, false on cudaSuccess.
static bool cudaFailed(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return false;
    }
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
    return true;
}

// Copies the inputs to the device, runs repeatCharacters and copies the
// expanded characters back into `output` (totalOutputSize bytes).
// All device buffers are released before returning.
// Returns true on success, false if any CUDA call failed.
static bool expandOnDevice(const char *A, const int *B, const int *offsets,
                           int totalSize, char *output, int totalOutputSize) {
    char *d_A = NULL;
    int *d_B = NULL, *d_offsets = NULL;
    char *d_output = NULL;

    // Short-circuit chain: the first failing call stops the sequence.
    bool ok =
        !cudaFailed(cudaMalloc((void **)&d_A, totalSize * sizeof(char)), "cudaMalloc d_A") &&
        !cudaFailed(cudaMalloc((void **)&d_B, totalSize * sizeof(int)), "cudaMalloc d_B") &&
        !cudaFailed(cudaMalloc((void **)&d_output, totalOutputSize * sizeof(char)), "cudaMalloc d_output") &&
        !cudaFailed(cudaMalloc((void **)&d_offsets, totalSize * sizeof(int)), "cudaMalloc d_offsets") &&
        !cudaFailed(cudaMemcpy(d_A, A, totalSize * sizeof(char), cudaMemcpyHostToDevice), "copy A to device") &&
        !cudaFailed(cudaMemcpy(d_B, B, totalSize * sizeof(int), cudaMemcpyHostToDevice), "copy B to device") &&
        !cudaFailed(cudaMemcpy(d_offsets, offsets, totalSize * sizeof(int), cudaMemcpyHostToDevice), "copy offsets to device");

    if (ok) {
        // One thread per input element; the grid is rounded up and the kernel
        // bounds-checks the surplus threads.
        dim3 threadsPerBlock(256);
        dim3 blocksPerGrid((totalSize + threadsPerBlock.x - 1) / threadsPerBlock.x);

        repeatCharacters<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_output, totalSize, d_offsets);

        // cudaGetLastError reports launch-configuration errors; the blocking
        // copy that follows reports faults raised while the kernel ran.
        ok = !cudaFailed(cudaGetLastError(), "repeatCharacters launch") &&
             !cudaFailed(cudaMemcpy(output, d_output, totalOutputSize * sizeof(char), cudaMemcpyDeviceToHost),
                         "copy output to host");
    }

    // cudaFree accepts NULL, so this is safe after a partial allocation.
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_output); cudaFree(d_offsets);
    return ok;
}

// Reads a rows x cols character matrix A and a matching matrix B of repeat
// counts from stdin, then prints the string in which A[i] is repeated B[i]
// times (elements taken in input order).
// Returns 0 on success and 1 on invalid input, allocation failure or CUDA error.
int main() {
    const int kMaxInt = 0x7fffffff;
    int rows = 0, cols = 0;

    // Ask user for matrix dimensions
    printf("Enter the number of rows and columns for matrices A and B: ");
    if (scanf("%d %d", &rows, &cols) != 2 || rows < 0 || cols < 0) {
        fprintf(stderr, "Invalid matrix dimensions.\n");
        return 1;
    }

    // Form the product in 64 bits so an oversized matrix is rejected instead
    // of silently wrapping around.
    const long long cells = (long long)rows * cols;
    if (cells > kMaxInt) {
        fprintf(stderr, "Matrix is too large.\n");
        return 1;
    }
    const int totalSize = (int)cells;

    // Nothing to read or expand: the result is the empty string.
    if (totalSize == 0) {
        printf("Output String: %s\n", "");
        return 0;
    }

    // Allocate host memory
    char *A = (char *)malloc(totalSize * sizeof(char));
    int *B = (int *)malloc(totalSize * sizeof(int));
    int *offsets = (int *)malloc(totalSize * sizeof(int));
    char *output = NULL;
    int totalOutputSize = 0;

    bool ok = (A != NULL && B != NULL && offsets != NULL);
    if (!ok) {
        fprintf(stderr, "Host memory allocation failed.\n");
    }

    // Initialize host matrices A and B with user input
    if (ok) {
        printf("Enter the elements of matrix A:\n");
        for (int i = 0; ok && i < totalSize; ++i) {
            // The space before %c skips any whitespace left over from earlier input.
            ok = (scanf(" %c", &A[i]) == 1);
        }
        if (!ok) {
            fprintf(stderr, "Failed to read matrix A.\n");
        }
    }

    if (ok) {
        printf("Enter the elements of matrix B:\n");
        for (int i = 0; ok && i < totalSize; ++i) {
            // Reject negative counts and totals that would overflow an int
            // (one slot is reserved for the null terminator).
            if (scanf("%d", &B[i]) != 1 || B[i] < 0 || B[i] > kMaxInt - 1 - totalOutputSize) {
                ok = false;
            } else {
                // The run for element i starts where all previous runs end.
                offsets[i] = totalOutputSize;
                totalOutputSize += B[i];
            }
        }
        if (!ok) {
            fprintf(stderr, "Matrix B must hold non-negative integers whose sum fits in an int.\n");
        }
    }

    // Allocate output string
    if (ok) {
        output = (char *)malloc(((size_t)totalOutputSize + 1) * sizeof(char)); // +1 for the null-terminator
        ok = (output != NULL);
        if (!ok) {
            fprintf(stderr, "Host memory allocation failed.\n");
        }
    }

    if (ok) {
        output[totalOutputSize] = '\0'; // Null-terminate the string
        // With every count equal to zero there is nothing for the GPU to write.
        if (totalOutputSize > 0) {
            ok = expandOnDevice(A, B, offsets, totalSize, output, totalOutputSize);
        }
    }

    // Print the resulting output string
    if (ok) {
        printf("Output String: %s\n", output);
    }

    // Cleanup (free accepts NULL)
    free(A); free(B); free(output); free(offsets);

    return ok ? 0 : 1;
}

Writing repeatCharacters.cu


In [None]:
!nvcc repeatCharacters.cu -o repeatCharacters

In [None]:
!./repeatCharacters

Enter the number of rows and columns for matrices A and B: ^C


In [None]:
%%writefile practice1.cu

#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#define TILE_WIDTH 2
#define WIDTH 4

// Multiplies two WIDTH x WIDTH row-major integer matrices: d_C = d_A * d_B.
//
// Launched on a 2D grid of 2D blocks; each thread computes the single output
// element at (row, col) derived from its block and thread indices. Threads
// that fall outside the WIDTH x WIDTH matrix write nothing.
__global__ void perform2DMatrixMult(int *d_A,int *d_B,int *d_C){
  const int outRow = blockIdx.y * blockDim.y + threadIdx.y;
  const int outCol = blockIdx.x * blockDim.x + threadIdx.x;

  // Guard clause for threads beyond the matrix edge.
  if(outRow >= WIDTH || outCol >= WIDTH){
    return;
  }

  // Dot product of row `outRow` of d_A with column `outCol` of d_B.
  const int rowBase = outRow * WIDTH;
  int dot = 0;
  int k = 0;
  while(k < WIDTH){
    dot += d_A[rowBase + k] * d_B[k * WIDTH + outCol];
    ++k;
  }

  d_C[rowBase + outCol] = dot;
}

// Prints a diagnostic to stderr when a CUDA runtime call fails.
// Returns true if `err` signals a failure, false on cudaSuccess.
static bool cudaFailed(cudaError_t err, const char *what){
  if(err == cudaSuccess){
    return false;
  }
  fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
  return true;
}

// Reads two WIDTH x WIDTH integer matrices from stdin, multiplies them on the
// GPU with perform2DMatrixMult and prints the product row by row.
// Returns 0 on success and 1 on bad input, allocation failure or CUDA error.
int main(){
  const size_t bytes = sizeof(int)*WIDTH*WIDTH;
  int *d_A = NULL,*d_B = NULL,*d_C = NULL;

  int *A = (int*)malloc(bytes);
  int *B = (int*)malloc(bytes);
  int *C = (int*)malloc(bytes);

  bool ok = (A != NULL && B != NULL && C != NULL);
  if(!ok){
    fprintf(stderr, "Host memory allocation failed.\n");
  }

  if(ok){
    printf("Enter 4x4 matrix A: \n");
    for(int i = 0; ok && i<WIDTH*WIDTH;i++){
      ok = (scanf("%d",&A[i]) == 1);
    }
    if(!ok){
      fprintf(stderr, "Failed to read matrix A.\n");
    }
  }

  if(ok){
    printf("Enter 4x4 matrix B: \n");
    for(int i = 0; ok && i<WIDTH*WIDTH;i++){
      ok = (scanf("%d",&B[i]) == 1);
    }
    if(!ok){
      fprintf(stderr, "Failed to read matrix B.\n");
    }
  }

  // Short-circuit chain: the first failing call stops the sequence.
  ok = ok &&
       !cudaFailed(cudaMalloc((void**)&d_A,bytes), "cudaMalloc d_A") &&
       !cudaFailed(cudaMalloc((void**)&d_B,bytes), "cudaMalloc d_B") &&
       !cudaFailed(cudaMalloc((void**)&d_C,bytes), "cudaMalloc d_C") &&
       !cudaFailed(cudaMemcpy(d_A,A,bytes,cudaMemcpyHostToDevice), "copy A to device") &&
       !cudaFailed(cudaMemcpy(d_B,B,bytes,cudaMemcpyHostToDevice), "copy B to device");

  if(ok){
    // Round the grid up so the whole matrix is covered even when WIDTH is not
    // a multiple of TILE_WIDTH; the kernel ignores out-of-range threads.
    const int tiles = (WIDTH + TILE_WIDTH - 1)/TILE_WIDTH;
    dim3 dimGrid(tiles,tiles);
    dim3 dimBlock(TILE_WIDTH,TILE_WIDTH);

    perform2DMatrixMult<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);

    // cudaGetLastError reports launch-configuration errors; the blocking
    // copy that follows reports faults raised while the kernel ran.
    ok = !cudaFailed(cudaGetLastError(), "perform2DMatrixMult launch") &&
         !cudaFailed(cudaMemcpy(C,d_C,bytes,cudaMemcpyDeviceToHost), "copy C to host");
  }

  if(ok){
    printf("This is the resultant matrix: \n");
    for(int i =0;i<WIDTH;i++){
      for(int j = 0;j<WIDTH;j++){
        printf("%d\t",C[i*WIDTH + j]);
      }
      printf("\n");
    }
  }

  // Release device and host buffers on every path (both accept NULL).
  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
  free(A); free(B); free(C);

  return ok ? 0 : 1;
}

Overwriting practice1.cu


In [None]:
!nvcc practice1.cu -o practice1


In [None]:
!./practice1

Enter 4x4 matrix A: 
1 2 3 4 
1 2 3 4 
1 2 3 4 
1 2 3 4 
Enter 4x4 matrix B: 
1 2 3 4 
1 2 3 4 
1 2 3 4
1 2 3 4 
This is the resultant matrix: 
10	20	30	40	
10	20	30	40	
10	20	30	40	
10	20	30	40	


In [None]:
%%writefile practice2.cu

#include <stdio.h>
#include <cuda_runtime.h>

// CUDA kernel performing one bottom-up merge pass.
//
// The array is viewed as consecutive chunks of `width` elements. Thread idx
// owns chunk idx: it merges the two sorted halves
// [start, start + width/2) and [start + width/2, start + width), both clipped
// to `size`, from deviceArray into the same index range of tempArray.
//
// Launch layout: 1D grid of 1D blocks supplying at least ceil(size / width)
// threads. Threads without a chunk return immediately, so any larger grid is
// also valid.
//
//   deviceArray - input; each half-chunk must already be sorted
//   tempArray   - output, `size` elements, must not alias deviceArray
//   size        - number of elements in both arrays
//   width       - chunk length for this pass (> 0)
__global__ void mergeKernel(int *deviceArray, int *tempArray, int size, int width) {
    if (size <= 0 || width <= 0) {
        return;
    }

    // 64-bit so a large grid cannot wrap the thread index.
    const long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // Test idx against the chunk count BEFORE multiplying by width, so the
    // product below is known to be < size and cannot overflow an int.
    if (idx > (size - 1) / width) {
        return;
    }

    const int start = (int)idx * width;
    const int left = size - start;        // elements from start to the array end
    const int half = width / 2;

    // Clip mid and end to size by comparing against `left`; this avoids
    // computing start + width, which could exceed INT_MAX.
    const int mid = (left > half) ? start + half : size;
    const int end = (left > width) ? start + width : size;

    int a = start, b = mid, i = start;

    // Standard two-way merge of [start, mid) and [mid, end).
    while (a < mid && b < end) {
        tempArray[i++] = (deviceArray[a] < deviceArray[b]) ? deviceArray[a++] : deviceArray[b++];
    }
    // Drain whichever half still has elements.
    while (a < mid) {
        tempArray[i++] = deviceArray[a++];
    }
    while (b < end) {
        tempArray[i++] = deviceArray[b++];
    }
}

// Prints a diagnostic to stderr when a CUDA runtime call made by mergeSort
// fails. Returns true if `err` signals a failure, false on cudaSuccess.
static bool mergeSortCudaFailed(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return false;
    }
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
    return true;
}

// Host function that sorts `array` (ascending, in place) with a bottom-up
// merge sort whose merge passes run on the GPU via mergeKernel.
//
//   array - host array of `size` ints; overwritten with the sorted result
//   size  - number of elements; values < 2 leave the array untouched
//
// On a CUDA failure a message is printed to stderr and `array` is left
// unchanged. Arrays longer than 2^30 elements are rejected because the final
// pass width would not fit in the kernel's int parameter.
void mergeSort(int *array, int size) {
    // Zero or one element is already sorted.
    if (array == NULL || size < 2) {
        return;
    }
    if (size > (1 << 30)) {
        fprintf(stderr, "mergeSort: array too large (%d elements).\n", size);
        return;
    }

    const size_t bytes = (size_t)size * sizeof(int);
    int *deviceArray = NULL, *tempArray = NULL;

    // Allocate memory on the GPU and copy the array from host to device
    bool ok =
        !mergeSortCudaFailed(cudaMalloc(&deviceArray, bytes), "cudaMalloc deviceArray") &&
        !mergeSortCudaFailed(cudaMalloc(&tempArray, bytes), "cudaMalloc tempArray") &&
        !mergeSortCudaFailed(cudaMemcpy(deviceArray, array, bytes, cudaMemcpyHostToDevice),
                             "copy array to device");

    const int threadsPerBlock = 1024;

    // Perform the merge sort. A pass of width w merges sorted runs of length
    // w/2, so passes must continue while w/2 < size; stopping at w <= size
    // would skip the final merge whenever size is not a power of two.
    // The counter is 64-bit so the closing doubling cannot overflow.
    for (long long w = 2; ok && w / 2 < size; w *= 2) {
        const int width = (int)w;
        // One thread per chunk of `width` elements.
        const int chunks = (size - 1) / width + 1;
        const int blocks = (chunks + threadsPerBlock - 1) / threadsPerBlock;

        mergeKernel<<<blocks, threadsPerBlock>>>(deviceArray, tempArray, size, width);
        ok = !mergeSortCudaFailed(cudaGetLastError(), "mergeKernel launch");

        // Every element belongs to exactly one chunk, so tempArray now holds
        // the complete pass result. Swap roles instead of copying it back;
        // launches on the same stream run in order, so the next pass reads
        // finished data.
        int *swapTmp = deviceArray;
        deviceArray = tempArray;
        tempArray = swapTmp;
    }

    // Copy sorted array back to host. This blocking copy also waits for the
    // queued kernels and reports any fault they raised.
    if (ok) {
        mergeSortCudaFailed(cudaMemcpy(array, deviceArray, bytes, cudaMemcpyDeviceToHost),
                            "copy sorted array to host");
    }

    // Free device memory (cudaFree accepts NULL)
    cudaFree(deviceArray);
    cudaFree(tempArray);
}

// Reads an element count and that many integers from stdin, sorts them with
// mergeSort and prints the result on one line.
// Returns 0 on success and 1 on invalid input or allocation failure.
int main() {
    int *array = NULL;
    int size = 0;

    printf("Enter the number of elements: ");
    // A failed read or a negative count must not reach malloc.
    if (scanf("%d", &size) != 1 || size < 0) {
        fprintf(stderr, "Invalid number of elements.\n");
        return 1;
    }

    if (size > 0) {
        array = (int*)malloc(size * sizeof(int));
        if (array == NULL) {
            fprintf(stderr, "Host memory allocation failed.\n");
            return 1;
        }
    }

    printf("Enter the elements of the array:\n");
    for (int i = 0; i < size; i++) {
        if (scanf("%d", &array[i]) != 1) {
            fprintf(stderr, "Failed to read element %d.\n", i);
            free(array);
            return 1;
        }
    }

    // An empty array needs no device work.
    if (size > 0) {
        mergeSort(array, size);
    }

    // Print the sorted array
    printf("Sorted array:\n");
    for (int i = 0; i < size; i++) {
        printf("%d ", array[i]);
    }
    printf("\n");

    free(array);

    return 0;
}


Overwriting practice2.cu


In [None]:
!nvcc practice2.cu -o practice2

In [None]:
!./practice2

Enter the number of elements: 10
Enter the elements of the array:
1 4 2 3 7 6 9 4 18 12
Sorted array:
1 2 3 4 4 6 7 9 12 18 
