In [None]:
!apt install nvidia-cuda-toolkit
!nvcc --version

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libaccinj64-11.5 libatk-wrapper-java libatk-wrapper-java-jni
  libbabeltrace1 libcub-dev libcublas11 libcublaslt11 libcudart11.0 libcufft10 libcufftw10
  libcuinj64-11.5 libcupti-dev libcupti-doc libcupti11.5 libcurand10 libcusolver11 libcusolvermg11
  libcusparse11 libdebuginfod-common libdebuginfod1 libegl-dev libfontenc1 libgail-common libgail18
  libgl-dev libgl1-mesa-dev libgles-dev libgles1 libglvnd-core-dev libglvnd-dev libglx-dev
  libgtk2.0-0 libgtk2.0-bin libgtk2.0-common libipt2 libnppc11 libnppial11 libnppicc11 libnppidei11
  libnppif11 libnppig11 libnppim11 libnppist11 libnppisu11 libnppitc11 libnpps11 libnvblas11
  libnvidia-compute-495 libnvidia-compute-510 libnvidia-compute-535 libnvidia-ml-dev libnvjpeg11
  libnvrtc-builtins11.5 libnvrtc11.2 libnvtoolsext1 libnvvm4 libopengl-dev libq

In [None]:
%%writefile try.cu
#include <stdio.h>
#include <stdlib.h>

__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
int main() {
int a, b, c;
// host copies of variables a, b & c
int *d_a, *d_b, *d_c;
// device copies of variables a, b & c
int size = sizeof(int);
// Allocate space for device copies of a, b, c
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_c, size);
// Setup input values
c = 0;
a = 3;
b = 5;
// Copy inputs to device
cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
// Launch add() kernel on GPU
add<<<1,1>>>(d_a, d_b, d_c);
// Copy result back to host
cudaError err = cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
  if(err!=cudaSuccess) {
      printf("CUDA error copying to Host: %s\n", cudaGetErrorString(err));
  }
printf("result is %d\n",c);
// Cleanup
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}

Writing try.cu


In [None]:
!nvcc try.cu -o try

In [None]:
!./try

result is 8


<center><h1>Lab 8</h1></center>

- Write a program to multiply 2 matrices using CUDA

- Write a CUDA program to find the sum of elements of 2 1D arrays.

In [None]:
%%writefile Multiplication.cu

#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 16

void initializeMatrix(float *matrix, int rows, int cols) {
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            matrix[i * cols + j] = rand() % 100;
        }
    }
}
void printMatrix(float *matrix, int rows, int cols) {
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("%f ", matrix[i * cols + j]);
        }
        printf("\n");
    }
}
__global__ void matrixMultiply(float *A, float *B, float *C, int rowsA, int colsA, int colsB) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < rowsA && col < colsB) {
        float sum = 0.0f;
        for (int k = 0; k < colsA; k++) {
            sum += A[row * colsA + k] * B[k * colsB + col];
        }
        C[row * colsB + col] = sum;
    }
}

int main() {
    int rowsA, colsA, rowsB, colsB;
    printf("Enter dimensions of matrix A (rows cols): ");
    scanf("%d %d", &rowsA, &colsA);
    printf("Enter dimensions of matrix B (rows cols): ");
    scanf("%d %d", &rowsB, &colsB);
    if (colsA != rowsB) {
        printf("Error: Incompatible matrix dimensions.\n");
        return 1;
    }

    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;
    h_A = (float *)malloc(rowsA * colsA * sizeof(float));
    h_B = (float *)malloc(rowsB * colsB * sizeof(float));
    h_C = (float *)malloc(rowsA * colsB * sizeof(float));

    initializeMatrix(h_A, rowsA, colsA);
    initializeMatrix(h_B, rowsB, colsB);
    cudaMalloc((void **)&d_A, rowsA * colsA * sizeof(float));
    cudaMalloc((void **)&d_B, rowsB * colsB * sizeof(float));
    cudaMalloc((void **)&d_C, rowsA * colsB * sizeof(float));
    cudaMemcpy(d_A, h_A, rowsA * colsA * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, rowsB * colsB * sizeof(float), cudaMemcpyHostToDevice);
    dim3 gridDim((colsB + BLOCK_SIZE - 1) / BLOCK_SIZE, (rowsA + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
    matrixMultiply<<<gridDim, blockDim>>>(d_A, d_B, d_C, rowsA, colsA, colsB);
    cudaMemcpy(h_C, d_C, rowsA * colsB * sizeof(float), cudaMemcpyDeviceToHost);
    printf("Result matrix:\n");
    printMatrix(h_C, rowsA, colsB);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

Writing Multiplication.cu


In [None]:
!nvcc Multiplication.cu -o Multiplication
!./Multiplication


Enter dimensions of matrix A (rows cols): 3
3
Enter dimensions of matrix B (rows cols): 3
1
Result matrix:
9154.000000 
7026.000000 
8833.000000 


In [None]:
%%writefile arraySum.cu

#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 128

// CUDA kernel to find the sum of elements in two 1D arrays
__global__ void arraySum(float *a, float *b, float *c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    int n;

    printf("Enter the number of elements in each array: ");
    scanf("%d", &n);

    float *h_a, *h_b, *h_c;
    float *d_a, *d_b, *d_c;

    // Allocate host memory for arrays
    h_a = (float *)malloc(n * sizeof(float));
    h_b = (float *)malloc(n * sizeof(float));
    h_c = (float *)malloc(n * sizeof(float));

    // Initialize arrays with random values
    for (int i = 0; i < n; i++) {
        h_a[i] = rand() % 100;
        h_b[i] = rand() % 100;
    }

    // Allocate device memory for arrays
    cudaMalloc((void **)&d_a, n * sizeof(float));
    cudaMalloc((void **)&d_b, n * sizeof(float));
    cudaMalloc((void **)&d_c, n * sizeof(float));

    // Copy arrays from host to device
    cudaMemcpy(d_a, h_a, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, n * sizeof(float), cudaMemcpyHostToDevice);

    // Launch CUDA kernel for array sum
    dim3 gridDim((n + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 blockDim(BLOCK_SIZE);
    arraySum<<<gridDim, blockDim>>>(d_a, d_b, d_c, n);

    // Copy result array from device to host
    cudaMemcpy(h_c, d_c, n * sizeof(float), cudaMemcpyDeviceToHost);

    // Print the result array
    printf("Result array:\n");
    for (int i = 0; i < n; i++) {
        printf("%f ", h_c[i]);
    }
    printf("\n");

    // Free device and host memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

Writing arraySum.cu


In [None]:
!nvcc arraySum.cu -o arraySum
!./arraySum

Enter the number of elements in each array: 5
Result array:
169.000000 92.000000 128.000000 178.000000 70.000000 


Lab 10

In [None]:

%%writefile

#include <stdio.h>
#include <cuda.h> // Include the CUDA header

#define N 1024

// Min function doesn't need to be __global__
int Min(int i, int j) {
  if (i < j) {
    return i;
  }
  return j;
}

__global__ void findMin(int *array, int *min) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  // Check if index is within the array bounds
  if (idx < N) {
    int localMin = array[idx];
    for (int i = idx + blockDim.x; i < N; i += blockDim.x) {
      // Access elements within the block only (avoid out-of-bounds)
      if (i < N) {
        localMin = Min(localMin, array[i]);
      }
    }

    // Reduce local minima within a warp (assuming warp size is a power of 2)
    for (int stride = warpSize/2; stride > 0; stride >>= 1) {
      __syncthreads();
      if (threadIdx.x < stride && idx + threadIdx.x + stride < N) {
        localMin = Min(localMin, array[idx + threadIdx.x + stride]);
      }
    }

    // Store the minimum from the first thread in each block (assuming one min value per block)
    if (threadIdx.x == 0) {
      min[blockIdx.x] = localMin;
    }
  }
}

int main() {
  int array[N];
  int min = INT_MAX;

  // Initialize the array with random values
  for (int i = 0; i < N; i++) {
    array[i] = rand() % 100;
  }

  // Allocate memory on the device
  int *d_array, *d_min;
  cudaMalloc(&d_array, N * sizeof(int));
  cudaMalloc(&d_min, sizeof(int)); // Allocate for one minimum value

  // Copy the array to the device
  cudaMemcpy(d_array, array, N * sizeof(int), cudaMemcpyHostToDevice);

  // Launch the kernel with appropriate grid size (assuming one block)
  int threadsPerBlock = 256; // Adjust based on your hardware
  int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
  findMin<<<blocksPerGrid, threadsPerBlock>>>(d_array, d_min);

  // Copy the minimum value back to the host
  cudaMemcpy(&min, d_min, sizeof(int), cudaMemcpyDeviceToHost);

  // No need for another loop to find minimum (already done in the kernel)
  printf("Minimum element: %d\n", min);

  // Free memory on the device
  cudaFree(d_array);
  cudaFree(d_min);

  return 0;
}

Overwriting Lab10.cu


In [None]:
!nvcc Lab10.cu -o Lab10
!./Lab10

[01m[0m[01mLab10.cu(21)[0m: [01;31merror[0m: calling a __host__ function("[01mMin(int, int)[0m") from a __global__ function("[01mfindMin[0m") is not allowed

[01m[0m[01mLab10.cu(21)[0m: [01;31merror[0m: identifier "[01mMin[0m" is undefined in device code

2 errors detected in the compilation of "Lab10.cu".
/bin/bash: line 1: ./Lab10: No such file or directory


<h3>Lab12</h3>
2D convolution

In [2]:
%%writefile Lab12.cu
#include <iostream>
#include <cuda_runtime.h>

#define MASK_WIDTH 3  // Convolution kernel size (3x3 in this example)
#define TILE_WIDTH 16 // Tile size for shared memory optimization

// CUDA kernel to perform 2D convolution
__global__ void convolution2D(float *input, float *output, float *mask, int width, int height) {
    // Calculate thread coordinates
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int row_o = blockIdx.y * TILE_WIDTH + ty;  // Row index for output matrix
    int col_o = blockIdx.x * TILE_WIDTH + tx;  // Column index for output matrix

    int row_i = row_o - MASK_WIDTH / 2;  // Corresponding row index for input with padding
    int col_i = col_o - MASK_WIDTH / 2;  // Corresponding column index for input with padding

    // Shared memory to store the tile of input matrix
    __shared__ float input_tile[TILE_WIDTH + MASK_WIDTH - 1][TILE_WIDTH + MASK_WIDTH - 1];

    // Load data into shared memory (including padding for boundaries)
    if (row_i >= 0 && row_i < height && col_i >= 0 && col_i < width) {
        input_tile[ty][tx] = input[row_i * width + col_i];
    } else {
        input_tile[ty][tx] = 0.0f;  // Apply zero-padding for out-of-bound elements
    }

    __syncthreads();  // Ensure all data is loaded into shared memory

    // Perform convolution only if within output bounds
    float output_value = 0.0f;
    if (ty < TILE_WIDTH && tx < TILE_WIDTH) {
        for (int i = 0; i < MASK_WIDTH; i++) {
            for (int j = 0; j < MASK_WIDTH; j++) {
                output_value += input_tile[ty + i][tx + j] * mask[i * MASK_WIDTH + j];
            }
        }

        // Store the output result
        if (row_o < height && col_o < width) {
            output[row_o * width + col_o] = output_value;
        }
    }
}

int main() {
    int width = 512;  // Image width
    int height = 512; // Image height

    int size = width * height * sizeof(float);  // Size of input and output arrays

    // Allocate host memory for input, output, and mask (kernel)
    float *h_input = new float[width * height];
    float *h_output = new float[width * height];
    float h_mask[MASK_WIDTH * MASK_WIDTH] = {
        0, -1, 0,
        -1, 4, -1,
        0, -1, 0};  // Example 3x3 Laplacian kernel (edge detection)

    // Initialize the input matrix with random values (as an example)
    for (int i = 0; i < width * height; i++) {
        h_input[i] = rand() % 256;  // Random values between 0 and 255
    }

    // Allocate device memory
    float *d_input, *d_output, *d_mask;
    cudaMalloc((void**)&d_input, size);
    cudaMalloc((void**)&d_output, size);
    cudaMalloc((void**)&d_mask, MASK_WIDTH * MASK_WIDTH * sizeof(float));

    // Copy input data from host to device
    cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_mask, h_mask, MASK_WIDTH * MASK_WIDTH * sizeof(float), cudaMemcpyHostToDevice);

    // Define block and grid dimensions
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    dim3 dimGrid((width + TILE_WIDTH - 1) / TILE_WIDTH, (height + TILE_WIDTH - 1) / TILE_WIDTH);

    // Launch the 2D convolution kernel
    convolution2D<<<dimGrid, dimBlock>>>(d_input, d_output, d_mask, width, height);

    // Copy the result back to the host
    cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost);

    // Display the first few elements of the output matrix (for validation)
    std::cout << "Convolution output (first 10 elements):" << std::endl;
    for (int i = 0; i < 10; i++) {
        std::cout << h_output[i] << " ";
    }
    std::cout << std::endl;

    // Free device memory
    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_mask);

    // Free host memory
    delete[] h_input;
    delete[] h_output;

    return 0;
}


Writing Lab12.cu


In [4]:
!nvcc Lab12.cu -o Lab12
!./Lab12

Convolution output (first 10 elements):
65 414 -23 72 -154 792 -369 685 -482 571 
