## Refreshing the Cloud Instance of CUDA On Server

In [None]:
# Remove the CUDA and NVIDIA packages that are currently installed so that a
# specific CUDA version can be installed afterwards.
# The glob patterns are quoted so the shell passes them to apt-get unchanged
# instead of expanding them against file names in the working directory.
!apt-get --purge remove cuda 'nvidia*' 'libnvidia-*'
# Purge every remaining package whose dpkg listing contains "cuda-".
# -r keeps xargs from running dpkg without a package name when nothing matches.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -r -n1 dpkg --purge
!apt-get remove 'cuda-*'
!apt autoremove
# Refresh the package index before the install step that follows.
!apt-get update

## Installing CUDA Version 9

In [None]:
# Download the CUDA 9.2 local-repository package for Ubuntu 16.04 and save it
# under a .deb file name (the download URL itself has no extension).
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Install the downloaded repository package with dpkg.
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Add the public key shipped under /var/cuda-repo-9-2-local to apt's keyring.
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
# Refresh the package index, then install the CUDA 9.2 packages.
!apt-get update
!apt-get install cuda-9.2

In [None]:
# Print the nvcc version to confirm which CUDA compiler is found on the PATH.
!nvcc --version

In [None]:
# Install the nvcc4jupyter plugin directly from its GitHub repository.
# The repository is fetched over HTTPS because GitHub no longer serves the
# unauthenticated git:// protocol.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
# Load the nvcc_plugin IPython extension.
# NOTE(review): presumably this is what provides the %%cu cell magic used by
# the CUDA cells in this notebook - confirm against the plugin's documentation.
%load_ext nvcc_plugin

# Q1. Write and execute a program in CUDA to add two vectors of length N to meet the following requirements using 3 different kernels  

## a) block size as N  
## b) N threads within a block 
## c) Keep the number of threads per block as 256 (constant) and vary the number of blocks to handle N elements.  

In [36]:
%%cu
#include <stdio.h>
#include <stdlib.h>

/*
 * Element-wise vector addition: dev_arr3[i] = dev_arr1[i] + dev_arr2[i].
 *
 * Each thread handles at most one element, chosen by its flat global index
 * computed from the x components of the block and thread indices.
 * Threads whose index is N or larger do nothing.
 */
__global__ void vecAddKernel(float *dev_arr1, float *dev_arr2, float *dev_arr3, int N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads past the end of the vectors have no work to do.
    if (idx >= N)
        return;

    dev_arr3[idx] = dev_arr1[idx] + dev_arr2[idx];
}

// Stop the program with a readable message if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "\n CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Adds two N-element vectors on the GPU using N blocks of one thread each
 * and prints both inputs and the result.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main()
{
    const int N = 1024;                        // vector length
    const size_t arr_size = N * sizeof(float); // bytes per vector

    float *dev_arr1 = NULL, *dev_arr2 = NULL, *dev_arr3 = NULL;
    float host_arr1[N], host_arr2[N], host_arr3[N];

    // Fill both inputs with pseudo-random whole numbers in the range 50..69.
    for (int f = 0; f < N; f++) host_arr1[f] = (rand() % 20) + 50;
    for (int f = 0; f < N; f++) host_arr2[f] = (rand() % 20) + 50;

    printf("\n Initial Array 1 of %d elements is: \n", N);
    for (int f = 0; f < N; f++) printf("%.0f ", host_arr1[f]);
    printf("\n Initial Array 2 of %d elements is: \n", N);
    for (int f = 0; f < N; f++) printf("%.0f ", host_arr2[f]);

    // Allocate device buffers for the two inputs and the result.
    checkCuda(cudaMalloc((void **)&dev_arr1, arr_size), "cudaMalloc(dev_arr1)");
    checkCuda(cudaMalloc((void **)&dev_arr2, arr_size), "cudaMalloc(dev_arr2)");
    checkCuda(cudaMalloc((void **)&dev_arr3, arr_size), "cudaMalloc(dev_arr3)");

    // Upload the two input vectors.
    checkCuda(cudaMemcpy(dev_arr1, host_arr1, arr_size, cudaMemcpyHostToDevice), "upload of array 1");
    checkCuda(cudaMemcpy(dev_arr2, host_arr2, arr_size, cudaMemcpyHostToDevice), "upload of array 2");

    // Launch one block per element with a single thread in each block.
    // The grid size is a plain integer, so no floating-point ceil() is needed.
    vecAddKernel<<<N, 1>>>(dev_arr1, dev_arr2, dev_arr3, N);
    // A launch has no return value; a rejected configuration only shows up here.
    checkCuda(cudaGetLastError(), "vecAddKernel launch");

    // Blocking copy of the result; it also reports errors raised while the kernel ran.
    checkCuda(cudaMemcpy(host_arr3, dev_arr3, arr_size, cudaMemcpyDeviceToHost), "download of result");

    // Release the device buffers.
    checkCuda(cudaFree(dev_arr1), "cudaFree(dev_arr1)");
    checkCuda(cudaFree(dev_arr2), "cudaFree(dev_arr2)");
    checkCuda(cudaFree(dev_arr3), "cudaFree(dev_arr3)");

    printf("\n Result Array 3 of %d elements after addition is: \n", N);
    for (int f = 0; f < N; f++)
        printf("%.0f ", host_arr3[f]);
    return 0;
}


 Initial Array 1 of 1024 elements is: 
53 56 67 65 63 65 56 62 59 51 52 57 60 69 53 56 50 56 62 66 61 58 57 59 52 60 52 53 57 65 59 52 52 68 59 57 63 66 61 52 59 63 51 69 54 67 68 54 65 60 63 56 61 50 66 63 52 60 66 51 55 55 54 57 66 55 56 59 63 67 54 65 52 55 64 57 64 54 53 60 57 58 66 68 58 54 53 61 64 69 62 50 66 58 69 62 56 56 64 69 65 60 64 68 57 51 67 52 67 62 62 66 51 50 56 51 55 59 54 69 50 59 61 67 67 61 51 65 59 57 57 66 67 63 56 55 56 53 69 54 58 61 62 59 53 69 60 58 58 65 50 59 66 53 68 55 56 61 51 65 69 58 54 58 51 60 63 50 64 54 54 64 57 66 53 61 57 55 69 66 62 61 67 58 65 57 64 51 68 65 59 67 65 63 68 58 53 61 58 59 66 54 53 53 63 68 56 50 54 68 58 58 59 67 67 66 54 53 60 53 60 69 62 55 54 60 55 69 64 56 59 52 52 54 67 57 65 54 58 61 52 58 69 53 56 58 50 52 61 60 55 51 51 60 68 55 50 66 54 56 52 55 58 66 52 68 54 67 52 54 50 66 62 69 69 60 58 61 63 61 51 60 63 64 50 53 69 51 69 66 69 63 63 58 60 65 56 56 54 50 60 54 66 52 66 57 55 66 69 68 57 62 58 62 69 59 66 60 52 57 

In [37]:
%%cu
#include <stdio.h>
#include <stdlib.h>

/*
 * Adds two vectors element by element and stores the sums in dev_arr3.
 *
 * The element a thread works on is its flat global index, built from the
 * x components of blockIdx, blockDim and threadIdx. A thread whose index
 * is not below N leaves memory untouched.
 */
__global__ void vecAddKernel(float *dev_arr1, float *dev_arr2, float *dev_arr3, int N)
{
    // Offset of this block's first thread, then this thread's slot within it.
    const unsigned int blockStart = blockDim.x * blockIdx.x;
    const int elem = threadIdx.x + blockStart;

    if (elem < N)
    {
        const float sum = dev_arr1[elem] + dev_arr2[elem];
        dev_arr3[elem] = sum;
    }
}

// Stop the program with a readable message if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "\n CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Adds two N-element vectors on the GPU using a single block of N threads
 * and prints both inputs and the result.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main()
{
    const int N = 1024;                        // vector length
    const size_t arr_size = N * sizeof(float); // bytes per vector

    float *dev_arr1 = NULL, *dev_arr2 = NULL, *dev_arr3 = NULL;
    float host_arr1[N], host_arr2[N], host_arr3[N];

    // Fill both inputs with pseudo-random whole numbers in the range 50..69.
    for (int f = 0; f < N; f++) host_arr1[f] = (rand() % 20) + 50;
    for (int f = 0; f < N; f++) host_arr2[f] = (rand() % 20) + 50;

    printf("\n Initial Array 1 of %d elements is: \n", N);
    for (int f = 0; f < N; f++) printf("%.0f ", host_arr1[f]);
    printf("\n Initial Array 2 of %d elements is: \n", N);
    for (int f = 0; f < N; f++) printf("%.0f ", host_arr2[f]);

    // Allocate device buffers for the two inputs and the result.
    checkCuda(cudaMalloc((void **)&dev_arr1, arr_size), "cudaMalloc(dev_arr1)");
    checkCuda(cudaMalloc((void **)&dev_arr2, arr_size), "cudaMalloc(dev_arr2)");
    checkCuda(cudaMalloc((void **)&dev_arr3, arr_size), "cudaMalloc(dev_arr3)");

    // Upload the two input vectors.
    checkCuda(cudaMemcpy(dev_arr1, host_arr1, arr_size, cudaMemcpyHostToDevice), "upload of array 1");
    checkCuda(cudaMemcpy(dev_arr2, host_arr2, arr_size, cudaMemcpyHostToDevice), "upload of array 2");

    // Launch a single block that holds all N threads.
    // If N is larger than the device allows in one block, the launch is
    // rejected; the check right after the launch reports that instead of
    // letting the program print an unmodified result buffer.
    vecAddKernel<<<1, N>>>(dev_arr1, dev_arr2, dev_arr3, N);
    checkCuda(cudaGetLastError(), "vecAddKernel launch");

    // Blocking copy of the result; it also reports errors raised while the kernel ran.
    checkCuda(cudaMemcpy(host_arr3, dev_arr3, arr_size, cudaMemcpyDeviceToHost), "download of result");

    // Release the device buffers.
    checkCuda(cudaFree(dev_arr1), "cudaFree(dev_arr1)");
    checkCuda(cudaFree(dev_arr2), "cudaFree(dev_arr2)");
    checkCuda(cudaFree(dev_arr3), "cudaFree(dev_arr3)");

    printf("\n Result Array 3 of %d elements after addition is: \n", N);
    for (int f = 0; f < N; f++)
        printf("%.0f ", host_arr3[f]);
    return 0;
}


 Initial Array 1 of 1024 elements is: 
53 56 67 65 63 65 56 62 59 51 52 57 60 69 53 56 50 56 62 66 61 58 57 59 52 60 52 53 57 65 59 52 52 68 59 57 63 66 61 52 59 63 51 69 54 67 68 54 65 60 63 56 61 50 66 63 52 60 66 51 55 55 54 57 66 55 56 59 63 67 54 65 52 55 64 57 64 54 53 60 57 58 66 68 58 54 53 61 64 69 62 50 66 58 69 62 56 56 64 69 65 60 64 68 57 51 67 52 67 62 62 66 51 50 56 51 55 59 54 69 50 59 61 67 67 61 51 65 59 57 57 66 67 63 56 55 56 53 69 54 58 61 62 59 53 69 60 58 58 65 50 59 66 53 68 55 56 61 51 65 69 58 54 58 51 60 63 50 64 54 54 64 57 66 53 61 57 55 69 66 62 61 67 58 65 57 64 51 68 65 59 67 65 63 68 58 53 61 58 59 66 54 53 53 63 68 56 50 54 68 58 58 59 67 67 66 54 53 60 53 60 69 62 55 54 60 55 69 64 56 59 52 52 54 67 57 65 54 58 61 52 58 69 53 56 58 50 52 61 60 55 51 51 60 68 55 50 66 54 56 52 55 58 66 52 68 54 67 52 54 50 66 62 69 69 60 58 61 63 61 51 60 63 64 50 53 69 51 69 66 69 63 63 58 60 65 56 56 54 50 60 54 66 52 66 57 55 66 69 68 57 62 58 62 69 59 66 60 52 57 

In [38]:
%%cu
#include <stdio.h>
#include <stdlib.h>

/*
 * Vector addition kernel: writes dev_arr1[i] + dev_arr2[i] into dev_arr3[i].
 *
 * One element per thread, indexed by the thread's global position along x.
 * The comparison against N keeps threads in a partially used last block
 * from touching memory past the end of the arrays.
 */
__global__ void vecAddKernel(float *dev_arr1, float *dev_arr2, float *dev_arr3, int N)
{
    const int globalIdx = threadIdx.x + blockIdx.x * blockDim.x;
    const bool inRange = globalIdx < N;

    if (inRange)
        dev_arr3[globalIdx] = dev_arr2[globalIdx] + dev_arr1[globalIdx];
}

// Stop the program with a readable message if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "\n CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Adds two N-element vectors on the GPU with a fixed 256 threads per block
 * and as many blocks as are needed to cover N elements, then prints both
 * inputs and the result.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main()
{
    const int N = 1024;                        // vector length
    const size_t arr_size = N * sizeof(float); // bytes per vector

    const int threadsPerBlock = 256;
    // Integer ceiling division: enough blocks to cover N even when N is not
    // a multiple of the block size.
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    float *dev_arr1 = NULL, *dev_arr2 = NULL, *dev_arr3 = NULL;
    float host_arr1[N], host_arr2[N], host_arr3[N];

    // Fill both inputs with pseudo-random whole numbers in the range 50..69.
    for (int f = 0; f < N; f++) host_arr1[f] = (rand() % 20) + 50;
    for (int f = 0; f < N; f++) host_arr2[f] = (rand() % 20) + 50;

    printf("\n Initial Array 1 of %d elements is: \n", N);
    for (int f = 0; f < N; f++) printf("%.0f ", host_arr1[f]);
    printf("\n Initial Array 2 of %d elements is: \n", N);
    for (int f = 0; f < N; f++) printf("%.0f ", host_arr2[f]);

    // Allocate device buffers for the two inputs and the result.
    checkCuda(cudaMalloc((void **)&dev_arr1, arr_size), "cudaMalloc(dev_arr1)");
    checkCuda(cudaMalloc((void **)&dev_arr2, arr_size), "cudaMalloc(dev_arr2)");
    checkCuda(cudaMalloc((void **)&dev_arr3, arr_size), "cudaMalloc(dev_arr3)");

    // Upload the two input vectors.
    checkCuda(cudaMemcpy(dev_arr1, host_arr1, arr_size, cudaMemcpyHostToDevice), "upload of array 1");
    checkCuda(cudaMemcpy(dev_arr2, host_arr2, arr_size, cudaMemcpyHostToDevice), "upload of array 2");

    // Launch the addition; threads beyond N are filtered out inside the kernel.
    vecAddKernel<<<blocks, threadsPerBlock>>>(dev_arr1, dev_arr2, dev_arr3, N);
    // A launch has no return value; a rejected configuration only shows up here.
    checkCuda(cudaGetLastError(), "vecAddKernel launch");

    // Blocking copy of the result; it also reports errors raised while the kernel ran.
    checkCuda(cudaMemcpy(host_arr3, dev_arr3, arr_size, cudaMemcpyDeviceToHost), "download of result");

    // Release the device buffers.
    checkCuda(cudaFree(dev_arr1), "cudaFree(dev_arr1)");
    checkCuda(cudaFree(dev_arr2), "cudaFree(dev_arr2)");
    checkCuda(cudaFree(dev_arr3), "cudaFree(dev_arr3)");

    printf("\n Result Array 3 of %d elements after addition is: \n", N);
    for (int f = 0; f < N; f++)
        printf("%.0f ", host_arr3[f]);
    return 0;
}


 Initial Array 1 of 1024 elements is: 
53 56 67 65 63 65 56 62 59 51 52 57 60 69 53 56 50 56 62 66 61 58 57 59 52 60 52 53 57 65 59 52 52 68 59 57 63 66 61 52 59 63 51 69 54 67 68 54 65 60 63 56 61 50 66 63 52 60 66 51 55 55 54 57 66 55 56 59 63 67 54 65 52 55 64 57 64 54 53 60 57 58 66 68 58 54 53 61 64 69 62 50 66 58 69 62 56 56 64 69 65 60 64 68 57 51 67 52 67 62 62 66 51 50 56 51 55 59 54 69 50 59 61 67 67 61 51 65 59 57 57 66 67 63 56 55 56 53 69 54 58 61 62 59 53 69 60 58 58 65 50 59 66 53 68 55 56 61 51 65 69 58 54 58 51 60 63 50 64 54 54 64 57 66 53 61 57 55 69 66 62 61 67 58 65 57 64 51 68 65 59 67 65 63 68 58 53 61 58 59 66 54 53 53 63 68 56 50 54 68 58 58 59 67 67 66 54 53 60 53 60 69 62 55 54 60 55 69 64 56 59 52 52 54 67 57 65 54 58 61 52 58 69 53 56 58 50 52 61 60 55 51 51 60 68 55 50 66 54 56 52 55 58 66 52 68 54 67 52 54 50 66 62 69 69 60 58 61 63 61 51 60 63 64 50 53 69 51 69 66 69 63 63 58 60 65 56 56 54 50 60 54 66 52 66 57 55 66 69 68 57 62 58 62 69 59 66 60 52 57 

# Q2. Write and execute a CUDA program to read an array of N integer values. Sort the array in parallel using parallel selection sort and store the result in another array.

In [39]:
%%cu
#include <stdio.h>
#include <stdlib.h>

/*
 * Parallel sort by ranking: each thread takes one element of dev_arr1,
 * counts how many elements must come before it, and writes the element to
 * that position in dev_arr2.
 *
 * An element comes before another when it is smaller, or when it is equal
 * and has a lower index; the index tie-break gives equal values distinct
 * output positions.
 *
 * dev_arr1 : input array of N elements (read only by this kernel)
 * dev_arr2 : output array of N elements, receives the sorted values
 * N        : number of elements
 *
 * One thread per element along x; threads with an index of N or more exit
 * without touching memory.
 */
__global__ void vecSelectionSortKernel(float *dev_arr1, float *dev_arr2, int N)
{
    int threadId = threadIdx.x + blockDim.x * blockIdx.x;

    // Without this guard a launch with more threads than elements would read
    // and write past the end of both arrays.
    if (threadId >= N)
        return;

    // Keep the element as a float: storing it in an int would truncate
    // fractional values, giving different elements the same rank.
    const float data = dev_arr1[threadId];
    int pos = 0;
    for (int i = 0; i < N; i++)
    {
        const float other = dev_arr1[i];
        if ((other < data) || (other == data && i < threadId))
            pos++;
    }

    dev_arr2[pos] = data;
}

// Stop the program with a readable message if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "\n CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills an N-element array with pseudo-random values, sorts it on the GPU
 * into a second array with vecSelectionSortKernel, and prints the array
 * before and after sorting.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main()
{
    const int N = 1024;                        // number of elements
    const size_t arr_size = N * sizeof(float); // bytes per array

    float *dev_arr1 = NULL, *dev_arr2 = NULL;
    float host_arr1[N], host_arr2[N];

    // Fill the input with pseudo-random whole numbers in the range 50..98.
    for (int f = 0; f < N; f++) host_arr1[f] = (rand() % 49) + 50;

    printf("\n Array 0f %d elements before sorting is: \n", N);
    for (int f = 0; f < N; f++) printf("%.0f ", host_arr1[f]);

    // Allocate device buffers for the unsorted input and the sorted output.
    checkCuda(cudaMalloc((void **)&dev_arr1, arr_size), "cudaMalloc(dev_arr1)");
    checkCuda(cudaMalloc((void **)&dev_arr2, arr_size), "cudaMalloc(dev_arr2)");

    // Upload the unsorted input.
    checkCuda(cudaMemcpy(dev_arr1, host_arr1, arr_size, cudaMemcpyHostToDevice), "upload of input array");

    // Launch one block per element with a single thread in each block.
    vecSelectionSortKernel<<<N, 1>>>(dev_arr1, dev_arr2, N);
    // A launch has no return value; a rejected configuration only shows up here.
    checkCuda(cudaGetLastError(), "vecSelectionSortKernel launch");

    // Blocking copy of the sorted output; it also reports errors raised
    // while the kernel ran.
    checkCuda(cudaMemcpy(host_arr2, dev_arr2, arr_size, cudaMemcpyDeviceToHost), "download of sorted array");

    // Release the device buffers.
    checkCuda(cudaFree(dev_arr1), "cudaFree(dev_arr1)");
    checkCuda(cudaFree(dev_arr2), "cudaFree(dev_arr2)");

    printf("\n Array 0f %d elements after sorting is: \n", N);
    for (int f = 0; f < N; f++) printf("%.0f ", host_arr2[f]);
    return 0;
}


 Array 0f 1024 elements before sorting is: 
65 89 87 55 79 60 74 73 58 67 59 97 56 68 70 71 67 72 73 73 53 71 55 87 80 97 83 98 79 55 55 50 95 97 60 80 58 85 59 66 58 68 69 64 91 95 91 59 73 65 88 76 92 93 65 73 96 54 72 81 59 77 82 55 81 97 90 89 88 50 62 53 73 81 67 66 82 64 80 57 85 74 83 78 68 54 57 66 58 79 97 68 62 85 78 93 89 70 89 78 75 57 81 50 93 55 71 77 74 57 84 65 82 73 98 56 78 56 77 86 90 81 60 54 67 89 53 62 65 97 96 95 55 83 51 50 93 72 82 69 85 67 89 68 95 89 74 74 95 53 67 87 89 82 96 62 77 55 75 92 54 72 94 64 56 95 64 56 69 52 75 60 74 65 78 71 60 58 95 57 66 68 50 56 52 52 69 79 57 94 78 61 72 73 81 83 69 95 89 93 98 70 54 73 91 87 50 52 96 52 64 68 70 69 75 77 71 50 63 79 50 91 95 72 70 77 61 94 79 57 89 83 82 98 62 79 92 63 82 94 65 52 63 90 72 93 69 98 93 82 78 93 79 80 71 50 58 88 94 92 95 89 76 83 94 94 64 87 58 52 87 78 54 51 69 81 95 93 81 94 76 65 89 61 95 66 61 55 55 62 53 56 57 80 90 52 75 60 94 88 62 82 67 66 89 91 54 90 86 85 85 68 51 80 85 97 97 52 5

# Q3. Write and execute a CUDA program to read an integer array of size N. Sort this array using odd-even transposition sorting. Use 2 kernels.

In [40]:
%%cu
#include <stdio.h>
#include <stdlib.h>

/*
 * One phase of odd-even transposition sort: every thread with an odd index
 * i compares dev_arr[i] with dev_arr[i + 1] and swaps them when they are
 * out of ascending order.
 *
 * dev_arr : array of N elements, modified in place
 * N       : number of elements
 *
 * One thread per array index along x. Even-indexed threads, and any thread
 * whose right-hand neighbour would lie outside the array, do nothing.
 */
__global__ void vecTranspositionSortingOddEvenKernel(float *dev_arr, int N)
{
    int threadId = threadIdx.x + blockDim.x * blockIdx.x;

    if ((threadId % 2) != 0 && threadId + 1 < N)
    {
        if (dev_arr[threadId] > dev_arr[threadId + 1])
        {
            // The temporary has the element type; an int here would drop the
            // fractional part of the value being swapped.
            float temp = dev_arr[threadId];
            dev_arr[threadId] = dev_arr[threadId + 1];
            dev_arr[threadId + 1] = temp;
        }
    }
}

/*
 * One phase of odd-even transposition sort: every thread with an even index
 * i compares dev_arr[i] with dev_arr[i + 1] and swaps them when they are
 * out of ascending order.
 *
 * dev_arr : array of N elements, modified in place
 * N       : number of elements
 *
 * One thread per array index along x. Odd-indexed threads, and any thread
 * whose right-hand neighbour would lie outside the array, do nothing.
 */
__global__ void vecTranspositionSortingEvenOddKernel(float *dev_arr, int N)
{
    int threadId = threadIdx.x + blockDim.x * blockIdx.x;

    if ((threadId % 2) == 0 && threadId + 1 < N)
    {
        if (dev_arr[threadId] > dev_arr[threadId + 1])
        {
            // The temporary has the element type; an int here would drop the
            // fractional part of the value being swapped.
            float temp = dev_arr[threadId];
            dev_arr[threadId] = dev_arr[threadId + 1];
            dev_arr[threadId + 1] = temp;
        }
    }
}

// Stop the program with a readable message if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "\n CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills an N-element array with pseudo-random values, sorts it in place on
 * the GPU by alternating the two transposition kernels, and prints the
 * array before and after sorting.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main()
{
    const int N = 1024;                        // number of elements
    const size_t arr_size = N * sizeof(float); // bytes in the array

    const int threadsPerBlock = 256;
    // Integer ceiling division: enough blocks to give every index a thread.
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    // Each pass runs two phases. Rounding N / 2 up keeps the phase count at
    // N or more when N is odd; rounding down would stop one phase short.
    const int passes = (N + 1) / 2;

    float *dev_arr = NULL;
    float host_arr[N];

    // Fill the array with pseudo-random whole numbers in the range 50..98.
    for (int f = 0; f < N; f++) host_arr[f] = (rand() % 49) + 50;

    printf("\n Array of %d elements before sorting is: \n", N);
    for (int f = 0; f < N; f++) printf("%.0f ", host_arr[f]);

    // Allocate the device buffer and upload the unsorted array.
    checkCuda(cudaMalloc((void **)&dev_arr, arr_size), "cudaMalloc(dev_arr)");
    checkCuda(cudaMemcpy(dev_arr, host_arr, arr_size, cudaMemcpyHostToDevice), "upload of input array");

    // Alternate the odd-indexed and even-indexed compare-and-swap phases.
    // Both kernels are launched without a stream argument, so they are queued
    // on the same stream and each phase runs after the previous one.
    for (int i = 0; i < passes; i++)
    {
        vecTranspositionSortingOddEvenKernel<<<blocks, threadsPerBlock>>>(dev_arr, N);
        vecTranspositionSortingEvenOddKernel<<<blocks, threadsPerBlock>>>(dev_arr, N);
    }
    // A launch has no return value; a rejected configuration only shows up here.
    checkCuda(cudaGetLastError(), "transposition sort kernel launches");

    // Blocking copy of the sorted array; it also reports errors raised while
    // the kernels ran.
    checkCuda(cudaMemcpy(host_arr, dev_arr, arr_size, cudaMemcpyDeviceToHost), "download of sorted array");
    checkCuda(cudaFree(dev_arr), "cudaFree(dev_arr)");

    printf("\n Array of %d elements after sorting is: \n", N);
    for (int f = 0; f < N; f++) printf("%.0f ", host_arr[f]);
    return 0;
}


 Array of 1024 elements before sorting is: 
65 89 87 55 79 60 74 73 58 67 59 97 56 68 70 71 67 72 73 73 53 71 55 87 80 97 83 98 79 55 55 50 95 97 60 80 58 85 59 66 58 68 69 64 91 95 91 59 73 65 88 76 92 93 65 73 96 54 72 81 59 77 82 55 81 97 90 89 88 50 62 53 73 81 67 66 82 64 80 57 85 74 83 78 68 54 57 66 58 79 97 68 62 85 78 93 89 70 89 78 75 57 81 50 93 55 71 77 74 57 84 65 82 73 98 56 78 56 77 86 90 81 60 54 67 89 53 62 65 97 96 95 55 83 51 50 93 72 82 69 85 67 89 68 95 89 74 74 95 53 67 87 89 82 96 62 77 55 75 92 54 72 94 64 56 95 64 56 69 52 75 60 74 65 78 71 60 58 95 57 66 68 50 56 52 52 69 79 57 94 78 61 72 73 81 83 69 95 89 93 98 70 54 73 91 87 50 52 96 52 64 68 70 69 75 77 71 50 63 79 50 91 95 72 70 77 61 94 79 57 89 83 82 98 62 79 92 63 82 94 65 52 63 90 72 93 69 98 93 82 78 93 79 80 71 50 58 88 94 92 95 89 76 83 94 94 64 87 58 52 87 78 54 51 69 81 95 93 81 94 76 65 89 61 95 66 61 55 55 62 53 56 57 80 90 52 75 60 94 88 62 82 67 66 89 91 54 90 86 85 85 68 51 80 85 97 97 52 5