<a href="https://colab.research.google.com/github/taqihaider7/C4AI-BIRDS-CUDA-Programming/blob/master/BIRDS_CUDA_Cohort_Week_3_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Q1. Write a simple CUDA kernel that takes an array of integers and doubles each element.**

In [None]:
! touch add_basic.cu

"""
#include <stdio.h>
#include <cuda_runtime.h>

// Doubles every element of `arr` in place, one element per thread.
//   arr : array of `n` ints; it is dereferenced inside the kernel, so it must
//         be memory the device can access.
//   n   : number of elements in `arr`.
// Expects a 1D launch. Threads whose flat index falls at or beyond `n`
// return without touching memory, so the grid may be larger than the array.
__global__ void doubleArray(int* arr, int n)
{
    // Flat global position of this thread across the whole 1D grid.
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block have nothing to do.
    if (gid >= n)
    {
        return;
    }

    arr[gid] = arr[gid] * 2;
}

// Reports a failed CUDA runtime call on stderr.
//   err  : status returned by the call being checked.
//   what : short label for the call, included in the message.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        return false;
    }
    return true;
}

// Fills a host array with 0..n-1, doubles it on the GPU with doubleArray,
// copies the result back and prints it.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    const int n = 100; // size of the array
    const size_t bytes = n * sizeof(int);

    // Host-side buffer. Memory returned by cudaMalloc must not be
    // dereferenced on the host, so the data is prepared and read back here.
    int h_arr[n];
    for (int i = 0; i < n; i++)
    {
        h_arr[i] = i;
    }

    // Device-side buffer the kernel operates on.
    int* d_arr = NULL;
    if (!cudaOk(cudaMalloc((void**)&d_arr, bytes), "cudaMalloc"))
    {
        return 1;
    }

    // copy the array to the GPU
    bool ok = cudaOk(cudaMemcpy(d_arr, h_arr, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy host-to-device");

    if (ok)
    {
        // launch the kernel: enough 256-thread blocks to cover n elements
        int blockSize = 256;
        int numBlocks = (n + blockSize - 1) / blockSize;
        doubleArray<<<numBlocks, blockSize>>>(d_arr, n);

        // A launch returns no status: cudaGetLastError reports launch
        // failures, and the synchronize reports faults raised while the
        // kernel was running.
        ok = cudaOk(cudaGetLastError(), "doubleArray launch")
          && cudaOk(cudaDeviceSynchronize(), "doubleArray execution");
    }

    if (ok)
    {
        // copy the result back to the host
        ok = cudaOk(cudaMemcpy(h_arr, d_arr, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy device-to-host");
    }

    if (ok)
    {
        // print the result
        for (int i = 0; i < n; i++)
        {
            printf("%d ", h_arr[i]);
        }
        printf("\n");
    }

    // free the device memory on every path once the allocation succeeded
    cudaFree(d_arr);

    return ok ? 0 : 1;
}
"""

'\n#include <stdio.h>\n#include <cuda_runtime.h>\n\n__global__ void doubleArray(int* arr, int n) \n{\n    int idx = blockIdx.x * blockDim.x + threadIdx.x;\n    if (idx < n) \n    {\n        arr[idx] *= 2;\n    }\n}\n\nint main() \n{\n    int n = 100; // size of the array\n    int* arr;\n    cudaMalloc((void**)&arr, n * sizeof(int));\n\n    // initialize the array\n    for (int i = 0; i < n; i++) \n    {\n        arr[i] = i;\n    }\n\n    // copy the array to the GPU\n    cudaMemcpy(arr, arr, n * sizeof(int), cudaMemcpyHostToDevice);\n\n    // launch the kernel\n    int blockSize = 256;\n    int numBlocks = (n + blockSize - 1) / blockSize;\n    doubleArray<<<numBlocks, blockSize>>>(arr, n);\n\n    // copy the result back to the host\n    cudaMemcpy(arr, arr, n * sizeof(int), cudaMemcpyDeviceToHost);\n\n    // print the result\n    for (int i = 0; i < n; i++) \n    {\n        printf("%d ", arr[i]);\n    }\n    printf("\n");\n\n    // free the memory\n    cudaFree(arr);\n\n    // Wait for

**Q2. Write a CUDA kernel to initialize an array of integers with the index value.**

In [None]:
! touch add_basic.cu

"""
#include <iostream>
#include <cuda_runtime.h>

using namespace std;

// Writes each thread's flat global index into the matching slot of `array`,
// so that array[i] == i for every launched thread i.
//   array : output buffer; written from inside the kernel, so it must be
//           memory the device can access.
// Expects a 1D launch. There is no bounds check here: the caller must launch
// no more threads in total than `array` has elements.
__global__ void initialize_array(int *array)
{
    // Flat global position of this thread across the whole 1D grid.
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;

    // Each element stores its own position.
    array[tid] = tid;
}

// Reports a failed CUDA runtime call on stderr.
//   err  : status returned by the call being checked.
//   what : short label for the call, included in the message.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(err) << endl;
        return false;
    }
    return true;
}

// Allocates a 10-element device array, fills it with its own indices using
// initialize_array, copies it back to the host and prints it.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    const int array_size = 10;
    int *d_array = nullptr;

    // Allocate memory on GPU
    if (!cuda_ok(cudaMalloc((void**)&d_array, array_size * sizeof(int)), "cudaMalloc")) {
        return 1;
    }

    // Launch the CUDA kernel to initialize the array.
    // One block of exactly array_size threads: the kernel has no bounds
    // check, so the thread count must not exceed the allocation.
    initialize_array<<<1, array_size>>>(d_array);

    // A launch returns no status: cudaGetLastError reports launch failures,
    // and the synchronize reports faults raised while the kernel was running.
    bool ok = cuda_ok(cudaGetLastError(), "initialize_array launch")
           && cuda_ok(cudaDeviceSynchronize(), "initialize_array execution");

    // Copy data from device to host
    int h_array[array_size];
    if (ok) {
        ok = cuda_ok(cudaMemcpy(h_array, d_array, array_size * sizeof(int), cudaMemcpyDeviceToHost),
                     "cudaMemcpy device-to-host");
    }

    // Print the initialized array only when the copy succeeded; otherwise
    // h_array would hold uninitialized values.
    if (ok) {
        cout << "Initialized Array:" << endl;
        for (int i = 0; i < array_size; ++i) {
            cout << h_array[i] << " ";
        }
        cout << endl;
    }

    // Free GPU memory on every path once the allocation succeeded
    cudaFree(d_array);

    return ok ? 0 : 1;
}
"""

'\n#include <iostream>\n#include <cuda_runtime.h>\n\nusing namespace std;\n\n__global__ void initialize_array(int *array)\n{\n    // Calculate the index for the current thread\n    int index = blockIdx.x * blockDim.x + threadIdx.x;\n\n    // Initialize the array element at the calculated index with its index value\n    array[index] = index;\n}\n\nint main()\n{\n    const int array_size = 10;\n    int *d_array;\n\n    // Allocate memory on GPU\n    cudaMalloc((void**)&d_array, array_size * sizeof(int));\n\n    // Launch the CUDA kernel to initialize the array\n    // Specify 1 block with 10 threads, assuming array_size is small for simplicity\n    initialize_array<<<1, array_size>>>(d_array);\n\n    // Copy data from device to host\n    int h_array[array_size];\n    cudaMemcpy(h_array, d_array, array_size * sizeof(int), cudaMemcpyDeviceToHost);\n\n    // Print the initialized array\n    cout << "Initialized Array:" << endl;\n    for (int i = 0; i < array_size; ++i) {\n        cout << h_

**Q3 [OPTIONAL]. How do you check for and handle errors in CUDA API calls and kernel launches?**

In [None]:
# Check CUDA API calls: Use the return value of each CUDA API call and compare it against "cudaSuccess"

"""
// Capture the cudaError_t status returned by the API call instead of discarding it.
// (`devicePtr` and `size` are placeholders for the caller's own variables.)
cudaError_t err = cudaMalloc(&devicePtr, size);
// Any value other than cudaSuccess means the call failed.
if (err != cudaSuccess) {
    // cudaGetErrorString converts the status code into a readable message.
    fprintf(stderr, "CUDA Error: %s\n", cudaGetErrorString(err));
    // Handle error (e.g., clean up resources, exit)
}
"""

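# Check kernel launches: A kernel launch returns no status, so call cudaGetLastError right after the launch to detect launch errors (e.g., an invalid launch configuration); errors raised while the kernel runs surface at the next synchronizing call.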

"""
// The <<<...>>> launch expression yields no status value of its own.
// (`kernel`, `gridSize`, `blockSize` and `parameters` are placeholders.)
kernel<<<gridSize, blockSize>>>(parameters);
// Query the runtime for the most recent error immediately after the launch.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
    // cudaGetErrorString converts the status code into a readable message.
    fprintf(stderr, "Kernel Launch Error: %s\n", cudaGetErrorString(err));
    // Handle error (e.g., clean up resources, exit)
}
"""

# Synchronize and check for runtime errors: Use cudaDeviceSynchronize to ensure all preceding operations are complete
# and check for errors that occurred during execution.


"""
// NOTE: `err` is assigned here, not declared, so this fragment assumes a
// cudaError_t named `err` is already in scope (e.g., from a preceding check).
// The return value of cudaDeviceSynchronize is captured and tested like any other API status.
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
    // cudaGetErrorString converts the status code into a readable message.
    fprintf(stderr, "Post-Kernel Synchronization Error: %s\n", cudaGetErrorString(err));
    // Handle error (e.g., clean up resources, exit)
}
"""