<a href="https://colab.research.google.com/github/yashrohilla25/cudalab5/blob/main/parallelab5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
%%writefile vector_add_static.cu

#include <stdio.h>
#include <cuda.h>

#define N 1024 * 1024 // 1M elements

// 1.1 Static global memory (symbol memory)
__device__ float d_A[N];
__device__ float d_B[N];
__device__ float d_C[N];

__global__ void vectorAdd() {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        d_C[i] = d_A[i] + d_B[i];
}

// Host arrays
float h_A[N], h_B[N], h_C[N];

int main() {
    // Initialize input vectors
    for (int i = 0; i < N; i++) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }

    // Copy data from host to device symbols
    cudaMemcpyToSymbol(d_A, h_A, sizeof(float) * N);
    cudaMemcpyToSymbol(d_B, h_B, sizeof(float) * N);

    // Timing the kernel execution
    cudaEvent_t start, stop;
    float milliseconds = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    // Launch kernel
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;
    vectorAdd<<<numBlocks, blockSize>>>();

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    // Copy result from device to host
    cudaMemcpyFromSymbol(h_C, d_C, sizeof(float) * N);

    // 1.2 Print time in ms
    printf("Kernel Execution Time: %f ms\n", milliseconds);

    // 1.3 Query device properties
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    float memClock = prop.memoryClockRate;    // kHz
    float memBusWidth = prop.memoryBusWidth;  // bits

    // Calculate theoretical bandwidth (GB/s)
    float theoreticalBW = 2.0 * memClock * memBusWidth / 8.0 / 1e6;
    printf("Theoretical Bandwidth: %f GB/s\n", theoreticalBW);

    // 1.4 Calculate measured bandwidth
    float RBytes = 2 * N * sizeof(float); // Read A and B
    float WBytes = N * sizeof(float);     // Write C
    float timeSec = milliseconds / 1000.0;
    float measuredBW = (RBytes + WBytes) / (timeSec * 1e9);
    printf("Measured Bandwidth: %f GB/s\n", measuredBW);

    return 0;
}


Overwriting vector_add_static.cu


In [8]:
!nvcc -o vector_add_static vector_add_static.cu


In [6]:
!nvprof ./vector_add_static


==494== NVPROF is profiling process 494, command: ./vector_add_static
Kernel Execution Time: 0.027616 ms
Theoretical Bandwidth: 320.063995 GB/s
Measured Bandwidth: 455.638489 GB/s
==494== Profiling application: ./vector_add_static
==494== Profiling result:
No kernels were profiled.
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
      API calls:   99.84%  204.41ms         2  102.21ms  4.5270us  204.41ms  cudaMemcpyToSymbol
                    0.07%  134.04us       114  1.1750us     106ns  55.077us  cuDeviceGetAttribute
                    0.06%  130.80us         1  130.80us  130.80us  130.80us  cudaGetDeviceProperties
                    0.01%  14.536us         2  7.2680us  2.6200us  11.916us  cudaEventRecord
                    0.01%  11.311us         2  5.6550us     716ns  10.595us  cudaEventCreate
                    0.01%  11.093us         1  11.093us  11.093us  11.093us  cuDeviceGetName
                    0.00%  6.5380us         1  6.5380us  6.53

// extra codes for understanding basic codes


In [11]:
code = """
#include <iostream>
#include <vector>
#include <chrono>
#include <cstdlib>

int main() {
    const int N = 1 << 20; // 1 million elements
    std::vector<float> A(N), B(N), C(N);

    // Initialize vectors
    for (int i = 0; i < N; ++i) {
        A[i] = static_cast<float>(rand()) / RAND_MAX;
        B[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    auto start = std::chrono::high_resolution_clock::now();

    // Vector addition
    for (int i = 0; i < N; ++i) {
        C[i] = A[i] + B[i];
    }

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> elapsed = end - start;

    std::cout << "Serial execution time: " << elapsed.count() << " ms\\n";

    std::cout << "Sample Result (First 10 elements):\\n";
    for (int i = 0; i < 10; ++i) {
        std::cout << C[i] << " ";
    }
    std::cout << std::endl;

    return 0;
}
"""

# Save to a .cpp file
with open("serial_vector_add.cpp", "w") as f:
    f.write(code)

# Compile and run
!g++ serial_vector_add.cpp -o serial_vector_add
!./serial_vector_add


Serial execution time: 8.1395 ms
Sample Result (First 10 elements):
1.23457 1.58154 1.1092 1.10345 0.831745 1.10627 0.878185 1.86842 1.35301 0.748571 


In [12]:
%%writefile vector_add.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cstdlib>

__global__ void vectorAdd(const float* A, const float* B, float* C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

int main() {
    const int N = 1 << 20;
    size_t size = N * sizeof(float);

    // Allocate host memory
    float *h_A = new float[N];
    float *h_B = new float[N];
    float *h_C = new float[N];

    // Initialize input vectors
    for (int i = 0; i < N; ++i) {
        h_A[i] = static_cast<float>(rand()) / RAND_MAX;
        h_B[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    // Copy input vectors to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Time measurement
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    // Launch kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    cudaDeviceSynchronize();

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    // Copy result back to host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    std::cout << "CUDA execution time: " << milliseconds << " ms\\n";

    std::cout << "Sample Result (First 10 elements):\\n";
    for (int i = 0; i < 10; ++i) {
        std::cout << h_C[i] << " ";
    }
    std::cout << std::endl;

    // Free memory
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}


Writing vector_add.cu


In [13]:
!nvcc   vector_add.cu -o vector_add -arch=sm_75
!./vector_add


CUDA execution time: 0.152864 ms\nSample Result (First 10 elements):\n1.23457 1.58154 1.1092 1.10345 0.831745 1.10627 0.878185 1.86842 1.35301 0.748571 
