<a href="https://colab.research.google.com/github/yashrohilla25/cudalab6/blob/main/parallellabn6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%writefile vector.cu
#include <iostream>
#include <cmath>
#include <cuda.h>
#include <chrono>

#define N 1000000  // Vector size

__global__ void vectorAdd(float *a, float *b, float *c) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) c[i] = a[i] + b[i];
}

__global__ void vectorMul(float *a, float *b, float *c) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) c[i] = a[i] * b[i];
}

__global__ void vectorSqrt(float *a, float *c) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) c[i] = sqrtf(a[i]);
}

void initialize(float *a, float *b) {
    for (int i = 0; i < N; i++) {
        a[i] = (float)(i + 1);
        b[i] = (float)(i + 2);
    }
}

void measureExecutionTime(void (*kernel)(float *, float *, float *), float *a, float *b, float *c, const char* name) {
    float *d_a, *d_b, *d_c;
    size_t size = N * sizeof(float);

    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    auto start = std::chrono::high_resolution_clock::now();

    kernel<<<(N + 255) / 256, 256>>>(d_a, d_b, d_c);
    cudaDeviceSynchronize();

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;

    std::cout << name << " Time: " << elapsed.count() << " seconds\n";

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
}

int main() {
    float *a = new float[N];
    float *b = new float[N];
    float *c = new float[N];

    initialize(a, b);

    // Measure addition
    measureExecutionTime(vectorAdd, a, b, c, "Addition");

    // Measure multiplication
    measureExecutionTime(vectorMul, a, b, c, "Multiplication");

    // Measure square root
    float *d_a, *d_c;
    size_t size = N * sizeof(float);
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_c, size);
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);

    auto start = std::chrono::high_resolution_clock::now();
    vectorSqrt<<<(N + 255) / 256, 256>>>(d_a, d_c);
    cudaDeviceSynchronize();
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;
    std::cout << "Square Root Time: " << elapsed.count() << " seconds\n";

    cudaFree(d_a); cudaFree(d_c);

    delete[] a; delete[] b; delete[] c;
    return 0;
}


Writing vector.cu


In [None]:
!nvcc vector.cu -o vector
!./vector


Addition Time: 6.05e-06 seconds
Multiplication Time: 5.63e-07 seconds
Square Root Time: 3.9e-07 seconds


In [None]:
%%writefile sqrt.cu
#include <iostream>
#include <cmath>
#include <cuda.h>

#define N 1024  // You can increase this

// CUDA kernel to compute square root
__global__ void vectorSqrt(float *A, float *C) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < N) {
        C[idx] = sqrtf(A[idx]);
    }
}

int main() {
    float *h_A, *h_C;
    float *d_A, *d_C;
    size_t size = N * sizeof(float);

    // Allocate host memory
    h_A = new float[N];
    h_C = new float[N];

    // Initialize host array
    for (int i = 0; i < N; i++) {
        h_A[i] = i * 1.0f;  // Fill A with 0.0, 1.0, 2.0, ...
    }

    // Allocate device memory
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_C, size);

    // Copy input from host to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    // Launch kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vectorSqrt<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C);

    // Copy result back to host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Print some results
    std::cout << "A[i] -> C[i] (sqrt)\n";
    for (int i = 0; i < 10; i++) {
        std::cout << h_A[i] << " -> " << h_C[i] << "\n";
    }

    // Cleanup
    delete[] h_A;
    delete[] h_C;
    cudaFree(d_A);
    cudaFree(d_C);

    return 0;
}


Writing sqrt.cu


In [None]:
!nvcc sqrt.cu -o sqrt -arch sm_75

In [None]:
!./sqrt

A[i] -> C[i] (sqrt)
0 -> 0
1 -> 1
2 -> 1.41421
3 -> 1.73205
4 -> 2
5 -> 2.23607
6 -> 2.44949
7 -> 2.64575
8 -> 2.82843
9 -> 3


In [5]:
%%writefile mulsqrt.cu
#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <chrono>

__global__ void computeSqrt(float *A, float *C, int N) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < N) {
        C[idx] = sqrtf(A[idx]);
    }
}

void runTest(int N) {
    float *h_A = new float[N];
    float *h_C = new float[N];
    float *d_A, *d_C;

    for (int i = 0; i < N; ++i) {
        h_A[i] = i * 1.0f;
    }

    cudaMalloc(&d_A, N * sizeof(float));
    cudaMalloc(&d_C, N * sizeof(float));
    cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice);

    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;

    auto start = std::chrono::high_resolution_clock::now();
    computeSqrt<<<numBlocks, blockSize>>>(d_A, d_C, N);
    cudaDeviceSynchronize();
    auto end = std::chrono::high_resolution_clock::now();

    cudaMemcpy(h_C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost);

    std::chrono::duration<double, std::milli> duration = end - start;
    printf("N = %d\tTime = %.4f ms\n", N, duration.count());

    cudaFree(d_A);
    cudaFree(d_C);
    delete[] h_A;
    delete[] h_C;
}

int main() {
    int sizes[] = {50000, 500000, 5000000, 50000000};
    for (int i = 0; i < 4; ++i) {
        runTest(sizes[i]);
    }
    return 0;
}


Overwriting mulsqrt.cu


In [6]:
!nvcc mulsqrt.cu -o mulsqrt -arch sm_75
!./mulsqrt


N = 50000	Time = 0.1230 ms
N = 500000	Time = 0.1134 ms
N = 5000000	Time = 0.2676 ms
N = 50000000	Time = 1.8581 ms
