<a href="https://colab.research.google.com/github/ziiyaadddd/LP5/blob/main/CUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [7]:
cuda_code = """
#include <stdio.h>
#include <cuda.h>

__global__ void vectorAdd(int *a, int *b, int *c, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        c[i] = a[i] + b[i];
    }
}

int main() {
    int N = 10;
    size_t size = N * sizeof(int);

    // Allocate host memory
    int *h_a = (int*)malloc(size);
    int *h_b = (int*)malloc(size);
    int *h_c = (int*)malloc(size);

    // Initialize vectors
    for (int i = 0; i < N; i++) {
        h_a[i] = i;
        h_b[i] = i * (N-i);
    }

    // Allocate device memory
    int *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    // Copy data to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // Launch kernel
    int blockSize = 256;
    int gridSize = (N + blockSize - 1) / blockSize;
    vectorAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, N);

    // Copy result back
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    printf("Vector A: ");
    for (int i = 0; i < N; i++) {
        printf("%d ", h_a[i]);
    }
    printf("\\n");

    printf("Vector B: ");
    for (int i = 0; i < N; i++) {
        printf("%d ", h_b[i]);
    }
    printf("\\n");

    printf("Resultant Vector C (A + B): ");
    for (int i = 0; i < N; i++) {
        printf("%d ", h_c[i]);
    }

    // Cleanup
    free(h_a); free(h_b); free(h_c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}
"""

# Write the code to a .cu file
with open('addition.cu', 'w') as f:
    f.write(cuda_code)



In [8]:
   # Compile the CUDA code
!nvcc addition.cu -o addition

# Run the compiled code
!./addition

Vector A: 0 1 2 3 4 5 6 7 8 9 
Vector B: 0 9 16 21 24 25 24 21 16 9 
Resultant Vector C (A + B): 0 0 0 0 0 0 0 0 0 0 

In [9]:
# Define the CUDA code for matrix multiplication
cuda_code = """
#include <stdio.h>
#include <cuda.h>

#define N 3 // Matrix size N x N

__global__ void matrixMul(int *A, int *B, int *C, int width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    int sum = 0;
    if (row < width && col < width) {
        for (int k = 0; k < width; ++k) {
            sum += A[row * width + k] * B[k * width + col];
        }
        C[row * width + col] = sum;
    }
}

int main() {
    int size = N * N * sizeof(int);

    // Host memory allocation
    int *h_A = (int*)malloc(size);
    int *h_B = (int*)malloc(size);
    int *h_C = (int*)malloc(size);

    // Initialize matrices
    for (int i = 0; i < N * N; ++i) {
        h_A[i] = 10-i;
        h_B[i] = 14-i;
    }

    // Device memory allocation
    int *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    // Copy data to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Launch kernel
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((N + 15) / 16, (N + 15) / 16);
    matrixMul<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

    // Copy result back
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Print matrices
    printf("Matrix A:\\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%d ", h_A[i * N + j]);
        }
        printf("\\n");
    }
    printf("\\n");

    printf("Matrix B:\\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%d ", h_B[i * N + j]);
        }
        printf("\\n");
    }
    printf("\\n");

    // Print full matrix
    printf("Resultant Matrix C (A x B):\\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%d ", h_C[i * N + j]);
        }
        printf("\\n");
    }

    // Cleanup
    free(h_A); free(h_B); free(h_C);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    return 0;
}
"""

# Write the CUDA code to a .cu file
with open('matrix_mul.cu', 'w') as f:
    f.write(cuda_code)



In [10]:
# Compile the CUDA code
!nvcc matrix_mul.cu -o matrix_mul

# Run the compiled code
!./matrix_mul

Matrix A:
10 9 8 
7 6 5 
4 3 2 

Matrix B:
14 13 12 
11 10 9 
8 7 6 

Resultant Matrix C (A x B):
0 0 0 
0 0 0 
0 0 0 
