# Setup

In [None]:
# https://www.geeksforgeeks.org/how-to-run-cuda-c-c-on-jupyter-notebook-in-google-colaboratory/

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
# Pinned to the commit this notebook was originally run against, so a fresh
# runtime installs the same nvcc4jupyter build instead of whatever HEAD is today.
%pip install git+https://github.com/andreinechaev/nvcc4jupyter.git@5741c522547756ac4bb7a16df32106a15efb8a57

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-aj9acu2k
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-aj9acu2k
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 5741c522547756ac4bb7a16df32106a15efb8a57
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10742 sha256=655723c0d2f4841dd76e45e772f96358a5fda83aba7888a669bf1eb2bb64257b
  Stored in directory: /tmp/pip-ephem-wheel-cache-q07q5873/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully bu

In [None]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpf5zlhr55".


# CUDA Code

In [94]:
%%writefile cuda.cu

#include <math.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/**
 * Allocate an uninitialised, row-major matrix of doubles on the host.
 *
 * @param n            Number of rows; also the number of columns when the
 *                     matrix is not augmented.
 * @param isAugmented  When true the matrix gets 2 * n columns, otherwise n.
 * @return Pointer to n * columns doubles that the caller releases with
 *         free(), or NULL when n is negative, the byte count does not fit in
 *         size_t, or malloc fails. Failures are reported on stderr so they do
 *         not end up in redirected program output.
 */
double *allocate_matrix(int n, bool isAugmented)
{
    if (n < 0)
    {
        fprintf(stderr, "Invalid matrix dimension: %d\n", n);
        return NULL;
    }

    /* Do all size arithmetic in size_t: n * col in int overflows for large n. */
    size_t rows = (size_t)n;
    size_t cols = isAugmented ? 2 * rows : rows;

    /* Reject sizes whose byte count would wrap around before reaching malloc. */
    if (cols != 0 && rows > ((size_t)-1 / sizeof(double)) / cols)
    {
        fprintf(stderr, "Matrix of dimension %d is too large to allocate\n", n);
        return NULL;
    }

    double *mat = (double *)malloc(rows * cols * sizeof(double));
    if (mat == NULL)
    {
        fprintf(stderr, "Memory allocation failed!\n");
        return NULL;
    }

    return mat;
}

/* Translate a (row, col) coordinate into its offset inside a row-major array
 * whose rows are `width` elements long. */
int get_matrix_index(int row, int col, int width)
{
    const int row_offset = row * width;
    return row_offset + col;
}

/**
 * Fill an augmented n x 2n row-major matrix with [A | I].
 *
 * The left n x n half (A) is read from stdin in row-major order; the right
 * n x n half is set to the identity matrix.
 *
 * @param matrix  Buffer holding at least n * 2n doubles. Nothing is done when
 *                it is NULL.
 * @param n       Matrix dimension. Nothing is done when it is not positive.
 *
 * Terminates the program with EXIT_FAILURE when an element cannot be parsed,
 * because continuing would invert a matrix containing stale values.
 */
void read_matrix(double *matrix, int n)
{
    if (matrix == NULL || n <= 0)
    {
        return;
    }

    const int width = 2 * n;

    /* Left half: the input matrix, one value at a time. */
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            double value = 0.0;
            if (scanf("%lf", &value) != 1)
            {
                fprintf(stderr, "Failed to read matrix element (%d, %d)\n", i, j);
                exit(EXIT_FAILURE);
            }
            matrix[get_matrix_index(i, j, width)] = value;
        }
    }

    /* Right half: identity, i.e. 1 where the column is n past the row index. */
    for (int i = 0; i < n; i++)
    {
        for (int j = n; j < width; j++)
        {
            matrix[get_matrix_index(i, j, width)] = (j == i + n) ? 1.0 : 0.0;
        }
    }
}

/* Print the dimension on its own line, then the right-hand rows x rows half
 * of an augmented rows x (2 * rows) row-major matrix, one matrix row per
 * output line with a space after every value. */
void print_result(double *mat, int rows)
{
    const int width = rows * 2;

    printf("%d\n", rows);

    for (int r = 0; r < rows; ++r)
    {
        /* Columns [rows, 2 * rows) hold the part that is reported. */
        int c = rows;
        while (c < width)
        {
            printf("%lf ", mat[get_matrix_index(r, c, width)]);
            ++c;
        }
        printf("\n");
    }
}

// Device-side counterpart of get_matrix_index: offset of (row, col) in a
// row-major array whose rows are `width` elements long.
__device__ int GetMatrixIdx(int row, int col, int width)
{
    const int row_offset = row * width;
    return row_offset + col;
}

/**
 * Perform one Gauss-Jordan step on an augmented n x 2n row-major matrix:
 * scale the pivot row so its diagonal entry becomes 1, then subtract a
 * multiple of it from every other row so that column pivot_idx is cleared.
 *
 * Each thread owns a contiguous band of rows starting at
 * threadIdx.x * ceil(n / block_size). Only threadIdx.x is used, so the kernel
 * expects to be launched as a single block of block_size threads.
 *
 * @param mat         Device pointer to the n x 2n matrix, updated in place.
 * @param n           Number of rows.
 * @param pivot_idx   Row (and column) of the pivot for this step.
 * @param block_size  Number of threads the rows are divided between.
 *
 * No row exchange is performed: a zero pivot leads to a division by zero and
 * non-finite results.
 */
__global__ void SubsPivotKernel(double* mat, int n, int pivot_idx, int block_size) {
  const int width = 2 * n;

  // Ceiling division: with plain n / block_size the last n % block_size rows
  // would belong to no thread and never be eliminated.
  const int rows_per_thread = (n + block_size - 1) / block_size;
  const int start_row = threadIdx.x * rows_per_thread;
  const int band_end = start_row + rows_per_thread;
  const int end_row = band_end < n ? band_end : n;

  // Phase 1: the thread that owns the pivot row normalises it.
  if (pivot_idx >= start_row && pivot_idx < end_row) {
    const double pivot = mat[GetMatrixIdx(pivot_idx, pivot_idx, width)];
    for (int col = 0; col < width; col++) {
      mat[GetMatrixIdx(pivot_idx, col, width)] /= pivot;
    }
  }

  // Every thread reaches this barrier exactly once, so the normalised pivot
  // row is visible to the whole block before anyone reads it.
  __syncthreads();

  // Phase 2: clear column pivot_idx in every other row of this thread's band.
  // The pivot row is read-only here and each thread writes only its own rows.
  for (int row = start_row; row < end_row; row++) {
    if (row == pivot_idx) {
      continue;
    }
    // The pivot's diagonal entry is now 1, so the multiplier is simply the
    // entry being eliminated.
    const double factor = mat[GetMatrixIdx(row, pivot_idx, width)];
    for (int col = 0; col < width; col++) {
      mat[GetMatrixIdx(row, col, width)] -= factor * mat[GetMatrixIdx(pivot_idx, col, width)];
    }
  }
}

// Abort with a diagnostic on stderr when a CUDA runtime call did not succeed.
static void check_cuda(cudaError_t status, const char* what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

/**
 * Run Gauss-Jordan elimination on the GPU over an augmented n x 2n matrix.
 *
 * The host matrix is copied to the device, SubsPivotKernel is launched once
 * per pivot (0 .. n-1) on a single block, and the result is copied back over
 * `mat`.
 *
 * @param n    Matrix dimension. Nothing is done when it is not positive.
 * @param mat  Host buffer of n * 2n doubles, overwritten with the result.
 *             Nothing is done when it is NULL.
 *
 * Any CUDA failure (allocation, copy, launch, execution) terminates the
 * program with EXIT_FAILURE instead of silently producing garbage output.
 */
void invert_matrix(int n, double* mat) {
  if (n <= 0 || mat == NULL) {
    return;
  }

  // Upper bound on threads in the single block used below; 1024 is the
  // per-block thread limit documented for current CUDA devices.
  const int max_threads_per_block = 1024;

  // Widen before multiplying: n * n * 2 overflows int for large n.
  const size_t size = (size_t)n * (size_t)n * 2 * sizeof(double);

  double* d_mat = NULL;
  check_cuda(cudaMalloc((void**)&d_mat, size), "device allocation");
  check_cuda(cudaMemcpy(d_mat, mat, size, cudaMemcpyHostToDevice), "host-to-device copy");

  const int block_size = n >= max_threads_per_block ? max_threads_per_block : n;
  dim3 dimBlock(block_size);
  dim3 dimGrid(1, 1);

  // Launches on the same stream run in order, so each pivot step sees the
  // matrix produced by the previous one.
  for (int i = 0; i < n; i++) {
    SubsPivotKernel<<<dimGrid, dimBlock>>>(d_mat, n, i, block_size);
    check_cuda(cudaGetLastError(), "kernel launch");
  }

  // Surface errors raised while the kernels were executing.
  check_cuda(cudaDeviceSynchronize(), "kernel execution");

  check_cuda(cudaMemcpy(mat, d_mat, size, cudaMemcpyDeviceToHost), "device-to-host copy");
  check_cuda(cudaFree(d_mat), "device free");
}

/**
 * Entry point: read the dimension n and an n x n matrix from stdin, invert it
 * on the GPU and print the result to stdout.
 *
 * @return 0 on success, 1 when the dimension cannot be read, is negative, or
 *         the host matrix cannot be allocated.
 */
int main(void) {
  int n;
  if (scanf("%d", &n) != 1 || n < 0) {
    fprintf(stderr, "Expected a non-negative matrix dimension as the first input value\n");
    return 1;
  }

  double* mat = allocate_matrix(n, true);
  if (mat == NULL) {
    // allocate_matrix has already reported the failure.
    return 1;
  }

  read_matrix(mat, n);

  invert_matrix(n, mat);

  print_result(mat, n);

  free(mat);
  return 0;
}

Overwriting cuda.cu


In [95]:
!nvcc cuda.cu -o cuda

# Get Testcase

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Access a test case using a path such as `/content/drive/Shareddrives/Sister/test_cases/32.txt`.

# CUDA Execution

In [99]:
!time ./cuda < /content/drive/Shareddrives/Sister/test_cases/512.txt > outcuda.txt


real	0m1.315s
user	0m1.074s
sys	0m0.231s


# Serial Execution

In [None]:
!g++ /content/drive/Shareddrives/Sister/serial/serial.cpp -o serial

In [98]:
!time ./serial < /content/drive/Shareddrives/Sister/test_cases/512.txt > out.txt


real	0m2.357s
user	0m2.331s
sys	0m0.015s
