In [None]:
!nvidia-smi

Thu Apr 17 04:53:06 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   65C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
!apt-get update
!apt-get install -y libopencv-dev python3-opencv pkg-config
!nvcc --version

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [73.0 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,788 kB]
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,693 kB]
Get:13 http://archive.ubuntu.com/ubun

In [None]:
!pkg-config --cflags --libs opencv4

-I/usr/include/opencv4 -lopencv_stitching -lopencv_alphamat -lopencv_aruco -lopencv_barcode -lopencv_bgsegm -lopencv_bioinspired -lopencv_ccalib -lopencv_dnn_objdetect -lopencv_dnn_superres -lopencv_dpm -lopencv_face -lopencv_freetype -lopencv_fuzzy -lopencv_hdf -lopencv_hfs -lopencv_img_hash -lopencv_intensity_transform -lopencv_line_descriptor -lopencv_mcc -lopencv_quality -lopencv_rapid -lopencv_reg -lopencv_rgbd -lopencv_saliency -lopencv_shape -lopencv_stereo -lopencv_structured_light -lopencv_phase_unwrapping -lopencv_superres -lopencv_optflow -lopencv_surface_matching -lopencv_tracking -lopencv_highgui -lopencv_datasets -lopencv_text -lopencv_plot -lopencv_ml -lopencv_videostab -lopencv_videoio -lopencv_viz -lopencv_wechat_qrcode -lopencv_ximgproc -lopencv_video -lopencv_xobjdetect -lopencv_objdetect -lopencv_calib3d -lopencv_imgcodecs -lopencv_features2d -lopencv_dnn -lopencv_flann -lopencv_xphoto -lopencv_photo -lopencv_imgproc -lopencv_core


In [4]:
# Step 1: Write CUDA code to a file
code = r'''
#include <iostream>
#include <chrono>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>

#define TILE_DIM 32
#define COARSE_FACTOR 4

//------------------------------------------------------------------------------
// Kernel for tiled matrix multiplication
//------------------------------------------------------------------------------
__global__ void tiled_mm_kernel(float* A, float* B, float* C, unsigned int N) {
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int colStart = blockIdx.x * blockDim.x * COARSE_FACTOR + threadIdx.x;

    __shared__ float A_s[TILE_DIM][TILE_DIM];
    __shared__ float B_s[TILE_DIM][TILE_DIM];

    float sum[COARSE_FACTOR];

    for (unsigned int c = 0; c < COARSE_FACTOR; ++c ) {
      sum[c] = 0.0f;
    }

    for (unsigned int tile = 0; tile < N/TILE_DIM; ++tile) {

      A_s[threadIdx.y][threadIdx.x] = A[row*N + tile*TILE_DIM + threadIdx.x];

      for (unsigned int c = 0; c < COARSE_FACTOR; ++c) {

        unsigned int col = colStart + c * TILE_DIM;
        B_s[threadIdx.y][threadIdx.x] = B[(tile*TILE_DIM + threadIdx.y)*N + col];
        __syncthreads();

        for (unsigned int i = 0; i < TILE_DIM; ++i) {
          sum[c] += A_s[threadIdx.y][i]*B_s[i][threadIdx.x];
        }
        __syncthreads();
      }
    }
    for (unsigned int c = 0; c < COARSE_FACTOR; ++c) {
      unsigned int col = colStart + c * TILE_DIM;
      C[row*N + col] = sum[c];
    }
}


//------------------------------------------------------------------------------
// GPU function for memory allocation, transfer, kernel execution and deletion
//------------------------------------------------------------------------------
void tiled_mm_gpu(float* A_h, float* B_h, float* C_h, unsigned int N) {
    float *A_d, *B_d, *C_d;
    size_t size = N * N * sizeof(float);

    // Allocate memory
    cudaMalloc((void**)&A_d, size);
    cudaMalloc((void**)&B_d, size);
    cudaMalloc((void**)&C_d, size);

    // Copy data to device
    cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice);
    cudaMemcpy(B_d, B_h, size, cudaMemcpyHostToDevice);

    // Launch configuration
    dim3 threadsPerBlock(32, 32);
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x/COARSE_FACTOR,
                   (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Timing with CUDA events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    // Launch kernel
    tiled_mm_kernel<<<numBlocks, threadsPerBlock>>>(A_d, B_d, C_d, N);
    cudaEventRecord(stop);

    // Wait for kernel to finish
    cudaEventSynchronize(stop);

    // Measure elapsed time
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "CUDA kernel time (ms): " << milliseconds << std::endl;

    // Copy result back to host
    cudaMemcpy(C_h, C_d, size, cudaMemcpyDeviceToHost);

    // Free memory
    cudaFree(A_d);
    cudaFree(B_d);
    cudaFree(C_d);

    // Destroy events
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}


//------------------------------------------------------------------------------
// CPU based algorithm for
//------------------------------------------------------------------------------
void mm_cpu(const float* A, const float* B, float* C, unsigned int N) {
    for (unsigned int row = 0; row < N; ++row) {
        for (unsigned int col = 0; col < N; ++col) {
            float sum = 0.0f;
            for (unsigned int i = 0; i < N; ++i) {
                sum += A[row * N + i] * B[i * N + col];
            }
            C[row * N + col] = sum;
        }
    }
}


int main() {
    const unsigned int N = 512; // Matrix size

    // Create and randomize matrices on host
    cv::Mat A(N, N, CV_32F);
    cv::Mat B(N, N, CV_32F);
    cv::Mat C_gpu = cv::Mat::zeros(N, N, CV_32F);
    cv::Mat C_cpu = cv::Mat::zeros(N, N, CV_32F);
    cv::randu(A, 0.0f, 10.0f);
    cv::randu(B, 0.0f, 10.0f);

    float* A_h = A.ptr<float>();
    float* B_h = B.ptr<float>();
    float* C_gpu_h = C_gpu.ptr<float>();
    float* C_cpu_h = C_cpu.ptr<float>();

    // GPU matrix multiplication
    auto gpu_start = std::chrono::high_resolution_clock::now();
    tiled_mm_gpu(A_h, B_h, C_gpu_h, N);
    auto gpu_end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<float, std::milli> gpu_time = gpu_end - gpu_start;
    std::cout << "GPU Matrix multiplication took " << gpu_time.count() << " ms\n";

    // CPU matrix multiplication
    auto cpu_start = std::chrono::high_resolution_clock::now();
    mm_cpu(A_h, B_h, C_cpu_h, N);
    auto cpu_end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<float, std::milli> cpu_time = cpu_end - cpu_start;
    std::cout << "CPU Matrix multiplication took " << cpu_time.count() << " ms\n";

    return 0;
}



'''

# Step 2: Save to file
with open('script.cu', 'w') as f:
    f.write(code)

# Step 3: Compile using nvcc
# Ref: https://stackoverflow.com/questions/73361454/i-am-getting-zeros-as-a-result-of-vector-additon-in-cuda-and-no-errors
!nvcc -arch=sm_75 script.cu -o script `pkg-config --cflags --libs opencv4`

# Step 4: Run the binary
!./script


  class AffineWarper : public PlaneWarper
        ^


  class AffineWarper : public PlaneWarper
        ^

  class FeatherBlender : public Blender
        ^

  class MultiBandBlender : public Blender
        ^

  class AffineWarper : public PlaneWarper
        ^


  class AffineWarper : public PlaneWarper
        ^

  class FeatherBlender : public Blender
        ^

  class MultiBandBlender : public Blender
        ^

CUDA kernel time (ms): 0.88144
GPU Matrix multiplication took 164.445 ms
CPU Matrix multiplication took 765.056 ms
