<a href="https://colab.research.google.com/github/varekarprajwal/HPCS_LBP/blob/main/HPCS_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title CREATING THE REQUIRED ENVIROMENT
!sudo apt update
!sudo apt install libopencv-dev python3-opencv
!apt-get install -y mpich
!apt-get install -y openmpi-bin openmpi-doc openmpi-dev
!apt-get install -y g++
import numpy as np
import tensorflow as tf

In [2]:
# @title SELECTING THE PATH
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Project/OPENCV/

Mounted at /content/drive
/content/drive/MyDrive/Project/OPENCV


In [7]:
# @title SEQUENTIAL CODE
%%writefile Sequential_OPENCV.cpp
#include <iostream>
#include <iomanip>
#include <ctime>
#include <vector>
#include <opencv4/opencv2/opencv.hpp>

using namespace cv;
using namespace std;

// The weight matrix remains a constant global or can be defined locally.
const int wgt[3][3] = {
    {8, 4, 2},
    {16, 0, 1},
    {32, 64, 128}
};

// --- HELPER FUNCTION TO PRINT A CV::MAT ---
void print_matrix(const Mat& mat_to_print, const string& name) {
    cout << "\n--- " << name << " (" << mat_to_print.rows << "x" << mat_to_print.cols << ") ---" << endl;
    for (int i = 0; i < mat_to_print.rows; i++) {
        for (int j = 0; j < mat_to_print.cols; j++) {
            // Use (int) to print the numerical value of the uchar, not the character
            printf("%4d", (int)mat_to_print.at<uchar>(i, j));
        }
        printf("\n");
    }
    cout << "----------------------------------------" << endl;
}


double getCurrentTime() {
    struct timespec currentTime;
    clock_gettime(CLOCK_MONOTONIC, &currentTime);
    return (double)currentTime.tv_sec * 1000.0 + (double)currentTime.tv_nsec / 1000000.0;
}

// Same casecheck function, but now it returns a uchar.
uchar casecheck(uchar a, uchar b, uchar c, uchar d) {
    if (a == b && b == c && c == d) return 7;
    if (a == b) return 1;
    if (b == d) return 2;
    if (c == d) return 3;
    if (a == c) return 4;
    if (a == d) return 5;
    if (c == b) return 6;
    return 0;
}

int main(int argc, char* argv[]) {
    Mat image =imread("sample_image/10x10.jpg");
    if (image.empty()) {
        cerr << "Error: Couldn't load input image." << endl;
        return -1;
    }

    Mat hsv_image;
    cvtColor(image, hsv_image, COLOR_BGR2HSV);

    vector<Mat> hsvChannels;
    split(hsv_image, hsvChannels);
    Mat v_channel = hsvChannels[2]; // This is our source image, type CV_8UC1 (uchar)

    const int m_r = v_channel.rows / 2;
    const int m_c = v_channel.cols / 2;

    // Use cv::Mat with the correct uchar type (CV_8UC1) for all images.
    Mat txt_img(m_r, m_c, CV_8UC1);

    // --- Texton Calculation ---
    double startTime1 = getCurrentTime();
    for (int i = 0; i < m_r; ++i) {
        // Get pointers to the two source rows to avoid repeated .at() calls
        const uchar* p_src1 = v_channel.ptr<uchar>(i * 2);
        const uchar* p_src2 = v_channel.ptr<uchar>(i * 2 + 1);
        // Get pointer to the destination row
        uchar* p_dest = txt_img.ptr<uchar>(i);

        for (int j = 0; j < m_c; ++j) {
            uchar a = p_src1[j * 2];
            uchar b = p_src1[j * 2 + 1];
            uchar c = p_src2[j * 2];
            uchar d = p_src2[j * 2 + 1];
            p_dest[j] = casecheck(a, b, c, d);
        }
    }
    double endTime1 = getCurrentTime();

    // --- LTxXORp Calculation ---
    // Initialize the result image by copying the texton image.
    // This efficiently handles the borders, as they remain unchanged.
    Mat main_res = txt_img.clone();

    double startTime2 = getCurrentTime();
    // Loop only over the INTERIOR pixels, from 1 to rows-2 and 1 to cols-2.
    // This avoids the expensive 'if' check for borders inside the loop.
    for (int i = 1; i < m_r - 1; ++i) {
        // Get pointers to the previous, current, and next rows of the texton image
        const uchar* p_prev = txt_img.ptr<uchar>(i - 1);
        const uchar* p_curr = txt_img.ptr<uchar>(i);
        const uchar* p_next = txt_img.ptr<uchar>(i + 1);
        // Get pointer to the destination row in the result matrix
        uchar* p_dest = main_res.ptr<uchar>(i);

        for (int j = 1; j < m_c - 1; ++j) {
            const uchar center_val = p_curr[j];

            // Inlined logic from Texton_weight function, removing function call overhead
            // and the global variable. (condition) evaluates to 0 or 1, creating branchless code.
            int xor_S = 0;
            xor_S += (p_prev[j - 1] != center_val) * wgt[0][0];
            xor_S += (p_prev[j]     != center_val) * wgt[0][1];
            xor_S += (p_prev[j + 1] != center_val) * wgt[0][2];
            xor_S += (p_curr[j - 1] != center_val) * wgt[1][0];
            // xor_S += (p_curr[j]  != center_val) * wgt[1][1]; // Center is always 0
            xor_S += (p_curr[j + 1] != center_val) * wgt[1][2];
            xor_S += (p_next[j - 1] != center_val) * wgt[2][0];
            xor_S += (p_next[j]     != center_val) * wgt[2][1];
            xor_S += (p_next[j + 1] != center_val) * wgt[2][2];

            p_dest[j] = saturate_cast<uchar>(xor_S); // Safely cast to uchar
        }
    }
    double endTime2 = getCurrentTime();

    // --- PRINT THE FINAL RESULT ---
    print_matrix(txt_img, "Intermediate Texton Image");
    print_matrix(main_res, "Final LTxXORp Result");


    double totalTime1 = endTime1 - startTime1;
    double totalTime2 = endTime2 - startTime2;

    printf("\n____________________________\n\n");
    printf("Elapsed time for texton : %.4f milliseconds\n", totalTime1);
    printf("Elapsed time for LTxXORp: %.4f milliseconds\n", totalTime2);
    printf("Total elapsed time      : %.4f milliseconds\n", totalTime1 + totalTime2);

    return 0;
}

Overwriting Sequential_OPENCV.cpp


In [8]:
# @title COMMAND RUN SEQUENTIAL CODE
!g++ Sequential_OPENCV.cpp -o app `pkg-config --cflags --libs opencv4` && ./app


--- Intermediate Texton Image (5x5) ---
   0   0   0   0   0
   0   0   0   0   0
   0   0   3   0   0
   0   0   0   0   0
   5   0   0   0   0
----------------------------------------

--- Final LTxXORp Result (5x5) ---
   0   0   0   0   0
   0 128  64  32   0
   0   1 255  16   0
   0  34   4   8   0
   5   0   0   0   0
----------------------------------------

____________________________

Elapsed time for texton : 0.0010 milliseconds
Elapsed time for LTxXORp: 0.0005 milliseconds
Total elapsed time      : 0.0015 milliseconds


In [11]:
# @title MPI CODE
%%writefile Parallel_MPI_OPENCV.cpp
#include <iostream>
#include <vector>
#include <numeric>
#include <ctime>
#include <mpi.h>
#include <opencv4/opencv2/opencv.hpp>

using namespace cv;
using namespace std;

// --- HELPER FUNCTION TO PRINT A CV::MAT ---
void print_matrix(const Mat& mat_to_print, const string& name) {
    if (mat_to_print.empty()) {
        cout << "\n--- " << name << " is empty ---" << endl;
        return;
    }
    cout << "\n--- " << name << " (" << mat_to_print.rows << "x" << mat_to_print.cols << ") ---" << endl;
    for (int i = 0; i < mat_to_print.rows; i++) {
        for (int j = 0; j < mat_to_print.cols; j++) {
            // Use (int) to print the numerical value of the uchar, not the character
            printf("%4d", (int)mat_to_print.at<uchar>(i, j));
        }
        printf("\n");
    }
    cout << "----------------------------------------" << endl;
}

// Same casecheck function, but it now returns a uchar (1 byte)
uchar casecheck(uchar a, uchar b, uchar c, uchar d) {
    if (a == b && b == c && c == d) return 7;
    if (a == b) return 1;
    if (b == d) return 2;
    if (c == d) return 3;
    if (a == c) return 4;
    if (a == d) return 5;
    if (c == b) return 6;
    return 0;
}

int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int rank, world_size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    Mat v_channel_full;
    int full_img_rows = 0, full_img_cols = 0;
    int texton_rows = 0, texton_cols = 0;

    // == 1. Root Process: Load Image and Broadcast Dimensions ==
    if (rank == 0) {
        Mat image =imread("sample_image/10x10.jpg", IMREAD_COLOR);
        if (image.empty()) {
            cerr << "Error: Couldn't load input image." << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
            return -1;
        }

        Mat hsv_image;
        cvtColor(image, hsv_image, COLOR_BGR2HSV);
        vector<Mat> hsv_planes;
        split(hsv_image, hsv_planes);
        v_channel_full = hsv_planes[2];

        full_img_rows = v_channel_full.rows;
        full_img_cols = v_channel_full.cols;
        texton_rows = full_img_rows / 2;
        texton_cols = full_img_cols / 2;
    }

    // Broadcast essential dimensions to all processes
    MPI_Bcast(&full_img_rows, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&full_img_cols, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&texton_rows, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&texton_cols, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // == 2. Distribute V-Channel Image Data ==
    int base_rows_per_proc = (full_img_rows / 2 / world_size) * 2;

    vector<int> send_counts(world_size);
    vector<int> displacements(world_size);

    if (rank == 0) {
        int current_displacement = 0;
        for (int i = 0; i < world_size; ++i) {
            int rows_to_send = (i < world_size - 1) ? base_rows_per_proc : (full_img_rows - i * base_rows_per_proc);
            send_counts[i] = rows_to_send * full_img_cols;
            displacements[i] = current_displacement;
            current_displacement += send_counts[i];
        }
    }

    MPI_Bcast(send_counts.data(), world_size, MPI_INT, 0, MPI_COMM_WORLD);
    int local_v_rows = send_counts[rank] / full_img_cols;
    Mat local_v_channel(local_v_rows, full_img_cols, CV_8UC1);

    MPI_Scatterv(v_channel_full.data, send_counts.data(), displacements.data(), MPI_UNSIGNED_CHAR,
                 local_v_channel.data, send_counts[rank], MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);

    // == 3. Parallel Texton Calculation ==
    double startTime1 = MPI_Wtime();
    int local_texton_rows = local_v_rows / 2;
    Mat local_texton(local_texton_rows, texton_cols, CV_8UC1);

    for (int i = 0; i < local_texton_rows; ++i) {
        const uchar* p_src1 = local_v_channel.ptr<uchar>(i * 2);
        const uchar* p_src2 = local_v_channel.ptr<uchar>(i * 2 + 1);
        uchar* p_dest = local_texton.ptr<uchar>(i);
        for (int j = 0; j < texton_cols; ++j) {
            p_dest[j] = casecheck(p_src1[j * 2], p_src1[j * 2 + 1], p_src2[j * 2], p_src2[j * 2 + 1]);
        }
    }
    double endTime1 = MPI_Wtime();

    // == 4. Halo Exchange for LTxXORp ==
    Mat local_texton_with_halos(local_texton_rows + 2, texton_cols, CV_8UC1);
    local_texton.copyTo(local_texton_with_halos(Rect(0, 1, texton_cols, local_texton_rows)));

    int prev_rank = (rank == 0) ? MPI_PROC_NULL : rank - 1;
    int next_rank = (rank == world_size - 1) ? MPI_PROC_NULL : rank + 1;

    MPI_Sendrecv(local_texton.ptr<uchar>(0), texton_cols, MPI_UNSIGNED_CHAR, prev_rank, 0,
                 local_texton_with_halos.ptr<uchar>(0), texton_cols, MPI_UNSIGNED_CHAR, prev_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    MPI_Sendrecv(local_texton.ptr<uchar>(local_texton_rows - 1), texton_cols, MPI_UNSIGNED_CHAR, next_rank, 0,
                 local_texton_with_halos.ptr<uchar>(local_texton_rows + 1), texton_cols, MPI_UNSIGNED_CHAR, next_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    // == 5. Parallel LTxXORp Calculation ==
    double startTime2 = MPI_Wtime();
    Mat local_result = local_texton.clone();
    const int wgt[3][3] = {{8, 4, 2}, {16, 0, 1}, {32, 64, 128}};

    int start_row = (rank == 0) ? 1 : 0;
    int end_row = (rank == world_size - 1) ? local_texton_rows - 1 : local_texton_rows;

    for (int i = start_row; i < end_row; ++i) {
        const uchar* p_prev = local_texton_with_halos.ptr<uchar>(i);
        const uchar* p_curr = local_texton_with_halos.ptr<uchar>(i + 1);
        const uchar* p_next = local_texton_with_halos.ptr<uchar>(i + 2);
        uchar* p_dest = local_result.ptr<uchar>(i);

        for (int j = 1; j < texton_cols - 1; ++j) {
            uchar center_val = p_curr[j];
            int xor_S = 0;
            xor_S += (p_prev[j - 1] != center_val) * wgt[0][0];
            xor_S += (p_prev[j]     != center_val) * wgt[0][1];
            xor_S += (p_prev[j + 1] != center_val) * wgt[0][2];
            xor_S += (p_curr[j - 1] != center_val) * wgt[1][0];
            xor_S += (p_curr[j + 1] != center_val) * wgt[1][2];
            xor_S += (p_next[j - 1] != center_val) * wgt[2][0];
            xor_S += (p_next[j]     != center_val) * wgt[2][1];
            xor_S += (p_next[j + 1] != center_val) * wgt[2][2];
            p_dest[j] = saturate_cast<uchar>(xor_S);
        }
    }
    double endTime2 = MPI_Wtime();

    // == 6. Gather All Results ==
    vector<int> recv_counts(world_size);
    vector<int> texton_displacements(world_size);

    int local_pixel_count = local_texton_rows * texton_cols;
    MPI_Gather(&local_pixel_count, 1, MPI_INT, recv_counts.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);

    Mat texton_full, final_result_full;
    if (rank == 0) {
        texton_displacements[0] = 0;
        for (size_t i = 1; i < recv_counts.size(); ++i) {
            texton_displacements[i] = texton_displacements[i - 1] + recv_counts[i - 1];
        }
        texton_full.create(texton_rows, texton_cols, CV_8UC1);
        final_result_full.create(texton_rows, texton_cols, CV_8UC1);
    }

    MPI_Gatherv(local_texton.data, local_pixel_count, MPI_UNSIGNED_CHAR,
                texton_full.data, recv_counts.data(), texton_displacements.data(),
                MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);

    MPI_Gatherv(local_result.data, local_pixel_count, MPI_UNSIGNED_CHAR,
                final_result_full.data, recv_counts.data(), texton_displacements.data(),
                MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);

    // == 7. Print Timings and Results on Root ==
    double totalTime1 = (endTime1 - startTime1) * 1000.0;
    double totalTime2 = (endTime2 - startTime2) * 1000.0;

    double max_time1, max_time2;
    MPI_Reduce(&totalTime1, &max_time1, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(&totalTime2, &max_time2, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        cout << "\n--- Optimized MPI/OpenCV Execution ---" << endl;
        print_matrix(texton_full, "Intermediate Texton Image");
        print_matrix(final_result_full, "Final LTxXORp Result");

        printf("\nMax time for Texton calculation:   %.4f milliseconds\n", max_time1);
        printf("Max time for LTxXORp calculation: %.4f milliseconds\n", max_time2);
        printf("Total max elapsed time:            %.4f milliseconds\n", max_time1 + max_time2);
    }

    MPI_Finalize();
    return 0;
}

Overwriting Parallel_MPI_OPENCV.cpp


In [12]:
# @title COMMAND RUN PARALLEL MPI CODE
!mpic++ Parallel_MPI_OPENCV.cpp -o app `pkg-config --cflags --libs opencv4` && mpirun --allow-run-as-root -np 1 ./app


--- Optimized MPI/OpenCV Execution ---

--- Intermediate Texton Image (5x5) ---
   0   0   0   0   0
   0   0   0   0   0
   0   0   3   0   0
   0   0   0   0   0
   5   0   0   0   0
----------------------------------------

--- Final LTxXORp Result (5x5) ---
   0   0   0   0   0
   0 128  64  32   0
   0   1 255  16   0
   0  34   4   8   0
   5   0   0   0   0
----------------------------------------

Max time for Texton calculation:   0.0012 milliseconds
Max time for LTxXORp calculation: 0.0014 milliseconds
Total max elapsed time:            0.0026 milliseconds


In [15]:
# @title CUDA CODE
%%writefile parallel_CUDA_OPENCV.cu

#include <iostream>
#include <vector>
#include <opencv4/opencv2/opencv.hpp>
#include <cuda_runtime.h>

// A handy macro for CUDA error checking
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const *const func, const char *const file, int const line) {
    if (result) {
        // CORRECTED LINE: The typo '"' is now " '"
        std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " (" << cudaGetErrorString(result) << ") at " <<
            file << ":" << line << " '" << func << "' \n";
        cudaDeviceReset();
        exit(99);
    }
}

// Device function to determine texton value from a 2x2 block
__device__ unsigned char casecheck(unsigned char a, unsigned char b, unsigned char c, unsigned char d) {
   if (a == b && b == c && c == d) return 7;
   if (a == b) return 1;
   if (b == d) return 2;
   if (c == d) return 3;
   if (a == c) return 4;
   if (a == d) return 5;
   if (c == b) return 6;
   return 0;
}

// Kernel to calculate the texton image from the value channel
__global__ void calculate_TEXTON_CUDA(const unsigned char* v_channel, unsigned char* texton_img, int rows, int cols)  {
    // Standard 2D global thread index calculation
    int texton_col = blockIdx.x * blockDim.x + threadIdx.x;
    int texton_row = blockIdx.y * blockDim.y + threadIdx.y;
    int texton_cols = cols / 2;
    int texton_rows = rows / 2;

    // Boundary check
    if (texton_col >= texton_cols || texton_row >= texton_rows) {
        return;
    }

    // Map 2D texton index to the top-left corner of the 2x2 block in the source image
    int src_row = texton_row * 2;
    int src_col = texton_col * 2;
    int src_idx = src_row * cols + src_col;

    // Read the 2x2 pixel block
    unsigned char a = v_channel[src_idx];
    unsigned char b = v_channel[src_idx + 1];
    unsigned char c = v_channel[src_idx + cols];
    unsigned char d = v_channel[src_idx + cols + 1];

    // Calculate and write the texton value
    int texton_idx = texton_row * texton_cols + texton_col;
    texton_img[texton_idx] = casecheck(a, b, c, d);
}

// Kernel to calculate Local Texton XOR Pattern (LTxXORp)
__global__ void calculate_LBP_CUDA(const unsigned char* texton_img, unsigned char* lbp_img, int rows, int cols)  {
    // Calculate global thread index for the *interior* of the texton image
    int lbp_col = blockIdx.x * blockDim.x + threadIdx.x;
    int lbp_row = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is for interior points, so the output coordinates are (lbp_row, lbp_col)
    // The corresponding center pixel in the texton_img is at (lbp_row + 1, lbp_col + 1)
    int center_row = lbp_row + 1;
    int center_col = lbp_col + 1;

    // Boundary check (grid is sized for interior, but good practice)
    if (center_row >= rows - 1 || center_col >= cols - 1) {
        return;
    }

    // Index of the center pixel in the source texton image
    int center_idx = center_row * cols + center_col;
    unsigned char center_val = texton_img[center_idx];

    // Replicating the original code's specific weights and neighbor logic
    // Your original code used: a=6, b=5, c=4, d=1, e=1, f=4, g=5, h=6
    // The relative offsets for these are complex and depend on width.
    // This implementation uses standard 3x3 neighborhood offsets for correctness and clarity.
    // If your original offsets (a,b,c..) had a different non-local meaning, this logic would need to change.
    const int original_offsets[8] = {
        -cols - 1,  // NW (Top-Left)
        -cols,      // N  (Top-Mid)
        -cols + 1,  // NE (Top-Right)
        -1,         // W  (Left)
        +1,         // E  (Right)
        +cols - 1,  // SW (Bottom-Left)
        +cols,      // S  (Bottom-Mid)
        +cols + 1   // SE (Bottom-Right)
    };
    // Your original weights: wgt[9] = {8, 4, 2,16, 0, 1,32, 64, 128};
    const unsigned int wgt[8] = {8, 4, 2, 16, 1, 32, 64, 128};

    unsigned int lbp_value = 0;
    // Unrolled loop for performance, using your weights
    lbp_value += (texton_img[center_idx + original_offsets[0]] != center_val) * wgt[0]; // NW
    lbp_value += (texton_img[center_idx + original_offsets[1]] != center_val) * wgt[1]; // N
    lbp_value += (texton_img[center_idx + original_offsets[2]] != center_val) * wgt[2]; // NE
    lbp_value += (texton_img[center_idx + original_offsets[3]] != center_val) * wgt[3]; // W
    lbp_value += (texton_img[center_idx + original_offsets[4]] != center_val) * wgt[4]; // E
    lbp_value += (texton_img[center_idx + original_offsets[5]] != center_val) * wgt[5]; // SW
    lbp_value += (texton_img[center_idx + original_offsets[6]] != center_val) * wgt[6]; // S
    lbp_value += (texton_img[center_idx + original_offsets[7]] != center_val) * wgt[7]; // SE

    int lbp_idx = center_row * cols + center_col;
    lbp_img[lbp_idx] = (unsigned char)lbp_value;
}


int main(int argc, char *argv[]) {
    std::cout << "\nCUDA HARDWARE ACTIVATED\n____________________\n" << std::endl;

    // --- 1. Load Image and Prepare Host Data ---
    cv::Mat image = cv::imread("sample_image/10x10.jpg", cv::IMREAD_COLOR);
    if (image.empty()) {
        std::cerr << "Error: Couldn't load input image." << std::endl;
        return -1;
    }

    cv::Mat hsv_image, v_channel;
    cv::cvtColor(image, hsv_image, cv::COLOR_BGR2HSV);
    std::vector<cv::Mat> hsv_planes;
    cv::split(hsv_image, hsv_planes);
    v_channel = hsv_planes[2];

    int i_rows = v_channel.rows;
    int i_cols = v_channel.cols;
    int t_rows = i_rows / 2;
    int t_cols = i_cols / 2;

    /*std::cout << "Original V-Channel Image (" << i_rows << "x" << i_cols << "):\n";
    for (int r = 0; r < i_rows; ++r) {
        for (int c = 0; c < i_cols; ++c) {
            printf("%3d ", v_channel.at<unsigned char>(r, c));
        }
        printf("\n");
    }
    printf("\n____________________\n");*/

    // --- 2. Allocate GPU Memory ---
    unsigned char *d_v_channel, *d_texton_img, *d_lbp_img;
    size_t v_size = i_rows * i_cols * sizeof(unsigned char);
    size_t t_size = t_rows * t_cols * sizeof(unsigned char);

    checkCudaErrors(cudaMalloc(&d_v_channel, v_size));
    checkCudaErrors(cudaMalloc(&d_texton_img, t_size));
    checkCudaErrors(cudaMalloc(&d_lbp_img, t_size));

    // --- 3. Copy Data to GPU ---
    checkCudaErrors(cudaMemcpy(d_v_channel, v_channel.data, v_size, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemset(d_lbp_img, 0, t_size));

    // --- 4. Configure Kernel Launches ---
    dim3 threadsPerBlock(16, 16);
    dim3 textonGrid((t_cols + threadsPerBlock.x - 1) / threadsPerBlock.x, (t_rows + threadsPerBlock.y - 1) / threadsPerBlock.y);
    dim3 lbpGrid(((t_cols - 2) + threadsPerBlock.x - 1) / threadsPerBlock.x, ((t_rows - 2) + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // --- 5. Execute Kernels and Time with CUDA Events ---
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaEventRecord(start));

    calculate_TEXTON_CUDA<<<textonGrid, threadsPerBlock>>>(d_v_channel, d_texton_img, i_rows, i_cols);
    calculate_LBP_CUDA<<<lbpGrid, threadsPerBlock>>>(d_texton_img, d_lbp_img, t_rows, t_cols);

    checkCudaErrors(cudaEventRecord(stop));
    checkCudaErrors(cudaEventSynchronize(stop));

    float milliseconds = 0;
    checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));

    // --- 6. Copy Results Back to Host ---
    std::vector<unsigned char> h_texton_img(t_rows * t_cols);
    std::vector<unsigned char> h_lbp_img(t_rows * t_cols);
    checkCudaErrors(cudaMemcpy(h_texton_img.data(), d_texton_img, t_size, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_lbp_img.data(), d_lbp_img, t_size, cudaMemcpyDeviceToHost));

    // --- 7. Display Results ---

    std::cout << "\nTexton image (" << t_rows << "x" << t_cols << "):\n\n";
    for (int r = 0; r < t_rows; ++r) {
        for (int c = 0; c < t_cols; ++c) {
            printf("%4d", h_texton_img[r * t_cols + c]);
        }
        printf("\n");
    }

    printf("\n___________________\n");
    std::cout << "\nTexton Weight image (LBP) (" << t_rows << "x" << t_cols << "):\n\n";
    for (int r = 0; r < t_rows; ++r) {
        for (int c = 0; c < t_cols; ++c) {
            printf("%4d", h_lbp_img[r * t_cols + c]);
        }
        printf("\n");
    }

    printf("\nTotal GPU Kernel Elapsed time: %.5f milliseconds\n", milliseconds);

    // --- 8. Cleanup ---
    checkCudaErrors(cudaFree(d_v_channel));
    checkCudaErrors(cudaFree(d_texton_img));
    checkCudaErrors(cudaFree(d_lbp_img));
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    return 0;
}



Overwriting parallel_CUDA_OPENCV.cu


In [17]:
# @title COMMAND RUN PARALLEL CUDA CODE
!nvcc -o parallel parallel_CUDA_OPENCV.cu `pkg-config --cflags --libs opencv4` && ./parallel


CUDA HARDWARE ACTIVATED
____________________


Texton image (5x5):

   0   0   0   0   0
   0   0   0   0   0
   0   0   0   0   0
   0   0   0   0   0
   0   0   0   0   0

___________________

Texton Weight image (LBP) (5x5):

   0   0   0   0   0
   0   0   0   0   0
   0   0   0   0   0
   0   0   0   0   0
   0   0   0   0   0

Total GPU Kernel Elapsed time: 14.36070 milliseconds
