<a href="https://colab.research.google.com/github/varekarprajwal/HPCS_LBP/blob/main/HPCS_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title CREATING THE REQUIRED ENVIROMENT
!sudo apt update
!sudo apt install libopencv-dev python3-opencv
!apt-get install -y mpich
!apt-get install -y openmpi-bin openmpi-doc openmpi-dev
!apt-get install -y g++
import numpy as np
import tensorflow as tf

In [None]:
# @title SELECTING THE PATH
# Mount Google Drive and make the project folder the working directory, so the
# %%writefile cells and the relative image path "sample_image/10x10.jpg" used by
# the C++ programs resolve against it.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): hard-coded, account-specific Drive location - adjust when running elsewhere.
%cd /content/drive/MyDrive/Project/OPENCV/

Mounted at /content/drive
/content/drive/MyDrive/Project/OPENCV


In [None]:
# @title SEQUENTIAL CODE
%%writefile Sequential_OPENCV.cpp
#include <iostream>
#include <iomanip>
#include <ctime>
#include <vector>
#include <opencv4/opencv2/opencv.hpp>

using namespace cv;
using namespace std;

// Neighbour weights for the LTxXORp code, laid out like the 3x3 window they are
// applied to (row 0 = row above the pixel, row 2 = row below). The centre entry
// is 0, so the pixel itself never contributes; the eight others sum to 255.
const int wgt[3][3] = {
    { 8,  4,   2},
    {16,  0,   1},
    {32, 64, 128}
};

// --- HELPER: DUMP A CV::MAT AS A GRID OF INTEGERS ---
// Prints a banner containing `name` and the matrix dimensions, then one text
// line per matrix row with every element read as uchar and right-aligned in a
// 4-character column, followed by a closing rule.
void print_matrix(const Mat& mat_to_print, const string& name) {
    const int row_count = mat_to_print.rows;
    const int col_count = mat_to_print.cols;

    cout << "\n--- " << name << " (" << row_count << "x" << col_count << ") ---" << endl;

    for (int r = 0; r < row_count; ++r) {
        for (int c = 0; c < col_count; ++c) {
            // Widen to int so the numeric value is printed, not the character.
            const int value = (int)mat_to_print.at<uchar>(r, c);
            printf("%4d", value);
        }
        printf("\n");
    }

    cout << "----------------------------------------" << endl;
}


// Returns the current CLOCK_MONOTONIC reading converted to milliseconds.
// Only differences between two readings are meaningful.
double getCurrentTime() {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const double ms_from_seconds = (double)now.tv_sec * 1000.0;
    const double ms_from_nanos   = (double)now.tv_nsec / 1000000.0;
    return ms_from_seconds + ms_from_nanos;
}

// Classifies a 2x2 pixel block (a b / c d) into a texton code.
// Returns 7 when all four values are equal; otherwise the code of the FIRST
// matching pair in this fixed priority order:
//   a==b -> 1, b==d -> 2, c==d -> 3, a==c -> 4, a==d -> 5, c==b -> 6,
// and 0 when no two of the tested pairs match.
uchar casecheck(uchar a, uchar b, uchar c, uchar d) {
    const bool all_equal = (a == b) && (b == c) && (c == d);

    uchar code = 0;
    if (all_equal)   code = 7;
    else if (a == b) code = 1;
    else if (b == d) code = 2;
    else if (c == d) code = 3;
    else if (a == c) code = 4;
    else if (a == d) code = 5;
    else if (c == b) code = 6;
    return code;
}

int main(int argc, char* argv[]) {
    if (argc < 2) {
        cerr << "Usage: " << argv[0] << " <image_path>" << endl;
        return -1;
    }

    Mat image =imread("sample_image/10x10.jpg");
    if (image.empty()) {
        cerr << "Error: Couldn't load input image." << endl;
        return -1;
    }

    Mat hsv_image;
    cvtColor(image, hsv_image, COLOR_BGR2HSV);

    vector<Mat> hsvChannels;
    split(hsv_image, hsvChannels);
    Mat v_channel = hsvChannels[2]; // This is our source image, type CV_8UC1 (uchar)

    const int m_r = v_channel.rows / 2;
    const int m_c = v_channel.cols / 2;

    // Use cv::Mat with the correct uchar type (CV_8UC1) for all images.
    Mat txt_img(m_r, m_c, CV_8UC1);

    // --- Texton Calculation ---
    double startTime1 = getCurrentTime();
    for (int i = 0; i < m_r; ++i) {
        // Get pointers to the two source rows to avoid repeated .at() calls
        const uchar* p_src1 = v_channel.ptr<uchar>(i * 2);
        const uchar* p_src2 = v_channel.ptr<uchar>(i * 2 + 1);
        // Get pointer to the destination row
        uchar* p_dest = txt_img.ptr<uchar>(i);

        for (int j = 0; j < m_c; ++j) {
            uchar a = p_src1[j * 2];
            uchar b = p_src1[j * 2 + 1];
            uchar c = p_src2[j * 2];
            uchar d = p_src2[j * 2 + 1];
            p_dest[j] = casecheck(a, b, c, d);
        }
    }
    double endTime1 = getCurrentTime();

    // --- LTxXORp Calculation ---
    // Initialize the result image by copying the texton image.
    // This efficiently handles the borders, as they remain unchanged.
    Mat main_res = txt_img.clone();

    double startTime2 = getCurrentTime();
    // Loop only over the INTERIOR pixels, from 1 to rows-2 and 1 to cols-2.
    // This avoids the expensive 'if' check for borders inside the loop.
    for (int i = 1; i < m_r - 1; ++i) {
        // Get pointers to the previous, current, and next rows of the texton image
        const uchar* p_prev = txt_img.ptr<uchar>(i - 1);
        const uchar* p_curr = txt_img.ptr<uchar>(i);
        const uchar* p_next = txt_img.ptr<uchar>(i + 1);
        // Get pointer to the destination row in the result matrix
        uchar* p_dest = main_res.ptr<uchar>(i);

        for (int j = 1; j < m_c - 1; ++j) {
            const uchar center_val = p_curr[j];

            // Inlined logic from Texton_weight function, removing function call overhead
            // and the global variable. (condition) evaluates to 0 or 1, creating branchless code.
            int xor_S = 0;
            xor_S += (p_prev[j - 1] != center_val) * wgt[0][0];
            xor_S += (p_prev[j]     != center_val) * wgt[0][1];
            xor_S += (p_prev[j + 1] != center_val) * wgt[0][2];
            xor_S += (p_curr[j - 1] != center_val) * wgt[1][0];
            // xor_S += (p_curr[j]  != center_val) * wgt[1][1]; // Center is always 0
            xor_S += (p_curr[j + 1] != center_val) * wgt[1][2];
            xor_S += (p_next[j - 1] != center_val) * wgt[2][0];
            xor_S += (p_next[j]     != center_val) * wgt[2][1];
            xor_S += (p_next[j + 1] != center_val) * wgt[2][2];

            p_dest[j] = saturate_cast<uchar>(xor_S); // Safely cast to uchar
        }
    }
    double endTime2 = getCurrentTime();

    // --- PRINT THE FINAL RESULT ---
    //print_matrix(txt_img, "Intermediate Texton Image");
    //print_matrix(main_res, "Final LTxXORp Result");


    double totalTime1 = endTime1 - startTime1;
    double totalTime2 = endTime2 - startTime2;

    printf("\n____________________________\n\n");
    printf("Elapsed time for texton : %.4f milliseconds\n", totalTime1);
    printf("Elapsed time for LTxXORp: %.4f milliseconds\n", totalTime2);
    printf("Total elapsed time      : %.4f milliseconds\n", totalTime1 + totalTime2);

    return 0;
}

Overwriting Sequential_OPENCV.cpp


In [None]:
# @title COMMAND RUN SEQUENTIAL CODE
!g++ Sequential_OPENCV.cpp -o app `pkg-config --cflags --libs opencv4` && ./app

 84  79  54  57  54  60  66  54  80  59 
 74 198 201 211 205 192 197 207 177  78 
 59 200  69  70  72  70  81  55 213  64 
 60 205  72 203 211 203 215  77 200  62 
 57 199  74 210  88  71 203  73 201  68 
 64 190  70 199  67  67 210  75 198  63 
 60 196  82 222 210 212 213  67 204  64 
 56 215  52  76  71  72  74  71 208  60 
 79 176 214 202 203 199 203 205 187  78 
 62  79  63  61  68  62  65  61  80  60 
___________________

Texton image
 
 0   0   0   0   0  
 0   0   0   0   0  
 0   0   3   0   0  
 0   0   0   0   0  
 5   0   0   0   0  
___________________

Texton Weight image
 
  0    0    0    0    0  
  0  128   64   32    0  
  0    1  255   16    0  
  0   34    4    8    0  
  5    0    0    0    0  
Elapsed time for texton : 0.013025 mili seconds
Elapsed time for LTxXORp: 0.031364 mili seconds
Elapsed time: 0.044389 mili seconds


In [None]:
# @title MPI CODE
%%writefile Parallel_MPI_OPENCV.cpp
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <mpi.h>
#include <opencv4/opencv2/core.hpp>
#include <opencv4/opencv2/highgui.hpp>
#include <opencv4/opencv2/opencv.hpp>


using namespace cv;
using namespace std;

// Wall-clock helper for the timing printouts: reads CLOCK_MONOTONIC and
// returns the value expressed in milliseconds.
double getCurrentTime()
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);

    const double seconds_part = (double)ts.tv_sec * 1000.0;
    const double nanos_part   = (double)ts.tv_nsec / 1000000.0;
    return seconds_part + nanos_part;
}

// Maps a 2x2 block (a b / c d) to its texton code.
// 7 means all four values are equal. Otherwise the first matching pair wins,
// tested in this order: a==b (1), b==d (2), c==d (3), a==c (4), a==d (5),
// c==b (6). 0 means none of those pairs match.
int casecheck(int a, int b, int c, int d) {
    // a==b, b==c and c==d already imply d==a, so three comparisons suffice.
    if (a == b && b == c && c == d) return 7;
    if (a == b) return 1;
    if (b == d) return 2;
    if (c == d) return 3;
    if (a == c) return 4;
    if (a == d) return 5;
    if (c == b) return 6;
    return 0;
}
int main( int argc, char* argv[]) {

    MPI_Init(&argc, &argv);
    int size,rank;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Status sta;

    int i, j ,root_rank=0;
    int t_m_r,t_m_c,i_m_r,i_m_c;

    int rank_offset[size]={0},s_rank_offset[size]={0},rank_offset1[size]={0},text_offset[size]={0};
    int *img_r = new int[2000*2000];

    int counts[size]={0},displacements[size]={0};
    int temp_displacements=0,ofset=1,ofset_r=0;
    int t_counts[size]={0},t_displacements[size]={0},tt_displacements[size]={0},et_counts[size]={0},et_displacements[size]={0};

    printf("\nGPU HARDWARE ACTIVATED %d of %d\n",rank,size);


    if (rank == 0){
        Mat image = imread("sample_image/10x10.jpg");
        if (image.empty()) {
            cerr << "Error: Couldn't load input image." << endl;
            return -1;
        }
        Mat hsv_image;
        cvtColor(image, hsv_image, COLOR_BGR2HSV);
        i_m_r = hsv_image.rows ;
        i_m_c = hsv_image.cols ;
        t_m_r = hsv_image.rows / 2;
        t_m_c = hsv_image.cols / 2;

        int k=0;
        for (i = 0; i < hsv_image.rows; i++) {
            for (j = 0; j < hsv_image.cols; j++) {
                Vec3b hsv_pixel = hsv_image.at<Vec3b>(i, j);
                int value = hsv_pixel[2];
                img_r[k]=value;
                printf("%3d ", img_r[k]);
                k++;
            }
        printf("\n");
        }

    }
    // SENDING THE DATA
    MPI_Bcast(&i_m_r,1,MPI_INT,0,MPI_COMM_WORLD);
    MPI_Bcast(&i_m_c,1,MPI_INT,0,MPI_COMM_WORLD);
    MPI_Bcast(&t_m_r,1,MPI_INT,0,MPI_COMM_WORLD);
    MPI_Bcast(&t_m_c,1,MPI_INT,0,MPI_COMM_WORLD);

    int *Final =new int[(t_m_c-2) * (t_m_r-2)];
    Final[(t_m_c-2) * (t_m_r-2)] = {0};
    int img_s[i_m_c*i_m_r];
    int txt_simg[t_m_r][t_m_c] = {0};
    int *my_texton =new int[t_m_c*t_m_r];
    int *my_r_texton =new int[t_m_c*t_m_r];
    int *txt_img =new int[t_m_c*t_m_r];
    txt_img[t_m_c*t_m_r]={0};
    int *Main_res =new int[t_m_c*t_m_r];
    Main_res[t_m_c*t_m_r]={0};


    for (i=0;i<t_m_c;i++){
        if (i!=0 && i%size==0)
            ofset++;
        rank_offset[i%size]=ofset;
        //printf("%d form rank %d coint %d\n", rank_offset[i],rank,i);

    }

    for (i=0;i<size;i++){
        counts[i]=(2*i_m_c)*rank_offset[i];
        displacements[i]=temp_displacements;
        temp_displacements+=(2*i_m_c)*rank_offset[i];
        //printf("vale  rank %d T-count %d T-diaplacemt %d Rank offset%d \n",rank ,counts[i],displacements[i],rank_offset[i]);
    }
    temp_displacements = 0;
    for (i=0;i<size;i++){
        t_counts[i]=(t_m_c)*rank_offset[i];
        t_displacements[i]=temp_displacements;
        temp_displacements+=(t_m_c)*rank_offset[i];
        //printf("vale  rank %d T-count %d T-diaplacemt %d Rank offset%d \n",rank ,t_counts[i],t_displacements[i],rank_offset[i]);
    }


    MPI_Scatter(&rank_offset, 1, MPI_INT,&ofset_r, 1,MPI_INT,0, MPI_COMM_WORLD);
    MPI_Scatterv(img_r, counts, displacements, MPI_INT, &img_s,counts[rank], MPI_INT, 0, MPI_COMM_WORLD);

    /*for (i = 0; i < 2*i_m_c*ofset_r; i++)
        printf("%d form rank %d coint %d\n",img_s[i],rank,i);*/
    int k=0,r=0,skip=0;
    double startTime1 = getCurrentTime();
        for (i = 0; i < (2*i_m_c*ofset_r);) {
            int a, b, c, d;
            a = img_s[i], b = img_s[i + 1], c = img_s[i_m_c+i], d = img_s[i_m_c+i+1];
            int rs = casecheck(a, b, c, d);
            my_texton[r]=rs;
            //printf("%d at vale and pos %d    rank %d\n", my_texton[r],r, rank);
            r++;
            k+=2;
            if (k%i_m_c==0)
                i+=i_m_c+2;
            else
                i+=2;

            skip++;
        }
        double endTime1 = getCurrentTime();
        //printf("vale  rank %d T-count %d T-diaplacemt %d Rank offset%d \n",rank ,t_counts[rank],t_displacements[rank],rank_offset[rank]);
        MPI_Gatherv(my_texton, t_counts[rank], MPI_INT, my_r_texton, t_counts, t_displacements, MPI_INT, root_rank, MPI_COMM_WORLD);

    if (rank == 0){

    printf("___________________\n");
    printf("\nTexton image\n \n");

        for (i = 0; i < t_m_r; i++) {
            for (j = 0; j < t_m_c; j++) {
                txt_simg[i][j]=my_r_texton[i*t_m_c+j];
                printf("%2d  ", txt_simg[i][j]);

            }
            printf("\n");
        }
    }

    ofset=0;
    for (i=0;i<t_m_r-2;i++){
        if (i!=0 && i%size==0)
            ofset++;
        text_offset[i%size]=ofset;
        //printf("%d form rank %d coint\n", text_offset[i%size],rank);
    }

    int tt_counts[size]={0};
    for (i=0;i<size;i++){
        tt_counts[i]=t_m_r*(3+text_offset[i]);
    }

    temp_displacements = 0;
    for (i=0;i<size;i++){
        counts[i]=(t_m_c)*text_offset[i];
        tt_displacements[i]= temp_displacements;
        temp_displacements += t_m_c + counts[i];
        //printf("vale  rank %d size count %d T-count %d T-diaplacemt %d Rank offset%d \n",rank ,tt_counts[i],counts[i],tt_displacements[i],text_offset[i]);
    }



    MPI_Bcast(&text_offset[size],size,MPI_INT,0,MPI_COMM_WORLD);
    MPI_Scatterv(txt_simg, tt_counts, tt_displacements, MPI_INT, txt_img, tt_counts[rank], MPI_INT, 0, MPI_COMM_WORLD);
    //printf("Txt vale  rank %d T-count %d T-diaplacemt %d Rank offset%d \n",rank ,tt_counts[rank],tt_displacements[rank],text_offset[rank]);


    double startTime2 = getCurrentTime();

    printf("___________________\n");
    printf("\nTexton Weight image\n \n");
    k=0;
    for (j=0;j<=text_offset[rank];j++){
        int cols=t_m_c;
        int x=((j+1)*cols)+1;

        for (i=0;i<t_m_c-2;i++){
            x=x+i;
            //printf("x=%d \n", x);
            int txt_imgs[9] = {0};
            const int a=cols+1,b=cols,c=cols-1,d=1,e=1,f=cols-1,g=cols,h=cols+1;

            if (txt_img[x] == txt_img[x-a])//6
                txt_imgs[1] = 0;
            else
                txt_imgs[1] = 1;

            if (txt_img[x] == txt_img[x-b]) //5
                txt_imgs[2] = 0;
            else
                txt_imgs[2] = 1;

            if (txt_img[x] == txt_img[x-c]) //4
                txt_imgs[3] = 0;
            else
                txt_imgs[3] = 1;

            if (txt_img[x] == txt_img[x-d]) //1
                txt_imgs[4] = 0;
            else
                txt_imgs[4] = 1;

            if (txt_img[x] == txt_img[x+e])//1
                txt_imgs[5] = 0;
            else
                txt_imgs[5] = 1;

            if (txt_img[x] == txt_img[x+f]) //4
                txt_imgs[6] = 0;
            else
                txt_imgs[6] = 1;

            if (txt_img[x] == txt_img[x+g]) //5
                txt_imgs[7] = 0;
            else
                txt_imgs[7] = 1;

            if (txt_img[x] == txt_img[x+h]) //6
                txt_imgs[8] = 0;
            else
                txt_imgs[8] = 1;
            x=((j+1)*cols)+1;
            int xor_S = 0;
            const int wgt[9] = {8, 4, 2,16, 0, 1,32, 64, 128};
            xor_S =(txt_imgs[1] * wgt[0])+(txt_imgs[2] * wgt[1])+(txt_imgs[3] * wgt[2])+(txt_imgs[4] * wgt[3])+(txt_imgs[5] * wgt[5])+(txt_imgs[6] * wgt[6])+(txt_imgs[7] * wgt[7])+(txt_imgs[8] * wgt[8]);
            Main_res[k]=xor_S;
            printf("%2d  ", Main_res[k]);
            k++;
        }
        printf("\n");
    }

    double endTime2 = getCurrentTime();

    temp_displacements = 0;
    for (i=0;i<size;i++){
        et_counts[i]=(t_m_c-2)*(text_offset[i] + 1 );
        et_displacements[i]=temp_displacements;
        temp_displacements += (t_m_c-2)*(1+text_offset[i]);
        //printf("vale  rank %d T-count %d T-diaplacemt %d Rank offset%d \n",rank ,et_counts[i],et_displacements[i],text_offset[i]);
    }

    MPI_Gatherv(Main_res, et_counts[rank], MPI_INT, Final, et_counts, et_displacements, MPI_INT, root_rank, MPI_COMM_WORLD);

    /*
    if (rank == 0){
        printf("\n___________________\n");
        printf("\nTexton Weight image\n \n");
        for (i = 0; i < t_m_r-2; i++) {
            for (j = 0; j < t_m_c-2; j++) {
                printf("%2d  ", Final[i*t_m_c+j]);
            }
            printf("\n");
        }
    }
    */

    double totalTime1 = endTime1 - startTime1;
    printf("Time taken by %d Rank Parallel to calculate Texton code: %.5f milliseconds\n",rank, totalTime1);
    double totalTime2 = endTime2 - startTime2;
    printf("Time taken by %d Rank Parallel to calculate LTxXORp code: %.5f milliseconds\n",rank, totalTime2);
    printf("Elapsed time: %f mili seconds\n", totalTime1+totalTime2);

    MPI_Finalize();
    return 0;
}

Overwriting Parallel_MPI_OPENCV.cpp


In [None]:
# @title COMMAND RUN PARALLEL MPI CODE
# Build with the MPI C++ compiler wrapper against OpenCV 4 and, if the build succeeds,
# launch the program with mpirun. "-np 1" starts a single process; raise it to run
# on several ranks. "--allow-run-as-root" is passed because the command runs as root here.
!mpic++ Parallel_MPI_OPENCV.cpp -o app `pkg-config --cflags --libs opencv4` && mpirun --allow-run-as-root -np 1 ./app


GPU HARDWARE ACTIVATED 0 of 1
 84  79  54  57  54  60  66  54  80  59 
 74 198 201 211 205 192 197 207 177  78 
 59 200  69  70  72  70  81  55 213  64 
 60 205  72 203 211 203 215  77 200  62 
 57 199  74 210  88  71 203  73 201  68 
 64 190  70 199  67  67 210  75 198  63 
 60 196  82 222 210 212 213  67 204  64 
 56 215  52  76  71  72  74  71 208  60 
 79 176 214 202 203 199 203 205 187  78 
 62  79  63  61  68  62  65  61  80  60 
___________________

Texton image
 
 0   0   0   0   0  
 0   0   0   0   0  
 0   0   3   0   0  
 0   0   0   0   0  
 5   0   0   0   0  
___________________

Texton Weight image
 
128  64  32  
 1  255  16  
34   4   8  
Time taken by 0 Rank Parallel to calculate Texton code: 0.00101 milliseconds
Time taken by 0 Rank Parallel to calculate LTxXORp code: 0.02734 milliseconds
Elapsed time: 0.028349 mili seconds


In [None]:
# @title CUDA CODE
%%writefile parallel_CUDA_OPENCV.cu

#include <iostream>
#include <stdio.h>
#include <opencv4/opencv2/core.hpp>
#include <opencv4/opencv2/highgui.hpp>
#include <opencv4/opencv2/opencv.hpp>
#include <cuda_runtime.h>
#include <ctime>

using namespace cv;
using namespace std;

// Host-side timer: CLOCK_MONOTONIC reading expressed in milliseconds.
double getCurrentTime()
{
    struct timespec stamp;
    clock_gettime(CLOCK_MONOTONIC, &stamp);

    const double whole_ms      = (double)stamp.tv_sec * 1000.0;
    const double fractional_ms = (double)stamp.tv_nsec / 1000000.0;
    return whole_ms + fractional_ms;
}



// Flattens the calling thread's position in a (up to) 3D grid of (up to) 3D
// blocks into one linear index: blocks are numbered x-fastest, then y, then z,
// and threads inside a block are numbered the same way.
__device__ int getGlobalID_3D_3D(){
  const unsigned int threadsInBlock = blockDim.x * blockDim.y * blockDim.z;
  const unsigned int blockLinear = blockIdx.x
                                 + blockIdx.y * gridDim.x
                                 + gridDim.x * gridDim.y * blockIdx.z;
  const unsigned int threadLinear = (threadIdx.z * (blockDim.x * blockDim.y))
                                  + (threadIdx.y * blockDim.x)
                                  + threadIdx.x;
  return (int)(blockLinear * threadsInBlock + threadLinear);
  }

// Device-side texton classifier for a 2x2 block (a b / c d).
// Returns 7 when all four values are equal, otherwise the code of the first
// matching pair in the order a==b (1), b==d (2), c==d (3), a==c (4),
// a==d (5), c==b (6), and 0 when none of them match.
__device__ int casecheck(int a, int b, int c, int d) {
    const bool uniform = (a == b) && (b == c) && (c == d);
    if (uniform) return 7;

    if (a == b) return 1;
    if (b == d) return 2;
    if (c == d) return 3;
    if (a == c) return 4;
    if (a == d) return 5;
    if (c == b) return 6;
    return 0;
}

// Banner kernel: each launched thread prints the activation message followed
// by a separator line using device-side printf.
__global__ void CUDA_START(){
  printf("\nCUDA HARDWARE ACTIVATED\n"
         "\n____________________\n");
}
// Computes the texton image: one thread per non-overlapping 2x2 block of the
// source image.
//   img_s    row-major source values, rows x cols
//   d_t_img  row-major output, (rows/2) x (cols/2) texton codes
//   rows     source height
//   cols     source width
// Threads whose linear id lies beyond the texton image do nothing, so the grid
// may be larger than the number of textons.
__global__ void calculate_TEXTON_CUDA(int* img_s, int* d_t_img, int rows, int cols)  {
  const int x = getGlobalID_3D_3D();
  const int t_rows = rows / 2;
  const int t_cols = cols / 2;

  // Grids are rounded up to whole blocks; without this guard the surplus
  // threads would read and write past the ends of both buffers.
  if (t_cols <= 0 || x >= t_rows * t_cols)
    return;

  const int q = x % t_cols;             // texton column
  const int r = x / t_cols;             // texton row
  const int i = ((r * 2) * cols) + (q * 2);  // top-left pixel of the 2x2 block

  const int a = img_s[i];
  const int b = img_s[i + 1];
  const int c = img_s[cols + i];
  const int d = img_s[cols + i + 1];

  d_t_img[x] = casecheck(a, b, c, d);
}

// Computes the LTxXORp code of every interior pixel of the texton image: one
// thread per interior pixel.
//   txt_img   row-major texton image, rows x cols (read only)
//   Main_res  row-major output, rows x cols; only interior entries are written,
//             border entries keep whatever the caller stored there
//   rows/cols texton image dimensions
// A neighbour contributes its weight when its texton differs from the centre.
__global__ void calculate_LBP_CUDA(int* txt_img, int* Main_res, int rows, const int cols)  {
  const int i = getGlobalID_3D_3D();
  const int inner_rows = rows - 2;
  const int inner_cols = cols - 2;

  // Surplus threads (and images without an interior) have nothing to do.
  if (inner_rows <= 0 || inner_cols <= 0 || i >= inner_rows * inner_cols)
    return;

  // Map the interior index i to its position in the full image:
  // skip the first row (+cols), the left border (+1) and two border pixels
  // for every interior row already passed.
  const int x = i + (cols + 1 + (i / inner_cols) * 2);
  const int center = txt_img[x];

  // Neighbour offsets are derived from the real row width (they used to be
  // hard-coded for cols == 5). Results are accumulated in a register; the
  // input is never modified, because writing 0/1 flags back into txt_img
  // raced with other threads that were still reading it.
  const int offsets[8] = { -(cols + 1), -cols, -(cols - 1),
                           -1,                  1,
                           cols - 1,     cols,  cols + 1 };
  const int weights[8] = { 8, 4, 2,
                           16,   1,
                           32, 64, 128 };

  int xor_S = 0;
  for (int n = 0; n < 8; ++n) {
    if (txt_img[x + offsets[n]] != center)
      xor_S += weights[n];
  }
  Main_res[x] = xor_S;
}

// CUDA entry point: loads "sample_image/10x10.jpg", prints its HSV value
// channel, computes the texton image and the LTxXORp image on the GPU, prints
// both and reports the time of each kernel.
// Returns 0 on success, -1 when the image cannot be loaded or a kernel fails.
int main() {
  CUDA_START<<<1,1>>>();
  int i, j;
  int t_m_r,t_m_c;
  int i_m_r,i_m_c;
  Mat image = imread("sample_image/10x10.jpg");

  if (image.empty()) {
    cerr << "Error: Couldn't load input image." << endl;
    return -1;
    }

  Mat hsv_image;
  cvtColor(image, hsv_image, COLOR_BGR2HSV);
  i_m_r = hsv_image.rows ;
  i_m_c = hsv_image.cols ;
  t_m_r = hsv_image.rows / 2;
  t_m_c = hsv_image.cols / 2;

  const size_t pixelCount  = (size_t)i_m_r * (size_t)i_m_c;
  const size_t textonCount = (size_t)t_m_r * (size_t)t_m_c;

  // Host staging buffer sized to the actual image (it used to be a fixed
  // 1000x1000 array that larger images would overflow).
  int *h_img_r = new int[pixelCount];

  int k=0;
  for (i = 0; i < hsv_image.rows; i++) {
    for (j = 0; j < hsv_image.cols; j++) {
      Vec3b hsv_pixel = hsv_image.at<Vec3b>(i, j);
      int value = hsv_pixel[2];
      h_img_r[k]=value;
      printf("%3d ", h_img_r[k]);
      k++;
      }
    printf("\n");
  }
  printf("\n____________________\n");



  printf("\nTexton image\n \n");

  int* d_img = NULL;
  int* d_t_img = NULL;
  const size_t imageSize   = pixelCount  * sizeof(int);
  const size_t t_imageSize = textonCount * sizeof(int);

  cudaMalloc((void **)&d_img, imageSize);
  // Kept at full-image size, an upper bound on the texton image.
  cudaMalloc((void **)&d_t_img, imageSize);


  cudaMemcpy(d_img, h_img_r, imageSize, cudaMemcpyHostToDevice);


  dim3 threadsPerBlock(16, 16);
  dim3 numBlocks1((i_m_c + threadsPerBlock.x - 1) / threadsPerBlock.x, (i_m_r + threadsPerBlock.y - 1) / threadsPerBlock.y);
  dim3 numBlocks2((t_m_c + threadsPerBlock.x - 1) / threadsPerBlock.x, (t_m_r + threadsPerBlock.y - 1) / threadsPerBlock.y);


  // Kernel launches return immediately. Drain earlier work before starting the
  // clock and wait for the kernel before stopping it, otherwise only the launch
  // overhead is measured.
  cudaDeviceSynchronize();
  double startTime1 = getCurrentTime();

  calculate_TEXTON_CUDA<<<numBlocks1, threadsPerBlock>>>(d_img, d_t_img, i_m_r , i_m_c);
  cudaError_t status = cudaDeviceSynchronize();

  double endTime1 = getCurrentTime();
  if (status != cudaSuccess) {
    fprintf(stderr, "Error: texton kernel failed: %s\n", cudaGetErrorString(status));
    cudaFree(d_t_img);
    cudaFree(d_img);
    delete[] h_img_r;
    return -1;
  }

  // rows x cols (the buffer used to be rows x rows, too small for wide images).
  int *h_T_img = new int[textonCount];
  cudaMemcpy(h_T_img, d_t_img, t_imageSize, cudaMemcpyDeviceToHost);

  for (i = 0; i < t_m_r; i++) {
    for (j = 0; j < t_m_c; j++) {
      printf("%4d", h_T_img[i*t_m_c+j]);
        }
    printf("\n");
  }

  printf("\n___________________\n");
  printf("\nTexton Weight image\n \n");


  // Device buffers for the second kernel. They are obtained from cudaMalloc
  // only; the former host `new` allocations were leaked as soon as cudaMalloc
  // overwrote the pointers.
  int *h_Txt_img = NULL;    // device output (name kept from the original code)
  cudaMalloc((void **)&h_Txt_img, t_imageSize);

  int *dr_Txt_img = NULL;   // device input: the texton image
  cudaMalloc((void **)&dr_Txt_img, t_imageSize);

  // Copy exactly the texton image; copying imageSize bytes read past the end
  // of h_T_img. The output is pre-seeded with the texton image so the border
  // pixels, which the kernel never writes, keep their texton values.
  cudaMemcpy(dr_Txt_img, h_T_img, t_imageSize, cudaMemcpyHostToDevice);
  cudaMemcpy(h_Txt_img, h_T_img, t_imageSize, cudaMemcpyHostToDevice);

  double startTime2 = getCurrentTime();

  calculate_LBP_CUDA<<<numBlocks2, threadsPerBlock>>>(dr_Txt_img, h_Txt_img, t_m_r , t_m_c);
  //calculate_LBP_CUDA<<<1,25>>>(dr_Txt_img, h_Txt_img, t_m_r , t_m_c);
  status = cudaDeviceSynchronize();

  double endTime2 = getCurrentTime();
  if (status != cudaSuccess) {
    fprintf(stderr, "Error: LTxXORp kernel failed: %s\n", cudaGetErrorString(status));
    cudaFree(dr_Txt_img);
    cudaFree(h_Txt_img);
    cudaFree(d_t_img);
    cudaFree(d_img);
    delete[] h_T_img;
    delete[] h_img_r;
    return -1;
  }

  int *MM_T_img = new int[textonCount];
  cudaMemcpy(MM_T_img, h_Txt_img, t_imageSize, cudaMemcpyDeviceToHost);

  for (i = 0; i < t_m_r; i++) {
    for (j = 0; j < t_m_c; j++) {
      printf("%4d", MM_T_img[i*t_m_c+j]);
        }
    printf("\n");
  }
  double totalTime1 = endTime1 - startTime1;
  printf("Time taken by Parallel to calculate Texton code: %.5f milliseconds\n", totalTime1);
  double totalTime2 = endTime2 - startTime2;
  printf("Time taken by Parallel to calculate LTxXORp code: %.5f milliseconds\n", totalTime2);
  printf("Elapsed time: %f mili seconds\n", totalTime1+totalTime2);

  // Release device and host memory.
  cudaFree(dr_Txt_img);
  cudaFree(h_Txt_img);
  cudaFree(d_t_img);
  cudaFree(d_img);
  delete[] MM_T_img;
  delete[] h_T_img;
  delete[] h_img_r;

  return 0;
}


Overwriting parallel_CUDA_OPENCV.cu


In [None]:
# @title COMMAND RUN PARALLEL CUDA CODE
# Compile the CUDA source with nvcc against OpenCV 4 (flags resolved by pkg-config)
# and run the resulting binary if the build succeeds.
!nvcc -o parallel parallel_CUDA_OPENCV.cu `pkg-config --cflags --libs opencv4` && ./parallel

 84  79  54  57  54  60  66  54  80  59 
 74 198 201 211 205 192 197 207 177  78 
 59 200  69  70  72  70  81  55 213  64 
 60 205  72 203 211 203 215  77 200  62 
 57 199  74 210  88  71 203  73 201  68 
 64 190  70 199  67  67 210  75 198  63 
 60 196  82 222 210 212 213  67 204  64 
 56 215  52  76  71  72  74  71 208  60 
 79 176 214 202 203 199 203 205 187  78 
 62  79  63  61  68  62  65  61  80  60 

____________________

Texton image
 

CUDA HARDWARE ACTIVATED

____________________
   0   0   0   0   0
   0   0   0   0   0
   0   0   3   0   0
   0   0   0   0   0
   5   0   0   0   0

___________________

Texton Weight image
 
   0   0   0   0   0
   0 128  64  32   0
   0   1 255  16   0
   0  34   4   8   0
   5   0   0   0   0
Time taken by Parallel to calculate Texton code: 0.05442 milliseconds
Time taken by Parallel to calculate LTxXORp code: 0.02883 milliseconds
Elapsed time: 0.083247 mili seconds
