In [1]:
! mkdir include
! mkdir src
! mkdir bin

In [2]:
%%file include/adj_matrix_reader.hpp

#ifndef ADJ_MATRIX_READER_HPP
#define ADJ_MATRIX_READER_HPP

#include <string>

/// <summary>
/// Read a CSV file containing an adjacency matrix.
/// </summary>
/// <param name="filename">Name of the file to read</param>
/// <param name="delim">Field delimiter used in the CSV file</param>
/// <param name="numberOfNodes">Output: number of nodes of the graph (the number of fields found on the first line of the file)</param>
/// <return>The matrix as a malloc-allocated array of row pointers, one malloc-allocated row per line read</return>
int** readAdjMatrixCSV(std::string filename, const char delim, int* numberOfNodes);

#endif


In [3]:
%%file include/adj_matrix_utils.hpp

#ifndef ADJ_MATRIX_UTILS_HPP
#define ADJ_MATRIX_UTILS_HPP

#include <stdbool.h>

#include "num_macro.hpp"

/// Parameters used when generating a graph
/// DENSITY: threshold compared against (rand() % 100) to decide whether an edge is created
#define DENSITY 60
/// MIN_COST: offset added to every generated edge cost
#define MIN_COST 1
/// MAX_COST: modulus applied to rand() when drawing an edge cost
#define MAX_COST 20

/// Map a condition to the string literal "true" or "false".
/// The argument is parenthesized so that compound expressions
/// (e.g. `a || b`) are evaluated as a whole before the selection.
#define bool_to_string(cond) ((cond) ? "true" : "false")

// ---------------------------------------------------------------
//  PRINT UTILS

/// Print `size` elements of `array` on one line, formatted as "[a, b, ...]"
void print_array(int *array, int size);
/// Print a matrix given as an array of row pointers: m rows of n elements each
void print_matrix(int **matrix, int m, int n);
/// Print `val` with at least two digits, or "--" when `val` is not lower than `infinity`
void print_element(int val, int infinity);

// ---------------------------------------------------------------
// MATRIX GENERATION, COMPARE and others utils

/// Allocate an n x n matrix (array of row pointers) and fill it with a
/// pseudo-random symmetric graph with zero diagonal; srand(seed) is called first
int** generate_graph(int n, int seed);
/// Tell whether two m x n matrices (arrays of row pointers) hold the same values
bool same_matrix(int **matrix_1, int **matrix_2, int m, int n);

// ---------------------------------------------------------------
// ARRAY MATRIX FUNCTIONS VARIANTS

/// Print a matrix stored in a flat array, m rows of n elements
void print_arr_matrix(int *matrix, int m, int n);
/// Fill a caller-provided flat n*n array with a pseudo-random symmetric graph
/// with zero diagonal; srand(seed) is called first
void populate_arr_graph(int* arr_matrix, int n, int seed);
/// Copy the n*n elements of `src` into `target`
void copy_arr_graph(int* src, int* target, int n);
/// Compare two flat matrices element by element
/// (see the definition for the exact number of elements covered by `n`)
bool same_arr_matrix(int *matrix_1, int *matrix_2, int n);

#endif // ADJ_MATRIX_UTILS_HPP

In [4]:
%%file include/cuda_errors_utils.cuh

#ifndef CUDA_ERRORS_UTILS_CUH
#define CUDA_ERRORS_UTILS_CUH


/// Forward a CUDA status code to handle_error together with the file name and
/// line number of the call site.
#define HANDLE_ERROR(err) (handle_error(err, __FILE__, __LINE__))

/// If `err` is not cudaSuccess, print its description with the given
/// file/line and terminate the process.
void handle_error(cudaError_t err, const char *file, int line);

/// Query cudaGetLastError(); on error print `msg` with the error description
/// and terminate the process.
void check_CUDA_error(const char *msg);

#endif

In [5]:
%%file include/device_floyd_warshall_v1_2.cuh

#ifndef DEVICE_FLOYD_WARSHALL_V1_2_CUH
#define DEVICE_FLOYD_WARSHALL_V1_2_CUH

/// Maximum number of threads per block accepted by the host wrapper.
/// (Hard-coded; it would be enough to query the device properties instead.)
#define MAX_BLOCK_SIZE 1024

/// Macro to get block starting position (of a column or of a row).
/// Arguments are parenthesized so that expressions such as `t+1` expand correctly.
#define BLOCK_START(block_index,B) ((block_index) * (B))

/// Macro to get block ending position (of a column or of a row), exclusive.
/// Arguments are parenthesized so that expressions such as `t+1` expand correctly.
#define BLOCK_END(block_index,B) (((block_index)+1) * (B))

/// Round `t`, phase 1: update the self-dependent (diagonal) tile of the n x n matrix.
__global__ void execute_round_device_v1_2_phase_1(int *matrix, int n, int t, int B);
/// Round `t`, phase 2: update the tiles sharing a row or a column with the diagonal tile.
__global__ void execute_round_device_v1_2_phase_2(int *matrix, int n, int t, int B);
/// Round `t`, phase 3: update the tiles sharing neither a row nor a column with the diagonal tile.
__global__ void execute_round_device_v1_2_phase_3(int *matrix, int n, int t, int B);

/// Host wrapper: copies `matrix` (n*n ints) to the device, runs n/B rounds of
/// the three phase kernels with blocking factor B and copies the result back.
void floyd_warshall_blocked_device_v1_2(int *matrix, int n, int B);

#endif

In [6]:
%%file include/host_floyd_warshall.hpp

#ifndef HOST_FLOYD_WARSHALL
#define HOST_FLOYD_WARSHALL


#include "num_macro.hpp"

// ---------------------------------------------------------------------------
// Matrix data structure version

/// Classic triple-loop Floyd-Warshall on an n x n matrix of row pointers, in place.
void floyd_warshall(int **matrix, int n);
/// Blocked Floyd-Warshall: n/B rounds, each processing B x B tiles in three phases.
void floyd_warshall_blocked(int **matrix, int n, int B);
/// Relax the tile at tile coordinates (row, col) through the intermediate
/// nodes t*B .. (t+1)*B - 1.
void execute_round(int **matrix, int n, int t, int row, int col, int B);

// ---------------------------------------------------------------------------
// Array data structure version

/// Same as floyd_warshall, on a flat array indexed as matrix[i*n + j].
void arr_floyd_warshall(int *matrix, int n);
/// Same as floyd_warshall_blocked, on a flat array indexed as matrix[i*n + j].
void arr_floyd_warshall_blocked(int *matrix, int n, int B);
/// Same as execute_round, on a flat array indexed as matrix[i*n + j].
void arr_execute_round(int *matrix, int n, int t, int row, int col, int B);

#endif

In [7]:
%%file include/num_macro.hpp

#ifndef NUM_MACRO_HPP
#define NUM_MACRO_HPP

/// Big M, value that should be threated as "infinity"
#define INF __INT16_MAX__

/// Get minimum of two values
#define min(a,b) ((a < b) ? a : b)

/// Sum two numbers if they are not infinite, else return infinity
#define sum_if_not_infinite(x1,x2,infinity) ((x1==infinity) || (x2==infinity)) ? infinity : (x1+x2)

#endif

In [8]:
%%file include/performance_test.cuh

#ifndef PERFORMANCE_TEST_CUH
#define PERFORMANCE_TEST_CUH

/// Run `floyd_warshall_arr_algorithm` `number_of_tests` times on generated
/// input_size x input_size graphs, wrapping each run between
/// cudaProfilerStart() and cudaProfilerStop().
void do_nvprof_performance_test(void (*floyd_warshall_arr_algorithm)(int * matrix, int n, int B), int input_size, int blocking_factor, int number_of_tests, int seed);


#endif

In [9]:
%%file include/statistical_test.hpp

#ifndef STATISTICAL_TEST_HPP
#define STATISTICAL_TEST_HPP

/// Sentinel for `use_always_seed`: when this value is passed, a clock()-based
/// seed is drawn for every generated instance instead of a fixed one.
# define RANDOM_SEED 0


/// Run `function_to_test` on `input_instance` and compare its output with the
/// one arr_floyd_warshall produces on a copy stored in `test_instance_space`.
/// Returns the outcome of the same_arr_matrix comparison.
bool test_arr_floyd_warshall(
    void (*function_to_test) (int* arr_matrix, int n, int b), 
    int *input_instance, int *test_instance_space, 
    int input_size, int blocking_factor);


/// Repeat test_arr_floyd_warshall `n_tests` times on generated instances of
/// the given size and return the number of failed comparisons.
int do_arr_floyd_warshall_statistical_test(
    void (*function_to_test) (int* arr_matrix, int n, int b), 
    int input_size, int blocking_factor, 
    int n_tests, int use_always_seed, 
    bool stop_if_fail, int progress_print_fraction, bool print_failed_tests);


/// Run do_arr_floyd_warshall_statistical_test for input sizes and blocking
/// factors that double at every step, returning the cumulative error count.
int multi_size_statistical_test(
    void (*function_to_test)  (int* arr_matrix, int n, int b), 
    int start_input_size, int end_input_size, 
    int min_blocking_factor, int max_blocking_factor, 
    int n_tests_per_round, int use_always_seed, 
    bool stop_if_fail, bool print_failed_tests);

#endif

In [10]:
# END HEADER REGION
# ----------------------------------------------------------------------------------------------------------------------------------------
# START SRC REGION

In [11]:
%%file src/adj_matrix_reader.cpp

#include "../include/adj_matrix_reader.hpp"

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>

/// Count the delimiter-separated fields found in one line of the matrix file.
int _getNumberOfNodes(std::string adjMatrixLine, const char delim) {

	// Tokenise with std::getline, as shown in:
	// https://java2blog.com/split-string-space-cpp/#Using_getline_Method
	std::istringstream tokens(adjMatrixLine);
	std::string field;

	int count = 0;
	for (; std::getline(tokens, field, delim); ++count) {
		// nothing to do: only the number of extracted fields matters
	}

	return count;
}

/// Convert every delimiter-separated field of `adjMatrixLine` to an int with
/// std::stoi and store it, in order, in row `lineNumber` of `adjMatrix`.
/// Always returns 0.
int _parseLine(std::string adjMatrixLine, const char delim, int lineNumber, int** adjMatrix) {

	// Tokenise with std::getline, as shown in:
	// https://java2blog.com/split-string-space-cpp/#Using_getline_Method
	std::istringstream tokens(adjMatrixLine);
	std::string field;

	for (int column = 0; std::getline(tokens, field, delim); ++column) {
		const int parsed = std::stoi(field);
		adjMatrix[lineNumber][column] = parsed;
	}

	return 0;
}


/// Read a delimiter-separated text file containing a square adjacency matrix.
///
/// The number of fields on the first line fixes the matrix size; the file must
/// then contain exactly that many non-empty lines, each with that many fields.
/// Empty lines are skipped.
///
/// @param filename       path of the file to read
/// @param delim          field delimiter
/// @param numberOfNodes  output: matrix size on success, 0 on failure
/// @return malloc-allocated array of malloc-allocated rows, or nullptr when the
///         file cannot be opened, is empty, is not square, contains a field
///         that is not an integer, or memory is exhausted
int** readAdjMatrixCSV(const std::string filename, const char delim, int *numberOfNodes) {

	*numberOfNodes = 0;

	std::ifstream fs(filename);
	if (!fs.is_open()) {
		std::cerr << "readAdjMatrixCSV: cannot open '" << filename << "'" << std::endl;
		return nullptr;
	}

	// read first line: its field count is the number of nodes
	std::string line;
	if (!std::getline(fs, line)) {
		std::cerr << "readAdjMatrixCSV: '" << filename << "' is empty" << std::endl;
		return nullptr;
	}

	const int nodes = _getNumberOfNodes(line, delim);
	if (nodes <= 0) {
		std::cerr << "readAdjMatrixCSV: first line of '" << filename << "' has no fields" << std::endl;
		return nullptr;
	}

	// zero-initialised so that the cleanup path can free every slot safely
	int** adjMatrix = (int **) calloc(nodes, sizeof(int*));
	if (adjMatrix == nullptr) {
		std::cerr << "readAdjMatrixCSV: out of memory" << std::endl;
		return nullptr;
	}

	// parse all lines and fill adjMatrix
	int lineNumber = 0;
	bool ok = true;
	do {
		if (line.empty()) continue;

		// reject extra rows and rows of the wrong width before writing anything
		if (lineNumber >= nodes || _getNumberOfNodes(line, delim) != nodes) {
			std::cerr << "readAdjMatrixCSV: '" << filename << "' is not a "
			          << nodes << "x" << nodes << " matrix" << std::endl;
			ok = false;
			break;
		}

		adjMatrix[lineNumber] = (int*) malloc(sizeof(int) * nodes);
		if (adjMatrix[lineNumber] == nullptr) {
			std::cerr << "readAdjMatrixCSV: out of memory" << std::endl;
			ok = false;
			break;
		}

		try {
			_parseLine(line, delim, lineNumber, adjMatrix);
		} catch (const std::exception& e) {
			// std::stoi throws on fields that are not valid integers
			std::cerr << "readAdjMatrixCSV: invalid value in '" << filename
			          << "': " << e.what() << std::endl;
			ok = false;
			break;
		}

		lineNumber++;
	} while (std::getline(fs, line));

	if (ok && lineNumber != nodes) {
		std::cerr << "readAdjMatrixCSV: '" << filename << "' has " << lineNumber
		          << " rows, expected " << nodes << std::endl;
		ok = false;
	}

	if (!ok) {
		for (int r = 0; r < nodes; r++) {
			free(adjMatrix[r]);
		}
		free(adjMatrix);
		return nullptr;
	}

	*numberOfNodes = nodes;
	return adjMatrix;
}


In [12]:
%%file src/adj_matrix_utils.cpp

#include "../include/adj_matrix_utils.hpp"

#include <stdio.h>
#include <stdlib.h>

// ---------------------------------------------------------------
//  PRINT UTILS

/// Print an m x n matrix (array of row pointers) between "[" and "]" lines,
/// delegating each row to print_array after a two-space indent.
void print_matrix(int **matrix, int m, int n) {
    printf("[\n");

    int row = 0;
    while (row < m) {
        printf("  ");
        print_array(matrix[row], n);
        ++row;
    }

    printf("]\n");
}

/// Print `size` elements on one line as "[a, b, ...]"; each element is
/// rendered by print_element with INF as the infinity value.
void print_array(int *array, int size) {
    printf("[");

    for (int pos = 0; pos < size; ++pos) {
        print_element(array[pos], INF);

        // separator after every element but the last one
        const bool has_next = (pos + 1 < size);
        if (has_next) {
            printf(", ");
        }
    }

    printf("]\n");
}

/// Print `val` zero-padded to two digits, or "--" when it is not lower than
/// `infinity`.
void print_element(int val, int infinity) {
    if (val >= infinity) {
        printf("--");
        return;
    }

    printf("%02d", val);
}

// ---------------------------------------------------------------
// MATRIX GENERATION, COMPARE and others utils

/// Tell whether two m x n matrices (arrays of row pointers) hold the same
/// values; stops at the first mismatch.
bool same_matrix(int **matrix_1, int **matrix_2, int m, int n) {
    int row = 0;
    while (row < m) {
        int col = 0;
        while (col < n) {
            const bool differs = (matrix_1[row][col] != matrix_2[row][col]);
            if (differs) {
                return false;
            }
            ++col;
        }
        ++row;
    }
    return true;
}

/// Allocate an n x n matrix (array of row pointers) and fill it with a
/// pseudo-random undirected graph.
///
/// srand(seed) is called first. For every pair i < j two rand() values are
/// drawn: the first decides whether the edge exists (rand() % 100 <= DENSITY),
/// the second gives its cost ((rand() % MAX_COST) + MIN_COST). Missing edges
/// are stored as INF, the diagonal is 0 and the matrix is symmetric.
int** generate_graph(int n, int seed) {

    int **graph = (int **) malloc(sizeof(int *) * n);
    for (int row = 0; row < n; ++row) {
        graph[row] = (int *) malloc(sizeof(int) * n);
    }

    srand(seed);
    for (int row = 0; row < n; ++row) {
        graph[row][row] = 0;

        for (int col = row + 1; col < n; ++col) {
            // both draws always happen, in this order, whether or not the edge exists
            const int edge_draw = rand() % 100;
            const int cost_draw = rand() % MAX_COST;

            int weight = INF;
            if (edge_draw <= DENSITY) {
                weight = cost_draw + MIN_COST;
            }

            // non-oriented graph: mirror the value across the diagonal
            graph[row][col] = weight;
            graph[col][row] = weight;
        }
    }

    return graph;
}

// ---------------------------------------------------------------
// ARRAY MATRIX FUNCTIONS VARIANTS

/// Print a matrix stored row-major in a flat array: m rows of n elements,
/// between "[" and "]" lines.
void print_arr_matrix(int *matrix, int m, int n) {
    printf("[\n");
    for (int i = 0; i < m; i++) {
        printf("  ");
        // row i starts n elements after row i-1
        print_array(&(matrix[i*n]), n);
    }
    printf("]\n");
}

/// Tell whether two n x n matrices stored in flat arrays hold the same values.
///
/// @param matrix_1  first matrix, n*n elements
/// @param matrix_2  second matrix, n*n elements
/// @param n         number of rows/columns (not the number of elements)
/// @return true when all n*n elements match (trivially true for n <= 0)
bool same_arr_matrix(int *matrix_1, int *matrix_2, int n) {
    if (n <= 0) return true;

    // computed in size_t so that n*n cannot overflow an int
    const size_t total = (size_t) n * (size_t) n;
    for (size_t i = 0; i < total; i++) {
        if(matrix_1[i] != matrix_2[i]) return false;
    }
    return true;
}

/// Fill a caller-provided flat n*n array with a pseudo-random undirected graph.
///
/// srand(seed) is called first. For every pair i < j two rand() values are
/// drawn: the first decides whether the edge exists (rand() % 100 <= DENSITY),
/// the second gives its cost ((rand() % MAX_COST) + MIN_COST). Missing edges
/// are stored as INF, the diagonal is 0 and the matrix is symmetric.
void populate_arr_graph(int* arr_matrix, int n, int seed) {

    srand(seed);

    for (int row = 0; row < n; ++row) {
        arr_matrix[row*n + row] = 0;

        for (int col = row + 1; col < n; ++col) {
            // both draws always happen, in this order, whether or not the edge exists
            const int edge_draw = rand() % 100;
            const int cost_draw = rand() % MAX_COST;

            int weight = INF;
            if (edge_draw <= DENSITY) {
                weight = cost_draw + MIN_COST;
            }

            // non-oriented graph: mirror the value across the diagonal
            arr_matrix[row*n + col] = weight;
            arr_matrix[col*n + row] = weight;
        }
    }
}

/// Copy an n x n matrix stored in a flat array.
///
/// @param src     source matrix, n*n elements
/// @param target  destination buffer, at least n*n elements
/// @param n       number of rows/columns; nothing is copied when n <= 0
void copy_arr_graph(int* src, int* target, int n) {
    if (n <= 0) return;

    // computed in size_t: avoids int overflow of n*n and a signed/unsigned comparison
    const size_t total = (size_t) n * (size_t) n;
    for (size_t i = 0; i < total; i++)
    {
        target[i] = src[i];
    }
}

In [13]:
%%file src/cuda_errors_utils.cu

#include "../include/cuda_errors_utils.cuh"

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdio.h"

/// Abort the program when a CUDA runtime call failed.
///
/// @param err   status returned by the CUDA call
/// @param file  source file of the call site (see HANDLE_ERROR)
/// @param line  source line of the call site (see HANDLE_ERROR)
///
/// On any status other than cudaSuccess the error description is written to
/// stderr (like check_CUDA_error does) and the process exits with EXIT_FAILURE.
void handle_error(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit(EXIT_FAILURE);
    }
}

/// Read (and clear) the last CUDA error with cudaGetLastError(); when it is
/// not cudaSuccess, report `msg` and the error description on stderr and exit
/// with status -1.
void check_CUDA_error(const char *msg) {
    const cudaError_t last = cudaGetLastError();
    if (last == cudaSuccess) {
        return;
    }

    fprintf(stderr, "ERRORE CUDA: >%s<: >%s<. Eseguo: EXIT\n", msg, cudaGetErrorString(last) );
    exit(-1);
}

In [None]:
%%file src/device_floyd_warshall_v1_2.cu

#include "../include/device_floyd_warshall_v1_2.cuh"

#include <cassert>

#include "../include/cuda_errors_utils.cuh"
#include "../include/num_macro.hpp"


/// Blocked Floyd-Warshall executed on the GPU (version 1.2).
///
/// Copies the row-major n x n matrix to the device, runs n/B rounds made of
/// three kernels each (phase 1, 2 and 3) and copies the result back in place.
///
/// @param matrix  host pointer to n*n ints, overwritten with the result
/// @param n       number of rows/columns of the matrix
/// @param B       blocking factor: must divide n and satisfy B*B <= MAX_BLOCK_SIZE
void floyd_warshall_blocked_device_v1_2(int *matrix, int n, int B) {

    assert(B > 0);                          // B is used as a divisor
    assert(n%B == 0);                       // B must divide n
    assert(B*B<=MAX_BLOCK_SIZE);            // B*B cannot exceed max block size

    if (n <= 0) return;                     // nothing to compute

    // computed in size_t: n*n evaluated in int overflows for large n
    const size_t matrix_bytes = (size_t) n * (size_t) n * sizeof(int);

    int *dev_rand_matrix;
    HANDLE_ERROR(cudaMalloc( (void**) &dev_rand_matrix, matrix_bytes));
    HANDLE_ERROR(cudaMemcpy(dev_rand_matrix, matrix, matrix_bytes, cudaMemcpyHostToDevice));

    int num_rounds = n/B;

    // Launch configurations do not depend on the round.
    // Phase 1 uses one B x B block; phases 2 and 3 use n*n threads,
    // divided in num_rounds*num_rounds blocks of B x B threads.
    const dim3 num_blocks_phase_1(1, 1);
    const dim3 threads_per_block(B, B);
    const dim3 num_blocks_phase_2_3(num_rounds, num_rounds);

    for(int t = 0; t < num_rounds; t++) {

        // All kernels are issued on the default stream, so they execute in
        // launch order; cudaGetLastError() reports invalid launch configurations.

        // phase 1: self-dependent block
        execute_round_device_v1_2_phase_1<<<num_blocks_phase_1, threads_per_block>>>(dev_rand_matrix, n, t, B);
        HANDLE_ERROR(cudaGetLastError());

        // phase 2: all blocks that share a row or a column with the self dependent one
        execute_round_device_v1_2_phase_2<<<num_blocks_phase_2_3, threads_per_block>>>(dev_rand_matrix, n, t, B);
        HANDLE_ERROR(cudaGetLastError());

        // phase 3: all the remaining blocks, so all the blocks that don't share a row or a col with t
        execute_round_device_v1_2_phase_3<<<num_blocks_phase_2_3, threads_per_block>>>(dev_rand_matrix, n, t, B);
        HANDLE_ERROR(cudaGetLastError());
    }

    // surface any error raised while the kernels were executing
    HANDLE_ERROR(cudaDeviceSynchronize());

    HANDLE_ERROR(cudaMemcpy(matrix, dev_rand_matrix, matrix_bytes, cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaFree(dev_rand_matrix));
}

/// Round t, phase 1: relax the self-dependent (diagonal) tile.
///
/// The host wrapper launches a single block of B x B threads; every thread
/// owns one cell of tile (t, t) of the row-major n x n matrix.
__global__ void execute_round_device_v1_2_phase_1(int *matrix, int n, int t, int B) {

    // Launched block and corresponding position in the matrix
    //
    //  t
    //
    //  .   .   .   .   .   .
    //  .   .   .   .   .   .
    //  .   .   .   .   .   .
    //  .   .   .   t   .   .
    //  .   .   .   .   .   .
    //  .   .   .   .   .   .

    // offset of this thread inside the launched grid
    const int offset_x = threadIdx.x + blockIdx.x * blockDim.x;
    const int offset_y = threadIdx.y + blockIdx.y * blockDim.y;

    // cell owned by this thread, shifted into the diagonal tile
    const int row = offset_x + t * B;
    const int col = offset_y + t * B;
    const int cell = row*n + col;

    const int k_first = BLOCK_START(t,B);
    const int k_limit = BLOCK_END(t,B);

    // for each intermediate node k with t*B <= k < (t+1)*B
    for (int k = k_first; k < k_limit; ++k) {

        const int through_k = sum_if_not_infinite(matrix[row*n + k], matrix[k*n + col], INF);

        if (matrix[cell] > through_k) {
            matrix[cell] = through_k;
        }

        // every thread of the block finishes step k before any starts step k+1
        __syncthreads();
    }
}

/// Round t, phase 2: relax the tiles that share a row or a column with the
/// self-dependent tile (t, t).
///
/// The host wrapper launches (n/B) x (n/B) blocks of B x B threads, one thread
/// per matrix cell; threads whose cell is not in a phase-2 tile stay idle.
__global__ void execute_round_device_v1_2_phase_2(int *matrix, int n, int t, int B) {

    // Launched blocks and corresponding position in the matrix
    // ("-" and "." blocks are just kept inactive using IF statement)

    //  .   .   .   U1  .   .
    //  .   .   .   U2  .   .
    //  .   .   .   U3  .   .
    //  L1  L2  L3  -   R1  R2
    //  .   .   .   D1  .   .
    //  .   .   .   D2  .   .

    int i = threadIdx.x + blockIdx.x * blockDim.x;  // row
    int j = threadIdx.y + blockIdx.y * blockDim.y;  // col

    const int block_start = BLOCK_START(t,B);
    const int block_end = BLOCK_END(t,B);

    // Range tests are written as two explicit comparisons: `a <= x < b` would
    // compare the boolean result of `a <= x` against b.
    const bool row_in_block = (block_start <= i) && (i < block_end);
    const bool col_in_block = (block_start <= j) && (j < block_end);

    // A phase-2 cell has exactly one of its two indexes inside the s.d. block
    // range; the predicate does not depend on k, so it is evaluated once.
    const bool is_phase_2_cell = (i < n) && (j < n) && (row_in_block != col_in_block);

    // for each intermediate node k with t*B <= k < (t+1)*B
    for (int k = block_start; k < block_end; k++) {

        if (is_phase_2_cell) {

            int b = sum_if_not_infinite(matrix[i*n + k], matrix[k*n + j], INF);

            if (b < matrix[i*n + j]) {
                matrix[i*n + j] = b;
            }
        }

        // outside the branch: the barrier must be reached by every thread of the block
        __syncthreads();
    }
}

/// Round t, phase 3: relax every tile that shares neither a row nor a column
/// with the self-dependent tile (t, t).
///
/// The host wrapper launches (n/B) x (n/B) blocks of B x B threads, one thread
/// per matrix cell; threads whose cell is in row band t or column band t stay idle.
__global__ void execute_round_device_v1_2_phase_3(int *matrix, int n, int t, int B) {

    // Launched blocks and corresponding position in the matrix
    // ("-" blocks are just kept inactive using IF statement)

    //  UL  UL  UL  -   UR  UR
    //  UL  UL  UL  -   UR  UR
    //  UL  UL  UL  -   UR  UR
    //  -   -   -   -   -   -
    //  DL  DL  DL  -   DR  DR
    //  DL  DL  DL  -   DR  DR

    const int row = threadIdx.x + blockIdx.x * blockDim.x;
    const int col = threadIdx.y + blockIdx.y * blockDim.y;
    const int cell = row*n + col;

    const int band_first = BLOCK_START(t,B);
    const int band_limit = BLOCK_END(t,B);

    // the cell is active when both its row and its column lie outside band t
    const bool row_outside = (row >= band_limit) || (row < band_first);
    const bool col_outside = (col < band_first) || (col >= band_limit);
    const bool active = row_outside && col_outside;

    // for each intermediate node k with t*B <= k < (t+1)*B
    for (int k = band_first; k < band_limit; ++k) {

        if (active) {

            const int through_k = sum_if_not_infinite(matrix[row*n + k], matrix[k*n + col], INF);

            if (matrix[cell] > through_k) {
                matrix[cell] = through_k;
            }
        }

        // kept outside the branch so that every thread of the block reaches it
        __syncthreads();
    }
}



In [14]:
%%file src/device_floyd_warshall_v1_2.cu

#include "../include/device_floyd_warshall_v1_2.cuh"

#include <cassert>

#include "../include/cuda_errors_utils.cuh"
#include "../include/num_macro.hpp"


/// Blocked Floyd-Warshall executed on the GPU (version 1.2).
///
/// Copies the row-major n x n matrix to the device, runs n/B rounds made of
/// three kernels each (phase 1, 2 and 3) and copies the result back in place.
///
/// @param matrix  host pointer to n*n ints, overwritten with the result
/// @param n       number of rows/columns of the matrix
/// @param B       blocking factor: must divide n and satisfy B*B <= MAX_BLOCK_SIZE
void floyd_warshall_blocked_device_v1_2(int *matrix, int n, int B) {

    assert(B > 0);                          // B is used as a divisor
    assert(n%B == 0);                       // B must divide n
    assert(B*B<=MAX_BLOCK_SIZE);            // B*B cannot exceed max block size

    if (n <= 0) return;                     // nothing to compute

    // computed in size_t: n*n evaluated in int overflows for large n
    const size_t matrix_bytes = (size_t) n * (size_t) n * sizeof(int);

    int *dev_rand_matrix;
    HANDLE_ERROR(cudaMalloc( (void**) &dev_rand_matrix, matrix_bytes));
    HANDLE_ERROR(cudaMemcpy(dev_rand_matrix, matrix, matrix_bytes, cudaMemcpyHostToDevice));

    int num_rounds = n/B;

    // Launch configurations do not depend on the round.
    // Phase 1 uses one B x B block; phases 2 and 3 use n*n threads,
    // divided in num_rounds*num_rounds blocks of B x B threads.
    const dim3 num_blocks_phase_1(1, 1);
    const dim3 threads_per_block(B, B);
    const dim3 num_blocks_phase_2_3(num_rounds, num_rounds);

    for(int t = 0; t < num_rounds; t++) {

        // All kernels are issued on the default stream, so they execute in
        // launch order; cudaGetLastError() reports invalid launch configurations.

        // phase 1: self-dependent block
        execute_round_device_v1_2_phase_1<<<num_blocks_phase_1, threads_per_block>>>(dev_rand_matrix, n, t, B);
        HANDLE_ERROR(cudaGetLastError());

        // phase 2: all blocks that share a row or a column with the self dependent one
        execute_round_device_v1_2_phase_2<<<num_blocks_phase_2_3, threads_per_block>>>(dev_rand_matrix, n, t, B);
        HANDLE_ERROR(cudaGetLastError());

        // phase 3: all the remaining blocks, so all the blocks that don't share a row or a col with t
        execute_round_device_v1_2_phase_3<<<num_blocks_phase_2_3, threads_per_block>>>(dev_rand_matrix, n, t, B);
        HANDLE_ERROR(cudaGetLastError());
    }

    // surface any error raised while the kernels were executing
    HANDLE_ERROR(cudaDeviceSynchronize());

    HANDLE_ERROR(cudaMemcpy(matrix, dev_rand_matrix, matrix_bytes, cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaFree(dev_rand_matrix));
}

/// Round t, phase 1: relax the self-dependent (diagonal) tile.
///
/// The host wrapper launches a single block of B x B threads; every thread
/// owns one cell of tile (t, t) of the row-major n x n matrix.
__global__ void execute_round_device_v1_2_phase_1(int *matrix, int n, int t, int B) {

    // Launched block and corresponding position in the matrix
    //
    //  t
    //
    //  .   .   .   .   .   .
    //  .   .   .   .   .   .
    //  .   .   .   .   .   .
    //  .   .   .   t   .   .
    //  .   .   .   .   .   .
    //  .   .   .   .   .   .

    // offset of this thread inside the launched grid
    const int offset_x = threadIdx.x + blockIdx.x * blockDim.x;
    const int offset_y = threadIdx.y + blockIdx.y * blockDim.y;

    // cell owned by this thread, shifted into the diagonal tile
    const int row = offset_x + t * B;
    const int col = offset_y + t * B;
    const int cell = row*n + col;

    const int k_first = BLOCK_START(t,B);
    const int k_limit = BLOCK_END(t,B);

    // for each intermediate node k with t*B <= k < (t+1)*B
    for (int k = k_first; k < k_limit; ++k) {

        const int through_k = sum_if_not_infinite(matrix[row*n + k], matrix[k*n + col], INF);

        if (matrix[cell] > through_k) {
            matrix[cell] = through_k;
        }

        // every thread of the block finishes step k before any starts step k+1
        __syncthreads();
    }
}

/// Round t, phase 2: relax the tiles that share a row or a column with the
/// self-dependent tile (t, t).
///
/// The host wrapper launches (n/B) x (n/B) blocks of B x B threads, one thread
/// per matrix cell; threads whose cell is not in a phase-2 tile stay idle.
__global__ void execute_round_device_v1_2_phase_2(int *matrix, int n, int t, int B) {

    // Launched blocks and corresponding position in the matrix
    // ("-" and "." blocks are just kept inactive using IF statement)

    //  .   .   .   U1  .   .
    //  .   .   .   U2  .   .
    //  .   .   .   U3  .   .
    //  L1  L2  L3  -   R1  R2
    //  .   .   .   D1  .   .
    //  .   .   .   D2  .   .

    int i = threadIdx.x + blockIdx.x * blockDim.x;  // row
    int j = threadIdx.y + blockIdx.y * blockDim.y;  // col

    const int block_start = BLOCK_START(t,B);
    const int block_end = BLOCK_END(t,B);

    // Range tests are written as two explicit comparisons: `a <= x < b` would
    // compare the boolean result of `a <= x` against b.
    const bool row_in_block = (block_start <= i) && (i < block_end);
    const bool col_in_block = (block_start <= j) && (j < block_end);

    // A phase-2 cell has exactly one of its two indexes inside the s.d. block
    // range; the predicate does not depend on k, so it is evaluated once.
    const bool is_phase_2_cell = (i < n) && (j < n) && (row_in_block != col_in_block);

    // for each intermediate node k with t*B <= k < (t+1)*B
    for (int k = block_start; k < block_end; k++) {

        if (is_phase_2_cell) {

            int b = sum_if_not_infinite(matrix[i*n + k], matrix[k*n + j], INF);

            if (b < matrix[i*n + j]) {
                matrix[i*n + j] = b;
            }
        }

        // outside the branch: the barrier must be reached by every thread of the block
        __syncthreads();
    }
}

/// Round t, phase 3: relax every tile that shares neither a row nor a column
/// with the self-dependent tile (t, t).
///
/// The host wrapper launches (n/B) x (n/B) blocks of B x B threads, one thread
/// per matrix cell; threads whose cell is in row band t or column band t stay idle.
__global__ void execute_round_device_v1_2_phase_3(int *matrix, int n, int t, int B) {

    // Launched blocks and corresponding position in the matrix
    // ("-" blocks are just kept inactive using IF statement)

    //  UL  UL  UL  -   UR  UR
    //  UL  UL  UL  -   UR  UR
    //  UL  UL  UL  -   UR  UR
    //  -   -   -   -   -   -
    //  DL  DL  DL  -   DR  DR
    //  DL  DL  DL  -   DR  DR

    const int row = threadIdx.x + blockIdx.x * blockDim.x;
    const int col = threadIdx.y + blockIdx.y * blockDim.y;
    const int cell = row*n + col;

    const int band_first = BLOCK_START(t,B);
    const int band_limit = BLOCK_END(t,B);

    // the cell is active when both its row and its column lie outside band t
    const bool row_outside = (row >= band_limit) || (row < band_first);
    const bool col_outside = (col < band_first) || (col >= band_limit);
    const bool active = row_outside && col_outside;

    // for each intermediate node k with t*B <= k < (t+1)*B
    for (int k = band_first; k < band_limit; ++k) {

        if (active) {

            const int through_k = sum_if_not_infinite(matrix[row*n + k], matrix[k*n + col], INF);

            if (matrix[cell] > through_k) {
                matrix[cell] = through_k;
            }
        }

        // kept outside the branch so that every thread of the block reaches it
        __syncthreads();
    }
}



In [15]:
%%file src/host_floyd_warshall.cpp

#include "../include/host_floyd_warshall.hpp"
#include "../include/adj_matrix_utils.hpp"

// ---------------------------------------------------------------------------
// Matrix data structure version

/// Classic Floyd-Warshall on an n x n matrix of row pointers, updated in place.
/// INF entries are treated as "no path" by sum_if_not_infinite.
void floyd_warshall(int **matrix, int n) {
    for (int via = 0; via < n; ++via) {
        for (int src = 0; src < n; ++src) {
            for (int dst = 0; dst < n; ++dst) {
                // cost of going from src to dst through the intermediate node
                const int detour = sum_if_not_infinite(matrix[src][via], matrix[via][dst], INF);
                if (detour < matrix[src][dst]) {
                    matrix[src][dst] = detour;
                }
            }
        }
    }
}

/// Blocked Floyd-Warshall on an n x n matrix of row pointers, updated in place.
///
/// The matrix is processed as (n/B) x (n/B) tiles of B x B cells. Each of the
/// n/B rounds relaxes, through execute_round: the diagonal tile (phase 1), the
/// tiles on its tile-row and tile-column (phase 2), then every other tile
/// (phase 3).
void floyd_warshall_blocked(int **matrix, int n, int B) {

    const int tiles = n/B;   // tiles per side, which is also the number of rounds

    for (int round = 0; round < tiles; ++round) {

        // Phase 1: self-dependent (diagonal) tile
        execute_round(matrix, n, round, round, round, B);

        // Phase 2: tiles left of the diagonal one, on the same tile-row
        for (int tile_col = round-1; tile_col >= 0; --tile_col) {
            execute_round(matrix, n, round, round, tile_col, B);
        }

        // Phase 2: tiles above the diagonal one, on the same tile-column
        for (int tile_row = round-1; tile_row >= 0; --tile_row) {
            execute_round(matrix, n, round, tile_row, round, B);
        }

        // Phase 2: tiles below the diagonal one
        for (int tile_row = round+1; tile_row < tiles; ++tile_row) {
            execute_round(matrix, n, round, tile_row, round, B);
        }

        // Phase 2: tiles right of the diagonal one
        for (int tile_col = round+1; tile_col < tiles; ++tile_col) {
            execute_round(matrix, n, round, round, tile_col, B);
        }

        // Phase 3: remaining tiles, one quadrant at a time

        // above and right
        for (int tile_col = round+1; tile_col < tiles; ++tile_col) {
            for (int tile_row = round-1; tile_row >= 0; --tile_row) {
                execute_round(matrix, n, round, tile_row, tile_col, B);
            }
        }

        // above and left
        for (int tile_col = round-1; tile_col >= 0; --tile_col) {
            for (int tile_row = round-1; tile_row >= 0; --tile_row) {
                execute_round(matrix, n, round, tile_row, tile_col, B);
            }
        }

        // below and left
        for (int tile_col = round-1; tile_col >= 0; --tile_col) {
            for (int tile_row = round+1; tile_row < tiles; ++tile_row) {
                execute_round(matrix, n, round, tile_row, tile_col, B);
            }
        }

        // below and right
        for (int tile_col = round+1; tile_col < tiles; ++tile_col) {
            for (int tile_row = round+1; tile_row < tiles; ++tile_row) {
                execute_round(matrix, n, round, tile_row, tile_col, B);
            }
        }
    }
}

/// Relax one B x B tile of the matrix for round t.
///
/// @param matrix  matrix as an array of row pointers, updated in place
/// @param n       matrix size (not used by this variant)
/// @param t       round index: intermediate nodes are t*B .. (t+1)*B - 1
/// @param row     tile-row index of the tile to update
/// @param col     tile-column index of the tile to update
/// @param B       blocking factor (tile side)
void execute_round(int **matrix, int n, int t, int row, int col, int B) {

    int block_start = t * B;
    int block_end = (t+1) * B;
    int row_start = row * B;
    int row_end = (row+1) * B;
    int col_start = col * B;
    int col_end = (col+1) * B;

    // for each intermediate node k with t*B <= k < (t+1)*B
    for (int k = block_start; k < block_end; k++) {
        // for each cell (i, j) of the tile being updated
        for (int i = row_start; i < row_end; i++) {
            for (int j = col_start; j < col_end; j++) {
                int a = matrix[i][j];
                int b = sum_if_not_infinite(matrix[i][k], matrix[k][j], INF);
                matrix[i][j] = min(a, b);
            }
        }
    }
}

// ---------------------------------------------------------------------------
// Array data structure version

/// Classic Floyd-Warshall on an n x n matrix stored in a flat array
/// (cell (i, j) at index i*n + j), updated in place.
/// INF entries are treated as "no path" by sum_if_not_infinite.
void arr_floyd_warshall(int *matrix, int n) {
    for (int via = 0; via < n; ++via) {
        for (int src = 0; src < n; ++src) {
            for (int dst = 0; dst < n; ++dst) {
                // cost of going from src to dst through the intermediate node
                const int detour = sum_if_not_infinite(matrix[src*n + via], matrix[via*n + dst], INF);
                if (detour < matrix[src*n + dst]) {
                    matrix[src*n + dst] = detour;
                }
            }
        }
    }
}

/// Blocked Floyd-Warshall on an n x n matrix stored in a flat array, updated
/// in place.
///
/// The matrix is processed as (n/B) x (n/B) tiles of B x B cells. Each of the
/// n/B rounds relaxes, through arr_execute_round: the diagonal tile (phase 1),
/// the tiles on its tile-row and tile-column (phase 2), then every other tile
/// (phase 3).
void arr_floyd_warshall_blocked(int *matrix, int n, int B) {

    const int tiles = n/B;   // tiles per side, which is also the number of rounds

    for (int round = 0; round < tiles; ++round) {

        // Phase 1: self-dependent (diagonal) tile
        arr_execute_round(matrix, n, round, round, round, B);

        // Phase 2: tiles left of the diagonal one, on the same tile-row
        for (int tile_col = round-1; tile_col >= 0; --tile_col) {
            arr_execute_round(matrix, n, round, round, tile_col, B);
        }

        // Phase 2: tiles above the diagonal one, on the same tile-column
        for (int tile_row = round-1; tile_row >= 0; --tile_row) {
            arr_execute_round(matrix, n, round, tile_row, round, B);
        }

        // Phase 2: tiles below the diagonal one
        for (int tile_row = round+1; tile_row < tiles; ++tile_row) {
            arr_execute_round(matrix, n, round, tile_row, round, B);
        }

        // Phase 2: tiles right of the diagonal one
        for (int tile_col = round+1; tile_col < tiles; ++tile_col) {
            arr_execute_round(matrix, n, round, round, tile_col, B);
        }

        // Phase 3: remaining tiles, one quadrant at a time

        // above and right
        for (int tile_col = round+1; tile_col < tiles; ++tile_col) {
            for (int tile_row = round-1; tile_row >= 0; --tile_row) {
                arr_execute_round(matrix, n, round, tile_row, tile_col, B);
            }
        }

        // above and left
        for (int tile_col = round-1; tile_col >= 0; --tile_col) {
            for (int tile_row = round-1; tile_row >= 0; --tile_row) {
                arr_execute_round(matrix, n, round, tile_row, tile_col, B);
            }
        }

        // below and left
        for (int tile_col = round-1; tile_col >= 0; --tile_col) {
            for (int tile_row = round+1; tile_row < tiles; ++tile_row) {
                arr_execute_round(matrix, n, round, tile_row, tile_col, B);
            }
        }

        // below and right
        for (int tile_col = round+1; tile_col < tiles; ++tile_col) {
            for (int tile_row = round+1; tile_row < tiles; ++tile_row) {
                arr_execute_round(matrix, n, round, tile_row, tile_col, B);
            }
        }
    }
}

/// Relax one B x B tile of a flat-array matrix for round t.
///
/// @param matrix  n x n matrix stored row-major (cell (i, j) at i*n + j), updated in place
/// @param n       number of rows/columns of the matrix
/// @param t       round index: intermediate nodes are t*B .. (t+1)*B - 1
/// @param row     tile-row index of the tile to update
/// @param col     tile-column index of the tile to update
/// @param B       blocking factor (tile side)
void arr_execute_round(int *matrix, int n, int t, int row, int col, int B) {
    int block_start = t * B;
    int block_end = (t+1) * B;
    int row_start = row * B;
    int row_end = (row+1) * B;
    int col_start = col * B;
    int col_end = (col+1) * B;

    // for each intermediate node k with t*B <= k < (t+1)*B
    for (int k = block_start; k < block_end; k++) {
        // for each cell (i, j) of the tile being updated
        for (int i = row_start; i < row_end; i++) {
            for (int j = col_start; j < col_end; j++) {
                int a = matrix[i*n + j];
                int b = sum_if_not_infinite(matrix[i*n + k], matrix[k*n + j], INF);
                matrix[i*n + j] = min(a, b);
            }
        }
    }
}

In [16]:
%%file src/performance_test.cu

#include "../include/performance_test.cuh"

#include <stdio.h>
#include <stdlib.h>
#include <cuda_profiler_api.h>

#include "../include/adj_matrix_utils.hpp"


/// Run the given algorithm several times between profiler start/stop markers.
///
/// @param floyd_warshall_arr_algorithm  function under test (flat matrix, n, B)
/// @param input_size       number of rows/columns of the generated graphs
/// @param blocking_factor  B forwarded to the algorithm
/// @param number_of_tests  how many graphs are generated and processed
/// @param seed             base seed; run i uses seed*(i+1)
void do_nvprof_performance_test(void (*floyd_warshall_arr_algorithm)(int * matrix, int n, int B), int input_size, int blocking_factor, int number_of_tests, int seed) {

    // one int per cell (not one pointer per cell), with the product computed in size_t
    const size_t cells = (size_t) input_size * (size_t) input_size;
    int* arr_matrix = (int *) malloc(sizeof(int) * cells);
    if (arr_matrix == NULL) {
        fprintf(stderr, "do_nvprof_performance_test: cannot allocate a %dx%d matrix\n", input_size, input_size);
        return;
    }

    for (int i=0; i<number_of_tests; i++) {

        populate_arr_graph(arr_matrix, input_size, seed*(i+1));

        // only the algorithm itself is enclosed between the profiler markers
        cudaProfilerStart();
        floyd_warshall_arr_algorithm(arr_matrix, input_size, blocking_factor);
        cudaProfilerStop();

        printf("Performed test number %d\n", i);
    }

    free(arr_matrix);
}

In [17]:
%%file src/statistical_test.cpp

#include "../include/statistical_test.hpp"

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cassert>

#include "../include/host_floyd_warshall.hpp"
#include "../include/adj_matrix_utils.hpp"


/// Check one instance: the output of `function_to_test` is compared against
/// the output of arr_floyd_warshall on the same input.
///
/// `input_instance` is first duplicated into `test_instance_space`; the
/// reference algorithm then runs on the copy and the function under test on
/// the original. Both buffers are overwritten with the respective results.
bool test_arr_floyd_warshall(
    void (*function_to_test) (int* arr_matrix, int n, int b), 
    int *input_instance, int *test_instance_space, 
    int input_size, int blocking_factor) {

    // reference result, computed on a private copy of the input
    copy_arr_graph(input_instance, test_instance_space, input_size);
    arr_floyd_warshall(test_instance_space, input_size);

    // candidate result, computed in place on the original buffer
    function_to_test(input_instance, input_size, blocking_factor);

    const bool outputs_match = same_arr_matrix(input_instance, test_instance_space, input_size);
    return outputs_match;
}


/// Run `function_to_test` against arr_floyd_warshall on `n_tests` generated
/// instances and count the mismatches.
///
/// @param function_to_test         algorithm under test (flat matrix, n, B)
/// @param input_size               number of rows/columns of the instances
/// @param blocking_factor          B forwarded to the algorithm under test
/// @param n_tests                  number of instances to generate
/// @param use_always_seed          fixed seed, or RANDOM_SEED for a clock()-based seed per instance
/// @param stop_if_fail             stop at the first mismatch
/// @param progress_print_fraction  progress is printed every n_tests/fraction tests
///                                 (no progress output when that step is not positive)
/// @param print_failed_tests       print index and seed of every failing instance
/// @return number of instances whose outputs differ
int do_arr_floyd_warshall_statistical_test(
    void (*function_to_test) (int* arr_matrix, int n, int b), 
    int input_size, int blocking_factor, 
    int n_tests, int use_always_seed, 
    bool stop_if_fail, int progress_print_fraction, bool print_failed_tests) {

    printf("Performing statistical test with:\n");
    printf("\t%d executions\n", n_tests);
    if (use_always_seed==RANDOM_SEED) {
        printf("\tseed=RANDOM\n");
    } else {
        printf("\tseed=%d\n", use_always_seed);
    }

    printf("\tinput_size=%d\n\tblocking_factor=%d\n\n", input_size, blocking_factor);
    
    int n_wrong = 0;

    // matrix initialization: one int per cell, product computed in size_t
    const size_t cells = (size_t) input_size * (size_t) input_size;
    int *input_instance = (int *) malloc(sizeof(int) * cells);
    int *test_instance_space = (int *) malloc(sizeof(int) * cells);
    if (input_instance == NULL || test_instance_space == NULL) {
        fprintf(stderr, "do_arr_floyd_warshall_statistical_test: cannot allocate two %dx%d matrices\n", input_size, input_size);
        free(input_instance);
        free(test_instance_space);
        exit(EXIT_FAILURE);
    }

    // Number of tests between two progress lines. It is 0 when the fraction is
    // not positive or exceeds n_tests; the modulo below must then be skipped.
    const int progress_step = (progress_print_fraction > 0) ? (n_tests / progress_print_fraction) : 0;

    int i;
    for (i = 0; i < n_tests; i++)
    {
        // Progression status print
        if((progress_step > 0) && (i > 0) && (i % progress_step == 0)) {
            double perc = ((double) i) / ((double) n_tests);
            printf("%d%%: %d of %d\n", (int) (perc*100), i, n_tests);
        }
        
        // if necessary, generate (pseudo) random input instance
        int seed = (use_always_seed == RANDOM_SEED) ? clock() : use_always_seed;
        
        populate_arr_graph(input_instance, input_size, seed);

        // perform test
        if (!test_arr_floyd_warshall(*function_to_test, input_instance, test_instance_space, input_size, blocking_factor)) {

            n_wrong++;

            if (print_failed_tests) printf("%d/%d)\tseed: %d --> ERROR!\n", i, n_tests, seed);
            
            if (stop_if_fail) break;
        }
    }

    free(input_instance);
    free(test_instance_space);

    printf("Test ended. Performed %d/%d tests and got %d/%d errors\n\n", i, n_tests, n_wrong, n_tests);
    return n_wrong;
}

/// Run the statistical test for a range of input sizes and blocking factors.
///
/// Input sizes go from `start_input_size` to `end_input_size` and blocking
/// factors from `min_blocking_factor` to min(n, `max_blocking_factor`), both
/// doubling at every step. Combinations where the blocking factor does not
/// divide the input size are skipped.
///
/// @param function_to_test   algorithm under test (flat matrix, n, B)
/// @param n_tests_per_round  instances generated for every (n, B) combination
/// @param use_always_seed    fixed seed, or RANDOM_SEED
/// @param stop_if_fail       return as soon as one combination reports an error
/// @param print_failed_tests forwarded to do_arr_floyd_warshall_statistical_test
/// @return cumulative number of failed instances
int multi_size_statistical_test(
    void (*function_to_test)  (int* arr_matrix, int n, int b), 
    int start_input_size, int end_input_size, 
    int min_blocking_factor, int max_blocking_factor, 
    int n_tests_per_round, int use_always_seed, 
    bool stop_if_fail, bool print_failed_tests) {

    // positive sizes first: they are used as divisors and as doubling loop starts
    assert(start_input_size>0);
    assert(min_blocking_factor>0);

    assert(end_input_size%start_input_size==0);
    assert(end_input_size>=start_input_size);
    assert(start_input_size%2==0);
    assert(end_input_size%2==0);

    assert(max_blocking_factor%min_blocking_factor==0);
    assert(max_blocking_factor>=min_blocking_factor);
    assert(min_blocking_factor%2==0);
    assert(max_blocking_factor%2==0);

    assert(start_input_size%min_blocking_factor==0);

    printf("Performing Multi-size statistical test:\n");
    printf("\tFrom %d to %d input size (multiplying *2 every time)\n", start_input_size, end_input_size);
    printf("\tApplying from %d to %d blocking factor for each size (multiplying *2 every time)\n", min_blocking_factor, max_blocking_factor);
    printf("\t%d Executions for each single test round\n", n_tests_per_round);

    if (use_always_seed==RANDOM_SEED) {
        printf("\tseed=RANDOM\n");
    } else {
        printf("\tseed=%d\n", use_always_seed);
    }

    int n_err_tot = 0;

    for (int n = start_input_size; n <= end_input_size; n *= 2) {

        int MAX_B = min(n, max_blocking_factor);
    
        for (int BLOCKING_FACTOR = min_blocking_factor; BLOCKING_FACTOR <= MAX_B; BLOCKING_FACTOR *= 2) {

            // The asserts above do not guarantee that every doubled blocking
            // factor divides every doubled size (e.g. n=24, B=16).
            if ((n % BLOCKING_FACTOR) != 0) {
                printf("n: %d, B: %d skipped (B does not divide n)\n\n", n, BLOCKING_FACTOR);
                continue;
            }

            printf("n: %d, B: %d\n", n, BLOCKING_FACTOR);
            int n_err = do_arr_floyd_warshall_statistical_test(
                function_to_test, n, BLOCKING_FACTOR, n_tests_per_round, use_always_seed, stop_if_fail, 1, print_failed_tests);

            n_err_tot += n_err;
            if (n_err>0 && stop_if_fail) {
                return n_err_tot;
            }

            printf("Cumulative errors at size=%d, blocking_factor=%d:\t%d (%d new ones)\n\n", n, BLOCKING_FACTOR, n_err_tot, n_err);
        }
    }

    return n_err_tot;
}

In [18]:
# END SRC REGION
# ----------------------------------------------------------------------------------------------------------------------------------------
# START MAIN REGION

In [19]:
%%file floyd_warshall_array_device_v_pitch.cu


#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <stdbool.h>

#include <ctime>
#include <cassert>


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_profiler_api.h>

#include "include/adj_matrix_utils.hpp"
#include "include/host_floyd_warshall.hpp"
#include "include/cuda_errors_utils.cuh"
#include "include/performance_test.cuh"
#include "include/statistical_test.hpp"

// Upper bound on the number of threads in one block assumed by this file;
// floyd_warshall_blocked_device_v_pitch asserts B*B <= MAX_BLOCK_SIZE.
#define MAX_BLOCK_SIZE 1024 // could instead be read from the device properties of the machine

// Blocked Floyd-Warshall, device version 1.0 (host entry point and per-tile kernel).
// Only declared here: no definition appears in this file, and the only references
// to them in this file are inside commented-out code in main().
void floyd_warshall_blocked_device_v1_0(int *matrix, int n, int B);
__global__ void execute_round_device_v1_0(int *matrix, int n, int t, int row, int col, int B);

// Blocked Floyd-Warshall, device version 1.1.
// The host entry point is only declared here; the kernel execute_round_device_v1_1
// is defined at the bottom of this file and is the one launched by the pitch variant.
void floyd_warshall_blocked_device_v1_1(int *matrix, int n, int B);
__global__ void execute_round_device_v1_1(int *matrix, int n, int t, int row, int col, int B);

// Host entry point of the variant that allocates the device matrix with cudaMallocPitch.
// Defined below main().
void floyd_warshall_blocked_device_v_pitch(int *matrix, int n, int B);



int main() {

    // ---------------------------------------------------------------
    // Disabled experiment: sweep of even sizes and odd blocking factors
    // against the v1_0 implementation.
    //
    // for (size_t n = 10; n < 200; n += 2) {
    //     int MAX_B = min(32, n);
    //     for (int BLOCKING_FACTOR = 1; BLOCKING_FACTOR < MAX_B; BLOCKING_FACTOR += 2) {
    //         if((n % BLOCKING_FACTOR) == 0) {
    //             printf("n: %ld, B: %d\n", n, BLOCKING_FACTOR);
    //             int n_err = do_arr_floyd_warshall_statistical_test(&floyd_warshall_blocked_device_v1_0, n, BLOCKING_FACTOR, 1000, RANDOM_SEED, true, 4, true);
    //             // int n_err = do_arr_floyd_warshall_statistical_test(&arr_floyd_warshall_blocked, n, BLOCKING_FACTOR, 1000, RANDOM_SEED, true, 4, true);
    //             if (n_err>0) return;
    //         }
    //     }
    // }

    // ---------------------------------------------------------------
    // Active run: multi-size statistical test of the pitch variant.
    //
    // NOTE(review): the local names below assume the parameter order
    // (start size, end size, min blocking factor, max blocking factor,
    // tests per round, seed, stop-if-fail, print-failed-tests) -- confirm
    // against the declaration in include/statistical_test.hpp.
    const int first_input_size      = 16;
    const int last_input_size       = 512;
    const int first_blocking_factor = 8;
    const int last_blocking_factor  = 32;
    const int runs_per_round        = 100;
    const bool halt_on_failure      = false;
    const bool report_failed_runs   = false;

    multi_size_statistical_test(
        &floyd_warshall_blocked_device_v_pitch,
        first_input_size, last_input_size,
        first_blocking_factor, last_blocking_factor,
        runs_per_round, RANDOM_SEED,
        halt_on_failure, report_failed_runs);

    // ---------------------------------------------------------------
    // Disabled experiment: single-size statistical test of v1_1.
    //
    // int n = 128;
    // int b = 16;
    // int n_tests = 1000;
    // // int seed = 2862999;
    // int seed = RANDOM_SEED;
    // do_arr_floyd_warshall_statistical_test(&floyd_warshall_blocked_device_v1_1, n, b, n_tests, seed, false, 4, true);

    // Disabled experiment: nvprof performance test of v1_0.
    //
    // do_nvprof_performance_test(&floyd_warshall_blocked_device_v1_0, n, BLOCKING_FACTOR, 100, clock());

    // Disabled experiment: one-shot correctness check of v1_0 on a single instance.
    //
    // int *input_instance = (int *) malloc(sizeof(int *) * n * n);
    // int *test_instance_space = (int *) malloc(sizeof(int *) * n * n);
    // populate_arr_graph(input_instance, n, seed);
    // copy_arr_graph(input_instance, test_instance_space, n);
    // bool result = test_arr_floyd_warshall(&floyd_warshall_blocked_device_v1_0, input_instance, test_instance_space, n, b);
    // printf("Corretto: %s\n", bool_to_string(result));

    return 0;
}


/// Launch execute_round_device_v1_1 on the single B x B tile (row, col) for round t
/// and report launch-configuration errors immediately.
///
/// The launch uses one thread block of B x B threads on the default stream.
static void launch_round_on_tile(int *dev_matrix, int stride, int t, int row, int col, int B) {
    const dim3 single_block(1, 1);
    const dim3 threads_per_block(B, B);

    execute_round_device_v1_1<<<single_block, threads_per_block>>>(dev_matrix, stride, t, row, col, B);
    HANDLE_ERROR(cudaGetLastError());   // a kernel launch does not return its own error code
}

/// Blocked Floyd-Warshall on the GPU using a pitched device allocation.
///
/// The n x n host matrix (row-major, n ints per row) is copied to a device buffer
/// obtained from cudaMallocPitch, processed in n/B rounds of three phases, and
/// copied back into `matrix`, which is overwritten with the result.
///
/// @param matrix  host pointer to n*n ints, row-major; updated in place
/// @param n       number of rows/columns of the matrix
/// @param B       blocking factor (tile side); must divide n and satisfy B*B <= MAX_BLOCK_SIZE
void floyd_warshall_blocked_device_v_pitch(int *matrix, int n, int B) {

    assert(B > 0);                          // guards the modulo/division below
    assert(n%B == 0);                       // B must divide n
    assert(B*B<=MAX_BLOCK_SIZE);            // B*B cannot exceed max block size

    if (n == 0) {
        return;                             // empty matrix: nothing to compute
    }

    int *dev_matrix = NULL;
    size_t pitch = 0;                       // bytes between consecutive device rows (>= width)
    const size_t width = (size_t) n * sizeof(int);   // bytes of payload per row
    const size_t height = (size_t) n;                // number of rows

    HANDLE_ERROR(cudaMallocPitch((void**) &dev_matrix, &pitch, width, height));

    // The kernel indexes rows as matrix[i*stride + j], so the pitch has to be
    // expressed in int elements rather than bytes.
    assert(pitch % sizeof(int) == 0);
    const int stride = (int) (pitch / sizeof(int));

    // Row-by-row copy that honours the device pitch (host rows are tightly packed).
    HANDLE_ERROR(cudaMemcpy2D(dev_matrix, pitch, matrix, width, width, height, cudaMemcpyHostToDevice));

    const int num_rounds = n/B;

    for (int t = 0; t < num_rounds; t++) {

        // Phase 1: the self-dependent diagonal tile (t, t).
        launch_round_on_tile(dev_matrix, stride, t, t, t, B);
        HANDLE_ERROR(cudaDeviceSynchronize());

        // Phase 2: every other tile of row t and of column t.
        // All launches go to the default stream and therefore run in issue order;
        // each of these tiles reads only the diagonal tile and itself, so the
        // order among them does not affect the result.
        for (int j = 0; j < num_rounds; j++) {
            if (j != t) {
                launch_round_on_tile(dev_matrix, stride, t, t, j, B);
            }
        }
        for (int i = 0; i < num_rounds; i++) {
            if (i != t) {
                launch_round_on_tile(dev_matrix, stride, t, i, t, B);
            }
        }
        HANDLE_ERROR(cudaDeviceSynchronize());

        // Phase 3: all remaining tiles (neither in row t nor in column t).
        // Each reads tile (i, t) and tile (t, j) from phase 2 and writes only itself.
        for (int i = 0; i < num_rounds; i++) {
            if (i == t) {
                continue;
            }
            for (int j = 0; j < num_rounds; j++) {
                if (j != t) {
                    launch_round_on_tile(dev_matrix, stride, t, i, j, B);
                }
            }
        }
        HANDLE_ERROR(cudaDeviceSynchronize());
    }

    // Bring the result back to the caller's buffer and release the device memory.
    HANDLE_ERROR(cudaMemcpy2D(matrix, width, dev_matrix, pitch, width, height, cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaFree(dev_matrix));
}

/// Relax one B x B tile of the distance matrix through the intermediate
/// vertices of round t (k in [t*B, (t+1)*B)).
///
/// Each thread owns one cell (i, j) of tile (row, col) and, for every k of the
/// round, replaces matrix[i][j] with the path through k when that is shorter.
///
/// Launch layout: the host code in this file launches a single block of
/// B x B threads; threads that fall outside the tile do no work but still
/// take part in the barrier.
///
/// @param matrix  device pointer to the row-major distance matrix, updated in place
/// @param n       number of int elements between the starts of consecutive rows
///                (the kernel uses it only as the row stride)
/// @param t       index of the current round
/// @param row     tile row index of the tile to update
/// @param col     tile column index of the tile to update
/// @param B       tile side (blocking factor)
__global__ void execute_round_device_v1_1(int *matrix, int n, int t, int row, int col, int B) {

    // threadIdx.x selects the column so that consecutive threads of a warp
    // access consecutive addresses within one matrix row.
    const int tile_x = threadIdx.x + blockIdx.x * blockDim.x;
    const int tile_y = threadIdx.y + blockIdx.y * blockDim.y;

    const int i = tile_y + row * B;  // row
    const int j = tile_x + col * B;  // col

    // Does this thread correspond to one of the cells of the current tile?
    // The answer does not depend on k, so it is computed once.
    const bool inside_tile = (i >= row*B) && (i < (row+1)*B) && (j >= col*B) && (j < (col+1)*B);

    const int k_begin = t * B;
    const int k_end = (t+1) * B;

    // for each k with t*B <= k < (t+1)*B
    for (int k = k_begin; k < k_end; k++) {

        if (inside_tile) {

            const int using_k_path = sum_if_not_infinite(matrix[i*n + k], matrix[k*n + j], INF);

            if (using_k_path < matrix[i*n + j]) {
                matrix[i*n + j] = using_k_path;
            }
        }

        // Kept outside the branch so that every thread of the block reaches the
        // barrier before the next k is processed.
        __syncthreads();
    }
}

In [24]:
%%file Makefile

# None of the targets below produces a file with the target's own name
# (outputs go to bin/*.out), so declare them phony.
.PHONY: read_matrix fwm fwa fwa_dev fwa_dev_v1_2 fwa_dev_pitch

# All recipes write into bin/ and do not create it: the directory must already exist.

# Adjacency-matrix CSV reader (first target, hence the default goal).
read_matrix:
	nvcc -rdc=true -o bin/read_matrix.out \
		main.cpp \
		src/adj_matrix_reader.cpp \
		src/adj_matrix_utils.cpp

# Host Floyd-Warshall, int** matrix version.
fwm:
	g++ -o bin/fwm.out \
		floyd_warshall_matrix.cpp \
		src/adj_matrix_utils.cpp \
		src/host_floyd_warshall.cpp

# Host Floyd-Warshall, flat-array version.
fwa:
	g++ -o bin/fwa.out \
		floyd_warshall_array.cpp \
		src/adj_matrix_utils.cpp \
		src/host_floyd_warshall.cpp

# Device Floyd-Warshall on the flat-array matrix.
fwa_dev:
	nvcc -o bin/fwa_dev.out \
		floyd_warshall_array_device.cu \
		src/adj_matrix_utils.cpp \
		src/cuda_errors_utils.cu \
		src/performance_test.cu \
		src/statistical_test.cpp \
		src/host_floyd_warshall.cpp

# Device Floyd-Warshall, version 1.2.
# NOTE(review): "floyd_washall_device_v1_2.cu" may be a typo of "warshall" --
# confirm against the actual file name before renaming.
fwa_dev_v1_2:
	nvcc -o bin/fwa_dev_v1_2.out \
		floyd_washall_device_v1_2.cu \
		src/adj_matrix_utils.cpp \
		src/cuda_errors_utils.cu \
		src/performance_test.cu \
		src/statistical_test.cpp \
		src/host_floyd_warshall.cpp \
		src/device_floyd_warshall_v1_2.cu

# Device Floyd-Warshall using a cudaMallocPitch allocation.
fwa_dev_pitch:
	nvcc -o bin/fwa_dev_pitch.out \
		floyd_warshall_array_device_v_pitch.cu \
		src/adj_matrix_utils.cpp \
		src/cuda_errors_utils.cu \
		src/performance_test.cu \
		src/statistical_test.cpp \
		src/host_floyd_warshall.cpp \
		src/device_floyd_warshall_v1_2.cu

In [25]:
! make fwa_dev_pitch

In [22]:
! ./bin/fwa_dev_pitch.out

In [23]:
#! nvprof ./bin/fwa_dev.out