Merge 8b096e3 into 8de535f

undertherain · Jul 20, 2020 · 8a26ade · 8a26ade
2 parents 8de535f + 8b096e3
commit 8a26ade
Show file tree

Hide file tree

Showing 9 changed files with 156 additions and 40 deletions.
diff --git a/benchmarker/modules/do_cblas.py b/benchmarker/modules/do_cblas.py
@@ -0,0 +1,28 @@
+import os
+from .i_gemm import IGEMM
+from benchmarker.util.abstractprocess import Process
+
+
+class Benchmark(IGEMM):
+    def __init__(self, params, remaining_args=None):
+        super().__init__(params, remaining_args)
+        assert self.params["problem"]["precision"] in ["FP32", "mixed"]
+
+    def run(self):
+        if "nb_gpus" in self.params:
+            if self.params["nb_gpus"] > 0:
+                raise Exception("cblas does not work on GPU")
+        size = " ".join(map(str, self.params['problem']['size']))
+        path_binary = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                                   "problems/gemm/cblas/main")
+        if not os.path.isfile(path_binary):
+            raise(RuntimeError(f"{path_binary} not found, run make manually"))
+        command = [path_binary,
+                   self.params["problem"]["precision"],
+                   size]
+        process = Process(command=command)
+        result = process.get_output()
+        std_out = result["out"]
+        elapsed_time = float(std_out.strip())
+        self.params["time"] = elapsed_time
+        self.params["GFLOP/sec"] = self.params["GFLOP"] / elapsed_time
diff --git a/benchmarker/modules/do_cublas.py b/benchmarker/modules/do_cublas.py
@@ -14,7 +14,7 @@ def run(self):
                 raise Exception("cublas requires one GPU")
         size = " ".join(map(str, self.params['problem']['size']))
         path_binary = os.path.join(os.path.dirname(os.path.realpath(__file__)),
-                                   "problems/cublas/main")
+                                   "problems/gemm/cublas/main")
         if not os.path.isfile(path_binary):
             raise(RuntimeError(f"{path_binary} not found, run make manually"))
         command = [path_binary,

diff --git a/benchmarker/modules/problems/cublas/args.hpp b/benchmarker/modules/problems/cublas/args.hpp
diff --git a/benchmarker/modules/problems/gemm/args.hpp b/benchmarker/modules/problems/gemm/args.hpp
@@ -0,0 +1,48 @@
+// traditional BLAS API:
+// M - Number of rows in matrices A and C.
+// N - Number of columns in matrices B and C.
+// K - Number of columns in matrix A; number of rows in matrix B.
+// we remap from intuitive order to BLAS-style
+
+// Parse the benchmark command line: <precision> followed by either one size
+// (square matrices) or three sizes in the order M, K, N, i.e. rows of A and C,
+// columns of A / rows of B, columns of B and C.
+// Prints a message to stderr and exits with -1 when the argument count is
+// wrong or a size is negative.
+void parse_args(const int argc,
+                char *argv[],
+                std::string &precision,
+                size_t &cnt_rows_A_rows_C,
+                size_t &cnt_cols_A_rows_B,
+                size_t &cnt_cols_B_cols_C
+                ) {
+    if ((argc != 3) && (argc != 5))
+    {
+        std::cerr << "provide precision and either one size or m, k, n as command line parameters\n";
+        exit(-1);
+    }
+    precision = std::string(argv[1]);
+    const int cnt_sizes = argc - 2;  // 1 (square) or 3 (M, K, N)
+    int sizes[3] = {0, 0, 0};
+    for (int i = 0; i < cnt_sizes; i++) {
+        // atoi stops at the first non-digit character, so trailing text is ignored.
+        sizes[i] = atoi(argv[2 + i]);
+        // A negative value would silently wrap to a huge size_t below.
+        if (sizes[i] < 0) {
+            std::cerr << "matrix sizes must not be negative\n";
+            exit(-1);
+        }
+    }
+    const bool is_square = (cnt_sizes == 1);
+    cnt_rows_A_rows_C = static_cast<size_t>(sizes[0]);
+    cnt_cols_A_rows_B = static_cast<size_t>(is_square ? sizes[0] : sizes[1]);
+    cnt_cols_B_cols_C = static_cast<size_t>(is_square ? sizes[0] : sizes[2]);
+}
+
+// Allocate the host matrices A (rows_A x cols_A), B (cols_A x cols_B) and
+// C (rows_A x cols_B) with malloc and fill them with pseudo-random values
+// in [0, 1]. The buffers are returned through A, B and C.
+// Prints a message to stderr and exits with -1 when an allocation fails.
+template<typename precision>
+void get_matrices(size_t &cnt_rows_A_rows_C,
+                  size_t &cnt_cols_A_rows_B,
+                  size_t &cnt_cols_B_cols_C,
+                  precision * & A,
+                  precision * & B,
+                  precision * & C) {
+    const size_t cnt_A = cnt_rows_A_rows_C * cnt_cols_A_rows_B;
+    const size_t cnt_B = cnt_cols_A_rows_B * cnt_cols_B_cols_C;
+    const size_t cnt_C = cnt_rows_A_rows_C * cnt_cols_B_cols_C;
+    A = (precision*) malloc(sizeof(precision) * cnt_A);
+    B = (precision*) malloc(sizeof(precision) * cnt_B);
+    C = (precision*) malloc(sizeof(precision) * cnt_C);
+    // malloc(0) may legitimately return NULL, so only non-empty buffers are checked.
+    if ((cnt_A && !A) || (cnt_B && !B) || (cnt_C && !C)) {
+        fprintf(stderr, "failed to allocate matrices\n");
+        exit(-1);
+    }
+    // Divide in floating point: rand()/RAND_MAX on ints truncates to 0 almost always.
+    size_t i;
+    for(i=0; i < cnt_A; i++) { A[i] = static_cast<precision>(static_cast<double>(rand()) / RAND_MAX);}
+    for(i=0; i < cnt_B; i++) { B[i] = static_cast<precision>(static_cast<double>(rand()) / RAND_MAX);}
+    for(i=0; i < cnt_C; i++) { C[i] = static_cast<precision>(static_cast<double>(rand()) / RAND_MAX);}
+    fprintf(stderr, "done random init\n");
+}
diff --git a/benchmarker/modules/problems/gemm/cblas/Makefile b/benchmarker/modules/problems/gemm/cblas/Makefile
@@ -0,0 +1,15 @@
+# Build and smoke-test the cblas GEMM benchmark binary.
+# all and run do not produce files of those names, so both are phony.
+.PHONY: all run
+
+CXXFLAGS=-Wall -O3  -fopenmp --std=c++14 -msse3  -ftree-vectorize -ftree-vectorizer-verbose=3 -fopt-info-vec
+
+CFLAGS=-Wall -O3  -fopenmp  -ftree-vectorize -ftree-vectorizer-verbose=3 -fopt-info-vec
+
+
+all: main
+
+main: main.cpp
+	# TODO: make this customizable to build against different blas libraries
+	$(CXX) -o $@ $^ $(CXXFLAGS) -lopenblas
+
+# The binary expects the precision first, then one or three sizes.
+run: main
+	./main FP32 128 128 128
diff --git a/benchmarker/modules/problems/gemm/cblas/main.cpp b/benchmarker/modules/problems/gemm/cblas/main.cpp
@@ -0,0 +1,52 @@
+#include <iostream>
+#include <chrono>
+#include <cblas.h>
+#include "../args.hpp"
+
+using namespace std::chrono; 
+
+
+
+// Time one single-precision GEMM (alpha = 1, beta = 0) through the CBLAS
+// interface on randomly initialised column-major matrices.
+// The elapsed time in seconds is the only thing written to stdout; all
+// diagnostics go to stderr. Returns 1 for a precision other than "FP32".
+int main(int argc, char * argv[]) {
+    size_t m, n, k;
+    float *A, *B, *C;
+    std::string precision;
+    parse_args(argc, argv, precision, m, k, n);
+    // Reject unsupported precisions up front instead of aborting via an uncaught throw.
+    if (precision != "FP32")
+    {
+        fprintf(stderr, "not implemented yet");
+        return 1;
+    }
+    get_matrices<float>(m, k, n, A, B, C);
+    const float alpha = 1;
+    const float beta = 0;
+    // Column-major, no transposition: each leading dimension is the row count
+    // (for row-major these would be k, n and n instead).
+    const size_t lda = m;
+    const size_t ldb = k;
+    const size_t ldc = m;
+    auto start = high_resolution_clock::now();
+    cblas_sgemm(CblasColMajor,
+                CblasNoTrans,
+                CblasNoTrans,
+                m,
+                n,
+                k,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc);
+    // Stop the clock before printing so that stderr I/O is not part of the measurement.
+    auto stop = high_resolution_clock::now();
+    std::cerr << "MNK " << m << " " << n << " " << k << std::endl;
+    std::chrono::duration<double> seconds = (stop - start);
+    const double dtime = seconds.count();
+    double gflop = (2.0 * m * n * k) / (1024 * 1024 * 1024);
+    double gflops = gflop / dtime;
+    printf("%f\n", dtime);
+    fprintf(stderr, "gflops: \t%f\n", gflop);
+    fprintf(stderr, "time: \t%f\n", dtime);
+    fprintf(stderr, "ips: \t%f\n", 1 / dtime);
+    fprintf(stderr, "gflops/s: \t%f\n", gflops);
+    // Release the host buffers allocated by get_matrices.
+    free(A);
+    free(B);
+    free(C);
+    return 0;
+}
diff --git a/benchmarker/modules/problems/cublas/Makefile → ...ker/modules/problems/gemm/cublas/Makefile b/benchmarker/modules/problems/cublas/Makefile → ...ker/modules/problems/gemm/cublas/Makefile
diff --git a/benchmarker/modules/problems/cublas/main.cu → ...rker/modules/problems/gemm/cublas/main.cu b/benchmarker/modules/problems/cublas/main.cu → ...rker/modules/problems/gemm/cublas/main.cu
@@ -3,7 +3,7 @@
 #include <cublas_v2.h>
 #include <cublasLt.h>
 #include <chrono>
-#include "args.hpp"
+#include "../args.hpp"
 
 using namespace std::chrono; 
 
@@ -22,27 +22,29 @@ int main(int argc, char * argv[]) {
     size_t m, n, k;
     float *A, *B, *C;
     double dtime;
-    std::string precision(argv[1]);
-    args_to_matrices<float>(argc - 1, argv + 1, m, n, k, A, B, C);
+    std::string precision;
+    parse_args(argc, argv, precision, m, k, n);
+    get_matrices<float>(m, k, n, A, B, C);
     float *d_A, *d_B, *d_C;
     cudaMalloc(&d_A, m * k * sizeof(float));
     cudaMalloc(&d_B, k * n * sizeof(float));
-    cudaMalloc(&d_C, m * k * sizeof(float));
-    cudaMemcpy(d_A, A, m * n * sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_B, B, n * k * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMalloc(&d_C, m * n * sizeof(float));
+    cudaMemcpy(d_A, A, m * k * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_B, B, k * n * sizeof(float), cudaMemcpyHostToDevice);
     cublasHandle_t handle;
     const float alf = 1;
     const float bet = 0;
     const float *alpha = &alf;
     const float *beta = &bet;
     int lda=m, ldb=k, ldc=m;
-    int gpu_id = 0; // TODO: get from command line
+    int gpu_id = 0; // this is actually OK if calle from Benchmarker bec. visible devices
     cudaSetDevice(gpu_id);
     cublasCreate(&handle);
     auto start = high_resolution_clock::now(); 
-    // TODO: this m n k ordering is a mess, rename them intuitively ><
+    // cublas only does column-major order
     if (precision == "FP32")
-        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc);
+        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
+                     alpha, d_A, lda, d_B, ldb, beta, d_C, ldc);
     else
         cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, 
                      alpha, d_A, CUDA_R_16F, lda, d_B, CUDA_R_16F, ldb, beta, d_C, CUDA_R_32F, ldc, CUDA_R_32F,

diff --git a/benchmarker/modules/problems/gemm/data.py b/benchmarker/modules/problems/gemm/data.py
@@ -1,4 +1,3 @@
-
 def get_data(params):
-	return 2048
+	return "dummy data"