Commit

Merge f19e5e0 into 1d724f5
simon2 committed Jun 4, 2021
2 parents 1d724f5 + f19e5e0 commit 60410e2
Showing 13 changed files with 575 additions and 0 deletions.
33 changes: 33 additions & 0 deletions benchmarker/frameworks/do_mkl.py
@@ -0,0 +1,33 @@
import os

from benchmarker.util.abstractprocess import Process

from .i_gemm import IGEMM


class Benchmark(IGEMM):
    def __init__(self, params, remaining_args=None):
        super().__init__(params, remaining_args)
        assert self.params["problem"]["precision"] in ["FP32", "mixed"]

    def run(self):
        if self.params.get("nb_gpus", 0) > 0:
            raise Exception("cblas does not work on GPU")
        # TODO(Alex): this does not work unless the binaries are copied to site_packages
        dirname = os.path.dirname(os.path.realpath(__file__))
        path_binary = os.path.join(
            dirname,
            "../kernels",
            self.params["problem"]["name"],
            self.params["framework"],
            "main",
        )
        if not os.path.isfile(path_binary):
            raise RuntimeError(f"{path_binary} not found, run make manually")
        command = [
            path_binary,
            self.params["problem"]["precision"],
            *map(str, self.params["problem"]["size"]),
            str(self.params["nb_epoch"]),
        ]
        process = Process(command=command)
        result = process.get_output()
        # the kernel binary prints the total elapsed seconds as a single number on stdout
        elapsed_time = float(result["out"].strip())
        self.params["time_total"] = elapsed_time
        self.post_process()
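Read together with the kernel sources below, run() pins down the harness/kernel contract: the binary is invoked as main <precision> <size...> <nb_epoch>, prints the total elapsed time in seconds as a single number on stdout, and logs diagnostics to stderr. The size list must match what the kernel's parse_args expects; for batchmatmul that is m, n, k and batch_size, so the problem "size" presumably carries all four.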
68 changes: 68 additions & 0 deletions benchmarker/kernels/batchmatmul/args.hpp
@@ -0,0 +1,68 @@
#pragma once

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

// traditional BLAS API:
// M - number of rows in matrices A and C
// N - number of columns in matrices B and C
// K - number of columns in matrix A; number of rows in matrix B
// we remap from the intuitive order to BLAS-style names
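// illustrative example (shapes invented here, not taken from the commit):
// multiplying A (256 x 64) by B (64 x 128) into C (256 x 128)
// gives M = 256, K = 64, N = 128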

struct Options {
    std::string precision;
    size_t cnt_rows_A_rows_C;
    size_t cnt_cols_A_rows_B;
    size_t cnt_cols_B_cols_C;
    size_t batch_size;
    size_t nb_epoch;
};

Options parse_args(const int argc, char *argv[]) {
    if (argc != 7) {
        std::cerr << "provide precision, m, n, k, batch_size, nb_epoch as command line parameters\n";
        std::cerr << "got " << argc - 1 << " parameters\n";
        exit(EXIT_FAILURE);
    }
    Options options;
    options.precision = std::string(argv[1]);
    options.cnt_rows_A_rows_C = atoi(argv[2]);
    options.cnt_cols_A_rows_B = atoi(argv[3]);
    options.cnt_cols_B_cols_C = atoi(argv[4]);
    options.batch_size = atoi(argv[5]);
    options.nb_epoch = atoi(argv[6]);
    return options;
}

template<typename precision>
void get_matrices(size_t &cnt_rows_A_rows_C,
                  size_t &cnt_cols_A_rows_B,
                  size_t &cnt_cols_B_cols_C,
                  precision *&A,
                  precision *&B,
                  precision *&C) {
    A = (precision *) malloc(sizeof(precision) * cnt_rows_A_rows_C * cnt_cols_A_rows_B);
    B = (precision *) malloc(sizeof(precision) * cnt_cols_A_rows_B * cnt_cols_B_cols_C);
    C = (precision *) malloc(sizeof(precision) * cnt_rows_A_rows_C * cnt_cols_B_cols_C);
    // fill with uniform random values in [0, 1]
    for (size_t i = 0; i < cnt_rows_A_rows_C * cnt_cols_A_rows_B; i++)
        A[i] = static_cast<precision>(rand()) / RAND_MAX;
    for (size_t i = 0; i < cnt_cols_A_rows_B * cnt_cols_B_cols_C; i++)
        B[i] = static_cast<precision>(rand()) / RAND_MAX;
    for (size_t i = 0; i < cnt_rows_A_rows_C * cnt_cols_B_cols_C; i++)
        C[i] = static_cast<precision>(rand()) / RAND_MAX;
}

template<typename precision>
void get_batched_matrices(size_t &cnt_rows_A_rows_C,
                          size_t &cnt_cols_A_rows_B,
                          size_t &cnt_cols_B_cols_C,
                          precision **&A,
                          precision **&B,
                          precision **&C,
                          size_t batch_size) {
    A = (precision **) malloc(sizeof(precision *) * batch_size);
    B = (precision **) malloc(sizeof(precision *) * batch_size);
    C = (precision **) malloc(sizeof(precision *) * batch_size);
    fprintf(stderr, "done malloc\n");
    // allocate and randomly initialize one (A, B, C) triple per batch entry
    for (size_t i = 0; i < batch_size; i++) {
        get_matrices<precision>(cnt_rows_A_rows_C, cnt_cols_A_rows_B, cnt_cols_B_cols_C,
                                A[i], B[i], C[i]);
    }
    fprintf(stderr, "done random init\n");
}
12 changes: 12 additions & 0 deletions benchmarker/kernels/batchmatmul/mkl/Makefile
@@ -0,0 +1,12 @@
.PHONY: all run

CXX=icpc

CXXFLAGS=-Wall -O3 -fopenmp -std=c++14 -msse3 -ftree-vectorize -mkl

all: main

main: main.cpp

# main expects: precision m n k batch_size nb_epoch
run: main
	./main FP32 128 128 128 10 1000
66 changes: 66 additions & 0 deletions benchmarker/kernels/batchmatmul/mkl/main.cpp
@@ -0,0 +1,66 @@
#include <chrono>
#include <cstdio>
#include <iostream>
#include <stdexcept>
#include <mkl.h>
#include "../args.hpp"

using namespace std::chrono;

int main(int argc, char *argv[]) {
    size_t m, n, k, batch_size;
    float **A, **B, **C;
    Options options = parse_args(argc, argv);
    m = options.cnt_rows_A_rows_C;
    n = options.cnt_cols_B_cols_C;
    k = options.cnt_cols_A_rows_B;
    batch_size = options.batch_size;
    get_batched_matrices<float>(m, k, n, A, B, C, batch_size);
    const float alpha = 1;
    const float beta = 0;
    // column-major leading dimensions (would be k, n, n for row major)
    const size_t lda = m;
    const size_t ldb = k;
    const size_t ldc = m;
    auto start = high_resolution_clock::now();
    for (size_t i = 0; i < options.nb_epoch; i++) {
        if (options.precision == "FP32") {
            // one independent GEMM per batch entry, parallelized over the batch
            #pragma omp parallel for
            for (size_t j = 0; j < batch_size; j++) {
                cblas_sgemm(CblasColMajor,
                            CblasNoTrans,
                            CblasNoTrans,
                            m, n, k,
                            alpha,
                            A[j], lda,
                            B[j], ldb,
                            beta,
                            C[j], ldc);
            }
        } else {
            // TODO(Alex): implement FP16
            // throw so that the benchmarker crashes cleanly
            throw std::runtime_error("precision not implemented yet");
        }
    }
    auto stop = high_resolution_clock::now();
    std::cerr << "MNK " << m << " " << n << " " << k << std::endl;
    std::chrono::duration<double> seconds = stop - start;
    double dtime = seconds.count();
    double gflop = (2.0 * m * n * k * batch_size) / (1000 * 1000 * 1000);
    gflop *= static_cast<double>(options.nb_epoch);
    double gflops = gflop / dtime;
    printf("%f\n", dtime);
    fprintf(stderr, "gflop: \t%f\n", gflop);
    fprintf(stderr, "time: \t%f\n", dtime);
    fprintf(stderr, "ips: \t%f\n", 1 / dtime);
    fprintf(stderr, "gflops: \t%f\n", gflops);
    return 0;
}
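As a sanity check of the FLOP accounting above, take m = n = k = 128, batch_size = 10 and nb_epoch = 1000 (illustrative values): 2 * 128^3 * 10 / 10^9 is roughly 0.042 GFLOP per epoch, about 41.9 GFLOP in total, and the reported rate divides that total by the measured wall time.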
24 changes: 24 additions & 0 deletions benchmarker/kernels/batchmatmul/oneDNN/Makefile
@@ -0,0 +1,24 @@
# g++ -std=c++14 -I${DNNLROOT}/include -L${DNNLROOT}/lib64 -fopenmp -o main main.cpp -ldnnl
.PHONY: all run

CXX=g++

# ${DNNLROOT} is the root directory of the oneDNN installation
INC_DIR=${DNNLROOT}/include

LIB_DIR=${DNNLROOT}/lib64

CXXFLAGS=-Wall -O3 -fopenmp -std=c++14 -msse3 -ftree-vectorize -ftree-vectorizer-verbose=3 -fopt-info-vec

CPPFLAGS=-I$(INC_DIR)

LDFLAGS=-L$(LIB_DIR)

LDLIBS=-ldnnl

all: main

dnnl: dnnl.cpp

matmul: matmul.cpp

# main expects: precision m n k batch_size nb_epoch
run: main
	./main FP32 128 128 128 10 1000
70 changes: 70 additions & 0 deletions benchmarker/kernels/batchmatmul/oneDNN/dnnl.cpp
@@ -0,0 +1,70 @@
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <stdexcept>
#include "oneapi/dnnl/dnnl.hpp"
#include "../args.hpp"

using namespace std::chrono;


int main(int argc, char *argv[]) {
    size_t m, n, k, batch_size;
    float **A, **B, **C;
    Options options = parse_args(argc, argv);
    m = options.cnt_rows_A_rows_C;
    n = options.cnt_cols_B_cols_C;
    k = options.cnt_cols_A_rows_B;
    batch_size = options.batch_size;
    get_batched_matrices<float>(m, k, n, A, B, C, batch_size);
    const float alpha = 1;
    const float beta = 0;
    int64_t M = (int64_t) m;
    int64_t N = (int64_t) n;
    int64_t K = (int64_t) k;
    // dnnl_sgemm works on row-major matrices, hence these leading dimensions
    const int64_t lda = (int64_t) k;
    const int64_t ldb = (int64_t) n;
    const int64_t ldc = (int64_t) n;
    auto start = high_resolution_clock::now();
    for (size_t i = 0; i < options.nb_epoch; i++) {
        if (options.precision == "FP32") {
            // one independent GEMM per batch entry, parallelized over the batch
            #pragma omp parallel for
            for (size_t j = 0; j < batch_size; j++) {
                dnnl_sgemm('N',
                           'N',
                           M, N, K,
                           alpha,
                           A[j], lda,
                           B[j], ldb,
                           beta,
                           C[j], ldc);
            }
        } else {
            // TODO(Alex): implement FP16
            // throw so that the benchmarker crashes cleanly
            throw std::runtime_error("precision not implemented yet");
        }
    }
    auto stop = high_resolution_clock::now();
    std::cerr << "MNK " << m << " " << n << " " << k << std::endl;
    std::chrono::duration<double> seconds = stop - start;
    double dtime = seconds.count();
    double gflop = (2.0 * m * n * k * batch_size) / (1000 * 1000 * 1000);
    gflop *= static_cast<double>(options.nb_epoch);
    double gflops = gflop / dtime;
    printf("%f\n", dtime);
    fprintf(stderr, "gflop: \t%f\n", gflop);
    fprintf(stderr, "time: \t%f\n", dtime);
    fprintf(stderr, "ips: \t%f\n", 1 / dtime);
    fprintf(stderr, "gflops: \t%f\n", gflops);
    return 0;
}
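Note the layout difference between the two kernels: the MKL variant calls cblas_sgemm with CblasColMajor, so its leading dimensions are lda = m, ldb = k, ldc = m, while dnnl_sgemm operates on row-major matrices, hence lda = k and ldb = ldc = n here. Because both kernels only fill the buffers with random data and measure throughput, the two layouts are interchangeable for benchmarking purposes.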
33 changes: 33 additions & 0 deletions benchmarker/kernels/batchmatmul/oneDNN/example_utils.hpp
@@ -0,0 +1,33 @@
#include <cstdint>
#include <functional>
#include <numeric>
#include "dnnl.hpp"

inline dnnl::memory::dim product(const dnnl::memory::dims &dims) {
    return std::accumulate(dims.begin(), dims.end(), (dnnl::memory::dim) 1,
                           std::multiplies<dnnl::memory::dim>());
}

// Read from memory, write to handle
inline void read_from_dnnl_memory(void *handle, dnnl::memory &mem) {
    dnnl::engine eng = mem.get_engine();
    size_t size = mem.get_desc().get_size();

    // only CPU engines are handled; other engine kinds are silently ignored
    if (eng.get_kind() == dnnl::engine::kind::cpu) {
        uint8_t *src = static_cast<uint8_t *>(mem.get_data_handle());
        for (size_t i = 0; i < size; ++i)
            ((uint8_t *) handle)[i] = src[i];
        return;
    }
}

// Read from handle, write to memory
inline void write_to_dnnl_memory(void *handle, dnnl::memory &mem) {
    dnnl::engine eng = mem.get_engine();
    size_t size = mem.get_desc().get_size();

    // only CPU engines are handled; other engine kinds are silently ignored
    if (eng.get_kind() == dnnl::engine::kind::cpu) {
        uint8_t *dst = static_cast<uint8_t *>(mem.get_data_handle());
        for (size_t i = 0; i < size; ++i)
            dst[i] = ((uint8_t *) handle)[i];
        return;
    }
}
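For reference, a minimal sketch of how these helpers are meant to be used, assuming a CPU engine and an illustrative 2 x 3 FP32 matrix; names like host and back are invented here, not taken from the commit:

#include <vector>
#include "dnnl.hpp"
#include "example_utils.hpp"

int main() {
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    // describe a 2 x 3 FP32 matrix in plain row-major ("ab") layout
    dnnl::memory::desc md({2, 3}, dnnl::memory::data_type::f32,
                          dnnl::memory::format_tag::ab);
    dnnl::memory mem(md, eng);
    std::vector<float> host(6, 1.0f);
    write_to_dnnl_memory(host.data(), mem);  // host buffer -> dnnl::memory
    std::vector<float> back(6, 0.0f);
    read_from_dnnl_memory(back.data(), mem); // dnnl::memory -> host buffer
    return 0;
}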
