Commit

Merge f19e5e0 into 1d724f5
simon2 committed Jun 4, 2021
2 parents 1d724f5 + f19e5e0 commit 60410e2
Showing 13 changed files with 575 additions and 0 deletions.
33 changes: 33 additions & 0 deletions benchmarker/frameworks/do_mkl.py
@@ -0,0 +1,33 @@
import os

from benchmarker.util.abstractprocess import Process

from .i_gemm import IGEMM


class Benchmark(IGEMM):
    def __init__(self, params, remaining_args=None):
        super().__init__(params, remaining_args)
        assert self.params["problem"]["precision"] in ["FP32", "mixed"]

    def run(self):
        if self.params.get("nb_gpus", 0) > 0:
            raise Exception("cblas does not work on GPU")
        # TODO(Alex): this does not work unless the binaries are copied to site_packages
        dirname = os.path.dirname(os.path.realpath(__file__))
        path_binary = os.path.join(
            dirname,
            "../kernels",
            self.params["problem"]["name"],
            self.params["framework"],
            "main",
        )
        if not os.path.isfile(path_binary):
            raise RuntimeError(f"{path_binary} not found, run make manually")
        command = [
            path_binary,
            self.params["problem"]["precision"],
            *map(str, self.params["problem"]["size"]),
            str(self.params["nb_epoch"]),
        ]
        process = Process(command=command)
        result = process.get_output()
        # the kernel binary prints the total elapsed seconds as a single number on stdout
        elapsed_time = float(result["out"].strip())
        self.params["time_total"] = elapsed_time
        self.post_process()
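Read together with the kernel sources below, run() pins down the harness/kernel contract: the binary is invoked as main <precision> <size...> <nb_epoch>, prints the total elapsed time in seconds as a single number on stdout, and logs diagnostics to stderr. The size list must match what the kernel's parse_args expects; for batchmatmul that is m, n, k and batch_size, so the problem "size" presumably carries all four.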
68 changes: 68 additions & 0 deletions benchmarker/kernels/batchmatmul/args.hpp
@@ -0,0 +1,68 @@
#pragma once

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

// traditional BLAS API:
// M - number of rows in matrices A and C
// N - number of columns in matrices B and C
// K - number of columns in matrix A; number of rows in matrix B
// we remap from the intuitive order to BLAS-style names
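// illustrative example (shapes invented here, not taken from the commit):
// multiplying A (256 x 64) by B (64 x 128) into C (256 x 128)
// gives M = 256, K = 64, N = 128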

struct Options {
    std::string precision;
    size_t cnt_rows_A_rows_C;
    size_t cnt_cols_A_rows_B;
    size_t cnt_cols_B_cols_C;
    size_t batch_size;
    size_t nb_epoch;
};

Options parse_args(const int argc, char *argv[]) {
    if (argc != 7) {
        std::cerr << "provide precision, m, n, k, batch_size, nb_epoch as command line parameters\n";
        std::cerr << "got " << argc - 1 << " parameters\n";
        exit(EXIT_FAILURE);
    }
    Options options;
    options.precision = std::string(argv[1]);
    options.cnt_rows_A_rows_C = atoi(argv[2]);
    options.cnt_cols_A_rows_B = atoi(argv[3]);
    options.cnt_cols_B_cols_C = atoi(argv[4]);
    options.batch_size = atoi(argv[5]);
    options.nb_epoch = atoi(argv[6]);
    return options;
}

template<typename precision>
void get_matrices(size_t &cnt_rows_A_rows_C,
                  size_t &cnt_cols_A_rows_B,
                  size_t &cnt_cols_B_cols_C,
                  precision *&A,
                  precision *&B,
                  precision *&C) {
    A = (precision *) malloc(sizeof(precision) * cnt_rows_A_rows_C * cnt_cols_A_rows_B);
    B = (precision *) malloc(sizeof(precision) * cnt_cols_A_rows_B * cnt_cols_B_cols_C);
    C = (precision *) malloc(sizeof(precision) * cnt_rows_A_rows_C * cnt_cols_B_cols_C);
    // fill with uniform random values in [0, 1]
    for (size_t i = 0; i < cnt_rows_A_rows_C * cnt_cols_A_rows_B; i++)
        A[i] = static_cast<precision>(rand()) / RAND_MAX;
    for (size_t i = 0; i < cnt_cols_A_rows_B * cnt_cols_B_cols_C; i++)
        B[i] = static_cast<precision>(rand()) / RAND_MAX;
    for (size_t i = 0; i < cnt_rows_A_rows_C * cnt_cols_B_cols_C; i++)
        C[i] = static_cast<precision>(rand()) / RAND_MAX;
}

template<typename precision>
void get_batched_matrices(size_t &cnt_rows_A_rows_C,
                          size_t &cnt_cols_A_rows_B,
                          size_t &cnt_cols_B_cols_C,
                          precision **&A,
                          precision **&B,
                          precision **&C,
                          size_t batch_size) {
    A = (precision **) malloc(sizeof(precision *) * batch_size);
    B = (precision **) malloc(sizeof(precision *) * batch_size);
    C = (precision **) malloc(sizeof(precision *) * batch_size);
    fprintf(stderr, "done malloc\n");
    // allocate and randomly initialize one (A, B, C) triple per batch entry
    for (size_t i = 0; i < batch_size; i++) {
        get_matrices<precision>(cnt_rows_A_rows_C, cnt_cols_A_rows_B, cnt_cols_B_cols_C,
                                A[i], B[i], C[i]);
    }
    fprintf(stderr, "done random init\n");
}
12 changes: 12 additions & 0 deletions benchmarker/kernels/batchmatmul/mkl/Makefile
@@ -0,0 +1,12 @@
.PHONY: all run

CXX=icpc

CXXFLAGS=-Wall -O3 -fopenmp -std=c++14 -msse3 -ftree-vectorize -mkl

all: main

main: main.cpp

# main expects: precision m n k batch_size nb_epoch
run: main
	./main FP32 128 128 128 10 1000
66 changes: 66 additions & 0 deletions benchmarker/kernels/batchmatmul/mkl/main.cpp
@@ -0,0 +1,66 @@
#include <chrono>
#include <cstdio>
#include <iostream>
#include <stdexcept>
#include <mkl.h>
#include "../args.hpp"

using namespace std::chrono;

int main(int argc, char *argv[]) {
    size_t m, n, k, batch_size;
    float **A, **B, **C;
    Options options = parse_args(argc, argv);
    m = options.cnt_rows_A_rows_C;
    n = options.cnt_cols_B_cols_C;
    k = options.cnt_cols_A_rows_B;
    batch_size = options.batch_size;
    get_batched_matrices<float>(m, k, n, A, B, C, batch_size);
    const float alpha = 1;
    const float beta = 0;
    // column-major leading dimensions (would be k, n, n for row major)
    const size_t lda = m;
    const size_t ldb = k;
    const size_t ldc = m;
    auto start = high_resolution_clock::now();
    for (size_t i = 0; i < options.nb_epoch; i++) {
        if (options.precision == "FP32") {
            // one independent GEMM per batch entry, parallelized over the batch
            #pragma omp parallel for
            for (size_t j = 0; j < batch_size; j++) {
                cblas_sgemm(CblasColMajor,
                            CblasNoTrans,
                            CblasNoTrans,
                            m, n, k,
                            alpha,
                            A[j], lda,
                            B[j], ldb,
                            beta,
                            C[j], ldc);
            }
        } else {
            // TODO(Alex): implement FP16
            // throw so that the benchmarker crashes cleanly
            throw std::runtime_error("precision not implemented yet");
        }
    }
    auto stop = high_resolution_clock::now();
    std::cerr << "MNK " << m << " " << n << " " << k << std::endl;
    std::chrono::duration<double> seconds = stop - start;
    double dtime = seconds.count();
    double gflop = (2.0 * m * n * k * batch_size) / (1000 * 1000 * 1000);
    gflop *= static_cast<double>(options.nb_epoch);
    double gflops = gflop / dtime;
    printf("%f\n", dtime);
    fprintf(stderr, "gflop: \t%f\n", gflop);
    fprintf(stderr, "time: \t%f\n", dtime);
    fprintf(stderr, "ips: \t%f\n", 1 / dtime);
    fprintf(stderr, "gflops: \t%f\n", gflops);
    return 0;
}
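As a sanity check of the FLOP accounting above, take m = n = k = 128, batch_size = 10 and nb_epoch = 1000 (illustrative values): 2 * 128^3 * 10 / 10^9 is roughly 0.042 GFLOP per epoch, about 41.9 GFLOP in total, and the reported rate divides that total by the measured wall time.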
24 changes: 24 additions & 0 deletions benchmarker/kernels/batchmatmul/oneDNN/Makefile
@@ -0,0 +1,24 @@
# g++ -std=c++14 -I${DNNLROOT}/include -L${DNNLROOT}/lib64 -fopenmp -o main main.cpp -ldnnl
.PHONY: all run

CXX=g++

# ${DNNLROOT} is the root directory of the oneDNN installation
INC_DIR=${DNNLROOT}/include

LIB_DIR=${DNNLROOT}/lib64

CXXFLAGS=-Wall -O3 -fopenmp -std=c++14 -msse3 -ftree-vectorize -ftree-vectorizer-verbose=3 -fopt-info-vec

CPPFLAGS=-I$(INC_DIR)

LDFLAGS=-L$(LIB_DIR)

LDLIBS=-ldnnl

all: main

dnnl: dnnl.cpp

matmul: matmul.cpp

# main expects: precision m n k batch_size nb_epoch
run: main
	./main FP32 128 128 128 10 1000
70 changes: 70 additions & 0 deletions benchmarker/kernels/batchmatmul/oneDNN/dnnl.cpp
@@ -0,0 +1,70 @@
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <stdexcept>
#include "oneapi/dnnl/dnnl.hpp"
#include "../args.hpp"

using namespace std::chrono;


int main(int argc, char *argv[]) {
    size_t m, n, k, batch_size;
    float **A, **B, **C;
    Options options = parse_args(argc, argv);
    m = options.cnt_rows_A_rows_C;
    n = options.cnt_cols_B_cols_C;
    k = options.cnt_cols_A_rows_B;
    batch_size = options.batch_size;
    get_batched_matrices<float>(m, k, n, A, B, C, batch_size);
    const float alpha = 1;
    const float beta = 0;
    int64_t M = (int64_t) m;
    int64_t N = (int64_t) n;
    int64_t K = (int64_t) k;
    // dnnl_sgemm works on row-major matrices, hence these leading dimensions
    const int64_t lda = (int64_t) k;
    const int64_t ldb = (int64_t) n;
    const int64_t ldc = (int64_t) n;
    auto start = high_resolution_clock::now();
    for (size_t i = 0; i < options.nb_epoch; i++) {
        if (options.precision == "FP32") {
            // one independent GEMM per batch entry, parallelized over the batch
            #pragma omp parallel for
            for (size_t j = 0; j < batch_size; j++) {
                dnnl_sgemm('N',
                           'N',
                           M, N, K,
                           alpha,
                           A[j], lda,
                           B[j], ldb,
                           beta,
                           C[j], ldc);
            }
        } else {
            // TODO(Alex): implement FP16
            // throw so that the benchmarker crashes cleanly
            throw std::runtime_error("precision not implemented yet");
        }
    }
    auto stop = high_resolution_clock::now();
    std::cerr << "MNK " << m << " " << n << " " << k << std::endl;
    std::chrono::duration<double> seconds = stop - start;
    double dtime = seconds.count();
    double gflop = (2.0 * m * n * k * batch_size) / (1000 * 1000 * 1000);
    gflop *= static_cast<double>(options.nb_epoch);
    double gflops = gflop / dtime;
    printf("%f\n", dtime);
    fprintf(stderr, "gflop: \t%f\n", gflop);
    fprintf(stderr, "time: \t%f\n", dtime);
    fprintf(stderr, "ips: \t%f\n", 1 / dtime);
    fprintf(stderr, "gflops: \t%f\n", gflops);
    return 0;
}
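Note the layout difference between the two kernels: the MKL variant calls cblas_sgemm with CblasColMajor, so its leading dimensions are lda = m, ldb = k, ldc = m, while dnnl_sgemm operates on row-major matrices, hence lda = k and ldb = ldc = n here. Because both kernels only fill the buffers with random data and measure throughput, the two layouts are interchangeable for benchmarking purposes.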
33 changes: 33 additions & 0 deletions benchmarker/kernels/batchmatmul/oneDNN/example_utils.hpp
@@ -0,0 +1,33 @@
#include <cstdint>
#include <functional>
#include <numeric>
#include "dnnl.hpp"

inline dnnl::memory::dim product(const dnnl::memory::dims &dims) {
    return std::accumulate(dims.begin(), dims.end(), (dnnl::memory::dim) 1,
                           std::multiplies<dnnl::memory::dim>());
}

// Read from memory, write to handle
inline void read_from_dnnl_memory(void *handle, dnnl::memory &mem) {
    dnnl::engine eng = mem.get_engine();
    size_t size = mem.get_desc().get_size();

    // only CPU engines are handled; other engine kinds are silently ignored
    if (eng.get_kind() == dnnl::engine::kind::cpu) {
        uint8_t *src = static_cast<uint8_t *>(mem.get_data_handle());
        for (size_t i = 0; i < size; ++i)
            ((uint8_t *) handle)[i] = src[i];
        return;
    }
}

// Read from handle, write to memory
inline void write_to_dnnl_memory(void *handle, dnnl::memory &mem) {
    dnnl::engine eng = mem.get_engine();
    size_t size = mem.get_desc().get_size();

    // only CPU engines are handled; other engine kinds are silently ignored
    if (eng.get_kind() == dnnl::engine::kind::cpu) {
        uint8_t *dst = static_cast<uint8_t *>(mem.get_data_handle());
        for (size_t i = 0; i < size; ++i)
            dst[i] = ((uint8_t *) handle)[i];
        return;
    }
}
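For reference, a minimal sketch of how these helpers are meant to be used, assuming a CPU engine and an illustrative 2 x 3 FP32 matrix; names like host and back are invented here, not taken from the commit:

#include <vector>
#include "dnnl.hpp"
#include "example_utils.hpp"

int main() {
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    // describe a 2 x 3 FP32 matrix in plain row-major ("ab") layout
    dnnl::memory::desc md({2, 3}, dnnl::memory::data_type::f32,
                          dnnl::memory::format_tag::ab);
    dnnl::memory mem(md, eng);
    std::vector<float> host(6, 1.0f);
    write_to_dnnl_memory(host.data(), mem);  // host buffer -> dnnl::memory
    std::vector<float> back(6, 0.0f);
    read_from_dnnl_memory(back.data(), mem); // dnnl::memory -> host buffer
    return 0;
}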
