-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
575 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import os | ||
|
||
from benchmarker.util.abstractprocess import Process | ||
|
||
from .i_gemm import IGEMM | ||
|
||
|
||
class Benchmark(IGEMM):
    """GEMM benchmark that times a pre-built native kernel binary.

    The binary is looked up relative to this file under
    ``../kernels/<problem name>/<framework>/main`` and must have been built
    beforehand (it is not compiled here).
    """

    def __init__(self, params, remaining_args=None):
        super().__init__(params, remaining_args)
        assert self.params["problem"]["precision"] in ["FP32", "mixed"]

    def run(self):
        """Run the kernel binary once and record its reported time.

        The binary's stdout is parsed as a single float and stored in
        ``self.params["time_total"]`` before ``post_process`` is called.

        Raises:
            Exception: if ``nb_gpus`` is present in the params and positive.
            RuntimeError: if the kernel binary does not exist.
            ValueError: if the binary's stdout is not a single float.
        """
        if "nb_gpus" in self.params and self.params["nb_gpus"] > 0:
            raise Exception("cblas does not work on GPU")
        # TODO(Alex): this does not work unless the binaries are copied to site_packages
        dirname = os.path.dirname(os.path.realpath(__file__))
        path_binary = os.path.join(
            dirname,
            "../kernels",
            self.params["problem"].get("name"),
            self.params["framework"],
            "main",
        )
        if not os.path.isfile(path_binary):
            raise RuntimeError(f"{path_binary} not found, run make manually")
        # NOTE(review): the kernels' parse_args expects
        # precision, m, n, k, batch_size, nb_epoch -- presumably "size" holds
        # four values (m, n, k, batch_size); confirm against the config.
        command = [
            path_binary,
            self.params["problem"]["precision"],
            *map(str, self.params["problem"]["size"]),
            str(self.params["nb_epoch"]),
        ]
        process = Process(command=command)
        result = process.get_output()
        std_out = result["out"]
        try:
            elapsed_time = float(std_out.strip())
        except ValueError as err:
            # An empty or garbled stdout usually means the binary itself failed;
            # surface what was actually printed instead of a bare float() error.
            raise ValueError(
                f"could not parse elapsed time from output of {path_binary}: {std_out!r}"
            ) from err
        self.params["time_total"] = elapsed_time
        self.post_process()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
// traditional BLAS API: | ||
// M - Number of rows in matrices A and C. | ||
// N - Number of columns in matrices B and C. | ||
// K - Number of columns in matrix A; number of rows in matrix B. | ||
// we remap from intuitive order to BLAS-style | ||
|
||
/// Command-line options shared by the GEMM kernel binaries.
///
/// All counters default to zero so a default-constructed Options never holds
/// indeterminate values.
struct Options {
    std::string precision;
    size_t cnt_rows_A_rows_C = 0;
    size_t cnt_cols_A_rows_B = 0;
    size_t cnt_cols_B_cols_C = 0;
    size_t batch_size = 0;
    size_t nb_epoch = 0;
};

/// Converts a string of decimal digits to size_t.
///
/// Unlike atoi, this rejects empty strings, signs, trailing garbage and values
/// that do not fit in size_t: it prints a diagnostic naming `label` and
/// terminates the process with exit(-1), matching parse_args' error handling.
inline size_t parse_count(const char *text, const char *label) {
    const size_t max_value = static_cast<size_t>(-1);
    size_t value = 0;
    bool valid = (*text != '\0');
    for (const char *cursor = text; valid && *cursor != '\0'; ++cursor) {
        if (*cursor < '0' || *cursor > '9') {
            valid = false;
            break;
        }
        const size_t digit = static_cast<size_t>(*cursor - '0');
        // value * 10 + digit must not exceed max_value.
        if (value > (max_value - digit) / 10) {
            valid = false;
            break;
        }
        value = value * 10 + digit;
    }
    if (!valid) {
        std::cerr << "invalid value for " << label << ": '" << text
                  << "' (expected a non-negative integer)\n";
        exit(-1);
    }
    return value;
}

/// Parses `precision m n k batch_size nb_epoch` from the command line.
///
/// The three size arguments are stored, in order, into cnt_rows_A_rows_C,
/// cnt_cols_A_rows_B and cnt_cols_B_cols_C. On a wrong argument count or a
/// malformed number a message is written to stderr and the process exits
/// with status -1.
///
/// Declared inline because this header may be included from several
/// translation units.
inline Options parse_args(const int argc, char *argv[]) {
    if (argc != 7)
    {
        std::cerr << "provide precision, m, n, k, batch_size, nb_epoch as command line parameters\n";
        std::cerr << "got " << argc << " parameters\n";
        exit(-1);
    }
    Options options;
    options.precision = std::string(argv[1]);
    options.cnt_rows_A_rows_C = parse_count(argv[2], "m");
    options.cnt_cols_A_rows_B = parse_count(argv[3], "n");
    options.cnt_cols_B_cols_C = parse_count(argv[4], "k");
    options.batch_size = parse_count(argv[5], "batch_size");
    options.nb_epoch = parse_count(argv[6], "nb_epoch");
    return options;
}
|
||
/// Allocates `count` elements with malloc and fills them with values
/// rand()/RAND_MAX. Prints a diagnostic and exits with status -1 if the
/// allocation fails. The caller owns the buffer and releases it with free().
template<typename precision>
precision *make_random_matrix(size_t count) {
    precision *data = static_cast<precision*>(malloc(sizeof(precision) * count));
    // malloc(0) may legitimately return a null pointer, so only a failed
    // non-empty request is an error.
    if (data == nullptr && count != 0) {
        fprintf(stderr, "failed to allocate %zu matrix elements\n", count);
        exit(-1);
    }
    for (size_t i = 0; i < count; i++) {
        data[i] = static_cast<precision>(rand()) / RAND_MAX;
    }
    return data;
}

/// Allocates and randomly initialises the three operands of C = A * B.
///
/// A receives cnt_rows_A_rows_C * cnt_cols_A_rows_B elements,
/// B receives cnt_cols_A_rows_B * cnt_cols_B_cols_C elements and
/// C receives cnt_rows_A_rows_C * cnt_cols_B_cols_C elements.
/// Random values are drawn in the order A, B, C. The size arguments are
/// only read. Each returned buffer must be released with free().
template<typename precision>
void get_matrices(size_t &cnt_rows_A_rows_C,
                  size_t &cnt_cols_A_rows_B,
                  size_t &cnt_cols_B_cols_C,
                  precision * & A,
                  precision * & B,
                  precision * & C) {
    A = make_random_matrix<precision>(cnt_rows_A_rows_C * cnt_cols_A_rows_B);
    B = make_random_matrix<precision>(cnt_cols_A_rows_B * cnt_cols_B_cols_C);
    C = make_random_matrix<precision>(cnt_rows_A_rows_C * cnt_cols_B_cols_C);
}
|
||
template<typename precision> | ||
void get_batched_matrices(size_t &cnt_rows_A_rows_C, | ||
size_t &cnt_cols_A_rows_B, | ||
size_t &cnt_cols_B_cols_C, | ||
precision ** & A, | ||
precision ** & B, | ||
precision ** & C, | ||
size_t batch_size) { | ||
A = (precision**) malloc(sizeof(precision*) * batch_size); | ||
B = (precision**) malloc(sizeof(precision*) * batch_size); | ||
C = (precision**) malloc(sizeof(precision*) * batch_size); | ||
fprintf(stderr, "done malloc\n"); | ||
for(size_t i=0; i<batch_size; i++){ | ||
get_matrices<precision>(cnt_rows_A_rows_C, cnt_cols_A_rows_B, cnt_cols_B_cols_C, | ||
A[i], B[i], C[i]); | ||
} | ||
fprintf(stderr, "done random init\n"); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
.PHONY: all run

CXX=icpc

CXXFLAGS=-Wall -O3 -fopenmp -std=c++14 -msse3 -ftree-vectorize -mkl

all: main

# Explicit recipe: the built-in link rule would pass every prerequisite
# (including the header) to the compiler as an input file.
main: main.cpp ../args.hpp
	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $< $(LDLIBS) -o $@

# Arguments: precision m n k batch_size nb_epoch
run: main
	./main FP32 128 128 128 1 1000
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#include <iostream> | ||
#include <chrono> | ||
#include <mkl.h> | ||
#include "../args.hpp" | ||
|
||
using namespace std::chrono; | ||
|
||
int main(int argc, char * argv[]) { | ||
size_t m, n, k, batch_size; | ||
float **A, **B, **C; | ||
double dtime; | ||
Options options = parse_args(argc, argv); | ||
// parse_args(argc, argv, precision, m, k, n); | ||
m = options.cnt_rows_A_rows_C; | ||
n = options.cnt_cols_A_rows_B; | ||
k = options.cnt_cols_B_cols_C; | ||
batch_size = options.batch_size; | ||
get_batched_matrices<float>(m, k, n, A, B, C, batch_size); | ||
const float alpha = 1; | ||
const float beta = 0; | ||
const size_t lda=m; // k for row major; | ||
const size_t ldb=k; //n; | ||
const size_t ldc=m; //n; | ||
auto start = high_resolution_clock::now(); | ||
for (size_t i=0; i<options.nb_epoch; i++) | ||
{ | ||
if (options.precision == "FP32") | ||
{ | ||
#pragma omp parallel for | ||
for(size_t j=0;j<batch_size;j++){ | ||
cblas_sgemm(CblasColMajor, | ||
CblasNoTrans, | ||
CblasNoTrans, | ||
m, | ||
n, | ||
k, | ||
alpha, | ||
A[j], lda, | ||
B[j], ldb, | ||
beta, | ||
C[j], ldc); | ||
} | ||
} | ||
else | ||
{ | ||
// TODO (Alex): implement FP16 | ||
// ugly throw here to make sure benchmarker chrashes alright | ||
fprintf(stderr, "not implemented yet"); | ||
throw "madamada"; | ||
} | ||
} | ||
|
||
std::cerr << "MNK " << m << " " << n << " " << k << std::endl; | ||
auto stop = high_resolution_clock::now(); | ||
std::chrono::duration<double> seconds = (stop - start); | ||
dtime = seconds.count(); | ||
double gflop = (2.0 * m * n * k * batch_size) / (1000 * 1000 * 1000); | ||
gflop *= static_cast<double>(options.nb_epoch); | ||
double gflops = gflop / dtime; | ||
printf("%f\n", dtime); | ||
fprintf(stderr, "gflops: \t%f\n", gflop); | ||
fprintf(stderr, "time: \t%f\n", dtime); | ||
fprintf(stderr, "ips: \t%f\n", 1 / dtime); | ||
fprintf(stderr, "gflops/s: \t%f\n", gflops); | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# g++ -std=c++14 -I${DNNLROOT}/include -L${DNNLROOT}/lib64 -fopenmp -o main main.cpp -ldnnl
.PHONY: all run

CXX=g++

#${DNNLROOT} is root directory of dnnl
COURSE_DIR=${DNNLROOT}/include

LIB_DIR=${DNNLROOT}/lib64

CXXFLAGS=-Wall -O3 -fopenmp -std=c++11 -msse3 -ftree-vectorize -ftree-vectorizer-verbose=3 -fopt-info-vec

CPPFLAGS=-I$(COURSE_DIR)

LDFLAGS=-L$(LIB_DIR)

# Libraries belong in LDLIBS so they follow the sources on the link line;
# placed before them, linkers using --as-needed drop the library.
LDLIBS=-ldnnl

all: main

# Explicit recipe: the built-in link rule would pass every prerequisite
# (including the header) to the compiler as an input file.
main: main.cpp ../args.hpp
	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $< $(LDLIBS) -o $@

dnnl: dnnl.cpp

matmul: matmul.cpp

# Arguments: precision m n k batch_size nb_epoch
run: main
	./main FP32 128 128 128 1 1000
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
#include <iostream> | ||
#include <chrono> | ||
#include <cstdint> | ||
#include <cstdio> | ||
#include <cctype> | ||
#include "oneapi/dnnl/dnnl.hpp" | ||
#include "../args.hpp" | ||
|
||
using namespace std::chrono; | ||
|
||
|
||
int main(int argc, char * argv[]) { | ||
size_t m, n, k, batch_size; | ||
float **A, **B, **C; | ||
double dtime; | ||
Options options = parse_args(argc, argv); | ||
m = options.cnt_rows_A_rows_C; | ||
n = options.cnt_cols_A_rows_B; | ||
k = options.cnt_cols_B_cols_C; | ||
batch_size = options.batch_size; | ||
get_batched_matrices<float>(m, k, n, A, B, C, batch_size); | ||
const float alpha = 1; | ||
const float beta = 0; | ||
int64_t M = (int64_t)m; | ||
int64_t N = (int64_t)n; | ||
int64_t K = (int64_t)k; | ||
const int64_t lda=(int64_t)k; // k for row major; | ||
const int64_t ldb=(int64_t)n; //n; | ||
const int64_t ldc=(int64_t)n; //n; | ||
auto start = high_resolution_clock::now(); | ||
for (size_t i=0; i<options.nb_epoch; i++) | ||
{ | ||
if (options.precision == "FP32") | ||
{ | ||
#pragma omp parallel for | ||
for(size_t j=0;j<batch_size;j++){ | ||
dnnl_sgemm('N', | ||
'N', | ||
M, | ||
N, | ||
K, | ||
alpha, | ||
A[j], lda, | ||
B[j], ldb, | ||
beta, | ||
C[j], ldc); | ||
} | ||
} | ||
else | ||
{ | ||
// TODO (Alex): implement FP16 | ||
// ugly throw here to make sure benchmarker chrashes alright | ||
fprintf(stderr, "not implemented yet"); | ||
throw "madamada"; | ||
} | ||
} | ||
std::cerr << "MNK " << m << " " << n << " " << k << std::endl; | ||
auto stop = high_resolution_clock::now(); | ||
std::chrono::duration<double> seconds = (stop - start); | ||
dtime = seconds.count(); | ||
double gflop = (2.0 * m * n * k * batch_size) / (1000 * 1000 * 1000); | ||
gflop *= static_cast<double>(options.nb_epoch); | ||
double gflops = gflop / dtime; | ||
printf("%f\n", dtime); | ||
fprintf(stderr, "gflops: \t%f\n", gflop); | ||
fprintf(stderr, "time: \t%f\n", dtime); | ||
fprintf(stderr, "ips: \t%f\n", 1 / dtime); | ||
fprintf(stderr, "gflops/s: \t%f\n", gflops); | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#include <cstdint>
#include <cstring>
#include <functional>
#include <numeric>
#include <stdexcept>
#include "dnnl.hpp"
|
||
/// Multiplies together every entry of `dims`, starting from 1, so an empty
/// `dims` yields 1.
inline dnnl::memory::dim product(const dnnl::memory::dims &dims) {
    dnnl::memory::dim total = 1;
    for (const dnnl::memory::dim extent : dims) {
        total = total * extent;
    }
    return total;
}
|
||
// Read from memory, write to handle | ||
inline void read_from_dnnl_memory(void *handle, dnnl::memory &mem) { | ||
dnnl::engine eng = mem.get_engine(); | ||
size_t size = mem.get_desc().get_size(); | ||
|
||
if (eng.get_kind() == dnnl::engine::kind::cpu) { | ||
uint8_t *src = static_cast<uint8_t *>(mem.get_data_handle()); | ||
for (size_t i = 0; i < size; ++i) | ||
((uint8_t *)handle)[i] = src[i]; | ||
return; | ||
} | ||
} | ||
|
||
// Read from handle, write to memory | ||
inline void write_to_dnnl_memory(void *handle, dnnl::memory &mem) { | ||
dnnl::engine eng = mem.get_engine(); | ||
size_t size = mem.get_desc().get_size(); | ||
|
||
if (eng.get_kind() == dnnl::engine::kind::cpu) { | ||
uint8_t *dst = static_cast<uint8_t *>(mem.get_data_handle()); | ||
for (size_t i = 0; i < size; ++i) | ||
dst[i] = ((uint8_t *)handle)[i]; | ||
return; | ||
} | ||
} |
Oops, something went wrong.