Skip to content

Commit

Permalink
Merge 8b096e3 into 8de535f
Browse files Browse the repository at this point in the history
  • Loading branch information
undertherain committed Jul 20, 2020
2 parents 8de535f + 8b096e3 commit 8a26ade
Show file tree
Hide file tree
Showing 9 changed files with 156 additions and 40 deletions.
28 changes: 28 additions & 0 deletions benchmarker/modules/do_cblas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os
from .i_gemm import IGEMM
from benchmarker.util.abstractprocess import Process


class Benchmark(IGEMM):
    """GEMM benchmark that runs the prebuilt cblas binary and records its timing."""

    def __init__(self, params, remaining_args=None):
        super().__init__(params, remaining_args)
        # NOTE(review): "mixed" passes this check, but the cblas main.cpp in
        # this commit only implements the "FP32" branch - confirm intent.
        assert self.params["problem"]["precision"] in ["FP32", "mixed"]

    def run(self):
        """Execute the binary once; store "time" and "GFLOP/sec" in self.params.

        Raises:
            Exception: if params request one or more GPUs.
            RuntimeError: if the compiled binary is missing.
        """
        if "nb_gpus" in self.params and self.params["nb_gpus"] > 0:
            raise Exception("cblas does not work on GPU")
        problem = self.params["problem"]
        # NOTE(review): all sizes are joined into ONE command element; whether
        # the binary receives them as separate arguments depends on how
        # Process launches the command - verify against Process.
        size = " ".join(str(dim) for dim in problem["size"])
        module_dir = os.path.dirname(os.path.realpath(__file__))
        path_binary = os.path.join(module_dir, "problems/gemm/cblas/main")
        if not os.path.isfile(path_binary):
            raise RuntimeError(f"{path_binary} not found, run make manually")
        command = [path_binary, problem["precision"], size]
        result = Process(command=command).get_output()
        # The binary prints the elapsed time in seconds as its only stdout line.
        elapsed_time = float(result["out"].strip())
        self.params["time"] = elapsed_time
        # "GFLOP" is read from params here; it is not set anywhere in this class.
        self.params["GFLOP/sec"] = self.params["GFLOP"] / elapsed_time
2 changes: 1 addition & 1 deletion benchmarker/modules/do_cublas.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def run(self):
raise Exception("cublas requires one GPU")
size = " ".join(map(str, self.params['problem']['size']))
path_binary = os.path.join(os.path.dirname(os.path.realpath(__file__)),
"problems/cublas/main")
"problems/gemm/cublas/main")
if not os.path.isfile(path_binary):
raise(RuntimeError(f"{path_binary} not found, run make manually"))
command = [path_binary,
Expand Down
28 changes: 0 additions & 28 deletions benchmarker/modules/problems/cublas/args.hpp

This file was deleted.

48 changes: 48 additions & 0 deletions benchmarker/modules/problems/gemm/args.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
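// Dimension names in the traditional BLAS API:
// M - number of rows in matrices A and C.
// N - number of columns in matrices B and C.
// K - number of columns in matrix A; number of rows in matrix B.
// Sizes are read from the command line in the intuitive order
// (rows of A, columns of A, columns of B), i.e. M, K, N;
// callers remap them to the BLAS-style order M, N, K.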

// Parses one matrix dimension from a command-line argument.
// Terminates the program with an error unless the text starts with a
// positive decimal integer. Characters after the number are ignored, as with
// the atoi() calls this replaces, so existing invocations keep working.
inline size_t parse_dimension(const char *text) {
    char *end = nullptr;
    const long long value = strtoll(text, &end, 10);
    if (end == text || value <= 0) {
        std::cerr << "matrix size must be a positive integer, got \"" << text << "\"\n";
        exit(-1);
    }
    return static_cast<size_t>(value);
}

// Reads the precision label and the GEMM dimensions from the command line.
//
// Accepted forms:
//   <binary> <precision> <size>        - all three dimensions equal <size>
//   <binary> <precision> <m> <k> <n>   - rows of A/C, cols of A = rows of B,
//                                        cols of B/C
// Any other argument count, or a size that is not a positive integer, prints
// a message to stderr and terminates the program with exit(-1).
//
// Declared inline because this definition lives in a header.
inline void parse_args(const int argc,
                       char *argv[],
                       std::string &precision,
                       size_t &cnt_rows_A_rows_C,
                       size_t &cnt_cols_A_rows_B,
                       size_t &cnt_cols_B_cols_C
                      ) {
    if ((argc != 3) && (argc != 5)) {
        // The sizes are consumed in m, k, n order (see the assignments below).
        std::cerr << "provide precision and either one size or m, k, n as command line parameters\n";
        exit(-1);
    }
    precision = std::string(argv[1]);
    cnt_rows_A_rows_C = parse_dimension(argv[2]);
    if (argc == 3) {
        // Single size given: square matrices.
        cnt_cols_A_rows_B = cnt_rows_A_rows_C;
        cnt_cols_B_cols_C = cnt_rows_A_rows_C;
    } else {
        cnt_cols_A_rows_B = parse_dimension(argv[3]);
        cnt_cols_B_cols_C = parse_dimension(argv[4]);
    }
}

// Allocates a buffer of cnt_elements values with malloc and fills it with
// pseudo-random numbers in [0, 1]. Terminates the program if the allocation
// fails. The caller owns the buffer and releases it with free().
template<typename precision>
precision *alloc_random_matrix(const size_t cnt_elements) {
    precision *data = static_cast<precision*>(malloc(sizeof(precision) * cnt_elements));
    // malloc(0) may legitimately return a null pointer, so only treat null as
    // a failure when memory was actually requested.
    if (data == nullptr && cnt_elements > 0) {
        fprintf(stderr, "failed to allocate %zu matrix elements\n", cnt_elements);
        exit(-1);
    }
    for (size_t i = 0; i < cnt_elements; i++) {
        // Divide in floating point: rand()/RAND_MAX is an integer division
        // that yields 0 for every value except RAND_MAX itself.
        data[i] = static_cast<precision>(static_cast<double>(rand()) / RAND_MAX);
    }
    return data;
}

// Allocates and randomly initialises the three GEMM operands.
//
//   A: cnt_rows_A_rows_C x cnt_cols_A_rows_B elements
//   B: cnt_cols_A_rows_B x cnt_cols_B_cols_C elements
//   C: cnt_rows_A_rows_C x cnt_cols_B_cols_C elements
//
// On return A, B and C point to malloc'ed buffers owned by the caller.
// Prints "done random init" to stderr when finished.
template<typename precision>
void get_matrices(size_t &cnt_rows_A_rows_C,
                  size_t &cnt_cols_A_rows_B,
                  size_t &cnt_cols_B_cols_C,
                  precision * & A,
                  precision * & B,
                  precision * & C) {
    A = alloc_random_matrix<precision>(cnt_rows_A_rows_C * cnt_cols_A_rows_B);
    B = alloc_random_matrix<precision>(cnt_cols_A_rows_B * cnt_cols_B_cols_C);
    C = alloc_random_matrix<precision>(cnt_rows_A_rows_C * cnt_cols_B_cols_C);
    fprintf(stderr, "done random init\n");
}
15 changes: 15 additions & 0 deletions benchmarker/modules/problems/gemm/cblas/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Builds the cblas GEMM benchmark binary (main) and provides a smoke-test run.
.PHONY: all run

CXXFLAGS=-Wall -O3 -fopenmp --std=c++14 -msse3 -ftree-vectorize -ftree-vectorizer-verbose=3 -fopt-info-vec

CFLAGS=-Wall -O3 -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=3 -fopt-info-vec

# BLAS library to link against; override to build against a different
# implementation, e.g. `make BLAS_LIBS=-lblas`.
BLAS_LIBS ?= -lopenblas

all: main

# ../args.hpp is listed so the binary is rebuilt when the shared header
# changes; only the first prerequisite ($<) is passed to the compiler.
main: main.cpp ../args.hpp
	$(CXX) -o $@ $< $(CXXFLAGS) $(BLAS_LIBS)

# main expects the precision first, followed by one size or three sizes.
run: main
	./main FP32 128 128 128
52 changes: 52 additions & 0 deletions benchmarker/modules/problems/gemm/cblas/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include <iostream>
#include <chrono>
#include <cblas.h>
#include "../args.hpp"

using namespace std::chrono;



int main(int argc, char * argv[]) {
size_t m, n, k;
float *A, *B, *C;
double dtime;
std::string precision;
parse_args(argc, argv, precision, m, k, n);
get_matrices<float>(m, k, n, A, B, C);
const float alpha = 1;
const float beta = 0;
const size_t lda=m; // k for row major;
const size_t ldb=k; //n;
const size_t ldc=m; //n;
auto start = high_resolution_clock::now();
if (precision == "FP32")
cblas_sgemm(CblasColMajor,
CblasNoTrans,
CblasNoTrans,
m,
n,
k,
alpha,
A, lda,
B, ldb,
beta,
C, ldc);
else
{
fprintf(stderr, "not implemented yet");
throw "madamada";
}
std::cerr << "MNK " << m << " " << n << " " << k << std::endl;
auto stop = high_resolution_clock::now();
std::chrono::duration<double> seconds = (stop - start);
dtime = seconds.count();
double gflop = (2.0 * m * n * k) / (1024 * 1024 * 1024);
double gflops = gflop / dtime;
printf("%f\n", dtime);
fprintf(stderr, "gflops: \t%f\n", gflop);
fprintf(stderr, "time: \t%f\n", dtime);
fprintf(stderr, "ips: \t%f\n", 1 / dtime);
fprintf(stderr, "gflops/s: \t%f\n", gflops);
return 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include <cublas_v2.h>
#include <cublasLt.h>
#include <chrono>
#include "args.hpp"
#include "../args.hpp"

using namespace std::chrono;

Expand All @@ -22,27 +22,29 @@ int main(int argc, char * argv[]) {
size_t m, n, k;
float *A, *B, *C;
double dtime;
std::string precision(argv[1]);
args_to_matrices<float>(argc - 1, argv + 1, m, n, k, A, B, C);
std::string precision;
parse_args(argc, argv, precision, m, k, n);
get_matrices<float>(m, k, n, A, B, C);
float *d_A, *d_B, *d_C;
cudaMalloc(&d_A, m * k * sizeof(float));
cudaMalloc(&d_B, k * n * sizeof(float));
cudaMalloc(&d_C, m * k * sizeof(float));
cudaMemcpy(d_A, A, m * n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, n * k * sizeof(float), cudaMemcpyHostToDevice);
cudaMalloc(&d_C, m * n * sizeof(float));
cudaMemcpy(d_A, A, m * k * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, k * n * sizeof(float), cudaMemcpyHostToDevice);
cublasHandle_t handle;
const float alf = 1;
const float bet = 0;
const float *alpha = &alf;
const float *beta = &bet;
int lda=m, ldb=k, ldc=m;
int gpu_id = 0; // TODO: get from command line
int gpu_id = 0; // this is actually OK if calle from Benchmarker bec. visible devices
cudaSetDevice(gpu_id);
cublasCreate(&handle);
auto start = high_resolution_clock::now();
// TODO: this m n k ordering is a mess, rename them intuitively ><
// cublas only does column-major order
if (precision == "FP32")
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc);
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
alpha, d_A, lda, d_B, ldb, beta, d_C, ldc);
else
cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
alpha, d_A, CUDA_R_16F, lda, d_B, CUDA_R_16F, ldb, beta, d_C, CUDA_R_32F, ldc, CUDA_R_32F,
Expand Down
3 changes: 1 addition & 2 deletions benchmarker/modules/problems/gemm/data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

def get_data(params):
return 2048
return "dummy data"

0 comments on commit 8a26ade

Please sign in to comment.