Skip to content

Commit

Permalink
Merge pull request #78 from undertherain/cublas
Browse files Browse the repository at this point in the history
Cublas
  • Loading branch information
vatai committed Jul 7, 2020
2 parents ba404ec + 1e5c84f commit d8499cf
Show file tree
Hide file tree
Showing 7 changed files with 153 additions and 12 deletions.
28 changes: 28 additions & 0 deletions benchmarker/modules/do_cublas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os
from .i_gemm import IGEMM
from benchmarker.util.abstractprocess import Process


class Benchmark(IGEMM):
    """GEMM benchmark that delegates the computation to the cuBLAS driver binary.

    The binary is expected at ``problems/cublas/main`` next to this module and
    must be built beforehand; it is not compiled on demand.
    """

    def __init__(self, params, remaining_args=None):
        """Initialise the shared GEMM parameters and validate the precision."""
        super().__init__(params, remaining_args)
        assert self.params["problem"]["precision"] in ["FP32", "mixed"]

    def run(self):
        """Run the binary once and store the timing in ``self.params``.

        Writes ``self.params["time"]`` (the value the binary printed on stdout)
        and ``self.params["GFLOP/sec"]``.

        Raises:
            Exception: if ``nb_gpus`` is given and is not 1.
            RuntimeError: if the binary has not been built.
            ValueError: if the binary's stdout is not a single number.
        """
        if "nb_gpus" in self.params:
            if self.params["nb_gpus"] != 1:
                raise Exception("cublas requires one GPU")
        # One argv entry per dimension: a single "m n k" string would reach the
        # binary as one argument, of which only the leading integer is parsed.
        size = [str(dim) for dim in self.params["problem"]["size"]]
        path_binary = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   "problems/cublas/main")
        if not os.path.isfile(path_binary):
            raise RuntimeError(f"{path_binary} not found, run make manually")
        command = [path_binary,
                   self.params["problem"]["precision"],
                   *size]
        process = Process(command=command)
        result = process.get_output()
        std_out = result["out"]
        try:
            elapsed_time = float(std_out.strip())
        except ValueError as err:
            # Same exception type as before, but naming the binary and output.
            raise ValueError(
                f"unexpected output from {path_binary}: {std_out!r}") from err
        self.params["time"] = elapsed_time
        self.params["GFLOP/sec"] = self.params["GFLOP"] / elapsed_time
12 changes: 6 additions & 6 deletions benchmarker/modules/i_gemm.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Module contains the interface for all deep learning modules"""
# import argparse
import argparse
import os


Expand All @@ -8,12 +8,12 @@ class IGEMM():

def __init__(self, params, remaining_args=None):
self.params = params
parser = argparse.ArgumentParser(description="gemm extra args")
parser.add_argument("--precision", default="FP32")
# args, remaining_args = parser.parse_known_args(remaining_args)
# parser = argparse.ArgumentParser(description='Benchmark GEMM operations')
# parser.add_argument('--mode', default=None)
# args = parser.parse_args(remaining_args)
# TODO: read size from args
# TODO: add float type as arg
params["problem"]["precision"] = "FP32"
args = parser.parse_args(remaining_args)
params["problem"]["precision"] = args.precision
params["path_out"] = os.path.join(params["path_out"],
params["problem"]["precision"])
if isinstance(params["problem"]["size"], int):
Expand Down
16 changes: 16 additions & 0 deletions benchmarker/modules/problems/cublas/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Builds and smoke-tests the cuBLAS GEMM benchmark driver (main.cu).
.PHONY: all run

# Host-compiler flag sets; no rule in this Makefile references them.
CXXFLAGS=-Wall -O3 -fopenmp --std=c++14 -msse3 -ftree-vectorize -ftree-vectorizer-verbose=3 -fopt-info-vec

CFLAGS=-Wall -O3 -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=3 -fopt-info-vec

# Flags handed to nvcc, including the cuBLAS link flag.
CUDAFLAGS=-O3 -Wno-deprecated-gpu-targets -lcublas -arch sm_52

all: main

# args.hpp is included by main.cu, so editing it must trigger a rebuild.
# Only the first prerequisite ($<) is compiled.
main: main.cu args.hpp
	nvcc -o $@ $< $(CUDAFLAGS)
#-lopenblas

# main takes the precision as its first argument, followed by the sizes.
run: main
	./main FP32 128 128 128
28 changes: 28 additions & 0 deletions benchmarker/modules/problems/cublas/args.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Parses the GEMM dimensions from the command line and allocates randomly
// initialised host matrices.
//
// Accepted forms (argv[0] is not read):
//   argc == 2: argv[1] is used for m, n and k (square problem)
//   argc == 4: argv[1], argv[2] and argv[3] are m, n and k
//
// On return A holds m * n elements, B holds n * k and C holds m * k. Each
// buffer comes from malloc, so the caller releases it with free. All elements
// are pseudo-random values in [0, 1].
//
// Throws a const char* message when the argument count is wrong, when a
// dimension is not a positive integer, or when an allocation fails. In those
// cases m, n, k are left untouched and no buffer is leaked.
template<typename precision>
void args_to_matrices(int argc, char *argv[], size_t &m, size_t &n, size_t &k,
                      precision * & A, precision *& B, precision * & C) {
    int dims[3];
    if (argc == 2) {
        dims[0] = dims[1] = dims[2] = atoi(argv[1]);
    } else if (argc == 4) {
        dims[0] = atoi(argv[1]);
        dims[1] = atoi(argv[2]);
        dims[2] = atoi(argv[3]);
    } else {
        std::cerr << "provide m, n, k as command line parameters\n";
        throw "provide m, n, k as command line parameters";
    }
    // atoi yields 0 for non-numeric text; a negative value would wrap to a
    // huge size_t below, so reject both before sizing any buffer.
    for (int d = 0; d < 3; d++) {
        if (dims[d] <= 0) {
            std::cerr << "m, n, k must be positive integers\n";
            throw "m, n, k must be positive integers";
        }
    }
    m = dims[0];
    n = dims[1];
    k = dims[2];

    A = (precision*) malloc(sizeof(precision) * m * n);
    B = (precision*) malloc(sizeof(precision) * n * k);
    C = (precision*) malloc(sizeof(precision) * m * k);
    if (!A || !B || !C) {
        free(A);
        free(B);
        free(C);
        A = nullptr;
        B = nullptr;
        C = nullptr;
        std::cerr << "failed to allocate host matrices\n";
        throw "failed to allocate host matrices";
    }

    // Divide in floating point: rand() / RAND_MAX on ints truncates to 0
    // for every draw except RAND_MAX itself.
    size_t i;
    for (i = 0; i < m * n; i++) {
        A[i] = static_cast<precision>(static_cast<double>(rand()) / RAND_MAX);
    }
    for (i = 0; i < n * k; i++) {
        B[i] = static_cast<precision>(static_cast<double>(rand()) / RAND_MAX);
    }
    for (i = 0; i < m * k; i++) {
        C[i] = static_cast<precision>(static_cast<double>(rand()) / RAND_MAX);
    }
    fprintf(stderr, "done random init\n");
}
68 changes: 68 additions & 0 deletions benchmarker/modules/problems/cublas/main.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cublasLt.h>

#include "args.hpp"

using namespace std::chrono;

// Wraps a CUDA runtime call so that a failure reports the call text and the
// source location before terminating the process.
#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)

// Returns silently when `err` equals cudaSuccess; otherwise prints the
// location, the runtime's error string and the failing expression to stderr,
// then exits with status 1.
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
    if (err == cudaSuccess) {
        return;
    }
    std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
    exit(1);
}


// Terminates with a diagnostic when a cuBLAS call did not succeed.
static void checkCublasStatus(cublasStatus_t status, const char *what) {
    if (status != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cuBLAS error " << static_cast<int>(status)
                  << " in " << what << std::endl;
        exit(1);
    }
}

// Times a single cuBLAS GEMM on GPU 0.
//
// Usage: main <precision> m [n k]
//   precision  "FP32" selects cublasSgemm; any other value selects
//              cublasGemmEx with FP16 inputs and FP32 accumulation.
//   m [n k]    problem size, parsed by args_to_matrices.
//
// stdout carries only the elapsed time in seconds so that a caller can parse
// it; all other diagnostics go to stderr. Returns 0 on success and a non-zero
// status on bad arguments or any CUDA/cuBLAS failure.
int main(int argc, char * argv[]) {
    if (argc < 2) {
        std::cerr << "usage: " << argv[0] << " <FP32|mixed> m [n k]\n";
        return 1;
    }
    const std::string precision(argv[1]);

    size_t m, n, k;
    float *A, *B, *C;
    try {
        args_to_matrices<float>(argc - 1, argv + 1, m, n, k, A, B, C);
    } catch (const char *) {
        // args_to_matrices has already written the reason to stderr.
        return 1;
    }

    // Select the device before the first allocation so that every buffer and
    // the cuBLAS handle live on the same GPU.
    const int gpu_id = 0; // TODO: get from command line
    checkCudaErrors(cudaSetDevice(gpu_id));

    // The device buffers mirror the host buffers from args_to_matrices:
    // A has m * n elements, B has n * k and C has m * k, so n is the inner
    // (reduction) dimension of the product.
    float *d_A, *d_B, *d_C;
    checkCudaErrors(cudaMalloc(&d_A, m * n * sizeof(float)));
    checkCudaErrors(cudaMalloc(&d_B, n * k * sizeof(float)));
    checkCudaErrors(cudaMalloc(&d_C, m * k * sizeof(float)));
    checkCudaErrors(cudaMemcpy(d_A, A, m * n * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_B, B, n * k * sizeof(float), cudaMemcpyHostToDevice));

    cublasHandle_t handle;
    checkCublasStatus(cublasCreate(&handle), "cublasCreate");

    const float alpha = 1.0f;
    const float beta = 0.0f;
    // cuBLAS takes (rows of C, columns of C, inner dimension), column-major:
    // C (m x k) = A (m x n) * B (n x k).
    const int rows = static_cast<int>(m);
    const int cols = static_cast<int>(k);
    const int inner = static_cast<int>(n);
    const int lda = rows, ldb = inner, ldc = rows;

    cublasStatus_t gemm_status;
    auto start = high_resolution_clock::now();
    if (precision == "FP32") {
        gemm_status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, rows, cols, inner,
                                  &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc);
    } else {
        // NOTE(review): d_A and d_B hold FP32 data but are described to cuBLAS
        // as CUDA_R_16F, so the inputs are float bit patterns read as halves.
        // The reads stay in bounds, but the values are meaningless; convert
        // the inputs to FP16 if the result is ever inspected.
        gemm_status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, rows, cols, inner,
                                   &alpha, d_A, CUDA_R_16F, lda, d_B, CUDA_R_16F, ldb,
                                   &beta, d_C, CUDA_R_32F, ldc, CUDA_R_32F,
                                   CUBLAS_GEMM_DEFAULT_TENSOR_OP);
    }
    // The GEMM is asynchronous; wait for it before reading the clock.
    const cudaError_t sync_status = cudaDeviceSynchronize();
    auto stop = high_resolution_clock::now();

    checkCublasStatus(gemm_status, "gemm");
    checkCudaErrors(sync_status);
    checkCudaErrors(cudaGetLastError());

    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));
    checkCublasStatus(cublasDestroy(handle), "cublasDestroy");
    free(A);
    free(B);
    free(C);

    std::chrono::duration<double> seconds = (stop - start);
    const double dtime = seconds.count();
    // Giga is the decimal prefix: 1 GFLOP = 1e9 floating point operations.
    const double gflop = (2.0 * m * n * k) / 1e9;
    const double gflops = gflop / dtime;
    printf("%f\n", dtime);
    fprintf(stderr, "gflops: \t%f\n", gflop);
    fprintf(stderr, "time: \t%f\n", dtime);
    fprintf(stderr, "ips: \t%f\n", 1 / dtime);
    fprintf(stderr, "gflops/s: \t%f\n", gflops);
    return 0;
}
6 changes: 6 additions & 0 deletions benchmarker/util/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import os


def get_script_dir():
    """Return the directory that holds this module, with symlinks resolved."""
    resolved_file = os.path.realpath(__file__)
    directory, _ = os.path.split(resolved_file)
    return directory
7 changes: 1 addition & 6 deletions benchmarker/util/sysinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,7 @@
import os
import sys

from benchmarker.util import abstractprocess


def get_script_dir():
    """Return the directory that holds this module, with symlinks resolved."""
    resolved_file = os.path.realpath(__file__)
    directory, _ = os.path.split(resolved_file)
    return directory
from benchmarker.util import abstractprocess, get_script_dir


def get_sys_info():
Expand Down

0 comments on commit d8499cf

Please sign in to comment.