-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #78 from undertherain/cublas
Cublas
- Loading branch information
Showing
7 changed files
with
153 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import os | ||
from .i_gemm import IGEMM | ||
from benchmarker.util.abstractprocess import Process | ||
|
||
|
||
class Benchmark(IGEMM):
    """GEMM benchmark that delegates the computation to the prebuilt cuBLAS binary.

    The binary is expected at ``problems/cublas/main`` next to this module and
    prints the elapsed time in seconds on stdout.
    """

    def __init__(self, params, remaining_args=None):
        """Initialise the base benchmark and validate the requested precision."""
        super().__init__(params, remaining_args)
        precision = self.params["problem"]["precision"]
        assert precision in ["FP32", "mixed"], \
            f"cublas supports only FP32 and mixed precision, got {precision}"

    def run(self):
        """Run the cuBLAS binary once and record ``time`` and ``GFLOP/sec``.

        Raises:
            Exception: if ``nb_gpus`` is given and is not 1.
            RuntimeError: if the compiled binary is missing.
        """
        if "nb_gpus" in self.params:
            if self.params["nb_gpus"] != 1:
                raise Exception("cublas requires one GPU")
        # The binary reads every dimension from its own argv slot, so each one
        # must be a separate list element; a single space-joined string would be
        # parsed as just its first number.
        # NOTE(review): assumes Process hands the list to the OS without
        # re-joining it through a shell - either way separate items are safe.
        size = [str(dim) for dim in self.params["problem"]["size"]]
        path_binary = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   "problems/cublas/main")
        if not os.path.isfile(path_binary):
            raise RuntimeError(f"{path_binary} not found, run make manually")
        command = [path_binary,
                   self.params["problem"]["precision"],
                   *size]
        process = Process(command=command)
        result = process.get_output()
        std_out = result["out"]
        # stdout carries only the elapsed time; diagnostics go to stderr.
        elapsed_time = float(std_out.strip())
        self.params["time"] = elapsed_time
        self.params["GFLOP/sec"] = self.params["GFLOP"] / elapsed_time
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Builds the cuBLAS GEMM benchmark binary (main) from main.cu.
.PHONY: all run

# Host-compiler flag sets; no rule in this Makefile references them.
CXXFLAGS=-Wall -O3 -fopenmp --std=c++14 -msse3 -ftree-vectorize -ftree-vectorizer-verbose=3 -fopt-info-vec

CFLAGS=-Wall -O3 -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=3 -fopt-info-vec

# Flags passed to nvcc, including the cuBLAS link flag.
CUDAFLAGS=-O3 -Wno-deprecated-gpu-targets -lcublas -arch sm_52

all: main

main: main.cu
	nvcc -o $@ $^ $(CUDAFLAGS)
#-lopenblas

# main expects the precision as its first argument, followed by m n k.
run: main
	./main FP32 128 128 128
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
// Parses the GEMM dimensions from the command line and allocates three
// randomly initialised host matrices.
//
// argc/argv: argv[0] is skipped; either one size (argv[1], used for m, n and k)
//            or three sizes (argv[1..3] -> m, n, k) must follow.
// m, n, k:   receive the parsed dimensions.
// A, B, C:   receive malloc'ed buffers of m*n, n*k and m*k elements filled with
//            values in [0, 1]; the caller owns them and releases them with free().
// Throws a const char* message on a wrong argument count, a non-positive
// dimension, or an allocation failure.
template<typename precision>
void args_to_matrices(int argc, char *argv[], size_t &m, size_t &n, size_t &k,
                      precision * & A, precision *& B, precision * & C) {
    int dims[3] = {0, 0, 0};
    if (argc == 2) {
        dims[0] = dims[1] = dims[2] = atoi(argv[1]);
    } else if (argc == 4) {
        for (int d = 0; d < 3; d++) { dims[d] = atoi(argv[d + 1]); }
    } else {
        std::cerr << "provide m, n, k as command line parameters\n";
        throw "provide m, n, k as command line parameters";
    }
    // A negative value would wrap to a huge size_t and a zero yields empty
    // matrices, so reject both before sizing the allocations.
    for (int d = 0; d < 3; d++) {
        if (dims[d] <= 0) {
            std::cerr << "m, n, k must be positive integers\n";
            throw "m, n, k must be positive integers";
        }
    }
    m = dims[0];
    n = dims[1];
    k = dims[2];
    A = (precision*) malloc(sizeof(precision) * m * n);
    B = (precision*) malloc(sizeof(precision) * n * k);
    C = (precision*) malloc(sizeof(precision) * m * k);
    if (A == NULL || B == NULL || C == NULL) {
        free(A);
        free(B);
        free(C);
        A = B = C = NULL;
        std::cerr << "failed to allocate host matrices\n";
        throw "failed to allocate host matrices";
    }
    // Convert before dividing: rand()/RAND_MAX on integers truncates to 0 for
    // every draw except RAND_MAX itself.
    const precision scale = static_cast<precision>(RAND_MAX);
    size_t i;
    for(i=0; i < m * n; i++) { A[i] = static_cast<precision>(rand()) / scale;}
    for(i=0; i < n * k; i++) { B[i] = static_cast<precision>(rand()) / scale;}
    for(i=0; i < m * k; i++) { C[i] = static_cast<precision>(rand()) / scale;}
    fprintf(stderr, "done random init\n");
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cublasLt.h>
#include "args.hpp"
|
||
using namespace std::chrono; | ||
|
||
#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) | ||
template<typename T> | ||
void check(T err, const char* const func, const char* const file, const int line) { | ||
if (err != cudaSuccess) { | ||
std::cerr << "CUDA error at: " << file << ":" << line << std::endl; | ||
std::cerr << cudaGetErrorString(err) << " " << func << std::endl; | ||
exit(1); | ||
} | ||
} | ||
|
||
|
||
int main(int argc, char * argv[]) { | ||
size_t m, n, k; | ||
float *A, *B, *C; | ||
double dtime; | ||
std::string precision(argv[1]); | ||
args_to_matrices<float>(argc - 1, argv + 1, m, n, k, A, B, C); | ||
float *d_A, *d_B, *d_C; | ||
cudaMalloc(&d_A, m * k * sizeof(float)); | ||
cudaMalloc(&d_B, k * n * sizeof(float)); | ||
cudaMalloc(&d_C, m * k * sizeof(float)); | ||
cudaMemcpy(d_A, A, m * n * sizeof(float), cudaMemcpyHostToDevice); | ||
cudaMemcpy(d_B, B, n * k * sizeof(float), cudaMemcpyHostToDevice); | ||
cublasHandle_t handle; | ||
const float alf = 1; | ||
const float bet = 0; | ||
const float *alpha = &alf; | ||
const float *beta = &bet; | ||
int lda=m, ldb=k, ldc=m; | ||
int gpu_id = 0; // TODO: get from command line | ||
cudaSetDevice(gpu_id); | ||
cublasCreate(&handle); | ||
auto start = high_resolution_clock::now(); | ||
// TODO: this m n k ordering is a mess, rename them intuitively >< | ||
if (precision == "FP32") | ||
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc); | ||
else | ||
cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, | ||
alpha, d_A, CUDA_R_16F, lda, d_B, CUDA_R_16F, ldb, beta, d_C, CUDA_R_32F, ldc, CUDA_R_32F, | ||
CUBLAS_GEMM_DEFAULT_TENSOR_OP); | ||
cudaDeviceSynchronize(); | ||
auto stop = high_resolution_clock::now(); | ||
cudaFree(d_A); | ||
cudaFree(d_B); | ||
cudaFree(d_C); | ||
cublasDestroy(handle); | ||
checkCudaErrors(cudaGetLastError()); | ||
checkCudaErrors(cudaGetLastError()); | ||
std::chrono::duration<double> seconds = (stop - start); | ||
dtime = seconds.count(); | ||
double gflop = (2.0 * m * n * k) / (1024 * 1024 * 1024); | ||
double gflops = gflop / dtime; | ||
printf("%f\n", dtime); | ||
fprintf(stderr, "gflops: \t%f\n", gflop); | ||
fprintf(stderr, "time: \t%f\n", dtime); | ||
fprintf(stderr, "ips: \t%f\n", 1 / dtime); | ||
fprintf(stderr, "gflops/s: \t%f\n", gflops); | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
import os | ||
|
||
|
||
def get_script_dir():
    """Return the directory containing this module, with symlinks resolved."""
    return os.path.dirname(os.path.realpath(__file__))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters