forked from PacktPublishing/Learn-CUDA-Programming
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsgemm.cu
130 lines (104 loc) · 3.69 KB
/
sgemm.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <helper_functions.h> // for benchmark purpose
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set matrix multiply on GPU
//! C = alpha * A * B + beta * C
//! @param A matrix A as provided to device
//! @param B matrix B as provided to device
//! @param C matrix C as provided to device
//! @param N height of matrix A and matrix C
//! @param M width of matrix B and matrix C
//! @param K width of matrix A and height of matrix B
//! @param alpha scalar value for matrix multiplication
//! @param beta scalar value for matrix summation with C
////////////////////////////////////////////////////////////////////////////////
__global__ void
sgemm_gpu_kernel(const float *A, const float *B, float *C, int N, int M, int K, float alpha, float beta)
{
    // One thread computes one output element C[row][col].
    // Expected launch: 2D grid of 2D blocks, x covering columns (M),
    // y covering rows (N).
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Bounds guard: without it, threads past the matrix edge write out
    // of range whenever N or M is not a multiple of the block dims.
    if (row >= N || col >= M)
        return;

    float sum = 0.f;
    for (int i = 0; i < K; ++i) {
        // B is K x M, so row i of B starts at i * M. The original used
        // B[i * K + col], which is only correct when M == K.
        sum += A[row * K + i] * B[i * M + col];
    }
    C[row * M + col] = alpha * sum + beta * C[row * M + col];
}
// Host-side launcher for sgemm_gpu_kernel: C = alpha * A * B + beta * C.
// A is N x K, B is K x M, C is N x M; all pointers are device memory.
void sgemm_gpu(const float *A, const float *B, float *C, int N, int M, int K, float alpha, float beta)
{
    dim3 dimBlock(BLOCK_DIM_X, BLOCK_DIM_Y);
    // Ceiling division so the grid covers the whole output matrix even
    // when M or N is not a multiple of the block dimensions. The
    // original truncated (M / dimBlock.x), silently dropping the
    // ragged edge rows/columns.
    dim3 dimGrid((M + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);
    sgemm_gpu_kernel<<<dimGrid, dimBlock>>>(A, B, C, N, M, K, alpha, beta);
}
// Fill `data` with `size` pseudo-random floats in [0, 1].
// Note: not seeded here; call srand() beforehand for reproducibility.
void random_init(float *data, int size)
{
    for (int i = 0; i < size; ++i) {
        // Take the low 8 bits of rand() (0..255) and scale by 255 to
        // land in [0, 1]. The original divided the 8-bit value by
        // RAND_MAX, which collapsed every sample into [0, ~0.008].
        data[i] = (rand() & 0xFF) / 255.f;
    }
}
// Benchmark `sgemm` over a fixed number of iterations and print the
// average per-call time in milliseconds. All matrix pointers are device
// memory; the timer (helper_functions.h StopWatchInterface) runs on the
// host, so explicit device synchronization is required around the timed
// region.
void performance_estimation(void(*sgemm)(const float *, const float *, float *, int, int, int, float, float),
const float *A, const float *B, float *C, int N, int M, int K, float alpha, float beta)
{
    int test_iterations = 100;
    // Create timer
    StopWatchInterface *timer = 0;
    // Warm-up launch so one-time startup costs are excluded from the
    // measurement.
    sgemm(A, B, C, N, M, K, alpha, beta);
    // Kernel launches are asynchronous: make sure the warm-up has
    // actually finished before the timed region begins.
    cudaDeviceSynchronize();
    // Record the start time
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    ////////
    // Operation body
    ////////
    for (int i = 0; i < test_iterations; i++) {
        sgemm(A, B, C, N, M, K, alpha, beta);
    }
    // Block until all queued launches complete. Without this sync the
    // host timer stops after merely *enqueuing* the kernels, so the
    // measurement captured launch overhead, not GPU execution time.
    cudaDeviceSynchronize();
    sdkStopTimer(&timer);
    // Compute and print the performance
    float operation_time = sdkGetAverageTimerValue(&timer);
    float operation_time_1_epoch = operation_time / test_iterations;
    printf("Operation Time= %.4f msec\n", operation_time_1_epoch);
    // cleanup
    sdkDeleteTimer(&timer);
}
int main()
{
    float *A, *B, *C;       // host matrices
    float *d_A, *d_B, *d_C; // device matrices
    int N, M, K;
    float alpha = 2.f;
    float beta = 1.f;
    N = M = K = 2048;

    // Host allocations: A is N x K, B is K x M, C is N x M.
    A = (float *)malloc(N * K * sizeof(float));
    B = (float *)malloc(K * M * sizeof(float));
    C = (float *)malloc(N * M * sizeof(float));

    // Matching device allocations.
    cudaMalloc((void **)&d_A, N * K * sizeof(float));
    cudaMalloc((void **)&d_B, K * M * sizeof(float));
    cudaMalloc((void **)&d_C, N * M * sizeof(float));

    // Fill the host matrices with pseudo-random values.
    random_init(A, N * K);
    random_init(B, K * M);
    random_init(C, N * M);

    // Upload each host matrix to its own device buffer. The original
    // copied A into all three (d_B and d_C received A's data), so the
    // benchmark never actually used B or C's contents.
    cudaMemcpy(d_A, A, N * K * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, K * M * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, C, N * M * sizeof(float), cudaMemcpyHostToDevice);

    // Benchmark C = alpha * A * B + beta * C on the GPU.
    //sgemm_gpu(d_A, d_B, d_C, N, M, K, alpha, beta);
    performance_estimation(sgemm_gpu, d_A, d_B, d_C, N, M, K, alpha, beta);

    // Release device memory.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Release host memory.
    free(A);
    free(B);
    free(C);
    return 0;
}