-
Notifications
You must be signed in to change notification settings - Fork 255
/
Copy pathcuda_kernel.cu
90 lines (71 loc) · 2.47 KB
/
cuda_kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#include <cstdio>
#include <cstdlib>
using namespace std;
// Element-wise scale-and-shift: every thread writes y[i] = alpha * x[i] + beta
// for its own flat 1D global index i.
//
// Launch layout: 1D grid of 1D blocks. There is no bounds guard, so
// gridDim.x * blockDim.x must not exceed the number of elements in x and y.
__global__ void
simple_saxpy_kernel(float *y, const float* x, const float alpha, const float beta)
{
    // Flat global thread index across the 1D grid.
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    const float *src = x + gid;
    float *dst = y + gid;
    *dst = alpha * (*src) + beta;
}
// Same element-wise operation as a single scale-and-shift pass
// (y[i] = alpha * x[i] + beta), but repeated n_loop times inside one launch.
// A non-positive n_loop leaves y untouched.
//
// Launch layout: 1D grid of 1D blocks. There is no bounds guard, so
// gridDim.x * blockDim.x must not exceed the number of elements in x and y.
__global__ void
iterative_saxpy_kernel(float *y, const float* x, const float alpha, const float beta, int n_loop)
{
    // Flat global thread index across the 1D grid.
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    const float *src = x + gid;
    float *dst = y + gid;

    // Count the passes down; the source element is re-read on every pass,
    // exactly as written, so aliased x/y buffers behave as before.
    for (int pass = n_loop; pass > 0; --pass) {
        *dst = alpha * (*src) + beta;
    }
}
// Recursive variant: each level computes y[i] = alpha * x[i] + beta and then
// relaunches this same kernel from device code with depth - 1.
//
// Parameters:
//   y, x        - device output / input buffers
//   alpha, beta - scale and offset
//   depth       - number of levels that perform the computation; a value
//                 <= 0 does nothing
//
// Launch layout: 1D grid of 1D blocks, no bounds guard, so
// gridDim.x * blockDim.x must not exceed the number of elements in x and y.
// Thread 0 of EVERY block launches its own child grid, and each child grid is
// a single block of blockDim.x threads, so child levels only touch elements
// [0, blockDim.x).
//
// Because the kernel launches itself from device code it needs CUDA dynamic
// parallelism (relocatable device code linked against the device runtime).
__global__ void
recursive_saxpy_kernel(float *y, const float* x, const float alpha, const float beta, int depth)
{
    // Stop on any non-positive depth. Testing only depth == 0 would let a
    // negative depth keep spawning child grids until device launches fail.
    if (depth <= 0)
        return;

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    y[idx] = alpha * x[idx] + beta;

    // One launcher per block. Skip the launch when the child would have
    // depth 0: that grid would return immediately without doing any work.
    if (threadIdx.x == 0 && depth > 1)
        recursive_saxpy_kernel<<< 1, blockDim.x >>>(y, x, alpha, beta, depth - 1);
}
// Prints a diagnostic and terminates the process when a CUDA runtime call
// did not return cudaSuccess.
static void check_cuda(cudaError_t status, const char *expr, const char *file, int line)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error at %s:%d: %s failed: %s\n",
                file, line, expr, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}
#define CUDA_CHECK(call) check_cuda((call), #call, __FILE__, __LINE__)

// Times three ways of applying the same saxpy-style update n_loop times and
// prints the elapsed time of each in milliseconds:
//   A) n_loop separate launches of simple_saxpy_kernel,
//   B) one launch of iterative_saxpy_kernel looping n_loop times,
//   C) one launch of recursive_saxpy_kernel with depth n_loop.
// Returns 0 on success; any CUDA error terminates with EXIT_FAILURE.
int main()
{
    float *d_y = NULL, *d_x = NULL;
    const int size = 1 << 10;
    const size_t bufsize = size * sizeof(float);
    const int n_loop = 24;
    float elapsed_time_A = 0.f, elapsed_time_B = 0.f, elapsed_time_C = 0.f;
    const float alpha = 0.1f, beta = 0.2f;
    cudaEvent_t start, stop;

    // initialize cuda event
    CUDA_CHECK(cudaEventCreateWithFlags(&start, cudaEventBlockingSync));
    CUDA_CHECK(cudaEventCreateWithFlags(&stop, cudaEventBlockingSync));

    CUDA_CHECK(cudaMalloc((void**)&d_y, bufsize));
    CUDA_CHECK(cudaMalloc((void**)&d_x, bufsize));

    // The kernels read d_x, so give both buffers a defined value (all-zero
    // bytes == 0.0f) instead of reading uninitialized device memory. This is
    // issued before the first start event, so it stays outside the timings.
    CUDA_CHECK(cudaMemset(d_x, 0, bufsize));
    CUDA_CHECK(cudaMemset(d_y, 0, bufsize));

    const int dimBlock = 256;
    // The kernels have no bounds guard, so the grid must tile the buffer
    // exactly; reject a configuration where the division would truncate.
    if (size % dimBlock != 0) {
        fprintf(stderr, "size (%d) must be a multiple of the block size (%d)\n",
                size, dimBlock);
        return EXIT_FAILURE;
    }
    const int dimGrid = size / dimBlock;

    // Step 1. Loop outside the kernel
    CUDA_CHECK(cudaEventRecord(start));
    for (int i = 0; i < n_loop; i++) {
        simple_saxpy_kernel<<< dimGrid, dimBlock >>>(d_y, d_x, alpha, beta);
    }
    CUDA_CHECK(cudaEventRecord(stop));
    // Launch errors are sticky until read, so one check after the stop event
    // covers every launch in the loop without adding work between the events.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_A, start, stop));

    // Step 2. Loop inside the kernel
    CUDA_CHECK(cudaEventRecord(start));
    iterative_saxpy_kernel<<< dimGrid, dimBlock >>>(d_y, d_x, alpha, beta, n_loop);
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_B, start, stop));

    // Step 3. Loop with the recursion
    CUDA_CHECK(cudaEventRecord(start));
    recursive_saxpy_kernel<<< dimGrid, dimBlock >>>(d_y, d_x, alpha, beta, n_loop);
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaGetLastError());
    // Execution errors from the kernel surface at this synchronization.
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_C, start, stop));

    printf("Elapsed Time...\n");
    printf("simple loop: %.3f ms\n", elapsed_time_A);
    printf("inner loop : %.3f ms\n", elapsed_time_B);
    printf("recursion : %.3f ms\n", elapsed_time_C);

    CUDA_CHECK(cudaFree(d_y));
    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}