-
Notifications
You must be signed in to change notification settings - Fork 255
/
Copy pathcuda_callback.cu
163 lines (127 loc) · 4.44 KB
/
cuda_callback.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#include <cstdio>
#include <helper_timer.h>
using namespace std;
__global__ void vecAdd_kernel(float *c, const float* a, const float* b);
void init_buffer(float *data, const int size);
class Operator
{
private:
int _index;
cudaStream_t stream;
StopWatchInterface *p_timer;
static void CUDART_CB Callback(cudaStream_t stream, cudaError_t status, void* userData);
void print_time();
public:
Operator() {
cudaStreamCreate(&stream);
sdkCreateTimer(&p_timer);
}
~Operator() {
cudaStreamDestroy(stream);
sdkDeleteTimer(&p_timer);
}
void set_index(int index) { _index = index; }
void async_operation(float *h_c, const float *h_a, const float *h_b,
float *d_c, float *d_a, float *d_b,
const int size, const int bufsize);
}; // Operator
void Operator::CUDART_CB Callback(cudaStream_t stream, cudaError_t status, void* userData) {
Operator* this_ = (Operator*) userData;
this_->print_time();
}
void Operator::print_time() {
sdkStopTimer(&p_timer); // end timer
float elapsed_time_msed = sdkGetTimerValue(&p_timer);
printf("stream %2d - elapsed %.3f ms \n", _index, elapsed_time_msed);
}
void Operator::async_operation(float *h_c, const float *h_a, const float *h_b,
float *d_c, float *d_a, float *d_b,
const int size, const int bufsize)
{
// start timer
sdkStartTimer(&p_timer);
// copy host -> device
cudaMemcpyAsync(d_a, h_a, bufsize, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, h_b, bufsize, cudaMemcpyHostToDevice, stream);
// launch cuda kernel
dim3 dimBlock(256);
dim3 dimGrid(size / dimBlock.x);
vecAdd_kernel<<< dimGrid, dimBlock, 0, stream >>>(d_c, d_a, d_b);
// copy device -> host
cudaMemcpyAsync(h_c, d_c, bufsize, cudaMemcpyDeviceToHost, stream);
// register callback function
cudaStreamAddCallback(stream, Operator::Callback, this, 0);
}
int main(int argc, char* argv[])
{
float *h_a, *h_b, *h_c;
float *d_a, *d_b, *d_c;
int size = 1 << 24;
int bufsize = size * sizeof(float);
int num_operator = 4;
if (argc != 1)
num_operator = atoi(argv[1]);
// initialize timer
StopWatchInterface *timer;
sdkCreateTimer(&timer);
// allocate host memories
cudaMallocHost((void**)&h_a, bufsize);
cudaMallocHost((void**)&h_b, bufsize);
cudaMallocHost((void**)&h_c, bufsize);
// initialize host values
srand(2019);
init_buffer(h_a, size);
init_buffer(h_b, size);
init_buffer(h_c, size);
// allocate device memories
cudaMalloc((void**)&d_a, bufsize);
cudaMalloc((void**)&d_b, bufsize);
cudaMalloc((void**)&d_c, bufsize);
// create list of operation elements
Operator *ls_operator = new Operator[num_operator];
sdkStartTimer(&timer);
// execute each operator collesponding data
for (int i = 0; i < num_operator; i++) {
int offset = i * size / num_operator;
ls_operator[i].set_index(i);
ls_operator[i].async_operation(&h_c[offset], &h_a[offset], &h_b[offset],
&d_c[offset], &d_a[offset], &d_b[offset],
size / num_operator, bufsize / num_operator);
}
// synchronize all the stream operation
cudaDeviceSynchronize();
sdkStopTimer(&timer);
// print out the result
int print_idx = 256;
printf("compared a sample result...\n");
printf("host: %.6f, device: %.6f\n", h_a[print_idx] + h_b[print_idx], h_c[print_idx]);
// Compute and print the performance
float elapsed_time_msed = sdkGetTimerValue(&timer);
float bandwidth = 3 * bufsize * sizeof(float) / elapsed_time_msed / 1e6;
printf("Time= %.3f msec, bandwidth= %f GB/s\n", elapsed_time_msed, bandwidth);
// delete timer
sdkDeleteTimer(&timer);
// terminate operators
delete [] ls_operator;
// terminate device memories
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
// terminate host memories
cudaFreeHost(h_a);
cudaFreeHost(h_b);
cudaFreeHost(h_c);
return 0;
}
__global__ void
vecAdd_kernel(float *c, const float* a, const float* b)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = 0; i < 500; i++)
c[idx] = a[idx] + b[idx];
}
void init_buffer(float *data, const int size)
{
for (int i = 0; i < size; i++)
data[i] = rand() / (float)RAND_MAX;
}