forked from PacktPublishing/Learn-CUDA-Programming
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreduction.cpp
119 lines (92 loc) · 3.08 KB
/
reduction.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#include <stdio.h>
#include <stdlib.h>
// cuda runtime
#include <cuda_runtime.h>
#include <helper_timer.h>
#include "reduction.h"
// Times `reduce` over (d_outPtr, d_inPtr, size) and prints ms/bandwidth; defined below.
void run_benchmark(void (*reduce)(float*, float*, int, int),
float *d_outPtr, float *d_inPtr, int size);
// Fills `data` with small pseudo-random values; defined below.
void init_input(float* data, int size);
// Host-side reference sum used to validate the GPU result; defined below.
float get_cpu_result(float *data, int size);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
float *h_inPtr;
float *d_inPtr, *d_outPtr;
unsigned int size = 1 << 24;
float result_host, result_gpu;
int mode = 0;
if (argc > 1 && atoi(argv[1]) == 1)
mode = 1;
srand(2019);
// Allocate memory
h_inPtr = (float*)malloc(size * sizeof(float));
// Data initialization with random values
init_input(h_inPtr, size);
// Prepare GPU resource
cudaMalloc((void**)& d_inPtr, size * sizeof(float));
cudaMalloc((void**)&d_outPtr, size * sizeof(float));
cudaMemcpy(d_inPtr, h_inPtr, size * sizeof(float), cudaMemcpyHostToDevice);
// Get reduction result from GPU
run_benchmark(atomic_reduction, d_outPtr, d_inPtr, size);
cudaMemcpy(&result_gpu, &d_outPtr[0], sizeof(float), cudaMemcpyDeviceToHost);
// Get all sum from CPU
result_host = get_cpu_result(h_inPtr, size);
printf("host: %f, device %f\n", result_host, result_gpu);
// Terminates memory
cudaFree(d_outPtr);
cudaFree(d_inPtr);
free(h_inPtr);
return 0;
}
// Thin indirection so the benchmark can drive any reduction implementation
// through a uniform function-pointer interface.
void run_reduction(void (*reduce)(float*, float*, int, int),
    float *d_outPtr, float *d_inPtr, int size, int n_threads)
{
    (*reduce)(d_outPtr, d_inPtr, size, n_threads);
}
// Runs `reduce` test_iter times over `size` floats and prints the average
// per-iteration time plus the effective memory bandwidth in GB/s.
// Each iteration first restores the input with a device-to-device copy,
// because the reduction is then run in place on d_outPtr.
void run_benchmark(void (*reduce)(float*, float*, int, int),
    float *d_outPtr, float *d_inPtr, int size)
{
    const int n_threads = 256;
    const int n_iter    = 100;

    // One untimed run so one-time launch overhead stays out of the average.
    reduce(d_outPtr, d_inPtr, size, n_threads);

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    ////////
    // Operation body
    ////////
    for (int iter = 0; iter < n_iter; iter++) {
        cudaMemcpy(d_outPtr, d_inPtr, size * sizeof(float),
                   cudaMemcpyDeviceToDevice);
        run_reduction(reduce, d_outPtr, d_outPtr, size, n_threads);
    }

    // Kernel launches are asynchronous; drain the device before stopping
    // the host-side clock.
    cudaDeviceSynchronize();
    sdkStopTimer(&timer);

    // Average time per iteration (ms) and derived bandwidth.
    float msec = sdkGetTimerValue(&timer) / (float)n_iter;
    float bandwidth = size * sizeof(float) / msec / 1e6;
    printf("Time= %.3f msec, bandwidth= %f GB/s\n", msec, bandwidth);

    sdkDeleteTimer(&timer);
}
// Fills `data` with pseudo-random values in [0, 255/RAND_MAX].
// Only the low 8 bits of rand() are used so each value is tiny, which keeps
// truncation error small when millions of them are summed in float.
void init_input(float *data, int size)
{
    for (int idx = 0; idx < size; ++idx)
        data[idx] = (float)(rand() & 0xFF) / (float)RAND_MAX;
}
// Host-side reference sum. Accumulates in double to limit rounding error
// across many additions, then narrows to float for comparison with the GPU.
float get_cpu_result(float *data, int size)
{
    double acc = 0.0;
    int i = 0;
    while (i < size)
        acc += data[i++];
    return (float)acc;
}