-
Notifications
You must be signed in to change notification settings - Fork 3
/
parallel-reduction.cpp
250 lines (196 loc) · 6.74 KB
/
parallel-reduction.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#include <algorithm>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdlib>
#include <functional>
#include <future>
#include <iostream>
#include <memory>
#include <mutex>
#include <numeric>
#include <queue>
#include <thread>
#include <type_traits>
#include <vector>
// MoC ("move-on-copy") wrapper: lets a move-only object (e.g. std::promise)
// be captured inside a std::function, which requires copyable callables.
// The copy constructor actually MOVES from `other` -- legal only because
// `object` is declared `mutable`, so it is non-const even through a const
// reference. Consequence: copies steal the payload; only the most recently
// made copy holds a meaningful `object`.
template <typename T>
struct MoC {
// Take ownership of rhs by move.
MoC(T&& rhs) : object(std::move(rhs)) {}
// "Copy" constructor that moves; leaves other.object moved-from.
MoC(const MoC& other) : object(std::move(other.object)) {}
// Access the wrapped object.
T& get() { return object; }
// mutable so the const "copy" constructor above may move from it.
mutable T object;
};
// ----------------------------------------------------------------------------
// Threadpool: fixed-size pool of worker threads pulling type-erased tasks
// from a shared FIFO queue. insert() returns a std::future<void> the caller
// can block on; for_each()/reduce() build parallel algorithms on top of it.
//
// Synchronization: `queue` and `stop` are only touched while holding `mtx`;
// `cv` wakes workers when a task is pushed or shutdown() is requested.
// ----------------------------------------------------------------------------
class Threadpool {
public:

  // Constructor spawns N worker threads. Each worker loops: wait until a
  // task is available (or a stop was requested), pop one task, run it
  // outside the lock. Pending tasks are drained before a worker honors the
  // stop flag, so no accepted task (and its promise) is ever dropped.
  Threadpool(size_t N) {
    for(size_t i=0; i<N; i++) {
      threads.emplace_back([this](){
        while(true) {
          std::function<void()> task;
          {
            std::unique_lock lock(mtx);
            // predicate form: re-checked under the lock after every wakeup,
            // so `stop` is never read without `mtx` held (no data race)
            cv.wait(lock, [this](){ return stop || !queue.empty(); });
            if(queue.empty()) {
              // predicate held and queue is empty => stop was requested and
              // all pending work is done; this worker exits
              return;
            }
            // move, don't copy, the task out of the queue
            task = std::move(queue.front());
            queue.pop();
          }
          // run outside the lock so other workers can make progress
          task();
        }
      });
    }
  }

  // Destructor stops the pool (idempotent even if shutdown() was already
  // called by the user) and joins all workers to release their resources.
  // Without the shutdown() call here, a forgotten user-side shutdown()
  // would make join() block forever.
  ~Threadpool() {
    shutdown();
    for(auto& t : threads) {
      if(t.joinable()) {
        t.join();
      }
    }
  }

  // Ask all workers to exit once the queue is drained. Safe to call
  // multiple times.
  void shutdown() {
    {
      std::scoped_lock lock(mtx);
      stop = true;
    }
    // notify after releasing the lock so woken workers don't immediately
    // block on a held mutex
    cv.notify_all();
  }

  // Insert a callable into the queue. Returns a future that becomes ready
  // after the task has run. NOTE: with a zero-thread pool this future can
  // never become ready, since only workers execute tasks.
  template <typename C>
  auto insert(C&& task) {
    // std::function requires a copyable callable, but std::promise is
    // move-only; holding it via shared_ptr keeps the wrapper copyable
    // (replaces the fragile move-on-copy wrapper).
    auto promise = std::make_shared<std::promise<void>>();
    auto fu = promise->get_future();
    {
      std::scoped_lock lock(mtx);
      queue.push(
        [promise, task=std::forward<C>(task)] () mutable {
          task();
          promise->set_value();
        }
      );
    }
    cv.notify_one();
    return fu;
  }

  // Apply func to every element of [beg, end) in parallel. Workers claim
  // chunks of `chunk_size` elements through an atomic counter until the
  // range is exhausted; the caller blocks until all chunks are processed.
  // Requires random-access iterators (uses beg + offset).
  template <typename Input, typename F>
  void for_each(Input beg, Input end, F func, size_t chunk_size = 1) {
    // the total number of elements in the range [beg, end)
    size_t N = std::distance(beg, end);
    // a zero-sized chunk would never advance the counter (infinite loop)
    if(chunk_size == 0) {
      chunk_size = 1;
    }
    // no workers: fall back to a serial pass so the work still happens
    if(threads.empty()) {
      std::for_each(beg, end, func);
      return;
    }
    std::vector<std::future<void>> futures;
    std::atomic<size_t> takens {0};
    for(size_t i=0; i<threads.size(); i++) {
      futures.emplace_back(insert([N, beg, func, chunk_size, &takens](){
        // claim the next chunk; relaxed ordering suffices because the
        // futures below provide the final synchronization with the caller
        size_t curr_b = takens.fetch_add(chunk_size, std::memory_order_relaxed);
        while(curr_b < N) {
          size_t curr_e = std::min(N, curr_b + chunk_size);
          // apply func to the range beg + [curr_b, curr_e)
          std::for_each(beg + curr_b, beg + curr_e, func);
          // get the next chunk
          curr_b = takens.fetch_add(chunk_size, std::memory_order_relaxed);
        }
      }));
    }
    // caller waits for all worker tasks to finish
    for(auto & fu : futures) {
      fu.get();
    }
  }

  // Parallel reduction of [beg, end) with initial value `init` and binary
  // operator `bop`. Each worker pre-reduces two elements to seed a local
  // accumulator without touching `init`, folds whole chunks into it, and
  // finally merges the accumulator into `init` under a mutex.
  // Requires random-access iterators; bop should be associative.
  template <typename Input, typename T, typename F>
  T reduce(Input beg, Input end, T init, F bop, size_t chunk_size = 2) {
    size_t N = std::distance(beg, end);
    // a zero-sized chunk would never advance the counter (infinite loop)
    if(chunk_size == 0) {
      chunk_size = 1;
    }
    // no workers: a serial reduction keeps the result correct instead of
    // silently returning init
    if(threads.empty()) {
      return std::accumulate(beg, end, init, bop);
    }
    std::vector<std::future<void>> futures;
    std::atomic<size_t> takens {0};
    std::mutex mutex;
    for(size_t i=0; i<threads.size(); i++) {
      futures.emplace_back(insert([N, beg, bop, &init, &mutex, chunk_size, &takens](){
        // pre-reduce: claim two elements to build the local accumulator
        size_t curr_b = takens.fetch_add(2, std::memory_order_relaxed);
        // corner case #1: no more elements for this worker
        if(curr_b >= N) {
          return;
        }
        // corner case #2: exactly one element left -- fold it into init
        if(N - curr_b == 1) {
          std::scoped_lock lock(mutex);
          init = bop(init, *(beg + curr_b));
          return;
        }
        // reduce the claimed pair into the local accumulator
        T temp = bop( *(beg+curr_b), *(beg+curr_b+1) );
        curr_b = takens.fetch_add(chunk_size, std::memory_order_relaxed);
        while(curr_b < N) {
          size_t curr_e = std::min(N, curr_b + chunk_size);
          // sequential fold of beg + [curr_b, curr_e) into the accumulator
          temp = std::accumulate(beg + curr_b, beg + curr_e, temp, bop);
          // get the next chunk
          curr_b = takens.fetch_add(chunk_size, std::memory_order_relaxed);
        }
        // merge the local accumulator into the shared result
        {
          std::scoped_lock lock(mutex);
          init = bop(init, temp);
        }
      }));
    }
    // caller waits for all worker tasks to finish
    for(auto & fu : futures) {
      fu.get();
    }
    return init;
  }

private:

  std::mutex mtx;                              // guards queue and stop
  std::vector<std::thread> threads;            // worker threads
  std::condition_variable cv;                  // signals task arrival / stop
  bool stop {false};                           // set (once) by shutdown()
  std::queue< std::function<void()> > queue;   // pending tasks, FIFO
};
// Sequential baseline: reduce vec to the sum of its elements starting
// from 0, mirroring what the parallel version computes.
auto seq_reduce(std::vector<int>& vec) {
  int sum = 0;
  for(const int value : vec) {
    sum = sum + value;
  }
  return sum;
}
// Parallel counterpart of seq_reduce: sums vec on the given threadpool,
// letting workers grab 1024 elements per chunk.
auto par_reduce(std::vector<int>& vec, Threadpool& threadpool) {
  const auto add = [](int lhs, int rhs) { return lhs + rhs; };
  return threadpool.reduce(vec.begin(), vec.end(), 0, add, 1024);
}
// Entry point. usage: ./a.out T N
//   T: number of worker threads in the pool (must be > 0)
//   N: number of elements in the input vector (each set to 1)
// Times a sequential and a parallel reduction over the same data and
// verifies that the two results agree.
int main(int argc, char* argv[]) {
  if(argc != 3) {
    std::cerr << "usage: ./a.out T N\n";
    std::exit(EXIT_FAILURE);
  }
  // parse as long long first so negative/huge inputs are caught by the
  // range check instead of silently wrapping into an enormous size_t
  const long long t_arg = std::atoll(argv[1]);
  const long long n_arg = std::atoll(argv[2]);
  if(t_arg <= 0 || n_arg < 0) {
    std::cerr << "T must be > 0 and N must be >= 0\n";
    std::exit(EXIT_FAILURE);
  }
  size_t T = static_cast<size_t>(t_arg);
  size_t N = static_cast<size_t>(n_arg);
  // create a thread pool with T workers
  Threadpool threadpool(T);
  // N elements, all initialized to 1, so the expected sum is N
  std::vector<int> vec(N, 1);
  // run reduce sequentially
  std::cout << "running seq_reduce ... ";
  auto beg = std::chrono::steady_clock::now();
  auto res1 = seq_reduce(vec);
  auto end = std::chrono::steady_clock::now();
  std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(end-beg).count()
            << "ns\n";
  // run reduce in parallel
  std::cout << "running par_reduce ... ";
  beg = std::chrono::steady_clock::now();
  auto res2 = par_reduce(vec, threadpool);
  end = std::chrono::steady_clock::now();
  std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(end-beg).count()
            << "ns\n";
  // the two reductions must agree; return (not exit) so local destructors
  // still run and the pool joins cleanly
  if(res1 != res2) {
    std::cerr << "result mismatch: seq=" << res1 << " par=" << res2 << "\n";
    threadpool.shutdown();
    return EXIT_FAILURE;
  }
  // shut down the threadpool before its destructor joins the workers
  threadpool.shutdown();
  return 0;
}