/
sample11.cpp
371 lines (309 loc) · 10.9 KB
/
sample11.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
/*
Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
For more information please visit: http://bitmagic.io
*/
/** \example sample11.cpp
Example of how to use various bit counting techniques
\sa bm::bvector<>::count()
\sa bm::bvector<>::count_range()
\sa bm::bvector<>::count_to()
\sa bm::count_and()
\sa bm::bvector<>::counted_enumerator
*/
/*! \file sample11.cpp
\brief Example: bvector<> bit-counting techniques analysis
*/
#include <iostream>
#include <random>
#include <memory>
#include "bm.h"
#include "bmalgo.h"
#include "bmtimer.h"
using namespace std;
// Timing storage for benchmarking: every test registers its measurement here
// through a bm::chrono_taker, and main() prints the collected map at the end.
bm::chrono_taker::duration_map_type timing_map;
// Base number of repetitions per test (some tests run only a fraction of it).
const unsigned benchmark_count = 10000;
// Approximate size of the generated bit-vector; also the upper bound of the
// randomly generated query positions.
unsigned vector_max = 400000000;
// Random number generation used to pick query positions.
std::random_device rand_dev;
std::mt19937 gen(rand_dev()); // Mersenne Twister engine seeded from rand_dev
std::uniform_int_distribution<> rand_dis(1, int(vector_max)); // uniform numbers in [1, vector_max]
/// Populate a bit-vector with alternating dense and sparse stretches,
/// intended to yield a mix of plain bit-blocks and GAP (compressed) blocks.
///
/// The vector is then optimized in place and its block statistics are
/// printed to std::cout.
///
/// @param bv  bit-vector to fill
///
static
void generate_bvector(bm::bvector<>& bv)
{
    unsigned pos = 0;
    while (pos < vector_max)
    {
        // dense stretch: every 10th bit is set (meant to stay as bit-blocks)
        for (unsigned n = 0; n < 65535*8; ++n)
        {
            bv.set(pos);
            pos += 10;
        }
        if (pos > vector_max)
            break;

        // sparse stretch: short runs of random length (meant to become GAP blocks)
        for (unsigned n = 0; n < 65535; ++n)
        {
            const unsigned run_len = rand() % 64;
            bv.set_range(pos, pos + run_len);
            pos += run_len;
            if (pos > vector_max)
                break;
            pos += 120; // distance to the start of the next run
        }
    }

    // compress the vector using a temporary block as scratch space
    BM_DECLARE_TEMP_BLOCK(tb)
    bv.optimize(tb);

    // report how many blocks of each kind the vector ended up with
    bm::bvector<>::statistics stats;
    bv.calc_stat(&stats);
    std::cout << "Bit-vector statistics: GAP (compressed blocks)=" << stats.gap_blocks
              << ", BIT (uncompressed blocks)=" << stats.bit_blocks
              << std::endl << std::endl;
}
/// Warm up the CPU before the measurements to reduce the effect of dynamic
/// frequency scaling on the timings.
///
/// @param bv  bit-vector whose population count is computed repeatedly
/// @return    a value derived from all computed counts; callers should use
///            it (e.g. print it) so the work is not optimized away
///
static
unsigned pre_heat(const bm::bvector<>& bv)
{
    unsigned running_total = 0;
    unsigned mix = 1;
    for (unsigned pass = benchmark_count; pass != 0; --pass)
    {
        running_total += bv.count();
        mix += running_total * running_total; // unsigned arithmetic, wraps on overflow
    }
    return mix;
}
/// Test 1: population count of the whole vector via bvector<>::count()
///
/// @param bv  bit-vector to run the benchmark on
///
static
void bv_count_test(const bm::bvector<>& bv)
{
    const unsigned repeats = benchmark_count / 2;
    unsigned checksum = 0;
    {
        // the timer object covers exactly this scope
        bm::chrono_taker tt1("1. bvector<>::count()", repeats, &timing_map);
        for (unsigned pass = 0; pass != repeats; ++pass)
            checksum += bv.count();
    }
    // printing the accumulated value keeps the compiler from dropping the loop
    std::cout << "Count test finished." << checksum << "\r";
}
/// Test 2: population count of random [from, to] ranges via
/// bvector<>::count_range()
///
/// @param bv  bit-vector to run the benchmark on
///
static
void bv_count_range(const bm::bvector<>& bv)
{
    unsigned checksum = 0;
    {
        // the timer object covers exactly this scope
        bm::chrono_taker tt1("2. bvector<>::count_range()", benchmark_count, &timing_map);
        for (unsigned pass = 0; pass != benchmark_count; ++pass)
        {
            // two random positions, ordered so that lo <= hi
            unsigned lo = static_cast<unsigned>(rand_dis(gen));
            unsigned hi = static_cast<unsigned>(rand_dis(gen));
            if (hi < lo)
                std::swap(lo, hi);
            checksum += bv.count_range(lo, hi);
        }
    }
    // printing the accumulated value keeps the compiler from dropping the loop
    std::cout << "Count range test finished." << checksum << "\r";
}
/// Test 3: count_range() accelerated with a pre-calculated list of
/// per-block bit counts
///
/// @param bv  bit-vector to run the benchmark on
///
static
void bv_count_range_acc(const bm::bvector<>& bv)
{
    // Per-block population count table used to accelerate count_range().
    // It has bm::set_total_blocks entries, so it is kept on the heap instead
    // of in a large automatic array, and it is zero-initialized so that no
    // entry is ever read uninitialized.
    // Building the table is intentionally excluded from the timing measurements.
    std::unique_ptr<unsigned[]> blocks_cnt(new unsigned[bm::set_total_blocks]());
    unsigned* const block_counts = blocks_cnt.get();
    bv.count_blocks(block_counts);

    unsigned cnt = 0;
    {
        // the timer object covers exactly this scope
        bm::chrono_taker tt1("3. bvector<>::count_range() with blocks list", benchmark_count, &timing_map);
        for (unsigned i = 0; i < benchmark_count; ++i)
        {
            // two random positions, ordered so that from <= to
            unsigned from = unsigned(rand_dis(gen));
            unsigned to = unsigned(rand_dis(gen));
            if (from > to)
                std::swap(from, to);
            cnt += bv.count_range(from, to, block_counts); // use blocks count for acceleration
        }
    }
    // printing the accumulated value keeps the compiler from dropping the loop
    std::cout << "Count range with blocks test finished." << cnt << "\r";
}
/// Test 4: count_to() accelerated with a pre-calculated rank-select index
///
/// @param bv  bit-vector to run the benchmark on
///
static
void bv_count_to_acc(const bm::bvector<>& bv)
{
    using rs_index_t = bm::bvector<>::rs_index_type;

    // rank-select index used by count_to(); built before the timed section
    std::unique_ptr<rs_index_t> rs_idx(new rs_index_t());
    bv.build_rs_index(rs_idx.get());

    unsigned checksum = 0;
    {
        // the timer object covers exactly this scope
        bm::chrono_taker tt1("4. bvector<>::count_to() with rs_index", benchmark_count, &timing_map);
        for (unsigned pass = 0; pass != benchmark_count; ++pass)
        {
            const unsigned upto = static_cast<unsigned>(rand_dis(gen));
            checksum += bv.count_to(upto, *rs_idx); // index-accelerated count
        }
    }
    // printing the accumulated value keeps the compiler from dropping the loop
    std::cout << "Count to with blocks test finished." << checksum << "\r";
}
/// Test 5: range population count computed as the difference of two
/// count_to() calls, both using a pre-calculated rank-select index
///
/// @param bv  bit-vector to run the benchmark on
///
static
void bv_count_to_range_acc(const bm::bvector<>& bv)
{
    using rs_index_t = bm::bvector<>::rs_index_type;

    // rank-select index used by count_to(); built before the timed section
    std::unique_ptr<rs_index_t> rs_idx(new rs_index_t());
    bv.build_rs_index(rs_idx.get());

    unsigned checksum = 0;
    {
        // the timer object covers exactly this scope
        bm::chrono_taker tt1("5. bvector<>::count_to to simulate count_range()", benchmark_count, &timing_map);
        for (unsigned pass = 0; pass != benchmark_count; ++pass)
        {
            // two random positions, ordered so that lo <= hi
            unsigned lo = static_cast<unsigned>(rand_dis(gen));
            unsigned hi = static_cast<unsigned>(rand_dis(gen));
            if (hi < lo)
                std::swap(lo, hi);

            // count in [lo, hi] = count_to(hi) - count_to(lo - 1);
            // lo - 1 does not wrap because rand_dis never yields less than 1
            const unsigned upto_hi = bv.count_to(hi, *rs_idx);
            const unsigned below_lo = bv.count_to(lo - 1, *rs_idx);
            checksum += upto_hi - below_lo;
        }
    }
    // printing the accumulated value keeps the compiler from dropping the loop
    std::cout << "Count range via count_to test finished." << checksum << "\r";
}
/// Test 6: range population count computed with bm::count_and() against a
/// mask vector
///
/// This approach can be used when several ranges need to be counted in a
/// single call.
///
/// @param bv  bit-vector to run the benchmark on
///
static
void bv_count_and(const bm::bvector<>& bv)
{
    unsigned checksum = 0;
    {
        // the timer object covers exactly this scope, mask construction included
        bm::chrono_taker tt1("6. bm::count_and with mask vector", benchmark_count, &timing_map);
        bm::bvector<> range_mask(bm::BM_GAP); // compressed mask, better results on long ranges
        for (unsigned pass = 0; pass != benchmark_count; ++pass)
        {
            // two random positions, ordered so that lo <= hi
            unsigned lo = static_cast<unsigned>(rand_dis(gen));
            unsigned hi = static_cast<unsigned>(rand_dis(gen));
            if (hi < lo)
                std::swap(lo, hi);

            range_mask.set_range(lo, hi, true);   // mark the range in the mask
            checksum += bm::count_and(bv, range_mask);
            range_mask.clear(true);               // clear and free memory (faster)
        }
    }
    // printing the accumulated value keeps the compiler from dropping the loop
    std::cout << "count AND finished." << checksum << "\r";
}
/// Test 7: count_to implemented with bm::bvector<>::counted_enumerator
///
/// A counted enumerator is an iterator that keeps a running population
/// count while it walks the set bits of the vector.
///
/// @param bv  bit-vector to run the benchmark on
///
static
void bv_counted_enumerator(const bm::bvector<>& bv)
{
    // this is a slow method, so it runs fewer iterations than the other tests
    const unsigned repeats = benchmark_count / 20;
    unsigned checksum = 0;
    {
        // the timer object covers exactly this scope
        bm::chrono_taker tt1("7. bm::bvector<>::counted_enumerator", repeats, &timing_map);
        for (unsigned pass = 0; pass != repeats; ++pass)
        {
            const unsigned limit = static_cast<unsigned>(rand_dis(gen));
            bm::bvector<>::counted_enumerator walker = bv.first();
            // advance until the vector is exhausted or a bit past the limit is reached
            while (walker.valid() && *walker <= limit)
                ++walker;
            checksum += walker.count();
        }
    }
    std::cout << "counted_enumerator finished." << checksum << "\r";
}
/// Program entry point: generates the test bit-vector, runs all bit-counting
/// benchmarks in order and prints the collected timings.
///
/// @return 0 on success, 1 if a std::exception was caught
///
int main(void)
{
    try
    {
        bm::bvector<> bv;
        generate_bvector(bv);

        // pre-heat the CPU to minimize dynamic overclocking effects;
        // the result is printed so the warm-up work is not optimized away
        unsigned s = pre_heat(bv);
        std::cout << s << "\r";

        // Test 1.
        // Uses plain bvector<>::count() to compute the global population count.
        // This function would benefit from SIMD (SSE42 / AVX2) acceleration.
        //
        bv_count_test(bv);

        // Test 2.
        // Uses bvector<>::count_range() to compute the population count in a
        // randomly generated region of the bit-vector.
        // This should naturally be faster than Test 1, because the range is
        // smaller than the whole vector.
        //
        bv_count_range(bv);

        // Test 3.
        // Uses bvector<>::count_range() together with bvector<>::count_blocks()
        // (pre-calculated bit count for each block).
        // It makes sense to use this method if the bit-vector is constant
        // (or changes infrequently) and many range counts are needed.
        //
        bv_count_range_acc(bv);

        // Test 4.
        // Uses bvector<>::count_to() to compute the population count up to a
        // specified element. Equivalent of count_range(0, to).
        // This method uses an acceleration structure (rank-select index)
        // built with bvector<>::build_rs_index().
        // It is similar to count_range acceleration, but uses a different
        // (faster) algorithm.
        //
        bv_count_to_acc(bv);

        // Test 5.
        // Uses bvector<>::count_to() twice to simulate count_range()
        // using the counting difference:
        //      count_r = count_to(to) - count_to(from - 1)
        // This method can actually be faster than count_range().
        //
        bv_count_to_range_acc(bv);

        // Test 6.
        // Computes the range population count via a mask vector and a logical
        // AND operation.
        // Not the fastest method, but can be useful when multiple ranges need
        // to be computed.
        //
        bv_count_and(bv);

        // Test 7.
        // Computes the count using the counted_enumerator iterator;
        // the method combines iteration over the bit-vector with a sliding
        // population count.
        //
        bv_counted_enumerator(bv);

        // print all test timing results
        //
        std::cout << " "
                  << std::endl;
        bm::chrono_taker::print_duration_map(timing_map, bm::chrono_taker::ct_ops_per_sec);
    }
    catch (const std::exception& ex) // the handler only reads the exception
    {
        std::cerr << ex.what() << std::endl;
        return 1;
    }
    return 0;
}