-
Notifications
You must be signed in to change notification settings - Fork 11
/
vb_groupby.py
53 lines (45 loc) · 1.27 KB
/
vb_groupby.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from vbench.api import Benchmark
from datetime import datetime
# Import preamble shared by the setup code of the benchmarks in this module.
# It is kept as source text because it is concatenated into the `setup`
# string that is handed to Benchmark, not executed at module import time.
#
# `time` is imported here explicitly: the timed statement in this module
# calls time.sleep(), and the setup source is the only place that brings
# names into scope for it. Without this import the statement only works if
# the harness happens to expose `time` in the execution namespace.
common_setup = """
import bquery
import numpy as np
import random
import os
import tempfile
import itertools as itt
import time
"""
# Setup source for the groupby benchmark. On top of the shared imports it:
#   * defines a row generator that cycles through small pools of values for
#     the first four fields and draws the last three fields from `random`,
#   * seeds `random` so the generated data is reproducible,
#   * materialises `num_rows` rows into a NumPy structured array,
#   * stores them in an on-disk bquery ctable under a fresh temp path,
#   * caches the factorisation of the groupby column and runs one groupby.
#
# The iterators are advanced with the builtin next() rather than the
# Python-2-only `.next()` method, so the setup source runs on Python 2.6+
# and Python 3 alike.
setup = common_setup + """
def gen_almost_unique_row(N):
    # Yield N 7-field tuples matching the dtype used for `data` below.
    pool = itt.cycle(['a', 'b', 'c', 'd', 'e'])
    pool_b = itt.cycle([1.1, 1.2])
    pool_c = itt.cycle([1, 2, 3])
    pool_d = itt.cycle([1, 2, 3])
    for _ in range(N):
        d = (
            next(pool),
            next(pool_b),
            next(pool_c),
            next(pool_d),
            random.random(),
            random.randint(-10, 10),
            random.randint(-10, 10),
        )
        yield d

random.seed(1)

groupby_cols = ['f0']
groupby_lambda = lambda x: x[0]
agg_list = ['f4', 'f5', 'f6']
num_rows = 100000

# -- Data --
g = gen_almost_unique_row(num_rows)
data = np.fromiter(g, dtype='S1,f8,i8,i4,f8,i8,i4')

rootdir = tempfile.mkdtemp(prefix='bcolz-')
# mkdtemp() creates the directory; remove it again so that only the unique
# path is passed on (presumably ctable must create rootdir itself -- confirm)
os.rmdir(rootdir)
fact_bcolz = bquery.ctable(data, rootdir=rootdir)
fact_bcolz.flush()

fact_bcolz.cache_factor(groupby_cols, refresh=True)
result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
"""
# Statement passed to Benchmark as the code to run after the setup source.
# NOTE(review): this only sleeps for one second; the actual groupby call is
# executed inside the setup source, so it is presumably not part of what
# gets timed -- confirm this is intended.
stmt2 = 'time.sleep(1)'

# Benchmark definition exposed at module level.
# NOTE(review): the setup source creates an on-disk ctable under a temp
# directory and nothing here removes it afterwards -- consider a cleanup
# statement if Benchmark supports one (to be confirmed).
bm_groupby2 = Benchmark(
    stmt2,
    setup,
    name='GroupBy test 1',
    start_date=datetime(year=2011, month=7, day=1),
)