-
Notifications
You must be signed in to change notification settings - Fork 11
/
bench_groupby.py
96 lines (81 loc) · 2.41 KB
/
bench_groupby.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from __future__ import print_function
# bench related imports
import numpy as np
import shutil
import bquery
import pandas as pd
import itertools as itt
import cytoolz
import cytoolz.dicttoolz
from toolz import valmap, compose
from cytoolz.curried import pluck
import blaze as blz
# other imports
import contextlib
import os
import time
try:
# Python 2
from itertools import izip
except ImportError:
# Python 3
izip = zip
# Duration, in seconds, of the most recently completed ``ctime`` block.
# It is reset to 0.0 every time a new ``ctime`` block is entered.
t_elapsed = 0.0


@contextlib.contextmanager
def ctime(message=None):
    """Time the enclosed block and print the elapsed seconds.

    The duration is also stored in the module-level ``t_elapsed`` so the
    caller can read it once the ``with`` statement has finished.  While the
    block is still running ``t_elapsed`` is 0.0.

    If the enclosed block raises, the exception propagates, no timing line is
    printed and ``t_elapsed`` stays at 0.0.

    :param message: optional label printed in front of the timing, as
        ``"<message>: <seconds> sec"``.  When falsy only the timing is
        printed.
    """
    global t_elapsed
    # Use the high-resolution performance counter when the interpreter offers
    # it (Python 3.3+); fall back to wall-clock time on Python 2.
    clock = getattr(time, 'perf_counter', time.time)
    t_elapsed = 0.0
    print('\n')
    started = clock()
    yield
    # Stop the clock before reporting, so that writing the label to stdout is
    # not counted as part of the benchmarked work.
    t_elapsed = clock() - started
    if message:
        print(message + ": ", end='')
    print(round(t_elapsed, 4), "sec")
# -- benchmark configuration --
# Endless per-column value generators; zipping them yields a repeating
# pattern of (f0, f1, f2, f3) rows.
ga = itt.cycle(['ES', 'NL'])
gb = itt.cycle(['b1', 'b2', 'b3', 'b4', 'b5'])
gx = itt.cycle([1, 2])
gy = itt.cycle([-1, -2])

rootdir = 'bench-data.bcolz'
# Start from a clean slate in case an earlier run left its data behind.
if os.path.exists(rootdir):
    shutil.rmtree(rootdir)

n_rows = 1000000
print('Rows: ', n_rows)


def _print_slowdown(elapsed, baseline):
    """Print how many times slower ``elapsed`` is than ``baseline``."""
    print('x{0} slower than pandas'.format(round(elapsed / baseline, 2)))


try:
    # -- data
    # izip already yields one tuple per row, so it can feed fromiter directly.
    z = np.fromiter(izip(ga, gb, gx, gy),
                    dtype='S2,S2,i8,i8', count=n_rows)
    ct = bquery.ctable(z, rootdir=rootdir)
    print(ct)

    # -- pandas --
    # The pandas timing is the baseline every other engine is compared with.
    df = pd.DataFrame(z)
    with ctime(message='pandas'):
        result = df.groupby(['f0'])['f2'].sum()
    print(result)
    # t_elapsed is only meaningful once the ``with`` block has exited.
    t_pandas = t_elapsed

    # -- cytoolz --
    with ctime(message='cytoolz over bcolz'):
        # In Memory Split-Apply-Combine
        # http://toolz.readthedocs.org/en/latest/streaming-analytics.html?highlight=reduce#split-apply-combine-with-groupby-and-reduceby
        r = cytoolz.groupby(lambda row: row.f0, ct)
        result = valmap(compose(sum, pluck(2)), r)
    _print_slowdown(t_elapsed, t_pandas)
    print(result)

    # -- blaze + bcolz --
    # The expression is built outside the timed block; only compute() is
    # measured.
    blaze_data = blz.Data(ct.rootdir)
    expr = blz.by(blaze_data.f0, sum_f2=blaze_data.f2.sum())
    with ctime(message='blaze over bcolz'):
        result = blz.compute(expr)
    _print_slowdown(t_elapsed, t_pandas)
    print(result)

    # -- bquery --
    with ctime(message='bquery over bcolz'):
        result = ct.groupby(['f0'], ['f2'])
    _print_slowdown(t_elapsed, t_pandas)
    print(result)

    # NOTE(review): presumably this pre-computes and stores the factorization
    # of f0 so the next groupby can reuse it -- confirm against bquery docs.
    ct.cache_factor(['f0'], refresh=True)
    with ctime(message='bquery over bcolz (factorization cached)'):
        result = ct.groupby(['f0'], ['f2'])
    _print_slowdown(t_elapsed, t_pandas)
    print(result)
finally:
    # Remove the benchmark data even when one of the steps above fails, so a
    # broken run does not leave the directory behind.
    if os.path.exists(rootdir):
        shutil.rmtree(rootdir)