# Bquery/Bcolz Taxi Set Performance

Based on the great work by Matthew Rocklin; see http://matthewrocklin.com/blog/work/2016/02/22/dask-distributed-part-2

NB: The auto-caching features make the second (and subsequent) runs faster for multi-column groupings; the timings below reflect this.

In [1]:
import os
import urllib
import glob
import pandas as pd
from bquery import ctable
import bquery
import bcolz
from multiprocessing import Pool, cpu_count
from collections import OrderedDict
import contextlib
import time

# Do not forget to install numexpr.
# Optional switch, currently disabled:
# os.environ["BLOSC_NOLOCK"] = "1"
# Restrict bcolz to one thread; presumably so that any parallelism in the
# benchmarks comes only from the process pool -- TODO confirm.
bcolz.set_nthreads(1)
# Base directory of the benchmark data. The trailing slash matters: other cells
# build paths by plain string concatenation (workdir + 'taxi', workdir + 'taxi_*').
workdir = '/home/carst/Documents/taxi/'

In [2]:
# Maps each ctime() message to its measured duration in seconds, in the
# order the measurements were taken.
elapsed_times = OrderedDict()


@contextlib.contextmanager
def ctime(message=None):
    """Time the enclosed block, print the result and record it.

    The wall-clock duration of the ``with`` body is printed (rounded to four
    decimals) and stored under ``message`` in the module-level
    ``elapsed_times`` mapping, overwriting any earlier entry with the same
    key.  Nothing is printed or recorded when the body raises.

    Args:
        message: Label for this measurement; required despite the ``None``
            default.

    Raises:
        AssertionError: If ``message`` is ``None``.
    """
    assert message is not None
    print('\n')
    t = time.time()
    yield
    # Stop the clock before any reporting so that printing the label is not
    # counted as part of the measured block.
    t_elapsed = time.time() - t
    if message:
        print(message + ":  ")
    print(round(t_elapsed, 4), "sec")
    elapsed_times[message] = t_elapsed

In [3]:
def sub_query(input_args):
    """Run one group-by on the ctable at ``rootdir`` and return it serialized.

    ``input_args`` is a dict with the keys ``'rootdir'``, ``'group_cols'`` and
    ``'measure_cols'``.  The group-by result is converted to a pandas
    DataFrame and returned in msgpack form.
    """
    # Read every argument before touching the table so that a missing key
    # fails first.
    rootdir, group_cols, measure_cols = [
        input_args[key] for key in ('rootdir', 'group_cols', 'measure_cols')
    ]
    table = ctable(rootdir=rootdir, mode='a')
    return table.groupby(group_cols, measure_cols).todataframe().to_msgpack()


def execute_query(ct_list, group_cols, measure_cols):
    """Run a group-by over several ctables in parallel and combine the sums.

    Each entry of ``ct_list`` is handed to ``sub_query`` in a worker process;
    the partial results are concatenated and grouped once more on
    ``group_cols`` so that ``measure_cols`` are summed across all inputs.

    Args:
        ct_list: Iterable of ctable root directories.
        group_cols: List of column names to group by.
        measure_cols: List of column names to sum.

    Returns:
        A pandas DataFrame indexed by ``group_cols`` holding the summed
        ``measure_cols``.

    Raises:
        ValueError: If ``ct_list`` is empty.
    """
    query_list = [{
            'rootdir': rootdir,
            'group_cols': group_cols,
            'measure_cols': measure_cols} for rootdir in ct_list]
    if not query_list:
        # pd.concat would otherwise raise a much less helpful ValueError
        # further down; fail early, before any worker process is started.
        raise ValueError('ct_list is empty: there is nothing to query')

    p = Pool(cpu_count())
    try:
        result_list = p.map(sub_query, query_list)
    finally:
        # Always release the worker processes, even when a sub-query fails.
        p.close()
        p.join()
    result_list = [pd.read_msgpack(x) for x in result_list]
    result_df = pd.concat(result_list, ignore_index=True)
    # The same group can occur in several inputs, so aggregate once more.
    result_df = result_df.groupby(group_cols)[measure_cols].sum()
    return result_df


In [4]:
# List the paths under workdir that match 'taxi_*'.
ct_list = glob.glob(workdir + 'taxi_*')
# Create the work files if they are not available yet: uncomment the lines
# below (judging by their names: download the data, build the bcolz table,
# build the bcolz chunks).
# import bquery.benchmarks.taxi.load as taxi_load
# taxi_load.download_data(workdir)
# taxi_load.create_bcolz(workdir)
# taxi_load.create_bcolz_chunks(workdir)

In [5]:
# Paths under workdir matching 'taxi_*'; these are the inputs handed to
# execute_query in the multi-process cells. The pattern requires the
# underscore, so the plain workdir + 'taxi' path is not included.
ct_list = glob.glob(workdir + 'taxi_*')

In [6]:
# Table at workdir + 'taxi', queried directly by the single-process cells.
ct = ctable(rootdir=workdir + 'taxi', mode='a')
# Columns summed in the "all measures" benchmarks.
measure_list = [
    'extra', 'fare_amount', 'improvement_surcharge', 'mta_tax', 'nr_rides',
    'passenger_count', 'tip_amount', 'tolls_amount', 'total_amount',
    'trip_distance',
]

## Single Process

In [7]:
# Single-process timings: one group-by per grouping, summing nr_rides only.
for label, by in [
        ('CT payment_type nr_rides sum, single process',
         ['payment_type']),
        ('CT yearmonth nr_rides sum, single process',
         ['pickup_yearmonth']),
        ('CT yearmonth + payment_type nr_rides sum, single process',
         ['pickup_yearmonth', 'payment_type'])]:
    with ctime(message=label):
        ct.groupby(by, ['nr_rides'])



CT payment_type nr_rides sum, single process:  
(7.374, 'sec')


CT yearmonth nr_rides sum, single process:  
(6.4091, 'sec')


CT yearmonth + payment_type nr_rides sum, single process:  
(12.3229, 'sec')


## Multi Process

In [8]:
# Multi-process timings: the same three groupings, run through execute_query.
suffix = str(cpu_count()) + ' processors'
for prefix, by in [
        ('CT payment_type nr_rides sum, ',
         ['payment_type']),
        ('CT yearmonth nr_rides sum, ',
         ['pickup_yearmonth']),
        ('CT yearmonth + payment_type nr_rides sum, ',
         ['pickup_yearmonth', 'payment_type'])]:
    with ctime(message=prefix + suffix):
        execute_query(ct_list, by, ['nr_rides'])



CT payment_type nr_rides sum, 8 processors:  
(2.4518, 'sec')


CT yearmonth nr_rides sum, 8 processors:  
(1.9917, 'sec')


CT yearmonth + payment_type nr_rides sum, 8 processors:  
(3.907, 'sec')


## Single Process, All Measures

In [9]:
# Single-process timings with every column in measure_list summed.
for label, by in [
        ('CT payment_type all measure sum, single process',
         ['payment_type']),
        ('CT yearmonth all measure sum, single process',
         ['pickup_yearmonth']),
        ('CT yearmonth + payment_type all measure sum, single process',
         ['pickup_yearmonth', 'payment_type'])]:
    with ctime(message=label):
        ct.groupby(by, measure_list)



CT payment_type all measure sum, single process:  
(25.5256, 'sec')


CT yearmonth all measure sum, single process:  
(22.63, 'sec')


CT yearmonth + payment_type all measure sum, single process:  
(28.4253, 'sec')


## Multi Process, All Measures

In [10]:
# Multi-process timings with every column in measure_list summed.
# The labels are dictionary keys in elapsed_times, so their exact spacing
# (including the double spaces) is kept as is.
suffix = str(cpu_count()) + ' processors'
for prefix, by in [
        ('CT payment_type all measure sum, ',
         ['payment_type']),
        ('CT yearmonth  all measure sum, ',
         ['pickup_yearmonth']),
        ('CT yearmonth + payment_type  all measure sum, ',
         ['pickup_yearmonth', 'payment_type'])]:
    with ctime(message=prefix + suffix):
        execute_query(ct_list, by, measure_list)



CT payment_type all measure sum, 8 processors:  
(7.7187, 'sec')


CT yearmonth  all measure sum, 8 processors:  
(5.062, 'sec')


CT yearmonth + payment_type  all measure sum, 8 processors:  
(7.2776, 'sec')
