In [1]:
# flake8: noqa

import pyarrow.feather as feather
import pandas as pd
import json
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from pandas.util.testing import rands
import gc
import os
import time

pa.set_cpu_count(8)

print(f"using {pa.cpu_count()} cpu cores")
    

def get_timing(f, niter=1):
    start = time.clock_gettime(time.CLOCK_REALTIME)
    for i in range(niter):
        f()
    result = (time.clock_gettime(time.CLOCK_REALTIME) - start) / niter
    return result


files = {
    'fanniemae': {
        'base': '2016Q4',
        'source': {
            'path': '2016Q4.txt',
            'sep': '|',
            'header': None
        }
    },
    'nyctaxi': {
        'base': 'yellow_tripdata_2010-01',
        'source': {
            'path': 'yellow_tripdata_2010-01.csv',
            'sep': ',',
            'header': 0
        }
    }
}


compression_cases = [
    (None, None),   # uncompressed
    ('zstd', 1),    # minimal compression
    ('zstd', 10),   # moderate
    ('lz4', None)   # LZ4 doesn't support compression level
]


def write_files(files, chunksize=1<<16):
    statistics = []
    for name, info in files.items():
        source = info['source']
        print("reading {}".format(source['path']))
        df = pd.read_csv(source['path'], sep=source['sep'], 
                         header=source['header'], 
                         low_memory=False)
        if source['header'] is None:
            df.columns = ['f{}'.format(i) for i in range(len(df.columns))]

        t = (pa.Table.from_pandas(df, preserve_index=False)
             .replace_schema_metadata(None))
        for compression, compression_level in compression_cases:
            path = '{}_{}_{}.feather'.format(info['base'], 
                                             compression or 'uncompressed',
                                             compression_level)
            print((name, compression, compression_level))
            tm = get_timing(lambda: 
                            feather.write_feather(df, path, compression=compression,
                                                  compression_level=compression_level,
                                                  chunksize=chunksize))
            file_size = os.stat(path).st_size
            result = name, compression, compression_level, file_size, tm
            print(result)
            statistics.append(result)
    return statistics

def get_read_results():
    all_results = []
    for name, info in files.items():
        for compression, compression_level in compression_cases:
            path = '{}_{}_{}.feather'.format(info['base'], 
                                             compression or 'uncompressed',
                                             compression_level)
            read_time = get_timing(lambda: feather.read_feather(path))
            result = name, compression, compression_level, read_time
            print(result)
            all_results.append(result) 
    return all_results

using 8 cpu cores


In [3]:
chunksizes = [1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15,
              1 << 16]

results_by_chunksize = {}
for chunksize in chunksizes:
    print(chunksize)
    write_results = write_files(files, chunksize=chunksize)
    read_results = get_read_results()    
    results_by_chunksize[chunksize] = write_results, read_results

1024
reading 2016Q4.txt
('fanniemae', None, None)
('fanniemae', None, None, 5084410194, 10.867645740509033)
('fanniemae', 'zstd', 1)
('fanniemae', 'zstd', 1, 501782274, 15.825337409973145)
('fanniemae', 'zstd', 10)
('fanniemae', 'zstd', 10, 439287250, 121.56486773490906)
('fanniemae', 'lz4', None)
('fanniemae', 'lz4', None, 745904522, 9.960903644561768)
reading yellow_tripdata_2010-01.csv
('nyctaxi', None, None)
('nyctaxi', None, None, 2522035242, 7.013156890869141)
('nyctaxi', 'zstd', 1)
('nyctaxi', 'zstd', 1, 878797970, 14.169167280197144)
('nyctaxi', 'zstd', 10)
('nyctaxi', 'zstd', 10, 828149914, 127.86313557624817)
('nyctaxi', 'lz4', None)
('nyctaxi', 'lz4', None, 1257410506, 7.765689849853516)
('fanniemae', None, None, 3.666794538497925)
('fanniemae', 'zstd', 1, 8.892261743545532)
('fanniemae', 'zstd', 10, 8.172545433044434)
('fanniemae', 'lz4', None, 6.437353610992432)
('nyctaxi', None, None, 8.010426759719849)
('nyctaxi', 'zstd', 1, 12.285831689834595)
('nyctaxi', 'zstd', 10, 11

In [11]:
reads = []
writes = []

for chunksize, (write_results, read_results) in results_by_chunksize.items():
    write_results = pd.DataFrame.from_records(
        write_results, columns=['dataset', 'codec', 'codec_level', 
                                'file_size', 'write_time'])
    read_results = pd.DataFrame.from_records(
        read_results, columns=['dataset', 'codec', 'codec_level', 
                               'read_time'])
    write_results['chunksize'] = chunksize
    read_results['chunksize'] = chunksize
    
    reads.append(read_results)
    writes.append(write_results)
    
reads = pd.concat(reads, ignore_index=True)
writes = pd.concat(writes, ignore_index=True)

def munge_codecs(codec_s, codec_level_s):
    results = []
    codec_s = codec_s.fillna('uncompressed')
    for codec, codec_level in zip(codec_s, codec_level_s):
        if pd.isnull(codec_level):
            results.append(codec)
        else:
            results.append(codec + '-' + str(int(codec_level)))
    return results

reads['codec'] = munge_codecs(reads['codec'], reads.pop('codec_level'))
writes['codec'] = munge_codecs(writes['codec'], writes.pop('codec_level'))

In [13]:
%matplotlib notebook

In [15]:
reads.to_csv('ipc_read_chunksize.csv')
writes.to_csv('ipc_write_chunksize.csv')

In [12]:
reads

Unnamed: 0,dataset,codec,read_time,chunksize
0,fanniemae,uncompressed,3.666795,1024
1,fanniemae,zstd-1,8.892262,1024
2,fanniemae,zstd-10,8.172545,1024
3,fanniemae,lz4,6.437354,1024
4,nyctaxi,uncompressed,8.010427,1024
5,nyctaxi,zstd-1,12.285832,1024
6,nyctaxi,zstd-10,11.382322,1024
7,nyctaxi,lz4,9.227901,1024
8,fanniemae,uncompressed,2.987059,2048
9,fanniemae,zstd-1,7.454243,2048
