In [None]:
# flake8: noqa

import pyarrow.feather as feather
import pandas as pd
import json
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from pandas.util.testing import rands
import gc
import time

# Report the value of pyarrow's cpu_count() so the timings recorded below can
# be read alongside the parallelism setting they were produced with.
print(
    f"using {pa.cpu_count()} cpu cores"
)
    

def get_timing(f, niter):
    """Return the mean wall-clock time, in seconds, of one call to ``f()``.

    Parameters
    ----------
    f : callable
        Zero-argument callable to time. Its return value is discarded.
    niter : int
        Number of back-to-back calls to average over.

    Returns
    -------
    float
        Total elapsed seconds divided by ``niter``.

    Raises
    ------
    ZeroDivisionError
        If ``niter`` is 0.
    """
    # perf_counter() is monotonic, high-resolution and available on every
    # platform. clock_gettime(CLOCK_REALTIME) is Unix-only and follows the
    # system clock, so an NTP/manual clock step during a run could skew (or
    # even negate) the measurement.
    start = time.perf_counter()
    for _ in range(niter):
        f()
    return (time.perf_counter() - start) / niter


# Benchmark datasets, keyed by a short dataset name.
#   'base'   -> prefix used when building the Feather file names
#   'source' -> location of the raw delimited text file plus the `sep` and
#               `header` values used when parsing it
files = {
    'fanniemae': {
        'base': '2016Q4',
        # Pipe-delimited file without a header row.
        'source': {'path': '2016Q4.txt', 'sep': '|', 'header': None},
    },
    'nyctaxi': {
        'base': 'yellow_tripdata_2010-01',
        # Comma-delimited file whose first row holds the column names.
        'source': {
            'path': 'yellow_tripdata_2010-01.csv',
            'sep': ',',
            'header': 0,
        },
    },
}


# (compression codec, compression level) pairs to benchmark, in run order.
# A codec of None stands for the uncompressed case (it is spelled
# 'uncompressed' in the generated file names); a level of None means no
# explicit level is requested.
compression_cases = [
    # no compression
    (None, None),
    # zstd at its lightest setting
    ('zstd', 1),
    # zstd at a moderate setting
    ('zstd', 10),
    # LZ4 has no compression-level support, hence no level
    ('lz4', None),
]


def write_files(files):
    """Convert each source text file into one Feather file per compression case.

    Parameters
    ----------
    files : dict
        Maps a dataset name to a dict with a ``'base'`` output-file prefix
        and a ``'source'`` dict holding the ``'path'``, ``'sep'`` and
        ``'header'`` values passed to ``pandas.read_csv``.

    Notes
    -----
    For every dataset the source file is read once, converted to an Arrow
    table once, and then written once per entry of the module-level
    ``compression_cases`` to
    ``'<base>_<compression or "uncompressed">_<level>.feather'`` (relative to
    the current working directory). Progress is printed; nothing is returned.
    """
    for info in files.values():
        source = info['source']
        print("reading {}".format(source['path']))
        df = pd.read_csv(source['path'], sep=source['sep'],
                         header=source['header'],
                         low_memory=False)
        if source['header'] is None:
            # No header row: give the columns string names f0, f1, ...
            df.columns = ['f{}'.format(i) for i in range(len(df.columns))]

        # Convert to Arrow a single time (without the index and without the
        # pandas schema metadata) and reuse the table for every write below,
        # instead of re-converting the DataFrame once per compression case.
        table = (pa.Table.from_pandas(df, preserve_index=False)
                 .replace_schema_metadata(None))
        for compression, compression_level in compression_cases:
            # write_feather treats compression=None as "pick the default
            # codec" (LZ4 for V2 files when available), so the uncompressed
            # case must be requested by name or the file labelled
            # 'uncompressed' would silently be LZ4-compressed.
            codec = compression or 'uncompressed'
            path = '{}_{}_{}.feather'.format(info['base'], codec,
                                             compression_level)
            print(path)
            feather.write_feather(table, path, compression=codec,
                                  compression_level=compression_level)

In [None]:
# Generate the Feather files for every dataset in `files`: each source file is
# read with pandas and written once per entry in `compression_cases`.
write_files(files)

In [14]:
# Number of timed reads per file; the reported figure is the mean over these.
NITER = 1

# One (dataset name, compression, compression_level, mean seconds) tuple per
# dataset/compression combination, in the order they were measured.
all_results = []
for name, info in files.items():
    for compression, compression_level in compression_cases:
        # Same '<base>_<codec>_<level>.feather' naming used by write_files.
        codec_label = compression if compression else 'uncompressed'
        path = '{}_{}_{}.feather'.format(
            info['base'], codec_label, compression_level
        )
        # The lambda is invoked inside this iteration, so it sees this `path`.
        mean_time = get_timing(lambda: feather.read_feather(path), NITER)
        result = (name, compression, compression_level, mean_time)
        print(result)
        all_results.append(result)

('fanniemae', None, None, 2.4117162227630615)
('fanniemae', 'zstd', 1, 5.116245985031128)
('fanniemae', 'zstd', 10, 3.9139928817749023)
('fanniemae', 'lz4', None, 3.5294902324676514)
('nyctaxi', None, None, 7.1993725299835205)
('nyctaxi', 'zstd', 1, 10.147839069366455)
('nyctaxi', 'zstd', 10, 8.913217782974243)
('nyctaxi', 'lz4', None, 8.480979204177856)


reading 2016Q4.txt
2016Q4_uncompressed_None.feather
2016Q4_zstd_1.feather
zstd compression
2016Q4_zstd_10.feather
zstd compression
2016Q4_lz4_None.feather
lz4 compression
reading yellow_tripdata_2010-01.csv
yellow_tripdata_2010-01_uncompressed_None.feather
yellow_tripdata_2010-01_zstd_1.feather
zstd compression
yellow_tripdata_2010-01_zstd_10.feather
zstd compression
yellow_tripdata_2010-01_lz4_None.feather
lz4 compression
