In [9]:
import pandas as pd
import pyarrow as pa
import numpy as np
import json
from datetime import date, datetime

In [2]:
# Export switch: when True, the export loop additionally writes each subset
# as a column-oriented `<name>.json` file next to the `.arrow` file.
with_json = False

In [3]:
def json_serial(obj):
    """Serialize a ``date``/``datetime`` as integer Unix seconds.

    Intended as the ``default=`` hook of ``json.dump``; it is only called for
    objects the encoder cannot handle itself.

    Naive values are interpreted in the machine's local time zone, which is
    what the earlier ``strftime("%s")`` implementation produced on platforms
    that support that directive. ``%s`` is not a documented Python directive
    (it is unavailable on Windows and ignores ``tzinfo``), so the conversion
    is done explicitly here. Timezone-aware datetimes are converted using
    their own UTC offset.

    Args:
        obj: The object ``json`` could not serialize.

    Returns:
        Whole seconds since the Unix epoch as an ``int``.

    Raises:
        TypeError: If ``obj`` is neither a ``date`` nor a ``datetime``.
    """
    # Local imports keep this cell self-contained regardless of the import cell.
    import calendar
    import time

    if isinstance(obj, datetime) and obj.utcoffset() is not None:
        # Aware datetime: honour its offset instead of assuming local time.
        return calendar.timegm(obj.utctimetuple())
    if isinstance(obj, date):
        # Covers plain dates and naive datetimes (datetime subclasses date).
        return int(time.mktime(obj.timetuple()))
    raise TypeError(
        f"Object of type {type(obj).__name__} is not JSON serializable"
    )

In [14]:
# Read the raw flights CSV. The date and the two clock columns are kept as
# strings so they can be parsed explicitly later; rows with any missing
# value are discarded right away.
data = pd.read_csv(
    'flights-10m.csv',
    encoding='utf-8',
    dtype={
        'FlightDate': 'str',
        'ArrTime': 'str',
        'DepTime': 'str',
    },
).dropna()

data.head()

Unnamed: 0,FlightDate,DepTime,DepDelay,ArrTime,ArrDelay,AirTime,Distance
0,2006-01-01,905,5.0,1229,19.0,350.0,2475.0
1,2006-01-02,1147,167.0,1546,216.0,343.0,2475.0
2,2006-01-03,853,-7.0,1208,-2.0,344.0,2475.0
3,2006-01-04,855,-5.0,1157,-13.0,331.0,2475.0
4,2006-01-05,857,-3.0,1153,-17.0,321.0,2475.0


In [15]:
# Rename the raw CSV headers to the upper-snake-case names used from here on.
# NOTE(review): index=str also converts every row label to a string;
# presumably unintentional, but kept as-is -- confirm before changing.
renamed = data.rename(
    index=str,
    columns={
        "FlightDate": "FL_DATE",
        "DepTime": "DEP_TIME",
        "ArrTime": "ARR_TIME",
        "Distance": "DISTANCE",
        "AirTime": "AIR_TIME",
        "DepDelay": "DEP_DELAY",
        "ArrDelay": "ARR_DELAY",
    },
)

# Parse the date strings into plain `date` objects and rewrite the clock
# reading '2400' as '0000' in both time columns.
renamed = renamed.assign(
    FL_DATE=pd.to_datetime(renamed['FL_DATE'], format='%Y-%m-%d').dt.date,
    DEP_TIME=renamed['DEP_TIME'].replace('2400', '0000'),
    ARR_TIME=renamed['ARR_TIME'].replace('2400', '0000'),
)

def toTime(col):
    """Turn HHMM clock readings into fractional hours.

    Args:
        col: Series of HHMM values (strings or numbers), e.g. ``'905'``
            for 09:05.

    Returns:
        Series of floats: the hour part plus the minute part divided by 60,
        so ``'905'`` becomes ``9 + 5/60``.
    """
    hhmm = pd.to_numeric(col)
    whole_hours = np.floor(hhmm / 100)
    minutes = hhmm.mod(100)
    return whole_hours + minutes / 60.

# Convert both clock columns from HHMM readings to fractional hours.
renamed['DEP_TIME'] = toTime(renamed['DEP_TIME'])
renamed['ARR_TIME'] = toTime(renamed['ARR_TIME'])

# Remove the airport-code columns when the input has them. errors='ignore'
# drops whichever of the two is present; the previous check looked only for
# 'ORIGIN' and then raised KeyError if 'DEST' was missing (or kept 'DEST'
# when 'ORIGIN' was missing).
renamed = renamed.drop(columns=['ORIGIN', 'DEST'], errors='ignore')

# Discard any row that still contains a missing value after the conversions.
cleaned = renamed.dropna()

# Downcast to compact dtypes for the export.
# NOTE(review): int16 holds -32768..32767; assumes delays, air time and
# distance all fit in that range -- confirm against the data.
right_types = cleaned.astype({
    'DEP_DELAY': 'int16',
    'ARR_DELAY': 'int16',
    'AIR_TIME': 'int16',
    'DISTANCE': 'int16',
    'DEP_TIME': 'float32',
    'ARR_TIME': 'float32'
})


In [16]:
# Preview the first five rows of the typed frame.
right_types.head(n=5)

Unnamed: 0,FL_DATE,DEP_TIME,DEP_DELAY,ARR_TIME,ARR_DELAY,AIR_TIME,DISTANCE
0,2006-01-01,9.083333,5,12.483334,19,350,2475
1,2006-01-02,11.783334,167,15.766666,216,343,2475
2,2006-01-03,8.883333,-7,12.133333,-2,344,2475
3,2006-01-04,8.916667,-5,11.95,-13,331,2475
4,2006-01-05,8.95,-3,11.883333,-17,321,2475


In [17]:
# Write the first `size` rows of the typed frame under each name: always as
# an Arrow file, and additionally as column-oriented JSON when `with_json`
# is set.
for size, name in [
    (10000, 'flights-10k'),
    (200000, 'flights-200k'),
    (500000, 'flights-500k'),
    (1000000, 'flights-1m'),
    (3000000, 'flights-3m'),
    (10000000, 'flights-10m'),
]:
    print(name)

    # Exactly `size` rows (fewer if the frame is shorter). The previous
    # `[:size+1]` slice wrote one row more than the file name advertises.
    smaller = right_types.iloc[:size]

    table = pa.Table.from_pandas(smaller)

    if with_json:
        # {column name: [values, ...]}; values json cannot encode itself
        # (the date column) are passed to json_serial.
        d = {column: smaller[column].tolist() for column in smaller.columns}

        with open(f'{name}.json', 'w') as f:
            json.dump(d, f, default=json_serial, separators=(',', ':'))

    writer = pa.RecordBatchFileWriter(f'{name}.arrow', table.schema)
    try:
        writer.write(table)
    finally:
        # Always release the file handle, even if writing the table fails.
        writer.close()

flights-10k
flights-200k
flights-500k
flights-1m
flights-3m
flights-10m


In [18]:
# Shell escape: list the working directory (long format, hidden files,
# human-readable sizes) to inspect the generated files.
!ls -lah

total 2182336
drwxr-xr-x@ 20 domoritz  staff   640B Jun 29 14:50 .
drwxr-xr-x  24 domoritz  staff   768B Jun 28 18:35 ..
-rw-r--r--@  1 domoritz  staff   6.0K Nov 29  2017 .DS_Store
drwxr-xr-x   5 domoritz  staff   160B Jun 13 13:55 .ipynb_checkpoints
-rw-r--r--@  1 domoritz  staff    28M Nov 28  2017 564230852_T_ONTIME.csv
-rw-r--r--@  1 domoritz  staff   6.3M Nov 28  2017 564230852_T_ONTIME.zip
-rw-r--r--   1 domoritz  staff   4.0K Jun 29 14:50 convert.ipynb
-rw-r--r--   1 domoritz  staff   276K Jun 29 14:51 flights-10k.arrow
-rw-r--r--   1 domoritz  staff   400K Nov 29  2017 flights-10k.csv
-rw-r--r--   1 domoritz  staff   409K Jun  8 22:39 flights-10k.json
-rw-r--r--   1 domoritz  staff   288M Jun 29 14:51 flights-10m.arrow
-rw-r--r--   1 domoritz  staff   439M Jun 16 10:21 flights-10m.csv
-rw-r--r--   1 domoritz  staff    29M Jun 29 14:51 flights-1m.arrow
-rw-r--r--   1 domoritz  staff    31M Nov 29  2017 flights-1m.csv
-rw-r--r--   1 domoritz  sta