In [28]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import json
from datetime import date, datetime

In [2]:
# Set to True to additionally dump every subset as a column-oriented
# `<name>.json` file in the export loop; the `.arrow` files are written
# regardless of this flag.
with_json = False

In [3]:
def json_serial(obj):
    """``default`` hook for ``json.dump``: convert values json cannot encode itself.

    Parameters
    ----------
    obj : object
        The value the JSON encoder failed to serialize natively.

    Returns
    -------
    int, float or bool
        * ``date`` / ``datetime`` -> whole seconds since the Unix epoch, with
          the calendar fields interpreted in the machine's local time zone
          (the same number glibc's ``strftime("%s")`` produces).
        * NumPy scalar (``np.int16``, ``np.float16``, ...) -> the equivalent
          built-in Python scalar.

    Raises
    ------
    TypeError
        For any other type, which is what the ``json`` module expects a
        ``default`` hook to raise for unsupported objects.
    """
    # Local import: the notebook's import cell does not bring in `time`.
    import time

    # `datetime` is a subclass of `date`, so this branch covers both.
    if isinstance(obj, date):
        # "%s" is a non-standard strftime extension (missing on some
        # platforms); mktime() on the time tuple is the portable equivalent.
        return int(time.mktime(obj.timetuple()))

    # Columns downcast to int16/float16 hand NumPy scalars to the encoder.
    if isinstance(obj, np.generic):
        return obj.item()

    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

In [18]:
# Load the tab-separated flights export. The date and clock-time fields are
# read as strings so the cleanup cell can match literal values such as '2400'.
data = pd.read_csv(
    'flights_2006_2010.csv',
    sep='\t',
    encoding='utf-8',
    dtype={'FlightDate': 'str', 'ArrTime': 'str', 'DepTime': 'str'},
)

In [91]:
# Rename the source columns to the upper-snake-case names used in the output files.
renamed = data.rename(
    index=str,
    columns={
        "FlightDate": "FL_DATE",
        "DepTime": "DEP_TIME",
        "ArrTime": "ARR_TIME",
        "Distance": "DISTANCE",
        "AirTime": "AIR_TIME",
        "DepDelay": "DEP_DELAY",
        "ArrDelay": "ARR_DELAY",
    },
)

# Parse the ISO date strings and keep only the calendar date (no time part).
renamed['FL_DATE'] = pd.to_datetime(renamed['FL_DATE'], format='%Y-%m-%d').dt.date

# A clock value of exactly '2400' is rewritten to '0000' before conversion.
for time_col in ('DEP_TIME', 'ARR_TIME'):
    renamed[time_col] = renamed[time_col].replace('2400', '0000')

def toTime(col):
    """Convert a Series of HHMM clock values into fractional hours.

    The values are first coerced with ``pd.to_numeric``; the hundreds part is
    taken as whole hours and the remainder modulo 100 as minutes, so ``'0905'``
    becomes ``9 + 5/60``. Missing values stay missing.
    """
    hhmm = pd.to_numeric(col)
    whole_hours = np.floor(hhmm / 100)
    minutes_as_hours = hhmm.mod(100) / 60.
    return whole_hours + minutes_as_hours

# Turn both HHMM clock columns into fractional hours.
for time_col in ('DEP_TIME', 'ARR_TIME'):
    renamed[time_col] = toTime(renamed[time_col])

# Compact target dtype for every numeric column that is exported.
types = {
    'DEP_DELAY': 'int16',
    'ARR_DELAY': 'int16',
    'AIR_TIME': 'int16',
    'DISTANCE': 'int16',
    'DEP_TIME': 'float16',
    'ARR_TIME': 'float16',
}

# Keep only the exported columns (date first) and drop rows with any missing
# value, since the integer dtypes above cannot hold NaN.
columns = ['FL_DATE', *types]
renamed = renamed[columns].dropna()

# Downcast copy that the export loop slices; `renamed` keeps its original dtypes.
right_types = renamed.astype(types)

In [92]:
# Preview the cleaned frame as it is before the dtype downcast
# (the downcast copy lives in `right_types`).
renamed.head()

Unnamed: 0,FL_DATE,DEP_DELAY,ARR_DELAY,AIR_TIME,DISTANCE,DEP_TIME,ARR_TIME
0,2006-01-01,5.0,19.0,350.0,2475.0,9.083333,12.483333
1,2006-01-02,167.0,216.0,343.0,2475.0,11.783333,15.766667
2,2006-01-03,-7.0,-2.0,344.0,2475.0,8.883333,12.133333
3,2006-01-04,-5.0,-13.0,331.0,2475.0,8.916667,11.95
4,2006-01-05,-3.0,-17.0,321.0,2475.0,8.95,11.883333


In [93]:
# Export the first `size` rows of the typed frame under each base file name:
# always as an Arrow IPC file, and additionally as JSON when `with_json` is set.
for size, name in [
    (10000, 'flights-10k'),
    (200000, 'flights-200k'),
    (500000, 'flights-500k'),
    (1000000, 'flights-1m'),
    (3000000, 'flights-3m'),
    (10000000, 'flights-10m'),
]:
    smaller = right_types[:size]

    # Printing the real length exposes a short slice if the source has fewer rows.
    print(name, len(smaller))

    table = pa.Table.from_pandas(smaller, preserve_index=False)

    if with_json:
        # Column-oriented payload. `.tolist()` yields built-in Python numbers
        # (NumPy int16/float16 scalars are not JSON serializable); the date
        # objects are left for the `default` hook to convert.
        d = {column: smaller[column].tolist() for column in smaller.columns}

        with open(f'{name}.json', 'w') as f:
            json.dump(d, f, default=json_serial, separators=(',', ':'))

    # optionally, write parquet files
    # pq.write_table(table, f'{name}.parquet')

    writer = pa.RecordBatchFileWriter(f'{name}.arrow', table.schema)
    try:
        writer.write(table)
    finally:
        # Close even when the write fails so the file handle is not leaked.
        writer.close()

flights-10k 10000
flights-200k 200000
flights-500k 500000
flights-1m 1000000
flights-3m 3000000
flights-10m 10000000


In [94]:
# List the working directory to compare the sizes of the generated files.
!ls -lah

total 11634064
drwxr-xr-x  31 dominik  staff   992B Apr 28 21:01 .
drwxr-xr-x  31 dominik  staff   992B Apr 28 19:21 ..
drwxr-xr-x   3 dominik  staff    96B Apr 28 19:25 .ipynb_checkpoints
-rw-r--r--   1 dominik  staff    28M Sep 30  2019 564230852_T_ONTIME.csv
-rw-r--r--   1 dominik  staff   5.9K Apr 28 21:01 convert_flights.ipynb
-rw-r--r--   1 dominik  staff    17K Sep 30  2019 convert_movies.ipynb
-rw-r--r--   1 dominik  staff    15K Sep 30  2019 convert_weather.ipynb
-rw-r--r--   1 dominik  staff    20M Sep 30  2019 error analysis-full.ipynb
-rw-r--r--   1 dominik  staff    11M Sep 30  2019 error analysis.ipynb
-rw-r--r--   1 dominik  staff   160K Apr 28 21:04 flights-10k.arrow
-rw-r--r--   1 dominik  staff   400K Sep 30  2019 flights-10k.csv
-rw-r--r--   1 dominik  staff   409K Sep 30  2019 flights-10k.json
-rw-r--r--   1 dominik  staff    78K Apr 28 20:39 flights-10k.parquet
-rw-r--r--   1 dominik  staff   153M Apr 28 21:04 flights-10m.arrow
-rw-