In [2]:
import pandas as pd
import pyarrow as pa
import numpy as np
import json
from datetime import date, datetime

In [3]:
# Toggle for the export loop: when True, each dataset size is also dumped to a
# JSON file in addition to the Arrow file.
with_json = False

In [4]:
def json_serial(obj):
    """Convert a ``date`` or ``datetime`` to integer epoch seconds.

    Meant to be passed as the ``default=`` hook of ``json.dump``/``json.dumps``.

    Naive values are interpreted in the machine's local time zone, which is
    the reading the previous ``strftime("%s")`` implementation gave on glibc.
    Aware datetimes are converted using their own UTC offset. Sub-second
    precision is discarded.

    Unlike ``strftime("%s")`` (a platform-specific format code that is not
    available on every OS), this works wherever ``datetime.timestamp`` does.

    Args:
        obj: The value the JSON encoder could not serialise on its own.

    Returns:
        int: Seconds since the Unix epoch.

    Raises:
        TypeError: If ``obj`` is neither a ``date`` nor a ``datetime``; this is
            the exception the json module expects from a ``default`` hook.
    """
    # datetime has to be checked first because it is a subclass of date.
    if isinstance(obj, datetime):
        return int(obj.replace(microsecond=0).timestamp())
    if isinstance(obj, date):
        # A plain date is treated as midnight at the start of that day.
        return int(datetime(obj.year, obj.month, obj.day).timestamp())
    raise TypeError(
        f"Object of type {type(obj).__name__} is not JSON serializable"
    )

In [5]:
# Load the raw weather records; the path is relative, so the CSV must sit in
# the notebook's working directory.
data = pd.read_csv('weather-10m.csv', encoding='utf-8')
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,recorddate,station,AWND,PRCP,SNOW,SNWD,TMAX,TMAX_F,TMIN,TMIN_F,...,WT22,elevation,gsn_flag,hcn_crn_flag,id,latitude,longitude,name,state,wmo_id
0,20120101,USS0006H19S,,0.0,,356.0,1.1,33.98,-15.4,4.28,...,,2572.5,,,,41.33,-106.5,,WY South Brush Creek,
1,20120101,USC00390043,,0.0,0.0,0.0,5.6,42.08,-2.8,26.96,...,,512.1,HCN,,,43.4892,-99.0631,,SD ACADEMY 2NE,
2,20120101,SWE00138512,,124.0,,0.0,,,,,...,,90.0,,,,58.45,14.89,,VADSTENA,
3,20120101,USC00163807,,0.0,,,25.6,78.08,17.2,62.96,...,,0.6,,,,29.2414,-89.9914,,LA GRAND ISLE,
4,20120101,USC00163800,,0.0,,,20.6,69.08,15.6,60.08,...,,16.8,HCN,,,30.4183,-92.0442,,LA GRAND COTEAU,


In [9]:
# Show the column labels of the raw frame loaded from the CSV.
data.columns

Index(['recorddate', 'station', 'AWND', 'PRCP', 'SNOW', 'SNWD', 'TMAX',
       'TMAX_F', 'TMIN', 'TMIN_F', 'WT01', 'WT02', 'WT03', 'WT04', 'WT05',
       'WT06', 'WT07', 'WT08', 'WT09', 'WT10', 'WT11', 'WT12', 'WT13', 'WT14',
       'WT15', 'WT16', 'WT17', 'WT18', 'WT19', 'WT20', 'WT21', 'WT22',
       'elevation', 'gsn_flag', 'hcn_crn_flag', 'id', 'latitude', 'longitude',
       'name', 'state', 'wmo_id'],
      dtype='object')

In [10]:
# Build the cleaned export frame from the raw `data` frame.
#
# `data` is deliberately NOT reassigned here: the previous
# `data = data.drop(["SNOW"], axis=1)` made the cell fail with a KeyError when
# it was run a second time, because "SNOW" was already gone.

# Drop the raw SNOW column first so that SNWD can take over the "SNOW" label
# without leaving two columns with the same name.
# NOTE(review): index=str turns the row labels into strings. It is kept to
# preserve the existing output, but confirm it is intended: a non-default index
# is written to the Arrow file as an extra column by pa.Table.from_pandas.
renamed = data.drop(columns=["SNOW"]).rename(index=str, columns={
    "TMAX": "TEMP_MAX",
    "TMIN": "TEMP_MIN",
    "PRCP": "PRECIPITATION",
    "elevation": "ELEVATION",
    "AWND": "WIND",
    "latitude": "LATITUDE",
    "longitude": "LONGITUDE",
    "SNWD": "SNOW"
})

# recorddate holds YYYYMMDD values; parse them and keep only the calendar date.
renamed = renamed.assign(
    RECORD_DATE=pd.to_datetime(renamed["recorddate"], format='%Y%m%d').dt.date
)

# Keep only the columns that are exported, in their output order.
renamed = renamed[["RECORD_DATE", "TEMP_MAX", "TEMP_MIN", "PRECIPITATION", "ELEVATION", "WIND", "SNOW", "LATITUDE", "LONGITUDE"]]

# Missing wind / snow readings become 0 *before* dropna() below, so rows are not
# discarded merely because those two measurements are absent.
# fillna on the frame (rather than `renamed['WIND'].fillna(..., inplace=True)`)
# avoids chained in-place assignment, which does not update the frame under
# pandas Copy-on-Write.
renamed = renamed.fillna({"WIND": 0, "SNOW": 0})

# Rescale wind and precipitation by a factor of 10.
# NOTE(review): presumably the raw values are stored in tenths of the target
# unit — confirm against the dataset documentation.
# TEMP_MAX / TEMP_MIN are not rescaled: temperature is already corrected in the data.
renamed = renamed.assign(
    WIND=renamed["WIND"] / 10,
    PRECIPITATION=renamed["PRECIPITATION"] / 10,
)

# Any row still missing a value in one of the remaining columns is dropped.
cleaned = renamed.dropna()

# Downcast every numeric column to float32; RECORD_DATE is left as-is.
right_types = cleaned.astype({
    'TEMP_MAX': 'float32',
    'TEMP_MIN': 'float32',
    'PRECIPITATION': 'float32',
    'ELEVATION': 'float32',
    'WIND': 'float32',
    'SNOW': 'float32',
    'LATITUDE': 'float32',
    'LONGITUDE': 'float32'
})

In [11]:
# Preview the cleaned frame (rows with missing values removed, before the
# float32 cast that produces `right_types`).
cleaned.head()

Unnamed: 0,RECORD_DATE,TEMP_MAX,TEMP_MIN,PRECIPITATION,ELEVATION,WIND,SNOW,LATITUDE,LONGITUDE
0,2012-01-01,1.1,-15.4,0.0,2572.5,0.0,356.0,41.33,-106.5
1,2012-01-01,5.6,-2.8,0.0,512.1,0.0,0.0,43.4892,-99.0631
3,2012-01-01,25.6,17.2,0.0,0.6,0.0,0.0,29.2414,-89.9914
4,2012-01-01,20.6,15.6,0.0,16.8,0.0,0.0,30.4183,-92.0442
5,2012-01-01,-6.6,-17.1,0.0,581.0,0.0,90.0,49.2167,-102.9667


In [14]:
# Export the first `size` rows of `right_types` as <name>.arrow for each size
# below and, when `with_json` is set, also as a column-oriented <name>.json.
# Slicing past the end of the frame is silent, so a file holds fewer rows than
# its name suggests whenever the cleaned data is shorter than `size`.
for size, name in [
    (10000, 'weather-10k'),
    (200000, 'weather-200k'),
    (500000, 'weather-500k'),
    (1000000, 'weather-1m'),
    (3000000, 'weather-3m'),
    (10000000, 'weather-10m'),
]:
    print(name)

    smaller = right_types[:size]

    # NOTE(review): from_pandas also serialises a non-default DataFrame index as
    # an extra column; pass preserve_index=False if that column is unwanted.
    table = pa.Table.from_pandas(smaller)

    if with_json:
        # One JSON object keyed by column name; each value is that column's cells.
        columns_as_lists = {
            column: list(smaller[column]) for column in smaller.columns
        }

        with open(f'{name}.json', 'w') as f:
            json.dump(columns_as_lists, f, default=json_serial, separators=(',', ':'))

    # Use the writer class from pyarrow.ipc; the top-level
    # pa.RecordBatchFileWriter alias is deprecated.
    writer = pa.ipc.RecordBatchFileWriter(f'{name}.arrow', table.schema)
    try:
        writer.write(table)
    finally:
        # Close even when write() fails so the file handle is not leaked.
        writer.close()

weather-10k
weather-200k
weather-500k
weather-1m
weather-3m
weather-10m
