In [None]:
import pandas as pd
import numpy as np
from zipfile import ZipFile

In [None]:
# The original weather data needed its weather-condition codes decoded.
# Decoding was done with https://gist.github.com/kfish610/29d0e2874824732ec2d2922f979e4acb
# and, because that step is slow, the decoded result was saved as a CSV.

# Read the zipped CSV in 100,000-row chunks and stitch the chunks back together.
# The context managers close the archive and the member handle once reading is done;
# the chunk reader is lazy, so pd.concat has to consume it inside the `with` blocks.
with ZipFile('data/AirportWeatherCodes.zip') as weather_zip:
    with weather_zip.open('AirportWeatherCodes.csv') as weather_csv:
        df_weather = pd.concat(pd.read_csv(weather_csv, chunksize=100000))

# Remove leading character on Station (usually K, P for Honolulu Airport)
df_weather['facility'] = df_weather['Station_ID'].str.slice(1)

# Parse the timestamps and mark them as UTC (tz-aware)
df_weather['datetime'] = df_weather['Date_Time'].astype('datetime64[ns]').dt.tz_localize('UTC')

# Drop the raw columns that were just replaced, plus the variables that are not used
df_weather = df_weather.drop(['Station_ID', 'Date_Time', 'wind_direction', 'weather_condition'], axis='columns')

# Sort chronologically
df_weather = df_weather.sort_values(by=['datetime'])

df_weather.head()

In [None]:
# Hourly, UTC-aware timestamps from the start of 2015 through the end of 2022.
# Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is deprecated since pandas 2.2.
hourlyDates = pd.date_range(start='2015-01-01T00:00:00Z', end='2022-12-31T23:59:59Z', freq='h')

# For every hour, attach the weather row whose 'datetime' is nearest in either direction.
# merge_asof requires both inputs to be sorted on the key, so df_weather must already be
# sorted by 'datetime' when this cell runs.
# NOTE(review): there is no `by='facility'` here, so the nearest observation is chosen
# across all stations rather than per airport — confirm this is intended.
pd.merge_asof(pd.DataFrame(hourlyDates, columns=['datetime']), df_weather, on='datetime', direction='nearest')

In [None]:
# The original ASPM data (ASPM.zip) contains the hourly count of departures, as well as the percentage of delays
# We have to expand this into individual flights for the logistic regression
# Because we don't have the original flight data, we only have precision down to the hour
# Done using https://gist.github.com/kfish610/b63bbf488d91dcf6877925f7ddfe618b
# Not in python because python was too slow

# Read every member of the archive into its own DataFrame.
# The context managers close the archive and each member handle once it has been read.
dfs_ASPM = []
with ZipFile('data/ASPM_Transformed.zip') as zipFile:
    for member_name in zipFile.namelist():
        with zipFile.open(member_name) as member_file:
            dfs_ASPM.append(pd.read_csv(member_file))

# Combine the per-file frames into the single frame used below.
# ignore_index gives one continuous row index instead of repeating each file's own 0..n index.
df_ASPM = pd.concat(dfs_ASPM, ignore_index=True)

# Lowercase for consistency
df_ASPM.columns = df_ASPM.columns.str.lower()

# Strip surrounding whitespace (all airport codes come with a leading space)
df_ASPM['facility'] = df_ASPM['facility'].str.strip()

print(df_ASPM.shape)
df_ASPM.head()