In [15]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import matplotlib.pyplot as plt   # plotting
from statsmodels.formula.api import logit
from scipy.stats import norm
from sklearn.metrics import confusion_matrix

import seaborn as sns

In [13]:
# The original weather data needed its weather-condition field decoded.
# Decoding was done using https://gist.github.com/kfish610/29d0e2874824732ec2d2922f979e4acb
# Because that step is slow, the decoded result was saved as a CSV.

# Further minimized using https://gist.github.com/kfish610/48605253428c856bbb1f461c10b6d6ac 

In [14]:
# The original ASPM data (ASPM.zip) contains the hourly count of departures, as well as the percentage of delays
# We have to expand this into individual flights for the logistic regression
# Because we don't have the original flight data, we only have precision down to the hour
# Done using https://gist.github.com/kfish610/b63bbf488d91dcf6877925f7ddfe618b
# The gist also merges the weather data with the ASPM data; doing that merge in Python is too memory-inefficient

# Load every CSV member of the transformed ASPM archive into one DataFrame.
# The archive is opened in a `with` block so its file handle is released as
# soon as the members have been read (read_csv loads each member eagerly).
with ZipFile('data/ASPM_Transformed.zip') as zipFile:
    # ignore_index=True gives the combined frame unique row labels; without it
    # each member's own 0-based index is kept and labels repeat across files.
    df = pd.concat(
        [pd.read_csv(zipFile.open(i)) for i in zipFile.namelist()],
        ignore_index=True,
    )

# Lowercase for consistency
df.columns = df.columns.str.lower()

# Drop unused datetime
df = df.drop(columns='datetime')

# Show (rows, columns), then preview the first rows as the cell's output.
print(df.shape)
df.head()

(44012925, 9)


Unnamed: 0,facility,delayed,air_temp,wind_speed,precipitation,visibility,thunder,snow,hail
0,BOS,0,1.0,4.63,0.0,10.0,0,0,0
1,FLL,1,22.0,4.63,0.0,10.0,0,0,0
2,IAD,0,1.0,0.0,0.0,10.0,0,0,0
3,JFK,0,2.0,2.06,0.0,10.0,0,0,0
4,JFK,0,2.0,2.06,0.0,10.0,0,0,0


In [17]:
# Model 1: logistic regression of `delayed` on precipitation and visibility,
# with an explicit intercept term (the `1` in the formula).
formula1 = "delayed ~ 1 + precipitation + visibility"
model1 = logit(formula=formula1, data=df).fit()

# Last expression in the cell, so the fitted-model summary is displayed.
model1.summary()

Optimization terminated successfully.
         Current function value: 0.584528
         Iterations 5


0,1,2,3
Dep. Variable:,delayed,No. Observations:,44012925.0
Model:,Logit,Df Residuals:,44012922.0
Method:,MLE,Df Model:,2.0
Date:,"Sat, 10 Jun 2023",Pseudo R-squ.:,0.004868
Time:,10:15:32,Log-Likelihood:,-25727000.0
converged:,True,LL-Null:,-25853000.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.2608,0.002,-170.863,0.000,-0.264,-0.258
precipitation,0.1103,0.001,110.332,0.000,0.108,0.112
visibility,-0.0770,0.000,-478.780,0.000,-0.077,-0.077


In [16]:
# Model 2: logistic regression of `delayed` on the thunder, snow and hail
# columns, with an explicit intercept term (the `1` in the formula).
formula2 = "delayed ~ 1 + thunder + snow + hail"
model2 = logit(formula=formula2, data=df).fit()

# Last expression in the cell, so the fitted-model summary is displayed.
model2.summary()

Optimization terminated successfully.
         Current function value: 0.584528
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.583471
         Iterations 5


0,1,2,3
Dep. Variable:,delayed,No. Observations:,44012925.0
Model:,Logit,Df Residuals:,44012921.0
Method:,MLE,Df Model:,3.0
Date:,"Sat, 10 Jun 2023",Pseudo R-squ.:,0.006666
Time:,10:12:36,Log-Likelihood:,-25680000.0
converged:,True,LL-Null:,-25853000.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.9971,0.000,-2917.403,0.000,-0.998,-0.996
thunder,1.4181,0.007,195.619,0.000,1.404,1.432
snow,1.6958,0.003,533.014,0.000,1.690,1.702
hail,-0.0040,0.235,-0.017,0.986,-0.465,0.457
