In [95]:
import pandas as pd
import numpy as np

In [96]:
df = pd.read_csv('data/flights_sample_3m.csv')

In [97]:
df.columns

Index(['FL_DATE', 'AIRLINE', 'AIRLINE_DOT', 'AIRLINE_CODE', 'DOT_CODE',
       'FL_NUMBER', 'ORIGIN', 'ORIGIN_CITY', 'DEST', 'DEST_CITY',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
       'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'DELAY_DUE_CARRIER',
       'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS', 'DELAY_DUE_SECURITY',
       'DELAY_DUE_LATE_AIRCRAFT'],
      dtype='object')

In [98]:
df.head()

Unnamed: 0,FL_DATE,AIRLINE,AIRLINE_DOT,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,...,DIVERTED,CRS_ELAPSED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
0,2019-01-09,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,1562,FLL,"Fort Lauderdale, FL",EWR,"Newark, NJ",...,0.0,186.0,176.0,153.0,1065.0,,,,,
1,2022-11-19,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,1149,MSP,"Minneapolis, MN",SEA,"Seattle, WA",...,0.0,235.0,236.0,189.0,1399.0,,,,,
2,2022-07-22,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,459,DEN,"Denver, CO",MSP,"Minneapolis, MN",...,0.0,118.0,112.0,87.0,680.0,,,,,
3,2023-03-06,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,2295,MSP,"Minneapolis, MN",SFO,"San Francisco, CA",...,0.0,260.0,285.0,249.0,1589.0,0.0,0.0,24.0,0.0,0.0
4,2020-02-23,Spirit Air Lines,Spirit Air Lines: NK,NK,20416,407,MCO,"Orlando, FL",DFW,"Dallas/Fort Worth, TX",...,0.0,181.0,182.0,153.0,985.0,,,,,


In [132]:
df1 = df[['FL_DATE', 'AIRLINE', 'ORIGIN','DEST','CRS_DEP_TIME', 'DEP_DELAY','CANCELLED']].copy()

# Replace negative DEP_DELAY values with 0
df1['DEP_DELAY'] = df1['DEP_DELAY'].apply(lambda x: 0 if x < 0 else x)

df1 = df1[df1['CANCELLED'] == 0].copy()
df1.dropna(inplace=True)


In [133]:
df1.groupby('AIRLINE')['DEP_DELAY'].mean().sort_values(ascending=False)

AIRLINE
JetBlue Airways                       22.256309
Frontier Airlines Inc.                20.118742
Allegiant Air                         18.736789
ExpressJet Airlines LLC d/b/a aha!    17.716204
Spirit Air Lines                      16.026478
Mesa Airlines Inc.                    15.962148
American Airlines Inc.                15.735878
United Air Lines Inc.                 14.523830
SkyWest Airlines Inc.                 13.447883
Southwest Airlines Co.                12.576835
PSA Airlines Inc.                     12.453113
Delta Air Lines Inc.                  10.974932
Envoy Air                             10.471192
Republic Airline                      10.355291
Endeavor Air Inc.                      9.951903
Alaska Airlines Inc.                   9.456345
Hawaiian Airlines Inc.                 8.244626
Horizon Air                            7.993435
Name: DEP_DELAY, dtype: float64

In [134]:
df1['FL_DATE'] = pd.to_datetime(df1['FL_DATE'])
df1['MONTH'] = df1['FL_DATE'].dt.month
df1['DAY_OF_WEEK'] = df1['FL_DATE'].dt.dayofweek  # Monday=0, Sunday=6

In [135]:
df1['DEP_HOUR'] = df1['CRS_DEP_TIME'] // 100  # 1155 -> 11

In [136]:
df_encoded = pd.get_dummies(df1[['AIRLINE', 'ORIGIN', 'DEST']], drop_first=True)

In [137]:
X = pd.concat([df_encoded, df1[['MONTH', 'DAY_OF_WEEK', 'DEP_HOUR']]], axis=1)
df1['DELAY_SEVERE'] = (df1['DEP_DELAY'] > 15).astype(int)
y = df1['DELAY_SEVERE']

In [138]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

In [139]:
from xgboost import XGBClassifier
scale = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
model = XGBClassifier(
                    max_depth=6,
                    learning_rate=0.05,
                    random_state=42,
                    use_label_encoder=False,
                    n_estimators=300,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    eval_metric='logloss',
                    scale_pos_weight=scale
)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [140]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.64      0.74    481638
           1       0.27      0.62      0.37    102534

    accuracy                           0.63    584172
   macro avg       0.58      0.63      0.56    584172
weighted avg       0.78      0.63      0.68    584172

[[307237 174401]
 [ 39110  63424]]


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.56      0.57     95247
           1       0.61      0.63      0.62    103125

    accuracy                           0.60    198372
   macro avg       0.60      0.59      0.59    198372
weighted avg       0.60      0.60      0.60    198372

[[52946 42301]
 [37754 65371]]


In [None]:
new_input = pd.DataFrame([{
    'DEP_HOUR': 15,
    'DAY_OF_WEEK': 1,
    'MONTH': 8,
    'YEAR': 2025,
    'AIRLINE_DL': 1,
    'ORIGIN_LAX': 1,
    'DEST_JFK': 1,
}], columns=X.columns)  

prediction = model.predict_proba(new_input)[0][1]  # probability of delay
print(f"Chance of delay: {prediction:.2%}")

Chance of delay: 8.45%
