In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np 
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


In [56]:
df = pd.read_csv("dataset/processed/jfk_optimized.csv")
df.head()

Unnamed: 0,year,scheduled_elapsed_time,label,dep_sin,dep_cos,is_holiday_or_weekend,month_1,month_2,month_3,month_4,...,day_of_week_5,day_of_week_6,day_of_week_7,carrier_AA,carrier_B6,carrier_DL,departure_bin_night,departure_bin_morning,departure_bin_afternoon,departure_bin_evening
0,2014,375.0,not_delayed,0.965926,-0.258819,1,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False
1,2014,385.0,delayed,0.965926,-0.258819,1,True,False,False,False,...,False,False,False,False,False,True,False,True,False,False
2,2014,385.0,delayed,0.707107,-0.707107,1,True,False,False,False,...,False,False,False,True,False,False,False,True,False,False
3,2014,389.0,delayed,0.707107,-0.707107,1,True,False,False,False,...,False,False,False,False,False,True,False,True,False,False
4,2014,374.0,not_delayed,0.639439,-0.768842,1,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False


In [57]:
df["label"] = df["label"].map({"not_delayed": 0, "delayed": 1})

df_train = df[df["year"].between(2014, 2019)]
df_val = df[df["year"] == 2019]
df_test = df[df["year"].between(2022, 2024)]

In [58]:
print("Train size:", len(df_train))
print("Val size:  ", len(df_val))
print("Test size: ", len(df_test))

Train size: 60625
Val size:   10930
Test size:  29230


In [59]:
print("Train class distribution:\n", df_train['label'].value_counts(normalize=True))
print("Val class distribution:\n", df_val['label'].value_counts(normalize=True))
print("Test class distribution:\n", df_test['label'].value_counts(normalize=True))

Train class distribution:
 label
0    0.675052
1    0.324948
Name: proportion, dtype: float64
Val class distribution:
 label
0    0.726167
1    0.273833
Name: proportion, dtype: float64
Test class distribution:
 label
0    0.652275
1    0.347725
Name: proportion, dtype: float64


In [60]:
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    scale_pos_weight=2.5,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    use_label_encoder=False,
    early_stopping_rounds=30,
    verbose=True
)

In [61]:
X = df_train.drop(columns=["label"])
y = df_train["label"]

X_val = df_val.drop(columns=["label"])
y_val = df_val["label"]

X_test = df_test.drop(columns=["label"])
y_test = df_test["label"]

In [62]:
model.fit(
    X, y,
    eval_set=[(X_val, y_val)],    
)

[0]	validation_0-logloss:0.73160
[1]	validation_0-logloss:0.72465
[2]	validation_0-logloss:0.71858
[3]	validation_0-logloss:0.71249
[4]	validation_0-logloss:0.70710
[5]	validation_0-logloss:0.70225
[6]	validation_0-logloss:0.69776
[7]	validation_0-logloss:0.69318
[8]	validation_0-logloss:0.68910
[9]	validation_0-logloss:0.68538
[10]	validation_0-logloss:0.68190
[11]	validation_0-logloss:0.68018
[12]	validation_0-logloss:0.67814
[13]	validation_0-logloss:0.67501
[14]	validation_0-logloss:0.67265
[15]	validation_0-logloss:0.66955
[16]	validation_0-logloss:0.66736
[17]	validation_0-logloss:0.66472
[18]	validation_0-logloss:0.66224
[19]	validation_0-logloss:0.66000
[20]	validation_0-logloss:0.65800
[21]	validation_0-logloss:0.65623
[22]	validation_0-logloss:0.65450
[23]	validation_0-logloss:0.65300
[24]	validation_0-logloss:0.65159
[25]	validation_0-logloss:0.65095
[26]	validation_0-logloss:0.65031
[27]	validation_0-logloss:0.64967
[28]	validation_0-logloss:0.64821
[29]	validation_0-loglos

Parameters: { "use_label_encoder", "verbose" } are not used.

  self.starting_round = model.num_boosted_rounds()


[31]	validation_0-logloss:0.64615
[32]	validation_0-logloss:0.64471
[33]	validation_0-logloss:0.64301
[34]	validation_0-logloss:0.64179
[35]	validation_0-logloss:0.64129
[36]	validation_0-logloss:0.64076
[37]	validation_0-logloss:0.63967
[38]	validation_0-logloss:0.63938
[39]	validation_0-logloss:0.63842
[40]	validation_0-logloss:0.63731
[41]	validation_0-logloss:0.63713
[42]	validation_0-logloss:0.63646
[43]	validation_0-logloss:0.63616
[44]	validation_0-logloss:0.63508
[45]	validation_0-logloss:0.63387
[46]	validation_0-logloss:0.63301
[47]	validation_0-logloss:0.63248
[48]	validation_0-logloss:0.63164
[49]	validation_0-logloss:0.63126
[50]	validation_0-logloss:0.63048
[51]	validation_0-logloss:0.62994
[52]	validation_0-logloss:0.62901
[53]	validation_0-logloss:0.62844
[54]	validation_0-logloss:0.62797
[55]	validation_0-logloss:0.62765
[56]	validation_0-logloss:0.62709
[57]	validation_0-logloss:0.62621
[58]	validation_0-logloss:0.62535
[59]	validation_0-logloss:0.62460
[60]	validatio

In [63]:
y_val_pred = model.predict(X_val)
y_val_proba = model.predict_proba(X_val)[:, 1]

print(classification_report(y_val, y_val_pred))
print("Validation ROC AUC:", roc_auc_score(y_val, y_val_proba))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.84      0.69      0.76      7937
           1       0.45      0.65      0.53      2993

    accuracy                           0.68     10930
   macro avg       0.64      0.67      0.65     10930
weighted avg       0.73      0.68      0.70     10930

Validation ROC AUC: 0.7459230077016882
Confusion Matrix:
 [[5515 2422]
 [1033 1960]]
