#### Machine Learning

**Logistic Regression**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, recall_score

In [2]:
# Read the on-time performance extract and preview its first rows.
csv_path = '../data/ontime_2025.csv'
df_airlines = pd.read_csv(csv_path)
df_airlines.head()

Unnamed: 0,year,month,day_of_month,day_of_week,op_unique_carrier,op_carrier_airline_id,op_carrier_fl_num,origin,origin_city_name,origin_state_abr,...,carrier_delay,weather_delay,nas_delay,security_delay,dep_time_dt,arr_time_dt,dep_bin,season,delay_category,fl_date
0,2024,11,1,5,9E,20363,4800,CLT,"Charlotte, NC",NC,...,6.0,0.0,0.0,0.0,1900-01-01 18:28:00,1900-01-01 20:15:00,Early Evening,Autumn,Moderate Delay,2024-11-01
1,2024,11,1,5,9E,20363,4801,CVG,"Cincinnati, OH",KY,...,0.0,0.0,0.0,0.0,1900-01-01 17:41:00,1900-01-01 18:49:00,Afternoon,Autumn,On Time,2024-11-01
2,2024,11,1,5,9E,20363,4802,DSM,"Des Moines, IA",IA,...,0.0,0.0,0.0,0.0,1900-01-01 15:54:00,1900-01-01 16:57:00,Afternoon,Autumn,On Time,2024-11-01
3,2024,11,1,5,9E,20363,4803,CVG,"Cincinnati, OH",KY,...,0.0,0.0,0.0,0.0,1900-01-01 13:50:00,1900-01-01 14:55:00,Midday,Autumn,On Time,2024-11-01
4,2024,11,1,5,9E,20363,4804,GSP,"Greer, SC",SC,...,0.0,0.0,0.0,0.0,1900-01-01 12:45:00,1900-01-01 14:35:00,Midday,Autumn,On Time,2024-11-01


In [3]:
# Copy `dep_del15` into a new `is_delayed` column; this is the column later
# selected as the model target (`target = 'is_delayed'`).
df_airlines['is_delayed'] = df_airlines['dep_del15']

In [4]:
# Keep only rows whose `cancelled` flag is 0; `.copy()` detaches the result
# from df_airlines so later edits to df_ml never write back to the source frame.
not_cancelled = df_airlines['cancelled'] == 0
df_ml = df_airlines.loc[not_cancelled].copy()


- numeric_features = ['month', 'day_of_month', 'day_of_week']

- categorical_features = [
    'dep_bin',
    'season',
    'origin',
    'dest',
    'op_unique_carrier'
]


In [5]:
# Split the modelling frame into its numeric and categorical feature columns.
numeric_cols = ['month', 'day_of_month', 'day_of_week']
categorical_cols = ['dep_bin', 'season', 'origin', 'dest', 'op_unique_carrier']

df_ml_numeric = df_ml[numeric_cols]
df_ml_cat = df_ml[categorical_cols]

In [6]:
# Standardise the numeric columns and wrap the result back into a DataFrame
# that keeps the original column names and row index.
numeric_transformer = StandardScaler()
numeric_steps = [('num', numeric_transformer, df_ml_numeric.columns)]
preprocessor = ColumnTransformer(transformers=numeric_steps)

ml_numeric_scaled = preprocessor.fit_transform(df_ml_numeric)

df_ml_numeric_scaled = pd.DataFrame(
    data=ml_numeric_scaled,
    index=df_ml_numeric.index,
    columns=df_ml_numeric.columns,
)

In [7]:
# Inspect the standardised numeric features (rich display of the frame).
df_ml_numeric_scaled

Unnamed: 0,month,day_of_month,day_of_week
0,1.296277,-1.680872,0.494745
1,1.296277,-1.680872,0.494745
2,1.296277,-1.680872,0.494745
3,1.296277,-1.680872,0.494745
4,1.296277,-1.680872,0.494745
...,...,...,...
7014744,1.001899,1.737789,0.494745
7014745,1.001899,1.737789,0.494745
7014746,1.001899,1.737789,0.494745
7014748,1.001899,1.737789,0.494745


In [8]:
# One-hot encode the categorical columns (first level of each column dropped).
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, df_ml_cat.columns)
    ]
)

ml_cat_scaled = preprocessor.fit_transform(df_ml_cat)

feature_names = preprocessor.get_feature_names_out()

# Keep the encoded matrix sparse when the transformer returns a sparse result:
# calling `.toarray()` would materialise every zero of a (rows x categories)
# one-hot matrix, which for millions of rows and hundreds of airport/carrier
# levels can exhaust memory.
if hasattr(ml_cat_scaled, "toarray"):
    df_ml_cat_scaled = pd.DataFrame.sparse.from_spmatrix(
        ml_cat_scaled,
        index=df_ml_cat.index,
        columns=feature_names,
    )
else:
    df_ml_cat_scaled = pd.DataFrame(
        ml_cat_scaled,
        columns=feature_names,
        index=df_ml_cat.index,
    )

In [None]:
# Place the scaled numeric block and the one-hot block side by side
# (pd.concat aligns them on the row index).
encoded_blocks = [df_ml_numeric_scaled, df_ml_cat_scaled]
df_ml_concat = pd.concat(encoded_blocks, axis='columns')

In [None]:
# Raw (un-encoded) input columns fed to the modelling pipeline.
features = [
    'month',
    'day_of_month',
    'day_of_week',
    'dep_bin',
    'season',
    'origin',
    'dest',
    'op_unique_carrier'
]

target = 'is_delayed'

# Select from df_ml, not df_ml_concat: df_ml_concat only holds the scaled
# numeric columns plus the generated one-hot columns, so it contains neither
# the raw categorical columns listed above nor the target column, and indexing
# it with these names raises KeyError. Encoding is left to the pipeline.
X = df_ml[features]
y = df_ml[target]

#### Logistic Regression Model

In [None]:
# Build a dedicated transformer for the model that handles BOTH feature groups.
# The notebook-level `preprocessor` was last rebound to the categorical-only
# ColumnTransformer, whose default remainder='drop' would silently discard
# month / day_of_month / day_of_week.
model_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), list(df_ml_numeric.columns)),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), list(df_ml_cat.columns)),
    ]
)

# Fitting happens inside the pipeline, so scaling/encoding statistics are
# learned from whatever data is passed to `log_reg.fit`.
log_reg = Pipeline(steps=[
    ('preprocessor', model_preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

*Split into Training and Testing sets*

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

*Fit the model*

In [None]:
# Fit the preprocessing + logistic-regression pipeline on the training split.
log_reg.fit(X_train, y_train)

*Predict and Evaluate*

In [None]:
# Predict the held-out rows and print the per-class metric summary.
y_pred = log_reg.predict(X_test)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

*Visualize the performance*

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the fitted pipeline evaluated on the held-out split.
ConfusionMatrixDisplay.from_estimator(
    estimator=log_reg,
    X=X_test,
    y=y_test,
    cmap='Blues',
)
plt.title("Flight Delay Prediction Performance")
plt.show()

In [None]:
# Class balance of the target. Read it from df_ml: df_ml_concat only contains
# the encoded feature columns, so it has no 'is_delayed' column.
df_ml['is_delayed'].value_counts()

In [None]:
import joblib

# Persist the fitted pipeline to the working directory.
model_path = "log_reg_model.pkl"
joblib.dump(log_reg, model_path)

#### Oversampling

In [None]:
# Bar chart of the target's class counts before any resampling.
# Read the target from df_ml: df_ml_concat has no 'is_delayed' column.
delayed_plt = df_ml["is_delayed"].value_counts()
ax = delayed_plt.plot(kind="bar")
ax.set(
    title="Class balance before resampling",
    xlabel="is_delayed",
    ylabel="Number of flights",
)
plt.show()

In [None]:
# NOTE(review): disabled scratch code. The next cell builds the same kind of
# frame from X_train under the name `train`; consider deleting this cell.
# X_train_scaled_df = pd.DataFrame(X_train, columns=X_train.columns, index=X_train.index)
# X_test_scaled_df  = pd.DataFrame(X_test, columns=X_test.columns, index=X_test.index)

In [None]:
# Work on an explicit, independent copy of the training features.
# Re-wrapping X_train with pd.DataFrame(...) does not guarantee a copy, so the
# label column added to `train` afterwards could end up visible through
# X_train as well (target leakage into the resampling steps that reuse X_train).
train = X_train.copy()

In [None]:
# Inspect the training features.
X_train

In [None]:
# Attach the labels to the training frame. `.values` drops y_train's index, so
# the labels are assigned by position; `train` was built from X_train, which
# comes from the same train_test_split call as y_train.
train["is_delayed"] = y_train.values

In [None]:
# Inspect the label column that was just attached.
train["is_delayed"]

In [None]:
# Class counts within the training split only.
train["is_delayed"].value_counts()

In [None]:
# Separate the training rows by class label (1 = delayed, 0 = not delayed).
delayed_mask = train["is_delayed"] == 1
not_delayed_mask = train["is_delayed"] == 0

delayed = train[delayed_mask]
no_delayed = train[not_delayed_mask]

In [None]:
# Row counts of the two class subsets: (delayed, not delayed).
(len(delayed), len(no_delayed))

In [None]:
from sklearn.utils import resample

# Draw delayed rows with replacement until there are as many of them as there
# are non-delayed rows; the fixed seed makes the draw repeatable.
target_size = len(no_delayed)
yes_oversampled = resample(
    delayed,
    replace=True,
    n_samples=target_size,
    random_state=42,
)

In [None]:
# Stack the resampled delayed rows on top of the untouched non-delayed rows.
balanced_parts = [yes_oversampled, no_delayed]
train_over = pd.concat(balanced_parts)
train_over

In [None]:
# Bar chart of the class counts after manual oversampling.
delayed_plt = train_over["is_delayed"].value_counts()
delayed_plt.plot.bar()
plt.show()

In [None]:
# Split the balanced frame back into label vector and feature matrix.
y_train_over = train_over["is_delayed"]
X_train_over = train_over.drop(columns="is_delayed")

In [None]:
# Inspect the oversampled feature matrix.
X_train_over

In [None]:
# Inspect the oversampled label vector.
y_train_over

In [None]:
# Train on the manually oversampled data with an unfitted copy of the
# baseline pipeline. The feature frame still holds raw categorical columns
# (dep_bin, season, origin, dest, op_unique_carrier), which a bare
# LogisticRegression cannot fit; cloning `log_reg` reuses its encoding step and
# its max_iter setting while leaving the already-fitted baseline untouched.
log_reg_over = clone(log_reg)
log_reg_over.fit(X_train_over, y_train_over)

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Balance the training split by randomly duplicating minority-class rows.
over_sampling = RandomOverSampler(random_state=42)
X_over, y_over = over_sampling.fit_resample(X_train, y_train)

# The resampled frame still contains raw categorical columns, so fit an
# unfitted copy of the baseline pipeline (encoder + classifier) rather than a
# bare LogisticRegression. `recall_score` comes from the notebook's import cell.
lr_over = clone(log_reg).fit(X_over, y_over)
print(f"Oversampling Recall: {recall_score(y_test, lr_over.predict(X_test))}")

#### Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the training split by randomly discarding majority-class rows.
uner_sampling = RandomUnderSampler(random_state=42)
X_under, y_under = uner_sampling.fit_resample(X_train, y_train)

# The resampled frame still contains raw categorical columns, so fit an
# unfitted copy of the baseline pipeline (encoder + classifier) rather than a
# bare LogisticRegression. `recall_score` comes from the notebook's import cell.
lr_under = clone(log_reg).fit(X_under, y_under)
print(f"Undersampling Recall: {recall_score(y_test, lr_under.predict(X_test))}")

#### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE synthesises new rows by interpolating between neighbours, so it needs a
# purely numeric matrix; the raw training frame holds string columns. Encode
# first with an unfitted copy of the baseline pipeline's preprocessing step,
# fitted on the training split only.
# NOTE(review): interpolating one-hot columns yields fractional dummy values;
# SMOTENC on the raw frame may be the better tool — confirm the intent.
smote_encoder = clone(log_reg.named_steps['preprocessor'])
X_train_encoded = smote_encoder.fit_transform(X_train)
X_test_encoded = smote_encoder.transform(X_test)

smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train_encoded, y_train)

# The classifier is trained on encoded features, so the test split must pass
# through the same fitted encoder before prediction.
lr_smote = LogisticRegression(max_iter=1000).fit(X_smote, y_smote)
print(f"SMOTE Recall: {recall_score(y_test, lr_smote.predict(X_test_encoded))}")