In [50]:
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Load the cleaned dataset. low_memory=False makes pandas parse the file in a
# single pass instead of in chunks, which avoids mixed-dtype column inference.
# NOTE(review): the preview below displays passenger names and email
# addresses; clear or redact this output before sharing the notebook.
df = pd.read_csv("cleaned_file.csv", low_memory=False)
df.head()


Unnamed: 0,ResNumber,EntityCode,ServiceAreaCode,DispatchVehicleCode,DispatchDriverCode,Pickup Date,Pickup DateTime,Passenger,CustomerCode,InvoiceNumber,...,OnLocationDateTime,PassengerEmailAddress,CustomerName,NumberOfPassengers,CCBatchRowId$1476,CreditCardNumberPartial,Distance,Pickup Country,Dropoff Country,PassengerOnBoardDateTime
0,1090385*1,SUN-NW,US-NYC,1619,251619,2024-10-01 00:00:00,2024-10-01 00:00:00,"CREW,AVIANCA",61789,292114.0,...,2024-10-01 00:02:00,,Avianca (Long Island Marriott),7,,,17.8,US,US,2024-10-01 00:02:00
1,1087026*7,SUN-NW,US-NYC,1696-1,251696-1,2024-10-01 00:00:00,2024-10-01 00:20:00,"CREW,AVIANCA",61789,292114.0,...,2024-09-30 23:30:00,,Avianca (Long Island Marriott),9,,,17.8,US,US,2024-10-01 00:23:00
2,1085036*1,SUN-LA,US-WDC,8498,108498,2024-10-01 00:00:00,2024-10-01 00:26:00,"RODRIGUEZ,JOSE",50241,,...,2024-10-01 00:06:00,,AMGEN VVIP,1,11303.0,1007.0,31.13,US,US,2024-10-01 01:06:00
3,1090466*1,SUN-NW,US-NYC,2890,251223-00,2024-10-01 00:00:00,2024-10-01 00:40:00,"CREW,AVIANCA",61789,292114.0,...,2024-09-30 23:49:00,,Avianca (Long Island Marriott),7,,,19.93,US,US,2024-10-01 00:53:00
4,1089861*1,SUN-NW,US-WDC,1246,109244,2024-10-01 00:00:00,2024-10-01 00:53:00,"POLAK,BENJAMIN",6405,291427.0,...,2024-09-30 23:47:00,POLAK.BEN@BCG.COM;GARITY.KATHY@BCG.COM;GENINVO...,BOSTON CONSULTING DALLAS,1,,,4.01,US,US,2024-10-01 00:59:00


Convert Date Columns + Split Train (Oct 2024–Mar 2025) / Test (Apr–Sep 2025)

In [51]:
# === Chronological train/test split: Oct 2024 → Sep 2025 ===

# Parse both timestamp columns; values that cannot be parsed become NaT
for ts_col in ("Pickup DateTime", "PassengerOnBoardDateTime"):
    df[ts_col] = pd.to_datetime(df[ts_col], errors="coerce")

# Keep only rows where both timestamps are present
df = df.dropna(subset=["Pickup DateTime", "PassengerOnBoardDateTime"])

# Calendar parts of the pickup timestamp, used by the window masks below
pickup_ts = df["Pickup DateTime"]
df["year"] = pickup_ts.dt.year
df["month"] = pickup_ts.dt.month

# Training window (6 months): Oct 2024 → Mar 2025
in_oct_dec_2024 = (df["year"] == 2024) & df["month"].between(10, 12)
in_jan_mar_2025 = (df["year"] == 2025) & df["month"].between(1, 3)
train_df = df[in_oct_dec_2024 | in_jan_mar_2025].copy()

# Testing window (6 months): Apr 2025 → Sep 2025
in_apr_sep_2025 = (df["year"] == 2025) & df["month"].between(4, 9)
test_df = df[in_apr_sep_2025].copy()

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)



Train shape: (40526, 90)
Test shape: (41978, 90)


Create WaitTimeMinutes + Delay_Flag

In [52]:
# === Derive WaitTimeMinutes and the binary Delay_Flag target ===

# Delay threshold in minutes (SLA); waits strictly above it count as delayed
delay_threshold = 15

for frame in (train_df, test_df):
    # Minutes between the pickup timestamp and the passenger being on board
    boarding_gap = frame["PassengerOnBoardDateTime"] - frame["Pickup DateTime"]
    frame["WaitTimeMinutes"] = boarding_gap.dt.total_seconds() / 60

    # Binary target: 1 when the wait exceeds the threshold, otherwise 0
    frame["Delay_Flag"] = (frame["WaitTimeMinutes"] > delay_threshold).astype(int)

print("Train delay distribution:")
print(train_df["Delay_Flag"].value_counts())
print("\nTest delay distribution:")
print(test_df["Delay_Flag"].value_counts())


Train delay distribution:
Delay_Flag
1    20543
0    19983
Name: count, dtype: int64

Test delay distribution:
Delay_Flag
1    21740
0    20238
Name: count, dtype: int64


Create Time Features (Hour, Weekday, Weekend)

In [53]:
# === Calendar features derived from the pickup timestamp ===

for frame in (train_df, test_df):
    pickup_parts = frame["Pickup DateTime"].dt

    # Hour of day, 0–23
    frame["Pickup_Hour"] = pickup_parts.hour

    # Day of week, Monday=0 … Sunday=6
    frame["Pickup_Weekday"] = pickup_parts.dayofweek

    # 1 on Saturday (5) or Sunday (6), otherwise 0
    frame["IsWeekend"] = (frame["Pickup_Weekday"] >= 5).astype(int)

# Sanity check: show the new columns next to the timestamp they came from
train_df[["Pickup DateTime", "Pickup_Hour", "Pickup_Weekday", "IsWeekend"]].head()


Unnamed: 0,Pickup DateTime,Pickup_Hour,Pickup_Weekday,IsWeekend
0,2024-10-01 00:00:00,0,1,0
1,2024-10-01 00:20:00,0,1,0
2,2024-10-01 00:26:00,0,1,0
3,2024-10-01 00:40:00,0,1,0
4,2024-10-01 00:53:00,0,1,0


Encode ServiceAreaCode & Pickup City

In [68]:
# === Label encode categorical variables ===
# For each categorical column: fit a LabelEncoder on the training split only,
# add a "<col>_enc" integer column to both splits, and save the fitted encoder
# to "<col>_encoder.pkl". LabelEncoder is imported in the notebook's first cell.

categorical_cols = ["ServiceAreaCode", "Pickup City"]

label_encoders = {}

for col in categorical_cols:
    # Cast to string so missing values become the literal "nan" and every
    # value has one comparable type for the encoder.
    train_df[col] = train_df[col].astype(str)
    test_df[col] = test_df[col].astype(str)

    # Fit on training data only
    le = LabelEncoder().fit(train_df[col])
    label_encoders[col] = le

    # Transform for train
    train_df[col + "_enc"] = le.transform(train_df[col])

    # Transform for test with one vectorised dictionary lookup instead of a
    # per-row le.transform([x]) call. Labels not seen in training map to 0,
    # exactly as before.
    # NOTE(review): 0 is also the code of the first training class, so unseen
    # labels are indistinguishable from that class downstream — consider a
    # dedicated "unknown" category.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test_df[col + "_enc"] = (
        test_df[col].map(code_by_label).fillna(0).astype("int64")
    )

# Persist the fitted encoders
for col, le in label_encoders.items():
    joblib.dump(le, f"{col}_encoder.pkl")

# Quick check — kept as the last expression so the preview is actually displayed
train_df[["ServiceAreaCode", "ServiceAreaCode_enc",
          "Pickup City", "Pickup City_enc"]].head()

In [55]:
# === Columns fed to the model ===

# Calendar features built from the pickup timestamp
time_features = ["Pickup_Hour", "Pickup_Weekday", "IsWeekend"]
# Numeric columns used as-is
numeric_features = ["Distance", "BilledNumberMinutesInHours", "BaseChargeTotal"]
# Label-encoded categoricals
encoded_features = ["ServiceAreaCode_enc", "Pickup City_enc"]

# Order matters: the fitted model records feature names in this order
feature_cols = time_features + numeric_features + encoded_features

print("Features we will use:")
print(feature_cols)


Features we will use:
['Pickup_Hour', 'Pickup_Weekday', 'IsWeekend', 'Distance', 'BilledNumberMinutesInHours', 'BaseChargeTotal', 'ServiceAreaCode_enc', 'Pickup City_enc']


In [56]:
# === Build model-ready copies of the train/test frames ===

def _to_model_frame(frame):
    """Return a copy of `frame` prepared for modelling.

    Every column in `feature_cols` is coerced to numeric (unparseable values
    become NaN), remaining NaNs in those columns are filled with 0, and rows
    with a missing `Delay_Flag` are dropped. The input frame is not modified.
    """
    prepared = frame.copy()
    prepared[feature_cols] = (
        prepared[feature_cols].apply(pd.to_numeric, errors="coerce").fillna(0)
    )
    return prepared.dropna(subset=["Delay_Flag"])


train_ml = _to_model_frame(train_df)
test_ml = _to_model_frame(test_df)

print("Training set size:", train_ml.shape)
print("Testing set size:", test_ml.shape)


Training set size: (40526, 97)
Testing set size: (41978, 97)


In [57]:
# === Feature matrices and target vectors ===

target_col = "Delay_Flag"

X_train, y_train = train_ml[feature_cols], train_ml[target_col]
X_test, y_test = test_ml[feature_cols], test_ml[target_col]

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("\nFeature dtypes:")
print(X_train.dtypes)


X_train shape: (40526, 8)
X_test shape: (41978, 8)

Feature dtypes:
Pickup_Hour                     int32
Pickup_Weekday                  int32
IsWeekend                       int64
Distance                      float64
BilledNumberMinutesInHours    float64
BaseChargeTotal               float64
ServiceAreaCode_enc             int64
Pickup City_enc                 int64
dtype: object


In [58]:
# === Fit the logistic-regression delay classifier ===

# max_iter is raised above scikit-learn's default of 100 to give the solver
# more iterations to converge; fit() returns the fitted estimator itself.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Save the fitted model next to the notebook
joblib.dump(log_reg, "delay_logreg_model.pkl")

print("Model training complete.")


Model training complete.


In [70]:
# Show the feature names, in order, that the fitted model recorded during fit()
print(log_reg.feature_names_in_)


['Pickup_Hour' 'Pickup_Weekday' 'IsWeekend' 'Distance'
 'BilledNumberMinutesInHours' 'BaseChargeTotal' 'ServiceAreaCode_enc'
 'Pickup City_enc']


In [59]:
# === Score the model on the test window ===

y_pred = log_reg.predict(X_test)

# Compute all metrics first, then report them
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_mat)


Accuracy: 0.6313783410357806

Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.57      0.60     20238
           1       0.63      0.69      0.66     21740

    accuracy                           0.63     41978
   macro avg       0.63      0.63      0.63     41978
weighted avg       0.63      0.63      0.63     41978


Confusion Matrix:
 [[11513  8725]
 [ 6749 14991]]


## Data Visualization (export dataset preparation) ##

In [60]:
# Decode the integer codes in train_df back to their string labels using the
# encoders fitted earlier. Maps new column name -> encoded source column.
decoded_columns = {
    "ServiceArea_original": "ServiceAreaCode",
    "PickupCity_original": "Pickup City",
}

for new_col, source_col in decoded_columns.items():
    train_df[new_col] = label_encoders[source_col].inverse_transform(
        train_df[source_col + "_enc"]
    )

In [61]:
# Detach the feature matrices from train_ml / test_ml so the columns added in
# the following cells do not touch the modelling frames.
# NOTE(review): after these cells X_train / X_test carry extra columns; re-run
# the cell that defines them before refitting the model.
X_train, X_test = X_train.copy(), X_test.copy()

In [62]:
# Attach the target to each feature frame; .values assigns by position
for features, target in ((X_train, y_train), (X_test, y_test)):
    features["delay"] = target.values

In [63]:
# Tag every row with the split it belongs to
for features, split_name in ((X_train, "train"), (X_test, "test")):
    features["split"] = split_name

In [64]:
# Stack train rows above test rows and renumber the index 0..n-1
final_df = pd.concat([X_train, X_test], ignore_index=True)
final_df.head()

Unnamed: 0,Pickup_Hour,Pickup_Weekday,IsWeekend,Distance,BilledNumberMinutesInHours,BaseChargeTotal,ServiceAreaCode_enc,Pickup City_enc,delay,split
0,0,1,0,17.8,0.0,110.0,0,735,0,train
1,0,1,0,17.8,0.0,110.0,0,735,0,train
2,0,1,0,31.13,0.0,89.0,1,344,1,train
3,0,1,0,19.93,0.0,110.0,0,350,0,train
4,0,1,0,4.01,0.0,45.0,1,171,0,train


In [65]:
# Attach human-readable labels for the two encoded categoricals.
#
# Test-period labels that were not seen in training are encoded as 0, so
# decoding the codes with inverse_transform would report them under the name
# of training class 0. Take the labels from the original string columns
# instead: final_df stacks the train_ml rows and then the test_ml rows in
# order, so the values line up by position.
readable_columns = {
    "ServiceArea": "ServiceAreaCode",
    "PickupCity": "Pickup City",
}

for new_col, source_col in readable_columns.items():
    labels = pd.concat(
        [train_ml[source_col], test_ml[source_col]], ignore_index=True
    )
    # Guard against stale kernel state (e.g. cells re-run out of order)
    if len(labels) != len(final_df):
        raise ValueError(
            f"{source_col}: {len(labels)} labels for {len(final_df)} rows; "
            "re-run the notebook from the top"
        )
    final_df[new_col] = labels.to_numpy()

In [66]:
# Preview the export frame now that the ServiceArea / PickupCity label columns are attached
final_df.head()

Unnamed: 0,Pickup_Hour,Pickup_Weekday,IsWeekend,Distance,BilledNumberMinutesInHours,BaseChargeTotal,ServiceAreaCode_enc,Pickup City_enc,delay,split,ServiceArea,PickupCity
0,0,1,0,17.8,0.0,110.0,0,735,0,train,US-NYC,Uniondale
1,0,1,0,17.8,0.0,110.0,0,735,0,train,US-NYC,Uniondale
2,0,1,0,31.13,0.0,89.0,1,344,1,train,US-WDC,IAD
3,0,1,0,19.93,0.0,110.0,0,350,0,train,US-NYC,JFK
4,0,1,0,4.01,0.0,45.0,1,171,0,train,US-WDC,DCA


In [47]:
# Write the combined train/test frame to disk.
# NOTE(review): with the default index=True the 0..n-1 row index is written as
# an unnamed first column; pass index=False if downstream readers do not need
# it — confirm before changing the file format.
final_df.to_csv('model_trained.csv')