In [75]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from joblib import dump

# Paths
FEATURES_PATH = "./data/features/features.parquet"
MODELS_DIR = "./models"
os.makedirs(MODELS_DIR, exist_ok=True)

# Load features
features = pd.read_parquet(FEATURES_PATH)

# Keep only rows with label
df = features.dropna(subset=["is_late"]).copy()
df = df.sort_values("order_purchase_timestamp")

print("Loaded dataset:", df.shape)
df[["order_purchase_timestamp","is_late"]].head()

Loaded dataset: (99441, 34)


Unnamed: 0,order_purchase_timestamp,is_late
4541,2016-09-04 21:15:19,False
4396,2016-09-05 00:15:34,False
10071,2016-09-13 15:24:19,False
30710,2016-09-15 12:16:38,True
83078,2016-10-02 22:07:52,False


In [77]:
train_end = "2017-06-01"
valid_end = "2017-10-01"

train = df[df["order_purchase_timestamp"] < train_end]
valid = df[(df["order_purchase_timestamp"] >= train_end) &
           (df["order_purchase_timestamp"] < valid_end)]
test  = df[df["order_purchase_timestamp"] >= valid_end]

print("Train/Valid/Test:", len(train), len(valid), len(test))

Train/Valid/Test: 11695 15887 71859


In [79]:
num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale",  StandardScaler())
])

# Use dense output from OneHotEncoder so HGB can consume it
try:
    cat_pipe = Pipeline([
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ohe",    OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])
except TypeError:
    # fallback for older versions
    cat_pipe = Pipeline([
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ohe",    OneHotEncoder(handle_unknown="ignore", sparse=False))
    ])

pre = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])


In [81]:
def evaluate_clf(model, name):
    model.fit(X_train, y_train)
    pred_v = model.predict(X_valid)
    proba_v = model.predict_proba(X_valid)[:,1] if hasattr(model,"predict_proba") else pred_v
    acc = accuracy_score(y_valid, pred_v)
    f1  = f1_score(y_valid, pred_v)
    auc = roc_auc_score(y_valid, proba_v)
    print(f"{name:30s}  ACC={acc:.3f}  F1={f1:.3f}  AUC={auc:.3f}")
    return model

# 1) Baseline classifier (predict majority class)
pipe_dummy = Pipeline([("pre", pre), ("mdl", DummyClassifier(strategy="most_frequent"))])
_ = evaluate_clf(pipe_dummy, "DummyClassifier(majority)")

# 2) RandomForest
pipe_rf = Pipeline([
    ("pre", pre),
    ("mdl", RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1))
])
best_model = evaluate_clf(pipe_rf, "RandomForestClassifier")

# 3) HistGradientBoosting (handles missing values too)
pipe_hgb = Pipeline([
    ("pre", pre),
    ("mdl", HistGradientBoostingClassifier(max_depth=6, learning_rate=0.1, random_state=42))
])
best_model = evaluate_clf(pipe_hgb, "HistGradientBoostingClassifier")


DummyClassifier(majority)       ACC=0.968  F1=0.000  AUC=0.500
RandomForestClassifier          ACC=0.968  F1=0.000  AUC=0.594
HistGradientBoostingClassifier  ACC=0.968  F1=0.000  AUC=0.563


In [82]:
pred_t = best_model.predict(X_test)
proba_t = best_model.predict_proba(X_test)[:,1] if hasattr(best_model,"predict_proba") else pred_t

print("=== Classification Report (Test) ===")
print(classification_report(y_test, pred_t))

print("Confusion Matrix:")
print(confusion_matrix(y_test, pred_t))

print("Test ROC-AUC:", roc_auc_score(y_test, proba_t))


=== Classification Report (Test) ===
              precision    recall  f1-score   support

       False       0.92      1.00      0.96     66278
        True       0.00      0.00      0.00      5581

    accuracy                           0.92     71859
   macro avg       0.46      0.50      0.48     71859
weighted avg       0.85      0.92      0.89     71859

Confusion Matrix:
[[66274     4]
 [ 5581     0]]
Test ROC-AUC: 0.5363413536070281


In [83]:
model_path = os.path.join(MODELS_DIR, "late_delivery_clf.joblib")
dump(best_model, model_path)
print("Model saved at:", model_path)

Model saved at: ./models\late_delivery_clf.joblib
