In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
df = pd.read_csv("APL_Logistics_features.csv")
print("Data loaded. Shape:", df.shape)


Data loaded. Shape: (180519, 35)


In [3]:
y = (df['Late_delivery_risk'] > 0).astype(int)
print("Target distribution:\n", y.value_counts())


Target distribution:
 1    98977
0    81542
Name: Late_delivery_risk, dtype: int64


In [4]:
# 3.1️⃣ Define the features you want to use
safe_features = [
    'Days for shipping (real)',
    'Days for shipment (scheduled)',
    'Benefit per order',
    'Sales per customer',
    'Category Name',
    'Customer City',
    'Customer Country',
    'Customer Segment',
    'Customer State',
    'Department Name',
    'Market',
    'Order City',
    'Order Country',
    'Order Region',
    'Order Item Discount',
    'Order Item Discount Rate',
    'Order Item Product Price',
    'Order Item Profit Ratio',
    'Order Item Quantity',
    'Sales',
    'Order Item Total',
    'Order Profit Per Order',
    'Product Price',
    'Shipping_Pressure_Index',
    'Shipping_Mode_Express',
    'Regional_Congestion_Index',
    'Order_Complexity_Score'
]

# 3.2️⃣ Keep only columns that exist in df
safe_features_existing = [c for c in safe_features if c in df.columns]

# 3.3️⃣ Save this list for evaluation
joblib.dump(safe_features_existing, "feature_columns.pkl")
print("Feature columns saved:", safe_features_existing)

# 3.4️⃣ Prepare X
X = df[safe_features_existing]
print("X shape:", X.shape)


Feature columns saved: ['Days for shipping (real)', 'Days for shipment (scheduled)', 'Benefit per order', 'Sales per customer', 'Category Name', 'Customer City', 'Customer Country', 'Customer Segment', 'Customer State', 'Department Name', 'Market', 'Order City', 'Order Country', 'Order Region', 'Order Item Discount', 'Order Item Discount Rate', 'Order Item Product Price', 'Order Item Profit Ratio', 'Order Item Quantity', 'Sales', 'Order Item Total', 'Order Profit Per Order', 'Product Price', 'Shipping_Pressure_Index', 'Shipping_Mode_Express', 'Regional_Congestion_Index', 'Order_Complexity_Score']
X shape: (180519, 27)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (144415, 27)
Test shape: (36104, 27)


In [6]:
lr = LogisticRegression(max_iter=1000, class_weight='balanced', n_jobs=-1)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("=== Logistic Regression Performance ===")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


=== Logistic Regression Performance ===
[[15320   988]
 [   11 19785]]
              precision    recall  f1-score   support

           0       1.00      0.94      0.97     16308
           1       0.95      1.00      0.98     19796

    accuracy                           0.97     36104
   macro avg       0.98      0.97      0.97     36104
weighted avg       0.97      0.97      0.97     36104



In [7]:
rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("=== Random Forest Performance ===")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


=== Random Forest Performance ===
[[15439   869]
 [    2 19794]]
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     16308
           1       0.96      1.00      0.98     19796

    accuracy                           0.98     36104
   macro avg       0.98      0.97      0.98     36104
weighted avg       0.98      0.98      0.98     36104



In [8]:
xgb_clf = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=(y_train==0).sum()/(y_train==1).sum(),
    use_label_encoder=False,
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42
)
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)

print("=== XGBoost Performance ===")
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


=== XGBoost Performance ===
[[15442   866]
 [    0 19796]]
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     16308
           1       0.96      1.00      0.98     19796

    accuracy                           0.98     36104
   macro avg       0.98      0.97      0.98     36104
weighted avg       0.98      0.98      0.98     36104



In [9]:
import joblib

# Save XGBoost model
joblib.dump(xgb_clf, "APL_Logistics_best_model_xgb.pkl")
print("XGBoost model saved successfully")


XGBoost model saved successfully
