In [74]:
import os

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Locations of the engineered feature table (input) and the model artifacts (output).
FEATURES_PATH = "./data/features/features.parquet"
MODELS_DIR = "./models"

# Make sure the output directory exists; this is a no-op when it already does.
os.makedirs(MODELS_DIR, exist_ok=True)

# Read the feature table written in Step 3.
features = pd.read_parquet(FEATURES_PATH)

# Drop rows whose target is missing, then order the remainder by purchase
# time so the data is in temporal order. Both calls return new frames, so
# `features` itself is left untouched.
df = (
    features
    .dropna(subset=["actual_delivery_days"])
    .sort_values("order_purchase_timestamp")
)

print("Loaded:", df.shape)
df[["order_purchase_timestamp", "actual_delivery_days"]].head()

Loaded: (96476, 34)


Unnamed: 0,order_purchase_timestamp,actual_delivery_days
30710,2016-09-15 12:16:38,54.0
93285,2016-10-03 09:44:50,23.0
28424,2016-10-03 16:56:50,24.0
92636,2016-10-03 21:01:41,35.0
97979,2016-10-03 21:13:36,30.0


In [76]:
# Chronological cut-offs; tune these to the date coverage of your dataset.
train_end = "2017-06-01"
valid_end = "2017-10-01"

# Half-open windows on purchase time:
#   train: [.., train_end)   valid: [train_end, valid_end)   test: [valid_end, ..)
purchased_at = df["order_purchase_timestamp"]
in_train = purchased_at < train_end
in_valid = (purchased_at >= train_end) & (purchased_at < valid_end)
in_test = purchased_at >= valid_end

train = df[in_train]
valid = df[in_valid]
test = df[in_test]

print("Train/Valid/Test sizes:", len(train), len(valid), len(test))

Train/Valid/Test sizes: 11069 15350 70057


In [78]:
# Candidate predictors. Names that are absent from the feature table are
# skipped below, so these lists can safely over-specify.
num_candidates = [
    "price_sum", "freight_sum", "n_items", "n_sellers", "n_products", "avg_price",
    "pay_total", "pay_types", "installments_max",
    "avg_product_weight", "avg_product_length", "avg_product_height", "avg_product_width",
    "avg_distance_km",
    "purchase_month", "purchase_dow", "purchase_hour",
]
cat_candidates = ["customer_state", "customer_city", "main_category"]

available = set(df.columns)
num_cols = [col for col in num_candidates if col in available]
cat_cols = [col for col in cat_candidates if col in available]

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# Feature matrix / target vector for each split.
feature_cols = num_cols + cat_cols
target_col = "actual_delivery_days"

X_train, y_train = train[feature_cols], train[target_col]
X_valid, y_valid = valid[feature_cols], valid[target_col]
X_test, y_test = test[feature_cols], test[target_col]

# Numeric branch: fill gaps with the median, then standardise.
num_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent label, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch; any other column is dropped.
pre = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop",
)

# Show the ten numeric columns with the highest share of missing values in train.
print("NaN ratio in numeric cols (train):")
nan_ratio = X_train[num_cols].isna().mean()
display(nan_ratio.sort_values(ascending=False).head(10))

Numeric columns: ['price_sum', 'freight_sum', 'n_items', 'n_sellers', 'n_products', 'avg_price', 'pay_total', 'pay_types', 'installments_max', 'avg_product_weight', 'avg_product_length', 'avg_product_height', 'avg_product_width', 'avg_distance_km', 'purchase_month', 'purchase_dow', 'purchase_hour']
Categorical columns: ['customer_state', 'customer_city', 'main_category']
NaN ratio in numeric cols (train):


avg_distance_km       0.003794
avg_product_length    0.000903
avg_product_width     0.000903
avg_product_height    0.000903
avg_product_weight    0.000903
installments_max      0.000090
pay_total             0.000090
pay_types             0.000090
purchase_dow          0.000000
purchase_month        0.000000
dtype: float64

In [80]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    The value is the square root of scikit-learn's ``mean_squared_error``.
    The previous ``mean_squared_error(..., squared=False)`` form is avoided
    because the ``squared`` keyword was deprecated in scikit-learn 1.4 and
    removed in 1.6, where it raises ``TypeError``.

    For single-output targets (the case in this notebook) the result is
    identical to the old call.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    float
        The root mean squared error.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Later cells call `root_mean_squared_error`; binding that name to the helper
# above keeps them on the version-independent implementation.
root_mean_squared_error = rmse

In [82]:
def evaluate(model, name):
    """
    Train ``model`` on the training split and score it on the validation split.

    Fits on the notebook-level ``X_train`` / ``y_train``, predicts on
    ``X_valid``, and prints one summary line labelled with ``name`` that
    shows the validation MAE and RMSE.

    Returns a ``(mae, rmse, model)`` tuple, where ``model`` is the same
    object that was passed in (fitted in place).
    """
    model.fit(X_train, y_train)
    valid_pred = model.predict(X_valid)

    valid_mae = mean_absolute_error(y_valid, valid_pred)
    valid_rmse = root_mean_squared_error(y_valid, valid_pred)

    print(f"{name:24s}  Valid MAE={valid_mae:.3f}  RMSE={valid_rmse:.3f}")
    return valid_mae, valid_rmse, model

# Pipeline does not clone its steps, so handing the same `pre` object to both
# pipelines would make them share (and refit) a single preprocessor. Each
# pipeline therefore gets its own unfitted copy.

# 1) Baseline: always predict the mean of the training target
pipe_dummy = Pipeline([
    ("pre", clone(pre)),
    ("mdl", DummyRegressor(strategy="mean"))
])
# Keep the baseline metrics so the real model can be compared against them.
mae_dummy, rmse_dummy, _ = evaluate(pipe_dummy, "DummyRegressor(mean)")

# 2) Random Forest Regressor
pipe_rf = Pipeline([
    ("pre", clone(pre)),
    ("mdl", RandomForestRegressor(
        n_estimators=400, random_state=42, n_jobs=-1
    ))
])
mae_rf, rmse_rf, best_model = evaluate(pipe_rf, "RandomForestRegressor")

# Sanity check: a model that cannot beat "always predict the mean" on the
# validation split should not be trusted. `best_model` is still the forest so
# the downstream save/load cells keep working, but the problem is made loud.
if mae_rf >= mae_dummy:
    print(
        "WARNING: RandomForestRegressor does not beat the mean baseline on validation "
        f"(MAE {mae_rf:.3f} vs {mae_dummy:.3f}). "
        "Revisit the split dates and features before relying on this model."
    )



DummyRegressor(mean)      Valid MAE=5.472  RMSE=7.727
RandomForestRegressor     Valid MAE=8.013  RMSE=10.728




In [83]:
# Score the selected pipeline on the held-out test split.
pred_t = best_model.predict(X_test)
mae_t, rmse_t = (
    mean_absolute_error(y_test, pred_t),
    root_mean_squared_error(y_test, pred_t),
)

print(f"TEST — MAE={mae_t:.3f}  RMSE={rmse_t:.3f}")

TEST — MAE=7.356  RMSE=10.495




In [84]:
# Persist the fitted pipeline (preprocessing and estimator together) so it
# can be reloaded and used for prediction as a single object.
model_path = os.path.join(MODELS_DIR, "delivery_time_rf.joblib")
dump(value=best_model, filename=model_path)

print("Model saved at:", model_path)

Model saved at: ./models\delivery_time_rf.joblib


In [85]:
from joblib import load

# Reload the pipeline from the location it was saved to. The path is built
# from MODELS_DIR rather than repeated as a literal, so the save and load
# steps cannot drift apart.
# NOTE: joblib files are pickle-based; only load artifacts you created or trust.
model_path = os.path.join(MODELS_DIR, "delivery_time_rf.joblib")
loaded_model = load(model_path)

# Smoke test: predict the first few test rows and show them next to the truth.
sample = X_test.head(5)
print("Predicted delivery days:", loaded_model.predict(sample))
print("Actual delivery days:", y_test.head(5).tolist())

Predicted delivery days: [14.5525 15.055  22.8025 16.185  22.2475]
Actual delivery days: [5.0, 12.0, 8.0, 9.0, 15.0]
