In [1]:
%pip install pandas matplotlib
%pip install scikit-learn
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
from datetime import timedelta

In [3]:
df = pd.read_csv("sampleData/master_orders.csv")
df["Order Date"] = pd.to_datetime(df["Order Date"], utc=True)
df = df.sort_values("Order Date")

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 17684 entries, 17563 to 1685
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   Order Date     17506 non-null  datetime64[ns, UTC]
 1   SKU            16507 non-null  object             
 2   Item           17683 non-null  object             
 3   Variant        14338 non-null  object             
 4   Quantity       17663 non-null  float64            
 5   Delivery Date  11825 non-null  object             
 6   Delivery Time  11573 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(1), object(5)
memory usage: 1.1+ MB
None


# Aggregate based on monthly

In [5]:
monthly = (
    df
    .set_index("Order Date")
    .groupby(["SKU", pd.Grouper(freq="ME")])["Quantity"]
    .sum()
    .reset_index()
    .rename(columns={"Order Date": "Month"})
    .sort_values(["SKU", "Month"])
)

# Feature Engineering

In [6]:
monthly["month_num"] = monthly["Month"].dt.month

monthly["year"] = monthly["Month"].dt.year

monthly["sku_id"] = monthly["SKU"].astype("category").cat.codes

# demand lag by x months
monthly["lag_1"] = monthly.groupby("SKU")["Quantity"].shift(1)
monthly["lag_2"] = monthly.groupby("SKU")["Quantity"].shift(2)
monthly["lag_3"] = monthly.groupby("SKU")["Quantity"].shift(3)

# month sin/cos seasonality
monthly["month_sin"] = np.sin(2 * np.pi * monthly["month_num"] / 12)
monthly["month_cos"] = np.cos(2 * np.pi * monthly["month_num"] / 12)

# Get top 5 SKUs

In [56]:
top_k = 0

In [57]:
vol_df = (
    monthly.groupby("SKU", observed=True)["Quantity"]
    .sum()
    .rename("total_qty")
    .reset_index()
)

vol_df = vol_df.sort_values(["total_qty", "SKU"], ascending=[False, True], kind="mergesort").reset_index(drop=True)

vol_df.head()

Unnamed: 0,SKU,total_qty
0,SF50,5108.0
1,CU-1,1423.0
2,SC5,674.0
3,DR50,592.0
4,SC15,538.0


In [58]:
top5 = vol_df.head(top_k)["SKU"].tolist()
top5

[]

# Stratify SKUs into sales volume

In [59]:
rest_df = vol_df[~vol_df["SKU"].isin(top5)].copy()
rest_total = rest_df["total_qty"].sum()

rest_df["cum_share"] = rest_df["total_qty"].cumsum() / rest_total


In [60]:
bins   = [-np.inf, 0.70, 0.95, np.inf]
labels = ["highest", "medium", "rest"]

rest_tiers = pd.cut(
    rest_df["cum_share"],
    bins=bins,
    labels=labels,
    include_lowest=True
).astype("string")

In [61]:
solo_labels = pd.Series({sku: f"solo::{sku}"} for sku in top5)
solo_labels = pd.Series({sku: f"solo::{sku}" for sku in top5}, name="tier", dtype="string")

rest_labels = pd.Series(rest_tiers.values, index=rest_df["SKU"].values, name="tier", dtype="string")

sku_tier = pd.concat([solo_labels, rest_labels])
sku_tier = sku_tier.astype("string")

sku_tier.name = "tier"


In [62]:
tiered = monthly.merge(sku_tier.to_frame(), left_on="SKU", right_index=True, how="inner")


In [63]:
# # total volume per SKU (use your monthly/weekly aggregated frame)
# sku_vol = monthly.groupby("SKU")["Quantity"].sum().sort_values(ascending=False)

# # cumulative revenue/volume share
# cum_share = (sku_vol.cumsum() / sku_vol.sum())

# bins   = [-np.inf, 0.70, 0.95, np.inf]
# labels = ["highest", "medium", "rest"]

# sku_tier = pd.cut(cum_share, bins=bins, labels=labels, include_lowest=True).astype(str)
# sku_tier.name = "tier"

In [64]:
print(sku_tier.value_counts())
print((sku_tier.value_counts(normalize=True) * 100).round(2).astype(str) + "%")

tier
rest       835
medium     465
highest     38
Name: count, dtype: Int64
tier
rest       62.41%
medium     34.75%
highest     2.84%
Name: proportion, dtype: object


In [65]:
# tiered = monthly.merge(sku_tier, left_on="SKU", right_index=True, how="inner")

# Train Test Split

In [66]:
!pip install scikit-learn



In [67]:
from sklearn.model_selection import train_test_split

In [68]:
import xgboost as xgb

In [69]:
tiered["Quantity"] = tiered["Quantity"].clip(lower=1)
tiered["lag_1"] = tiered["lag_1"].clip(lower=1)

tiered["log_qty"] = np.log1p(tiered["Quantity"])


In [70]:
X = monthly[["month_num", "year", "sku_id","lag_1", "lag_2", "lag_3", "month_sin", "month_cos"]]
y = monthly["Quantity"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


# Training and Evaluation

In [71]:
def train_xgb_core_api(
    df_tier,
    feat=("month_num", "year", "sku_id", "lag_1", "lag_2", "lag_3", "month_sin", "month_cos"),
):
    # Drop rows that cannot form lag_1
    df_tier = df_tier.dropna(subset=["lag_1"]).reset_index(drop=True)

    # If too small, return 6-tuple with features so callers can unpack safely
    if len(df_tier) < 20:
        return None, None, None, None, None, list(feat)

    X = df_tier[list(feat)].to_numpy(dtype=float)
    y = df_tier["Quantity"].to_numpy(dtype=float)

    # 80/20 split with guards so validation is never empty
    split_idx = int(len(df_tier) * 0.8)
    if split_idx <= 0:
        split_idx = 1
    if split_idx >= len(df_tier):
        split_idx = len(df_tier) - 1

    X_tr, X_va = X[:split_idx], X[split_idx:]
    y_tr, y_va = y[:split_idx], y[split_idx:]

    dtr = xgb.DMatrix(X_tr, label=y_tr, feature_names=list(feat))
    dva = xgb.DMatrix(X_va, label=y_va, feature_names=list(feat))

    params = {
        "objective": "reg:squarederror",
        "eta": 0.1,
        "max_depth": 4,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "eval_metric": ["rmse", "mae"],
        "seed": 42,
    }
    ev = [(dtr, "train"), (dva, "valid")]
    bst = xgb.train(
        params,
        dtr,
        num_boost_round=500,
        evals=ev,
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    # Predict using best iteration if available
    best_iter = getattr(bst, "best_iteration", None)
    if best_iter is None:
        pred = bst.predict(dva)
    else:
        pred = bst.predict(dva, iteration_range=(0, best_iter + 1))

    mae = float(np.mean(np.abs(y_va - pred)))
    rmse = float(np.sqrt(np.mean((y_va - pred) ** 2)))
    mape = float(np.mean(np.abs((y_va - pred) / np.clip(y_va, 1e-8, None))) * 100)
    smape = float(100 * np.mean(2 * np.abs(y_va - pred) / (np.abs(y_va) + np.abs(pred) + 1e-8)))

    return bst, mae, rmse, mape, smape, list(feat)


In [72]:
from pathlib import Path

In [73]:
def slugify_tier(tier: str) -> str:
    s = re.sub(r"[^A-Za-z0-9]+", "_", str(tier)).strip("_")
    return s or "tier"

In [74]:
import re
from pathlib import Path

def slugify_tier(tier: str) -> str:
    s = re.sub(r"[^A-Za-z0-9]+", "_", str(tier)).strip("_")
    return s or "tier"

models = {}
metrics = []
future_all = []

cur_dir = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd()
save_dir = cur_dir / "models"
save_dir.mkdir(parents=True, exist_ok=True)

tier_list = tiered["tier"].dropna().astype(str).unique().tolist()

for tier in tier_list:
    sub = tiered.loc[tiered["tier"] == tier].copy()
    if sub.empty:
        print(f"[{tier}] skipped (no rows)")
        continue

    bst, mae, rmse, mape, smape, feat = train_xgb_core_api(sub)
    if bst is None:
        print(f"[{tier}] skipped (not enough data)")
        continue

    # Stable, safe filename per tier (e.g., solo__SKU_123)
    tier_slug = slugify_tier(tier)
    model_path = save_dir / f"xgb_sku_{tier_slug}_model.json"
    bst.save_model(model_path)
    models[tier] = (bst, feat)

    # best_iteration can be None if early stopping not triggered
    best_iter = getattr(bst, "best_iteration", None)
    if best_iter is None:
        print(f"[{tier}] MAE={mae:.3f}  RMSE={rmse:.3f}  MAPE={mape:.2f}%  SMAPE={smape:.2f}%")
    else:
        print(f"[{tier}] MAE={mae:.3f}  RMSE={rmse:.3f}  MAPE={mape:.2f}%  SMAPE={smape:.2f}%  (best_iter={best_iter})")

    # ---------- Build next-month features per SKU in this tier ----------
    sub = sub.sort_values(["SKU", "Month"])
    last_rows = sub.groupby("SKU", as_index=False).tail(1).reset_index(drop=True)

    def nth_from_end(s, n):
        import numpy as np
        return s.iloc[-n] if len(s) >= n else np.nan

    # Lags computed within this tier; each SKU appears in exactly one tier
    lag2_map = sub.groupby("SKU")["Quantity"].apply(lambda s: nth_from_end(s, 2))
    lag3_map = sub.groupby("SKU")["Quantity"].apply(lambda s: nth_from_end(s, 3))

    fut_month = pd.to_datetime(last_rows["Month"]) + pd.offsets.MonthEnd(1)
    fut_month_num = fut_month.dt.month

    future = pd.DataFrame({
        "SKU": last_rows["SKU"],
        "Month": fut_month,
        "month_num": fut_month_num,
        "year": fut_month.dt.year,
        "sku_id": last_rows["sku_id"],
        "lag_1": last_rows["Quantity"],
        "lag_2": last_rows["SKU"].map(lag2_map),
        "lag_3": last_rows["SKU"].map(lag3_map),
        "month_sin": np.sin(2 * np.pi * fut_month_num / 12),
        "month_cos": np.cos(2 * np.pi * fut_month_num / 12),
    })

    # Ensure exact feature order expected by this tier’s model
    for c in feat:
        if c not in future.columns:
            future[c] = np.nan
    future = future[list(feat)]

    dfut = xgb.DMatrix(future.to_numpy(dtype=float), feature_names=list(feat))

    # Respect early-stopping boundary if present
    if best_iter is None:
        preds = models[tier][0].predict(dfut)
    else:
        preds = models[tier][0].predict(dfut, iteration_range=(0, best_iter + 1))

    out = last_rows[["SKU"]].copy()
    out["Month"] = fut_month
    out["predicted_qty"] = preds
    out["tier"] = tier
    future_all.append(out[["SKU", "Month", "predicted_qty", "tier"]])

future_preds = pd.concat(future_all, ignore_index=True)
print(future_preds.head(20))


[rest] MAE=0.227  RMSE=0.477  MAPE=11.36%  SMAPE=15.15%  (best_iter=0)
[medium] MAE=1.596  RMSE=3.122  MAPE=90.37%  SMAPE=59.44%  (best_iter=4)
[highest] MAE=5.151  RMSE=6.476  MAPE=115.23%  SMAPE=54.23%  (best_iter=44)
                               SKU                     Month  predicted_qty  \
0                         10031400 2022-11-30 00:00:00+00:00            1.0   
1                         10031737 2023-01-31 00:00:00+00:00            1.0   
2                         10061281 2022-08-31 00:00:00+00:00            1.0   
3                         10077190 2023-03-31 00:00:00+00:00            1.0   
4                      10097234-AD 2025-03-31 00:00:00+00:00            1.0   
5                         10098455 2022-07-31 00:00:00+00:00            1.0   
6                         10152878 2023-07-31 00:00:00+00:00            1.0   
7                         10153754 2022-11-30 00:00:00+00:00            1.0   
8                         10178643 2023-08-31 00:00:00+00:00         