In [1]:
# --- core ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings, joblib, pathlib
warnings.filterwarnings("ignore")

# --- scikit-learn / imblearn ---
from sklearn.metrics import (roc_auc_score, average_precision_score,
                              classification_report,
                              precision_recall_curve,
                              RocCurveDisplay, ConfusionMatrixDisplay)
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek          # for main model
from imblearn.pipeline import Pipeline as ImbPipeline
# NOTE: SMOTETomek requires imbalanced-learn ≥0.13

# --- hyper-parameter search ---
from skopt import BayesSearchCV
from skopt.space import Real, Integer            # search spaces

# --- XGBoost ---
from xgboost import XGBClassifier

# --- plotting theme ---
plt.style.use("seaborn-v0_8-whitegrid")
plt.rcParams["figure.dpi"] = 110

In [2]:
# Pull in the engineered table (creates `dataset`)
%run 02_features.ipynb

print("Dataset shape :", dataset.shape)
dataset.head()


(1067371, 8)
Cleaned data shape: (805620, 8)
User-level agg shape: (5881, 5)
Final user table table: (5881, 7)
✅ Extended feature table shape: (5881, 53)
✅ Modelling dataset shape: (5881, 57)
   CustomerID  did_repurchase_7d          first_date           last_date  \
0       12346               True 2009-12-14 08:34:00 2011-01-18 10:01:00   
1       12347              False 2010-10-31 14:20:00 2011-12-07 15:52:00   
2       12348              False 2010-09-27 14:59:00 2011-09-25 13:13:00   
3       12349              False 2010-04-29 13:20:00 2011-11-21 09:51:00   
4       12350              False 2011-02-02 16:01:00 2011-02-02 16:01:00   

   days_to_repurchase  total_orders  total_quantity  first_quantity  \
0                 4.0            12           74285              10   
1                37.0             8            3286              10   
2                80.0             5            2714              24   
3                18.0             4            1624               4

Unnamed: 0,CustomerID,did_repurchase_7d,first_date,last_date,days_to_repurchase,total_orders,total_quantity,first_quantity,first_revenue,diversity_first_day,...,country_United Arab Emirates,country_United Kingdom,country_Unspecified,country_West Indies,recency_2nd,freq_3d,monetary_3d,first_hour,first_dow,diversity_7d
0,12346,True,2009-12-14 08:34:00,2011-01-18 10:01:00,4.0,12,74285,10,45.0,1,...,False,True,False,False,0,3,90.0,8,0,2
1,12347,False,2010-10-31 14:20:00,2011-12-07 15:52:00,37.0,8,3286,10,12.5,40,...,False,False,False,False,0,1,611.53,14,6,40
2,12348,False,2010-09-27 14:59:00,2011-09-25 13:13:00,80.0,5,2714,24,6.96,20,...,False,False,False,False,0,1,222.16,14,0,20
3,12349,False,2010-04-29 13:20:00,2011-11-21 09:51:00,18.0,4,1624,4,29.96,46,...,False,False,False,False,0,1,1068.52,13,3,46
4,12350,False,2011-02-02 16:01:00,2011-02-02 16:01:00,0.0,1,197,12,17.4,17,...,False,False,False,False,0,1,334.4,16,2,17


In [3]:
# -------------------------------------
# 2 · feature / target matrices
# -------------------------------------
leak_cols = ["days_to_repurchase", "recency_2nd",
             "diversity_7d", "freq_3d", "monetary_3d"]

X = (dataset
      .drop(columns=["CustomerID", "first_date", "last_date",
                     "did_repurchase_7d"] + leak_cols))

y = dataset["did_repurchase_7d"].astype(int)

# keep chronological order (important for leakage control)
idx_sorted = dataset.sort_values("first_date").index
X = X.loc[idx_sorted];  y = y.loc[idx_sorted]

print("Predictor matrix :", X.shape)
print("Target distribution:\n", y.value_counts(normalize=True))


Predictor matrix : (5881, 48)
Target distribution:
 did_repurchase_7d
0    0.947968
1    0.052032
Name: proportion, dtype: float64


In [4]:
print("\n🔹 Baseline: GradientBoosting")

tscv = TimeSeriesSplit(n_splits=5, test_size=180)

gb_pipe = Pipeline([
    ("scale", StandardScaler()),
    ("gb"   , GradientBoostingClassifier(random_state=42))
])

cv_auc = cross_val_score(gb_pipe, X, y,
                         cv=tscv, scoring="roc_auc", n_jobs=-1)

print(f"5-fold ROC-AUC : {cv_auc.mean():.3f} ± {cv_auc.std():.3f}")

# hold-out = last split of TSCV
train_idx, test_idx = list(tscv.split(X))[-1]
gb_pipe.fit(X.iloc[train_idx], y.iloc[train_idx])

y_pred  = gb_pipe.predict(X.iloc[test_idx])
y_prob  = gb_pipe.predict_proba(X.iloc[test_idx])[:,1]

print(classification_report(y.iloc[test_idx], y_pred, digits=3))
print("Hold-out ROC-AUC :", roc_auc_score(y.iloc[test_idx], y_prob).round(3))



🔹 Baseline: GradientBoosting
5-fold ROC-AUC : 0.863 ± 0.099
              precision    recall  f1-score   support

           0      0.939     1.000     0.968       169
           1      0.000     0.000     0.000        11

    accuracy                          0.939       180
   macro avg      0.469     0.500     0.484       180
weighted avg      0.882     0.939     0.909       180

Hold-out ROC-AUC : 0.932


In [5]:
num_cols = X.select_dtypes("number").columns
cat_cols = X.select_dtypes("bool").columns     

pre = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", "passthrough",    cat_cols)
])

xgb = XGBClassifier(
        objective="binary:logistic",
        eval_metric="aucpr",          
        tree_method="hist",           
        learning_rate=0.05,
        n_estimators=400,
        max_depth=3,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=(y==0).sum()/(y==1).sum(),  
        random_state=42
)

pipe = ImbPipeline(steps=[
    ("pre",   pre),                           
    ("smote", SMOTETomek(random_state=42)),   
    ("clf",   xgb)                            
])

param_space = {
    "clf__n_estimators": Integer(200, 600),
    "clf__max_depth"   : Integer(3, 6),
    "clf__gamma"       : Real(1e-3, 5, prior="log-uniform")
}

tscv = TimeSeriesSplit(n_splits=5, test_size=180)

opt = BayesSearchCV(
        pipe, param_space,
        cv          = tscv,
        scoring     = "average_precision",
        n_iter      = 30,     
        n_points    = 3,       
        n_jobs      = -1,
        random_state= 42,
        verbose     = 0
)

opt.fit(X, y)
print(f"Best CV PR-AUC : {opt.best_score_:.3f}")
print("Best params  :", opt.best_params_)
train_idx, test_idx = list(tscv.split(X))[-1]

best_pipe = opt.best_estimator_
y_prob = best_pipe.predict_proba(X.iloc[test_idx])[:,1]


prec, rec, thr = precision_recall_curve(y.iloc[test_idx], y_prob)
f1  = 2*prec*rec/(prec+rec+1e-9)
thr_opt = thr[f1.argmax()]
print(f"optimal threshold = {thr_opt:.2f}")

y_pred = (y_prob >= thr_opt).astype(int)

print("\n", classification_report(y.iloc[test_idx], y_pred, digits=3))
print("Hold-out PR-AUC :", average_precision_score(y.iloc[test_idx], y_prob).round(3))



Best CV PR-AUC : 0.334
Best params  : OrderedDict({'clf__gamma': 4.999999999999999, 'clf__max_depth': 3, 'clf__n_estimators': 200})
optimal threshold = 0.58

               precision    recall  f1-score   support

           0      0.988     0.953     0.970       169
           1      0.529     0.818     0.643        11

    accuracy                          0.944       180
   macro avg      0.759     0.885     0.806       180
weighted avg      0.960     0.944     0.950       180

Hold-out PR-AUC : 0.652


In [6]:
pathlib.Path("models").mkdir(exist_ok=True)
joblib.dump(best_pipe, "models/xgb_smote_pipeline.pkl")
print("✅  Pipeline saved to models/xgb_smote_pipeline.pkl")


✅  Pipeline saved to models/xgb_smote_pipeline.pkl
