"""
Phase 5: Advanced Modeling (Naïve Evaluation)

⚠️ IMPORTANT MODEL VALIDITY WARNING ⚠️

The results in this notebook show unrealistically high performance
(ROC-AUC ≈ 1.0). This is caused by temporal data leakage due to:

• Random train/test splitting
• Features computed using future transaction information
• No enforcement of churn label availability windows

This notebook is intentionally retained to demonstrate:
WHY naïve evaluation is dangerous in churn prediction.

❗ These results are NOT used for final model selection ❗

Temporal leakage is formally diagnosed and corrected in:
- Notebook 06: Temporal Validation
- Notebook 07: Rolling Cross-Validation
"""


In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix,
    roc_curve
)

import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Notebook-wide display configuration.
# max_columns=None lifts pandas' column cap so wide frames are rendered in full.
pd.set_option("display.max_columns", None)
# `sns.set` is only an alias of `sns.set_theme` and is documented as a candidate
# for removal; call the preferred interface directly (same arguments, same effect).
sns.set_theme(style="whitegrid")

print("Libraries loaded successfully")


Libraries loaded successfully


In [12]:
# Read the processed customer feature table; the path is relative to the
# notebook's working directory.
features_csv = "../data/processed/customer_features.csv"
df = pd.read_csv(features_csv)

# Report (rows, columns), then leave the first five records as the cell output.
print("Dataset Shape: {}".format(df.shape))
df.head()


Dataset Shape: (4312, 22)


Unnamed: 0,customerid,first_purchase,last_purchase,frequency,total_transactions,monetary_value,avg_order_value,total_quantity,avg_quantity_per_txn,unique_products,unique_invoices,min_price,max_price,avg_price,price_std,country_count,recency_days,customer_tenure_days,days_since_first_purchase,days_since_last_purchase,avg_days_between_purchases,churn
0,12346.0,2009-12-14 08:34:00,2010-06-28 13:53:00,11,33,206.36,6.253333,70,2.121212,26,11,1.0,7.49,6.253333,1.682971,1,164,196,360,164,17.818182,1
1,12347.0,2010-10-31 14:20:00,2010-12-07 14:57:00,2,71,162.95,2.29507,828,11.661972,70,2,0.38,12.75,2.29507,1.869887,1,2,37,39,2,18.5,0
2,12348.0,2010-09-27 14:59:00,2010-09-27 14:59:00,1,20,14.39,0.7195,373,18.65,20,1,0.29,1.45,0.7195,0.431856,1,73,0,73,73,0.0,0
3,12349.0,2010-04-29 13:20:00,2010-10-28 08:23:00,3,102,875.34,8.581765,993,9.735294,90,3,0.42,250.0,8.581765,31.299379,1,42,181,224,42,60.333333,0
4,12351.0,2010-11-29 15:23:00,2010-11-29 15:23:00,1,21,49.46,2.355238,261,12.428571,21,1,0.42,12.75,2.355238,2.735753,1,10,0,10,10,0.0,0


In [13]:
# Columns kept out of the feature matrix: the customer identifier, the two raw
# purchase timestamps, and the target itself.
drop_cols = ["customerid", "first_purchase", "last_purchase", "churn"]

# NOTE(review): in the df.head() preview, recency_days equals
# days_since_last_purchase on every row shown (likewise avg_order_value/avg_price
# and frequency/unique_invoices) — possibly duplicated features; verify.
y = df["churn"]
X = df.drop(columns=drop_cols)

print(f"Feature matrix shape: {X.shape}")
print("Churn distribution:")
# Class shares (fractions, not counts), rounded to three decimals.
churn_share = y.value_counts(normalize=True).round(3)
print(churn_share)


Feature matrix shape: (4312, 18)
Churn distribution:
churn
0    0.669
1    0.331
Name: proportion, dtype: float64


In [14]:
# Random 70/30 split, stratified on the churn label so both partitions keep the
# same class ratio; the fixed seed makes the partition repeatable.
# This is the random split that the notebook header lists as a leakage source.
split_kwargs = dict(test_size=0.3, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")


Train size: 3018
Test size: 1294


In [15]:
# Fit the scaler on the training partition only, then apply that same fitted
# transform to both partitions so test rows never influence the statistics.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [16]:
# Logistic regression trained on the standardized features.
# class_weight="balanced" re-weights the classes; max_iter is raised to 1000.
lr_params = {
    "max_iter": 1000,
    "class_weight": "balanced",
    "random_state": 42,
}
log_reg = LogisticRegression(**lr_params).fit(X_train_scaled, y_train)

# Hard labels feed the classification report; column 1 of predict_proba is the
# probability of label 1 (churn) and feeds ROC-AUC.
y_pred_lr = log_reg.predict(X_test_scaled)
y_prob_lr = log_reg.predict_proba(X_test_scaled)[:, 1]

roc_lr = roc_auc_score(y_test, y_prob_lr)

print("Logistic Regression ROC-AUC:", round(roc_lr, 4))
print(classification_report(y_test, y_pred_lr))


Logistic Regression ROC-AUC: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       866
           1       0.99      1.00      1.00       428

    accuracy                           1.00      1294
   macro avg       1.00      1.00      1.00      1294
weighted avg       1.00      1.00      1.00      1294



In [17]:
# Depth-limited decision tree, trained on the unscaled feature matrix.
dt = DecisionTreeClassifier(
    class_weight="balanced",
    max_depth=6,
    min_samples_split=20,
    random_state=42,
).fit(X_train, y_train)

# Predicted labels for the report; churn-class (label 1) probability for ROC-AUC.
dt_pred = dt.predict(X_test)
dt_prob = dt.predict_proba(X_test)[:, 1]

roc_dt = roc_auc_score(y_test, dt_prob)

print("Decision Tree ROC-AUC:", round(roc_dt, 4))
print(classification_report(y_test, dt_pred))


Decision Tree ROC-AUC: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       866
           1       1.00      1.00      1.00       428

    accuracy                           1.00      1294
   macro avg       1.00      1.00      1.00      1294
weighted avg       1.00      1.00      1.00      1294



In [18]:
# Random forest of 200 depth-limited trees on the unscaled feature matrix;
# n_jobs=-1 requests all available cores.
rf_params = dict(
    n_estimators=200,
    max_depth=8,
    min_samples_split=10,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# Predicted labels for the report; churn-class (label 1) probability for ROC-AUC.
rf_pred = rf.predict(X_test)
rf_prob = rf.predict_proba(X_test)[:, 1]

roc_rf = roc_auc_score(y_test, rf_prob)

print("Random Forest ROC-AUC:", round(roc_rf, 4))
print(classification_report(y_test, rf_pred))


Random Forest ROC-AUC: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       866
           1       1.00      1.00      1.00       428

    accuracy                           1.00      1294
   macro avg       1.00      1.00      1.00      1294
weighted avg       1.00      1.00      1.00      1294



In [19]:
# Gradient boosting with 150 shallow trees and a 0.05 learning rate, trained on
# the unscaled feature matrix. No class_weight is passed here, unlike the three
# models above.
gb = GradientBoostingClassifier(
    random_state=42,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=150,
).fit(X_train, y_train)

# Predicted labels for the report; churn-class (label 1) probability for ROC-AUC.
gb_pred = gb.predict(X_test)
gb_prob = gb.predict_proba(X_test)[:, 1]

roc_gb = roc_auc_score(y_test, gb_prob)

print("Gradient Boosting ROC-AUC:", round(roc_gb, 4))
print(classification_report(y_test, gb_pred))


Gradient Boosting ROC-AUC: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       866
           1       1.00      1.00      1.00       428

    accuracy                           1.00      1294
   macro avg       1.00      1.00      1.00      1294
weighted avg       1.00      1.00      1.00      1294



In [20]:
# Test-set ROC-AUC per model, in the order the models were trained above.
auc_by_model = {
    "Logistic Regression": roc_lr,
    "Decision Tree": roc_dt,
    "Random Forest": roc_rf,
    "Gradient Boosting": roc_gb,
}

# One row per model, with scores rounded to four decimals.
results = pd.DataFrame({
    "Model": list(auc_by_model),
    "ROC_AUC": [round(auc, 4) for auc in auc_by_model.values()],
})

# Cell output: the comparison table ordered from highest to lowest ROC-AUC.
results.sort_values("ROC_AUC", ascending=False)


Unnamed: 0,Model,ROC_AUC
0,Logistic Regression,1.0
1,Decision Tree,1.0
2,Random Forest,1.0
3,Gradient Boosting,1.0


In [21]:
from pathlib import Path

# Make sure the output directory is present; no error if it already exists.
Path("../models").mkdir(exist_ok=True)

# Destination file -> fitted object. The "_naive" suffix marks these artifacts
# as products of this notebook's naïve evaluation.
naive_artifacts = {
    "../models/logistic_regression_naive.pkl": log_reg,
    "../models/decision_tree_naive.pkl": dt,
    "../models/random_forest_naive.pkl": rf,
    "../models/gradient_boosting_naive.pkl": gb,
    "../models/scaler_naive.pkl": scaler,
}
for target_path, fitted_obj in naive_artifacts.items():
    joblib.dump(fitted_obj, target_path)

print("Naïve models saved successfully")


Naïve models saved successfully


"""
Final Interpretation – Notebook 05

All models achieved near-perfect ROC-AUC scores.

These results are INVALID due to temporal data leakage caused by:
• Random splitting of time-dependent customer behavior
• Inclusion of features derived from future activity
• No enforcement of churn observation windows

This notebook demonstrates why naïve modeling is misleading.

Temporal validation and leakage-safe evaluation are implemented in:
- Notebook 06: Temporal Validation
- Notebook 07: Rolling Cross-Validation

Only leakage-corrected results are considered for final deployment.
"""


"""
FINAL NOTE – MODEL VALIDITY (PHASE 5)

All models achieve ROC-AUC ≈ 1.0 with near-perfect precision and recall.

This performance is NOT realistic and is caused by:
• Temporal data leakage
• Random train/test splitting
• Features derived using future transaction behavior

These results are intentionally retained to demonstrate
why naïve evaluation is dangerous in churn prediction.

✔ This notebook fulfills Phase 5 (Advanced Modeling)
❌ These models are NOT used for deployment

Leakage-safe evaluation is performed in:
- Notebook 06: Temporal Validation
- Notebook 07: Rolling Cross-Validation
"""
