In [1]:
# train_decision_tree_customer.ipynb

import os
import pickle
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

# 1) Read the raw customer/sales export written by the EDA step
df = pd.read_csv("customer_sales_raw.csv")

# 2) Declare the predictor columns by type, then pull out label and features
num_cols = ["price", "quantity", "total_value", "age", "tenure_months"]
cat_cols = ["gender", "region", "segment", "product_name", "category", "sentiment"]
y = df["churn"]
# .copy() gives X its own data, independent of df
X = df[[*num_cols, *cat_cols]].copy()

# 3) Hold out 20% of the rows for testing; stratifying on y keeps the churn
#    proportion the same in the train and test parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 4) Preprocessing: a tree does not need scaled inputs, so the only steps are
#    filling missing values and one-hot encoding the categorical columns
numeric_tf = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
categorical_tf = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": categories not seen during fit do not raise at transform time
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
# Route each column group through its own sub-pipeline
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ]
)

# 5) Estimator plus a compact hyper-parameter grid meant to curb overfitting
dt = DecisionTreeClassifier(random_state=42)
pipe = Pipeline(steps=[("prep", preprocess), ("clf", dt)])

# Keys follow the "<step>__<param>" convention so they reach the "clf" step
param_grid = {
    "clf__max_depth": [None, 4, 6, 8, 10],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 5],
    "clf__class_weight": [None, "balanced"],
}

# 5-fold cross-validated search, ranked by ROC-AUC (more informative for churn)
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=0,
)

# 6) Run the search on the training split and keep the winning pipeline
#    (fit() returns the search object itself, so the lookup can be chained)
best_model = grid.fit(X_train, y_train).best_estimator_
print("Best params:", grid.best_params_)

# 7) Evaluate on the held-out test split
y_pred = best_model.predict(X_test)

# ROC-AUC is best-effort. Only the failure modes we expect are caught:
#   AttributeError - the estimator exposes no predict_proba
#   IndexError     - predict_proba returned a single column
#   ValueError     - roc_auc_score rejected the inputs
# Anything else is a real bug and is allowed to surface instead of being
# swallowed silently.
try:
    # Column 1 is used as the positive-class probability
    y_proba = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
except (AttributeError, IndexError, ValueError) as err:
    y_proba, auc = None, None
    # Say why the metric is missing rather than dropping it without a trace
    print(f"ROC-AUC unavailable: {err}")

acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")
if auc is not None:
    print(f"ROC-AUC: {auc:.3f}")

print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=3))

print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred))

# 8) Persist the selected model to disk with pickle
out_path = "models/customer_churn_dtree.pkl"
# Create the target folder first; exist_ok makes re-running the cell harmless
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "wb") as model_file:
    pickle.dump(best_model, model_file)

print(f"\n✅ Decision Tree model saved to {out_path}")


Best params: {'clf__class_weight': 'balanced', 'clf__max_depth': None, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 10}
Accuracy: 0.830
ROC-AUC: 0.867

Classification report:
              precision    recall  f1-score   support

           0      0.931     0.836     0.881      1203
           1      0.620     0.811     0.703       397

    accuracy                          0.830      1600
   macro avg      0.776     0.824     0.792      1600
weighted avg      0.854     0.830     0.837      1600


Confusion matrix:
[[1006  197]
 [  75  322]]

✅ Decision Tree model saved to models/customer_churn_dtree.pkl
