In [1]:
import os
import pickle
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

# 1) Read the raw customer/sales table
df = pd.read_csv("customer_sales_raw.csv")

# 2) Feature groups and target
# Numeric and categorical columns are listed separately because they are
# preprocessed by different sub-pipelines further down.
num_cols = ["price", "quantity", "total_value", "age", "tenure_months"]
cat_cols = ["gender", "region", "segment", "product_name", "category", "sentiment"]
feature_cols = [*num_cols, *cat_cols]

# Copy the selected columns so X is independent of the raw frame.
X = df.loc[:, feature_cols].copy()
y = df["churn"]

# 3) Hold out 20% of the rows for testing.
# Stratifying on y keeps the churn rate the same in both partitions;
# the fixed random_state makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 4) Preprocessing
# Numeric features: fill missing values with the column median, then
# standardise (SVMs are sensitive to feature scale).
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_tf = Pipeline(steps=numeric_steps)

# Categorical features: fill missing values with the most frequent level,
# then one-hot encode; levels not seen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_tf = Pipeline(steps=categorical_steps)

# Route each column group through its own sub-pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ]
)

# 5) SVM classifier with grid search
# Probability estimates are deliberately OFF during the search:
# SVC(probability=True) runs an additional internal 5-fold cross-validation
# (Platt scaling) inside every single fit, and the "roc_auc" scorer ranks
# candidates with decision_function when the estimator provides one, so the
# calibration work would be paid 60 times (12 candidates x 5 folds) without
# influencing which candidate wins. Probabilities are enabled once, on the
# selected configuration, in step 6.
svm = SVC(probability=False, random_state=42)

pipe = Pipeline([
    ("prep", preprocess),
    ("clf", svm)
])

# 3 x 2 x 2 = 12 candidate configurations.
param_grid = {
    "clf__C": [0.1, 1, 10],
    "clf__kernel": ["linear", "rbf"],
    "clf__class_weight": [None, "balanced"]
}

grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring="roc_auc",   # model selection is driven by ranking quality
    n_jobs=-1,           # use every available core
    cv=5,
    verbose=0
)

# 6) Train
grid.fit(X_train, y_train)

# Turn probability estimates on for the winning pipeline and refit it in place
# on the full training split. best_model is the same object as
# grid.best_estimator_, so both expose predict_proba for the evaluation and
# for anyone who later loads the saved model.
best_model = grid.best_estimator_
best_model.set_params(clf__probability=True)
best_model.fit(X_train, y_train)
print("Best params:", grid.best_params_)

# 7) Evaluate on the held-out test split
# Column 1 of predict_proba is the score for the second entry of the model's
# classes_ (assumed to be the churn-positive label, e.g. 1 for 0/1 targets).
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Threshold-free ranking quality and plain hit rate.
auc = roc_auc_score(y_test, y_proba)
acc = accuracy_score(y_test, y_pred)

# Per-class breakdown; accuracy alone can hide poor recall on the rarer class.
report_text = classification_report(y_test, y_pred, digits=3)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {acc:.3f}")
print(f"ROC-AUC: {auc:.3f}")
print("\nClassification report:\n", report_text)
print("\nConfusion matrix:\n", cm)

# 8) Persist the fitted pipeline (preprocessing + classifier) with pickle
out_path = "models/customer_churn_svm.pkl"

# Create the target directory ("models") if it does not exist yet.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

with open(out_path, "wb") as model_file:
    pickle.dump(best_model, model_file)

print(f"\n✅ SVM model saved to {out_path}")


Best params: {'clf__C': 10, 'clf__class_weight': None, 'clf__kernel': 'rbf'}
Accuracy: 0.741
ROC-AUC: 0.552

Classification report:
               precision    recall  f1-score   support

           0      0.757     0.965     0.849      1203
           1      0.373     0.063     0.108       397

    accuracy                          0.741      1600
   macro avg      0.565     0.514     0.478      1600
weighted avg      0.662     0.741     0.665      1600


Confusion matrix:
 [[1161   42]
 [ 372   25]]

✅ SVM model saved to models/customer_churn_svm.pkl
