In [1]:
# train_logistic_customer.ipynb

import os
import pickle

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# 1) Load dataset prepared in eda_customer_sales.ipynb
#    The CSV is read relative to the current working directory.
#    Tunable settings are named here so they are easy to find and change.
DATA_PATH = "customer_sales_raw.csv"
TARGET_COL = "churn"
TEST_SIZE = 0.2
RANDOM_STATE = 42

df = pd.read_csv(DATA_PATH)

# 2) Features (X) and target (y)
y = df[TARGET_COL]

# The target must be 0/1. Check this up front so a bad file fails here with a
# clear message instead of deep inside the split or the model fit.
if y.isna().any():
    raise ValueError(
        f"Target column {TARGET_COL!r} has {int(y.isna().sum())} missing value(s); "
        "drop or fill them before training."
    )
unexpected_labels = set(y.unique()) - {0, 1}
if unexpected_labels:
    raise ValueError(
        f"Target column {TARGET_COL!r} must contain only 0/1, "
        f"found: {sorted(map(str, unexpected_labels))}"
    )

# Select feature columns (exclude obvious identifiers and text-only fields)
num_cols = ["price", "quantity", "total_value", "age", "tenure_months"]
cat_cols = ["gender", "region", "segment", "product_name", "category", "sentiment"]

# .copy() gives X its own data, independent of df.
X = df[num_cols + cat_cols].copy()

# 3) Train / Test split
#    stratify=y keeps the churn / no-churn proportions similar in both parts;
#    the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# 4) Preprocessing + Model (pipeline)
#    Built bottom-up: one branch per column type, a router that applies each
#    branch to its columns, then the classifier on top.

# Categorical branch: most-frequent imputation, then one-hot encoding
# (handle_unknown="ignore" so categories unseen during fit do not raise).
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: median imputation, then standard scaling.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Route each column group through its branch; "num" is listed before "cat".
column_branches = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
preprocess = ColumnTransformer(column_branches)

# Full estimator: preprocessing feeds a class-weighted logistic regression.
classifier = LogisticRegression(class_weight="balanced", max_iter=1000)
log_model = Pipeline([
    ("preprocess", preprocess),
    ("clf", classifier),
])

# 5) Train
#    Fits the preprocessing steps and the classifier on the training split only.
log_model.fit(X_train, y_train)

# 6) Evaluate on the held-out test split
y_pred = log_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy (Churn): {accuracy:.2f}")

# Accuracy on its own is hard to judge, so also show what always predicting
# the most common class would score, plus per-class precision / recall / F1.
baseline_accuracy = pd.Series(y_test).value_counts(normalize=True).max()
print(f"Majority-class baseline accuracy: {baseline_accuracy:.2f}")
print(classification_report(y_test, y_pred, digits=3))

# 7) Save model
#    The whole pipeline (preprocessing + classifier) is pickled as one object.
#    NOTE: unpickling can execute arbitrary code, so only load this file from
#    a trusted location.
MODEL_PATH = "models/customer_churn_logreg.pkl"
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
with open(MODEL_PATH, "wb") as f:
    pickle.dump(log_model, f)

print(f"✅ Logistic Regression model saved to {MODEL_PATH}")


Logistic Regression Accuracy (Churn): 0.53
✅ Logistic Regression model saved to models/customer_churn_logreg.pkl
