In [1]:
# train_kmeans_customer.ipynb

import os
import pickle
import pandas as pd
from sklearn.cluster import KMeans

# 1) Read the standardized customer table written by eda_customer_sales.ipynb
#    (that earlier step is expected to have produced 'customer_sales_scaled.csv').
df = pd.read_csv("customer_sales_scaled.csv")

# 2) Restrict clustering to the numeric, already-standardized features.
#    Extend this list if further numeric features are added upstream.
num_cols = ["price", "quantity", "total_value", "age", "tenure_months"]

# Safety: keep only rows that have a value in every selected feature.
# The result is a new frame that retains df's row labels, so the surviving
# rows can still be looked up in df through X.index.
X = df.loc[:, num_cols].dropna()

# 3) Fit a 3-cluster KMeans on the selected features.
#    random_state is fixed so that re-running the cell reproduces the same fit.
kmeans_model = KMeans(
    n_clusters=3,
    n_init=10,
    random_state=42,
).fit(X)  # fit() returns the fitted estimator itself

# 4) (Optional) Assign clusters for inspection
# X.index holds the labels of the rows that survived dropna(), so selecting
# them from df lines each row up with its entry in kmeans_model.labels_.
df_clusters = df.loc[X.index].copy()
df_clusters["cluster"] = kmeans_model.labels_
print("Cluster assignments (first 10):", df_clusters["cluster"].head(10).tolist())
# sep="" on the table prints: each heading already ends in a newline, and
# print's default " " separator would indent the first line of the table by
# one space, pushing its header out of alignment with the rows beneath it.
print("\nCluster sizes:\n", df_clusters["cluster"].value_counts().sort_index(), sep="")
print("\nInertia (sum of squared distances):", kmeans_model.inertia_)

# Optional diagnostics: cross-tabulate clusters against each label column
# that is actually present in the data; absent columns are skipped.
for label_col, heading in (
    ("churn", "\nCluster vs Churn:\n"),
    ("segment", "\nCluster vs Segment:\n"),
):
    if label_col in df_clusters.columns:
        print(heading, pd.crosstab(df_clusters["cluster"], df_clusters[label_col]), sep="")

# 5) Save model
# Create the output directory if it is not there yet, then serialize the
# fitted estimator into it.
os.makedirs("models", exist_ok=True)
with open("models/customer_kmeans.pkl", "wb") as model_file:
    pickle.dump(kmeans_model, model_file)

# NOTE(review): unpickling can execute arbitrary code — only load this
# artifact from a location you trust.
print("✅ KMeans model saved to models/customer_kmeans.pkl")


Cluster assignments (first 10): [1, 2, 2, 1, 0, 0, 1, 2, 2, 1]

Cluster sizes:
 cluster
0    3266
1    3324
2    1410
Name: count, dtype: int64

Inertia (sum of squared distances): 23346.25844777483

Cluster vs Churn:
 churn       0    1
cluster           
0        2468  798
1        2513  811
2        1034  376

Cluster vs Segment:
 segment  Consumer  Corporate  Small Business
cluster                                     
0            1015       1172            1079
1            1053       1226            1045
2             447        519             444
✅ KMeans model saved to models/customer_kmeans.pkl
