In [10]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score

# 1. Read the customer records from disk
# Source file for this assignment: customers_l9_dataset.csv
# 2. Shorten the two verbose column headers straight after loading
# (shorter names were requested by the assignment)
short_names = {
    "Annual Income ($)": "Income_$",
    "Spending Score (1-100)": "SpendingScore",
}
df = pd.read_csv("customers_l9_dataset.csv").rename(columns=short_names)

# 3. Pick out the two columns used for clustering
features = ["Income_$", "SpendingScore"]
selected = df[features]

# Replace missing values with each column's median.
# fillna is not in-place here, so it hands back a new frame and df is untouched.
X = selected.fillna(selected.median())

# 4. Standardize both features so neither one dominates the other
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# 5. Elbow check: fit K-Means for k = 1..10 and print the SSE (inertia) of
# each fit, so the k where the improvement levels off can be read from the log
print("--- My Elbow Check Results ---")
for k in range(1, 11):
    model = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    sse = model.inertia_
    print(f"k={k} | SSE={sse:.2f}")

# 6. Fit the final model with K=5
# 5 was picked because the SSE values start to level off at that point
K = 5
final_km = KMeans(n_clusters=K, n_init=10, random_state=42)
cluster_labels = final_km.fit_predict(X_scaled)
df["Cluster"] = cluster_labels

# 7. Score the clustering on the scaled features
assigned = df["Cluster"]
sil = silhouette_score(X_scaled, assigned)
dbi = davies_bouldin_score(X_scaled, assigned)

print("\n--- Performance Metrics ---")
print(f"Silhouette Score : {sil:.3f}")
print(f"Davies–Bouldin   : {dbi:.3f}")

# 8. Cluster centers, mapped back to the original units
# (inverse_transform undoes the scaling applied by `scaler`)
centers = scaler.inverse_transform(final_km.cluster_centers_)
centers_df = pd.DataFrame(centers, columns=features).rename_axis("Cluster")

print("\n--- Where the Centers Are ---")
print(centers_df.round(2))

# 9. Sanity Check - Looking at 3 fixed spot-check customers
# The row labels are hard-coded (not random), so the check is repeatable.
# Labels that are absent from the index are skipped: .loc with a list of
# labels raises KeyError if any requested label is missing, which would
# abort the run before the results are saved.
print("\n--- Checking 3 Customers ---")
spot_check_rows = [row for row in (5, 80, 155) if row in df.index]
print(df.loc[spot_check_rows, ["Income_$", "SpendingScore", "Cluster"]])

# 10. Write the labeled data to CSV (the row index is left out of the file)
output_path = "spending_labeled_clusters.csv"
df.to_csv(output_path, index=False)
print("\nDone! I saved the file as spending_labeled_clusters.csv")

--- My Elbow Check Results ---
k=1 | SSE=400.00
k=2 | SSE=269.69
k=3 | SSE=157.70
k=4 | SSE=108.92
k=5 | SSE=65.57
k=6 | SSE=55.06
k=7 | SSE=44.86
k=8 | SSE=37.23
k=9 | SSE=32.39
k=10 | SSE=29.98

--- Performance Metrics ---
Silhouette Score : 0.555
Davies–Bouldin   : 0.572

--- Where the Centers Are ---
         Income_$  SpendingScore
Cluster                         
0           55.30          49.52
1           86.54          82.13
2           25.73          79.36
3           88.20          17.11
4           26.30          20.91

--- Checking 3 Customers ---
     Income_$  SpendingScore  Cluster
5          17             76        2
80         54             51        0
155        78             89        1

Done! I saved the file as spending_labeled_clusters.csv
