In [11]:
import pandas as pd
import numpy as np
import os
os.environ["OMP_NUM_THREADS"] = "1"
import warnings
warnings.filterwarnings("ignore")
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score



In [12]:
# Load the raw spending dataset from the working directory.
df = pd.read_csv("spending_l9_dataset.csv")

# The two numeric columns used as clustering features.
feature_cols = ["Income_$", "SpendingScore"]

# Fill gaps in each feature with that column's own median.
for col in feature_cols:
    df[col] = df[col].fillna(df[col].median())

# Feature matrix for clustering.
X = df[feature_cols]

# Standardize the features; the fitted scaler is kept so results can later
# be mapped back to the original units.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print("Dataset Loaded and Scaled. First 5 rows of scaled data:")
print(X_scaled[:5])

Dataset Loaded and Scaled. First 5 rows of scaled data:
[[-0.62783049  0.72998073]
 [-0.89031514  1.07359091]
 [-0.92312573  1.11176982]
 [-0.89031514  0.53908619]
 [-0.95593631  1.11176982]]


In [13]:
# Elbow check: fit K-Means for k = 1..10 on the scaled features and record
# each fitted model's inertia (reported here as SSE).
print("=== ELBOW CHECK (SSE) ===")
sse_list = []
for k in range(1, 11):
    # Same seed and n_init for every k so the runs are comparable.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    sse = kmeans.inertia_
    sse_list.append(sse)
    print(f"k={k} → SSE={sse:.2f}")

=== ELBOW CHECK (SSE) ===
k=1 → SSE=400.00
k=2 → SSE=199.70
k=3 → SSE=77.01
k=4 → SSE=21.37
k=5 → SSE=17.93
k=6 → SSE=15.65
k=7 → SSE=13.88
k=8 → SSE=12.45
k=9 → SSE=11.06
k=10 → SSE=9.93


In [14]:

# 3) Model Training (Pick K)

# Number of clusters for the final model; change this based on your SSE results.
# NOTE(review): the recorded elbow output falls sharply up to k=4
# (SSE 77.01 at k=3 vs 21.37 at k=4) and flattens afterwards — confirm that
# K=3 is the intended choice.
CHOSEN_K = 3
final_model = KMeans(n_clusters=CHOSEN_K, random_state=42, n_init=10)
# fit_predict fits the model and stores one cluster label per row of X_scaled.
df['Cluster'] = final_model.fit_predict(X_scaled)


# 4) Evaluate Clustering

# Both metrics are computed on the scaled features the model was fit on.
# Silhouette: higher is better. Davies–Bouldin: lower is better.
sil_score = silhouette_score(X_scaled, df['Cluster'])
dbi_score = davies_bouldin_score(X_scaled, df['Cluster'])

print(f"\nChosen K: {CHOSEN_K}")
print(f"Silhouette Score : {sil_score:.3f}")
print(f"Davies–Bouldin   : {dbi_score:.3f}")


Chosen K: 3
Silhouette Score : 0.611
Davies–Bouldin   : 0.584


In [15]:

# 5) Cluster Centers (Original Units)

# The model was fit on the standardized matrix, so its centers are in scaled
# units; invert the scaling to report them in the original feature units.
centers_scaled = final_model.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

# One row per cluster, columns in the same order the scaler was fit on.
centers_df = pd.DataFrame(
    centers_original,
    columns=["Income_$", "SpendingScore"]
).round(2)
centers_df.index.name = "Cluster"

print("\n=== CLUSTER CENTERS (Original Units) ===")
print(centers_df)


# 6) Sanity Check & Save

print("\n=== SANITY CHECK (3 Samples) ===")
# Seed the draw so the same three rows are shown on every Restart & Run All,
# matching the fixed seed used for the K-Means fits.
print(df[["Income_$", "SpendingScore", "Cluster"]].sample(3, random_state=42))

# Save output: the full frame, including the new Cluster column.
df.to_csv("spending_labeled_clusters.csv", index=False)
print("\nLabeled dataset saved as: spending_labeled_clusters.csv")


=== CLUSTER CENTERS (Original Units) ===
         Income_$  SpendingScore
Cluster                         
0           37.81          71.19
1           32.43          22.45
2           97.75          78.08

=== SANITY CHECK (3 Samples) ===
     Income_$  SpendingScore  Cluster
45         23             80        0
156        96             68        2
79         56             58        0

Labeled dataset saved as: spending_labeled_clusters.csv
