In [1]:
import os
os.environ["OMP_NUM_THREADS"] = "1"
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [2]:
# 1. Load Dataset
# Read the customer spending data from the CSV file into a DataFrame.
df = pd.read_csv('spending_l9_dataset.csv')

# Print the schema summary first; info() writes to stdout and returns None.
df.info()

# Keep the preview as the cell's final expression so the notebook actually
# renders it (a bare expression in the middle of a cell is discarded).
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CustomerID       200 non-null    int64 
 1   Age              200 non-null    int64 
 2   Income_$         200 non-null    int64 
 3   SpendingScore    200 non-null    int64 
 4   VisitsPerMonth   200 non-null    int64 
 5   OnlinePurchases  200 non-null    int64 
 6   Gender           200 non-null    object
 7   Region           200 non-null    object
dtypes: int64(6), object(2)
memory usage: 12.6+ KB


In [3]:
# Replace missing values in the two clustering features with each
# column's own median, then report the remaining null count per column.
for col in ("Income_$", "SpendingScore"):
    col_median = df[col].median()
    df[col] = df[col].fillna(col_median)

null_counts = df.isnull().sum()
print(null_counts)

CustomerID         0
Age                0
Income_$           0
SpendingScore      0
VisitsPerMonth     0
OnlinePurchases    0
Gender             0
Region             0
dtype: int64


In [4]:
# Feature matrix for clustering: income and spending score only.
X = df.loc[:, ["Income_$", "SpendingScore"]]

In [5]:
# Learn per-feature mean and variance from X, then standardize X with them.
# The fitted scaler is kept so results can be mapped back to original units.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [6]:
# Elbow check: fit K-Means for k = 1..10 on the standardized features and
# print the within-cluster sum of squared errors (inertia_) for each k.
print("Elbow Check (SSE)")

for k in range(1, 11):
    # Use a throwaway name rather than `kmeans`: the cells below store the
    # final model under `kmeans`, and re-running this cell must not replace
    # it with the k=10 fit.
    elbow_model = KMeans(n_clusters=k, random_state=42, n_init=10)
    elbow_model.fit(X_scaled)
    print(f"k={k} → SSE={elbow_model.inertia_:.2f}")

Elbow Check (SSE)
k=1 → SSE=400.00
k=2 → SSE=199.70
k=3 → SSE=77.01
k=4 → SSE=21.37
k=5 → SSE=17.93
k=6 → SSE=15.65
k=7 → SSE=13.88
k=8 → SSE=12.45
k=9 → SSE=11.06
k=10 → SSE=9.93


In [7]:
# Number of clusters used for the final model.
K = 4

# Fit the final K-Means model on the standardized features and take the
# cluster index it assigned to each row.
kmeans = KMeans(n_clusters=K, random_state=42, n_init=10)
kmeans.fit(X_scaled)
labels = kmeans.labels_

# Store the assignment next to the original customer attributes.
df["Cluster"] = labels
print("Cluster column added")

df.head()

Cluster column added


Unnamed: 0,CustomerID,Age,Income_$,SpendingScore,VisitsPerMonth,OnlinePurchases,Gender,Region,Cluster
0,1,28,33,78,14,9,Female,East,2
1,2,21,25,87,8,23,Male,North,2
2,3,23,24,88,13,10,Male,South,2
3,4,24,25,73,16,11,Female,West,2
4,5,20,23,88,17,16,Male,West,2


In [8]:
# Internal cluster-quality metrics, computed on the standardized features
# against the assigned labels.
sil_score = silhouette_score(X_scaled, labels)
dbi_score = davies_bouldin_score(X_scaled, labels)

print("=== CLUSTER EVALUATION ===")
for caption, value in (
    ("Silhouette Score :", sil_score),
    ("Davies–Bouldin   :", dbi_score),
):
    # Three decimals is enough for a quick read of either metric.
    print(caption, round(value, 3))

=== CLUSTER EVALUATION ===
Silhouette Score : 0.729
Davies–Bouldin   : 0.387


In [9]:
# The model's centroids live in standardized space; undo the scaling so
# they can be read in the original feature units.
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

# One row per cluster, labelled by cluster index, rounded for display.
centers_df = (
    pd.DataFrame(centers_original, columns=["Income_$", "SpendingScore"])
    .rename_axis("Cluster")
    .round(2)
)

print("=== CLUSTER CENTERS (Original Units) ===")
centers_df

=== CLUSTER CENTERS (Original Units) ===


Unnamed: 0_level_0,Income_$,SpendingScore
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,56.32,53.58
1,28.92,19.6
2,24.14,83.1
3,99.16,79.24


In [11]:
# Spot-check a few rows: the two clustering features with their assigned
# cluster. The fixed seed keeps the sampled rows the same on every run.
print("=== SANITY CHECK SAMPLE ===")
sanity_cols = ["Income_$", "SpendingScore", "Cluster"]
df.loc[:, sanity_cols].sample(n=3, random_state=42)

=== SANITY CHECK SAMPLE ===


Unnamed: 0,Income_$,SpendingScore,Cluster
95,53,61,0
15,19,86,2
30,27,80,2
