<a href="https://colab.research.google.com/github/varshinihebbal/ElevateLabs_EDA_Task8/blob/main/task8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 0: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [4]:
# Step 1: Read the customer CSV into a DataFrame
# The path is relative, so it is resolved against the current working directory.
csv_path = "Mall_Customers.csv"
df = pd.read_csv(csv_path)

# Preview the first rows under a banner line
print("--- DataFrame Head ---", df.head(), sep="\n")

# Column names, non-null counts and dtypes (info() writes straight to stdout)
print("\n--- DataFrame Info ---")
df.info()

--- DataFrame Head ---
   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40

--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [5]:
# Step 2.1: Pick the two columns used as clustering features
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
X = df[feature_columns].to_numpy()

# Step 2.2: Standardize the features
# K-Means is distance-based, so both features should be on a comparable scale.
# The fitted scaler is kept so results can later be mapped back to original units.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

preview = X_scaled[:5]
print("\nFirst 5 scaled data points:")
print(preview)


First 5 scaled data points:
[[-1.73899919 -0.43480148]
 [-1.73899919  1.19570407]
 [-1.70082976 -1.71591298]
 [-1.70082976  1.04041783]
 [-1.66266033 -0.39597992]]


In [6]:
# Step 3: Elbow Method - record the inertia for K = 1..10 to help choose K
K_range = range(1, 11)
inertia = []

for k in K_range:
    # A fixed random_state keeps every fit reproducible across runs
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Draw the elbow curve (inertia against K) and write it to disk
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K_range, inertia, marker='o', linestyle='--')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia (Within-Cluster Sum of Squares)')
ax.set_xticks(K_range)
ax.grid(True)
fig.savefig('elbow_method.png')
plt.close(fig)  # the figure is saved to file and then closed, not shown inline

In [7]:
# Step 4.1: Train the final K-Means model with the chosen K (K=5)
optimal_k = 5
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', n_init=10, random_state=42)
kmeans.fit(X_scaled)
# Store each customer's cluster label as a new column on the DataFrame
df['Cluster'] = kmeans.labels_

# Step 4.2: Map the centroids from the standardized space back to original units
# The model was fitted on scaled data, so its centers are inverse-transformed
# with the same scaler to make them interpretable.
centroids_scaled = kmeans.cluster_centers_
centroids_original = scaler.inverse_transform(centroids_scaled)
centroid_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
centroids_df = pd.DataFrame(centroids_original, columns=centroid_columns)

# Step 4.3: Silhouette score, computed on the scaled features and assigned labels
score = silhouette_score(X_scaled, df['Cluster'])

summary_lines = [
    f"\nOptimal Number of Clusters (K): {optimal_k}",
    f"Silhouette Score: {score:.4f}",
    "\nCluster Centroids (Original Scale) for Interpretation:",
]
print("\n".join(summary_lines))
print(centroids_df)


Optimal Number of Clusters (K): 5
Silhouette Score: 0.5547

Cluster Centroids (Original Scale) for Interpretation:
   Annual Income (k$)  Spending Score (1-100)
0           55.296296               49.518519
1           86.538462               82.128205
2           25.727273               79.363636
3           88.200000               17.114286
4           26.304348               20.913043


In [8]:
# Step 4 (recap): Summarize the K=5 clustering without refitting
# This cell used to repeat the preceding cell line for line, fitting an identical
# model a second time (same data, same K, same random_state). It now reuses the
# already-fitted `kmeans` and only refits when that object is not the K=5 model.
# It still defines the same names and prints the same summary as before.
# NOTE(review): this cell is redundant with the previous one and could be removed.
optimal_k = 5
if kmeans.n_clusters != optimal_k:
    # `kmeans` is a leftover from another cell (the elbow loop ends on K=10),
    # so fit the K=5 model here with the same settings as the previous cell.
    kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=42, n_init=10).fit(X_scaled)

# Labels of the fitted model, one per row of X_scaled (and therefore of df)
df['Cluster'] = kmeans.labels_

# Centroids live in the scaled space; inverse-transform them for interpretability
centroids_scaled = kmeans.cluster_centers_
centroids_original = scaler.inverse_transform(centroids_scaled)
centroids_df = pd.DataFrame(centroids_original, columns=['Annual Income (k$)', 'Spending Score (1-100)'])

# Silhouette score is calculated on the scaled data and the assigned labels
score = silhouette_score(X_scaled, df['Cluster'])

print(f"\nOptimal Number of Clusters (K): {optimal_k}")
print(f"Silhouette Score: {score:.4f}")
print("\nCluster Centroids (Original Scale) for Interpretation:")
print(centroids_df)


Optimal Number of Clusters (K): 5
Silhouette Score: 0.5547

Cluster Centroids (Original Scale) for Interpretation:
   Annual Income (k$)  Spending Score (1-100)
0           55.296296               49.518519
1           86.538462               82.128205
2           25.727273               79.363636
3           88.200000               17.114286
4           26.304348               20.913043


In [9]:
# Step 5: Plot the customer segments together with their centroids
income_col = 'Annual Income (k$)'
spending_col = 'Spending Score (1-100)'

fig, ax = plt.subplots(figsize=(12, 8))

# One point per customer, colored by its assigned cluster label
ax.scatter(df[income_col], df[spending_col], c=df['Cluster'], cmap='viridis', s=60, alpha=0.8)

# Overlay the centroids as large red X markers
ax.scatter(centroids_df[income_col], centroids_df[spending_col],
           marker='X', s=300, color='red', label='Centroids')

ax.set_title(f'Customer Segments using K-Means (K={optimal_k})')
ax.set_xlabel(income_col)
ax.set_ylabel(spending_col)
ax.legend()
ax.grid(True)

# Save to file and close the figure; nothing is rendered inline
fig.savefig('kmeans_clusters.png')
plt.close(fig)

print("\nVisualization saved as 'kmeans_clusters.png'")


Visualization saved as 'kmeans_clusters.png'
