In [None]:
# Core libraries: NumPy and pandas for data handling, seaborn and matplotlib for plotting. All warnings are silenced for the rest of the notebook.
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load Mall_Customers.csv into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv("Mall_Customers.csv")

In [None]:
# Display the freshly loaded DataFrame to inspect its columns and values.
df

In [None]:
# Convert the categorical 'Genre' column into integer labels.
# The encoder is bound to a name first, then fitted and applied in one step;
# the encoded values overwrite the original column in `df`.
from sklearn.preprocessing import LabelEncoder

genre_encoder = LabelEncoder()
df['Genre'] = genre_encoder.fit_transform(df['Genre'])

In [None]:
# Display the DataFrame again to check the encoded 'Genre' column.
df

In [None]:
# Build the feature matrix used for clustering.
# Columns are selected by name rather than by position (`df.iloc[:, 3:]`):
# a later cell adds a 'Cluster' column to `df`, so re-running a positional
# slice after that cell would silently pull the cluster labels into the features.
# NOTE(review): assumes columns 3 onward of the CSV are exactly these two — confirm.
FEATURE_COLUMNS = ['Annual Income (k$)', 'Spending Score (1-100)']
x = df[FEATURE_COLUMNS]
x

In [None]:
# Standardize the feature matrix so every column is on a comparable scale.
# NOTE(review): the clustering cells visible in this notebook fit on `x`,
# not on `x_scaled` — confirm which one is intended.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [None]:
# Scatter plot of the two features before any clustering is applied.
income = x['Annual Income (k$)']
spending = x['Spending Score (1-100)']
plt.title('Unclustered Data')
sns.scatterplot(x=income, y=spending)

In [None]:
# Import the two clustering estimators used in the sections below.
from sklearn.cluster import KMeans, AgglomerativeClustering

## KMeans Clustering

In [None]:
# Instantiate a 4-cluster K-Means model.
# random_state is fixed (matching the other KMeans cells that use random_state=1)
# so the centroid initialisation, and therefore the labels and inertia shown
# in the following cells, are reproducible across kernel restarts.
km = KMeans(n_clusters=4, random_state=1)

In [None]:
# Fit the model on the feature matrix `x` and display the cluster label assigned to each row.
km.fit_predict(x)

In [None]:
# Sum of squared errors (SSE) of the fitted model: the sum of squared
# distances from each sample to its nearest centroid. K-Means tries to minimize it.
km.inertia_

In [None]:
# Elbow method: fit K-Means for k = 1..15 and record the SSE (inertia) of each fit.
# random_state is fixed so the curve is identical on every run, and `fit` is
# used instead of `fit_predict` because the labels are not needed here.
sse = []
for k in range(1, 16):
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(x)
    sse.append(km.inertia_)

In [None]:
# Display the SSE values collected for k = 1..15.
sse

In [None]:
# Elbow plot: SSE against the number of clusters tried (k = 1..15).
cluster_counts = range(1, 16)
sns.lineplot(x=cluster_counts, y=sse)
plt.title('Elbow Method for Optimal k')
plt.xlabel('Cluster')
plt.ylabel('SSE')
plt.show()

The elbow of the SSE curve is at k = 5, so five clusters is a reasonable choice.

In [None]:
# Second method for choosing k, as an alternative to the elbow method:
# the silhouette score.
from sklearn.metrics import silhouette_score

In [None]:
# Silhouette analysis: for k = 2..15, cluster `x` with K-Means (fixed seed)
# and record the silhouette score of the resulting labelling.
silh = []
for k in range(2, 16):
    km = KMeans(n_clusters=k, random_state=1)
    labels = km.fit_predict(x)
    silh.append(silhouette_score(x, labels))

In [None]:
# Display the silhouette scores collected for k = 2..15.
silh

In [None]:
# Line plot of the silhouette score for each k tried (k = 2..15).
# The title spelling is corrected ("Silhouette", previously "Silouette").
plt.plot(range(2, 16), silh, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs Number of Clusters')
plt.show()

The silhouette score is also highest at k = 5, which agrees with the elbow method.
Five is therefore the best number of clusters for this data.

In [None]:
# Bar chart of the same silhouette scores (k = 2..15).
# Axis labels and a title are added so the figure can be read on its own,
# and plt.show() keeps the BarContainer repr out of the cell output.
plt.bar(range(2, 16, 1), silh)
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score per Number of Clusters')
plt.show()

In [None]:
# Final K-Means model: 5 clusters, with a fixed random_state so the result is reproducible.
km = KMeans(n_clusters=5, random_state=1)

In [None]:
# Fit the 5-cluster model on `x` and keep the cluster label of each row in `labels`.
labels = km.fit_predict(x) 

In [None]:
# Display the labels stored on the fitted model.
km.labels_

In [None]:
# Coordinates of the fitted cluster centres, kept in `cent` for plotting and displayed here.
cent = km.cluster_centers_
cent

In [None]:
# Scatter plot of the customers coloured by K-Means cluster, with the centroids overlaid.
# `hue` with an explicit list palette replaces the raw matplotlib `c=` keyword:
# seaborn then treats the cluster ids as discrete categories and builds a legend,
# instead of colouring them along a continuous colormap with no legend.
cluster_palette = sns.color_palette('tab10', n_colors=len(cent))
plt.title('Clustered Data')
sns.scatterplot(
    x=x['Annual Income (k$)'],
    y=x['Spending Score (1-100)'],
    hue=labels,
    palette=cluster_palette,
)
# Centroid columns follow the column order of `x` (income first, then spending score).
sns.scatterplot(x=cent[:, 0], y=cent[:, 1], s=200, color='red', label='Centroid')
plt.legend(title='Cluster')
plt.show()

In [None]:
# Rows of `df` whose K-Means label is 0 (boolean mask built from `labels`).
df[labels==0]

## Agglomerative Clustering (Hierarchical Clustering)

In [None]:
# Import the hierarchical-clustering tools: AgglomerativeClustering (already imported in an earlier cell) and SciPy's hierarchy module for the dendrogram.
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

In [None]:
# Dendrogram of the customers, built from a Ward linkage of the feature matrix `x`.
linkage_matrix = sch.linkage(x, method='ward')

plt.figure(figsize=(8,5))
dendrogram = sch.dendrogram(linkage_matrix)
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean Distances')
plt.show()

In [None]:
# Fit agglomerative (hierarchical) clustering on `x` with 5 clusters, Ward linkage
# and Euclidean distance; `y_hc` holds the cluster label returned for each row.
# NOTE(review): the `metric` keyword is only accepted by newer scikit-learn
# releases (older ones call it `affinity`) — confirm the installed version.
hc = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='ward')
y_hc = hc.fit_predict(x)

In [None]:
# Display the agglomerative cluster label of each row.
y_hc

In [None]:
# Attach the agglomerative cluster label of each customer to `df`.
df['Cluster'] = y_hc

# Scatter the customers, one colour per cluster.
# The cluster count is read from the fitted model instead of a hard-coded 5,
# so the loop and the title stay in sync with the `n_clusters` used to build `hc`.
n_found = hc.n_clusters_

plt.figure(figsize=(8,6))
for cluster in range(n_found):
    members = y_hc == cluster
    # Features are picked by name so they always match the axis labels below.
    plt.scatter(
        x.loc[members, 'Annual Income (k$)'],
        x.loc[members, 'Spending Score (1-100)'],
        label=f'Cluster {cluster + 1}'
    )

plt.title(f"Clusters of Customers (Agglomerative) - {n_found} Clusters")
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

# Rich display of the labelled DataFrame (last expression) instead of print().
df

In [None]:
# Compare the two models by silhouette score on the same feature matrix `x`:
# `labels` comes from the 5-cluster K-Means fit, `y_hc` from agglomerative clustering.
kmeans_silhouette = silhouette_score(x, labels)
agglomerative_silhouette = silhouette_score(x, y_hc)

print("\n--- Model Comparison ---")
print(f"K-Means Silhouette Score: {kmeans_silhouette:.3f}")
print(f"Agglomerative Clustering Silhouette Score: {agglomerative_silhouette:.3f}")