Imports + data (Iris as a no-download dataset)

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.preprocessing import StandardScaler

# Load Iris as a single DataFrame holding the feature columns plus "target".
iris = load_iris(as_frame=True)
df = iris.frame

# Separate the feature matrix used for clustering from the labels, which are
# kept only for after-the-fact comparison and never passed to the model.
feature_columns = [col for col in df.columns if col != "target"]
X = df[feature_columns]
y_true = df.loc[:, "target"]

display(df.head())


Scale features

In [None]:
# Standardize every feature column. KMeans is distance-based, so features
# with larger numeric ranges would otherwise dominate the cluster assignment.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


Elbow method (choose k)

In [None]:
# Fit KMeans for each candidate k and record the inertia so the "elbow"
# (the point of diminishing returns) can be read off the curve.
inertias = []
K = range(2, 10)  # candidate cluster counts; reused by the silhouette cell
for k in K:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)

fig, ax = plt.subplots()
ax.plot(list(K), inertias, marker="o")
ax.set_xlabel("k")
ax.set_ylabel("Inertia (within-cluster SSE)")
ax.set_title("Elbow Method")
fig.tight_layout()

# savefig does not create missing directories, so make sure ../images exists
# first; otherwise this cell fails on a fresh checkout under Restart & Run All.
elbow_path = Path("../images/elbow.png")
elbow_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(elbow_path, dpi=150)
plt.show()


Silhouette scores (second opinion on k)

In [None]:
# Score each candidate k by mean silhouette (higher is better) as a second
# opinion alongside the elbow plot.
sil_scores = []
for k in K:
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    sil_scores.append(silhouette_score(X_scaled, km.labels_))

fig, ax = plt.subplots()
ax.plot(list(K), sil_scores, marker="o")
ax.set_xlabel("k")
ax.set_ylabel("Silhouette score")
ax.set_title("Silhouette Analysis")
fig.tight_layout()

# savefig does not create missing directories, so make sure ../images exists
# first; otherwise this cell fails on a fresh checkout under Restart & Run All.
silhouette_path = Path("../images/silhouette.png")
silhouette_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(silhouette_path, dpi=150)
plt.show()

# argmax returns the first maximum, so ties resolve to the smallest such k.
best_k = K[int(np.argmax(sil_scores))]
best_k


Final KMeans clustering and 2D visualization (PCA)

In [None]:
k = 3  # Iris often separates well at 3; or use best_k from above
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# PCA is used only to draw the clusters in 2D; the clustering itself was
# fitted on the full scaled feature space above.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

fig, ax = plt.subplots()
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=clusters, palette="tab10", ax=ax)
ax.set_title(f"KMeans Clusters (k={k}) on PCA(2D)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig.tight_layout()

# savefig does not create missing directories, so make sure ../images exists
# first; otherwise this cell fails on a fresh checkout under Restart & Run All.
scatter_path = Path(f"../images/kmeans_pca_k{k}.png")
scatter_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(scatter_path, dpi=150)
plt.show()

print("Silhouette:", silhouette_score(X_scaled, clusters))


Compare to true species (purely for reference)

In [None]:
# The true species labels were never shown to KMeans; they are used here only
# to gauge how well the unsupervised clusters line up with them.
# adjusted_rand_score is imported in the notebook's top import cell.
ari = adjusted_rand_score(y_true, clusters)
print("Adjusted Rand Index vs true labels:", ari)

# Rows are true classes, columns are cluster ids. Cluster ids are arbitrary,
# so look for one dominant count per row rather than a matching diagonal.
pd.crosstab(y_true, clusters, rownames=["True"], colnames=["Cluster"])


Imports + data (Iris as a no-download dataset)