In [None]:
# 1. Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import linear_sum_assignment
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, silhouette_score
from sklearn.preprocessing import StandardScaler

# 2. Load the Iris dataset and wrap features / targets in DataFrames
iris = datasets.load_iris()
target_names = iris.target_names
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.DataFrame({'species': iris.target})

print("First 5 rows of the dataset:")
# NOTE(review): `display` is not imported in this file; presumably it is
# supplied by the notebook runtime this cell was exported from.
display(X.head(5))

# 3. Normalize the features (the scaler is fitted on the same frame it transforms)
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# 4. Apply K-Means clustering (k=3) on the scaled features; the seed is fixed
#    so repeated runs give the same cluster ids
kmeans = KMeans(n_clusters=3, n_init='auto', random_state=42).fit(X_scaled)

# Record each sample's cluster id next to its features, and the readable
# species name next to its integer target code
X['cluster'] = kmeans.labels_
y['true_species'] = [target_names[code] for code in y['species']]

# 5. Compare K-Means cluster assignments with actual species
# Rows of the matrix are true species codes, columns are K-Means cluster ids.
print("\nConfusion Matrix:")
cm = confusion_matrix(y['species'], X['cluster'])
print(cm)

# K-Means cluster ids are arbitrary, so they have to be aligned with the
# species codes before an accuracy figure means anything.
# A per-cluster majority vote is not guaranteed to be one-to-one: two clusters
# can vote for the same species, leaving another species never predicted.
# Instead, pick the one-to-one (species, cluster) pairing that maximizes the
# total matched count in the confusion matrix.
species_idx, cluster_idx = linear_sum_assignment(cm, maximize=True)

# Matrix row/column positions are used directly as label values; this assumes
# species codes and cluster ids are both the integers 0..k-1.
mapping = {
    int(cluster): int(species)
    for species, cluster in zip(species_idx, cluster_idx)
}

# Translate every sample's cluster id into its aligned species code
X['pred_species'] = X['cluster'].map(mapping)

# Fraction of samples whose aligned cluster agrees with the true species
accuracy = accuracy_score(y['species'], X['pred_species'])
print(f"\nClustering Accuracy: {accuracy * 100:.2f}%")

# 6. Visualize clusters using Petal Length vs Petal Width
# The raw (unscaled) measurements in columns 2 and 3 are plotted, coloured by
# the cluster id K-Means assigned to each sample.
plt.figure(figsize=(8,6))
cluster_ax = sns.scatterplot(
    x=iris.data[:, 2],  # Petal length
    y=iris.data[:, 3],  # Petal width
    hue=kmeans.labels_,
    palette='coolwarm',
    edgecolor='k',
)
# seaborn returns the Axes it drew on, so label it directly
cluster_ax.set_title("K-Means Clustering on Iris Dataset (k=3)")
cluster_ax.set_xlabel("Petal Length (cm)")
cluster_ax.set_ylabel("Petal Width (cm)")
cluster_ax.legend(title='Cluster')
plt.show()

# 7. Visualize actual species for comparison
# Colour by species *name* rather than by integer code so that seaborn builds
# the legend entries itself. Passing only `labels=` to plt.legend pairs the
# names positionally with whatever artists happen to be on the axes, which
# does not reliably match each name to its colour.
species_labels = np.asarray(target_names)[iris.target]

plt.figure(figsize=(8,6))
sns.scatterplot(
    x=iris.data[:, 2],  # Petal length
    y=iris.data[:, 3],  # Petal width
    hue=species_labels,
    hue_order=list(target_names),  # keep legend order equal to the target codes
    palette='viridis',
    edgecolor='k'
)
plt.title("Actual Species Distribution (Petal Length vs Petal Width)")
plt.xlabel("Petal Length (cm)")
plt.ylabel("Petal Width (cm)")
plt.legend(title='Species')
plt.show()

# 8. Evaluate clustering quality with Silhouette Score
# Scored on the same scaled features the model was fitted on.
sil_score = silhouette_score(X_scaled, labels=kmeans.labels_)
print(f"\nSilhouette Score: {sil_score:.4f}")

# 9. Discussion
# Fixed summary text; it is not derived from the numbers computed above.
discussion_text = """
• K-Means was fairly effective at identifying the natural groups (clusters) in the Iris dataset.
• The algorithm correctly separates 'Setosa' flowers since they are clearly distinct in feature space.
• However, it partially overlaps 'Versicolor' and 'Virginica' because their measurements are more similar.
• Silhouette score and visualization confirm that K-Means finds reasonable clusters,
  though it cannot perfectly match the true species due to natural class overlap.
"""
print("\n📘 Discussion:")
print(discussion_text)