# DSA2040 END SEMESTER EXAM

## SECTION 2: DATA MINING

### TASK 2: CLUSTERING

In [8]:
# Necessary imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# 1. Load & Preprocess Data
# Load iris as pandas objects and keep handles to the working table, the
# list of feature column names, and the reference class labels.
iris = load_iris(as_frame=True)
df, feature_cols, true_labels = iris.frame, iris.feature_names, iris.target

# Scaling features: fit a MinMaxScaler on the feature columns and write the
# rescaled values back into the same columns of df.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df[feature_cols])
df[feature_cols] = scaled_features



In [9]:
# 2. K-Means Clustering (k=3)
# Fit a three-cluster model on the feature columns with a fixed seed and
# store each row's cluster id in a new column of df.
kmeans_3 = KMeans(n_clusters=3, n_init=10, random_state=42)
cluster_ids_k3 = kmeans_3.fit_predict(df[feature_cols])
df['cluster_k3'] = cluster_ids_k3

# Evaluate using Adjusted Rand Index (ARI): compare the cluster assignments
# against the true labels.
ari_k3 = adjusted_rand_score(true_labels, df['cluster_k3'])
print(f"ARI for k=3: {ari_k3:.4f}")



ARI for k=3: 0.7163


In [10]:
# 3. Experiment with k=2 and k=4
# k=3 is refit here as well so the three ARI scores can be printed side by
# side for comparison.
ari_scores = {}
for k in [2, 3, 4]:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    pred = km.fit_predict(df[feature_cols])
    ari_scores[k] = adjusted_rand_score(true_labels, pred)

print("\nARI scores for k=2, k=3, k=4:")
for k, score in ari_scores.items():
    print(f"k={k}: ARI = {score:.4f}")

# Elbow method to justify optimal k
# Fit one model per k in 1..6 and record its inertia_ for the curve.
inertia_values = []
k_values = range(1, 7)
for k in k_values:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(df[feature_cols])
    inertia_values.append(km.inertia_)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(k_values, inertia_values, marker='o')
# k is an integer count, so pin the ticks to the evaluated values instead of
# letting matplotlib choose fractional tick positions.
ax.set_xticks(list(k_values))
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Inertia")
ax.set_title("Elbow Method for Optimal k")

# Save first, then show: closing the figure right after savefig (as before)
# meant the plot was written to disk but never rendered in the notebook.
fig.savefig("iris_elbow_curve.png")
plt.show()
plt.close(fig)




ARI scores for k=2, k=3, k=4:
k=2: ARI = 0.5681
k=3: ARI = 0.7163
k=4: ARI = 0.6231


In [None]:
# 4. Visualizing clusters
# Scatter the third and fourth feature columns (petal length and petal width
# in scikit-learn's iris feature order) coloured by the k=3 cluster id.
x_col, y_col = feature_cols[2], feature_cols[3]

fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    data=df,
    x=x_col,
    y=y_col,
    hue='cluster_k3',
    palette="Set1",
    s=60,
    ax=ax,
)

# The feature columns were min-max scaled during preprocessing, so the raw
# column names ("... (cm)") would label the axes with a unit the plotted
# values no longer have. Replace the unit with an explicit scaling note.
ax.set_xlabel(x_col.replace(" (cm)", "") + " (min-max scaled)")
ax.set_ylabel(y_col.replace(" (cm)", "") + " (min-max scaled)")
ax.set_title("K-Means Clusters (k=3)")

# Save first, then show: closing the figure right after savefig (as before)
# meant the plot was written to disk but never rendered in the notebook.
fig.savefig("iris_clusters_k3.png")
plt.show()
plt.close(fig)

