# 1. Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score

import warnings

warnings.filterwarnings("ignore")

### Set up

In [None]:
# Suppress all warnings so that cell output is not cluttered by them.
warnings.filterwarnings(action="ignore")
warnings.simplefilter(action="ignore")

# Lift the column limit when DataFrames are displayed.
pd.set_option("display.max_columns", None)
# Uncomment to lift the row limit as well:
# pd.set_option("display.max_rows", None)

# Shared plot theme for every figure in the notebook.
sns.set(
    style="whitegrid",
    palette="muted",
    font_scale=1.1,
)
pd.plotting.register_matplotlib_converters()
%matplotlib inline

# 2. Load Data

In [None]:
# Path of the CSV file to analyse; fill this in before running the cell.
filepath = ""
df = pd.read_csv(filepath_or_buffer=filepath)
# Preview the first five rows.
df.head(n=5)

# 3. Quick Data Check

In [None]:
# Dimensions of the dataset as (rows, columns).
print(df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line to the output.
df.info()
# Summary statistics, transposed so that each column becomes a row.
df.describe().T

# 4. EDA

In [None]:
# Count of null entries in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Number of rows that repeat an earlier row.
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

In [None]:
# Pairplot on a random sample of at most 200 rows.
sns.pairplot(df.sample(min(200, len(df))))
plt.show()

# Correlation heatmap.
# Only numeric columns are correlated: calling DataFrame.corr() on a frame
# that still contains non-numeric columns raises on pandas >= 2.0. Selecting
# the numeric columns explicitly works on every pandas version.
plt.figure(figsize=(10, 6))
numeric_corr = df.select_dtypes(include=[np.number]).corr()
sns.heatmap(numeric_corr, annot=True, cmap="coolwarm")
plt.show()

# 5. Data Preprocessing

In [None]:
# Keep only the numeric columns (non-numeric ones would have to be encoded
# before they could be used here).
df_num = df.select_dtypes(include="number").copy()

# Standardise the numeric features.
scaler = StandardScaler()
scaler.fit(df_num)
X_scaled = scaler.transform(df_num)

# Optional: project onto two principal components (used for visualization).
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# 6. KMeans Clustering

In [None]:
# Elbow Method: fit KMeans for k = 2..10 and record the inertia of each fit.
# n_init is set explicitly because its default differs between scikit-learn
# releases (10 in older versions, 'auto' in newer ones), which would make the
# curve and the final labels depend on the installed version.
inertia = []
K = range(2, 11)
for k in K:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X_scaled)
    inertia.append(km.inertia_)

plt.plot(K, inertia, "bx-")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.title("Elbow Method")
plt.show()

# Fit the final model. The cluster count is hard-coded to 3; adjust it to
# the elbow observed in the plot above.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels_kmeans = kmeans.fit_predict(X_scaled)

# 7. Other Clustering Models

In [None]:
# Agglomerative clustering with three clusters.
agg = AgglomerativeClustering(n_clusters=3).fit(X_scaled)
labels_agg = agg.labels_

# DBSCAN on the scaled features.
db = DBSCAN(eps=0.5, min_samples=5).fit(X_scaled)
labels_db = db.labels_

# 8. Evaluation

In [None]:
def evaluate_clustering(X, labels, name):
    """Print the silhouette and Davies-Bouldin scores of one clustering.

    Both scores are only defined when the number of distinct labels lies
    between 2 and ``n_samples - 1``; outside that range a short message is
    printed instead of letting the metric functions raise ``ValueError``.

    Every distinct value in ``labels`` is counted as a cluster, so a label
    such as ``-1`` is scored like any other cluster.

    Args:
        X: Feature matrix the clustering was computed on.
        labels: Cluster label of each sample, in the same order as ``X``.
        name: Display name of the clustering method, used as the prefix of
            the printed line.

    Returns:
        None. The result is written to stdout.
    """
    n_labels = len(set(labels))
    n_samples = len(labels)

    if n_labels < 2:
        print(f"{name}: Only one cluster detected")
        return

    # The metrics reject clusterings in which every sample is its own
    # cluster (n_labels must be at most n_samples - 1).
    if n_labels >= n_samples:
        print(
            f"{name}: {n_labels} clusters for {n_samples} samples, "
            "scores are undefined"
        )
        return

    sil = silhouette_score(X, labels)
    dbi = davies_bouldin_score(X, labels)
    print(f"{name}: Silhouette = {sil:.3f}, Davies-Bouldin = {dbi:.3f}")


# Score each clustering on the scaled features, in a fixed order.
for model_name, model_labels in (
    ("KMeans", labels_kmeans),
    ("Agglomerative", labels_agg),
    ("DBSCAN", labels_db),
):
    evaluate_clustering(X_scaled, model_labels, model_name)

# 9. Visualization

In [None]:
# One scatter panel per clustering, all drawn on the same 2-D PCA projection
# and coloured by cluster label.
plt.figure(figsize=(12, 5))

panels = (
    ("KMeans", labels_kmeans),
    ("Agglomerative", labels_agg),
    ("DBSCAN", labels_db),
)
for position, (panel_title, panel_labels) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=panel_labels, cmap="viridis")
    plt.title(panel_title)

plt.show()