In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage


In [None]:
# Load the raw table: fields are separated by ';' and the markers '?' and 'na'
# are parsed as missing values.
X = pd.read_csv(
    '../../../household_power_consumption.csv',
    sep=';',
    na_values=['?', 'na'],
)

In [None]:
# Preview only the first rows instead of rendering the whole table.
X.head()

In [None]:
# Largest number of missing values found in any single column.
X.isna().sum().max()

In [None]:
# Drop the Date and Time columns before clustering.
# errors="ignore" keeps the cell re-runnable: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
X = X.drop(columns=["Date", "Time"], errors="ignore")

In [None]:
# Discard every row that contains at least one missing value (modifies X in place).
X.dropna(axis=0, how="any", inplace=True)

In [None]:
# Remove fully duplicated rows, keeping the first occurrence (modifies X in place).
X.drop_duplicates(subset=None, keep="first", inplace=True)

In [None]:
# Fit the scaler on the cleaned table, then rebind X to its standardised
# version (zero mean, unit variance per column).
# NOTE(review): from here on X is the transformer's output rather than the
# original DataFrame — presumably a NumPy array with default settings.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
# Project the standardised features onto their first two principal components.
# random_state is fixed because svd_solver='auto' may select the randomized
# solver, whose result would otherwise vary from run to run; deterministic
# solvers simply ignore the seed.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

kmeans

In [None]:
# Elbow method: fit K-Means on the 2-D PCA projection for K = 1..10 and
# record the inertia of each fitted model.
cluster_range = range(1, 11)
inertia_values = []
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_pca)
    inertia_values.append(kmeans.inertia_)

# Plot inertia against K; the bend in the curve suggests a reasonable K.
fig, ax = plt.subplots(figsize=(3, 3))
ax.plot(cluster_range, inertia_values, marker='o', color='b')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia')
ax.set_xticks(cluster_range)
ax.grid(True)
plt.show()

In [None]:
# Final model: K-Means with three clusters on the PCA projection.
# NOTE(review): K=3 is presumably read off the elbow plot — confirm.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_pca)
labels_kmeans = kmeans.labels_

# Scatter the two principal components, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=labels_kmeans,
    palette='Set2',
    s=50,
    marker='o',
    ax=ax,
)
ax.set_title('KMeans Clustering')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend(title='Cluster', loc='best')
plt.show()

hierarchical

dbscan

In [None]:
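# Disabled DBSCAN experiment, kept for reference.
# NOTE: `X_train` is not defined anywhere in this notebook; the standardised
# matrix `X` is presumably what was intended — fix before re-enabling.
# dbscan = DBSCAN(eps=1.5, min_samples=10)
# dbscan_labels = dbscan.fit_predict(X_train)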

In [None]:
# plt.scatter(X[:, 0], X[:, 1], c=dbscan_labels, cmap='viridis')
# plt.title('DBSCAN Clustering')
# plt.xlabel('Feature 1')
# plt.ylabel('Feature 2')
# plt.show()

In [None]:
# dbscan = DBSCAN(eps=1.5, min_samples=10)
# dbscan.fit(X)
# labels_dbscan = dbscan.labels_

# plt.figure(figsize=(5, 5))
# sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=labels_dbscan, palette='Set2', s=50, marker='o')
# plt.title('DBSCAN Clustering')
# plt.xlabel('Principal Component 1')
# plt.ylabel('Principal Component 2')
# plt.legend(title='Cluster', loc='best')
# plt.show()

silhouette score

In [None]:
# The silhouette coefficient needs every pairwise distance, so its cost grows
# quadratically with the number of rows. Score a fixed, seeded random subsample
# of the projection so the cell stays runnable and reproducible.
SILHOUETTE_SAMPLE_SIZE = 5_000  # set to None to score every row
sample_idx = np.random.RandomState(42).permutation(len(X_pca))[:SILHOUETTE_SAMPLE_SIZE]

silhouette_sklearn = silhouette_score(X_pca[sample_idx], labels_kmeans[sample_idx])
print(f"Silhouette Score (sklearn): {silhouette_sklearn:.4f}")


In [None]:
def silhouette_score_manual(X, labels):
    """Compute the mean silhouette coefficient with Euclidean distances.

    For every sample ``i``:

    * ``a_i`` is the mean distance from ``i`` to the *other* members of its
      own cluster,
    * ``b_i`` is the smallest mean distance from ``i`` to the members of any
      other cluster,
    * ``s_i = (b_i - a_i) / max(a_i, b_i)``.

    Conventions for the degenerate cases:

    * a sample that is alone in its cluster gets ``s_i = 0``;
    * if ``a_i`` and ``b_i`` are both zero, ``s_i = 0``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features) or (n_samples,)
        Sample coordinates. A 1-D input is treated as a single feature.
    labels : array-like of shape (n_samples,)
        Cluster label of each sample. Any values accepted by ``np.unique``
        work (including negative integers).

    Returns
    -------
    float
        Mean of ``s_i`` over all samples.

    Raises
    ------
    ValueError
        If ``X`` and ``labels`` disagree on the number of samples, or if
        fewer than two distinct labels are present (the score is undefined).
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    labels = np.asarray(labels).ravel()

    n = X.shape[0]
    if labels.shape[0] != n:
        raise ValueError(
            f"X has {n} samples but labels has {labels.shape[0]} entries"
        )

    # Re-encode the labels as 0..k-1 so np.bincount can aggregate per cluster.
    unique_clusters, codes = np.unique(labels, return_inverse=True)
    n_clusters = len(unique_clusters)
    if n_clusters < 2:
        raise ValueError(
            "silhouette score needs at least 2 distinct cluster labels, "
            f"got {n_clusters}"
        )
    cluster_sizes = np.bincount(codes, minlength=n_clusters)

    silhouette_scores = np.zeros(n)
    for i in range(n):
        own = codes[i]
        own_size = cluster_sizes[own]
        if own_size == 1:
            # Singleton cluster: a_i is undefined, the score stays 0.
            continue

        # Distances from sample i to every sample, summed per cluster.
        distances = np.linalg.norm(X - X[i], axis=1)
        distance_sums = np.bincount(codes, weights=distances, minlength=n_clusters)

        # The sample itself contributes distance 0, so dividing by
        # (size - 1) excludes it by position; other points that share its
        # coordinates are still counted.
        a_i = distance_sums[own] / (own_size - 1)

        mean_to_cluster = distance_sums / cluster_sizes
        mean_to_cluster[own] = np.inf  # never pick the sample's own cluster
        b_i = mean_to_cluster.min()

        denominator = max(a_i, b_i)
        if denominator > 0:
            silhouette_scores[i] = (b_i - a_i) / denominator
        # else: a_i == b_i == 0, the score stays 0.

    return np.mean(silhouette_scores)

# The from-scratch implementation visits every pair of points, so evaluate it
# on a fixed, seeded random subsample rather than on the full projection.
SILHOUETTE_SAMPLE_SIZE = 5_000  # set to None to score every row
sample_idx = np.random.RandomState(42).permutation(len(X_pca))[:SILHOUETTE_SAMPLE_SIZE]

silhouette_manual = silhouette_score_manual(X_pca[sample_idx], labels_kmeans[sample_idx])
print(f"Silhouette Score (from scratch): {silhouette_manual:.4f}")