In [None]:
# Install hdbscan into the kernel's environment; it is imported in the next code cell.
%pip install hdbscan

Imports

In [None]:
import numpy as np
from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import hdbscan
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score

# Load reduced features
FEATURE_FILES = ["pca.npy", "umap.npy", "autoencoder.npy"]

# Read each saved array, join them column-wise into a single matrix,
# then standardize the combined matrix with StandardScaler.
features_list = list(map(np.load, FEATURE_FILES))
features = StandardScaler().fit_transform(np.concatenate(features_list, axis=1))

Finding Optimal Values

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans, KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.datasets import make_blobs  # For sample data (replace with your data)

# Synthetic stand-in data: 300 samples drawn around 4 blob centres
# (replace with your actual data). The second return value (true blob ids) is unused.
your_data, _ = make_blobs(
    n_samples=300,
    centers=4,
    random_state=42,
)


# 1. MiniBatchKMeans
# Silhouette sweep: fit one model per candidate k (2 through 10) and score its labels.
mbk_cluster_counts = range(2, 11)
silhouette_scores_mbkmeans = [
    silhouette_score(
        your_data,
        MiniBatchKMeans(n_clusters=n_clusters, random_state=42).fit(your_data).labels_,
    )
    for n_clusters in mbk_cluster_counts
]

# Score as a function of k.
plt.plot(mbk_cluster_counts, silhouette_scores_mbkmeans)
plt.title("Silhouette Method for MiniBatchKMeans")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Silhouette Score")
plt.show()

# 2. KMeans (for comparison - MiniBatchKMeans is often used for larger datasets)
# Same sweep as above with full-batch KMeans: one fit and one silhouette score per k.
kmeans_cluster_counts = range(2, 11)
silhouette_scores_kmeans = []
for n_clusters in kmeans_cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(your_data)
    silhouette_scores_kmeans.append(silhouette_score(your_data, kmeans.labels_))

# Score as a function of k.
plt.plot(kmeans_cluster_counts, silhouette_scores_kmeans)
plt.title("Silhouette Method for KMeans")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Silhouette Score")
plt.show()


# 3. Gaussian Mixture Models (GMM)
# Fit one mixture per component count (2 through 10), then read both
# information criteria off each fitted model.
component_counts = range(2, 11)
fitted_gmms = [
    GaussianMixture(n_components=n_components, random_state=42).fit(your_data)
    for n_components in component_counts
]
bics = [model.bic(your_data) for model in fitted_gmms]
aics = [model.aic(your_data) for model in fitted_gmms]

# Both criteria on one set of axes.
for curve, curve_name in ((bics, "BIC"), (aics, "AIC")):
    plt.plot(component_counts, curve, label=curve_name)
plt.title("BIC and AIC for GMM")
plt.xlabel("Number of Components")
plt.ylabel("Information Criterion")
plt.legend()
plt.show()


# 4. Hierarchical Clustering (AgglomerativeClustering)
# Cluster once per candidate k (2 through 10), then score every labelling
# with both the silhouette and the Davies-Bouldin index.
hier_cluster_counts = range(2, 11)
hier_labelings = [
    AgglomerativeClustering(n_clusters=n_clusters).fit_predict(your_data)
    for n_clusters in hier_cluster_counts
]
silhouette_scores_hierarchical = [
    silhouette_score(your_data, labeling) for labeling in hier_labelings
]
davies_bouldin_scores_hierarchical = [
    davies_bouldin_score(your_data, labeling) for labeling in hier_labelings
]

# Both metrics share one set of axes.
for curve, curve_name in (
    (silhouette_scores_hierarchical, 'Silhouette'),
    (davies_bouldin_scores_hierarchical, 'Davies-Bouldin'),
):
    plt.plot(hier_cluster_counts, curve, label=curve_name)
plt.title("Silhouette and Davies-Bouldin for Hierarchical Clustering")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Score")
plt.legend()
plt.show()

# 5. DBSCAN (requires different approach for parameter tuning)
# Example: grid-search epsilon and min_samples, keeping the pair whose labelling
# has the highest silhouette score.
eps_values = np.arange(0.5, 2.0, 0.1)  # Example range for epsilon
min_samples_values = range(2, 6)  # Example range for min_samples
best_score = -1
best_eps = None
best_min_samples = None

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(your_data)
        # DBSCAN marks noise with the label -1, which is not a cluster. Count only
        # real clusters so that "everything is noise" and "one cluster plus noise"
        # are both skipped instead of being scored as a two-cluster result.
        n_clusters = np.unique(labels[labels != -1]).size
        if n_clusters < 2:
            continue
        # Noise points are still passed to silhouette_score, which treats -1 as
        # one more label.
        score = silhouette_score(your_data, labels)
        if score > best_score:
            best_score = score
            best_eps = eps
            best_min_samples = min_samples

if best_eps is None:
    # Nothing in the grid qualified; say so rather than reporting None parameters.
    print("No DBSCAN setting in the grid produced at least two clusters.")
else:
    print(f"Best DBSCAN parameters: eps={best_eps}, min_samples={best_min_samples}, Silhouette Score={best_score}")


# Note:  Replace `your_data` with your actual data.  The sample data is just for demonstration.
# Also, adjust the parameter ranges (e.g., k values, eps range, min_samples range) as needed for your data.

MiniBatchKMeans

In [None]:
# Fit Mini-Batch K-Means with two clusters and take the per-sample labels
# from the fitted model.
kmeans = MiniBatchKMeans(n_clusters=2, random_state=42).fit(features)
kmeans_labels = kmeans.labels_

# Report the silhouette score of this labelling.
sil_score = silhouette_score(X=features, labels=kmeans_labels)
print(f"Silhouette Score: {sil_score:.4f}")

# Persist the labels.
np.save("cluster_kmeans.npy", kmeans_labels)

Plot

In [None]:
# Visualize Clusters
# Scatter the first two feature columns, coloured by Mini-Batch K-Means label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=features[:, 0],
    y=features[:, 1],
    hue=kmeans_labels,
    palette="viridis",
    legend=None,
    ax=ax,
)
ax.set_title("Mini-Batch K-Means Clustering")
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
plt.show()

Gaussian Mixture Model

In [None]:
# Fit a two-component Gaussian mixture and use each sample's predicted
# component as its cluster label.
gmm = GaussianMixture(
    n_components=2,
    random_state=42,
)
gmm_labels = gmm.fit_predict(features)

# Report the silhouette score of this labelling.
sil_score = silhouette_score(X=features, labels=gmm_labels)
print(f"Silhouette Score: {sil_score:.4f}")

# Persist the labels.
np.save("cluster_gmm.npy", gmm_labels)

Plot

In [None]:
# Scatter the first two feature columns, coloured by GMM label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=features[:, 0],
    y=features[:, 1],
    hue=gmm_labels,
    palette="viridis",
    legend=None,
    ax=ax,
)
ax.set_title("GMM Clustering")
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
plt.show()

Hierarchical Clustering

In [None]:
# Agglomerative clustering into two clusters; the labels are read from the
# fitted estimator.
hierarchical = AgglomerativeClustering(n_clusters=2).fit(features)
hierarchical_labels = hierarchical.labels_

# Report the silhouette score of this labelling.
sil_score = silhouette_score(X=features, labels=hierarchical_labels)
print(f"Silhouette Score: {sil_score:.4f}")

# Persist the labels.
np.save("cluster_hierarchical.npy", hierarchical_labels)

Plot

In [None]:
# Scatter the first two feature columns, coloured by hierarchical-clustering label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=features[:, 0],
    y=features[:, 1],
    hue=hierarchical_labels,
    palette="viridis",
    legend=None,
    ax=ax,
)
ax.set_title("Hierarchical Clustering")
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
plt.show()

HDBSCAN

In [None]:
# Density-based clustering with HDBSCAN (min_cluster_size=100).
dbscan = hdbscan.HDBSCAN(min_cluster_size=100)
dbscan_labels = dbscan.fit_predict(features)

# Score the labels produced by this model. Passing any other label array
# (e.g. a leftover `labels` variable from another cell) would raise or
# report a score for the wrong clustering.
# Note: HDBSCAN labels noise points -1, and silhouette_score counts -1 as a label.
if len(set(dbscan_labels)) > 1:
    sil_score = silhouette_score(features, dbscan_labels)
    print(f"Silhouette Score: {sil_score:.4f}")
else:
    print("DBSCAN found only one cluster, silhouette score not applicable.")

# Persist the labels.
np.save("cluster_dbscan.npy", dbscan_labels)

Plot

In [None]:
# Scatter the first two feature columns, coloured by the density-based cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=features[:, 0],
    y=features[:, 1],
    hue=dbscan_labels,
    palette="viridis",
    legend=None,
    ax=ax,
)
ax.set_title("DBSCAN Clustering")
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
plt.show()

In [None]:
# Final status message; the label arrays themselves are written by the np.save
# calls in the clustering cells.
print("Clustering completed and results saved.")