Program with functions for finding the best DBSCAN epsilon and min_samples

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler


Functions for computing and plotting k-NN distances

In [None]:
def compute_knn_distances(X, k=5):
    """Return the ascending-sorted distance from each sample to its k-th neighbour.

    Args:
        X: Samples to fit the neighbour index on and then query.
        k: Number of neighbours requested; the last (k-th) distance column
            is the one kept.

    Returns:
        1-D array with one k-th-neighbour distance per sample, sorted
        ascending.
    """
    knn_index = NearestNeighbors(n_neighbors=k).fit(X)
    # NOTE(review): X is both the fitted set and the query set, so per the
    # scikit-learn docs each sample presumably counts itself (distance 0)
    # as one of its k neighbours -- confirm this is intended.
    per_sample_distances = knn_index.kneighbors(X)[0]
    kth_distance = per_sample_distances[:, k - 1]
    return np.sort(kth_distance)

def plot_knn_distances(distances):
    """Show a line plot of the given k-NN distances.

    Args:
        distances: Sequence of distances to draw, in the order given.
    """
    plt.plot(distances)
    # Axis labels and title are independent of each other; set them, then
    # display the figure.
    plt.xlabel('Data Points')
    plt.ylabel('k-NN Distance')
    plt.title('k-NN Distance Plot')
    plt.show()

Functions for determining the best epsilon using the silhouette score and the best min_samples using a sensitivity test

In [None]:
def find_best_epsilon(X, distances, percentiles=(0.80, 0.85, 0.90, 0.95), min_samples=5):
    """Pick the DBSCAN ``eps`` candidate that yields the best silhouette score.

    Each entry of ``percentiles`` selects one candidate epsilon from
    ``distances`` by position. DBSCAN is run once per candidate and the
    candidate with the highest silhouette score wins.

    Args:
        X: Samples passed to ``DBSCAN.fit`` and ``silhouette_score``.
        distances: Indexable sequence of k-NN distances; positions are only
            meaningful as percentiles if it is sorted ascending.
        percentiles: Fractions of ``len(distances)`` used as positions.
            Values outside [0, 1] are clamped to the first/last element.
        min_samples: ``min_samples`` forwarded to DBSCAN for every candidate.

    Returns:
        ``(best_epsilon, best_score)``. ``(None, -1)`` when ``distances`` is
        empty or no candidate produced a clustering that can be scored.
    """
    n_distances = len(distances)
    if n_distances == 0:
        # Nothing to pick a candidate from.
        return None, -1

    best_score = -1
    best_epsilon = None
    for percentile in percentiles:
        # round(n * p) equals n when p == 1.0, which is one past the end;
        # clamp so the top percentile maps to the largest distance.
        position = int(round(n_distances * percentile))
        position = min(max(position, 0), n_distances - 1)
        epsilon = distances[position]
        if epsilon <= 0:
            # DBSCAN requires a strictly positive eps; duplicate points can
            # produce a zero k-distance.
            continue
        labels = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X).labels_
        # silhouette_score only accepts 2 .. n_samples - 1 distinct labels.
        # Labels are passed unfiltered, so whatever label DBSCAN gives to
        # noise counts as a group of its own here.
        if 1 < len(set(labels)) < len(labels):
            score = silhouette_score(X, labels)
            if score > best_score:
                best_score = score
                best_epsilon = epsilon
    return best_epsilon, best_score

def test_min_samples(X, best_epsilon, min_samples_values=(5, 10, 15, 20)):
    """Sweep DBSCAN ``min_samples`` at a fixed epsilon and keep the best one.

    Args:
        X: Samples passed to ``DBSCAN.fit`` and ``silhouette_score``.
        best_epsilon: ``eps`` held fixed for every run, e.g. the first value
            returned by ``find_best_epsilon``. ``None`` means no usable
            epsilon was found.
        min_samples_values: Candidate ``min_samples`` values to try.

    Returns:
        ``(best_params, best_score)`` where ``best_params`` is
        ``(best_epsilon, min_samples)`` for the highest silhouette score.
        ``(None, -1)`` when ``best_epsilon`` is ``None`` or no candidate
        produced a clustering that can be scored.
    """
    best_score = -1
    best_params = None
    if best_epsilon is None:
        # find_best_epsilon signals "nothing found" with None; DBSCAN would
        # reject it, so report "nothing found" here as well.
        return best_params, best_score

    for min_samples in min_samples_values:
        labels = DBSCAN(eps=best_epsilon, min_samples=min_samples).fit(X).labels_
        # silhouette_score only accepts 2 .. n_samples - 1 distinct labels.
        if 1 < len(set(labels)) < len(labels):
            score = silhouette_score(X, labels)
            if score > best_score:
                best_score = score
                best_params = (best_epsilon, min_samples)
    return best_params, best_score

End result

In [None]:
# Demo run on synthetic data: 4312 uniformly random samples with 3 features.
X = np.random.random_sample((4312, 3))

# k-distance curve with the default k (5), shown for visual inspection.
distances = compute_knn_distances(X)
plot_knn_distances(distances)

# Stage 1: choose epsilon; the score from this stage is discarded.
best_epsilon, _ = find_best_epsilon(X, distances)
# Stage 2: hold that epsilon fixed and sweep min_samples.
best_params, best_score = test_min_samples(X, best_epsilon)

print("Best params (epsilon, min_samples):", best_params)
print("Best Silhouette Score:", best_score)