# DBSCAN Clustering with PyCaret


## 1. Install and Import Libraries

In [3]:
# Install PyCaret if not already installed
!pip install pycaret[full] --quiet

# Additional imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs, make_moons, make_circles
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, adjusted_rand_score, calinski_harabasz_score
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings('ignore')

# Plot appearance and reproducibility for everything that follows
plt.style.use('seaborn-v0_8-whitegrid')
np.random.seed(42)  # seeds the global NumPy RNG used later via np.random.uniform
print("Base libraries imported successfully!")

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Import PyCaret
# NOTE(review): the star import is presumably what provides setup, create_model,
# assign_model and plot_model to the cells below - no other definition is visible.
import pycaret
from pycaret.clustering import *

print("PyCaret imported successfully!")
print(f"PyCaret version: {pycaret.__version__}")

## 2. Generate Synthetic Data

In [None]:
# Build four toy datasets, each with a different cluster geometry

# Blob clusters: three compact groups
X_blobs, y_blobs = make_blobs(
    n_samples=300, centers=3, cluster_std=0.6, random_state=42
)

# Moon-shaped clusters
X_moons, y_moons = make_moons(n_samples=300, noise=0.05, random_state=42)

# Concentric circles
X_circles, y_circles = make_circles(
    n_samples=300, noise=0.05, factor=0.5, random_state=42
)

# Tighter blobs plus 50 uniformly scattered points, labelled -1 as noise
X_noise, y_noise = make_blobs(
    n_samples=250, centers=3, cluster_std=0.4, random_state=42
)
noise_points = np.random.uniform(low=-10, high=10, size=(50, 2))
X_with_noise = np.vstack([X_noise, noise_points])
y_with_noise = np.concatenate([y_noise, [-1] * 50])

# (points, labels, panel title) triples reused by later cells
datasets = [
    (X_blobs, y_blobs, 'Blob Clusters'),
    (X_moons, y_moons, 'Moon Clusters'),
    (X_circles, y_circles, 'Concentric Circles'),
    (X_with_noise, y_with_noise, 'Clusters with Noise')
]

# One panel per dataset, coloured by the generating label
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for panel, (points, targets, name) in zip(axes.flat, datasets):
    panel.scatter(points[:, 0], points[:, 1], c=targets, cmap='viridis',
                  alpha=0.7, edgecolors='k', s=50)
    panel.set(xlabel='Feature 1', ylabel='Feature 2', title=name)

plt.suptitle('Synthetic Datasets for DBSCAN', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 3. Understanding DBSCAN Parameters

In [None]:
# k-distance curve for choosing epsilon by eye (elbow method)
def find_optimal_eps(X, min_samples=5):
    """
    Return the sorted k-distance curve of X for the elbow heuristic.

    A NearestNeighbors model with n_neighbors=min_samples is fitted on X and
    queried with X itself. For every row the distance in the last neighbour
    column (index min_samples - 1) is taken, and those values are returned
    sorted in ascending order.

    The function does not choose eps: plot the returned array and read the
    value at the 'elbow' of the curve.

    NOTE(review): because X is also the query set, each row's own zero
    distance presumably occupies the first neighbour column - confirm against
    the scikit-learn kneighbors documentation.
    """
    knn = NearestNeighbors(n_neighbors=min_samples).fit(X)
    neighbor_dists, _indices = knn.kneighbors(X)
    kth_dist = neighbor_dists[:, min_samples - 1]
    return np.sort(kth_dist)

# Draw one k-distance curve per synthetic dataset
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for panel, (points, _targets, name) in zip(axes.flat, datasets):
    k_dist = find_optimal_eps(points, min_samples=5)
    panel.plot(np.arange(len(k_dist)), k_dist, 'b-', linewidth=2)
    panel.set(
        xlabel='Points (sorted by distance)',
        ylabel='5-NN Distance',
        title=f'K-Distance Graph: {name}',
    )
    panel.grid(True, alpha=0.3)

plt.suptitle('Finding Optimal Epsilon with K-Distance Graph', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Reading guide for the plots above
print("\nThe 'elbow' point in each graph suggests the optimal eps value.")
print("Points to the right of the elbow are likely noise.")

## 4. DBSCAN with Sklearn (Baseline)

In [None]:
# Sweep eps on the moon data with min_samples fixed at 5
eps_values = [0.1, 0.2, 0.3, 0.5]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

for idx, eps in enumerate(eps_values):
    panel = axes[idx]
    labels = DBSCAN(eps=eps, min_samples=5).fit_predict(X_moons)

    # -1 is DBSCAN's noise label, so it does not count as a cluster
    unique_labels = set(labels)
    n_noise = (labels == -1).sum()
    n_clusters = len(unique_labels) - (1 if -1 in labels else 0)

    # One colour per label; noise is overridden to black crosses
    palette = plt.cm.viridis(np.linspace(0, 1, len(unique_labels)))
    for label, color in zip(unique_labels, palette):
        is_noise = label == -1
        members = labels == label
        panel.scatter(
            X_moons[members, 0], X_moons[members, 1],
            c=['black' if is_noise else color],
            marker='x' if is_noise else 'o',
            s=50, alpha=0.7,
            edgecolors=None if is_noise else 'k',
        )

    panel.set_xlabel('Feature 1')
    panel.set_ylabel('Feature 2')
    panel.set_title(f'eps={eps}\nClusters: {n_clusters}, Noise: {n_noise}')

plt.suptitle('DBSCAN with Different Epsilon Values on Moon Data', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 5. Using PyCaret for Clustering

In [None]:
# Wrap the moon data in a DataFrame for PyCaret; keep the generating label alongside
df_moons = pd.DataFrame(X_moons, columns=['feature_1', 'feature_2']).assign(
    true_label=y_moons
)

print("Moon Dataset:")
print(df_moons.head())
print(f"\nShape: {df_moons.shape}")

In [None]:
# Start a PyCaret clustering experiment on the moon data.
# 'true_label' is listed in ignore_features: it is for validation only.
moons_setup_kwargs = dict(
    data=df_moons,
    ignore_features=['true_label'],
    normalize=True,
    session_id=42,
    verbose=False,
)
exp = setup(**moons_setup_kwargs)

print("PyCaret setup complete!")

In [None]:
# Fit DBSCAN through PyCaret; eps and min_samples are passed as keyword arguments
dbscan_params = {'eps': 0.2, 'min_samples': 5}
dbscan_model = create_model('dbscan', **dbscan_params)

print("\nDBSCAN Model Created!")
print(dbscan_model)

In [None]:
# Attach the model's cluster assignment to each row of the experiment data
dbscan_results = assign_model(dbscan_model)
first_rows = dbscan_results.head(10)

print("\nCluster Assignments:")
print(first_rows)

# How many rows ended up in each cluster
cluster_counts = dbscan_results['Cluster'].value_counts()
print("\nCluster Distribution:")
print(cluster_counts)

In [None]:
# Visualize DBSCAN results using PyCaret, with a matplotlib fallback.
# `except Exception` (not a bare `except:`) keeps KeyboardInterrupt and SystemExit
# working, and the failure reason is reported instead of being silently dropped.
try:
    plot_model(dbscan_model, plot='cluster')
except Exception as exc:
    print(f"plot_model failed ({exc}); falling back to a manual scatter plot.")
    # Fallback manual visualization: the string cluster names are mapped to
    # integer category codes only to pick colours.
    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(dbscan_results['feature_1'], dbscan_results['feature_2'],
                          c=dbscan_results['Cluster'].astype('category').cat.codes,
                          cmap='viridis', alpha=0.7, edgecolors='k', s=50)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('DBSCAN Clustering Results (PyCaret)')
    plt.colorbar(scatter, label='Cluster')
    plt.show()

## 6. Comparing Multiple Clustering Models with PyCaret

In [None]:
# Blob data as a DataFrame for the model comparison; the generating label rides along
df_blobs = pd.DataFrame(X_blobs, columns=['feature_1', 'feature_2']).assign(
    true_label=y_blobs
)

# New PyCaret experiment on the blobs ('true_label' is excluded from the features)
blobs_setup_kwargs = dict(
    data=df_blobs,
    ignore_features=['true_label'],
    normalize=True,
    session_id=42,
    verbose=False,
)
exp_blobs = setup(**blobs_setup_kwargs)

In [None]:
# Fit several clustering algorithms on the blobs experiment
models_to_compare = ['kmeans', 'dbscan', 'hclust', 'meanshift']
model_results = {}

# Keyword arguments per model id; ids without an entry are created with no extras
model_kwargs = {
    'kmeans': {'num_clusters': 3},
    'dbscan': {'eps': 0.5, 'min_samples': 5},
    'hclust': {'num_clusters': 3},
}

for model_name in models_to_compare:
    extra_args = model_kwargs.get(model_name, {})
    try:
        model = create_model(model_name, **extra_args)
        results = assign_model(model)
        model_results[model_name] = {'model': model, 'results': results}
        print(f"Created {model_name} model successfully")
    except Exception as e:
        # Best effort: report the failure and continue with the remaining models
        print(f"Error creating {model_name}: {e}")

print(f"\nSuccessfully created {len(model_results)} models")

In [None]:
# Visualize all models next to the ground truth
n_models = len(model_results)
# squeeze=False keeps `axes` an array even when no model was created; with a
# single panel plt.subplots would otherwise return a bare Axes and axes[0] fails.
fig, axes = plt.subplots(1, n_models + 1, figsize=(4 * (n_models + 1), 4), squeeze=False)
axes = axes.flatten()

# Ground truth
axes[0].scatter(X_blobs[:, 0], X_blobs[:, 1], c=y_blobs, cmap='viridis',
                alpha=0.7, edgecolors='k', s=50)
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].set_title('Ground Truth')

# Each model gets its own panel, starting at index 1
for idx, (model_name, data) in enumerate(model_results.items(), 1):
    results = data['results']
    # Integer codes are enough here: they are used for colouring and for ARI,
    # which only compares how points are grouped.
    labels = results['Cluster'].astype('category').cat.codes

    # Calculate ARI if possible; on failure fall back to a title without a score.
    # `except Exception` avoids swallowing KeyboardInterrupt like a bare except.
    try:
        ari = adjusted_rand_score(y_blobs, labels)
        title = f'{model_name.upper()}\nARI: {ari:.3f}'
    except Exception:
        title = model_name.upper()

    axes[idx].scatter(results['feature_1'], results['feature_2'], c=labels,
                      cmap='viridis', alpha=0.7, edgecolors='k', s=50)
    axes[idx].set_xlabel('Feature 1')
    axes[idx].set_ylabel('Feature 2')
    axes[idx].set_title(title)

plt.suptitle('Comparison of Clustering Algorithms (PyCaret)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 7. DBSCAN on Real Dataset

In [None]:
# Real data: the Iris measurements, with the species code as an extra column
from sklearn.datasets import load_iris

iris = load_iris()
df_iris = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target
)

print("Iris Dataset:")
print(df_iris.head())
print(f"\nShape: {df_iris.shape}")

In [None]:
# Setup PyCaret for Iris ('species' is excluded from the features)
exp_iris = setup(
    data=df_iris,
    ignore_features=['species'],
    normalize=True,
    session_id=42,
    verbose=False
)

# Find optimal eps for Iris.
# The features are standardised separately here because the k-distance curve
# (and the later metrics and PCA) work on a plain array outside PyCaret.
scaler = StandardScaler()
X_iris_scaled = scaler.fit_transform(iris.data)

distances = find_optimal_eps(X_iris_scaled, min_samples=5)

plt.figure(figsize=(10, 5))
plt.plot(range(len(distances)), distances, 'b-', linewidth=2)
plt.xlabel('Points (sorted by distance)')
plt.ylabel('5-NN Distance')
plt.title('K-Distance Graph for Iris Dataset')
plt.grid(True, alpha=0.3)
# Reference line at the eps used for the Iris model; the label previously
# contained mojibake in place of the approximately-equal sign.
plt.axhline(y=0.5, color='red', linestyle='--', label='Suggested eps ≈ 0.5')
plt.legend()
plt.show()

In [None]:
# Fit DBSCAN on the Iris experiment and attach the cluster assignment to each row
iris_dbscan_params = {'eps': 0.5, 'min_samples': 5}
dbscan_iris = create_model('dbscan', **iris_dbscan_params)
iris_results = assign_model(dbscan_iris)

iris_cluster_counts = iris_results['Cluster'].value_counts()
print("\nIris DBSCAN Cluster Distribution:")
print(iris_cluster_counts)

In [None]:
# Evaluate clustering quality
from sklearn.decomposition import PCA

# Use the integer labels stored on the fitted estimator, where noise is -1.
# Category codes of the 'Cluster' column are never negative, so the noise
# checks below (>= 0 and == -1) could never single out noise points with them.
# NOTE(review): assumes labels_ is in the same row order as iris.data - confirm.
cluster_labels = np.asarray(dbscan_iris.labels_)
valid_mask = cluster_labels >= 0  # Exclude noise for some metrics

# Calculate metrics (silhouette needs at least two clusters among non-noise points)
if valid_mask.sum() > 1 and len(set(cluster_labels[valid_mask])) > 1:
    sil_score = silhouette_score(X_iris_scaled[valid_mask], cluster_labels[valid_mask])
    ari_score = adjusted_rand_score(iris.target, cluster_labels)
    print(f"Silhouette Score (excl. noise): {sil_score:.4f}")
    print(f"Adjusted Rand Index: {ari_score:.4f}")
else:
    print("Not enough valid clusters for metrics calculation")

# PCA for visualization: project the standardised features onto two components
pca = PCA(n_components=2)
X_iris_pca = pca.fit_transform(X_iris_scaled)

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# True labels
scatter1 = axes[0].scatter(X_iris_pca[:, 0], X_iris_pca[:, 1], c=iris.target,
                           cmap='viridis', alpha=0.7, edgecolors='k', s=60)
axes[0].set_xlabel('First Principal Component')
axes[0].set_ylabel('Second Principal Component')
axes[0].set_title('Iris - True Species')

# DBSCAN labels
scatter2 = axes[1].scatter(X_iris_pca[:, 0], X_iris_pca[:, 1], c=cluster_labels,
                           cmap='viridis', alpha=0.7, edgecolors='k', s=60)
# Mark noise points with red crosses on top of the coloured scatter
noise_mask = cluster_labels == -1
axes[1].scatter(X_iris_pca[noise_mask, 0], X_iris_pca[noise_mask, 1],
                c='red', marker='x', s=100, linewidths=2, label='Noise')
axes[1].set_xlabel('First Principal Component')
axes[1].set_ylabel('Second Principal Component')
axes[1].set_title('Iris - DBSCAN Clustering')
axes[1].legend()

plt.tight_layout()
plt.show()

## 8. Parameter Sensitivity Analysis

In [None]:
# Grid of DBSCAN runs on Iris: rows vary min_samples, columns vary eps
eps_range = [0.3, 0.5, 0.7, 0.9]
min_samples_range = [3, 5, 7, 10]

fig, axes = plt.subplots(len(min_samples_range), len(eps_range), figsize=(16, 16))

for row_axes, min_samples in zip(axes, min_samples_range):
    for panel, eps in zip(row_axes, eps_range):
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_iris_scaled)

        # -1 is the noise label and is not counted as a cluster
        n_noise = (labels == -1).sum()
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

        # Clustering runs on the scaled features; the 2-D PCA projection is for display
        panel.scatter(X_iris_pca[:, 0], X_iris_pca[:, 1],
                      c=labels, cmap='viridis', alpha=0.7, s=30)
        panel.set_title(f'eps={eps}, min_samples={min_samples}\n'
                        f'Clusters: {n_clusters}, Noise: {n_noise}', fontsize=9)
        panel.set_xlabel('PC1', fontsize=8)
        panel.set_ylabel('PC2', fontsize=8)

plt.suptitle('DBSCAN Parameter Sensitivity on Iris Dataset', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 9. DBSCAN for Anomaly Detection

In [None]:
# Use DBSCAN for anomaly detection
# Points classified as noise (-1) are potential anomalies

# Create dataset with anomalies (true_anomaly is 1 for the injected noise points)
df_anomaly = pd.DataFrame(X_with_noise, columns=['feature_1', 'feature_2'])
df_anomaly['true_anomaly'] = (y_with_noise == -1).astype(int)

# Setup PyCaret ('true_anomaly' is excluded from the features)
exp_anomaly = setup(
    data=df_anomaly,
    ignore_features=['true_anomaly'],
    normalize=True,
    session_id=42,
    verbose=False
)

# Create DBSCAN model
dbscan_anomaly = create_model('dbscan', eps=0.5, min_samples=5)
anomaly_results = assign_model(dbscan_anomaly)

# Identify anomalies (noise points).
# The 'Cluster' column holds strings of the form "Cluster <n>", so comparing it
# with 'Noise' never matches and every point would be predicted normal. The
# fitted estimator's integer labels_ mark noise as -1, so use those instead.
# NOTE(review): assumes labels_ is in the same row order as df_anomaly - confirm.
anomaly_results['predicted_anomaly'] = (
    np.asarray(dbscan_anomaly.labels_) == -1
).astype(int)

# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

# labels=[0, 1] pins a 2x2 matrix so it always matches the two tick labels below
cm = confusion_matrix(df_anomaly['true_anomaly'], anomaly_results['predicted_anomaly'],
                      labels=[0, 1])

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Visualization
normal_mask = anomaly_results['predicted_anomaly'] == 0
anomaly_mask = anomaly_results['predicted_anomaly'] == 1

axes[0].scatter(anomaly_results.loc[normal_mask, 'feature_1'],
                anomaly_results.loc[normal_mask, 'feature_2'],
                c='blue', alpha=0.5, s=30, label='Normal')
axes[0].scatter(anomaly_results.loc[anomaly_mask, 'feature_1'],
                anomaly_results.loc[anomaly_mask, 'feature_2'],
                c='red', marker='x', s=100, linewidths=2, label='Anomaly')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].set_title('DBSCAN Anomaly Detection')
axes[0].legend()

# Confusion matrix heatmap
import seaborn as sns
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1],
            xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('True')
axes[1].set_title('Confusion Matrix')

plt.tight_layout()
plt.show()

print("\nClassification Report:")
print(classification_report(df_anomaly['true_anomaly'], anomaly_results['predicted_anomaly'],
                           labels=[0, 1], target_names=['Normal', 'Anomaly']))

## 10. Clustering Quality Metrics Summary

In [None]:
# Per-dataset DBSCAN quality summary
def evaluate_dbscan(X, y_true, eps, min_samples, dataset_name):
    """
    Run DBSCAN on standardised X and summarise the result as a dict.

    X is scaled with StandardScaler before clustering. The returned dict has
    the keys 'Dataset', 'eps', 'min_samples', 'N Clusters', 'N Noise',
    'Silhouette' and 'ARI', in that order.

    'Silhouette' is computed on non-noise points only and is NaN unless more
    than one such point and more than one cluster exist. 'ARI' compares
    y_true with all labels (noise included) and is NaN when y_true is None.
    """
    X_scaled = StandardScaler().fit_transform(X)
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_scaled)

    # -1 is the noise label; it is not counted as a cluster
    n_noise = (labels == -1).sum()
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

    # Silhouette score on the clustered (non-noise) points only
    clustered = labels >= 0
    if clustered.sum() > 1 and n_clusters > 1:
        silhouette = silhouette_score(X_scaled[clustered], labels[clustered])
    else:
        silhouette = np.nan

    # ARI needs ground truth
    ari = adjusted_rand_score(y_true, labels) if y_true is not None else np.nan

    return {
        'Dataset': dataset_name,
        'eps': eps,
        'min_samples': min_samples,
        'N Clusters': n_clusters,
        'N Noise': n_noise,
        'Silhouette': silhouette,
        'ARI': ari,
    }

# One evaluation case per dataset: (X, y_true, eps, min_samples, name)
evaluation_data = [
    (X_blobs, y_blobs, 0.5, 5, 'Blobs'),
    (X_moons, y_moons, 0.2, 5, 'Moons'),
    (X_circles, y_circles, 0.15, 5, 'Circles'),
    (iris.data, iris.target, 0.5, 5, 'Iris')
]

# Each tuple lines up positionally with evaluate_dbscan's parameters
results_list = [evaluate_dbscan(*case) for case in evaluation_data]
results_df = pd.DataFrame(results_list)

rule = "="*80
print("DBSCAN Clustering Quality Metrics:")
print(rule)
print(results_df.to_string(index=False))
print("\n" + rule)

## 11. Summary and Conclusions

In [None]:
# Plain-text recap of the notebook: a banner, then numbered sections of bullets
banner = "="*70

# (heading, bullet lines) pairs, printed in this order
summary_sections = [
    ("\n1. KEY CONCEPTS:", [
        "   - Density-based clustering algorithm",
        "   - Discovers clusters of arbitrary shapes",
        "   - Automatically identifies noise/outliers",
        "   - Does not require specifying number of clusters",
    ]),
    ("\n2. PARAMETERS:", [
        "   - eps: Maximum distance between neighbors",
        "   - min_samples: Minimum points to form a cluster",
        "   - Use k-distance graph to find optimal eps",
    ]),
    ("\n3. PYCARET ADVANTAGES:", [
        "   - Simple, low-code interface",
        "   - Automatic preprocessing and normalization",
        "   - Easy model creation and comparison",
        "   - Built-in visualization functions",
    ]),
    ("\n4. CLUSTERING QUALITY METRICS USED:", [
        "   - Silhouette Score (excluding noise points)",
        "   - Adjusted Rand Index (with ground truth)",
        "   - Number of clusters found",
        "   - Number of noise points detected",
    ]),
    ("\n5. BEST USE CASES:", [
        "   - Non-globular cluster shapes (moons, circles)",
        "   - Data with noise/outliers",
        "   - When number of clusters is unknown",
        "   - Anomaly detection applications",
    ]),
    ("\n6. LIMITATIONS:", [
        "   - Sensitive to eps parameter",
        "   - Struggles with varying density clusters",
        "   - Computationally expensive for large datasets",
    ]),
]

print(banner)
print("DBSCAN CLUSTERING WITH PYCARET - SUMMARY")
print(banner)

for heading, bullets in summary_sections:
    print(heading)
    for bullet in bullets:
        print(bullet)

print("\n" + banner)