# HDBSCAN 

### Get Data (Template)

In [None]:
import sys
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

# Add the project root directory to the system path so that the
# `data_access` package below can be imported.
# NOTE: '__file__' is a quoted string literal here, not the module variable,
# so os.path.dirname() returns '' and the appended path is the parent of the
# current working directory (presumably the notebook is run from a folder one
# level below the project root — confirm).
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))
from data_access.postgres_handler import PostgresHandler


# Initialize the PostgresHandler and load the SSD benchmark rows into `df`.
# NOTE(review): connection credentials are hard-coded in the notebook; consider
# loading them from environment variables or a secrets store instead.
# NOTE(review): 1433 is the conventional SQL Server port (PostgreSQL defaults
# to 5432) — confirm this is the intended port.
handler = PostgresHandler(
    database="nutanix",
    user="postgres",
    host='172.25.221.34',
    password="Senna",
    port=1433
)

# Define columns to fetch, including 'data_type'
columns = [
    'concord_id', 'data_type', 'metric', 'queue_depth', 'num_jobs',
    'blocksize', 'unit', 'min_measure', 'mean_measure',
    'median_measure', 'max_measure', 'stddev_measure', 'device_type',
    'family', 'vendor', 'model', 'firmware', 'capacity_GiB',
    'operating_pci_speed_GTs', 'operating_pci_width', 'linkrate_Gbs',
    'name', 'reference', 'created'
]

handler.connect()
try:
    # The data is requested with encode=True; later cells translate the
    # encoded 'data_type' values back to names via handler.encoding_map.
    df = handler.get_data("ssd_clean_data", columns, limit=145000, encode=True)
    # Check the DataFrame
    print(df)
finally:
    # Always release the connection, even if the query or the print fails.
    # The `handler` object itself stays available for its encoding_map.
    handler.disconnect()

In [None]:
pip install hdbscan

In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score
import hdbscan
import pandas as pd
import matplotlib.pyplot as plt

# Cluster each selected data type separately with HDBSCAN, report quality
# metrics (silhouette, cluster persistence, outlier scores) and plot a 2-D
# t-SNE projection of the resulting clusters.

# Define clustering parameters
min_samples = 1000            # HDBSCAN min_samples, shared by every data type
default_cluster_size = 1000   # min_cluster_size for non-latency data types
latency_cluster_size = 500    # smaller min_cluster_size for latency data types

# Target features and special metric condition
target_features = [
    "Sequential Write", "Sequential Read",
    "Random Write", "Random Read",
    "Random Write Latency", "Random Read Latency"
]
latency_types = {"Random Write Latency", "Random Read Latency"}  # Only apply metric == 2 for these types

# Obtain unique data_type values and map them to their names
data_type_values = df['data_type'].unique()
encoding_map = {value: handler.encoding_map['data_type'][value] for value in data_type_values}

# Filter only for the relevant data types
selected_data_types = [
    {"name": encoding_map[i], "encoded_value": i}
    for i in data_type_values if encoding_map.get(i) in target_features
]

# Process each selected data type
for item in selected_data_types:
    name = item["name"]
    specific_encoded_value = item["encoded_value"]

    print(f"\nProcessing {name}...")

    # Step 1: Filter DataFrame based on whether the type is latency-related
    type_mask = df['data_type'] == specific_encoded_value
    if name in latency_types:
        # Latency types are additionally restricted to rows with metric == 2
        # (NOTE(review): the meaning of the encoded metric value 2 is not
        # visible here — confirm against the encoding map).
        df_specific = df[type_mask & (df['metric'] == 2)].copy()
        min_cluster_size = latency_cluster_size
    else:
        # No metric filter for non-latency types
        df_specific = df[type_mask].copy()
        min_cluster_size = default_cluster_size

    # Check if df_specific is empty
    if df_specific.empty:
        print(f"No data found for data_type = '{name}'")
        continue

    # Step 2: Select numerical columns and handle missing values
    df_numerical = df_specific.select_dtypes(include=['float64', 'int64'])
    # SimpleImputer(strategy='mean') discards columns that are entirely NaN,
    # which would make its output narrower than `df_numerical.columns` and
    # break the DataFrame reconstruction below; drop such columns up front.
    df_numerical = df_numerical.dropna(axis=1, how='all')
    if df_numerical.empty:
        print(f"No usable numerical features for data_type = '{name}'")
        continue

    imputer = SimpleImputer(strategy='mean')
    df_imputed = pd.DataFrame(imputer.fit_transform(df_numerical), columns=df_numerical.columns)

    # Step 3: Normalize the numerical data
    scaler = MinMaxScaler()
    df_normalized = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df_imputed.columns)

    # Step 4: Apply HDBSCAN for clustering
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=0.1,
    )
    cluster_labels = clusterer.fit_predict(df_normalized)

    # Add the cluster labels back to the filtered DataFrame (positional
    # assignment: the label array has one entry per row of df_specific)
    df_specific['Cluster'] = cluster_labels

    # Calculate silhouette score if there is more than one cluster.
    # Label -1 marks noise points, which are excluded from the score.
    is_clustered = cluster_labels != -1
    non_noise_points = df_normalized[is_clustered]
    non_noise_labels = cluster_labels[is_clustered]

    if len(set(non_noise_labels)) > 1:
        score = silhouette_score(non_noise_points, non_noise_labels)
        print(f'Silhouette Score for {name} with min_samples={min_samples}: {score:.4f}')
    else:
        print(f'Silhouette Score for {name} with min_samples={min_samples}: Not applicable (only one cluster)')

    # Additional HDBSCAN-specific metrics
    stability_scores = clusterer.cluster_persistence_
    print(f"Cluster Stability Scores for {name}: {stability_scores}")

    outlier_scores = clusterer.outlier_scores_
    df_specific['Outlier Score'] = outlier_scores
    print(f"Average Outlier Score for {name}: {outlier_scores.mean():.4f}")

    # Display summary information about clusters
    cluster_summary = df_specific.groupby('Cluster').size()
    print(f"Cluster summary for {name}:")
    print(cluster_summary)

    # Step 5: Apply t-SNE for visualization
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(df_normalized)

    # Plotting the clusters with t-SNE; labels are sorted so the legend (and
    # colour assignment) is in a stable order with noise (-1) first.
    plt.figure(figsize=(10, 8))
    unique_labels = sorted(set(cluster_labels))
    for label in unique_labels:
        label_mask = (cluster_labels == label)
        plt.scatter(tsne_results[label_mask, 0], tsne_results[label_mask, 1],
                    label=f'Cluster {label}' if label != -1 else 'Noise', s=10)

    plt.title(f"t-SNE Visualization of HDBSCAN Clusters for {name}")
    plt.xlabel("t-SNE Dimension 1")
    plt.ylabel("t-SNE Dimension 2")
    plt.legend()
    plt.show()

    print("\n" + "-"*50 + "\n")
