# Hierarchical Clustering

### Get Data (Template)

In [None]:
import sys
import os
import pandas as pd

# Make the parent of the current working directory importable so that the
# project-local `data_access` package resolves.
# NOTE: the previous expression passed the *string* '__file__' to
# os.path.dirname(), which always yields '' (notebooks do not define
# __file__), so the path has always been "<cwd>/..". That behavior is kept,
# just written explicitly, and the entry is only added once so re-running
# the cell does not grow sys.path.
project_root = os.path.abspath(os.pardir)
if project_root not in sys.path:
    sys.path.append(project_root)
from data_access.postgres_handler import PostgresHandler

# Define columns to fetch
columns = [
    'concord_id', 'data_type', 'metric', 'queue_depth', 'num_jobs',
    'blocksize', 'unit', 'min_measure', 'mean_measure',
    'median_measure', 'max_measure', 'stddev_measure', 'device_type',
    'family', 'vendor', 'model', 'firmware', 'capacity_GiB',
    'operating_pci_speed_GTs', 'operating_pci_width', 'linkrate_Gbs',
    'name', 'reference', 'created',
]

# Build the database handler.
# NOTE(review): credentials are hard-coded in the notebook; they should be
# moved to environment variables or a secrets store before this is shared.
# NOTE(review): 1433 is the conventional SQL Server port, whereas PostgreSQL
# defaults to 5432 - confirm this is the intended port.
handler = PostgresHandler(
    database="nutanix",
    user="postgres",
    host='172.25.221.34',
    password="Senna",
    port=1433
)

# Connect, fetch, and always release the connection - even when the query
# raises - so a failed cell run does not leave a session open on the server.
handler.connect()
try:
    # Fetch data with encoding enabled
    df = handler.get_data("ssd_clean_data", columns, limit=50000, encode=True)
finally:
    handler.disconnect()

# Display the DataFrame to verify encoded values
print(df.head())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import seaborn as sns

# Encoded (data_type, metric) combinations that are clustered independently.
# "metric": None means "do not filter on the metric column".
# NOTE(review): the integer codes are presumably the ones produced by
# get_data(..., encode=True) - confirm against the encoder's mapping.
data_types = [
    {"name": "Random Read Latency", "type": 0, "metric": 2},
    {"name": "Random Write Latency", "type": 1, "metric": 2},
    {"name": "Random Read", "type": 0, "metric": None},
    {"name": "Random Write", "type": 1, "metric": None},
    {"name": "Sequential Write", "type": 3, "metric": None},
    {"name": "Sequential Read", "type": 2, "metric": None},
]

# Distance at which the Ward dendrogram is cut into flat clusters.
# Loop-invariant, so it is defined once here.
distance_threshold = 20  # Adjust as needed

# Loop through each data type and perform clustering
for item in data_types:
    name = item["name"]
    specific_type = item["type"]
    latency_metric = item["metric"]

    print(f"Processing {name}...")

    # Step 1/2: build one boolean mask (data type, plus metric when given)
    # and take a single explicit copy, so the later `Cluster` assignment
    # writes to an independent frame instead of a slice of a slice.
    row_mask = df['data_type'] == specific_type
    if latency_metric is not None:
        row_mask &= df['metric'] == latency_metric
    df_specific = df[row_mask].copy()

    # Check if df_specific is empty
    if df_specific.empty:
        print(f"No data found for data_type = '{name}'")
        continue

    # Select numerical columns for clustering.
    # Columns that are entirely NaN are removed up front: SimpleImputer
    # silently drops them, which would make the imputed array narrower than
    # `df_numerical.columns` and break the DataFrame construction below.
    df_numerical = df_specific.select_dtypes(include=['float64', 'int64'])
    df_numerical = df_numerical.dropna(axis=1, how='all')

    # Hierarchical clustering needs at least two observations and at least
    # one usable feature; skip the subset instead of crashing the whole loop.
    n_samples, num_features = df_numerical.shape
    if n_samples < 2 or num_features == 0:
        print(
            f"Skipping {name}: need at least 2 rows and 1 numeric column "
            f"(got {n_samples} rows, {num_features} columns)"
        )
        continue

    # Handle missing values by imputing
    imputer = SimpleImputer(strategy='mean')
    df_imputed = pd.DataFrame(imputer.fit_transform(df_numerical), columns=df_numerical.columns)

    # Normalize the data
    scaler = MinMaxScaler()
    df_normalized = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df_imputed.columns)

    # Apply PCA for dimensionality reduction.
    # n_components may not exceed either the feature count or the row count.
    pca = PCA(n_components=min(50, num_features, n_samples))
    df_pca = pd.DataFrame(pca.fit_transform(df_normalized))

    # Perform hierarchical clustering
    linkage_matrix = linkage(df_pca, method='ward')
    clusters = fcluster(linkage_matrix, t=distance_threshold, criterion='distance')

    # Add cluster labels back to the DataFrame (row order is unchanged, so
    # a positional assignment lines up with df_specific).
    df_specific['Cluster'] = clusters

    # Silhouette score is only defined for 2 <= n_clusters <= n_samples - 1.
    n_clusters = len(set(clusters))
    if 1 < n_clusters < n_samples:
        silhouette_avg = silhouette_score(df_normalized, clusters)
        print(f'Silhouette Score for {name}: {silhouette_avg:.4f}')
    elif n_clusters <= 1:
        print(f'Silhouette Score for {name}: Not applicable (only one cluster)')
    else:
        print(f'Silhouette Score for {name}: Not applicable (every sample is its own cluster)')

    # Visualize with dendrogram
    plt.figure(figsize=(10, 7))
    dendrogram(linkage_matrix, truncate_mode='level', p=5)
    plt.title(f'Hierarchical Clustering Dendrogram for {name}')
    plt.xlabel('Sample index')
    plt.ylabel('Distance')
    plt.show()

    # Visualize with t-SNE.
    # Perplexity must be smaller than the number of samples; keep the
    # library default of 30 whenever the subset is large enough.
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30.0, n_samples - 1))
    tsne_results = tsne.fit_transform(df_pca)

    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=tsne_results[:, 0], y=tsne_results[:, 1], hue=clusters, palette='tab10', s=100, alpha=0.7)
    plt.title(f't-SNE Visualization of Clusters for {name}')
    plt.xlabel("t-SNE Dimension 1")
    plt.ylabel("t-SNE Dimension 2")
    plt.legend(title='Cluster')
    plt.show()

    # Summarize clusters by selecting only numeric columns. The grouping key
    # itself is excluded defensively in case its dtype is ever int64.
    numeric_columns = (
        df_specific.select_dtypes(include=['float64', 'int64'])
        .columns.drop('Cluster', errors='ignore')
    )
    cluster_summary = df_specific.groupby('Cluster')[numeric_columns].mean()
    print(f"Cluster Summary for {name}:")
    print(cluster_summary)
    print("\n" + "-"*50 + "\n")
