# Generate Heatmaps using KMeans

In [None]:
pip install psycopg2-binary scikit-learn


In [None]:
a

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Add the project root directory to the system path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))
from data_access.postgres_handler import PostgresHandler

# Initialize the PostgresHandler
# NOTE(review): the connection settings, including the password, are hard-coded
# here; consider loading them from the environment or a secrets store instead.
# NOTE(review): 1433 is not the usual PostgreSQL port (5432) - confirm it is intended.
handler = PostgresHandler(
    database="nutanix",
    user="postgres",
    host='172.25.221.34',
    password="Senna",
    port=1433
)
handler.connect()

# Define columns to fetch
columns = [
    'concord_id', 'data_type', 'metric', 'queue_depth', 'num_jobs', 'blocksize', 'min_measure', 
    'mean_measure', 'median_measure', 'max_measure', 'stddev_measure', 'capacity_gib', 
    'operating_pci_speed_gts', 'operating_pci_width', 'device_type', 'model'
]

# Fetch inside try/finally so the connection is released even if the query fails.
# Presumably encode=True makes the handler encode categorical columns and fill
# handler.encoding_map, which the rest of the notebook uses to decode labels - TODO confirm.
try:
    df = handler.get_data("ssd_clean_data", columns, limit=None, encode=True)
finally:
    # Disconnect from the database
    handler.disconnect()

# Columns used as clustering features
numeric_columns = [
    'queue_depth',
    'num_jobs',
    'blocksize',
    'min_measure',
    'mean_measure',
    'median_measure',
    'max_measure',
    'stddev_measure',
    'capacity_gib',
    'operating_pci_speed_gts',
    'operating_pci_width',
]

# Fill gaps in the feature columns with each column's mean, writing the
# imputed values back into df
imputer = SimpleImputer(strategy="mean")
imputer.fit(df[numeric_columns])
df[numeric_columns] = imputer.transform(df[numeric_columns])

# Fit the scaler on the full (imputed) feature set; the fitted scaler is
# reused later to transform each per-group subset
scaler = StandardScaler()
scaler.fit(df[numeric_columns])
df_scaled = scaler.transform(df[numeric_columns])

# Accumulates one cluster-labelled DataFrame per (data type, metric) pair
all_clusters = []

# Distinct encoded values that drive the per-combination clustering
data_types = df['data_type'].unique()
metrics = df['metric'].unique()

# Report the decoded names of every data type and metric present in the data
data_type_names = [handler.encoding_map['data_type'][code] for code in data_types]
print(f"Data Types: {' and '.join(data_type_names)}")
metric_names = [handler.encoding_map['metric'][code] for code in metrics]
print(f"Metrics: {' and '.join(metric_names)}")

# Number of clusters requested from KMeans for every (data type, metric) group
N_CLUSTERS = 6

for data_type in data_types:
    for metric in metrics:
        # Filter dataset by data type and metric
        df_filtered = df[(df['data_type'] == data_type) & (df['metric'] == metric)].copy()

        # Skip if no data for this combination
        if df_filtered.empty:
            continue

        # Decode the labels once; they are reused in every message and in the plot title
        data_type_name = handler.encoding_map['data_type'][data_type]
        metric_name = handler.encoding_map['metric'][metric]

        # KMeans raises ValueError when asked for more clusters than there are samples,
        # so report and skip groups that are too small instead of aborting the whole run
        n_rows = len(df_filtered)
        if n_rows < N_CLUSTERS:
            print(f"Skipping {data_type_name} - {metric_name}: {n_rows} rows is fewer than the {N_CLUSTERS} clusters requested")
            continue

        # Scale the group's features once with the globally fitted scaler and reuse
        # the result for clustering, scoring and the PCA projection
        features_scaled = scaler.transform(df_filtered[numeric_columns])

        # Apply KMeans clustering
        kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0)
        df_filtered['Cluster'] = kmeans.fit_predict(features_scaled)

        # Calculate silhouette score; it is only defined when the number of distinct
        # labels is between 2 and n_samples - 1, otherwise silhouette_score raises
        n_labels = df_filtered['Cluster'].nunique()
        if 1 < n_labels < n_rows:
            silhouette_avg = silhouette_score(features_scaled, df_filtered['Cluster'])
            print(f"Silhouette Score for {data_type_name} - {metric_name} with {N_CLUSTERS} Clusters: {silhouette_avg}")
        else:
            silhouette_avg = float('nan')
            print(f"Silhouette Score for {data_type_name} - {metric_name} is undefined ({n_labels} distinct cluster labels for {n_rows} rows)")

        # Identify the highest performance cluster by mean of 'mean_measure' metric
        # NOTE(review): this treats a larger mean_measure as better; confirm that holds
        # for every metric (it would not for a latency-style metric).
        cluster_performance = df_filtered.groupby('Cluster')['mean_measure'].mean()
        top_cluster = cluster_performance.idxmax()
        print(f"Top Cluster for {data_type_name} - {metric_name}: Cluster {top_cluster}")

        # Filter top cluster data
        top_cluster_data = df_filtered[df_filtered['Cluster'] == top_cluster]

        # Find top-performing device type and model within the top cluster
        # (the result is an (encoded device_type, encoded model) tuple)
        top_device = top_cluster_data.groupby(['device_type', 'model'])['mean_measure'].mean().idxmax()
        print(f"Top-performing Device and Model for {data_type_name} - {metric_name}: {top_device}")

        # Append filtered DataFrame with clusters to main DataFrame
        all_clusters.append(df_filtered)

        # Use PCA for 2D visualization of clusters
        pca = PCA(n_components=2)
        df_pca = pca.fit_transform(features_scaled)

        # Plot the clusters for the current data type and metric
        plt.figure(figsize=(8, 6))
        scatter = plt.scatter(df_pca[:, 0], df_pca[:, 1], c=df_filtered['Cluster'], cmap='viridis', s=50)
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.title(f'KMeans Clusters for {data_type_name} - {metric_name} ({N_CLUSTERS} clusters)')
        plt.colorbar(scatter, label='Cluster')
        plt.show()

# Concatenate all clusters into the main DataFrame
# pd.concat raises ValueError on an empty list, so fall back to empty results
# when no data type/metric combination produced a clustered frame.
if all_clusters:
    df_clusters = pd.concat(all_clusters)
    # Analyze cluster distribution by data type and metric
    cluster_summary = df_clusters.groupby(['data_type', 'metric', 'Cluster']).size()
else:
    df_clusters = pd.DataFrame(columns=['data_type', 'metric', 'Cluster'])
    cluster_summary = pd.Series(dtype='int64')
print("Cluster Summary by Data Type and Metric:\n", cluster_summary)
