# Generate Heatmaps Using K-Means Clustering

In [None]:
pip install psycopg2-binary scikit-learn


In [None]:
pip install psycopg2-binary scikit-learn    


## Heatmaps

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Add the project root directory to the system path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))
from data_access.postgres_handler import PostgresHandler

# Columns to fetch from the "ssd_clean_data" table.
columns = [
    'concord_id', 'data_type', 'metric', 'queue_depth', 'num_jobs', 'blocksize', 'min_measure', 
    'mean_measure', 'median_measure', 'max_measure', 'stddev_measure', 'capacity_gib', 
    'operating_pci_speed_gts', 'operating_pci_width', 'device_type', 'model'
]

# Initialize the PostgresHandler.
# NOTE(review): connection details are hard-coded in the notebook. The password
# can be overridden through the POSTGRES_PASSWORD environment variable; the
# literal fallback keeps the notebook working as before but should be removed
# (and the credential rotated) once the variable is configured.
# NOTE(review): 1433 is the conventional SQL Server port; PostgreSQL defaults
# to 5432 -- confirm this is the intended port for this host.
handler = PostgresHandler(
    database="nutanix",
    user="postgres",
    host='172.25.221.34',
    password=os.environ.get("POSTGRES_PASSWORD", "Senna"),
    port=1433
)
handler.connect()

try:
    # encode=True presumably makes the handler label-encode categorical columns
    # and record the mapping in handler.encoding_map, which is read later when
    # printing names -- verify against PostgresHandler.get_data.
    df = handler.get_data("ssd_clean_data", columns, limit=None, encode=True)
finally:
    # Always release the connection, even when the fetch fails.
    handler.disconnect()

# Numeric features that can take part in the clustering.
numeric_columns = [
    'queue_depth',
    'num_jobs',
    'blocksize',
    'min_measure',
    'mean_measure',
    'median_measure',
    'max_measure',
    'stddev_measure',
    'capacity_gib',
    'operating_pci_speed_gts',
    'operating_pci_width',
]

# Each entry lists the features left out of one clustering run; the empty
# tuple means "use every numeric feature".
excluded_features = [
    (),
    ('queue_depth',),
    ('num_jobs',),
    ('queue_depth', 'num_jobs'),
    ('blocksize',),
    ('capacity_gib', 'operating_pci_speed_gts'),
]

# One feature subset per exclusion entry, preserving the original column order.
feature_combinations = [
    [name for name in numeric_columns if name not in dropped]
    for dropped in excluded_features
]

# Number of clusters fitted for every (data type, metric) slice.
N_CLUSTERS = 6

# Mean imputation fills missing values in the numeric features before scaling.
imputer = SimpleImputer(strategy="mean")

# Lookup tables from encoded value to readable name, as exposed by the handler.
data_type_names = handler.encoding_map['data_type']
metric_names = handler.encoding_map['metric']

# The loop only rewrites numeric feature columns, so the set of data types and
# metrics is the same for every feature subset and can be computed once.
data_types = df['data_type'].unique()
metrics = df['metric'].unique()

# For each feature subset: impute, fit a scaler on the whole dataset, then run
# KMeans separately on every (data type, metric) slice, report the silhouette
# score and top cluster, and draw a heatmap of normalized cluster means.
for idx, selected_columns in enumerate(feature_combinations, start=1):
    print(f"\n\n--- Feature Set {idx}: Using Columns {selected_columns} ---\n")

    # Impute in place, then fit the scaler on the full dataset so that every
    # slice below is standardized on the same scale.
    df[selected_columns] = imputer.fit_transform(df[selected_columns])
    scaler = StandardScaler()
    scaler.fit(df[selected_columns])

    # Clustered slices collected for the per-feature-set summary.
    all_clusters = []

    print("Data Types: " + ' and '.join(data_type_names[i] for i in data_types))
    print("Metrics: " + ' and '.join(metric_names[i] for i in metrics))

    for data_type in data_types:
        for metric in metrics:
            # Filter dataset by data type and metric
            df_filtered = df[(df['data_type'] == data_type) & (df['metric'] == metric)].copy()

            # Skip if no data for this combination
            if df_filtered.empty:
                continue

            label = f"{data_type_names[data_type]} - {metric_names[metric]}"

            # KMeans needs at least N_CLUSTERS samples and the silhouette score
            # needs more samples than clusters; skip slices that are too small
            # instead of aborting the whole run.
            if len(df_filtered) <= N_CLUSTERS:
                print(f"Skipping {label}: only {len(df_filtered)} rows for {N_CLUSTERS} clusters")
                continue

            # Standardize once and reuse for both clustering and scoring.
            scaled = scaler.transform(df_filtered[selected_columns])

            kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0)
            df_filtered['Cluster'] = kmeans.fit_predict(scaled)

            # The silhouette score is undefined with fewer than two distinct
            # labels (possible when the slice is made of duplicate points).
            if df_filtered['Cluster'].nunique() >= 2:
                silhouette_avg = silhouette_score(scaled, df_filtered['Cluster'])
                print(f"Silhouette Score for {label} with {N_CLUSTERS} Clusters: {silhouette_avg}")
            else:
                print(f"Silhouette Score for {label} with {N_CLUSTERS} Clusters: undefined (single cluster)")

            # Identify the highest performance cluster by mean of 'mean_measure'
            cluster_performance = df_filtered.groupby('Cluster')['mean_measure'].mean()
            top_cluster = cluster_performance.idxmax()
            print(f"Top Cluster for {label}: Cluster {top_cluster}")

            all_clusters.append(df_filtered)

            # Z-score each feature across the cluster means so features with
            # different units share one colour scale. A feature whose cluster
            # means are all equal has zero spread and shows up as NaN (blank).
            cluster_means = df_filtered.groupby('Cluster')[selected_columns].mean()
            cluster_means_normalized = (cluster_means - cluster_means.mean()) / cluster_means.std()
            cluster_means_normalized = cluster_means_normalized.T  # features as rows

            # Generate the heatmap
            plt.figure(figsize=(12, 8))
            sns.heatmap(cluster_means_normalized, annot=True, cmap="viridis", fmt=".2f", cbar=True)
            plt.title(f'Normalized Mean Values Across Clusters (Feature Set {idx}) - {label}')
            plt.xlabel("Cluster")
            plt.ylabel("Feature")
            plt.show()

    # pd.concat raises on an empty list, which happens when every slice was skipped.
    if not all_clusters:
        print(f"\nNo clusters produced for Feature Set {idx}.")
        continue

    # Concatenate all clusters into a DataFrame and report cluster sizes
    df_clusters = pd.concat(all_clusters)
    cluster_summary = df_clusters.groupby(['data_type', 'metric', 'Cluster']).size()
    print(f"\nCluster Summary by Data Type and Metric for Feature Set {idx}:\n", cluster_summary)
