# Connect to DF

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score


# Make the project root importable. A notebook has no real __file__, so the
# path is resolved against the current working directory: its parent directory
# is treated as the project root.
project_root = os.path.abspath(os.pardir)
if project_root not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(project_root)
from data_access.postgres_handler import PostgresHandler

# Initialize the PostgresHandler
# NOTE(review): connection credentials are hard-coded here; consider loading
# them from environment variables or a secrets store instead.
# NOTE(review): 1433 is the conventional SQL Server port (PostgreSQL defaults
# to 5432) -- confirm this is the intended port for this server.
handler = PostgresHandler(
    database="nutanix",
    user="postgres",
    host='172.25.221.34',
    password="Senna",
    port=1433
)

# Columns to fetch from the "ssd_clean_data" table
columns = [
    'concord_id', 'data_type', 'name', 'metric', 'queue_depth', 'num_jobs', 'blocksize', 'unit', 'min_measure',
    'mean_measure', 'median_measure', 'max_measure', 'stddev_measure', 'capacity_gib',
    'device_type', 'model', 'operating_pci_speed_gts', 'operating_pci_width',
]

handler.connect()
try:
    df = handler.get_data("ssd_clean_data", columns, limit=None, encode=False)
finally:
    # Always release the connection, even if the fetch raises
    handler.disconnect()

# Numeric columns available for clustering. The measure columns
# (min/mean/median/max/stddev) are still listed here; the clustering cell
# selects its own subset of input features.
numeric_columns = [
    'queue_depth', 'num_jobs', 'blocksize', 'min_measure',
    'mean_measure', 'median_measure', 'max_measure', 'stddev_measure', 'operating_pci_speed_gts', 'operating_pci_width',
]

# One-hot encode the categorical 'metric' column, then show the result
df = pd.get_dummies(df, columns=['metric'])
display(df)



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import necessary modules
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.manifold import TSNE

# Workload categories (values of the 'data_type' column) that are clustered
# independently of one another.
data_types = ["Random Read", "Random Write", "Sequential Write", "Sequential Read"]

# Input features for clustering. They are the same for every data type, so
# they are defined once instead of on every loop iteration.
# TODO: consider adding an "effective queue depth" feature (num_jobs * queue_depth)
input_columns = [
    'blocksize',
    'num_jobs',
    'queue_depth',
    'operating_pci_speed_gts',
    'operating_pci_width',
]

# Number of KMeans clusters fitted for each data type
n_clusters = 8

# Loop through each data type and apply the clustering analysis
for data_type in data_types:
    print(f"\nProcessing data type: {data_type}")

    # Filter the DataFrame for the current data type. The index is reset so
    # that row labels coincide with row positions in the scaled NumPy array.
    df_filtered = df[df['data_type'] == data_type].reset_index(drop=True)

    print(input_columns)

    # Nothing to scale or cluster when the data type has no rows
    if df_filtered.empty:
        print(f"No rows found for {data_type}; skipping.")
        continue

    # Step 1: Data Scaling using StandardScaler
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df_filtered[input_columns])

    # Step 2: Outlier Detection and Removal using IQR (per feature column)
    Q1 = np.percentile(df_scaled, 25, axis=0)
    Q3 = np.percentile(df_scaled, 75, axis=0)
    IQR = Q3 - Q1

    # Define outlier thresholds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # A row is an outlier if ANY of its features falls outside the bounds.
    # Positions equal index labels because of the reset_index above.
    outlier_mask = ((df_scaled < lower_bound) | (df_scaled > upper_bound)).any(axis=1)
    outlier_indices = np.flatnonzero(outlier_mask).tolist()

    df_no_outliers = df_filtered.drop(index=outlier_indices).reset_index(drop=True)
    df_scaled_no_outliers = df_scaled[~outlier_mask]
    print(f"Number of outliers removed for {data_type}: {len(outlier_indices)}")

    # KMeans needs at least n_clusters samples, and the evaluation scores need
    # more samples than clusters; skip data types that are too small.
    n_samples = df_scaled_no_outliers.shape[0]
    if n_samples <= n_clusters:
        print(
            f"Only {n_samples} rows left for {data_type} after outlier removal; "
            f"need more than {n_clusters} to cluster. Skipping."
        )
        continue

    # Step 3: Apply PCA to reduce dimensionality while retaining 95% of variance
    pca = PCA(n_components=0.95, random_state=42)
    df_pca_no_outliers = pca.fit_transform(df_scaled_no_outliers)

    print("Variance explained by each PCA component:", pca.explained_variance_ratio_)
    print(f"Original number of features: {df_scaled_no_outliers.shape[1]}")
    print(f"Reduced number of features after PCA: {df_pca_no_outliers.shape[1]}")

    # Step 4: Run KMeans clustering with n_clusters clusters
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=42
    )
    df_no_outliers['Cluster'] = kmeans.fit_predict(df_pca_no_outliers)

    # Step 5: Evaluate Clustering
    silhouette_avg = silhouette_score(df_pca_no_outliers, df_no_outliers['Cluster'])
    ch_score = calinski_harabasz_score(df_pca_no_outliers, df_no_outliers['Cluster'])
    db_score = davies_bouldin_score(df_pca_no_outliers, df_no_outliers['Cluster'])

    print(f"Silhouette Score for {data_type}: {silhouette_avg}")
    print(f"Calinski-Harabasz Index for {data_type}: {ch_score}")
    print(f"Davies-Bouldin Index for {data_type}: {db_score}")

    # Step 6: Analyze Cluster Characteristics
    cluster_counts = df_no_outliers['Cluster'].value_counts()
    print(f"\nNumber of items in each cluster for {data_type}:")
    print(cluster_counts)

    # Calculate the mean, standard deviation, and median of 'mean_measure' for each cluster
    cluster_stats = df_no_outliers.groupby('Cluster')['mean_measure'].agg(['mean', 'std', 'median'])
    print(df_no_outliers)
    print(f"Mean, Standard Deviation, and Median of 'mean_measure' for each cluster in {data_type}:")
    print(cluster_stats)

    # Calculate average numerical statistics for each cluster
    cluster_numerical_stats = df_no_outliers.groupby('Cluster')[input_columns].mean()
    print(f"Average Numerical Stats for Each Cluster in {data_type}:")
    print(cluster_numerical_stats)

    # Step 7: Visualize Clusters using PCA Components
    # PCA(n_components=0.95) may keep a single component; in that case plot it
    # against a zero axis instead of indexing a non-existent second column.
    pca_x = df_pca_no_outliers[:, 0]
    if df_pca_no_outliers.shape[1] > 1:
        pca_y = df_pca_no_outliers[:, 1]
    else:
        pca_y = np.zeros_like(pca_x)

    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(pca_x, pca_y, c=df_no_outliers['Cluster'], cmap='tab10', s=50)
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.title(f"KMeans Clusters for {data_type} Visualized with PCA Components")
    plt.colorbar(scatter, label='Cluster')
    plt.show()

    # Step 8: Visualize Clusters using t-SNE
    # t-SNE requires perplexity < n_samples, so cap it for small subsets.
    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
    # confirm against the installed version.
    tsne = TSNE(n_components=2, perplexity=min(30, n_samples - 1), n_iter=500, random_state=42)
    df_tsne_no_outliers = tsne.fit_transform(df_pca_no_outliers)

    plt.figure(figsize=(8, 6))
    tsne_scatter = plt.scatter(df_tsne_no_outliers[:, 0], df_tsne_no_outliers[:, 1], c=df_no_outliers['Cluster'], cmap='tab10', s=50)
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.title(f"t-SNE Visualization of KMeans Clusters for {data_type}")
    plt.colorbar(tsne_scatter, label='Cluster')
    plt.show()

    # Step 9: Generate a Heatmap of Mean Values for Each Feature Across Clusters
    # The per-cluster feature means were already computed in Step 6; they are
    # z-scored across clusters here so features on different scales are comparable.
    cluster_means = cluster_numerical_stats
    cluster_means_normalized = (cluster_means - cluster_means.mean()) / cluster_means.std()

    plt.figure(figsize=(12, 8))
    sns.heatmap(cluster_means_normalized.T, annot=True, cmap="coolwarm", fmt=".2f", cbar=True, linewidths=0.5)
    plt.title(f'Normalized Mean Values of Input Features Across Clusters for {data_type}')
    plt.xlabel("Cluster")
    plt.ylabel("Feature")
    plt.show()
