In [None]:
pip install psycopg2-binary scikit-learn

In [None]:
pip install psycopg2-binary scikit-learn    

In [None]:
pip install scikit-learn-extra

In [None]:
pip install --upgrade pip


In [None]:

pip install num

# Connect to Server

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score


# Add the project root directory to the system path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))
from data_access.postgres_handler import PostgresHandler

# Load the SSD benchmark rows from PostgreSQL and prepare `df` for the clustering cells.
#
# Connection settings are read from the environment (libpq-style PG* names) so that
# credentials are not stored in the notebook. Non-secret settings fall back to the
# values this notebook has always used; the password has no fallback and is prompted
# for when PGPASSWORD is unset (set PGPASSWORD for unattended "Run All" executions).
from getpass import getpass

handler = PostgresHandler(
    database=os.environ.get("PGDATABASE", "nutanix"),
    user=os.environ.get("PGUSER", "postgres"),
    host=os.environ.get("PGHOST", "172.25.221.34"),
    password=os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: "),
    # NOTE(review): 1433 is the SQL Server default port; PostgreSQL's default is 5432 — confirm.
    port=int(os.environ.get("PGPORT", "1433")),
)

# Columns to fetch from the table
columns = [
    'concord_id', 'data_type', 'name', 'metric', 'queue_depth', 'num_jobs', 'blocksize', 'unit',
    'min_measure', 'mean_measure', 'median_measure', 'max_measure', 'stddev_measure',
    'capacity_gib', 'device_type', 'model', 'operating_pci_speed_gts', 'operating_pci_width',
]

handler.connect()
try:
    df = handler.get_data("ssd_clean_data", columns, limit=None, encode=False)
finally:
    # Always release the connection, even when the fetch raises.
    handler.disconnect()

# Keep only the Random Read workload; .copy() detaches the result from the fetched frame
# so later column assignments do not trigger SettingWithCopyWarning.
df = df[df['data_type'] == 'Random Read'].copy()

# Numeric columns available for clustering.
# NOTE(review): an earlier comment described the *_measure statistics as removed, but they
# are still listed here; the later cells build their own feature lists regardless.
numeric_columns = [
    'queue_depth', 'num_jobs', 'blocksize', 'min_measure',
    'mean_measure', 'median_measure', 'max_measure', 'stddev_measure',
    'operating_pci_speed_gts', 'operating_pci_width',
]

# One-hot encode the metric name into metric_* indicator columns.
df = pd.get_dummies(df, columns=['metric'])
display(df)



In [None]:
pip install umap-learn

In [None]:
pip install numpy==2.0

# Random Read Clustering

## Applied PCA (reduce features, keeping 95% of variance) + Outliers + Duplicates

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import necessary modules
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.manifold import TSNE

# KMeans pipeline: standardize -> IQR outlier removal -> PCA -> KMeans -> metrics and plots.
# Assumes `df` is the one-hot-encoded frame produced by the data-loading cell.
import inspect  # local import: only needed for the TSNE keyword compatibility check below

# Define input columns for clustering
continuous_columns = [
    'blocksize',
    'num_jobs',
    'queue_depth',
    'operating_pci_speed_gts',
    'operating_pci_width',
]
metric_columns = [col for col in df.columns if col.startswith('metric_')]
input_columns = continuous_columns + metric_columns

# Use a positional index so DataFrame rows and NumPy rows line up one-to-one.
df = df.reset_index(drop=True)

# Step 1: Data scaling using StandardScaler (zero mean, unit variance per feature)
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[input_columns])

# Step 2: Outlier detection and removal using the 1.5 * IQR rule.
# The fences are computed on the continuous features only: a one-hot indicator whose
# quartiles coincide has IQR == 0, so the rule would flag every row of the less common
# categories (or every row when there are many categories).
scaled_continuous = df_scaled[:, :len(continuous_columns)]
Q1 = np.percentile(scaled_continuous, 25, axis=0)
Q3 = np.percentile(scaled_continuous, 75, axis=0)
IQR = Q3 - Q1

# Define outlier thresholds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is an outlier when any continuous feature falls outside its fences.
outlier_mask = ((scaled_continuous < lower_bound) | (scaled_continuous > upper_bound)).any(axis=1)
outlier_indices = np.flatnonzero(outlier_mask).tolist()
df_no_outliers = df.loc[~outlier_mask].reset_index(drop=True)
df_scaled_no_outliers = df_scaled[~outlier_mask]
print(f"Number of outliers removed: {len(outlier_indices)}")

if df_no_outliers.empty:
    raise ValueError("IQR filtering removed every row; nothing left to cluster.")

# Step 3: Apply PCA to reduce dimensionality while retaining 95% of variance
pca = PCA(n_components=0.95, random_state=42)
df_pca_no_outliers = pca.fit_transform(df_scaled_no_outliers)

print("Variance explained by each PCA component:", pca.explained_variance_ratio_)
print(f"Original number of features: {df_scaled_no_outliers.shape[1]}")
print(f"Reduced number of features after PCA: {df_pca_no_outliers.shape[1]}")

# Step 4: Run KMeans clustering with 8 clusters
n_clusters = 8
kmeans = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=42
)
df_no_outliers['Cluster'] = kmeans.fit_predict(df_pca_no_outliers)
cluster_labels = df_no_outliers['Cluster']

# Step 5: Evaluate clustering (silhouette: higher is better; Calinski-Harabasz: higher is
# better; Davies-Bouldin: lower is better)
silhouette_avg = silhouette_score(df_pca_no_outliers, cluster_labels)
ch_score = calinski_harabasz_score(df_pca_no_outliers, cluster_labels)
db_score = davies_bouldin_score(df_pca_no_outliers, cluster_labels)

print(f"Silhouette Score: {silhouette_avg}")
print(f"Calinski-Harabasz Index: {ch_score}")
print(f"Davies-Bouldin Index: {db_score}")

# mean_measure is not a clustering input; it is summarized per cluster for interpretation.
cluster_mean_measure = df_no_outliers.groupby('Cluster')['mean_measure'].mean()
print("Mean of 'mean_measure' for each cluster:")
print(cluster_mean_measure)

# Step 6: Analyze cluster characteristics
cluster_numerical_stats = df_no_outliers.groupby('Cluster')[input_columns].mean()
print("Average Numerical Stats for Each Cluster:")
print(cluster_numerical_stats)

# Step 7: Visualize clusters using the first two PCA components
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(df_pca_no_outliers[:, 0], df_pca_no_outliers[:, 1], c=cluster_labels, cmap='tab10', s=50)
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_title("KMeans Clusters Visualized with PCA Components")
fig.colorbar(scatter, ax=ax, label='Cluster')
plt.show()

# Step 8: Visualize clusters using t-SNE
# scikit-learn renamed TSNE's `n_iter` argument to `max_iter`; use whichever keyword the
# installed version accepts.
tsne_iter_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"
# t-SNE requires perplexity < number of samples.
tsne_perplexity = min(30, len(df_pca_no_outliers) - 1)
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42, **{tsne_iter_kwarg: 500})
df_tsne_no_outliers = tsne.fit_transform(df_pca_no_outliers)

fig, ax = plt.subplots(figsize=(8, 6))
tsne_scatter = ax.scatter(df_tsne_no_outliers[:, 0], df_tsne_no_outliers[:, 1], c=cluster_labels, cmap='tab10', s=50)
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_title("t-SNE Visualization of KMeans Clusters")
fig.colorbar(tsne_scatter, ax=ax, label='Cluster')
plt.show()

# Step 9: Heatmap of per-cluster feature means, z-scored across clusters so features on
# different scales are comparable (a feature that is constant across clusters yields NaN).
cluster_means = cluster_numerical_stats
cluster_means_normalized = (cluster_means - cluster_means.mean()) / cluster_means.std()

plt.figure(figsize=(12, 8))
sns.heatmap(cluster_means_normalized.T, annot=True, cmap="coolwarm", fmt=".2f", cbar=True, linewidths=0.5)
plt.title('Normalized Mean Values of Input Features Across Clusters')
plt.xlabel("Cluster")
plt.ylabel("Feature")
plt.show()


In [None]:
pip install networkx

# Spectral Clustering

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Spectral clustering on the first three principal components of the standardized features.
# Assumes `df` (with one-hot metric_* columns) was produced by the data-loading cell.

# Define numeric columns for clustering, including metric columns if available
metric_columns = [col for col in df.columns if col.startswith('metric_')]
numeric_columns = ['blocksize', 'num_jobs', 'queue_depth', 'operating_pci_speed_gts', 'operating_pci_width'] + metric_columns

# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numeric_columns])

# Project onto three principal components: all three feed the clustering,
# the first two are used for the scatter plot.
pca = PCA(n_components=3)
X_principal = pd.DataFrame(pca.fit_transform(scaled_features), columns=['P1', 'P2', 'P3'])

display(X_principal)

# Build the clustering model; random_state makes the labels reproducible across runs.
spectral_model = SpectralClustering(n_clusters=4, affinity='rbf', random_state=42)

# Fit once and keep the predicted cluster labels
labels = spectral_model.fit_predict(X_principal)

# Visualize the clustering, coloring by the labels computed above (no second fit)
plt.scatter(X_principal['P1'], X_principal['P2'], c=labels, cmap=plt.cm.Set1)
plt.xlabel('P1')
plt.ylabel('P2')
plt.title("Spectral clustering")
plt.show()



In [None]:
# KMeans labels are stored on `df_no_outliers` (the outlier-filtered frame); `df` itself
# never receives a 'Cluster' column, so indexing it would raise KeyError.
display(df_no_outliers['Cluster'])