# KMeans Notebook

### Dependencies you'll need

In [None]:
!pip install psycopg2-binary scikit-learn
!pip install matplotlib seaborn

## ---

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import sys
import os

# Add the project root directory to the system path.
# '__file__' is a string literal here (presumably because notebooks do not define
# __file__): dirname() of it is '', so this resolves to the parent of the current
# working directory.
project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))
if project_root not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(project_root)
from data_access.postgres_handler import PostgresHandler

# Initialize the PostgresHandler
# NOTE(review): credentials are hard-coded in the notebook; consider loading them from
# the environment or a config file that is kept out of version control.
# NOTE(review): 1433 is the conventional SQL Server port (PostgreSQL defaults to 5432) - confirm.
handler = PostgresHandler(
    database="nutanix",
    user="postgres",
    host='172.25.221.34',
    password="Senna",
    port=1433
)

# Define the columns to fetch from the database
# NOTE(review): the other cells request 'capacity_gib' / 'operating_pci_speed_gts' in
# lower case; confirm which casing the table actually uses.
columns = [
    'concord_id', 'data_type', 'metric', 'queue_depth', 'num_jobs',
    'blocksize', 'unit', 'min_measure', 'mean_measure',
    'median_measure', 'max_measure', 'stddev_measure', 'device_type',
    'family', 'vendor', 'model', 'firmware', 'capacity_GiB',
    'operating_pci_speed_GTs', 'operating_pci_width', 'linkrate_Gbs',
    'name', 'reference', 'created'
]

# Step 1: Data Preparation
# The connection is only needed for the fetch, so release it immediately (and even
# if the fetch raises) instead of holding it open for the whole analysis.
handler.connect()
try:
    df = handler.get_data("ssd_clean_data", columns, limit=None, encode=False)
finally:
    handler.disconnect()

# Check the columns in the DataFrame
print("Available columns in DataFrame:", df.columns.tolist())

# Steps 2-3: The candidate features are exactly the requested columns; keep only the
# ones that actually came back so a missing column does not break the pipeline.
features = [feature for feature in columns if feature in df.columns]
print("Using the following features for clustering:", features)

# Define categorical and numerical features based on available columns.
# 'concord_id' and 'created' are in neither list, so the ColumnTransformer below
# (default remainder='drop') leaves them out of the clustering input.
categorical_candidates = [
    'data_type', 'metric', 'unit', 'device_type', 'family',
    'vendor', 'model', 'firmware', 'name', 'reference'
]
numerical_candidates = [
    'queue_depth', 'num_jobs', 'blocksize', 'min_measure', 'mean_measure',
    'median_measure', 'max_measure', 'stddev_measure', 'capacity_GiB',
    'operating_pci_speed_GTs', 'operating_pci_width', 'linkrate_Gbs'
]
categorical_features = [feature for feature in categorical_candidates if feature in df.columns]
numerical_features = [feature for feature in numerical_candidates if feature in df.columns]

# Define the imputer for numerical features
numerical_imputer = SimpleImputer(strategy='mean')  # You can choose other strategies like 'median' or 'most_frequent'

# Preprocessing pipeline with imputer: numeric columns are mean-imputed then
# standardized, categorical columns are one-hot encoded into a dense matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', numerical_imputer),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False), categorical_features)
    ]
)

# Step 4: Clustering
kmeans = KMeans(n_clusters=5, random_state=42)  # You can choose the number of clusters

# Create a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('clusterer', kmeans)])

# Fit the model
pipeline.fit(df[features])

# Run the fitted preprocessor once and reuse the matrix for prediction, scoring and
# PCA instead of re-transforming the full frame for every step.
X_transformed = pipeline.named_steps['preprocessor'].transform(df[features])

# Step 5: Evaluation
df['cluster'] = pipeline.named_steps['clusterer'].predict(X_transformed)

# Calculate silhouette score
# NOTE(review): silhouette_score computes pairwise distances; with limit=None this may
# be slow on a large table - consider its sample_size parameter.
silhouette_avg = silhouette_score(X_transformed, df['cluster'])
print(f'Silhouette Score: {silhouette_avg:.2f}')

# Visualize the clusters
# Step 6: Dimensionality Reduction for Visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_transformed)

# Create a DataFrame for the PCA results
pca_df = pd.DataFrame(data=X_pca, columns=['PCA1', 'PCA2'])
# Assign by position: pca_df has a fresh 0..n-1 index, so index-aligned assignment
# would produce NaN labels if df's index is not the default RangeIndex.
pca_df['cluster'] = df['cluster'].to_numpy()

# Plotting
plt.figure(figsize=(10, 7))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='cluster', palette='viridis', s=100, alpha=0.7)
plt.title('KMeans Clustering Visualization (PCA)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.grid()
plt.show()

# Check cluster assignments
print(df[['cluster'] + features].head())


In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Add the project root directory to the system path.
# '__file__' is a string literal here (presumably because notebooks do not define
# __file__): dirname() of it is '', so this resolves to the parent of the current
# working directory.
project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))
if project_root not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(project_root)
from data_access.postgres_handler import PostgresHandler

# Initialize the PostgresHandler
# NOTE(review): credentials are hard-coded in the notebook; consider loading them from
# the environment or a config file that is kept out of version control.
# NOTE(review): 1433 is the conventional SQL Server port (PostgreSQL defaults to 5432) - confirm.
handler = PostgresHandler(
    database="nutanix",
    user="postgres",
    host='172.25.221.34',
    password="Senna",
    port=1433
)

# Define columns to fetch
columns = [
    'concord_id', 'data_type', 'metric', 'queue_depth', 'num_jobs', 'blocksize', 'min_measure',
    'mean_measure', 'median_measure', 'max_measure', 'stddev_measure', 'device_type',
    'family', 'vendor', 'model', 'firmware', 'capacity_GiB', 'operating_pci_speed_GTs',
    'operating_pci_width', 'linkrate_Gbs', 'name', 'reference', 'created'
]

# Fetch a small encoded sample; always disconnect, even if the fetch raises.
handler.connect()
try:
    df = handler.get_data("ssd_clean_data", columns, limit=100, encode=True)
finally:
    handler.disconnect()

# Check the encoding map if any encoding was applied and print the DataFrame
print("Encoding Map:", handler.encoding_map)
print(df.head())

# Data Preprocessing: Select numeric columns and fill missing values
numeric_columns = [
    'queue_depth', 'num_jobs', 'blocksize', 'min_measure', 'mean_measure', 'median_measure',
    'max_measure', 'stddev_measure'
]

# Impute missing values in numeric columns (written back into df in place)
imputer = SimpleImputer(strategy="mean")
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

# Standardize numeric features
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[numeric_columns])

# Determine the optimal number of clusters with the elbow method:
# record the inertia of a fresh KMeans fit for k = 1..10.
k_values = range(1, 11)
inertia = [
    KMeans(n_clusters=k, random_state=0).fit(df_scaled).inertia_
    for k in k_values
]

# Plot the elbow curve
plt.figure(figsize=(8, 4))
plt.plot(k_values, inertia, marker='o', linestyle='-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()

# Apply KMeans with 6 clusters and calculate silhouette score
kmeans = KMeans(n_clusters=6, random_state=0)
df['Cluster'] = kmeans.fit_predict(df_scaled)
silhouette_avg = silhouette_score(df_scaled, kmeans.labels_)
print(f"Silhouette Score for 6 Clusters: {silhouette_avg}")

# Use PCA for 2D visualization of clusters
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_scaled)

plt.figure(figsize=(10, 6))
scatter = plt.scatter(df_pca[:, 0], df_pca[:, 1], c=df['Cluster'], cmap='viridis', s=50)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('KMeans Clusters with 2D PCA')
# Take handles AND labels from legend_elements() so they are guaranteed to match;
# pairing the handles with an unordered set of labels could mislabel clusters.
legend_handles, legend_labels = scatter.legend_elements()
plt.legend(legend_handles, legend_labels, title="Clusters")
plt.colorbar(scatter, label='Cluster')
plt.show()

# Display cluster summary by data_type
cluster_summary = df.groupby(['data_type', 'Cluster']).size()
print("Cluster Summary by Data Type:\n", cluster_summary)


In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Add the project root directory to the system path.
# '__file__' is a string literal here (presumably because notebooks do not define
# __file__): dirname() of it is '', so this resolves to the parent of the current
# working directory.
project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))
if project_root not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(project_root)
from data_access.postgres_handler import PostgresHandler

# Initialize the PostgresHandler
# NOTE(review): credentials are hard-coded in the notebook; consider loading them from
# the environment or a config file that is kept out of version control.
# NOTE(review): 1433 is the conventional SQL Server port (PostgreSQL defaults to 5432) - confirm.
handler = PostgresHandler(
    database="nutanix",
    user="postgres",
    host='172.25.221.34',
    password="Senna",
    port=1433
)

# Define columns to fetch
columns = [
    'concord_id', 'data_type', 'metric', 'queue_depth', 'num_jobs', 'blocksize', 'min_measure',
    'mean_measure', 'median_measure', 'max_measure', 'stddev_measure', 'capacity_gib',
    'operating_pci_speed_gts', 'operating_pci_width'
]

# Fetch the encoded data; always disconnect, even if the fetch raises.
handler.connect()
try:
    df = handler.get_data("ssd_clean_data", columns, limit=100000, encode=True)
finally:
    handler.disconnect()

# Check encoding map and data preview
print("Encoding Map:", handler.encoding_map)
print(df.head())

# Define numeric columns for clustering
numeric_columns = [
    'queue_depth', 'num_jobs', 'blocksize', 'min_measure', 'mean_measure', 'median_measure',
    'max_measure', 'stddev_measure', 'capacity_gib', 'operating_pci_speed_gts',
    'operating_pci_width'
]

# Number of clusters fitted for every data type
N_CLUSTERS = 6

# Impute missing values in numeric columns (written back into df in place)
imputer = SimpleImputer(strategy="mean")
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

# Standardize numeric features once over the whole frame; each group below reuses
# its rows of this matrix instead of re-running scaler.transform several times.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[numeric_columns])

# Initialize empty list to collect clusters
all_clusters = []

# Iterate over each data type and apply KMeans clustering
data_types = df['data_type'].unique()
for data_type in data_types:
    # Filter dataset by data type
    group_mask = (df['data_type'] == data_type).to_numpy()
    n_rows = int(group_mask.sum())

    # KMeans needs at least N_CLUSTERS rows and silhouette_score needs more rows
    # than labels, so groups that are too small are skipped instead of crashing.
    if n_rows <= N_CLUSTERS:
        print(f"Skipping {data_type}: {n_rows} rows is too few for {N_CLUSTERS} clusters")
        continue

    df_filtered = df[group_mask].copy()
    X_group = df_scaled[group_mask]

    # Apply KMeans for 6 clusters
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0)
    df_filtered['Cluster'] = kmeans.fit_predict(X_group)

    # Calculate silhouette score
    silhouette_avg = silhouette_score(X_group, df_filtered['Cluster'])
    print(f"Silhouette Score for {data_type} with {N_CLUSTERS} Clusters: {silhouette_avg}")

    # Keep the labelled group for the combined summary below
    all_clusters.append(df_filtered)

    # Use PCA for 2D visualization of clusters
    pca = PCA(n_components=2)
    df_pca = pca.fit_transform(X_group)

    # Plot the clusters for the current data type
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(df_pca[:, 0], df_pca[:, 1], c=df_filtered['Cluster'], cmap='viridis', s=50)
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.title(f'KMeans Clusters for {data_type} ({N_CLUSTERS} clusters)')
    plt.colorbar(scatter, label='Cluster')
    plt.show()

# pd.concat raises on an empty list, so only summarize when something was clustered.
if all_clusters:
    # Concatenate all clusters into the main DataFrame
    df_clusters = pd.concat(all_clusters)

    # Analyze cluster distribution by data type
    cluster_summary = df_clusters.groupby(['data_type', 'Cluster']).size()
    print("Cluster Summary by Data Type:\n", cluster_summary)
else:
    print("No data type had enough rows to cluster.")


# Final 

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Add the project root directory to the system path.
# '__file__' is a string literal here (presumably because notebooks do not define
# __file__): dirname() of it is '', so this resolves to the parent of the current
# working directory.
project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))
if project_root not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(project_root)
from data_access.postgres_handler import PostgresHandler

# Initialize the PostgresHandler
# NOTE(review): credentials are hard-coded in the notebook; consider loading them from
# the environment or a config file that is kept out of version control.
# NOTE(review): 1433 is the conventional SQL Server port (PostgreSQL defaults to 5432) - confirm.
handler = PostgresHandler(
    database="nutanix",
    user="postgres",
    host='172.25.221.34',
    password="Senna",
    port=1433
)

# Define columns to fetch
columns = [
    'concord_id', 'data_type', 'metric', 'queue_depth', 'num_jobs', 'blocksize', 'min_measure',
    'mean_measure', 'median_measure', 'max_measure', 'stddev_measure', 'capacity_gib',
    'operating_pci_speed_gts', 'operating_pci_width', 'device_type', 'model'
]

# Fetch the encoded data; always disconnect, even if the fetch raises.
handler.connect()
try:
    df = handler.get_data("ssd_clean_data", columns, limit=None, encode=True)
finally:
    handler.disconnect()

# Define numeric columns for clustering
numeric_columns = [
    'queue_depth', 'num_jobs', 'blocksize', 'min_measure', 'mean_measure', 'median_measure',
    'max_measure', 'stddev_measure', 'capacity_gib', 'operating_pci_speed_gts',
    'operating_pci_width'
]

# Number of clusters fitted for every data type
N_CLUSTERS = 6

# Impute missing values in numeric columns (written back into df in place)
imputer = SimpleImputer(strategy="mean")
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

# Standardize numeric features once over the whole frame; each group below reuses
# its rows of this matrix instead of re-running scaler.transform several times.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[numeric_columns])

# Initialize empty list to collect clusters
all_clusters = []

# Iterate over each data type and apply KMeans clustering
data_types = df['data_type'].unique()
print("Data Types: " + ' and '.join(handler.encoding_map['data_type'][i] for i in data_types))
for data_type in data_types:
    # Readable name of the encoded data type, looked up once per group
    type_label = handler.encoding_map['data_type'][data_type]

    # Filter dataset by data type
    group_mask = (df['data_type'] == data_type).to_numpy()
    n_rows = int(group_mask.sum())

    # KMeans needs at least N_CLUSTERS rows and silhouette_score needs more rows
    # than labels, so groups that are too small are skipped instead of crashing.
    if n_rows <= N_CLUSTERS:
        print(f"Skipping {type_label}: {n_rows} rows is too few for {N_CLUSTERS} clusters")
        continue

    df_filtered = df[group_mask].copy()
    X_group = df_scaled[group_mask]

    # Apply KMeans for 6 clusters
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0)
    df_filtered['Cluster'] = kmeans.fit_predict(X_group)

    # Calculate silhouette score
    silhouette_avg = silhouette_score(X_group, df_filtered['Cluster'])
    print(f"Silhouette Score for {type_label} with {N_CLUSTERS} Clusters: {silhouette_avg}")

    # Identify the highest performance cluster by mean of 'mean_measure' metric
    # NOTE(review): this treats a larger mean_measure as better; confirm that holds
    # for every data type (it would be inverted for latency-style measurements).
    cluster_performance = df_filtered.groupby('Cluster')['mean_measure'].mean()
    top_cluster = cluster_performance.idxmax()
    print(f"Top Cluster for {type_label}: Cluster {top_cluster}")

    # Filter top cluster data
    top_cluster_data = df_filtered[df_filtered['Cluster'] == top_cluster]

    # Find top-performing device type and model within the top cluster
    # (idxmax on the two-level group index yields a (device_type, model) tuple)
    top_device = top_cluster_data.groupby(['device_type', 'model'])['mean_measure'].mean().idxmax()
    print(f"Top-performing Device and Model for {type_label}: {top_device}")

    # Keep the labelled group for the combined summary below
    all_clusters.append(df_filtered)

    # Use PCA for 2D visualization of clusters
    pca = PCA(n_components=2)
    df_pca = pca.fit_transform(X_group)

    # Plot the clusters for the current data type
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(df_pca[:, 0], df_pca[:, 1], c=df_filtered['Cluster'], cmap='viridis', s=50)
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    # The label is pre-computed so the f-string never reuses its own quote character,
    # which is a SyntaxError on Python versions before 3.12.
    plt.title(f'KMeans Clusters for {type_label} ({N_CLUSTERS} clusters)')
    plt.colorbar(scatter, label='Cluster')
    plt.show()

# pd.concat raises on an empty list, so only summarize when something was clustered.
if all_clusters:
    # Concatenate all clusters into the main DataFrame
    df_clusters = pd.concat(all_clusters)

    # Analyze cluster distribution by data type
    cluster_summary = df_clusters.groupby(['data_type', 'Cluster']).size()
    print("Cluster Summary by Data Type:\n", cluster_summary)
else:
    print("No data type had enough rows to cluster.")


# Maybe - Take a look at this @Thomas

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Add the project root directory to the system path.
# '__file__' is a string literal here (presumably because notebooks do not define
# __file__): dirname() of it is '', so this resolves to the parent of the current
# working directory.
project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))
if project_root not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(project_root)
from data_access.postgres_handler import PostgresHandler

# Initialize the PostgresHandler
# NOTE(review): credentials are hard-coded in the notebook; consider loading them from
# the environment or a config file that is kept out of version control.
# NOTE(review): 1433 is the conventional SQL Server port (PostgreSQL defaults to 5432) - confirm.
handler = PostgresHandler(
    database="nutanix",
    user="postgres",
    host='172.25.221.34',
    password="Senna",
    port=1433
)

# Define columns to fetch
columns = [
    'concord_id', 'data_type', 'metric', 'queue_depth', 'num_jobs', 'blocksize', 'min_measure',
    'mean_measure', 'median_measure', 'max_measure', 'stddev_measure', 'capacity_gib',
    'operating_pci_speed_gts', 'operating_pci_width', 'device_type', 'model'
]

# Fetch the encoded data; always disconnect, even if the fetch raises.
handler.connect()
try:
    df = handler.get_data("ssd_clean_data", columns, limit=None, encode=True)
finally:
    handler.disconnect()

# Define numeric columns for clustering
numeric_columns = [
    'queue_depth', 'num_jobs', 'blocksize', 'min_measure', 'mean_measure', 'median_measure',
    'max_measure', 'stddev_measure', 'capacity_gib', 'operating_pci_speed_gts',
    'operating_pci_width'
]

# Number of clusters fitted for every (data type, metric) group
N_CLUSTERS = 6

# Impute missing values in numeric columns (written back into df in place)
imputer = SimpleImputer(strategy="mean")
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

# Standardize numeric features once over the whole frame; each group below reuses
# its rows of this matrix instead of re-running scaler.transform several times.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[numeric_columns])

# Initialize empty list to collect clusters
all_clusters = []

# Iterate over each data type and metric and apply KMeans clustering
data_types = df['data_type'].unique()
metrics = df['metric'].unique()

print("Data Types: " + ' and '.join(handler.encoding_map['data_type'][i] for i in data_types))
print("Metrics: " + ' and '.join(handler.encoding_map['metric'][i] for i in metrics))

for data_type in data_types:
    # The data-type part of the filter does not depend on the metric, so build it once.
    type_mask = (df['data_type'] == data_type).to_numpy()
    type_label = handler.encoding_map['data_type'][data_type]

    for metric in metrics:
        # Filter dataset by data type and metric
        group_mask = type_mask & (df['metric'] == metric).to_numpy()
        n_rows = int(group_mask.sum())

        # Skip if no data for this combination
        if n_rows == 0:
            continue

        group_label = f"{type_label} - {handler.encoding_map['metric'][metric]}"

        # KMeans needs at least N_CLUSTERS rows and silhouette_score needs more rows
        # than labels, so groups that are too small are skipped instead of crashing.
        if n_rows <= N_CLUSTERS:
            print(f"Skipping {group_label}: {n_rows} rows is too few for {N_CLUSTERS} clusters")
            continue

        df_filtered = df[group_mask].copy()
        X_group = df_scaled[group_mask]

        # Apply KMeans for 6 clusters
        kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0)
        df_filtered['Cluster'] = kmeans.fit_predict(X_group)

        # Calculate silhouette score
        silhouette_avg = silhouette_score(X_group, df_filtered['Cluster'])
        print(f"Silhouette Score for {group_label} with {N_CLUSTERS} Clusters: {silhouette_avg}")

        # Identify the highest performance cluster by mean of 'mean_measure' metric
        # NOTE(review): this treats a larger mean_measure as better; confirm that holds
        # for every metric (it would be inverted for latency-style measurements).
        cluster_performance = df_filtered.groupby('Cluster')['mean_measure'].mean()
        top_cluster = cluster_performance.idxmax()
        print(f"Top Cluster for {group_label}: Cluster {top_cluster}")

        # Filter top cluster data
        top_cluster_data = df_filtered[df_filtered['Cluster'] == top_cluster]

        # Find top-performing device type and model within the top cluster
        # (idxmax on the two-level group index yields a (device_type, model) tuple)
        top_device = top_cluster_data.groupby(['device_type', 'model'])['mean_measure'].mean().idxmax()
        print(f"Top-performing Device and Model for {group_label}: {top_device}")

        # Keep the labelled group for the combined summary below
        all_clusters.append(df_filtered)

        # Use PCA for 2D visualization of clusters
        pca = PCA(n_components=2)
        df_pca = pca.fit_transform(X_group)

        # Plot the clusters for the current data type and metric
        plt.figure(figsize=(8, 6))
        scatter = plt.scatter(df_pca[:, 0], df_pca[:, 1], c=df_filtered['Cluster'], cmap='viridis', s=50)
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.title(f'KMeans Clusters for {group_label} ({N_CLUSTERS} clusters)')
        plt.colorbar(scatter, label='Cluster')
        plt.show()

# pd.concat raises on an empty list, so only summarize when something was clustered.
if all_clusters:
    # Concatenate all clusters into the main DataFrame
    df_clusters = pd.concat(all_clusters)

    # Analyze cluster distribution by data type and metric
    cluster_summary = df_clusters.groupby(['data_type', 'metric', 'Cluster']).size()
    print("Cluster Summary by Data Type and Metric:\n", cluster_summary)
else:
    print("No data type / metric combination had enough rows to cluster.")
