In [None]:
!pip install scikit-learn


In [None]:
import sys
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import OPTICS

# Make the project root (the parent of the notebook's working directory)
# importable. `__file__` is not defined inside a notebook, so the path is
# derived from the current working directory; this resolves to the same
# directory as the previous `dirname('__file__')` string-literal trick.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(project_root)
from data_access.postgres_handler import PostgresHandler

from getpass import getpass

# Connection settings. Non-secret values keep their previous defaults but can
# be overridden through the usual PG* environment variables. The password is
# never stored in the notebook: it is read from PGPASSWORD or prompted for.
# NOTE(review): 1433 is the conventional SQL Server port; PostgreSQL normally
# listens on 5432 -- confirm this default is intentional.
handler = PostgresHandler(
    database=os.environ.get("PGDATABASE", "nutanix"),
    user=os.environ.get("PGUSER", "postgres"),
    host=os.environ.get("PGHOST", "172.25.221.34"),
    password=os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: "),
    port=int(os.environ.get("PGPORT", "1433")),
)

handler.connect()


In [None]:
# Columns requested from the `ssd_clean_data` table.
columns = [
    'concord_id', 'data_type', 'metric', 'queue_depth', 'num_jobs',
    'blocksize', 'unit', 'min_measure', 'mean_measure', 'median_measure',
    'max_measure', 'stddev_measure', 'device_type', 'family', 'vendor',
    'model', 'firmware', 'capacity_GiB', 'operating_pci_speed_GTs',
    'operating_pci_width', 'Linkrate_Gbs', 'name', 'reference', 'created'
]

# Fetch the rows and always release the connection, even when the query
# raises, so a failed run does not leave a connection open.
try:
    df = handler.get_data("ssd_clean_data", columns, limit=100000, encode=True)
finally:
    handler.disconnect()
print(df.head())

# Select only 60% of the dataset (fixed seed, so the subset is reproducible).
# sample() keeps the original row labels, so the index of `df` is shuffled and
# non-contiguous from here on: attach arrays to it by position, not by index.
df = df.sample(frac=0.6, random_state=42)
print(df.shape)  # Verify the shape to ensure it's 60% of the original data

print(df.columns)


In [None]:
# Columns that the clustering cells write back onto `df`. They are labels, not
# features, so they are excluded here; this keeps the cell idempotent when it
# is re-run after clustering (on a fresh run they do not exist yet).
LABEL_COLUMNS = ['Cluster', 'PCA_OPTICS_Cluster']

# Select only numerical feature columns. Columns that are entirely missing are
# removed up front: SimpleImputer discards such columns, which would make the
# imputed array narrower than the list of column names re-attached below.
df_numerical = (
    df.select_dtypes(include=['float64', 'int64'])
    .drop(columns=LABEL_COLUMNS, errors='ignore')
    .dropna(axis=1, how='all')
)

# Impute missing values with the column mean
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df_numerical), columns=df_numerical.columns)

# Normalize every feature to the [0, 1] range.
# Both frames get a fresh 0..n-1 index; row order still matches `df`.
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df_imputed.columns)


In [None]:
from sklearn.decomposition import PCA

# Reduce to (at most) 10 dimensions using PCA. PCA cannot return more
# components than there are features or samples, so the target is capped
# instead of raising when the feature matrix is narrower than 10 columns.
n_components = min(10, *df_normalized.shape)
pca = PCA(n_components=n_components, random_state=42)
df_pca = pca.fit_transform(df_normalized)

# Apply OPTICS on the PCA-reduced data
optics_pca = OPTICS(min_samples=100, max_eps=1.0, xi=0.1)
pca_cluster_labels = optics_pca.fit_predict(df_pca)

# Add the cluster labels to the DataFrame. The labels are a plain array, so
# they are attached by position (row order of `df_pca` matches `df`).
df['PCA_OPTICS_Cluster'] = pca_cluster_labels
print(df['PCA_OPTICS_Cluster'].value_counts())


In [None]:
# Density-based clustering of the full normalized feature matrix.
optics = OPTICS(min_samples=300, max_eps=1.0, xi=0.1)

# Fitting stores one label per row on the estimator (-1 marks noise).
optics.fit(df_normalized)
cluster_labels = optics.labels_

# Attach the labels to the original DataFrame (by position).
df['Cluster'] = cluster_labels

# Show how many rows landed in each cluster.
cluster_sizes = df['Cluster'].value_counts()
print(cluster_sizes)


In [None]:
!pip install matplotlib


In [None]:
!pip install seaborn


In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Apply t-SNE to reduce dimensions to 2.
# The iteration count is left at its default of 1000 (the value that was
# passed before): the keyword was renamed from `n_iter` to `max_iter` in newer
# scikit-learn releases, so naming it explicitly breaks on some versions.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
df_tsne = pd.DataFrame(tsne.fit_transform(df_normalized), columns=['t-SNE1', 't-SNE2'])

# Add the cluster labels to the t-SNE DataFrame *by position*. `df` still
# carries the shuffled row labels left by sample(), whereas `df_tsne` has a
# fresh 0..n-1 index; assigning the Series directly would align on index and
# give most points the wrong label or NaN.
df_tsne['Cluster'] = df['Cluster'].to_numpy()

# Plot t-SNE with clusters
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df_tsne, x='t-SNE1', y='t-SNE2', hue='Cluster', palette='tab10', s=100, alpha=0.7)
plt.title('t-SNE of Data Points (OPTICS Clustering)')
plt.show()


In [None]:
# Per-cluster mean of every numeric feature. The cluster-label columns that
# earlier cells added to `df` are themselves integer-typed, so they are
# removed from the feature list: averaging label ids is meaningless, and
# 'Cluster' is already the grouping key.
numeric_columns = (
    df.select_dtypes(include=['float64', 'int64'])
    .drop(columns=['Cluster', 'PCA_OPTICS_Cluster'], errors='ignore')
    .columns
)
cluster_summary = df.groupby('Cluster')[numeric_columns].mean()

print(cluster_summary)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Reachability plot: reachability distance of every sample, laid out in the
# order in which OPTICS visited the samples.
ordering = optics.ordering_
reachability = optics.reachability_[ordering]
labels = optics.labels_[ordering]
space = np.arange(len(df_normalized))

fig, ax = plt.subplots(figsize=(12, 6))

# Full profile in black, then each cluster overlaid in its own colour.
ax.plot(space, reachability, 'k-', alpha=0.7)
for class_member in np.unique(labels[labels != -1]):  # -1 is noise: no overlay
    mask = labels == class_member
    ax.plot(space[mask], reachability[mask], marker='o', linestyle='-', alpha=0.7)

ax.set_xlabel('Sample Index')
ax.set_ylabel('Reachability Distance')
ax.set_title('Reachability Plot')
plt.show()


In [None]:
from sklearn.manifold import TSNE
from sklearn.cluster import OPTICS
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Print unique values in 'data_type' and 'metric' to determine encoding mappings
print("Unique values in 'data_type':", df['data_type'].unique())
print("Unique values in 'metric':", df['metric'].unique())

# One entry per workload to cluster. "type" and "metric" are the *encoded*
# values of the 'data_type' and 'metric' columns; "metric": None means no
# filter on 'metric'.
# NOTE(review): the codes below are hard-coded from the output printed above
# and must be re-checked whenever the encoding or the underlying data changes.
# NOTE(review): the entries without a metric filter also include the rows
# selected by the corresponding "... Latency" entry -- confirm this is intended.
data_types = [
    {"name": "Random Read Latency", "type": 0, "metric": 2},
    {"name": "Random Write Latency", "type": 1, "metric": 2},
    {"name": "Random Read", "type": 0, "metric": None},
    {"name": "Random Write", "type": 1, "metric": None},
    {"name": "Sequential Write", "type": 3, "metric": None},
    {"name": "Sequential Read", "type": 2, "metric": None}
]

# OPTICS clustering parameters
min_samples = 100
max_eps = 1.0
xi = 0.1

# Cluster-label columns written onto the frames by this notebook. They are
# integer-typed, so without this exclusion earlier clustering results would be
# picked up as input features of the per-workload clustering.
label_columns = ['Cluster', 'PCA_OPTICS_Cluster', 'OPTICS_Cluster']

# Loop through each feature type and apply OPTICS clustering.
# Note: the loop rebinds df_numerical, df_imputed, df_normalized, optics and
# cluster_labels, which earlier cells also use; re-run the notebook from the
# top before going back to those cells.
for item in data_types:
    name = item["name"]
    specific_type = item["type"]
    latency_metric = item["metric"]

    print(f"Processing {name}...")

    # Steps 2-3: build the full row filter first and copy once, so that
    # df_specific is an independent frame (no SettingWithCopyWarning when the
    # label column is added below).
    row_mask = df['data_type'] == specific_type
    if latency_metric is not None:
        row_mask &= df['metric'] == latency_metric
    df_specific = df.loc[row_mask].copy()

    # Check if df_specific is empty
    if df_specific.empty:
        print(f"No data found for data_type = '{specific_type}' with metric = '{latency_metric}'")
        continue
    print(f"Data found for {name}: {df_specific.shape[0]} rows")

    # OPTICS rejects min_samples larger than the number of rows; skip such
    # subsets with a message instead of aborting the whole loop.
    if df_specific.shape[0] < min_samples:
        print(f"Skipping {name}: fewer than min_samples={min_samples} rows")
        continue

    # Step 4: Select numerical feature columns. Label columns are excluded,
    # and so are columns that are entirely missing in this subset:
    # SimpleImputer would silently discard those, making the imputed array
    # narrower than the column names re-attached in step 5.
    df_numerical = (
        df_specific.select_dtypes(include=['float64', 'int64'])
        .drop(columns=label_columns, errors='ignore')
        .dropna(axis=1, how='all')
    )

    # Step 5: Handle missing values
    imputer = SimpleImputer(strategy='mean')
    df_imputed = pd.DataFrame(imputer.fit_transform(df_numerical), columns=df_numerical.columns)

    # Step 6: Normalize data
    scaler = MinMaxScaler()
    df_normalized = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df_imputed.columns)

    # Step 7: Apply OPTICS clustering
    optics = OPTICS(min_samples=min_samples, max_eps=max_eps, xi=xi)
    cluster_labels = optics.fit_predict(df_normalized)

    # Add the cluster labels to the subset (by position) for the counts below
    df_specific['OPTICS_Cluster'] = cluster_labels

    # Step 8: Calculate the silhouette score (exclude noise points labeled as -1)
    is_clustered = cluster_labels != -1
    non_noise_points = df_normalized[is_clustered]
    non_noise_labels = cluster_labels[is_clustered]

    if len(set(non_noise_labels)) > 1:  # Ensure there's more than one cluster
        score = silhouette_score(non_noise_points, non_noise_labels)
        print(f"Silhouette Score for {name} with min_samples={min_samples}: {score:.4f}")
    else:
        print(f"Silhouette Score for {name} with min_samples={min_samples}: Not applicable (only one cluster or no clusters)")

    # Step 9: Visualize with t-SNE
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(df_normalized)

    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=tsne_results[:, 0], y=tsne_results[:, 1], hue=cluster_labels, palette='tab10', s=10)
    plt.title(f't-SNE Visualization of OPTICS Clusters for {name}')
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.legend(title='Cluster', loc='best')
    plt.show()

    # Display cluster counts for each category
    print(df_specific['OPTICS_Cluster'].value_counts())
    print("\n" + "-"*50 + "\n")

