Affinity Propagation

Best use cases:

Medium-sized datasets where the number of clusters is not known beforehand.


Applications where flexibility in cluster size is needed.


How does affinity propagation work?

1. Compute the similarity between every pair of points as the negative squared Euclidean distance: $S(i,k) = -\lVert x_i - x_k \rVert^2$
2. Determine responsibility matrix
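    1. For every pair of points (i, k), determine how well suited point k is to serve as the exemplar for point i, compared with the other candidate exemplars
    2. Each update revises the score (not a true probability) for how strongly a point is favored as the best exemplar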
3. Determine availability matrix
    1. Ask yourself: if this data point chooses another data point as its exemplar, how good a candidate is that other point, given the support it receives from the remaining points?


In [None]:
pip install scikit-learn seaborn matplotlib scaler

In [None]:
import sys
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AffinityPropagation
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt

# Add the project root directory to the system path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '..')))
from data_access.postgres_handler import PostgresHandler


# Initialize the PostgresHandler.
# NOTE(review): connection details are hard-coded in the notebook. The password
# can be overridden through the PGPASSWORD environment variable; the literal
# fallback is kept only for backward compatibility and should be removed from
# version control.
# NOTE(review): 1433 is the conventional SQL Server port (PostgreSQL defaults
# to 5432) - confirm this is the port the server actually listens on.
handler = PostgresHandler(
    database="nutanix",
    user="postgres",
    host='172.25.221.34',
    password=os.environ.get("PGPASSWORD", "Senna"),
    port=1433
)
handler.connect()

# Define columns to fetch, including 'data_type'
columns = [
    'concord_id', 'data_type', 'metric', 'queue_depth', 'num_jobs',
    'blocksize', 'unit', 'min_measure', 'mean_measure',
    'median_measure', 'max_measure', 'stddev_measure', 'device_type',
    'family', 'vendor', 'model', 'firmware', 'capacity_GiB',
    'operating_pci_speed_GTs', 'operating_pci_width', 'linkrate_Gbs',
    'name', 'reference', 'created'
]

# Fetch the rows with encode=True.
# NOTE(review): the earlier comment here said "Not encoded", which contradicts
# the flag; presumably the handler encodes categorical columns - confirm
# against PostgresHandler.get_data.
# The connection is only needed for this fetch, so release it immediately and
# unconditionally instead of after all preprocessing has succeeded.
try:
    df = handler.get_data("ssd_clean_data", columns, limit=100000, encode=True)
finally:
    handler.disconnect()

# Check the DataFrame
print(df)

# Step 1: Select Numerical Columns
df_numerical = df.select_dtypes(include=['float64', 'int64'])

# Columns that are entirely missing have no mean to impute; SimpleImputer
# silently drops them, which would make the column labels below mismatch the
# imputed array. Drop them explicitly so shapes always agree.
df_numerical = df_numerical.dropna(axis=1, how='all')

# Step 2: Handle Missing Values by Imputing (Filling with Mean)
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df_numerical), columns=df_numerical.columns)

# Optional: Convert to float32 to save memory
df_imputed = df_imputed.astype('float32')

# Step 3: Normalize the Numerical Data Using MinMaxScaler
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df_imputed.columns)

# Optional: Sample the Data to Reduce Computational Load
# Note: Adjust 'n_samples' based on your system's capacity
n_samples = 10000  # For example, 10,000 samples
if len(df_normalized) > n_samples:
    df_normalized = df_normalized.sample(n=n_samples, random_state=42)
    # df_normalized was rebuilt from a NumPy array, so its index labels are
    # row positions; .iloc therefore selects the matching original rows.
    df_original_sampled = df.iloc[df_normalized.index].copy()
else:
    df_original_sampled = df.copy()

In [None]:
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt


# Sweep several Affinity Propagation `preference` values over the normalized
# sample and, for each one, plot the clusters on a shared 2-D t-SNE embedding,
# print per-cluster feature means, and draw box plots for selected features.

# Define a List of Different Preference Values to Test
# NOTE(review): the features were MinMax-scaled to [0, 1], so the squared
# Euclidean distance between two rows is at most the number of features.
# Preferences this negative lie far below every pairwise similarity and will
# likely yield very few clusters - confirm these values are intended.
preference_values = [-400, -350, -300]

# Apply t-SNE once to reduce to 2 dimensions for visualization. The embedding
# depends only on the data and the fixed random_state, not on `preference`, so
# recomputing it inside the loop repeated identical work.
# `n_iter` is omitted on purpose: 1000 is the default, and the argument was
# renamed to `max_iter` in newer scikit-learn releases.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
tsne_embedding = pd.DataFrame(tsne.fit_transform(df_normalized), columns=['t-SNE1', 't-SNE2'])

# Feature columns to summarize per cluster. 'Cluster' is excluded so the label
# column (added below, or left over from an earlier run) is not averaged as if
# it were a feature.
summary_columns = [
    col
    for col in df_original_sampled.select_dtypes(include=['float64', 'int64']).columns
    if col != 'Cluster'
]

for preference in preference_values:
    # Best-effort sweep: a failure for one preference is reported and the loop
    # moves on to the next value.
    try:
        print(f"\nProcessing Affinity Propagation with preference={preference}...")

        # Step 4: Apply Affinity Propagation for Clustering
        clusterer = AffinityPropagation(
            preference=preference,
            damping=0.9,
            max_iter=1000,
            convergence_iter=100,
            random_state=42
        )
        cluster_labels = clusterer.fit_predict(df_normalized)

        # Report how many exemplars were found so degenerate or non-converged
        # runs are visible before looking at the plots.
        print(
            f"Found {len(clusterer.cluster_centers_indices_)} cluster(s) "
            f"after {clusterer.n_iter_} iteration(s)."
        )

        # Add the Cluster Labels Back to the Original Sampled DataFrame
        # (assigned positionally; overwritten on every iteration).
        df_original_sampled['Cluster'] = cluster_labels

        # Step 5: Attach the Labels to the Precomputed t-SNE Embedding
        df_tsne = tsne_embedding.assign(Cluster=cluster_labels)

        # Step 6: Plot the t-SNE-Transformed Data with Cluster Labels
        plt.figure(figsize=(10, 8))
        sns.scatterplot(
            data=df_tsne,
            x='t-SNE1',
            y='t-SNE2',
            hue='Cluster',
            palette='tab10',
            s=100,
            alpha=0.7
        )
        plt.title(f't-SNE of Data Points (preference={preference})')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.legend(title='Cluster')
        plt.show()

        # Step 7: Analyze How the Original Features Differ by Cluster
        cluster_summary = df_original_sampled.groupby('Cluster')[summary_columns].mean()

        # Display the Summary Statistics by Cluster
        print(f"\nCluster Summary Statistics for preference={preference}:")
        print(cluster_summary)

        # Step 8: Visualize How 'queue_depth' Differs Across Clusters (If Exists)
        if 'queue_depth' in df_original_sampled.columns:
            plt.figure(figsize=(10, 6))
            sns.boxplot(x='Cluster', y='queue_depth', data=df_original_sampled)
            plt.title(f'Distribution of Queue Depth by Cluster (preference={preference})')
            plt.show()

        # Step 9: Visualize How 'num_jobs' Differs Across Clusters (If Exists)
        if 'num_jobs' in df_original_sampled.columns:
            plt.figure(figsize=(10, 6))
            sns.boxplot(x='Cluster', y='num_jobs', data=df_original_sampled)
            plt.title(f'Distribution of Number of Jobs by Cluster (preference={preference})')
            plt.show()

    except Exception as e:
        print(f"An error occurred for preference={preference}: {e}")