In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import sys
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN

# Add the parent of the current working directory to the Python path so that the
# `configs` package below can be imported.
# NOTE(review): this is the project root only if the kernel's working directory is the
# notebook's own folder (e.g. 'notebooks/') — confirm for your Jupyter setup.
project_root = Path.cwd().parent  # parent directory of the current working directory
sys.path.append(str(project_root))
from configs.path_config import CONFIG_DIR, OUTPUT_DIR

In [None]:
# Load the strain data
strain_csv_path = (
    OUTPUT_DIR
    / 'strain_distributions'
    / 'N-B_Far_Comp_20091129120000_20210611160000_strain_distribution.csv'
)
df_strain = pd.read_csv(strain_csv_path)

# Report the number of missing values explicitly: a bare expression in the middle
# of a cell is evaluated but never displayed, so the check would otherwise be lost.
print("Missing values in strain data:", int(df_strain.isna().sum().sum()))

# df_strain = df_strain.iloc[0:4700,:]  # optional: restrict the analysis to the first rows

# Preview the first rows only instead of rendering the whole frame
df_strain.head()


In [None]:
# Find outliers: flag rows whose row mean lies far from the mean of all row means
df_strain_data = df_strain.drop(columns = 'Timestamp')
means = df_strain_data.mean(axis=1)  # Calculate mean for each row

# Calculate the mean and standard deviation of the row means
mean_val = means.mean()
std_val = means.std()

# Outlier threshold, expressed as a multiple of the standard deviation of the row means.
# NOTE(review): this comment previously said "3 standard deviations", but the value
# used is 1 — confirm which one is intended.
threshold = 1

# Find outliers: rows where the absolute deviation from the mean is greater than the threshold
outliers = means[np.abs(means - mean_val) > threshold * std_val]
print("Outliers:")
print(outliers)

# Rebind df_strain to the filtered frame. df_strain_data above still holds the
# unfiltered rows, and re-running this cell filters the already-filtered frame again.
df_strain = df_strain.drop(outliers.index)
print("Number of outliers removed: ", len(outliers))

In [None]:
# Index labels of the rows in which every column equals zero.
# NOTE(review): df_strain_data is created before the outlier rows are dropped from
# df_strain, so these labels may refer to rows that are no longer in df_strain.
zeros = df_strain_data[df_strain_data.eq(0).all(axis=1)].index
zeros

In [None]:
# Exclude timestamp in column 0 and keep the remaining columns as a NumPy array.
# This relies on the timestamp being the first column of df_strain.
strain_data = df_strain.iloc[:, 1:].values  
strain_data

In [None]:
# Fit PCA on the entire strain data (matrix-wise).
# Ten components are fitted here to inspect how much variance the leading components explain.
pca = PCA(n_components=10)
pca.fit(strain_data)

# Explained variance per component in percent, rounded for display
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)

# Accumulate the unrounded ratios: summing values that were already rounded to
# 0.1 % lets the rounding errors add up along the cumulative curve.
cum_var = np.cumsum(pca.explained_variance_ratio_) * 100
component_numbers = range(1, len(cum_var) + 1)

# Plot the cumulative explained variance
plt.figure(figsize=(10, 6))
plt.plot(component_numbers, cum_var, marker="o", linestyle="--")
plt.grid()
plt.ylabel("Percentage Cumulative of Explained Variance")
plt.xlabel("Number of Components")
plt.xticks(component_numbers)
plt.title("Explained Variance by Number of Components")
plt.show()

In [None]:
def do_pca(n_components, strain_data, timestamps=None):
    """Project the strain matrix onto its principal components and standardise the scores.

    Parameters
    ----------
    n_components : int, float or str
        Passed straight to ``sklearn.decomposition.PCA(n_components=...)``.
    strain_data : array-like of shape (n_samples, n_features)
        The data matrix to decompose (one row per sample).
    timestamps : array-like of length n_samples, optional
        Values for the 'Timestamp' column of the returned DataFrame. When omitted,
        the 'Timestamp' column of the notebook-level ``df_strain`` is used, which
        matches the original behaviour.

    Returns
    -------
    normalized_pca_components : numpy.ndarray
        PCA scores after ``StandardScaler`` (each component scaled to zero mean
        and unit variance).
    df_pca : pandas.DataFrame
        The same scores with columns 'PC1', 'PC2', ... and a leading 'Timestamp' column.

    Raises
    ------
    ValueError
        If the number of timestamps does not match the number of rows in ``strain_data``.
    """
    if timestamps is None:
        # Fall back to the notebook-level frame the data was extracted from
        timestamps = df_strain['Timestamp'].values

    # Fail early with a clear message instead of a length error deep inside pandas
    if len(timestamps) != len(strain_data):
        raise ValueError(
            f"Got {len(timestamps)} timestamps for {len(strain_data)} rows of strain data; "
            "the two must have the same length."
        )

    # Fit PCA on the entire strain data (matrix-wise) and project the data onto it
    pca = PCA(n_components=n_components)
    pca.fit(strain_data)
    pca_results = pca.transform(strain_data)

    # Normalize the results so every component has zero mean and unit variance
    normalized_pca_components = StandardScaler().fit_transform(pca_results)

    # Name the columns after the number of components actually produced, so a
    # fractional or 'mle' n_components works as well as an integer
    column_names = [f'PC{i+1}' for i in range(normalized_pca_components.shape[1])]
    df_pca = pd.DataFrame(normalized_pca_components, columns=column_names)

    # Add timestamps back
    df_pca.insert(0, 'Timestamp', timestamps)

    return normalized_pca_components, df_pca

# Run the PCA with 5 components; the results are kept in notebook-level variables
n_components = 5
normalized_pca_components, df_pca = do_pca(n_components, strain_data)

#### KMeans Clustering

In [None]:
def kmeans_clustering(data, n_clusters):
    """Cluster ``data`` with K-Means, plot the first two columns and label ``df_strain``.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Points to cluster, with at least two columns; the first two are plotted
        as principal components 1 and 2. Must have one row per row of the
        notebook-level ``df_strain``.
    n_clusters : int
        Number of K-Means clusters.

    Returns
    -------
    pandas.DataFrame
        A copy of the notebook-level ``df_strain`` with the cluster labels
        inserted as the second column, named 'Cluster'.
    """
    # Use the argument that was passed in. The previous version ignored `data`
    # and silently clustered the global `normalized_pca_components` instead.
    data = np.asarray(data)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(data)

    plt.figure(figsize=(10, 6))

    # Scatter plot with the clusters
    sns.scatterplot(x=data[:, 0], y=data[:, 1], hue=clusters, palette="viridis", s=100, alpha=0.7)

    # Label the axes
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.title("PCA + K-Means Clustering")

    # Show the plot
    plt.legend(title='Cluster')
    plt.show()

    # Add cluster labels to a copy so the original DataFrame is preserved
    data_with_KMeans = df_strain.copy()

    # Insert the clusters as the second column (at index 1)
    data_with_KMeans.insert(1, 'Cluster', clusters)

    return data_with_KMeans

# Recompute the PCA scores with 8 components (rebinds the notebook-level results)
n_components = 8
normalized_pca_components, df_pca = do_pca(n_components, strain_data)

# Cluster the scores into 8 K-Means clusters and display the labelled frame
n_clusters = 8
data_with_KMeans = kmeans_clustering(normalized_pca_components, n_clusters)
data_with_KMeans

In [None]:
def gmm_clustering(data, n_clusters):
    """Cluster ``data`` with a Gaussian mixture, plot the first two columns and label ``df_strain``.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Points to cluster, with at least two columns; the first two are plotted
        as principal components 1 and 2. Must have one row per row of the
        notebook-level ``df_strain``.
    n_clusters : int
        Number of mixture components.

    Returns
    -------
    pandas.DataFrame
        A copy of the notebook-level ``df_strain`` with the cluster labels
        inserted as the second column, named 'Cluster'.
    """
    # Use the argument that was passed in. The previous version ignored `data`
    # and silently clustered the global `normalized_pca_components` instead.
    data = np.asarray(data)

    # Fit the Gaussian Mixture Model
    gmm = GaussianMixture(n_components=n_clusters, random_state=42)
    clusters = gmm.fit_predict(data)

    # Plot the clusters
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=data[:, 0], y=data[:, 1], hue=clusters, palette="viridis", s=100, alpha=0.7)

    # Label the axes
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.title("PCA + GMM Clustering")

    # Show the plot
    plt.legend(title='Cluster')
    plt.show()

    # Add the GMM cluster labels to a copy so the original DataFrame is preserved
    data_with_GMM = df_strain.copy()
    data_with_GMM.insert(1, 'Cluster', clusters)  # Insert GMM clusters as the second column

    return data_with_GMM

# Recompute the PCA scores with 8 components (rebinds the notebook-level results)
n_components = 8
normalized_pca_components, df_pca = do_pca(n_components, strain_data)

# Fit an 8-component GMM. The returned DataFrame is not assigned to a name;
# as the last expression of the cell it is only displayed.
n_clusters = 8
gmm_clustering(normalized_pca_components, n_clusters)

In [None]:
def dbscan_clustering(data, eps=0.5, min_samples=5):
    """
    Run DBSCAN on ``data``, scatter-plot the first two columns coloured by label,
    and return a labelled copy of the notebook-level ``df_strain``.

    Parameters:
    - data: 2-D array of points to cluster; columns 0 and 1 are used for the plot.
    - eps: Neighbourhood radius passed to DBSCAN.
    - min_samples: Minimum neighbourhood size passed to DBSCAN.

    Returns:
    - A copy of ``df_strain`` with the DBSCAN labels inserted as the second
      column, named 'Cluster'.
    """
    # Cluster in a single step
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)

    # Draw the first two columns, one colour per label
    first_axis, second_axis = data[:, 0], data[:, 1]
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=first_axis, y=second_axis, hue=labels, palette="viridis", s=100, alpha=0.7)

    # Title and axis labels
    plt.title("PCA + DBSCAN Clustering")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")

    # Legend, then render
    plt.legend(title='Cluster')
    plt.show()

    # Attach the labels to a copy so the original frame is left untouched
    labelled_strain = df_strain.copy()
    labelled_strain.insert(1, 'Cluster', labels)
    return labelled_strain

# Recompute the PCA scores with 5 components (rebinds the notebook-level results)
n_components = 5
normalized_pca_components, df_pca = do_pca(n_components, strain_data)

# Call the function with the normalized PCA components
data_with_dbscan = dbscan_clustering(normalized_pca_components, eps=0.5, min_samples=5)

#### Number of clusters

In [None]:
def _cluster_quality(data, labels):
    """Return (Davies-Bouldin index, silhouette score), or (nan, nan) when they are undefined."""
    n_labels = np.unique(labels).size
    # Both metrics require 2 <= n_labels <= n_samples - 1 and raise ValueError otherwise
    if not 2 <= n_labels <= len(labels) - 1:
        return float("nan"), float("nan")
    return davies_bouldin_score(data, labels), silhouette_score(data, labels)


def evaluate_clusters(data, n_clusters) -> None:
    """Compare K-Means, GMM and DBSCAN using silhouette score and Davies-Bouldin index.

    K-Means and GMM are fitted for every cluster count from 2 to ``n_clusters``
    (inclusive). DBSCAN has no cluster-count parameter, so it is fitted once with
    ``eps=0.5, min_samples=5`` and its scores (computed without noise points) are
    drawn as a constant reference line. The scores are printed and plotted in two
    stacked subplots.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Points to cluster.
    n_clusters : int
        Largest number of clusters / mixture components to evaluate.

    Returns
    -------
    None
    """
    data = np.asarray(data)
    cluster_counts = range(2, n_clusters + 1)

    # Silhouette score and Davies-Bouldin index per cluster count
    silhouette_kmeans, db_index_kmeans = [], []
    silhouette_gmm, db_index_gmm = [], []

    # Loop over different number of clusters
    for n in cluster_counts:

        # KMeans Clustering (each metric is computed once and reused for the printout)
        kmeans_labels = KMeans(n_clusters=n, random_state=42).fit_predict(data)
        db_score_kmeans, sil_score_kmeans = _cluster_quality(data, kmeans_labels)
        db_index_kmeans.append(db_score_kmeans)
        silhouette_kmeans.append(sil_score_kmeans)

        print(f"K-Means - Number of clusters: {n}, Davies-Bouldin Index: {db_score_kmeans}, Silhouette Score: {sil_score_kmeans}")

        # GMM Clustering
        gmm_labels = GaussianMixture(n_components=n, random_state=42).fit_predict(data)
        db_score_gmm, sil_score_gmm = _cluster_quality(data, gmm_labels)
        db_index_gmm.append(db_score_gmm)
        silhouette_gmm.append(sil_score_gmm)

        print(f"GMM - Number of components: {n}, Davies-Bouldin Index: {db_score_gmm}, Silhouette Score: {sil_score_gmm}")

    # DBSCAN Clustering: the result does not depend on n, so fit it once outside the loop
    dbscan_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(data)

    # Ignore -1 labels (noise points). If fewer than two clusters remain, the
    # scores are reported as nan instead of raising.
    clustered = dbscan_labels != -1
    db_score_dbscan, sil_score_dbscan = _cluster_quality(data[clustered], dbscan_labels[clustered])

    print(f"DBSCAN - Davies-Bouldin Index: {db_score_dbscan}, Silhouette Score: {sil_score_dbscan}")

    # Repeat the single DBSCAN result so it can be drawn against the same x-axis
    silhouette_dbscan = [sil_score_dbscan] * len(cluster_counts)
    db_index_dbscan = [db_score_dbscan] * len(cluster_counts)

    # Plot the silhouette score and the Davies-Bouldin index in subplots
    fig, axes = plt.subplots(2, 1, figsize=(10, 12))

    # Silhouette Score Plot
    axes[0].plot(cluster_counts, silhouette_kmeans, marker='o')
    axes[0].plot(cluster_counts, silhouette_gmm, marker='o')
    axes[0].plot(cluster_counts, silhouette_dbscan, marker='o')
    axes[0].set_title("Silhouette Score for Different Number of Clusters")
    axes[0].set_xlabel("Number of Clusters")
    axes[0].set_ylabel("Silhouette Score")
    axes[0].legend(['KMeans', 'GMM', 'DBSCAN'])

    # Davies-Bouldin Index Plot
    axes[1].plot(cluster_counts, db_index_kmeans, marker='o')
    axes[1].plot(cluster_counts, db_index_gmm, marker='o')
    axes[1].plot(cluster_counts, db_index_dbscan, marker='o')
    axes[1].set_title("Davies-Bouldin Index for Different Number of Clusters")
    axes[1].set_xlabel("Number of Clusters")
    axes[1].set_ylabel("Davies-Bouldin Index")
    axes[1].legend(['KMeans', 'GMM','DBSCAN'])

    plt.tight_layout()

    plt.show()

# Recompute the PCA scores with 5 components (rebinds the notebook-level results)
n_components = 5
normalized_pca_components, df_pca = do_pca(n_components, strain_data)

# Evaluate cluster counts from 2 up to n_clusters on the normalized PCA components
n_clusters = 6
data = normalized_pca_components
evaluate_clusters(data, n_clusters)