K-Means Clustering

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Load the data
def load_data(file_path):
    """Read an Excel workbook into a DataFrame.

    Args:
        file_path: Path handed to ``pd.read_excel``.

    Returns:
        The loaded DataFrame, or ``None`` when reading raised any exception
        (the error message is printed rather than re-raised).
    """
    try:
        frame = pd.read_excel(file_path)
    except Exception as e:
        # Best-effort load: report the problem and signal failure with None.
        print(f"Error loading data: {e}")
        frame = None
    return frame

# Apply K-means clustering
def apply_kmeans(data, n_clusters=10):
    """Fit K-Means on ``data`` and return the fitted model with its labels.

    Args:
        data: Feature matrix passed to ``KMeans.fit``.
        n_clusters: Number of clusters to fit (default 10).

    Returns:
        Tuple ``(model, labels)`` where ``labels`` is ``model.labels_``.
    """
    # random_state is fixed at 42 so repeated runs use the same seed.
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(data)
    return model, model.labels_

# Evaluate clustering performance using all metrics
def evaluate_clustering(data, labels):
    """Score a clustering with three internal validity metrics.

    Args:
        data: Feature matrix the labels were computed on.
        labels: Cluster label for each row of ``data``.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz)``.
    """
    metrics = (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
    return tuple(metric(data, labels) for metric in metrics)

# Main function to run the process and update the existing Excel file
def main(file_path, n_clusters=10):
    """Cluster the first 196 columns with K-Means and write the labels back.

    The workbook at ``file_path`` is loaded, K-Means is fitted on its first
    196 columns, the three evaluation metrics are printed, and the labels are
    stored in a ``'k-means_label'`` column before the same file is overwritten.

    Args:
        file_path: Excel file that is read and then overwritten.
        n_clusters: Number of clusters (default 10).

    Returns:
        The DataFrame including the label column, or ``None`` if loading
        failed or the sheet has fewer than 196 columns. A failure while
        saving is only printed; the DataFrame is still returned.
    """
    data = load_data(file_path)
    if data is None:
        # load_data has already printed the reason.
        return

    has_enough_columns = data.shape[1] >= 196
    if not has_enough_columns:
        print("The dataset has fewer than 196 columns. Please check the input data.")
        return

    # Only the leading 196 columns are used as features.
    features = data.iloc[:, :196]

    # The fitted model itself is not needed beyond its labels.
    _, labels = apply_kmeans(features, n_clusters)

    silhouette_avg, davies_bouldin, calinski_harabasz = evaluate_clustering(features, labels)

    for report_line in (
        f"Silhouette Score (k={n_clusters}): {silhouette_avg}",
        f"Davies-Bouldin Index: {davies_bouldin}",
        f"Calinski-Harabasz Index: {calinski_harabasz}",
    ):
        print(report_line)

    # Attach the labels to the full frame, not just the feature slice.
    data['k-means_label'] = labels

    # Overwrite the source workbook in place; failures are reported, not raised.
    try:
        data.to_excel(file_path, index=False)
        print(f"Updated dataset saved to {file_path}")
    except Exception as e:
        print(f"Error saving updated dataset: {e}")

    return data

# Workbook that main() both reads and overwrites in place (relative path, so it
# is resolved against the notebook's current working directory).
file_path = 'Updated_Clustering_with_Labels.xlsx'  # Path to the existing file
n_clusters = 10  # number of K-Means clusters to fit

# Run the K-Means pipeline. updated_data is the DataFrame with the
# 'k-means_label' column added, or None if main() returned early
# (load failure or fewer than 196 columns).
updated_data = main(file_path, n_clusters)

Silhouette Score (k=10): 0.5643016818392292
Davies-Bouldin Index: 0.8411646958445402
Calinski-Harabasz Index: 98.14582721500408
Updated dataset saved to Updated_Clustering_with_Labels.xlsx


Hierarchical Clustering

In [None]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import numpy as np

# Module 1: Load the data
def load_data(file_path):
    """Read the Excel workbook at ``file_path`` and return it as a DataFrame.

    Any exception raised by ``pd.read_excel`` propagates to the caller.
    """
    return pd.read_excel(file_path)

# Module 2: Apply Hierarchical (Agglomerative) Clustering
def apply_hierarchical_clustering(data, n_clusters=10):
    """Run agglomerative clustering on ``data``.

    Args:
        data: Feature matrix passed to ``fit_predict``.
        n_clusters: Number of clusters to produce (default 10).

    Returns:
        Tuple ``(model, labels)``: the fitted ``AgglomerativeClustering``
        instance and the label array returned by ``fit_predict``.
    """
    model = AgglomerativeClustering(n_clusters=n_clusters)
    assignments = model.fit_predict(data)
    return model, assignments

# Module 4: Calculate Centroids for Clusters
def calculate_centroids(data, labels):
    """Compute the mean feature vector of every cluster.

    Args:
        data: Row-indexable feature container (rows are selected with a
            boolean mask and averaged along axis 0).
        labels: Cluster label for each row of ``data``.

    Returns:
        ``np.ndarray`` with one centroid per distinct label, ordered as
        ``np.unique(labels)`` orders the labels.
    """
    per_cluster_means = []
    for cluster_id in np.unique(labels):
        members = data[labels == cluster_id]
        per_cluster_means.append(members.mean(axis=0))
    return np.array(per_cluster_means)

# Module 6: Evaluate clustering performance using all metrics
def evaluate_clustering(data, labels):
    """Return silhouette, Davies-Bouldin and Calinski-Harabasz scores.

    Args:
        data: Feature matrix the labels were computed on.
        labels: Cluster label for each row of ``data``.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz)``.
    """
    return (
        silhouette_score(data, labels),
        davies_bouldin_score(data, labels),
        calinski_harabasz_score(data, labels),
    )

# Main function to run the process and update the existing Excel file
def main(file_path, n_clusters):
    """Cluster the first 196 columns hierarchically and write the labels back.

    The workbook at ``file_path`` is loaded, agglomerative clustering is run
    on its first 196 columns, the three evaluation metrics are printed, and
    the labels are stored in a ``'Hierarchical_Clustering_label'`` column
    before the same file is overwritten.

    Args:
        file_path: Excel file that is read and then overwritten.
        n_clusters: Number of clusters to produce.

    Returns:
        The full DataFrame (all loaded columns) with the label column added.
    """
    # Load the data
    data = load_data(file_path)

    # Use only the first 196 columns as features, but keep `data` itself
    # intact: rebinding `data` to the slice would drop every column past the
    # 196th (e.g. label columns already stored in the workbook) when the file
    # is overwritten below.
    features = data.iloc[:, :196]

    # Apply Hierarchical clustering (the fitted model is not needed afterwards)
    _, labels = apply_hierarchical_clustering(features, n_clusters)

    # Evaluate performance
    silhouette_avg, davies_bouldin, calinski_harabasz = evaluate_clustering(features, labels)

    # Print results
    print(f"Silhouette Score (Hierarchical Clustering): {silhouette_avg}")
    print(f"Davies-Bouldin Index: {davies_bouldin}")
    print(f"Calinski-Harabasz Index: {calinski_harabasz}")

    # Add the labels as a new column to the original (un-sliced) data
    data['Hierarchical_Clustering_label'] = labels

    # Save the updated dataset back to the existing Excel file
    data.to_excel(file_path, index=False)  # Overwrite the existing file
    print(f"Updated dataset saved to {file_path}")

    return data

# Workbook that main() both reads and overwrites in place (relative path, so it
# is resolved against the notebook's current working directory).
file_path = 'Updated_Clustering_with_Labels.xlsx'  # Path to the existing file
n_clusters = 10  # number of clusters for agglomerative clustering

# Run the hierarchical pipeline defined in this cell. updated_data is the
# DataFrame returned by main(), carrying the 'Hierarchical_Clustering_label' column.
updated_data = main(file_path, n_clusters)


Silhouette Score (Hierarchical Clustering): 0.5767027381927193
Davies-Bouldin Index: 0.852873153751695
Calinski-Harabasz Index: 100.83198234709431
Updated dataset saved to Updated_Clustering_with_Labels.xlsx


Gaussian Mixture Model (GMM)

In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler  # Optional: for scaling

# Load the dataset
def load_data(file_path, sheet_name=0):
    """Read one sheet of an Excel workbook into a DataFrame.

    Args:
        file_path: Path handed to ``pd.read_excel``.
        sheet_name: Sheet selector forwarded to ``pd.read_excel`` (default 0).

    Returns:
        Whatever ``pd.read_excel`` returns; exceptions propagate.
    """
    sheet = pd.read_excel(file_path, sheet_name=sheet_name)
    return sheet

# Apply GMM clustering
def apply_gmm(data, n_components=10, covariance_type='full'):
    """Fit a Gaussian mixture on ``data`` and label every row.

    Args:
        data: Feature matrix passed to ``fit_predict``.
        n_components: Number of mixture components (default 10).
        covariance_type: Covariance structure forwarded to
            ``GaussianMixture`` (default ``'full'``).

    Returns:
        Tuple ``(labels, model)`` -- note the labels come first.
    """
    # random_state is fixed at 42 so repeated runs use the same seed.
    mixture = GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=42,
    )
    assignments = mixture.fit_predict(data)
    return assignments, mixture

# Evaluate clustering performance using three metrics
def evaluate_clustering(data, labels):
    """Score a clustering, guarding against the single-cluster case.

    Args:
        data: Feature matrix the labels were computed on.
        labels: Iterable of cluster labels, one per row of ``data``.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz)``. When
        ``labels`` holds at most one distinct value, all three entries are the
        sentinel ``-1`` and no metric is computed.
    """
    # Fewer than two distinct labels: skip the metrics and return the sentinel triple.
    if len(set(labels)) <= 1:
        return -1, -1, -1

    silhouette = silhouette_score(data, labels)
    davies_bouldin = davies_bouldin_score(data, labels)
    calinski_harabasz = calinski_harabasz_score(data, labels)
    return silhouette, davies_bouldin, calinski_harabasz

# Main function to execute GMM on the loaded data and update the existing Excel file
def run_gmm_clustering(file_path, sheet_name=0, n_components=10, covariance_type='full'):
    """Cluster the first 196 columns with a GMM and write the labels back.

    The sheet is loaded, rows with a NaN in the first 196 columns are excluded
    from clustering, the three evaluation metrics are printed, and the labels
    are stored in a ``'Gaussian_Mixture_Model_(GMM)_Clustering_label'`` column
    before the same file is overwritten.

    Args:
        file_path: Excel file that is read and then overwritten.
        sheet_name: Sheet selector forwarded to ``load_data`` (default 0).
        n_components: Number of mixture components (default 10).
        covariance_type: Covariance structure forwarded to ``apply_gmm``.

    Returns:
        None. Results are printed and persisted to ``file_path``.
    """
    # Load data
    data = load_data(file_path, sheet_name)

    # Select only the first 196 columns and drop rows with NaNs. The filtered
    # frame is kept so the index of the surviving rows is known later.
    features = data.iloc[:, :196].dropna()
    data_values = features.values

    # Optional: Scale the data
    # scaler = StandardScaler()
    # data_values = scaler.fit_transform(data_values)

    # Apply GMM (the fitted model is not needed afterwards)
    labels, _ = apply_gmm(data_values, n_components=n_components, covariance_type=covariance_type)

    # Evaluate performance
    silhouette, davies_bouldin, calinski_harabasz = evaluate_clustering(data_values, labels)

    # Print results
    print(f"Silhouette Score: {silhouette}")
    print(f"Davies-Bouldin Index: {davies_bouldin}")
    print(f"Calinski-Harabasz Index: {calinski_harabasz}")

    # Add the labels as a new column to the original dataset. Assigning the raw
    # array would raise a length mismatch whenever dropna() removed a row, so
    # the labels are aligned on the surviving row index instead; rows that
    # were dropped get NaN in the label column.
    data['Gaussian_Mixture_Model_(GMM)_Clustering_label'] = pd.Series(labels, index=features.index)

    # Save the updated dataset back to the existing Excel file
    data.to_excel(file_path, index=False)  # Overwrite the existing file
    print(f"Updated dataset saved to {file_path}")

# Run full-covariance GMM clustering with 10 components on the workbook below.
# run_gmm_clustering() returns nothing: it prints the metrics and overwrites
# the file in place with the added label column.
file_path = 'Updated_Clustering_with_Labels.xlsx'  # Path to the existing file
run_gmm_clustering(file_path, n_components=10, covariance_type='full')

Silhouette Score: 0.5643016818392292
Davies-Bouldin Index: 0.8411646958445402
Calinski-Harabasz Index: 98.14582721500408
Updated dataset saved to Updated_Clustering_with_Labels.xlsx


In [None]:
!pip install scikit-learn-extra



K-Medoids Clustering

In [None]:
import pandas as pd
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Load the data
def load_data(file_path):
    """Load an Excel workbook, reporting failures instead of raising.

    Args:
        file_path: Path handed to ``pd.read_excel``.

    Returns:
        The loaded DataFrame on success; ``None`` if any exception occurred
        while reading (the message is printed).
    """
    loaded = None
    try:
        loaded = pd.read_excel(file_path)
    except Exception as e:
        # Swallow the error deliberately; callers check for None.
        print(f"Error loading data: {e}")
    return loaded

# Apply K-Medoids clustering
def apply_kmedoids(data, n_clusters=10, metric='euclidean'):
    """Fit K-Medoids on ``data`` and return the fitted model with its labels.

    Args:
        data: Feature matrix passed to ``KMedoids.fit``.
        n_clusters: Number of clusters (default 10).
        metric: Distance metric forwarded to ``KMedoids``
            (default ``'euclidean'``).

    Returns:
        Tuple ``(model, labels)`` where ``labels`` is ``model.labels_``.
    """
    # random_state is fixed at 42 so repeated runs use the same seed.
    clusterer = KMedoids(n_clusters=n_clusters, metric=metric, random_state=42)
    clusterer.fit(data)
    assignments = clusterer.labels_
    return clusterer, assignments

# Evaluate clustering performance using all metrics
def evaluate_clustering(data, labels):
    """Compute the three clustering quality metrics used in this notebook.

    Args:
        data: Feature matrix the labels were computed on.
        labels: Cluster label for each row of ``data``.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz)``.
    """
    scores = []
    for metric in (silhouette_score, davies_bouldin_score, calinski_harabasz_score):
        scores.append(metric(data, labels))
    return tuple(scores)

# Main function to run the process and update the existing Excel file
def main(file_path, n_clusters=10, metric='euclidean'):
    """Cluster the first 196 columns with K-Medoids and write the labels back.

    The workbook at ``file_path`` is loaded, K-Medoids is fitted on its first
    196 columns, the three evaluation metrics are printed, and the labels are
    stored in a ``'k-medoids_label'`` column before the same file is
    overwritten.

    Args:
        file_path: Excel file that is read and then overwritten.
        n_clusters: Number of clusters (default 10).
        metric: Distance metric forwarded to ``apply_kmedoids``
            (default ``'euclidean'``).

    Returns:
        The DataFrame including the label column, or ``None`` if loading
        failed or the sheet has fewer than 196 columns. A failure while
        saving is only printed; the DataFrame is still returned.
    """
    data = load_data(file_path)

    # Bail out early: load_data has already printed why it failed.
    if data is None:
        return

    column_count = data.shape[1]
    if column_count < 196:
        print("The dataset has fewer than 196 columns. Please check the input data.")
        return

    # Only the leading 196 columns are used as features.
    features = data.iloc[:, :196]

    fitted = apply_kmedoids(features, n_clusters, metric)
    labels = fitted[1]  # fitted[0] is the model, which is not used further

    silhouette_avg, davies_bouldin, calinski_harabasz = evaluate_clustering(features, labels)

    print(f"Silhouette Score (k={n_clusters}): {silhouette_avg}")
    print(f"Davies-Bouldin Index: {davies_bouldin}")
    print(f"Calinski-Harabasz Index: {calinski_harabasz}")

    # Attach the labels to the full frame, not just the feature slice.
    data['k-medoids_label'] = labels

    # Overwrite the source workbook in place; failures are reported, not raised.
    try:
        data.to_excel(file_path, index=False)
        print(f"Updated dataset saved to {file_path}")
    except Exception as e:
        print(f"Error saving updated dataset: {e}")

    return data

# Workbook that main() both reads and overwrites in place (relative path, so it
# is resolved against the notebook's current working directory).
file_path = 'Updated_Clustering_with_Labels.xlsx'  # Path to the existing file
n_clusters = 10  # number of K-Medoids clusters to fit

# Run the K-Medoids pipeline defined in this cell. `metric` is not passed, so
# main() falls back to its default ('euclidean'). updated_data is the DataFrame
# with the 'k-medoids_label' column, or None if main() returned early.
updated_data = main(file_path, n_clusters)



Silhouette Score (k=10): 0.20790050906770893
Davies-Bouldin Index: 1.1259539660727949
Calinski-Harabasz Index: 30.790791260647566
Updated dataset saved to Updated_Clustering_with_Labels.xlsx


Diagonal Covariance GMM

In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler  # Optional: for scaling

# Load the dataset
def load_data(file_path, sheet_name=0):
    """Load a sheet from the Excel workbook at ``file_path``.

    Args:
        file_path: Path handed to ``pd.read_excel``.
        sheet_name: Sheet selector forwarded to ``pd.read_excel`` (default 0).

    Returns:
        Whatever ``pd.read_excel`` returns; exceptions propagate.
    """
    workbook_sheet = pd.read_excel(file_path, sheet_name=sheet_name)
    return workbook_sheet

# Apply GMM clustering
def apply_gmm(data, n_components=10, covariance_type='diag'):
    """Fit a Gaussian mixture on ``data`` and label every row.

    Args:
        data: Feature matrix passed to ``fit_predict``.
        n_components: Number of mixture components (default 10).
        covariance_type: Covariance structure forwarded to
            ``GaussianMixture`` (default ``'diag'``).

    Returns:
        Tuple ``(labels, model)`` -- note the labels come first.
    """
    # random_state is fixed at 42 so repeated runs use the same seed.
    settings = dict(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=42,
    )
    mixture = GaussianMixture(**settings)
    assignments = mixture.fit_predict(data)
    return assignments, mixture

# Evaluate clustering performance using three metrics
def evaluate_clustering(data, labels):
    """Score a clustering, returning sentinels when only one cluster exists.

    Args:
        data: Feature matrix the labels were computed on.
        labels: Iterable of cluster labels, one per row of ``data``.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz)``. When
        ``labels`` holds at most one distinct value, all three entries are the
        sentinel ``-1`` and no metric is computed.
    """
    distinct_labels = set(labels)
    if len(distinct_labels) < 2:
        # Fewer than two distinct labels: return the sentinel triple instead.
        sentinel = -1
        return sentinel, sentinel, sentinel

    return (
        silhouette_score(data, labels),
        davies_bouldin_score(data, labels),
        calinski_harabasz_score(data, labels),
    )

# Main function to execute GMM on the loaded data and update the existing Excel file
def run_gmm_clustering(file_path, sheet_name=0, n_components=10, covariance_type='full'):
    """Cluster the first 196 columns with a GMM and write the labels back.

    The sheet is loaded, rows with a NaN in the first 196 columns are excluded
    from clustering, the three evaluation metrics are printed, and the labels
    are stored in a ``'Diag_Gaussian_Mixture_Model_(GMM)_Clustering_label'``
    column before the same file is overwritten.

    Args:
        file_path: Excel file that is read and then overwritten.
        sheet_name: Sheet selector forwarded to ``load_data`` (default 0).
        n_components: Number of mixture components (default 10).
        covariance_type: Covariance structure forwarded to ``apply_gmm``.
            NOTE: the default here is ``'full'``; pass ``'diag'`` explicitly
            for the diagonal model, since the output column name always
            carries the ``Diag_`` prefix regardless of this argument.

    Returns:
        None. Results are printed and persisted to ``file_path``.
    """
    # Load data
    data = load_data(file_path, sheet_name)

    # Select only the first 196 columns and drop rows with NaNs. The filtered
    # frame is kept so the index of the surviving rows is known later.
    features = data.iloc[:, :196].dropna()
    data_values = features.values

    # Optional: Scale the data
    # scaler = StandardScaler()
    # data_values = scaler.fit_transform(data_values)

    # Apply GMM (the fitted model is not needed afterwards)
    labels, _ = apply_gmm(data_values, n_components=n_components, covariance_type=covariance_type)

    # Evaluate performance
    silhouette, davies_bouldin, calinski_harabasz = evaluate_clustering(data_values, labels)

    # Print results
    print(f"Silhouette Score: {silhouette}")
    print(f"Davies-Bouldin Index: {davies_bouldin}")
    print(f"Calinski-Harabasz Index: {calinski_harabasz}")

    # Add the labels as a new column to the original dataset. Assigning the raw
    # array would raise a length mismatch whenever dropna() removed a row, so
    # the labels are aligned on the surviving row index instead; rows that
    # were dropped get NaN in the label column.
    data['Diag_Gaussian_Mixture_Model_(GMM)_Clustering_label'] = pd.Series(labels, index=features.index)

    # Save the updated dataset back to the existing Excel file
    data.to_excel(file_path, index=False)  # Overwrite the existing file
    print(f"Updated dataset saved to {file_path}")

# Run diagonal-covariance GMM clustering with 10 components on the workbook below.
# covariance_type='diag' is passed explicitly because run_gmm_clustering()
# defaults to 'full'. The call returns nothing: it prints the metrics and
# overwrites the file in place with the added label column.
file_path = 'Updated_Clustering_with_Labels.xlsx'  # Path to the existing file
run_gmm_clustering(file_path, n_components=10, covariance_type='diag')


Silhouette Score: 0.5643016818392292
Davies-Bouldin Index: 0.8411646958445402
Calinski-Harabasz Index: 98.14582721500408
Updated dataset saved to Updated_Clustering_with_Labels.xlsx
