K-Means Clustering

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import numpy as np
from scipy.spatial.distance import pdist, squareform

# Load the data
def load_data(file_path):
    """Read the Excel file at ``file_path`` and return its standard-scaled values.

    The workbook is read with ``pandas.read_excel`` (default sheet) and the
    resulting table is passed through a freshly fitted ``StandardScaler``.

    Args:
        file_path: Location of the Excel workbook.

    Returns:
        The output of ``StandardScaler.fit_transform`` for the loaded table.
    """
    raw_table = pd.read_excel(file_path)
    return StandardScaler().fit_transform(raw_table)

# Apply K-means clustering to the data exactly as it is passed in
def apply_kmeans(data, n_clusters=3):
    """Fit K-Means on ``data`` and return the model with its labels.

    Args:
        data: Feature matrix to cluster; no scaling is applied here.
        n_clusters: Number of clusters to form.

    Returns:
        Tuple ``(model, labels)`` where ``model`` is the fitted ``KMeans``
        instance and ``labels`` is its ``labels_`` attribute.
    """
    # Fixed seed so repeated runs give the same partition.
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(data)
    return model, model.labels_



# Evaluate clustering performance using all metrics
def evaluate_clustering(data, kmeans, labels):
    """Score a clustering with three internal validity metrics.

    Args:
        data: Feature matrix the labels were produced from.
        kmeans: Fitted model; accepted but not used by this function.
        labels: Cluster label for each row of ``data``.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz)``.
    """
    scores = (
        silhouette_score(data, labels),         # Silhouette score
        davies_bouldin_score(data, labels),     # Davies-Bouldin index
        calinski_harabasz_score(data, labels),  # Calinski-Harabasz index
    )
    return scores

# Main function to run the process
def main(file_path, n_clusters):
    """Load the data, cluster it with K-Means and print the quality metrics.

    Args:
        file_path: Path of the Excel workbook handed to ``load_data``.
        n_clusters: Number of clusters requested from ``apply_kmeans``.

    Returns:
        None. The three metrics are written to standard output.
    """
    # Load the data
    data = load_data(file_path)

    # Apply K-means clustering
    kmeans_model, labels = apply_kmeans(data, n_clusters)

    # Evaluate performance
    silhouette_avg, davies_bouldin, calinski_harabasz = evaluate_clustering(data, kmeans_model, labels)

    # Print results. load_data() standard-scales the features, so the former
    # "(without standardization)" label misdescribed what was measured.
    print(f"Silhouette Score (K-Means): {silhouette_avg}")
    print(f"Davies-Bouldin Index: {davies_bouldin}")
    print(f"Calinski-Harabasz Index: {calinski_harabasz}")

# Absolute path of the input Excel workbook and the number of clusters to fit
file_path = '/content/labels2.xlsx'
n_clusters = 3

# Execute the K-Means pipeline defined by main() in this cell; metrics are printed
main(file_path, n_clusters)

Silhouette Score (without standardization): 0.491845569472662
Davies-Bouldin Index: 0.7735773995402444
Calinski-Harabasz Index: 85.04454109655205


Hierarchical Clustering (Agglomerative Clustering)

In [None]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import numpy as np
from scipy.spatial.distance import pdist

# Module 1: Load the data
def load_data(file_path):
    """Read the Excel file at ``file_path`` and return its standard-scaled values.

    Args:
        file_path: Location of the Excel workbook; the default sheet is read.

    Returns:
        The output of ``StandardScaler.fit_transform`` for the loaded table.
    """
    # Imported locally because this cell's import list does not bring in
    # StandardScaler; without this the function only works if an earlier
    # cell has already imported it into the session.
    from sklearn.preprocessing import StandardScaler

    data = pd.read_excel(file_path)
    return StandardScaler().fit_transform(data)

# Module 2: Apply Hierarchical (Agglomerative) Clustering
def apply_hierarchical_clustering(data, n_clusters=3):
    """Cluster ``data`` with agglomerative clustering.

    Args:
        data: Feature matrix to cluster.
        n_clusters: Number of clusters to produce.

    Returns:
        Tuple ``(model, labels)``: the fitted ``AgglomerativeClustering``
        instance and the labels returned by its ``fit_predict``.
    """
    model = AgglomerativeClustering(n_clusters=n_clusters)
    return model, model.fit_predict(data)



# Module 4: Calculate Centroids for Clusters
def calculate_centroids(data, labels):
    """Return the mean point of every cluster.

    Args:
        data: Array-like of shape (n_samples, n_features).
        labels: Array-like with one cluster label per row of ``data``.

    Returns:
        ``numpy.ndarray`` with one row per distinct label, ordered as
        ``numpy.unique`` orders the labels (ascending).
    """
    # Coerce both inputs so plain lists work too: with a Python list,
    # ``labels == label`` is a single bool rather than a per-row mask.
    points = np.asarray(data)
    assignments = np.asarray(labels)
    return np.array(
        [points[assignments == label].mean(axis=0) for label in np.unique(assignments)]
    )


# Module 6: Evaluate clustering performance using all metrics
def evaluate_clustering(data, labels):
    """Score a clustering with three internal validity metrics.

    Args:
        data: Feature matrix the labels were produced from.
        labels: Cluster label for each row of ``data``.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz)``.
    """
    # The centroids previously computed here were never used or returned,
    # so that dead computation has been dropped.
    silhouette_avg = silhouette_score(data, labels)  # Silhouette score
    davies_bouldin = davies_bouldin_score(data, labels)  # Davies-Bouldin index
    calinski_harabasz = calinski_harabasz_score(data, labels)  # Calinski-Harabasz index

    return silhouette_avg, davies_bouldin, calinski_harabasz

# Main function to run the process
def main(file_path, n_clusters):
    """Run the agglomerative-clustering pipeline and print its metrics.

    Args:
        file_path: Path of the Excel workbook handed to ``load_data``.
        n_clusters: Number of clusters requested from the clustering step.

    Returns:
        None. The three metrics are written to standard output.
    """
    features = load_data(file_path)

    # Only the labels are needed below; the fitted model is discarded.
    _, assignments = apply_hierarchical_clustering(features, n_clusters)

    scores = evaluate_clustering(features, assignments)
    sil, dbi, chi = scores

    print(f"Silhouette Score (Hierarchical Clustering): {sil}")
    print(f"Davies-Bouldin Index: {dbi}")
    print(f"Calinski-Harabasz Index: {chi}")


# Absolute path of the input Excel workbook and the number of clusters to form
file_path = '/content/labels2.xlsx'
n_clusters = 3

# Execute the agglomerative-clustering pipeline defined by main() in this cell
main(file_path, n_clusters)

Silhouette Score (Hierarchical Clustering): 0.4858844159845584
Davies-Bouldin Index: 0.6959735397655115
Calinski-Harabasz Index: 66.85866975012894


Gaussian Mixture Models (GMM)

In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Load the dataset
def load_data(file_path, sheet_name=0):
    """Read one sheet of an Excel workbook and return its standard-scaled values.

    Args:
        file_path: Location of the Excel workbook.
        sheet_name: Sheet to read, forwarded to ``pandas.read_excel``
            (defaults to the first sheet).

    Returns:
        The output of ``StandardScaler.fit_transform`` for the loaded sheet.
    """
    # Imported locally because this cell's import list does not bring in
    # StandardScaler; without this the function only works if an earlier
    # cell has already imported it into the session.
    from sklearn.preprocessing import StandardScaler

    # sheet_name used to be accepted but ignored; forward it so callers can
    # actually choose the sheet. The default (0) keeps the old behaviour.
    data = pd.read_excel(file_path, sheet_name=sheet_name)
    return StandardScaler().fit_transform(data)

# Apply GMM clustering
def apply_gmm(data, n_components=2, covariance_type='full', random_state=42):
    """Fit a Gaussian mixture on ``data`` and label every row.

    Args:
        data: Feature matrix to cluster.
        n_components: Number of mixture components.
        covariance_type: Covariance structure passed to ``GaussianMixture``.
        random_state: Seed for the mixture's initialisation. Defaults to 42,
            the seed the K-Means and K-Medoids cells of this notebook use,
            so the reported metrics are reproducible between runs.

    Returns:
        Tuple ``(labels, gmm)``: the component index assigned to each row by
        ``fit_predict`` and the fitted ``GaussianMixture`` instance.
    """
    gmm = GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=random_state,
    )
    labels = gmm.fit_predict(data)
    return labels, gmm

# Evaluate clustering performance using three metrics
def evaluate_clustering(data, labels):
    """Score a clustering, or return sentinels when it is degenerate.

    Args:
        data: Feature matrix the labels were produced from.
        labels: Cluster label for each row of ``data``.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz)``. When
        ``labels`` holds fewer than two distinct values all three entries
        are ``-1`` and no metric is computed.
    """
    # Guard first: with a single cluster the metrics are not computed.
    if len(set(labels)) <= 1:
        return -1, -1, -1

    return (
        silhouette_score(data, labels),         # Silhouette score
        davies_bouldin_score(data, labels),     # Davies-Bouldin index
        calinski_harabasz_score(data, labels),  # Calinski-Harabasz index
    )

# Main function to execute GMM on the loaded data
def run_gmm_clustering(file_path, sheet_name=0, n_components=2, covariance_type='full'):
    """Load the workbook, fit a Gaussian mixture and print its metrics.

    Args:
        file_path: Path of the Excel workbook handed to ``load_data``.
        sheet_name: Sheet selector handed to ``load_data``.
        n_components: Number of mixture components.
        covariance_type: Covariance structure for the mixture.

    Returns:
        None. The three metrics are written to standard output.
    """
    features = load_data(file_path, sheet_name)

    # The fitted mixture itself is not needed after labelling.
    assignments, _ = apply_gmm(
        features,
        n_components=n_components,
        covariance_type=covariance_type,
    )

    sil, dbi, chi = evaluate_clustering(features, assignments)

    print(f"Silhouette Score: {sil}")
    print(f"Davies-Bouldin Index: {dbi}")
    print(f"Calinski-Harabasz Index: {chi}")

# Fit a three-component, full-covariance GMM on the workbook and print its metrics
file_path = '/content/labels2.xlsx'
run_gmm_clustering(file_path, n_components=3, covariance_type='full')

Silhouette Score: 0.41937219155744565
Davies-Bouldin Index: 1.0196791831162377
Calinski-Harabasz Index: 72.4993415269381


In [None]:
!pip install scikit-learn-extra


Collecting scikit-learn-extra
  Downloading scikit_learn_extra-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Downloading scikit_learn_extra-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 14.5 MB/s eta 0:00:00
Installing collected packages: scikit-learn-extra
Successfully installed scikit-learn-extra-0.3.0


K-Medoids Clustering

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Load the dataset
def load_data(file_path, sheet_name=0):
    """Read one sheet of an Excel workbook and return its standard-scaled values.

    Args:
        file_path: Location of the Excel workbook.
        sheet_name: Sheet to read, forwarded to ``pandas.read_excel``
            (defaults to the first sheet).

    Returns:
        The output of ``StandardScaler.fit_transform`` for the loaded sheet.
    """
    # sheet_name used to be accepted but ignored; forward it so callers can
    # actually choose the sheet. The default (0) keeps the old behaviour.
    data = pd.read_excel(file_path, sheet_name=sheet_name)
    scaler = StandardScaler()  # Standard scaling
    return scaler.fit_transform(data)

# Apply K-Medoids clustering
def apply_kmedoids(data, n_clusters=2, metric='euclidean'):
    """Cluster ``data`` with K-Medoids.

    Args:
        data: Feature matrix to cluster.
        n_clusters: Number of medoids / clusters requested.
        metric: Distance metric name passed to ``KMedoids``.

    Returns:
        Tuple ``(labels, model)``: the labels returned by ``fit_predict``
        and the fitted ``KMedoids`` instance.
    """
    # Fixed seed so repeated runs give the same partition.
    model = KMedoids(n_clusters=n_clusters, metric=metric, random_state=42)
    assignments = model.fit_predict(data)
    return assignments, model

# Evaluate clustering performance using three metrics
def evaluate_clustering(data, labels):
    """Compute three cluster-validity metrics for ``labels`` on ``data``.

    Args:
        data: Feature matrix the labels were produced from.
        labels: Cluster label for each row of ``data``.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz)``. If there
        are fewer than two distinct labels, ``(-1, -1, -1)`` is returned
        instead and no metric is evaluated.
    """
    has_multiple_clusters = len(set(labels)) > 1
    if not has_multiple_clusters:
        # Sentinel triple: the metrics are skipped for a single cluster.
        return -1, -1, -1

    sil = silhouette_score(data, labels)         # Silhouette score
    dbi = davies_bouldin_score(data, labels)     # Davies-Bouldin index
    chi = calinski_harabasz_score(data, labels)  # Calinski-Harabasz index
    return sil, dbi, chi

# Main function to execute K-Medoids on the loaded data
def run_kmedoids_clustering(file_path, sheet_name=0, n_clusters=2, metric='euclidean'):
    """Load the workbook, cluster it with K-Medoids and print the metrics.

    Args:
        file_path: Path of the Excel workbook handed to ``load_data``.
        sheet_name: Sheet selector handed to ``load_data``.
        n_clusters: Number of clusters requested.
        metric: Distance metric name for K-Medoids.

    Returns:
        None. The three metrics are written to standard output; each prints
        as ``-1`` when ``evaluate_clustering`` reports a single cluster.
    """
    features = load_data(file_path, sheet_name)

    # The fitted estimator is not needed once the labels are available.
    assignments, _ = apply_kmedoids(features, n_clusters=n_clusters, metric=metric)

    sil, dbi, chi = evaluate_clustering(features, assignments)

    print(f"Silhouette Score: {sil}")
    print(f"Davies-Bouldin Index: {dbi}")
    print(f"Calinski-Harabasz Index: {chi}")

# Cluster the workbook into three groups with Euclidean K-Medoids and print the metrics
file_path = '/content/labels2.xlsx'
run_kmedoids_clustering(file_path, n_clusters=3, metric='euclidean')


Silhouette Score: -1
Davies-Bouldin Index: -1
Calinski-Harabasz Index: -1




Diagonal Covariance GMM



In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
def load_data(file_path, sheet_name=0):
    """Read one sheet of an Excel workbook and return its standard-scaled values.

    Args:
        file_path: Location of the Excel workbook.
        sheet_name: Sheet to read, forwarded to ``pandas.read_excel``.

    Returns:
        The output of ``StandardScaler.fit_transform`` for the loaded sheet.
    """
    sheet = pd.read_excel(file_path, sheet_name=sheet_name)
    return StandardScaler().fit_transform(sheet)

# Apply GMM clustering with Diagonal Covariance
def apply_gmm(data, n_components=2, covariance_type='diag', random_state=42):
    """Fit a Gaussian mixture on ``data`` and label every row.

    Args:
        data: Feature matrix to cluster.
        n_components: Number of mixture components.
        covariance_type: Covariance structure passed to ``GaussianMixture``
            (diagonal by default in this cell).
        random_state: Seed for the mixture's initialisation. Defaults to 42,
            the seed the K-Means and K-Medoids cells of this notebook use,
            so the reported metrics are reproducible between runs.

    Returns:
        Tuple ``(labels, gmm)``: the component index assigned to each row by
        ``fit_predict`` and the fitted ``GaussianMixture`` instance.
    """
    gmm = GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=random_state,
    )
    labels = gmm.fit_predict(data)
    return labels, gmm

# Evaluate clustering performance using three metrics
def evaluate_clustering(data, labels):
    """Return silhouette, Davies-Bouldin and Calinski-Harabasz scores.

    Args:
        data: Feature matrix the labels were produced from.
        labels: Cluster label for each row of ``data``.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz)``; every
        entry is ``-1`` when ``labels`` contains fewer than two distinct
        values, in which case none of the metrics is evaluated.
    """
    if len(set(labels)) < 2:
        # A single cluster: report the sentinel for each metric.
        silhouette = davies_bouldin = calinski_harabasz = -1
        return silhouette, davies_bouldin, calinski_harabasz

    # Metrics are evaluated in the same order in which they are returned.
    metrics = (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
    return tuple(metric(data, labels) for metric in metrics)

# Main function to execute GMM with Diagonal Covariance on the loaded data
def run_gmm_clustering(file_path, sheet_name=0, n_components=2, covariance_type='diag'):
    """Load the workbook, fit a Gaussian mixture and print its metrics.

    Args:
        file_path: Path of the Excel workbook handed to ``load_data``.
        sheet_name: Sheet selector handed to ``load_data``.
        n_components: Number of mixture components.
        covariance_type: Covariance structure for the mixture (diagonal by
            default).

    Returns:
        None. The three metrics are written to standard output.
    """
    features = load_data(file_path, sheet_name)

    # The fitted mixture itself is not needed after labelling.
    assignments, _ = apply_gmm(
        features,
        n_components=n_components,
        covariance_type=covariance_type,
    )

    results = evaluate_clustering(features, assignments)
    sil, dbi, chi = results

    print(f"Silhouette Score: {sil}")
    print(f"Davies-Bouldin Index: {dbi}")
    print(f"Calinski-Harabasz Index: {chi}")

# Fit a three-component, diagonal-covariance GMM on the workbook and print its metrics
file_path = '/content/labels2.xlsx'
run_gmm_clustering(file_path, n_components=3, covariance_type='diag')





Silhouette Score: 0.4237725054190832
Davies-Bouldin Index: 0.7185499008968916
Calinski-Harabasz Index: 65.66553280947187
