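Applying a grid search over the number of clusters for several clustering algorithms, scoring each candidate with the silhouette score.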

K-Means Clustering

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the dataset
def load_data(file_path):
    """Read an Excel file and return its standardized contents.

    Args:
        file_path: Location of the Excel file, passed to ``pd.read_excel``.

    Returns:
        The result of ``StandardScaler().fit_transform`` applied to the
        loaded frame.
    """
    raw_frame = pd.read_excel(file_path)
    # Fit the scaler on the full sheet and transform it in one step.
    return StandardScaler().fit_transform(raw_frame)

# Grid Search for K-Means
def grid_search_kmeans(data, cluster_range):
    """Pick the K-Means cluster count with the highest silhouette score.

    Candidates that cannot be scored are skipped with a printed message
    instead of aborting the whole search: ``silhouette_score`` only accepts
    between 2 and ``n_samples - 1`` distinct labels.

    Args:
        data: Samples to cluster (array-like with ``shape`` or ``len``).
        cluster_range: Iterable of candidate cluster counts.

    Returns:
        Tuple ``(best_k, best_score)``; ``(None, -1)`` when no candidate
        could be scored.
    """
    n_samples = data.shape[0] if hasattr(data, "shape") else len(data)
    best_score = -1
    best_k = None
    for k in cluster_range:
        # Reject counts the silhouette score is undefined for before fitting.
        if not 2 <= k < n_samples:
            print(f"k={k} cannot be scored with {n_samples} samples. Skipping this value.")
            continue
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(data)
        # A fit can collapse to a single cluster (e.g. duplicated points).
        if len(set(labels)) < 2:
            print(f"Only one cluster found for k={k}. Skipping this value.")
            continue
        silhouette = silhouette_score(data, labels)
        # `best_k is None` lets a first score of exactly -1 still be recorded.
        if best_k is None or silhouette > best_score:
            best_score = silhouette
            best_k = k
    return best_k, best_score

# Main function to execute K-Means with Grid Search
def run_kmeans_clustering(file_path):
    """Load the data, grid-search K-Means over k = 2..10, and print the winner.

    Args:
        file_path: Path handed to ``load_data``.
    """
    scaled = load_data(file_path)
    # range(2, 11) covers k = 2 through 10 inclusive.
    best_k, best_score = grid_search_kmeans(scaled, range(2, 11))
    print(f"Best K-Means k: {best_k} with score: {best_score}")

# Run the K-Means grid search on the spreadsheet below and print the best k.
# NOTE(review): '/content/...' looks like a Colab-style absolute path — confirm
# the file exists there before running this cell elsewhere.
file_path = '/content/labels2.xlsx'
run_kmeans_clustering(file_path)

Best K-Means k: 10 with score: 0.9049649294173752


Gaussian Mixture Model (GMM)

In [None]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

# Load the dataset
def load_data(file_path):
    """Read an Excel file and return its standardized contents.

    Args:
        file_path: Location of the Excel file, passed to ``pd.read_excel``.

    Returns:
        The result of ``StandardScaler().fit_transform`` applied to the
        loaded frame.
    """
    raw_frame = pd.read_excel(file_path)
    # Fit the scaler on the full sheet and transform it in one step.
    return StandardScaler().fit_transform(raw_frame)

# Grid Search for GMM
def grid_search_gmm(data, cluster_range, covariance_type='full'):
    """Pick the GMM component count with the highest silhouette score.

    Candidates that cannot be scored are skipped with a printed message
    instead of aborting the whole search: ``silhouette_score`` only accepts
    between 2 and ``n_samples - 1`` distinct labels.

    Args:
        data: Samples to cluster (array-like with ``shape`` or ``len``).
        cluster_range: Iterable of candidate component counts.
        covariance_type: Forwarded to ``GaussianMixture``; defaults to
            ``'full'``, the value this function previously hard-coded.

    Returns:
        Tuple ``(best_k, best_score)``; ``(None, -1)`` when no candidate
        could be scored.
    """
    n_samples = data.shape[0] if hasattr(data, "shape") else len(data)
    best_score = -1
    best_k = None
    for k in cluster_range:
        # Reject counts the silhouette score is undefined for before fitting.
        if not 2 <= k < n_samples:
            print(f"k={k} cannot be scored with {n_samples} samples. Skipping this value.")
            continue
        gmm = GaussianMixture(n_components=k, covariance_type=covariance_type, random_state=42)
        labels = gmm.fit_predict(data)
        # Hard assignments may use fewer components than requested.
        if len(set(labels)) < 2:
            print(f"Only one cluster found for k={k}. Skipping this value.")
            continue
        silhouette = silhouette_score(data, labels)
        # `best_k is None` lets a first score of exactly -1 still be recorded.
        if best_k is None or silhouette > best_score:
            best_score = silhouette
            best_k = k
    return best_k, best_score

# Main function to execute GMM with Grid Search
def run_gmm_clustering(file_path):
    """Load the data, grid-search the GMM over k = 2..10, and print the winner.

    Args:
        file_path: Path handed to ``load_data``.
    """
    scaled = load_data(file_path)
    # range(2, 11) covers k = 2 through 10 inclusive.
    best_k, best_score = grid_search_gmm(scaled, range(2, 11))
    print(f"Best GMM k: {best_k} with score: {best_score}")

# Run the GMM grid search on the spreadsheet below and print the best k.
# NOTE(review): '/content/...' looks like a Colab-style absolute path — confirm
# the file exists there before running this cell elsewhere.
file_path = '/content/labels2.xlsx'
run_gmm_clustering(file_path)

Best GMM k: 10 with score: 0.9049649294173752


Hierarchical Clustering


In [None]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Load the dataset
def load_data(file_path):
    """Read an Excel file and return its standardized contents.

    Args:
        file_path: Location of the Excel file, passed to ``pd.read_excel``.

    Returns:
        The result of ``StandardScaler().fit_transform`` applied to the
        loaded frame.
    """
    raw_frame = pd.read_excel(file_path)
    # Fit the scaler on the full sheet and transform it in one step.
    return StandardScaler().fit_transform(raw_frame)

# Grid Search for Hierarchical Clustering
def grid_search_hierarchical(data, cluster_range):
    """Pick the agglomerative cluster count with the highest silhouette score.

    Candidates that cannot be scored are skipped with a printed message
    instead of aborting the whole search: ``silhouette_score`` only accepts
    between 2 and ``n_samples - 1`` distinct labels.

    Args:
        data: Samples to cluster (array-like with ``shape`` or ``len``).
        cluster_range: Iterable of candidate cluster counts.

    Returns:
        Tuple ``(best_k, best_score)``; ``(None, -1)`` when no candidate
        could be scored.
    """
    n_samples = data.shape[0] if hasattr(data, "shape") else len(data)
    best_score = -1
    best_k = None
    for k in cluster_range:
        # Reject counts the silhouette score is undefined for before fitting.
        if not 2 <= k < n_samples:
            print(f"k={k} cannot be scored with {n_samples} samples. Skipping this value.")
            continue
        hierarchical = AgglomerativeClustering(n_clusters=k)
        labels = hierarchical.fit_predict(data)
        # Defensive: never hand a single-label partition to silhouette_score.
        if len(set(labels)) < 2:
            print(f"Only one cluster found for k={k}. Skipping this value.")
            continue
        silhouette = silhouette_score(data, labels)
        # `best_k is None` lets a first score of exactly -1 still be recorded.
        if best_k is None or silhouette > best_score:
            best_score = silhouette
            best_k = k
    return best_k, best_score

# Main function to execute Hierarchical Clustering with Grid Search
def run_hierarchical_clustering(file_path):
    """Load the data, grid-search agglomerative clustering over k = 2..10, and print the winner.

    Args:
        file_path: Path handed to ``load_data``.
    """
    scaled = load_data(file_path)
    # range(2, 11) covers k = 2 through 10 inclusive.
    best_k, best_score = grid_search_hierarchical(scaled, range(2, 11))
    print(f"Best Hierarchical k: {best_k} with score: {best_score}")

# Run the hierarchical-clustering grid search on the spreadsheet below and print the best k.
# NOTE(review): '/content/...' looks like a Colab-style absolute path — confirm
# the file exists there before running this cell elsewhere.
file_path = '/content/labels2.xlsx'
run_hierarchical_clustering(file_path)

Best Hierarchical k: 10 with score: 0.9049649294173752


In [None]:
!pip install scikit-learn-extra

Collecting scikit-learn-extra
  Downloading scikit_learn_extra-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Downloading scikit_learn_extra-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn-extra
Successfully installed scikit-learn-extra-0.3.0


K-Medoids Clustering

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score
import numpy as np

# Load the dataset
def load_data(file_path):
    """Read an Excel file and return its standardized contents.

    Args:
        file_path: Location of the Excel file, passed to ``pd.read_excel``.

    Returns:
        The result of ``StandardScaler().fit_transform`` applied to the
        loaded frame.
    """
    raw_frame = pd.read_excel(file_path)
    # Fit the scaler on the full sheet and transform it in one step.
    return StandardScaler().fit_transform(raw_frame)

# Grid Search for K-Medoids
def grid_search_kmedoids(data, cluster_range, metric='euclidean'):
    """Pick the K-Medoids cluster count with the highest silhouette score.

    A candidate is skipped (with a printed notice) whenever the fit yields
    fewer distinct labels than the requested number of clusters.

    Args:
        data: Samples to cluster.
        cluster_range: Iterable of candidate cluster counts.
        metric: Distance metric forwarded to ``KMedoids``.

    Returns:
        Tuple ``(best_k, best_score)``; ``(None, -1)`` when every candidate
        was skipped or none scored above -1.
    """
    top_k, top_score = None, -1
    for k in cluster_range:
        assignments = KMedoids(n_clusters=k, metric=metric, random_state=42).fit_predict(data)

        if len(np.unique(assignments)) >= k:
            # Every requested cluster received at least one sample: score it.
            score = silhouette_score(data, assignments)
            if score > top_score:
                top_k, top_score = k, score
        else:
            print(f"Cluster count mismatch for k={k}. Skipping this value.")
    return top_k, top_score

# Main function to execute K-Medoids with Grid Search
def run_kmedoids_clustering(file_path):
    """Load the data, grid-search K-Medoids over k = 2..10, and print the winner.

    Args:
        file_path: Path handed to ``load_data``.
    """
    data = load_data(file_path)
    # The upper bound of range() is exclusive, so 11 is needed to include k = 10
    # (the previous range(2, 10) silently stopped at k = 9).
    cluster_range = range(2, 11)  # Testing k from 2 to 10
    best_k, best_score = grid_search_kmedoids(data, cluster_range)
    print(f"Best K-Medoids k: {best_k} with score: {best_score}")

# Run the K-Medoids grid search on the spreadsheet below and print the best k.
# NOTE(review): '/content/...' looks like a Colab-style absolute path — confirm
# the file exists there before running this cell elsewhere.
file_path = '/content/labels2.xlsx'
run_kmedoids_clustering(file_path)



Cluster count mismatch for k=2. Skipping this value.
Cluster count mismatch for k=3. Skipping this value.
Cluster count mismatch for k=4. Skipping this value.
Cluster count mismatch for k=5. Skipping this value.
Cluster count mismatch for k=6. Skipping this value.
Cluster count mismatch for k=7. Skipping this value.
Cluster count mismatch for k=8. Skipping this value.
Cluster count mismatch for k=9. Skipping this value.
Best K-Medoids k: None with score: -1




Diagonal Covariance GMM

In [None]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

# Load the dataset
def load_data(file_path):
    """Read an Excel file and return its standardized contents.

    Args:
        file_path: Location of the Excel file, passed to ``pd.read_excel``.

    Returns:
        The result of ``StandardScaler().fit_transform`` applied to the
        loaded frame.
    """
    raw_frame = pd.read_excel(file_path)
    # Fit the scaler on the full sheet and transform it in one step.
    return StandardScaler().fit_transform(raw_frame)

# Grid Search for GMM
def grid_search_gmm(data, cluster_range, covariance_type='diag'):
    """Pick the GMM component count with the highest silhouette score.

    Candidates that cannot be scored are skipped with a printed message
    instead of aborting the whole search: ``silhouette_score`` only accepts
    between 2 and ``n_samples - 1`` distinct labels.

    Args:
        data: Samples to cluster (array-like with ``shape`` or ``len``).
        cluster_range: Iterable of candidate component counts.
        covariance_type: Forwarded to ``GaussianMixture``; defaults to
            ``'diag'``, the value this function previously hard-coded.

    Returns:
        Tuple ``(best_k, best_score)``; ``(None, -1)`` when no candidate
        could be scored.
    """
    n_samples = data.shape[0] if hasattr(data, "shape") else len(data)
    best_score = -1
    best_k = None
    for k in cluster_range:
        # Reject counts the silhouette score is undefined for before fitting.
        if not 2 <= k < n_samples:
            print(f"k={k} cannot be scored with {n_samples} samples. Skipping this value.")
            continue
        gmm = GaussianMixture(n_components=k, covariance_type=covariance_type, random_state=42)
        labels = gmm.fit_predict(data)
        # Hard assignments may use fewer components than requested.
        if len(set(labels)) < 2:
            print(f"Only one cluster found for k={k}. Skipping this value.")
            continue
        silhouette = silhouette_score(data, labels)
        # `best_k is None` lets a first score of exactly -1 still be recorded.
        if best_k is None or silhouette > best_score:
            best_score = silhouette
            best_k = k
    return best_k, best_score

# Main function to execute GMM with Grid Search
def run_gmm_clustering(file_path):
    """Load the data, grid-search the GMM over k = 2..10, and print the winner.

    Args:
        file_path: Path handed to ``load_data``.
    """
    scaled = load_data(file_path)
    # range(2, 11) covers k = 2 through 10 inclusive.
    best_k, best_score = grid_search_gmm(scaled, range(2, 11))
    print(f"Best GMM k: {best_k} with score: {best_score}")

# Run the GMM grid search on the spreadsheet below and print the best k.
# NOTE(review): '/content/...' looks like a Colab-style absolute path — confirm
# the file exists there before running this cell elsewhere.
file_path = '/content/labels2.xlsx'
run_gmm_clustering(file_path)






Best GMM k: 10 with score: 0.9049649294173752
