Load and inspect data

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler

# Read the train and test splits from the working directory
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Strip the non-feature columns ('Activity', 'subject') from each split,
# then stack the two splits row-wise into a single feature table
features_train, features_test = (
    split.drop(columns=['Activity', 'subject']) for split in (train, test)
)
combined_features = pd.concat([features_train, features_test])

# Fit a StandardScaler on the stacked features and transform them in one pass
scaler = StandardScaler()
scaled_features = scaler.fit_transform(combined_features)

# Number of clusters/components shared by every baseline model.
# NOTE(review): presumably chosen to match the number of distinct 'Activity'
# labels in the dataset — confirm.
k_baseline = 6

# Initialize clustering models
print("Running baseline models...")
models = {
    # n_init is pinned explicitly. Leaving it unset triggers a FutureWarning on
    # scikit-learn 1.2/1.3 and, from 1.4 on, silently switches to n_init='auto'
    # (a single initialisation with k-means++), which would change the baseline.
    "K-Means": KMeans(n_clusters=k_baseline, n_init=10, random_state=42),
    "GMM": GaussianMixture(n_components=k_baseline, random_state=42),
    # Ward linkage is deterministic, so no random_state is needed here.
    "Agglomerative": AgglomerativeClustering(n_clusters=k_baseline, linkage='ward'),
    "K-Medoids": KMedoids(n_clusters=k_baseline, random_state=42, method="pam"),
}

# Evaluate baseline models
# Each model is fitted on the standardized features and scored with three
# internal validity metrics (no ground-truth labels are used):
#   - Silhouette score: higher is better, range [-1, 1]
#   - Calinski-Harabasz index: higher is better
#   - Davies-Bouldin index: lower is better
baseline_records = []
for model_name, model in models.items():
    # Every estimator in `models` implements fit_predict(), so hard cluster
    # assignments are obtained uniformly instead of branching on the dict key
    # (which would break as soon as a model is renamed or added).
    cluster_labels = model.fit_predict(scaled_features)

    silhouette = silhouette_score(scaled_features, cluster_labels)
    calinski = calinski_harabasz_score(scaled_features, cluster_labels)
    davies_bouldin = davies_bouldin_score(scaled_features, cluster_labels)

    # Keep the scores so later cells can compare models without re-fitting.
    baseline_records.append({
        "Model": model_name,
        "Silhouette": silhouette,
        "Calinski-Harabasz": calinski,
        "Davies-Bouldin": davies_bouldin,
    })

    print(f"\n{model_name} Results (Baseline k={k_baseline}):")
    print(f"Silhouette Score: {silhouette:.3f}")
    print(f"Calinski-Harabasz Index: {calinski:.3f}")
    print(f"Davies-Bouldin Index: {davies_bouldin:.3f}")

# One row per model, indexed by model name.
baseline_results = pd.DataFrame(baseline_records).set_index("Model")



Running baseline models...


  super()._check_params_vs_input(X, default_n_init=10)



K-Means Results (Baseline k=6):
Silhouette Score: 0.110
Calinski-Harabasz Index: 2556.542
Davies-Bouldin Index: 2.384

GMM Results (Baseline k=6):
Silhouette Score: 0.135
Calinski-Harabasz Index: 2458.728
Davies-Bouldin Index: 2.581

Agglomerative Results (Baseline k=6):
Silhouette Score: 0.117
Calinski-Harabasz Index: 2349.668
Davies-Bouldin Index: 2.482

K-Medoids Results (Baseline k=6):
Silhouette Score: 0.062
Calinski-Harabasz Index: 2398.673
Davies-Bouldin Index: 2.750
