In [2]:
import pandas as pd
from sklearn.cluster import KMeans, SpectralClustering, Birch, HDBSCAN
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import f1_score

In [3]:
# Define clustering models in a dictionary for flexibility
clustering_models = {
    "KMeans": lambda n: KMeans(n_clusters=n, random_state=0),
    "Spectral": lambda n: SpectralClustering(n_clusters=n, assign_labels='discretize', random_state=0, affinity='nearest_neighbors'),
    "Birch": lambda n: Birch(n_clusters=n)
}

# Initialize result container
metrics = []
metric_kmeans = []
latents = []

for n_latents in [3, 5, 8]:
    data = pd.read_csv(f"../data/fmri_derived/dfcs/latent_representations_{n_latents}.csv", index_col=0)
    
    for n_clusters in [3, 5, 8]:  
        for name, model_func in clustering_models.items():
            model = model_func(n_clusters)
            labels = model.fit_predict(data)
    
            # Evaluation with distance metrics
            ss = silhouette_score(data, labels)
            db = davies_bouldin_score(data, labels)
            ch = calinski_harabasz_score(data, labels)
    
            if name == "KMeans":
                metric_kmeans.append([n_latents, n_clusters, model.inertia_])

            # Evaluation with a classifier
            X =
            y = 
            x_train
            train predict
            f1_score = f1_score(y_true, y_pred)
    
            # Store results
            metrics.append([name, n_latents, n_clusters, ss, db, ch, f1_score])
            latents.append(pd.Series(name = f"{name}_{n_latents}_{n_clusters}", data =labels))

# Convert to DataFrame
df_metrics = pd.DataFrame(metrics, columns=["Model", "n_latents", "n_clusters", "Silhouette", "Davies-Bouldin", "Calinski-Harabasz"])
df_metrics



Unnamed: 0,Model,n_latents,n_clusters,Silhouette,Davies-Bouldin,Calinski-Harabasz
0,KMeans,2,2,0.614298,0.532556,16106.128083
1,Spectral,2,2,0.622568,0.496189,15652.32085
2,Birch,2,2,0.617669,0.427036,13002.711153
3,KMeans,2,3,0.563949,0.531035,21782.912335
4,Spectral,2,3,0.503124,0.538596,14916.975697
5,Birch,2,3,0.506023,0.512108,16254.330725
6,KMeans,2,5,0.545586,0.513257,30215.19482
7,Spectral,2,5,0.498923,0.605836,23818.685406
8,Birch,2,5,0.481925,0.529937,21068.883863
9,KMeans,2,8,0.527789,0.52339,39149.834243


In [4]:
Y = pd.DataFrame(latents).T
Y.to_csv("../data/fmri_derived/dfcs/all_dfcs_flattened_clusters.csv")

In [5]:
Y

Unnamed: 0,KMeans_2_2,Spectral_2_2,Birch_2_2,KMeans_2_3,Spectral_2_3,Birch_2_3,KMeans_2_5,Spectral_2_5,Birch_2_5,KMeans_2_8,...,Birch_8_2,KMeans_8_3,Spectral_8_3,Birch_8_3,KMeans_8_5,Spectral_8_5,Birch_8_5,KMeans_8_8,Spectral_8_8,Birch_8_8
0,0,0,0,0,0,2,4,0,0,4,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,2,4,0,0,4,...,1,0,0,0,0,0,0,0,0,0
2,1,1,0,2,2,0,2,2,3,2,...,0,2,0,1,1,1,4,1,5,1
3,1,1,0,2,2,0,2,2,3,2,...,0,2,0,1,1,1,4,1,1,1
4,1,1,1,1,2,1,2,2,2,5,...,0,2,0,1,1,1,4,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7089,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
7090,0,0,0,0,0,2,0,0,0,4,...,1,0,0,0,0,0,0,7,0,0
7091,0,0,0,0,0,0,0,4,1,0,...,1,0,0,0,0,0,0,0,0,0
7092,0,0,0,0,0,0,0,4,1,0,...,1,0,0,0,0,0,0,0,0,0


In [None]:
df_metrics['parameters'] = df_metrics.apply(lambda row: f"{row['n_latents']}_{row['n_clusters']}", axis=1)
metrics = ["Silhouette", "Davies-Bouldin", "Calinski-Harabasz"]

fig, axes = plt.subplots(len(metrics), 1, figsize=(8, 10), sharex=True)
for i, metric in enumerate(metrics):
    ax = axes[i]
    for model in ["KMeans", "Spectral", "Birch"]:
        dummy = df_metrics[df_metrics["Model"] == model][['parameters', metric]].set_index('parameters')
        ax.plot(dummy, label=model)
    
    ax.set_title(metric)
    ax.set_ylabel(metric)
    ax.legend()
    ax.grid(True, linestyle="--", alpha=0.5)
    ax.tick_params(axis='x', rotation=45)

axes[-1].set_xlabel("Number of latent dimensions & clusters")

plt.tight_layout()
plt.savefig("../data/fmri_derived/dfcs/Clusters_Evaluation.png", dpi=100)
plt.close()


In [None]:
# Columns you want to filter on
cols = ["Silhouette", "Davies-Bouldin", "Calinski-Harabasz"]

# Compute 75th percentile threshold for each column
thresholds = df_metrics[cols].quantile(0.7)

# Keep rows where ALL three columns are >= their respective thresholds
df_filtered = df_metrics[
    (df_metrics[cols[0]] >= thresholds[cols[0]]) &
    (df_metrics[cols[1]] <= thresholds[cols[1]]) &
    (df_metrics[cols[2]] >= thresholds[cols[2]])
]
df_filtered