In [None]:
import MinkowskiEngine as ME
import matplotlib.pyplot as plt
import matplotlib as mpl
import importlib
import torchvision.transforms.v2 as transforms
import torchvision.transforms.v2.functional as F
from torch import nn

## Jupyter magic
%matplotlib inline
mpl.rcParams['figure.figsize'] = [8, 6]
mpl.rcParams['font.size'] = 16

import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.device(device)
import numpy as np
SEED=12345
_=np.random.seed(SEED)
_=torch.manual_seed(SEED)

In [None]:
## Includes from my libraries for this project                                                                                                                                           
from ME_dataset_libs import make_dense, make_dense_from_tensor, Label

In [None]:
from FSD_training_analysis import get_models_from_checkpoint

## Directory holding the pre-calculated model weights
file_dir = "/pscratch/sd/c/cwilk"

## This is interesting, but limited, so the best performance really is for ~N=20-30. The best silhouette is ~0.25
# chk_file = "state_lat64_hid128_clust25_nchan64_5E-6_1024_PROJ0.5one_CLUST0.5one_ent1E-1_soft1.0_arch24x8silu_poolmax_flat1_grow1_kern7_sep1_onecycle50_bigaugbilinfix0.5_DROP0_WEIGHT_DECAY0.05_10M_DATA1_FSDCCFIX.pth"
chk_file = "state_lat64_hid128_nchan64_5E-6_1024_PROJ0.5two_arch24x8silu_poolmax_flat1_grow1_kern7_onecycle50_bigaugbilinfix0.5_DROP0_DECAY0_5M_FSDSIMCLR.pth"

## Rebuild the encoder, its heads and the stored arguments from the checkpoint
checkpoint_path = file_dir + "/" + chk_file
encoder, heads, args = get_models_from_checkpoint(checkpoint_path)

## Encoder: evaluation mode, then onto the selected device
encoder.eval()
encoder.to(device)

## Every head gets the same treatment
for h in heads.values():
    h.eval()
    h.to(device)

print("Loaded:", chk_file)

In [None]:
## Setup the dataloader
from FSD_training_analysis import get_dataset
import time

## Input locations and the maximum number of events requested from each
data_dir = "/pscratch/sd/c/cwilk/FSD/DATA"
sim_dir = "/pscratch/sd/c/cwilk/FSD/SIMULATIONv2"
max_data_events = 20000
max_sim_events = 20000

## Build the simulation loader first, then the data loader, timing both together
start = time.time()
sim_dataset, sim_loader = get_dataset(sim_dir, max_sim_events, return_metadata=True)
data_dataset, data_loader = get_dataset(data_dir, max_data_events, return_metadata=True)
load_seconds = time.time() - start

print("Time taken to load", len(data_dataset), "data and", len(sim_dataset), "images:", load_seconds)

In [None]:
## Run every data and simulation image through the loaded encoder/heads, then
## reorder the resulting clusters for presentation.
import numpy as np
import FSD_training_analysis
## Reload first so that the names imported on the next line reflect any edits
## made to the module since the kernel started
importlib.reload(FSD_training_analysis)
from FSD_training_analysis import image_loop, reorder_clusters
import time
start = time.time()
## Get the processed vectors of interest from the datasets.
## NOTE(review): the meaning of the trailing positional `False` is defined by
## image_loop and is not visible here -- confirm against its signature.
data_processed = image_loop(encoder, heads, data_loader, False)
sim_processed = image_loop(encoder, heads, sim_loader, False)
print("Time to process events:", time.time() - start)

start = time.time()
## Do some magic to re-order the clusters for presentation purposes.
## The return value is discarded, so this presumably updates both containers
## in place -- TODO confirm.
reorder_clusters(data_processed, sim_processed)
print("Time to reorder events:", time.time() - start)


In [None]:
## Play with some GMM options
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

def run_gmm_skl(dataset, k, covariance_type='full', max_iter=200, random_state=None):
    """Fit a Gaussian mixture to standardised features and return hard labels.

    Parameters
    ----------
    dataset : array-like
        Feature matrix handed to ``StandardScaler.fit_transform``.
    k : int
        Number of mixture components.
    covariance_type : str, optional
        Forwarded to ``GaussianMixture`` (default ``'full'``).
    max_iter : int, optional
        Forwarded to ``GaussianMixture`` (default 200).
    random_state : int, RandomState or None, optional
        Forwarded to ``GaussianMixture`` so a fit can be reproduced
        independently of global RNG state. The default ``None`` keeps the
        previous behaviour.

    Returns
    -------
    labels, aic, bic
        Per-sample component index plus the AIC and BIC of the fitted model,
        all evaluated on the standardised input.
    """
    ## Standardise the features before fitting
    input_vect = StandardScaler().fit_transform(dataset)

    gmm = GaussianMixture(
        n_components=k,
        n_init=1,
        verbose=2,
        covariance_type=covariance_type,
        max_iter=max_iter,
        random_state=random_state,
    )
    gmm.fit(input_vect)

    ## Only hard assignments are returned, so the per-component posterior
    ## (predict_proba) that used to be computed and discarded is skipped
    labels = gmm.predict(input_vect)
    aic = gmm.aic(input_vect)
    bic = gmm.bic(input_vect)

    print("Cluster weights:", gmm.weights_)
    print("AIC:", aic)
    print("BIC:", bic)

    return labels, aic, bic

In [None]:
import numpy as np
import faiss
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

def run_faiss_spherical_kmeans(dataset, n_clusters, n_iter=20, verbose=True, seed=123):
    """Cluster L2-normalised embeddings with FAISS spherical k-means.

    Parameters
    ----------
    dataset : array-like, 2-D
        One embedding per row. The input is copied and never modified.
    n_clusters : int
        Number of centroids.
    n_iter : int, optional
        Number of k-means iterations (default 20).
    verbose : bool, optional
        Forwarded to FAISS; also controls the summary printout.
    seed : int, optional
        Seed forwarded to FAISS.

    Returns
    -------
    labels, metrics, centroids
        ``labels`` is the nearest-centroid index per row. ``metrics`` is a
        dict with keys ``"silhouette"`` (cosine), ``"calinski_harabasz"`` and
        ``"davies_bouldin"``; all three are ``None`` when the labelling is
        degenerate (fewer than two clusters in use, or one cluster per
        sample). ``centroids`` is ``kmeans.centroids``.
    """
    # Private C-contiguous float32 copy: this is the layout FAISS works on,
    # and the in-place normalisation below must not touch the caller's data.
    X = np.array(dataset, dtype=np.float32, order="C")

    # Normalize embeddings (critical for cosine clustering). All-zero rows
    # have no direction; leave them at zero instead of dividing by zero and
    # feeding NaNs to FAISS.
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    X /= norms

    N, d = X.shape

    # FAISS k-means (spherical via normalization)
    kmeans = faiss.Kmeans(
        d=d,
        k=n_clusters,
        niter=n_iter,
        verbose=verbose,
        seed=seed,
        spherical=True  # ensures centroid normalization
    )
    kmeans.train(X)

    # Assign each row to its nearest centroid
    _, labels = kmeans.index.search(X, 1)
    labels = labels.flatten()

    # Fraction of samples per cluster (empty clusters count as 0)
    counts = np.bincount(labels, minlength=n_clusters)
    weights = counts / N

    # The scores are only defined for 2 <= n_labels < n_samples
    labs = np.unique(labels)
    metrics = {}

    if labs.size < 2 or labs.size >= len(labels):
        metrics["silhouette"] = None
        metrics["calinski_harabasz"] = None
        metrics["davies_bouldin"] = None
    else:
        metrics["silhouette"] = silhouette_score(X, labels, metric="cosine")
        metrics["calinski_harabasz"] = calinski_harabasz_score(X, labels)
        metrics["davies_bouldin"] = davies_bouldin_score(X, labels)

    if verbose:
        print("Cluster weights:", weights)
        print("Silhouette score:", metrics["silhouette"])
        print("Calinski-Harabasz =", metrics["calinski_harabasz"])
        print("Davies-Bouldin =", metrics["davies_bouldin"])

    return labels, metrics, kmeans.centroids

In [None]:
#!pip install spherecluster2

In [None]:
## Play with some von Mises-Fisher mixture options
import spherecluster
from spherecluster import VonMisesFisherMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

def run_vMF(dataset, n_clusters, init="random-class", n_copies=10, verbose=True):
    """Fit a soft von Mises-Fisher mixture to L2-normalised embeddings.

    Parameters
    ----------
    dataset : ndarray, 2-D
        One embedding per row.
    n_clusters : int
        Number of mixture components.
    init : str, optional
        Initialisation scheme forwarded to ``VonMisesFisherMixture``: one of
        k-means++, spherical-k-means, random, random-class (default),
        random-orthonormal. It was previously accepted but never forwarded,
        so every call used the library default.
    n_copies : int, optional
        Used both as the number of initialisations (``n_init``) and as the
        number of parallel jobs (``n_jobs``).
    verbose : bool, optional
        Forwarded to the mixture; also controls the summary printout.

    Returns
    -------
    labels, metrics
        ``labels`` is an integer cluster index per row. ``metrics`` is a dict
        with keys ``"silhouette"`` (cosine), ``"calinski_harabasz"`` and
        ``"davies_bouldin"``; all three are ``None`` when the labelling is
        degenerate (fewer than two clusters in use, or one cluster per
        sample).
    """
    ## Project onto the unit sphere. All-zero rows have no direction; leave
    ## them at zero instead of dividing by zero.
    norms = np.linalg.norm(dataset, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    X_norm = dataset / norms

    vMF = VonMisesFisherMixture(
        n_clusters=n_clusters,
        posterior_type='soft',
        init=init,
        n_init=n_copies,
        n_jobs=n_copies,
        verbose=verbose,
        max_iter=500,
    )
    vMF.fit(X_norm)

    ## For some reason the predicted labels come back as floats
    labels = vMF.predict(X_norm).astype(int)
    weights = vMF.weights_

    ## The scores are only defined for 2 <= n_labels < n_samples
    labs = np.unique(labels)
    metrics = {}
    if labs.size < 2 or labs.size >= len(labels):
        metrics["silhouette"] = None
        metrics["calinski_harabasz"] = None
        metrics["davies_bouldin"] = None
    else:
        metrics["silhouette"] = silhouette_score(X_norm, labels, metric="cosine")
        metrics["calinski_harabasz"] = calinski_harabasz_score(X_norm, labels)
        metrics["davies_bouldin"] = davies_bouldin_score(X_norm, labels)

    if verbose:
        print("Cluster weights:", weights)
        print("Silhouette score:", metrics["silhouette"])
        print("Calinski-Harabasz =", metrics["calinski_harabasz"])
        print("Davies-Bouldin =", metrics["davies_bouldin"])

    return labels, metrics

In [None]:
## Get a tSNE plot for comparison
import ME_analysis_libs
importlib.reload(ME_analysis_libs)
from ME_analysis_libs import run_tsne_skl

## Use at most 20000 points, clamped to what is actually available so the
## placeholder label vector below always has the same length as the latent slice
ntsne = min(20000, len(data_processed['latent']))
#tsne_results = run_tsne_skl(data_processed['latent'][:ntsne].copy(), \
#                            data_processed['clust_index'][:ntsne].copy(), \
#                            alpha_vect=data_processed['clust_max'][:ntsne].copy(), \
#                            perp=150, exag=20, lr=500)

## All-zero labels: this call only produces the embedding; later cells pass it
## back through `tsne_results=` together with their own labels
tsne_results = run_tsne_skl(data_processed['latent'][:ntsne].copy(),
                            np.zeros(ntsne),
                            perp=150, exag=20, lr=500)

In [None]:
## Scan the number of clusters and overlay each labelling on the shared tSNE embedding
gmm_k = [10, 20, 30, 40, 50]
gmm_labels = []
gmm_metrics = []  ## one metrics dict per k, so the scan can be compared afterwards
gmm_bic = []
gmm_aic = []

for k in gmm_k:
    print("Running k =", k)
    # these_labels, aic, bic = run_gmm_skl(data_processed['latent'], k)
    # these_labels, metrics = run_vMF(data_processed['latent'], k)
    these_labels, metrics, _ = run_faiss_spherical_kmeans(data_processed['latent'], k)
    _ = run_tsne_skl(data_processed['latent'][:ntsne].copy(),
                     these_labels[:ntsne].copy(), tsne_results=tsne_results)

    gmm_labels.append(these_labels)
    ## Previously `metrics` was overwritten on every pass and only the last k survived
    gmm_metrics.append(metrics)
    #gmm_aic.append(aic)
    #gmm_bic.append(bic)

In [None]:
## Compare the vMF initialisation schemes at a fixed number of clusters
n_clusters = 30
init_schemes = ("k-means++", "spherical-k-means", "random", "random-class", "random-orthonormal")
for init in init_schemes:
    print("Running init =", init)
    these_labels, metrics = run_vMF(
        data_processed['latent'],
        n_clusters,
        init=init,
        n_copies=10,
        verbose=True,
    )
    ## Overlay this labelling on the shared tSNE embedding
    _ = run_tsne_skl(
        data_processed['latent'][:ntsne].copy(),
        these_labels[:ntsne].copy(),
        tsne_results=tsne_results,
    )

In [None]:
## Re-run the vMF fit for the random-orthonormal initialisation only
n_clusters = 30
for init in ("random-orthonormal",):
    print("Running init =", init)
    these_labels, metrics = run_vMF(
        data_processed['latent'],
        n_clusters,
        init=init,
        n_copies=10,
        verbose=True,
    )
    ## Overlay this labelling on the shared tSNE embedding
    _ = run_tsne_skl(
        data_processed['latent'][:ntsne].copy(),
        these_labels[:ntsne].copy(),
        tsne_results=tsne_results,
    )

In [None]:
## Extend the vMF scan to larger numbers of clusters
gmm_k2 = [60, 70, 80, 90, 100]

for k in gmm_k2:
    print("Running k =", k)
    # these_labels, aic, bic = run_gmm_skl(data_processed['latent'], k)
    ## run_vMF returns (labels, metrics); unpack both. Binding the whole tuple
    ## to `these_labels` made `these_labels[:ntsne].copy()` fail, because a
    ## tuple slice has no .copy()
    these_labels, metrics = run_vMF(data_processed['latent'], k)
    _ = run_tsne_skl(data_processed['latent'][:ntsne].copy(),
                     these_labels[:ntsne].copy(), tsne_results=tsne_results)