In [None]:
import joblib
import numpy as np
from scipy.stats import pearsonr
from tslearn.metrics import dtw_path_from_metric
from scipy.stats import entropy
from itertools import combinations

In [None]:
# Number of time steps every series is expected to have; enforced by the shape
# assertions in cluster_variance_across_samples and intra_class_variance.
SAMPLE_LEN = 256

def intra_cluster_dtw_distances(clusters, centroids):
    """Median DTW distance between each cluster's members and its centroid.

    Args:
        clusters: sequence of clusters, each a sequence of time series.
        centroids: sequence indexed like ``clusters``; ``centroids[i]`` is
            compared against every member of ``clusters[i]``.

    Returns:
        A list with one value per cluster: the median member-to-centroid
        distance, rounded to two decimals.
    """
    medians = []
    for idx, members in enumerate(clusters):
        # dtw_path_from_metric returns a (path, distance) pair; keep the distance.
        member_dists = [
            dtw_path_from_metric(series, centroids[idx])[1] for series in members
        ]
        medians.append(round(np.median(member_dists), 2))
    return medians

def inter_cluster_dtw_distances(centroids):
    """Symmetric matrix of pairwise DTW distances between centroids.

    Args:
        centroids: sequence of centroid series.

    Returns:
        A list of lists ``grid`` where ``grid[i][j]`` is the DTW distance
        between centroid ``i`` and centroid ``j`` rounded to two decimals.
        The diagonal is left at the integer 0.
    """
    n = len(centroids)
    grid = [[0] * n for _ in range(n)]
    # Visit each unordered pair once (row < col) and mirror it across the diagonal.
    for row in range(n):
        for col in range(row + 1, n):
            _, pair_dist = dtw_path_from_metric(centroids[row], centroids[col])
            pair_dist = round(pair_dist, 2)
            grid[row][col] = pair_dist
            grid[col][row] = pair_dist
    return grid

def dtw_silhouette_score(clusters):
    """Silhouette coefficient of a clustering under the DTW distance.

    For every series, ``a`` is its mean DTW distance to the *other* members of
    its own cluster and ``b`` is the smallest mean DTW distance to the members
    of any other non-empty cluster; the silhouette is ``(b - a) / max(a, b)``.

    Args:
        clusters: sequence of clusters, each a sequence of time series. The
            position of a cluster in this sequence is its label.

    Returns:
        A tuple ``(overall, per_cluster)``: the mean silhouette over all
        series and a list with the mean silhouette of each cluster, both
        rounded to two decimals. An empty cluster gets ``nan`` in
        ``per_cluster``.

    Notes:
        * ``per_cluster`` always has ``len(clusters)`` entries, so an empty
          cluster no longer shifts indices or triggers an IndexError.
        * A series that is alone in its cluster scores 0 (the usual silhouette
          convention) instead of NaN.
        * A series is excluded from its own ``a`` by position, not by value,
          so duplicate series in one cluster still count as neighbours.
        * If there is no other non-empty cluster the silhouette is undefined
          and reported as NaN.
    """
    scores = []
    scores_per_cluster = [[] for _ in range(len(clusters))]

    for label, own_cluster in enumerate(clusters):
        for pos, ts in enumerate(own_cluster):
            if len(own_cluster) < 2:
                # Singleton cluster: no intra-cluster distance exists.
                s = 0.0
            else:
                a = np.mean([
                    dtw_path_from_metric(ts, other)[1]
                    for other_pos, other in enumerate(own_cluster)
                    if other_pos != pos
                ])

                b = np.inf
                for j, cluster in enumerate(clusters):
                    if j == label or len(cluster) == 0:
                        continue
                    dist = np.mean([dtw_path_from_metric(ts, other)[1] for other in cluster])
                    b = min(b, dist)

                if not np.isfinite(b):
                    # No other non-empty cluster to compare against.
                    s = float("nan")
                else:
                    denom = max(a, b)
                    s = (b - a) / denom if denom != 0 else 0

            scores.append(s)
            scores_per_cluster[label].append(s)

    overall = round(np.mean(scores), 2) if scores else float("nan")
    per_cluster = [
        round(np.mean(cluster_scores), 2) if cluster_scores else float("nan")
        for cluster_scores in scores_per_cluster
    ]
    return overall, per_cluster

def cluster_size_entropy(clusters):
    """Shannon entropy (natural log) of the cluster-size distribution.

    Args:
        clusters: sequence of clusters; only ``len()`` of each is used.

    Returns:
        The entropy of the relative cluster sizes, as computed by
        ``scipy.stats.entropy``.
    """
    counts = np.array(list(map(len, clusters)))
    return entropy(counts / counts.sum())

def cluster_variance_across_samples(clusters):
    """Average per-position variance across the members of each cluster.

    Each cluster is stacked into one array whose first axis indexes the
    members. The variance is taken along that axis (how much the members
    disagree at each time step / variable) and then averaged to one number.

    Args:
        clusters: sequence of clusters; every member must have shape
            ``(SAMPLE_LEN,)`` or ``(SAMPLE_LEN, 2)``.

    Returns:
        A list with one spread value per cluster, rounded to two decimals.

    Raises:
        AssertionError: if a stacked cluster does not have one of the
            expected shapes.
    """
    spreads = []
    for members in clusters:
        stacked = np.array(members)
        n_members = len(members)
        assert stacked.shape in ((n_members, SAMPLE_LEN, 2), (n_members, SAMPLE_LEN))
        per_position_var = np.var(stacked, axis=0)
        spreads.append(round(np.mean(per_position_var), 2))
    return spreads

def intra_class_variance(clusters, centroids):
    """Mean squared deviation of each cluster's members from its centroid.

    Args:
        clusters: sequence of clusters; every member must have shape
            ``(SAMPLE_LEN,)``.
        centroids: sequence indexed like ``clusters``; each centroid must be
            an array of shape ``(SAMPLE_LEN,)``.

    Returns:
        A list with one value per cluster: the squared member-to-centroid
        difference, averaged first over time steps and then over members,
        rounded to two decimals.

    Raises:
        AssertionError: if a cluster or centroid has an unexpected shape.
    """
    result = []
    for idx, members in enumerate(clusters):
        stacked = np.array(members)
        assert stacked.shape == (len(members), SAMPLE_LEN)
        center = centroids[idx]
        assert center.shape == (SAMPLE_LEN,)
        # One mean-squared-error value per member, then averaged over the cluster.
        per_signal_mse = np.square(stacked - center).mean(axis=1)
        result.append(round(per_signal_mse.mean(), 2))
    return result

def append_metrics(centroids, signals, pred_labels, sub_results_dict):
    """Compute every cluster-quality metric for one clustering, print and store it.

    Args:
        centroids: one centroid per cluster; ``len(centroids)`` fixes the
            number of clusters.
        signals: the clustered series, indexed in step with ``pred_labels``.
        pred_labels: for each signal, the integer index of its cluster.
        sub_results_dict: dict updated in place with the keys ``intra``,
            ``inter``, ``sil_score``, ``sil_score_per_cluster``, ``entropy``,
            ``normalized_entropy``, ``cluster_variance_across_samples``,
            ``intra_class_variance`` and ``sizes``.

    Returns:
        None. Results are printed and written into ``sub_results_dict``.
    """
    clusters = [[] for _ in range(len(centroids))]
    for i, label in enumerate(pred_labels):
        clusters[label].append(signals[i])

    intra = intra_cluster_dtw_distances(clusters, centroids)
    inter = inter_cluster_dtw_distances(centroids)
    sil_score, sil_score_per_cluster = dtw_silhouette_score(clusters)
    entropy_score = cluster_size_entropy(clusters)

    # Entropy of a uniform distribution over the clusters, i.e. the maximum.
    max_entropy = np.log(len(centroids))
    # With a single cluster max_entropy is 0 and balance is undefined.
    normalized_entropy = entropy_score / max_entropy if max_entropy > 0 else float("nan")

    cvas = cluster_variance_across_samples(clusters)
    icv = intra_class_variance(clusters, centroids)
    cluster_sizes = [len(members) for members in clusters]

    print("intra:", intra)
    sub_results_dict["intra"] = intra
    print("\ninter:", inter)
    sub_results_dict["inter"] = inter
    print("\nsil_score:", sil_score)
    sub_results_dict["sil_score"] = sil_score
    print("\nsil_score_per_cluster:", sil_score_per_cluster)
    sub_results_dict["sil_score_per_cluster"] = sil_score_per_cluster
    print("\nentropy:", entropy_score, "max possible:", max_entropy)
    # Store the computed value; the original stored the scipy `entropy` function.
    sub_results_dict["entropy"] = entropy_score
    print("\nBalance:", normalized_entropy)
    sub_results_dict["normalized_entropy"] = normalized_entropy
    print("\ncluster_variance_across_samples:", cvas)
    sub_results_dict["cluster_variance_across_samples"] = cvas
    print("\nintra_class_variance:", icv)
    sub_results_dict["intra_class_variance"] = icv
    print("\nsizes:", cluster_sizes)
    sub_results_dict["sizes"] = cluster_sizes

In [None]:
# Collects the metrics of every clustering, keyed by "<dataset><class><mode>".
results = {}

# Raw strings keep the Windows-style backslashes literal; in a normal string
# "\I" and "\d" are invalid escape sequences (SyntaxWarning on Python 3.12+).
# NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
# `dba_km` is rebound by each load, so only the last model stays in scope.
dba_km, pred_labels_saliency, gt_labels_per_cluster_saliency, centroids_saliency, saliency_maps_to_cluster = joblib.load(
    r'trained_models\InsectWingbeatSound_final\dba_km_saliency_InsectWingbeatSound_CLASS0.pkl'
)
dba_km, pred_labels_input, gt_labels_per_cluster_input, centroids_input, input_signals_to_cluster = joblib.load(
    r'trained_models\InsectWingbeatSound_final\dba_km_input_InsectWingbeatSound_CLASS0.pkl'
)
dba_km, pred_labels_multivariate, gt_labels_per_cluster_multivariate, centroids_multivariate, multivar_signals_to_cluster = joblib.load(
    r'trained_models\InsectWingbeatSound_final\dba_km_multivariate_InsectWingbeatSound_CLASS0.pkl'
)

results["Insect0Input"] = {}
results["Insect0Multi"] = {}

print("For Input sample clustering only:")
append_metrics(centroids_input.squeeze(), input_signals_to_cluster, pred_labels_input, results["Insect0Input"])
print("For Input samples of multivariate clustering:")
# Keep only channel 0 of the multivariate data (presumably the raw input
# signal - confirm against the code that produced the pickle).
centroids_univariate = centroids_multivariate[:,:,0]
signals_univariate = multivar_signals_to_cluster[:,:,0]
append_metrics(centroids_univariate, signals_univariate, pred_labels_multivariate, results["Insect0Multi"])

In [None]:
# Raw strings keep the Windows-style backslashes literal; in a normal string
# "\I" and "\d" are invalid escape sequences (SyntaxWarning on Python 3.12+).
# NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
dba_km, pred_labels_saliency, gt_labels_per_cluster_saliency, centroids_saliency, saliency_maps_to_cluster = joblib.load(
    r'trained_models\InsectWingbeatSound_final\dba_km_saliency_InsectWingbeatSound_CLASS1.pkl'
)
dba_km, pred_labels_input, gt_labels_per_cluster_input, centroids_input, input_signals_to_cluster = joblib.load(
    r'trained_models\InsectWingbeatSound_final\dba_km_input_InsectWingbeatSound_CLASS1.pkl'
)
dba_km, pred_labels_multivariate, gt_labels_per_cluster_multivariate, centroids_multivariate, multivar_signals_to_cluster = joblib.load(
    r'trained_models\InsectWingbeatSound_final\dba_km_multivariate_InsectWingbeatSound_CLASS1.pkl'
)

results["Insect1Input"] = {}
results["Insect1Multi"] = {}

print("For Input sample clustering only:")
append_metrics(centroids_input.squeeze(), input_signals_to_cluster, pred_labels_input, results["Insect1Input"])
print("For Input samples of multivariate clustering:")
# Keep only channel 0 of the multivariate data (presumably the raw input
# signal - confirm against the code that produced the pickle).
centroids_univariate = centroids_multivariate[:,:,0]
signals_univariate = multivar_signals_to_cluster[:,:,0]
append_metrics(centroids_univariate, signals_univariate, pred_labels_multivariate, results["Insect1Multi"])

In [None]:
# Raw strings keep the Windows-style backslashes literal; in a normal string
# "\M" and "\d" are invalid escape sequences (SyntaxWarning on Python 3.12+).
# NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
dba_km, pred_labels_saliency, gt_labels_per_cluster_saliency, centroids_saliency, saliency_maps_to_cluster = joblib.load(
    r'trained_models\Mallat_final\dba_km_saliency_Mallat_CLASS0.pkl'
)
dba_km, pred_labels_input, gt_labels_per_cluster_input, centroids_input, input_signals_to_cluster = joblib.load(
    r'trained_models\Mallat_final\dba_km_input_Mallat_CLASS0.pkl'
)
dba_km, pred_labels_multivariate, gt_labels_per_cluster_multivariate, centroids_multivariate, multivar_signals_to_cluster = joblib.load(
    r'trained_models\Mallat_final\dba_km_multivariate_Mallat_CLASS0.pkl'
)

results["Mallat0Input"] = {}
results["Mallat0Multi"] = {}

print("For Input sample clustering only:")
append_metrics(centroids_input.squeeze(), input_signals_to_cluster, pred_labels_input, results["Mallat0Input"])
print("For Input samples of multivariate clustering:")
# Keep only channel 0 of the multivariate data (presumably the raw input
# signal - confirm against the code that produced the pickle).
centroids_univariate = centroids_multivariate[:,:,0]
signals_univariate = multivar_signals_to_cluster[:,:,0]
append_metrics(centroids_univariate, signals_univariate, pred_labels_multivariate, results["Mallat0Multi"])

In [None]:
# Raw strings keep the Windows-style backslashes literal; in a normal string
# "\M" and "\d" are invalid escape sequences (SyntaxWarning on Python 3.12+).
# NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
dba_km, pred_labels_saliency, gt_labels_per_cluster_saliency, centroids_saliency, saliency_maps_to_cluster = joblib.load(
    r'trained_models\Mallat_final\dba_km_saliency_Mallat_CLASS1.pkl'
)
dba_km, pred_labels_input, gt_labels_per_cluster_input, centroids_input, input_signals_to_cluster = joblib.load(
    r'trained_models\Mallat_final\dba_km_input_Mallat_CLASS1.pkl'
)
dba_km, pred_labels_multivariate, gt_labels_per_cluster_multivariate, centroids_multivariate, multivar_signals_to_cluster = joblib.load(
    r'trained_models\Mallat_final\dba_km_multivariate_Mallat_CLASS1.pkl'
)

results["Mallat1Input"] = {}
results["Mallat1Multi"] = {}

print("For Input sample clustering only:")
append_metrics(centroids_input.squeeze(), input_signals_to_cluster, pred_labels_input, results["Mallat1Input"])
print("For Input samples of multivariate clustering:")
# Keep only channel 0 of the multivariate data (presumably the raw input
# signal - confirm against the code that produced the pickle).
centroids_univariate = centroids_multivariate[:,:,0]
signals_univariate = multivar_signals_to_cluster[:,:,0]
append_metrics(centroids_univariate, signals_univariate, pred_labels_multivariate, results["Mallat1Multi"])

In [None]:
# Raw strings keep the Windows-style backslashes literal; in a normal string
# "\d" is an invalid escape sequence (SyntaxWarning on Python 3.12+) and "\U"
# had to be escaped by hand.
# NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
dba_km, pred_labels_saliency, gt_labels_per_cluster_saliency, centroids_saliency, saliency_maps_to_cluster = joblib.load(
    r'trained_models\UWaveGestureLibrary_final\dba_km_saliency_UWaveGestureLibraryAll_CLASS0.pkl'
)
dba_km, pred_labels_input, gt_labels_per_cluster_input, centroids_input, input_signals_to_cluster = joblib.load(
    r'trained_models\UWaveGestureLibrary_final\dba_km_input_UWaveGestureLibraryAll_CLASS0.pkl'
)
dba_km, pred_labels_multivariate, gt_labels_per_cluster_multivariate, centroids_multivariate, multivar_signals_to_cluster = joblib.load(
    r'trained_models\UWaveGestureLibrary_final\dba_km_multivariate_UWaveGestureLibraryAll_CLASS0.pkl'
)

results["UWave0Input"] = {}
results["UWave0Multi"] = {}

print("For Input sample clustering only:")
append_metrics(centroids_input.squeeze(), input_signals_to_cluster, pred_labels_input, results["UWave0Input"])
print("For Input samples of multivariate clustering:")
# Keep only channel 0 of the multivariate data (presumably the raw input
# signal - confirm against the code that produced the pickle).
centroids_univariate = centroids_multivariate[:,:,0]
signals_univariate = multivar_signals_to_cluster[:,:,0]
append_metrics(centroids_univariate, signals_univariate, pred_labels_multivariate, results["UWave0Multi"])

In [None]:
# Raw strings keep the Windows-style backslashes literal; in a normal string
# "\d" is an invalid escape sequence (SyntaxWarning on Python 3.12+) and "\U"
# had to be escaped by hand.
# NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
dba_km, pred_labels_saliency, gt_labels_per_cluster_saliency, centroids_saliency, saliency_maps_to_cluster = joblib.load(
    r'trained_models\UWaveGestureLibrary_final\dba_km_saliency_UWaveGestureLibraryAll_CLASS1.pkl'
)
dba_km, pred_labels_input, gt_labels_per_cluster_input, centroids_input, input_signals_to_cluster = joblib.load(
    r'trained_models\UWaveGestureLibrary_final\dba_km_input_UWaveGestureLibraryAll_CLASS1.pkl'
)
dba_km, pred_labels_multivariate, gt_labels_per_cluster_multivariate, centroids_multivariate, multivar_signals_to_cluster = joblib.load(
    r'trained_models\UWaveGestureLibrary_final\dba_km_multivariate_UWaveGestureLibraryAll_CLASS1.pkl'
)

results["UWave1Input"] = {}
results["UWave1Multi"] = {}

print("For Input sample clustering only:")
append_metrics(centroids_input.squeeze(), input_signals_to_cluster, pred_labels_input, results["UWave1Input"])
print("For Input samples of multivariate clustering:")
# Keep only channel 0 of the multivariate data (presumably the raw input
# signal - confirm against the code that produced the pickle).
centroids_univariate = centroids_multivariate[:,:,0]
signals_univariate = multivar_signals_to_cluster[:,:,0]
append_metrics(centroids_univariate, signals_univariate, pred_labels_multivariate, results["UWave1Multi"])

## Correlation between clustering-wide metrics and ARI, NMI and purity

In [None]:
# Fixed order in which the `results` entries are read for the correlations:
# the six multivariate clusterings first, then the six input-only ones.
dataset_order = (
    ["UWave0Multi", "UWave1Multi", "Insect0Multi", "Insect1Multi", "Mallat0Multi", "Mallat1Multi"]
    + ["UWave0Input", "UWave1Input", "Insect0Input", "Insect1Input", "Mallat0Input", "Mallat1Input"]
)

In [None]:
clustering_performances = np.array(
    [[0.615, 0.580, 0.823], [0.773, 0.737, 0.889], [0.233, 0.368, 0.398], 
     [0.169, 0.248, 0.326], [0.649, 0.804, 0.713], [0.918, 0.931, 0.940], 
     [0.426, 0.476, 0.712 ], [0.511, 0.520, 0.700], [0.018, 0.064, 0.233], 
     [0.007, 0.019, 0.226], [0.651, 0.807, 0.713], [0.634, 0.803, 0.721]]
)

In [None]:
# Split the score table column-wise: column 0 -> ARI, 1 -> NMI, 2 -> purity.
aris, nmis, puritys = clustering_performances.T

### Silhouette Score

In [None]:
# Overall silhouette score of every clustering, in `dataset_order`.
sil_scores = [results[name]["sil_score"] for name in dataset_order]

In [None]:
# Pearson correlation (with p-value): silhouette score vs. ARI.
pearsonr(np.asarray(sil_scores), np.asarray(aris))

In [None]:
# Pearson correlation (with p-value): silhouette score vs. NMI.
pearsonr(np.asarray(sil_scores), np.asarray(nmis))

In [None]:
# Pearson correlation (with p-value): silhouette score vs. purity.
pearsonr(np.asarray(sil_scores), np.asarray(puritys))

### Dataset balance / normalized entropy

In [None]:
# Cluster-size balance (entropy / max entropy) of every clustering, in `dataset_order`.
normalized_entropys = [results[name]["normalized_entropy"] for name in dataset_order]

In [None]:
# Pearson correlation (with p-value): normalized entropy vs. ARI.
pearsonr(np.asarray(normalized_entropys), np.asarray(aris))

In [None]:
# Pearson correlation (with p-value): normalized entropy vs. NMI.
pearsonr(np.asarray(normalized_entropys), np.asarray(nmis))

In [None]:
# Pearson correlation (with p-value): normalized entropy vs. purity.
pearsonr(np.asarray(normalized_entropys), np.asarray(puritys))

### Intra-/inter-cluster distance ratio

In [None]:
# Per clustering: the per-cluster intra distances and the centroid distance grid.
intras = [results[name]["intra"] for name in dataset_order]
inters = [results[name]["inter"] for name in dataset_order]

In [None]:
# Mean distance from each centroid to every *other* centroid, per clustering.
# The self-distance on the diagonal is skipped by position on a fresh list, so
# the grids stored in `results` are no longer mutated and re-running this cell
# gives the same answer. A genuine 0.0 distance between two different
# centroids is kept rather than being removed as if it were the diagonal.
inter_means = []
for grid in inters:
    grid_means = []
    for row_idx, row in enumerate(grid):
        if len(row) == len(grid):
            others = [d for col_idx, d in enumerate(row) if col_idx != row_idx]
        else:
            # Row was already stripped of its diagonal by an earlier in-place edit.
            others = list(row)
        grid_means.append(np.mean(others))
    inter_means.append(grid_means)

In [None]:
# Per clustering: average over clusters of (intra distance / mean inter-centroid distance).
intra_inter_list = [
    np.mean(np.array(intra_row) / np.array(inter_row))
    for intra_row, inter_row in zip(intras, inter_means)
]

In [None]:
# Pearson correlation (with p-value): intra/inter ratio vs. ARI.
pearsonr(np.asarray(intra_inter_list), np.asarray(aris))

In [None]:
# Pearson correlation (with p-value): intra/inter ratio vs. NMI.
pearsonr(np.asarray(intra_inter_list), np.asarray(nmis))

In [None]:
# Pearson correlation (with p-value): intra/inter ratio vs. purity.
pearsonr(np.asarray(intra_inter_list), np.asarray(puritys))

### Mean intra-cluster distance

In [None]:
# Average intra-cluster distance of every clustering.
intra_means = list(map(np.mean, intras))

In [None]:
# Pearson correlation (with p-value): mean intra distance vs. ARI.
pearsonr(np.asarray(intra_means), np.asarray(aris))

In [None]:
# Pearson correlation (with p-value): mean intra distance vs. NMI.
pearsonr(np.asarray(intra_means), np.asarray(nmis))

In [None]:
# Pearson correlation (with p-value): mean intra distance vs. purity.
pearsonr(np.asarray(intra_means), np.asarray(puritys))

### Cluster variance across samples

In [None]:
# Per-cluster variance across samples for every clustering, then its average.
cvas = [results[name]["cluster_variance_across_samples"] for name in dataset_order]
cvas_means = list(map(np.mean, cvas))

In [None]:
# Pearson correlation (with p-value): mean cluster variance across samples vs. ARI.
pearsonr(np.asarray(cvas_means), np.asarray(aris))

In [None]:
# Pearson correlation (with p-value): mean cluster variance across samples vs. NMI.
pearsonr(np.asarray(cvas_means), np.asarray(nmis))

In [None]:
# Pearson correlation (with p-value): mean cluster variance across samples vs. purity.
pearsonr(np.asarray(cvas_means), np.asarray(puritys))

### Intra class variance

In [None]:
# Per-cluster intra-class variance for every clustering, then its average.
icv = [results[name]["intra_class_variance"] for name in dataset_order]
icv_means = list(map(np.mean, icv))

In [None]:
# Pearson correlation (with p-value): mean intra-class variance vs. ARI.
pearsonr(np.asarray(icv_means), np.asarray(aris))

In [None]:
# Pearson correlation (with p-value): mean intra-class variance vs. NMI.
pearsonr(np.asarray(icv_means), np.asarray(nmis))

In [None]:
# Pearson correlation (with p-value): mean intra-class variance vs. purity.
pearsonr(np.asarray(icv_means), np.asarray(puritys))

## Correlations for the per-cluster metrics

In [None]:
from itertools import chain
def flatten(listOfLists):
    """Return a new list with exactly one level of nesting removed.

    Args:
        listOfLists: an iterable whose items are themselves iterable.

    Returns:
        A flat list of the inner items, in order. Deeper nesting is left
        untouched.
    """
    return [item for inner in listOfLists for item in inner]

In [None]:
# Correlate each per-cluster metric with the probability that the cluster is
# matched correctly. One sub-list per clustering, one probability per cluster;
# the sub-lists presumably follow `dataset_order` - confirm against the
# matching experiment.
cluster_matching_prob = flatten([
    [0.4, 0, 0.4, 0],
    [0, 0.2, 0.2, 0.4],
    [0.8, 0.6, 0],
    [0, 0, 0],
    [0.2, 0.2, 0.6],
    [0.4, 0.8, 0, 1],
    [0.8, 0.4, 0, 0],
    [1, 0.8, 0],
    [0, 0, 1],
    [0, 1, 0],
    [0, 0, 0.2],
    [1, 0, 1],
])

In [None]:
# Per-cluster silhouette score vs. matching probability.
sil_scores_per_cluster = flatten([results[name]["sil_score_per_cluster"] for name in dataset_order])
pearsonr(np.asarray(cluster_matching_prob), np.asarray(sil_scores_per_cluster))

In [None]:
# Per-cluster intra distance vs. matching probability.
intra_per_cluster = flatten([results[name]["intra"] for name in dataset_order])
pearsonr(np.asarray(cluster_matching_prob), np.asarray(intra_per_cluster))

In [None]:
# Per-cluster variance across samples vs. matching probability.
cvas_per_cluster = flatten([results[name]["cluster_variance_across_samples"] for name in dataset_order])
pearsonr(np.asarray(cluster_matching_prob), np.asarray(cvas_per_cluster))

In [None]:
# Per-cluster intra-class variance vs. matching probability.
icv_per_cluster = flatten([results[name]["intra_class_variance"] for name in dataset_order])
pearsonr(np.asarray(cluster_matching_prob), np.asarray(icv_per_cluster))

In [None]:
# inter means
# Mean distance from each centroid to every *other* centroid, one value per
# cluster. The diagonal self-distance is skipped by position on a fresh list,
# so the grids stored in `results` are not mutated and the cell can be re-run.
inter_per_cluster = list(flatten([results[dataset]["inter"] for dataset in dataset_order]))
inter_means_per_cluster = []
for dataset in dataset_order:
    grid = results[dataset]["inter"]
    for row_idx, row in enumerate(grid):
        if len(row) == len(grid):
            others = [d for col_idx, d in enumerate(row) if col_idx != row_idx]
        else:
            # Row was already stripped of its diagonal by an earlier in-place edit.
            others = list(row)
        inter_means_per_cluster.append(np.mean(others))
pearsonr(np.array(cluster_matching_prob), np.array(inter_means_per_cluster))

In [None]:
# Per-cluster ratio of intra distance to mean inter-centroid distance vs. matching probability.
intra_inter_frac_per_cluster = np.asarray(intra_per_cluster) / np.asarray(inter_means_per_cluster)
pearsonr(np.asarray(cluster_matching_prob), intra_inter_frac_per_cluster)