In [1]:
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm
import matplotlib.pyplot as plt



In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_completeness_v_measure

In [3]:
from tfumap.paths import ensure_dir, MODEL_DIR, DATA_DIR

In [14]:
metrics_df = pd.concat([pd.read_pickle(i) for i in list((DATA_DIR / 'clustering_metric_df').glob('*.pickle'))])
metrics_df.loc[metrics_df.dataset == 'cassins', 'dataset'] = 'cassins_dtw'
metrics_df[:3]

Unnamed: 0,dataset,class_,dim,silhouette,homogeneity,completeness,v_measure,init_,n_clusters,model
0,mnist,PCA,2,0.37744,0.272552,0.39746,0.323363,0,5,"KMeans(n_clusters=5, random_state=0)"
1,mnist,PCA,2,0.377355,0.273045,0.39793,0.323865,1,5,"KMeans(n_clusters=5, random_state=1)"
2,mnist,PCA,2,0.377199,0.273357,0.39823,0.324184,2,5,"KMeans(n_clusters=5, random_state=2)"


In [15]:
metrics_df

Unnamed: 0,dataset,class_,dim,silhouette,homogeneity,completeness,v_measure,init_,n_clusters,model
0,mnist,PCA,2,0.377440,0.272552,0.397460,0.323363,0,5,"KMeans(n_clusters=5, random_state=0)"
1,mnist,PCA,2,0.377355,0.273045,0.397930,0.323865,1,5,"KMeans(n_clusters=5, random_state=1)"
2,mnist,PCA,2,0.377199,0.273357,0.398230,0.324184,2,5,"KMeans(n_clusters=5, random_state=2)"
3,mnist,PCA,2,0.377346,0.273049,0.397946,0.323874,3,5,"KMeans(n_clusters=5, random_state=3)"
4,mnist,PCA,2,0.377413,0.272501,0.397373,0.323298,4,5,"KMeans(n_clusters=5, random_state=4)"
...,...,...,...,...,...,...,...,...,...,...
45,fmnist,vae,2,0.375252,0.575557,0.511677,0.541740,0,14,"KMeans(n_clusters=14, random_state=0)"
46,fmnist,vae,2,0.374912,0.575514,0.511521,0.541634,1,14,"KMeans(n_clusters=14, random_state=1)"
47,fmnist,vae,2,0.374924,0.575444,0.511468,0.541573,2,14,"KMeans(n_clusters=14, random_state=2)"
48,fmnist,vae,2,0.374862,0.575051,0.511110,0.541198,3,14,"KMeans(n_clusters=14, random_state=3)"


In [16]:
metrics_df.class_.unique()

array(['PCA', 'ae_only', 'autoencoder', 'direct', 'vae', 'network',
       'umap-learn', 'TSNE', 'parametric-tsne'], dtype=object)

### plot on the basis of the best silhouette score

In [17]:
metrics_df.dataset.unique()

array(['mnist', 'cassins_dtw', 'fmnist', 'cifar10', 'macosko2015'],
      dtype=object)

In [18]:
### get the best silhouette score for each dataset, class_, dim_
metrics_df_max_iter = []
for dataset in metrics_df.dataset.unique():
    for class_ in metrics_df.class_.unique():
        for dim in metrics_df.dim.unique():
            subset_df = metrics_df[
                (dataset == metrics_df.dataset.values) &
                (class_ == metrics_df.class_.values) &
                (dim == metrics_df.dim.values) 
            ]
            if len(subset_df) > 0:
                #print(np.argmax(subset_df.silhouette))
                metrics_df_max_iter.append(subset_df.iloc[np.argmax(subset_df.silhouette)])

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return bound(*args, **kwds)


In [19]:
metrics_max_df = pd.concat(metrics_df_max_iter, axis=1).T.reset_index()
metrics_max_df[:3]

Unnamed: 0,index,dataset,class_,dim,silhouette,homogeneity,completeness,v_measure,init_,n_clusters,model
0,0,mnist,PCA,2,0.37744,0.272552,0.39746,0.323363,0,5,"KMeans(n_clusters=5, random_state=0)"
1,18,mnist,PCA,64,0.0912846,0.472775,0.53284,0.501013,3,8,KMeans(random_state=3)
2,5,mnist,ae_only,2,0.493923,0.307547,0.611768,0.409321,0,6,"KMeans(n_clusters=6, random_state=0)"


In [20]:
metrics_max_df

Unnamed: 0,index,dataset,class_,dim,silhouette,homogeneity,completeness,v_measure,init_,n_clusters,model
0,0,mnist,PCA,2,0.37744,0.272552,0.39746,0.323363,0,5,"KMeans(n_clusters=5, random_state=0)"
1,18,mnist,PCA,64,0.0912846,0.472775,0.53284,0.501013,3,8,KMeans(random_state=3)
2,5,mnist,ae_only,2,0.493923,0.307547,0.611768,0.409321,0,6,"KMeans(n_clusters=6, random_state=0)"
3,14,mnist,ae_only,64,0.0979506,0.389987,0.467557,0.425264,4,7,"KMeans(n_clusters=7, random_state=4)"
4,28,mnist,autoencoder,2,0.529301,0.837299,0.85496,0.846037,3,10,"KMeans(n_clusters=10, random_state=3)"
...,...,...,...,...,...,...,...,...,...,...,...
80,33,macosko2015,umap-learn,2,0.503299,0.78593,0.424224,0.551021,3,12,"KMeans(n_clusters=12, random_state=3)"
81,4,macosko2015,umap-learn,64,0.640607,0.76894,0.783878,0.776338,4,6,"KMeans(n_clusters=6, random_state=4)"
82,21,macosko2015,TSNE,2,0.402322,0.749659,0.389235,0.512415,1,10,"KMeans(n_clusters=10, random_state=1)"
83,16,macosko2015,parametric-tsne,2,0.505702,0.782231,0.475215,0.591243,1,9,"KMeans(n_clusters=9, random_state=1)"


In [21]:
metrics_max_df = metrics_max_df[["dataset","class_","dim","v_measure"]].set_index(['dataset', 'dim'])
metrics_max_df = metrics_max_df.pivot_table(
    index=["dataset", "dim"],
    columns="class_",
    values="v_measure",
    aggfunc="first",
)
metrics_max_df 

Unnamed: 0_level_0,class_,PCA,TSNE,ae_only,autoencoder,direct,network,parametric-tsne,umap-learn,vae
dataset,dim,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
cassins_dtw,2,0.602941,0.962837,0.598428,0.965995,0.953765,0.968604,0.960494,0.958064,0.719812
cassins_dtw,64,0.901887,,0.750089,0.966182,0.959669,0.966486,0.94343,0.959621,0.792266
cifar10,2,0.06051,0.068812,0.025795,0.072967,0.073806,0.071882,0.038281,0.074285,0.055956
cifar10,64,0.059858,,0.057365,0.074582,0.074152,0.074183,0.05702,0.073321,0.090487
fmnist,2,0.422137,0.540777,0.481776,0.660207,0.649126,0.659404,0.624835,0.660286,0.531902
fmnist,64,0.524356,,0.554064,0.663545,0.660976,0.661821,0.468033,0.660182,0.563896
macosko2015,2,0.552168,0.512415,0.569513,0.586217,0.556708,0.711236,0.591243,0.551021,0.569113
macosko2015,64,0.669553,,0.589705,0.579288,0.782354,0.63561,0.468243,0.776338,0.590808
mnist,2,0.323363,0.770399,0.409321,0.846037,0.851221,0.782399,0.744601,0.837543,0.546676
mnist,64,0.501013,,0.425264,0.870142,0.77487,0.781833,0.49768,0.774738,0.561659


In [24]:
def can_float(x):
    try:
        float(x)
        if np.isnan(float(x)):
            return False
        return True
    except:
        return False

In [23]:
metric_string = (
    metrics_max_df[["TSNE", 'parametric-tsne', "umap-learn", "network", "autoencoder", "ae_only", "vae", "PCA"]]
    .round(4)
    .to_latex()
    .replace("cassins\_dtw", "Cassin's")
    .replace("cifar10", "CIFAR10")
    .replace("fmnist", "FMNIST")
    .replace("mnist", "MNIST")
    .replace("macosko2015", "Retina")
    .replace("autoencoder", "UMAP/AE")
    .replace("ae\_only", "AE")
    .replace("network", "P. UMAP")
    .replace("umap-learn", "UMAP")
    .replace("vae", "VAE")
    .replace("pca", "PCA")
    .replace("parametric-tsne", "P. t-SNE")
    .replace("TSNE", "t-SNE")
    .replace("NaN", "-")
)

In [26]:
lines = metric_string.split('\n')
skip = 1
for line in lines:
    line_elements = line.split(' ')
    floatables = [can_float(le) for (le) in line_elements]
    floats = [float(j) for i, j in zip(floatables, line_elements) if i]
    if len(floats)> 1:
        best = np.argmax(floats[skip:])
        replace_element = np.where(floatables)[0][skip + best]
        line_elements[replace_element] = '\\textbf{' +line_elements[replace_element]+ '}'
    print(' '.join(line_elements))

\begin{tabular}{llrrrrrrrr}
\toprule
      & class\_ &    t-SNE &  P. t-SNE &  UMAP &  P. UMAP &  UMAP/AE &  AE &     VAE &     PCA \\
dataset & dim &         &                  &             &          &              &          &         &         \\
\midrule
Cassin's & 2  &  0.9628 &           0.9605 &      0.9581 &   \textbf{0.9686} &       0.9660 &   0.5984 &  0.7198 &  0.6029 \\
      & 64 &     - &           0.9434 &      0.9596 &   \textbf{0.9665} &       0.9662 &   0.7501 &  0.7923 &  0.9019 \\
CIFAR10 & 2  &  0.0688 &           0.0383 &      \textbf{0.0743} &   0.0719 &       0.0730 &   0.0258 &  0.0560 &  0.0605 \\
      & 64 &     - &           0.0570 &      0.0733 &   0.0742 &       0.0746 &   0.0574 &  \textbf{0.0905} &  0.0599 \\
FMNIST & 2  &  0.5408 &           0.6248 &      \textbf{0.6603} &   0.6594 &       0.6602 &   0.4818 &  0.5319 &  0.4221 \\
      & 64 &     - &           0.4680 &      0.6602 &   0.6618 &       \textbf{0.6635} &   0.5541 &  0.5639 &  0.5244 \\
R